about summary refs log tree commit diff
path: root/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/SIInstructions.td')
-rw-r--r-- contrib/llvm/lib/Target/AMDGPU/SIInstructions.td 425
1 files changed, 326 insertions, 99 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
index b6b00c2e4257..70f20bb69370 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1,9 +1,8 @@
//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file was originally auto-generated from a GPU register header file and
@@ -12,7 +11,7 @@
//===----------------------------------------------------------------------===//
class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateControl {
- let SubtargetPredicate = isGCN;
+
}
include "SOPInstructions.td"
@@ -122,7 +121,14 @@ def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
-def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> {
+def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
+ let Defs = [EXEC];
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+
+def EXIT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
let hasSideEffects = 0;
let mayLoad = 0;
let mayStore = 0;
@@ -155,13 +161,12 @@ def S_SUB_U64_PSEUDO : SPseudoInstSI <
>;
def S_ADD_U64_CO_PSEUDO : SPseudoInstSI <
- (outs SReg_64:$vdst, VOPDstS64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
+ (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
>;
def S_SUB_U64_CO_PSEUDO : SPseudoInstSI <
- (outs SReg_64:$vdst, VOPDstS64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
+ (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
>;
-
} // End usesCustomInserter = 1, Defs = [SCC]
let usesCustomInserter = 1 in {
@@ -169,23 +174,30 @@ def GET_GROUPSTATICSIZE : SPseudoInstSI <(outs SReg_32:$sdst), (ins),
[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
} // End let usesCustomInserter = 1, SALU = 1
-def S_MOV_B64_term : SPseudoInstSI<(outs SReg_64:$dst),
- (ins SSrc_b64:$src0)> {
- let isAsCheapAsAMove = 1;
+// Wrap an instruction by duplicating it, except for setting isTerminator.
+class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
+ base_inst.OutOperandList,
+ base_inst.InOperandList> {
+ let Uses = base_inst.Uses;
+ let Defs = base_inst.Defs;
let isTerminator = 1;
+ let isAsCheapAsAMove = base_inst.isAsCheapAsAMove;
+ let hasSideEffects = base_inst.hasSideEffects;
+ let UseNamedOperandTable = base_inst.UseNamedOperandTable;
+ let CodeSize = base_inst.CodeSize;
}
-def S_XOR_B64_term : SPseudoInstSI<(outs SReg_64:$dst),
- (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
- let isAsCheapAsAMove = 1;
- let isTerminator = 1;
- let Defs = [SCC];
+let WaveSizePredicate = isWave64 in {
+def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
+def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
+def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
}
-def S_ANDN2_B64_term : SPseudoInstSI<(outs SReg_64:$dst),
- (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
- let isAsCheapAsAMove = 1;
- let isTerminator = 1;
+let WaveSizePredicate = isWave32 in {
+def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
+def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
+def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
+def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
}
def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
@@ -195,7 +207,6 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
let hasSideEffects = 1;
let mayLoad = 1;
let mayStore = 1;
- let isBarrier = 1;
let isConvergent = 1;
let FixedSize = 1;
let Size = 0;
@@ -222,30 +233,30 @@ let isTerminator = 1 in {
let OtherPredicates = [EnableLateCFGStructurize] in {
def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
(outs),
- (ins SReg_64:$vcc, brtarget:$target),
+ (ins SReg_1:$vcc, brtarget:$target),
[(brcond i1:$vcc, bb:$target)]> {
let Size = 12;
}
}
def SI_IF: CFPseudoInstSI <
- (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
- [(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
+ (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target),
+ [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
let Constraints = "";
let Size = 12;
let hasSideEffects = 1;
}
def SI_ELSE : CFPseudoInstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
+ (outs SReg_1:$dst),
+ (ins SReg_1:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
let Size = 12;
let hasSideEffects = 1;
}
def SI_LOOP : CFPseudoInstSI <
- (outs), (ins SReg_64:$saved, brtarget:$target),
- [(AMDGPUloop i64:$saved, bb:$target)], 1, 1> {
+ (outs), (ins SReg_1:$saved, brtarget:$target),
+ [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> {
let Size = 8;
let isBranch = 1;
let hasSideEffects = 1;
@@ -254,8 +265,7 @@ def SI_LOOP : CFPseudoInstSI <
} // End isTerminator = 1
def SI_END_CF : CFPseudoInstSI <
- (outs), (ins SReg_64:$saved),
- [(int_amdgcn_end_cf i64:$saved)], 1, 1> {
+ (outs), (ins SReg_1:$saved), [], 1, 1> {
let Size = 4;
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
@@ -265,8 +275,7 @@ def SI_END_CF : CFPseudoInstSI <
}
def SI_IF_BREAK : CFPseudoInstSI <
- (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
- [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> {
+ (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
let Size = 4;
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
@@ -292,7 +301,7 @@ multiclass PseudoInstKill <dag ins> {
}
}
-defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>;
+defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
let Defs = [EXEC,VCC] in
@@ -311,7 +320,7 @@ def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
}
def SI_PS_LIVE : PseudoInstSI <
- (outs SReg_64:$dst), (ins),
+ (outs SReg_1:$dst), (ins),
[(set i1:$dst, (int_amdgcn_ps_live))]> {
let SALU = 1;
}
@@ -340,6 +349,15 @@ def SI_INIT_EXEC : SPseudoInstSI <
let Defs = [EXEC];
let usesCustomInserter = 1;
let isAsCheapAsAMove = 1;
+ let WaveSizePredicate = isWave64;
+}
+
+def SI_INIT_EXEC_LO : SPseudoInstSI <
+ (outs), (ins i32imm:$src), []> {
+ let Defs = [EXEC_LO];
+ let usesCustomInserter = 1;
+ let isAsCheapAsAMove = 1;
+ let WaveSizePredicate = isWave32;
}
def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
@@ -374,11 +392,14 @@ def SI_RETURN : SPseudoInstSI <
// This version is only needed so we can fill in the output register in
// the custom inserter.
def SI_CALL_ISEL : SPseudoInstSI <
- (outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)]> {
+ (outs), (ins SSrc_b64:$src0, unknown:$callee),
+ [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> {
let Size = 4;
let isCall = 1;
let SchedRW = [WriteBranch];
let usesCustomInserter = 1;
+ // TODO: Should really base this on the call target
+ let isConvergent = 1;
}
// Wrapper around s_swappc_b64 with extra $callee parameter to track
@@ -389,23 +410,14 @@ def SI_CALL : SPseudoInstSI <
let isCall = 1;
let UseNamedOperandTable = 1;
let SchedRW = [WriteBranch];
+ // TODO: Should really base this on the call target
+ let isConvergent = 1;
}
// Tail call handling pseudo
-def SI_TCRETURN_ISEL : SPseudoInstSI<(outs),
- (ins SSrc_b64:$src0, i32imm:$fpdiff),
- [(AMDGPUtc_return i64:$src0, i32:$fpdiff)]> {
- let isCall = 1;
- let isTerminator = 1;
- let isReturn = 1;
- let isBarrier = 1;
- let SchedRW = [WriteBranch];
- let usesCustomInserter = 1;
-}
-
-def SI_TCRETURN : SPseudoInstSI <
- (outs),
- (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff)> {
+def SI_TCRETURN : SPseudoInstSI <(outs),
+ (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff),
+ [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
let Size = 4;
let isCall = 1;
let isTerminator = 1;
@@ -413,6 +425,8 @@ def SI_TCRETURN : SPseudoInstSI <
let isBarrier = 1;
let UseNamedOperandTable = 1;
let SchedRW = [WriteBranch];
+ // TODO: Should really base this on the call target
+ let isConvergent = 1;
}
@@ -424,6 +438,8 @@ def ADJCALLSTACKUP : SPseudoInstSI<
let FixedSize = 1;
let hasSideEffects = 1;
let usesCustomInserter = 1;
+ let SchedRW = [WriteSALU];
+ let Defs = [SCC];
}
def ADJCALLSTACKDOWN : SPseudoInstSI<
@@ -433,6 +449,8 @@ def ADJCALLSTACKDOWN : SPseudoInstSI<
let Size = 8; // Worst case. (s_add_u32 + constant)
let hasSideEffects = 1;
let usesCustomInserter = 1;
+ let SchedRW = [WriteSALU];
+ let Defs = [SCC];
}
let Defs = [M0, EXEC, SCC],
@@ -490,9 +508,12 @@ multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
// SI_SPILL_32_* instructions.
defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
+defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
+defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
+defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
let UseNamedOperandTable = 1, VGPRSpill = 1,
@@ -504,7 +525,9 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
let mayStore = 1;
let mayLoad = 0;
// (2 * 4) + (8 * num_subregs) bytes maximum
- let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+ int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+ // Size field is unsigned char and cannot fit more.
+ let Size = !if(!le(MaxSize, 256), MaxSize, 252);
}
def _RESTORE : VPseudoInstSI <
@@ -515,7 +538,9 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
let mayLoad = 1;
// (2 * 4) + (8 * num_subregs) bytes maximum
- let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+ int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+ // Size field is unsigned char and cannot fit more.
+ let Size = !if(!le(MaxSize, 256), MaxSize, 252);
}
} // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
}
@@ -524,21 +549,74 @@ defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;
defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
+defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
+defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
+
+multiclass SI_SPILL_AGPR <RegisterClass vgpr_class> {
+ let UseNamedOperandTable = 1, VGPRSpill = 1,
+ Constraints = "@earlyclobber $tmp",
+ SchedRW = [WriteVMEM] in {
+ def _SAVE : VPseudoInstSI <
+ (outs VGPR_32:$tmp),
+ (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
+ SReg_32:$soffset, i32imm:$offset)> {
+ let mayStore = 1;
+ let mayLoad = 0;
+ // (2 * 4) + (16 * num_subregs) bytes maximum
+ int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8);
+ // Size field is unsigned char and cannot fit more.
+ let Size = !if(!le(MaxSize, 256), MaxSize, 252);
+ }
+
+ def _RESTORE : VPseudoInstSI <
+ (outs vgpr_class:$vdata, VGPR_32:$tmp),
+ (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
+ i32imm:$offset)> {
+ let mayStore = 0;
+ let mayLoad = 1;
+
+ // (2 * 4) + (16 * num_subregs) bytes maximum
+ int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8);
+ // Size field is unsigned char and cannot fit more.
+ let Size = !if(!le(MaxSize, 256), MaxSize, 252);
+ }
+ } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
+}
+
+defm SI_SPILL_A32 : SI_SPILL_AGPR <AGPR_32>;
+defm SI_SPILL_A64 : SI_SPILL_AGPR <AReg_64>;
+defm SI_SPILL_A128 : SI_SPILL_AGPR <AReg_128>;
+defm SI_SPILL_A512 : SI_SPILL_AGPR <AReg_512>;
+defm SI_SPILL_A1024 : SI_SPILL_AGPR <AReg_1024>;
def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
(outs SReg_64:$dst),
(ins si_ga:$ptr_lo, si_ga:$ptr_hi),
[(set SReg_64:$dst,
- (i64 (SIpc_add_rel_offset (tglobaladdr:$ptr_lo), (tglobaladdr:$ptr_hi))))]> {
+ (i64 (SIpc_add_rel_offset tglobaladdr:$ptr_lo, tglobaladdr:$ptr_hi)))]> {
let Defs = [SCC];
}
def : GCNPat <
+ (SIpc_add_rel_offset tglobaladdr:$ptr_lo, 0),
+ (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0))
+>;
+
+def : GCNPat <
(AMDGPUinit_exec i64:$src),
(SI_INIT_EXEC (as_i64imm $src))
->;
+> {
+ let WaveSizePredicate = isWave64;
+}
+
+def : GCNPat <
+ (AMDGPUinit_exec i64:$src),
+ (SI_INIT_EXEC_LO (as_i32imm $src))
+> {
+ let WaveSizePredicate = isWave32;
+}
def : GCNPat <
(AMDGPUinit_exec_from_input i32:$input, i32:$shift),
@@ -551,7 +629,7 @@ def : GCNPat<
>;
def : GCNPat<
- (AMDGPUelse i64:$src, bb:$target),
+ (AMDGPUelse i1:$src, bb:$target),
(SI_ELSE $src, $target, 0)
>;
@@ -584,7 +662,12 @@ def : Pat <
// TODO: we could add more variants for other types of conditionals
def : Pat <
- (int_amdgcn_icmp i1:$src, (i1 0), (i32 33)),
+ (i64 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
+ (COPY $src) // Return the SGPRs representing i1 src
+>;
+
+def : Pat <
+ (i32 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
(COPY $src) // Return the SGPRs representing i1 src
>;
@@ -592,7 +675,7 @@ def : Pat <
// VOP1 Patterns
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isGCN, OtherPredicates = [UnsafeFPMath] in {
+let OtherPredicates = [UnsafeFPMath] in {
//def : RcpPat<V_RCP_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F64_e32, f64>;
@@ -615,7 +698,7 @@ def : GCNPat <
(V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-} // End SubtargetPredicate = isGCN, OtherPredicates = [UnsafeFPMath]
+} // End OtherPredicates = [UnsafeFPMath]
// f16_to_fp patterns
@@ -706,17 +789,18 @@ def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> {
let SubtargetPredicate = Has16BitInsts;
}
-multiclass SelectPat <ValueType vt, Instruction inst> {
+multiclass SelectPat <ValueType vt> {
def : GCNPat <
- (vt (select i1:$src0, vt:$src1, vt:$src2)),
- (inst $src2, $src1, $src0)
+ (vt (select i1:$src0, (VOP3Mods_f32 vt:$src1, i32:$src1_mods),
+ (VOP3Mods_f32 vt:$src2, i32:$src2_mods))),
+ (V_CNDMASK_B32_e64 $src2_mods, $src2, $src1_mods, $src1, $src0)
>;
}
-defm : SelectPat <i16, V_CNDMASK_B32_e64>;
-defm : SelectPat <i32, V_CNDMASK_B32_e64>;
-defm : SelectPat <f16, V_CNDMASK_B32_e64>;
-defm : SelectPat <f32, V_CNDMASK_B32_e64>;
+defm : SelectPat <i16>;
+defm : SelectPat <i32>;
+defm : SelectPat <f16>;
+defm : SelectPat <f32>;
let AddedComplexity = 1 in {
def : GCNPat <
@@ -749,6 +833,22 @@ foreach Index = 0-2 in {
>;
}
+foreach Index = 0-2 in {
+ def Extract_Element_v3i32_#Index : Extract_Element <
+ i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v3i32_#Index : Insert_Element <
+ i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v3f32_#Index : Extract_Element <
+ f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v3f32_#Index : Insert_Element <
+ f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
foreach Index = 0-3 in {
def Extract_Element_v4i32_#Index : Extract_Element <
i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -765,6 +865,22 @@ foreach Index = 0-3 in {
>;
}
+foreach Index = 0-4 in {
+ def Extract_Element_v5i32_#Index : Extract_Element <
+ i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v5i32_#Index : Insert_Element <
+ i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v5f32_#Index : Extract_Element <
+ f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v5f32_#Index : Insert_Element <
+ f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
foreach Index = 0-7 in {
def Extract_Element_v8i32_#Index : Extract_Element <
i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -818,7 +934,23 @@ def : Pat <
(v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
>;
-let SubtargetPredicate = isGCN in {
+foreach Index = 0-31 in {
+ def Extract_Element_v32i32_#Index : Extract_Element <
+ i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Insert_Element_v32i32_#Index : Insert_Element <
+ i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v32f32_#Index : Extract_Element <
+ f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Insert_Element_v32f32_#Index : Insert_Element <
+ f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
// FIXME: Why do only some of these type combinations for SReg and
// VReg?
@@ -882,6 +1014,10 @@ def : BitConvert <i64, v4f16, VReg_64>;
def : BitConvert <v4i32, v4f32, VReg_128>;
def : BitConvert <v4f32, v4i32, VReg_128>;
+// 96-bit bitcast
+def : BitConvert <v3i32, v3f32, SGPR_96>;
+def : BitConvert <v3f32, v3i32, SGPR_96>;
+
// 128-bit bitcast
def : BitConvert <v2i64, v4i32, SReg_128>;
def : BitConvert <v4i32, v2i64, SReg_128>;
@@ -892,6 +1028,10 @@ def : BitConvert <v4i32, v2f64, VReg_128>;
def : BitConvert <v2i64, v2f64, VReg_128>;
def : BitConvert <v2f64, v2i64, VReg_128>;
+// 160-bit bitcast
+def : BitConvert <v5i32, v5f32, SGPR_160>;
+def : BitConvert <v5f32, v5i32, SGPR_160>;
+
// 256-bit bitcast
def : BitConvert <v8i32, v8f32, SReg_256>;
def : BitConvert <v8f32, v8i32, SReg_256>;
@@ -902,7 +1042,9 @@ def : BitConvert <v8f32, v8i32, VReg_256>;
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
-} // End SubtargetPredicate = isGCN
+// 1024-bit bitcast
+def : BitConvert <v32i32, v32f32, VReg_1024>;
+def : BitConvert <v32f32, v32i32, VReg_1024>;
/********** =================== **********/
/********** Src & Dst modifiers **********/
@@ -1070,6 +1212,16 @@ def : GCNPat <
(S_MOV_B32 imm:$imm)
>;
+def : GCNPat <
+ (VGPRImm<(SIlds tglobaladdr:$ga)>),
+ (V_MOV_B32_e32 $ga)
+>;
+
+def : GCNPat <
+ (SIlds tglobaladdr:$ga),
+ (S_MOV_B32 $ga)
+>;
+
// FIXME: Workaround for ordering issue with peephole optimizer where
// a register class copy interferes with immediate folding. Should
// use s_mov_b32, which can be shrunk to s_movk_i32
@@ -1104,7 +1256,16 @@ def : GCNPat <
def : GCNPat <
(i1 imm:$imm),
(S_MOV_B64 (i64 (as_i64imm $imm)))
->;
+> {
+ let WaveSizePredicate = isWave64;
+}
+
+def : GCNPat <
+ (i1 imm:$imm),
+ (S_MOV_B32 (i32 (as_i32imm $imm)))
+> {
+ let WaveSizePredicate = isWave32;
+}
def : GCNPat <
(f64 InlineFPImm<f64>:$imm),
@@ -1115,18 +1276,18 @@ def : GCNPat <
/********** Intrinsic Patterns **********/
/********** ================== **********/
-let SubtargetPredicate = isGCN in {
def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
-}
def : GCNPat <
(i32 (sext i1:$src0)),
- (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 -1), $src0)
>;
class Ext32Pat <SDNode ext> : GCNPat <
(i32 (ext i1:$src0)),
- (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 1), $src0)
>;
def : Ext32Pat <zext>;
@@ -1144,8 +1305,6 @@ def : GCNPat <
// VOP3 Patterns
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isGCN in {
-
def : IMad24Pat<V_MAD_I32_I24, 1>;
def : UMad24Pat<V_MAD_U32_U24, 1>;
@@ -1153,8 +1312,6 @@ def : UMad24Pat<V_MAD_U32_U24, 1>;
defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
def : ROTRPattern <V_ALIGNBIT_B32>;
-}
-
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
(V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
@@ -1261,8 +1418,9 @@ def : GCNPat <
class ZExt_i64_i1_Pat <SDNode ext> : GCNPat <
(i64 (ext i1:$src)),
(REG_SEQUENCE VReg_64,
- (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
- (S_MOV_B32 (i32 0)), sub1)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 1), $src),
+ sub0, (S_MOV_B32 (i32 0)), sub1)
>;
@@ -1280,8 +1438,10 @@ def : GCNPat <
def : GCNPat <
(i64 (sext i1:$src)),
(REG_SEQUENCE VReg_64,
- (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0,
- (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub0,
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub1)
>;
class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : GCNPat <
@@ -1296,10 +1456,12 @@ def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
// If we need to perform a logical operation on i1 values, we need to
// use vector comparisons since there is only one SCC register. Vector
-// comparisons still write to a pair of SGPRs, so treat these as
-// 64-bit comparisons. When legalizing SGPR copies, instructions
-// resulting in the copies from SCC to these instructions will be
-// moved to the VALU.
+// comparisons may write to a pair of SGPRs or a single SGPR, so treat
+// these as 32 or 64-bit comparisons. When legalizing SGPR copies,
+// instructions resulting in the copies from SCC to these instructions
+// will be moved to the VALU.
+
+let WaveSizePredicate = isWave64 in {
def : GCNPat <
(i1 (and i1:$src0, i1:$src1)),
(S_AND_B64 $src0, $src1)
@@ -1336,35 +1498,89 @@ def : GCNPat <
(S_NOT_B64 $src0)
>;
}
+} // end isWave64
+
+let WaveSizePredicate = isWave32 in {
+def : GCNPat <
+ (i1 (and i1:$src0, i1:$src1)),
+ (S_AND_B32 $src0, $src1)
+>;
+
+def : GCNPat <
+ (i1 (or i1:$src0, i1:$src1)),
+ (S_OR_B32 $src0, $src1)
+>;
+
+def : GCNPat <
+ (i1 (xor i1:$src0, i1:$src1)),
+ (S_XOR_B32 $src0, $src1)
+>;
+
+def : GCNPat <
+ (i1 (add i1:$src0, i1:$src1)),
+ (S_XOR_B32 $src0, $src1)
+>;
+
+def : GCNPat <
+ (i1 (sub i1:$src0, i1:$src1)),
+ (S_XOR_B32 $src0, $src1)
+>;
+
+let AddedComplexity = 1 in {
+def : GCNPat <
+ (i1 (add i1:$src0, (i1 -1))),
+ (S_NOT_B32 $src0)
+>;
+
+def : GCNPat <
+ (i1 (sub i1:$src0, (i1 -1))),
+ (S_NOT_B32 $src0)
+>;
+}
+} // end isWave32
def : GCNPat <
(f16 (sint_to_fp i1:$src)),
- (V_CVT_F16_F32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src))
+ (V_CVT_F16_F32_e32 (
+ V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
+ $src))
>;
def : GCNPat <
(f16 (uint_to_fp i1:$src)),
- (V_CVT_F16_F32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src))
+ (V_CVT_F16_F32_e32 (
+ V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
+ $src))
>;
def : GCNPat <
(f32 (sint_to_fp i1:$src)),
- (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
+ $src)
>;
def : GCNPat <
(f32 (uint_to_fp i1:$src)),
- (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
+ $src)
>;
def : GCNPat <
(f64 (sint_to_fp i1:$src)),
- (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
+ (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 -1),
+ $src))
>;
def : GCNPat <
(f64 (uint_to_fp i1:$src)),
- (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src))
+ (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 1),
+ $src))
>;
//===----------------------------------------------------------------------===//
@@ -1417,7 +1633,7 @@ def : GCNPat<
def : GCNPat<
(fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
- (V_PK_MUL_F16 0, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
+ (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
>;
}
@@ -1478,6 +1694,14 @@ def : GCNPat <
>;
} // End OtherPredicates = [HasDLInsts]
+let SubtargetPredicate = isGFX10Plus in
+def : GCNPat <
+ (fma (f16 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
+ (f16 (VOP3NoMods f32:$src2))),
+ (V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+ SRCMODS.NONE, $src2, $clamp, $omod)
+>;
// Allow integer inputs
class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat<
@@ -1568,7 +1792,7 @@ def : GCNPat <
// Fract Patterns
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isSI in {
+let SubtargetPredicate = isGFX6 in {
// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
@@ -1595,7 +1819,7 @@ def : GCNPat <
DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-} // End SubtargetPredicates = isSI
+} // End SubtargetPredicate = isGFX6
//============================================================================//
// Miscellaneous Optimization Patterns
@@ -1609,6 +1833,13 @@ def : GCNPat<
(S_SUB_I32 $src0, NegSubInlineConst32:$src1)
>;
+// Avoid pointlessly materializing a constant in VGPR.
+// FIXME: Should also do this for readlane, but tablegen crashes on
+// the ignored src1.
+def : GCNPat<
+ (int_amdgcn_readfirstlane (i32 imm:$src)),
+ (S_MOV_B32 $src)
+>;
multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
def : GCNPat <
@@ -1622,8 +1853,6 @@ multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
>;
}
-let SubtargetPredicate = isGCN in {
-
defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
@@ -1633,8 +1862,6 @@ defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>;
defm : IntMed3Pat<V_MED3_I32, smin, smax, smin_oneuse, smax_oneuse>;
defm : IntMed3Pat<V_MED3_U32, umin, umax, umin_oneuse, umax_oneuse>;
-}
-
// This matches 16 permutations of
// max(min(x, y), min(max(x, y), z))
class FPMed3Pat<ValueType vt,
@@ -1683,8 +1910,8 @@ multiclass Int16Med3Pat<Instruction med3Inst,
def : FPMed3Pat<f32, V_MED3_F32>;
-let OtherPredicates = [isGFX9] in {
+let OtherPredicates = [isGFX9Plus] in {
def : FP16Med3Pat<f16, V_MED3_F16>;
defm : Int16Med3Pat<V_MED3_I16, smin, smax, smax_oneuse, smin_oneuse>;
defm : Int16Med3Pat<V_MED3_U16, umin, umax, umax_oneuse, umin_oneuse>;
-} // End Predicates = [isGFX9]
+} // End OtherPredicates = [isGFX9Plus]