diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2019-01-20 11:41:25 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2019-01-20 11:41:25 +0000 |
commit | d9484dd61cc151c4f34c31e07f693fefa66316b5 (patch) | |
tree | ab0560b3da293f1fafd3269c59692e929418f5c2 /contrib/llvm/lib/Target/X86/X86InstrSSE.td | |
parent | 79e0962d4c3cf1f0acf359a9d69cb3ac68c414c4 (diff) | |
parent | d8e91e46262bc44006913e6796843909f1ac7bcd (diff) | |
download | src-d9484dd61cc151c4f34c31e07f693fefa66316b5.tar.gz src-d9484dd61cc151c4f34c31e07f693fefa66316b5.zip |
Merge llvm trunk r351319, resolve conflicts, and update FREEBSD-Xlist.
Notes
Notes:
svn path=/projects/clang800-import/; revision=343210
Diffstat (limited to 'contrib/llvm/lib/Target/X86/X86InstrSSE.td')
-rw-r--r-- | contrib/llvm/lib/Target/X86/X86InstrSSE.td | 1304 |
1 files changed, 777 insertions, 527 deletions
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index 6a9b20998210..e2bcd18ce660 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -35,7 +35,7 @@ multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } /// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class @@ -57,7 +57,7 @@ let isCodeGenOnly = 1, hasSideEffects = 0 in { !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], d>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -81,7 +81,7 @@ multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))], d>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } /// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class @@ -103,7 +103,7 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), pat_rm, d>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -265,8 +265,6 @@ let Predicates = [UseAVX] in { (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; - def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), - (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; def : Pat<(v2f64 (X86vzload addr:$src)), (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; @@ -349,8 +347,6 @@ let Predicates = [UseSSE2] in { (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; - def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), - (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; def : Pat<(v2f64 (X86vzload addr:$src)), (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; } @@ -593,8 +589,21 @@ let Predicates = [HasAVX, NoVLX] in { // available and changing the domain is beneficial. def : Pat<(alignedloadv4i64 addr:$src), (VMOVAPSYrm addr:$src)>; + def : Pat<(alignedloadv8i32 addr:$src), + (VMOVAPSYrm addr:$src)>; + def : Pat<(alignedloadv16i16 addr:$src), + (VMOVAPSYrm addr:$src)>; + def : Pat<(alignedloadv32i8 addr:$src), + (VMOVAPSYrm addr:$src)>; def : Pat<(loadv4i64 addr:$src), (VMOVUPSYrm addr:$src)>; + def : Pat<(loadv8i32 addr:$src), + (VMOVUPSYrm addr:$src)>; + def : Pat<(loadv16i16 addr:$src), + (VMOVUPSYrm addr:$src)>; + def : Pat<(loadv32i8 addr:$src), + (VMOVUPSYrm addr:$src)>; + def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst), (VMOVAPSYmr addr:$dst, VR256:$src)>; def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst), @@ -619,8 +628,20 @@ let Predicates = [HasAVX, NoVLX] in { let Predicates = [UseSSE1] in { def : Pat<(alignedloadv2i64 addr:$src), (MOVAPSrm addr:$src)>; + def : Pat<(alignedloadv4i32 addr:$src), + (MOVAPSrm addr:$src)>; + def : Pat<(alignedloadv8i16 addr:$src), + (MOVAPSrm addr:$src)>; + def : Pat<(alignedloadv16i8 addr:$src), + (MOVAPSrm addr:$src)>; def : Pat<(loadv2i64 addr:$src), (MOVUPSrm addr:$src)>; + def : Pat<(loadv4i32 addr:$src), + (MOVUPSrm addr:$src)>; + def : Pat<(loadv8i16 addr:$src), + (MOVUPSrm addr:$src)>; + def : Pat<(loadv16i8 addr:$src), + (MOVUPSrm addr:$src)>; def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), (MOVAPSmr addr:$dst, VR128:$src)>; @@ -652,7 +673,7 @@ multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode pdnode, (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), !strconcat(base_opc, "s", asm_opr), [], SSEPackedSingle>, PS, - Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>; + Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>; def PDrm : PI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), @@ -660,7 +681,7 @@ multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode pdnode, [(set VR128:$dst, (v2f64 (pdnode VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))))], SSEPackedDouble>, PD, - Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>; + Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>; } multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode, @@ -820,19 +841,6 @@ let Constraints = "$src1 = $dst" in { Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable; } -// TODO: This is largely to trick fastisel into ignoring the pattern. -def UnpckhUnary : PatFrag<(ops node:$src1, node:$src2), - (X86Unpckh node:$src1, node:$src2), [{ - return N->getOperand(0) == N->getOperand(1); -}]>; - -let Predicates = [UseSSE2] in { - // TODO: This is a hack pattern to allow lowering to emit unpckh instead of - // movhlps for sse2 without changing a bunch of tests. - def : Pat<(v2f64 (UnpckhUnary VR128:$src, VR128:$src)), - (MOVHLPSrr VR128:$src, VR128:$src)>; -} - //===----------------------------------------------------------------------===// // SSE 1 & 2 - Conversion Instructions //===----------------------------------------------------------------------===// @@ -858,7 +866,7 @@ let hasSideEffects = 0 in { let mayLoad = 1 in def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm, [(set RC:$dst, (DstTy (sint_to_fp - (SrcTy (bitconvert (ld_frag addr:$src))))))], d>, + (SrcTy (ld_frag addr:$src)))))], d>, Sched<[sched.Folded]>; } } @@ -874,7 +882,7 @@ let hasSideEffects = 0, Predicates = [UseAVX] in { def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins DstRC:$src1, x86memop:$src), !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } // hasSideEffects = 0 } @@ -1001,18 +1009,17 @@ def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", // Conversion Instructions Intrinsics - Match intrinsics which expect MM // and/or XMM operand(s). -// FIXME: We probably want to match the rm form only when optimizing for -// size, to avoid false depenendecies (see sse_fp_unop_s for details) multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, - Intrinsic Int, Operand memop, ComplexPattern mem_cpat, - string asm, X86FoldableSchedWrite sched> { + ValueType DstVT, ValueType SrcVT, SDNode OpNode, + Operand memop, ComplexPattern mem_cpat, string asm, + X86FoldableSchedWrite sched> { def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (Int SrcRC:$src))]>, + [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>, Sched<[sched]>; def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (Int mem_cpat:$src))]>, + [(set DstRC:$dst, (DstVT (OpNode (SrcVT mem_cpat:$src))))]>, Sched<[sched.Folded]>; } @@ -1032,21 +1039,21 @@ let hasSideEffects = 0 in { !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - []>, Sched<[sched.Folded, ReadAfterLd]>; + []>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } let Predicates = [UseAVX] in { -defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, - int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si", +defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, + X86cvts2si, sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, VEX, VEX_LIG; -defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, - int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si", +defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, + X86cvts2si, sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG; } -defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, +defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si, sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD; -defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, +defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si, sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W; @@ -1078,60 +1085,60 @@ let isCodeGenOnly = 1 in { // Aliases for intrinsics let isCodeGenOnly = 1 in { let Predicates = [UseAVX] in { -defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, +defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int, ssmem, sse_load_f32, "cvttss2si", WriteCvtSS2I>, XS, VEX; -defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, - int_x86_sse_cvttss2si64, ssmem, sse_load_f32, +defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32, + X86cvtts2Int, ssmem, sse_load_f32, "cvttss2si", WriteCvtSS2I>, XS, VEX, VEX_W; -defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, +defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int, sdmem, sse_load_f64, "cvttsd2si", WriteCvtSS2I>, XD, VEX; -defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, - int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, +defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64, + X86cvtts2Int, sdmem, sse_load_f64, "cvttsd2si", WriteCvtSS2I>, XD, VEX, VEX_W; } -defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, +defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int, ssmem, sse_load_f32, "cvttss2si", WriteCvtSS2I>, XS; -defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, - int_x86_sse_cvttss2si64, ssmem, sse_load_f32, +defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32, + X86cvtts2Int, ssmem, sse_load_f32, "cvttss2si", WriteCvtSS2I>, XS, REX_W; -defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, +defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int, sdmem, sse_load_f64, "cvttsd2si", WriteCvtSD2I>, XD; -defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, - int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, +defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64, + X86cvtts2Int, sdmem, sse_load_f64, "cvttsd2si", WriteCvtSD2I>, XD, REX_W; } // isCodeGenOnly = 1 let Predicates = [UseAVX] in { -defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, +defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si, ssmem, sse_load_f32, "cvtss2si", WriteCvtSS2I>, XS, VEX, VEX_LIG; -defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, +defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si, ssmem, sse_load_f32, "cvtss2si", WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG; } -defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, +defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si, ssmem, sse_load_f32, "cvtss2si", WriteCvtSS2I>, XS; -defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, +defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si, ssmem, sse_load_f32, "cvtss2si", WriteCvtSS2I>, XS, REX_W; -defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64, +defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load, "vcvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, WriteCvtI2PS>, PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG; -defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64, +defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load, "vcvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, WriteCvtI2PSY>, PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG; -defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64, +defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop, "cvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, WriteCvtI2PS>, PS, Requires<[UseSSE2]>; @@ -1186,7 +1193,7 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f64mem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, XD, VEX_4V, VEX_LIG, VEX_WIG, - Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>; + Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>; } def : Pat<(f32 (fpround FR64:$src)), @@ -1217,7 +1224,7 @@ def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem, [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, sse_load_f64:$src2))]>, XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>, - Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>; + Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>; let Constraints = "$src1 = $dst" in { def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), @@ -1231,7 +1238,7 @@ def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem, [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, sse_load_f64:$src2))]>, XD, Requires<[UseSSE2]>, - Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>; + Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>; } } // isCodeGenOnly = 1 @@ -1248,7 +1255,7 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, XS, VEX_4V, VEX_LIG, VEX_WIG, - Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>, + Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>, Requires<[UseAVX, OptForSize]>; } @@ -1295,7 +1302,7 @@ def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, XS, VEX_4V, VEX_WIG, Requires<[HasAVX]>, - Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>; + Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>; let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), @@ -1307,7 +1314,7 @@ def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), "cvtss2sd\t{$src2, $dst|$dst, $src2}", []>, XS, Requires<[UseSSE2]>, - Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>; + Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>; } } // isCodeGenOnly = 1 @@ -1690,7 +1697,7 @@ let hasSideEffects = 0, mayLoad = 1 in def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>, + (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>, VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG; def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", @@ -1700,7 +1707,7 @@ def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, - (v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>, + (v4f64 (sint_to_fp (loadv4i32 addr:$src))))]>, VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>, VEX_WIG; def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), @@ -1714,7 +1721,7 @@ let hasSideEffects = 0, mayLoad = 1 in def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>, + (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>, Sched<[WriteCvtI2PDLd]>; def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", @@ -1826,7 +1833,7 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, [(set RC:$dst, (OpNode (VT RC:$src1), (ld_frag addr:$src2), imm:$cc))]>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; // Accept explicit immediate argument form instead of comparison code. let isAsmParserOnly = 1, hasSideEffects = 0 in { @@ -1836,7 +1843,7 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, let mayLoad = 1 in def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, []>, - Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable; + Sched<[sched.Folded, sched.ReadAfterFold]>, NotMemoryFoldable; } } @@ -1878,7 +1885,7 @@ let mayLoad = 1 in (ins VR128:$src1, memop:$src, CC:$cc), asm, [(set VR128:$dst, (Int VR128:$src1, mem_cpat:$src, imm:$cc))]>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } let isCodeGenOnly = 1 in { @@ -1920,7 +1927,7 @@ let mayLoad = 1 in !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), (ld_frag addr:$src2)))]>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -1938,7 +1945,7 @@ let mayLoad = 1 in !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), mem_cpat:$src2))]>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } let Defs = [EFLAGS] in { @@ -2003,7 +2010,7 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, [(set RC:$dst, (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; // Accept explicit immediate argument form instead of comparison code. let isAsmParserOnly = 1, hasSideEffects = 0 in { @@ -2013,7 +2020,7 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, let mayLoad = 1 in def rmi_alt : PIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), - asm_alt, [], d>, Sched<[sched.Folded, ReadAfterLd]>, + asm_alt, [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>, NotMemoryFoldable; } } @@ -2109,7 +2116,7 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), (i8 imm:$src3))))], d>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, @@ -2165,58 +2172,58 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, asm, [(set RC:$dst, (vt (OpNode RC:$src1, (mem_frag addr:$src2))))], d>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } let Predicates = [HasAVX, NoVLX] in { -defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32, +defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load, VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; -defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64, +defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load, VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG; -defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32, +defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load, VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; -defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64, +defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load, VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG; -defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32, +defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load, VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; -defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64, +defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load, VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; -defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32, +defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load, VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; -defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64, +defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load, VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; }// Predicates = [HasAVX, NoVLX] let Constraints = "$src1 = $dst" in { - defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, + defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop, VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; - defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64, + defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop, VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD; - defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32, + defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop, VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; - defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64, + defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop, VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", SchedWriteFShuffle.XMM, SSEPackedDouble>, PD; } // Constraints = "$src1 = $dst" let Predicates = [HasAVX1Only] in { - def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))), (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))), (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; @@ -2253,6 +2260,16 @@ let Predicates = [HasAVX] in { SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG; defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd", SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG; + + // Also support integer VTs to avoid a int->fp bitcast in the DAG. + def : Pat<(X86movmsk (v4i32 VR128:$src)), + (VMOVMSKPSrr VR128:$src)>; + def : Pat<(X86movmsk (v2i64 VR128:$src)), + (VMOVMSKPDrr VR128:$src)>; + def : Pat<(X86movmsk (v8i32 VR256:$src)), + (VMOVMSKPSYrr VR256:$src)>; + def : Pat<(X86movmsk (v4i64 VR256:$src)), + (VMOVMSKPDYrr VR256:$src)>; } defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", @@ -2260,6 +2277,14 @@ defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd", SSEPackedDouble>, PD; +let Predicates = [UseSSE2] in { + // Also support integer VTs to avoid a int->fp bitcast in the DAG. + def : Pat<(X86movmsk (v4i32 VR128:$src)), + (MOVMSKPSrr VR128:$src)>; + def : Pat<(X86movmsk (v2i64 VR128:$src)), + (MOVMSKPDrr VR128:$src)>; +} + //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Logical Instructions //===---------------------------------------------------------------------===// @@ -2284,9 +2309,8 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (OpVT (OpNode RC:$src1, - (bitconvert (memop_frag addr:$src2)))))]>, - Sched<[sched.Folded, ReadAfterLd]>; + [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, + Sched<[sched.Folded, sched.ReadAfterFold]>; } } // ExeDomain = SSEPackedInt @@ -2296,16 +2320,16 @@ multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, Predicate prd> { let Predicates = [HasAVX, prd] in defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, - VR128, loadv2i64, i128mem, sched.XMM, + VR128, load, i128mem, sched.XMM, IsCommutable, 0>, VEX_4V, VEX_WIG; let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, - memopv2i64, i128mem, sched.XMM, IsCommutable, 1>; + memop, i128mem, sched.XMM, IsCommutable, 1>; let Predicates = [HasAVX2, prd] in defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, - OpVT256, VR256, loadv4i64, i256mem, sched.YMM, + OpVT256, VR256, load, i256mem, sched.YMM, IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG; } @@ -2365,24 +2389,136 @@ defm XOR : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>; let isCommutable = 0 in defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>; +let Predicates = [HasAVX2, NoVLX] in { + def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)), + (VPANDYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)), + (VPANDYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)), + (VPANDYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)), + (VPORYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)), + (VPORYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)), + (VPORYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)), + (VPXORYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)), + (VPXORYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)), + (VPXORYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)), + (VPANDNYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)), + (VPANDNYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)), + (VPANDNYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)), + (VPANDYrm VR256:$src1, addr:$src2)>; + def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)), + (VPANDYrm VR256:$src1, addr:$src2)>; + def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)), + (VPANDYrm VR256:$src1, addr:$src2)>; + + def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)), + (VPORYrm VR256:$src1, addr:$src2)>; + def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)), + (VPORYrm VR256:$src1, addr:$src2)>; + def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)), + (VPORYrm VR256:$src1, addr:$src2)>; + + def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)), + (VPXORYrm VR256:$src1, addr:$src2)>; + def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)), + (VPXORYrm VR256:$src1, addr:$src2)>; + def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)), + (VPXORYrm VR256:$src1, addr:$src2)>; + + def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)), + (VPANDNYrm VR256:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)), + (VPANDNYrm VR256:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)), + (VPANDNYrm VR256:$src1, addr:$src2)>; +} + // If only AVX1 is supported, we need to handle integer operations with // floating point instructions since the integer versions aren't available. let Predicates = [HasAVX1Only] in { + def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)), + (VANDPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)), + (VANDPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)), + (VANDPSYrr VR256:$src1, VR256:$src2)>; def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)), (VANDPSYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)), + (VORPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)), + (VORPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)), + (VORPSYrr VR256:$src1, VR256:$src2)>; def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)), (VORPSYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)), + (VXORPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)), + (VXORPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)), + (VXORPSYrr VR256:$src1, VR256:$src2)>; def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)), (VXORPSYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)), + (VANDNPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)), + (VANDNPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)), + (VANDNPSYrr VR256:$src1, VR256:$src2)>; def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)), (VANDNPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)), + (VANDPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)), + (VANDPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)), + (VANDPSYrm VR256:$src1, addr:$src2)>; def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)), (VANDPSYrm VR256:$src1, addr:$src2)>; + + def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)), + (VORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)), + (VORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)), + (VORPSYrm VR256:$src1, addr:$src2)>; def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)), (VORPSYrm VR256:$src1, addr:$src2)>; + + def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)), + (VXORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)), + (VXORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)), + (VXORPSYrm VR256:$src1, addr:$src2)>; def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)), (VXORPSYrm VR256:$src1, addr:$src2)>; + + def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)), + (VANDNPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)), + (VANDNPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)), + (VANDNPSYrm VR256:$src1, addr:$src2)>; def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)), (VANDNPSYrm VR256:$src1, addr:$src2)>; } @@ -2480,6 +2616,122 @@ let Predicates = [UseSSE2] in { FR64)>; } +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)), + (VPANDrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)), + (VPANDrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)), + (VPANDrr VR128:$src1, VR128:$src2)>; + + def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)), + (VPORrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)), + (VPORrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)), + (VPORrr VR128:$src1, VR128:$src2)>; + + def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)), + (VPXORrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)), + (VPXORrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)), + (VPXORrr VR128:$src1, VR128:$src2)>; + + def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)), + (VPANDNrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)), + (VPANDNrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)), + (VPANDNrr VR128:$src1, VR128:$src2)>; + + def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)), + (VPANDrm VR128:$src1, addr:$src2)>; + def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)), + (VPANDrm VR128:$src1, addr:$src2)>; + def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)), + (VPANDrm VR128:$src1, addr:$src2)>; + + def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)), + (VPORrm VR128:$src1, addr:$src2)>; + def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)), + (VPORrm VR128:$src1, addr:$src2)>; + def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)), + (VPORrm VR128:$src1, addr:$src2)>; + + def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)), + (VPXORrm VR128:$src1, addr:$src2)>; + def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)), + (VPXORrm VR128:$src1, addr:$src2)>; + def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)), + (VPXORrm VR128:$src1, addr:$src2)>; + + def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)), + (VPANDNrm VR128:$src1, addr:$src2)>; + def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)), + (VPANDNrm VR128:$src1, addr:$src2)>; + def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)), + (VPANDNrm VR128:$src1, addr:$src2)>; +} + +let Predicates = [UseSSE2] in { + def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)), + (PANDrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)), + (PANDrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)), + (PANDrr VR128:$src1, VR128:$src2)>; + + def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)), + (PORrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)), + (PORrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)), + (PORrr VR128:$src1, VR128:$src2)>; + + def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)), + (PXORrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)), + (PXORrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)), + (PXORrr VR128:$src1, VR128:$src2)>; + + def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)), + (PANDNrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)), + (PANDNrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)), + (PANDNrr VR128:$src1, VR128:$src2)>; + + def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)), + (PANDrm VR128:$src1, addr:$src2)>; + def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)), + (PANDrm VR128:$src1, addr:$src2)>; + def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)), + (PANDrm VR128:$src1, addr:$src2)>; + + def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)), + (PORrm VR128:$src1, addr:$src2)>; + def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)), + (PORrm VR128:$src1, addr:$src2)>; + def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)), + (PORrm VR128:$src1, addr:$src2)>; + + def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)), + (PXORrm VR128:$src1, addr:$src2)>; + def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)), + (PXORrm VR128:$src1, addr:$src2)>; + def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)), + (PXORrm VR128:$src1, addr:$src2)>; + + def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)), + (PANDNrm VR128:$src1, addr:$src2)>; + def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)), + (PANDNrm VR128:$src1, addr:$src2)>; + def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)), + (PANDNrm VR128:$src1, addr:$src2)>; +} + // Patterns for packed operations when we don't have integer type available. def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)), (ANDPSrr VR128:$src1, VR128:$src2)>; @@ -2713,7 +2965,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1), !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), [(set RC:$dst, (OpNode (load addr:$src1)))], d>, - Sched<[sched.Folded, ReadAfterLd]>, + Sched<[sched.Folded]>, Requires<[target, OptForSize]>; let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in { @@ -2723,7 +2975,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, let mayLoad = 1 in def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -2777,7 +3029,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, let mayLoad = 1 in def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [], d>, Sched<[sched.Folded, ReadAfterLd]>; + [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>; let isCodeGenOnly = 1, ExeDomain = d in { def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), @@ -2787,7 +3039,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, Sched<[sched.Folded, ReadAfterLd]>; + []>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -3306,6 +3558,19 @@ def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}", let Predicates = [HasAVX, NoVLX] in { // Additional patterns for other integer sizes. + def : Pat<(alignedloadv4i32 addr:$src), + (VMOVDQArm addr:$src)>; + def : Pat<(alignedloadv8i16 addr:$src), + (VMOVDQArm addr:$src)>; + def : Pat<(alignedloadv16i8 addr:$src), + (VMOVDQArm addr:$src)>; + def : Pat<(loadv4i32 addr:$src), + (VMOVDQUrm addr:$src)>; + def : Pat<(loadv8i16 addr:$src), + (VMOVDQUrm addr:$src)>; + def : Pat<(loadv16i8 addr:$src), + (VMOVDQUrm addr:$src)>; + def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), (VMOVDQAmr addr:$dst, VR128:$src)>; def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), @@ -3345,8 +3610,8 @@ multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), - (bitconvert (memop_frag addr:$src2)))))]>, - Sched<[sched.Folded, ReadAfterLd]>; + (memop_frag addr:$src2))))]>, + Sched<[sched.Folded, sched.ReadAfterFold]>; } } // ExeDomain = SSEPackedInt @@ -3358,13 +3623,13 @@ defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, SchedWriteVecALU, 1, NoVLX>; defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, SchedWriteVecALU, 1, NoVLX>; -defm PADDSB : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8, +defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; -defm PADDSW : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16, +defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; -defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8, +defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; -defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16, +defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; @@ -3380,13 +3645,13 @@ defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, SchedWriteVecALU, 0, NoVLX>; defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, SchedWriteVecALU, 0, NoVLX>; -defm PSUBSB : PDI_binop_all<0xE8, "psubsb", X86subs, v16i8, v32i8, +defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8, SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; -defm PSUBSW : PDI_binop_all<0xE9, "psubsw", X86subs, v8i16, v16i16, +defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16, SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; -defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8, +defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8, SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; -defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16, +defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16, SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; @@ -3405,28 +3670,28 @@ defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, let Predicates = [HasAVX, NoVLX_Or_NoBWI] in defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, - loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>, + load, i128mem, SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16, - VR256, loadv4i64, i256mem, SchedWriteVecIMul.YMM, + VR256, load, i256mem, SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, - memopv2i64, i128mem, SchedWriteVecIMul.XMM>; + memop, i128mem, SchedWriteVecIMul.XMM>; let Predicates = [HasAVX, NoVLX_Or_NoBWI] in defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, - loadv2i64, i128mem, SchedWritePSADBW.XMM, 0>, + load, i128mem, SchedWritePSADBW.XMM, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, - loadv4i64, i256mem, SchedWritePSADBW.YMM, 0>, + load, i256mem, SchedWritePSADBW.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, - memopv2i64, i128mem, SchedWritePSADBW.XMM>; + memop, i128mem, SchedWritePSADBW.XMM>; //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Logical Instructions @@ -3453,8 +3718,8 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (DstVT (OpNode RC:$src1, - (SrcVT (bitconvert (ld_frag addr:$src2))))))]>, - Sched<[sched.Folded, ReadAfterLd]>; + (SrcVT (ld_frag addr:$src2)))))]>, + Sched<[sched.Folded, sched.ReadAfterFold]>; def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), !if(Is2Addr, @@ -3473,16 +3738,16 @@ multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm, let Predicates = [HasAVX, prd] in defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM, - DstVT128, SrcVT, loadv2i64, 0>, VEX_4V, VEX_WIG; + DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2, prd] in defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM, - DstVT256, SrcVT, loadv2i64, 0>, VEX_4V, VEX_L, + DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT, - memopv2i64>; + memop>; } multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr, @@ -3582,7 +3847,7 @@ let Predicates = [HasAVX, prd] in { !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)), + (vt128 (OpNode (load addr:$src1), (i8 imm:$src2))))]>, VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; } @@ -3600,7 +3865,7 @@ let Predicates = [HasAVX2, prd] in { !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, - (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)), + (vt256 (OpNode (load addr:$src1), (i8 imm:$src2))))]>, VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; } @@ -3618,7 +3883,7 @@ let Predicates = [UseSSE2] in { !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)), + (vt128 (OpNode (memop addr:$src1), (i8 imm:$src2))))]>, Sched<[sched.XMM.Folded]>; } @@ -3658,8 +3923,8 @@ multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (OutVT (OpNode (ArgVT RC:$src1), - (bitconvert (ld_frag addr:$src2)))))]>, - Sched<[sched.Folded, ReadAfterLd]>; + (ld_frag addr:$src2))))]>, + Sched<[sched.Folded, sched.ReadAfterFold]>; } multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, @@ -3683,53 +3948,53 @@ multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (OutVT (OpNode (ArgVT RC:$src1), - (bitconvert (ld_frag addr:$src2)))))]>, - Sched<[sched.Folded, ReadAfterLd]>; + (ld_frag addr:$src2))))]>, + Sched<[sched.Folded, sched.ReadAfterFold]>; } let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L; } let Constraints = "$src1 = $dst" in { defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; } } // ExeDomain = SSEPackedInt @@ -3754,89 +4019,88 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, !if(Is2Addr, !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (vt (OpNode RC:$src1, - (bitconvert (ld_frag addr:$src2)))))]>, - Sched<[sched.Folded, ReadAfterLd]>; + [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, + Sched<[sched.Folded, sched.ReadAfterFold]>; } let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; } let Predicates = [HasAVX, NoVLX] in { defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; } let Predicates = [HasAVX2, NoVLX] in { defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; } let Constraints = "$src1 = $dst" in { defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; } } // ExeDomain = SSEPackedInt @@ -3864,7 +4128,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { [(set VR128:$dst, (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), imm:$src3))]>, - Sched<[WriteVecInsertLd, ReadAfterLd]>; + Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; } // Extract @@ -4155,7 +4419,7 @@ let Predicates = [UseAVX] in { (VMOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), (VMOVDI2PDIrm addr:$src)>; - def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))), (VMOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzload addr:$src)), (VMOVDI2PDIrm addr:$src)>; @@ -4180,7 +4444,7 @@ let Predicates = [UseSSE2] in { (MOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), (MOVDI2PDIrm addr:$src)>; - def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))), (MOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzload addr:$src)), (MOVDI2PDIrm addr:$src)>; @@ -4335,30 +4599,30 @@ defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i32 (X86Movshdup VR128:$src)), (VMOVSHDUPrr VR128:$src)>; - def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (X86Movshdup (load addr:$src))), (VMOVSHDUPrm addr:$src)>; def : Pat<(v4i32 (X86Movsldup VR128:$src)), (VMOVSLDUPrr VR128:$src)>; - def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (X86Movsldup (load addr:$src))), (VMOVSLDUPrm addr:$src)>; def : Pat<(v8i32 (X86Movshdup VR256:$src)), (VMOVSHDUPYrr VR256:$src)>; - def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))), + def : Pat<(v8i32 (X86Movshdup (load addr:$src))), (VMOVSHDUPYrm addr:$src)>; def : Pat<(v8i32 (X86Movsldup VR256:$src)), (VMOVSLDUPYrr VR256:$src)>; - def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))), + def : Pat<(v8i32 (X86Movsldup (load addr:$src))), (VMOVSLDUPYrm addr:$src)>; } let Predicates = [UseSSE3] in { def : Pat<(v4i32 (X86Movshdup VR128:$src)), (MOVSHDUPrr VR128:$src)>; - def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))), + def : Pat<(v4i32 (X86Movshdup (memop addr:$src))), (MOVSHDUPrm addr:$src)>; def : Pat<(v4i32 (X86Movsldup VR128:$src)), (MOVSLDUPrr VR128:$src)>; - def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))), + def : Pat<(v4i32 (X86Movsldup (memop addr:$src))), (MOVSLDUPrm addr:$src)>; } @@ -4405,12 +4669,16 @@ defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>; let Predicates = [HasAVX, NoVLX] in { def : Pat<(X86Movddup (loadv2f64 addr:$src)), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; + def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; } let Predicates = [UseSSE3] in { // No need for aligned memory as this only loads 64-bits. def : Pat<(X86Movddup (loadv2f64 addr:$src)), (MOVDDUPrm addr:$src)>; + def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))), + (MOVDDUPrm addr:$src)>; } //===---------------------------------------------------------------------===// @@ -4453,7 +4721,7 @@ multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } let Predicates = [HasAVX] in { @@ -4504,7 +4772,7 @@ multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, X86MemOperand x86memop, SDNode OpNode, @@ -4522,7 +4790,7 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } let Predicates = [HasAVX] in { @@ -4580,7 +4848,7 @@ multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt, (ins i128mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, - (vt (OpNode (bitconvert (ld_frag addr:$src)))))]>, + (vt (OpNode (ld_frag addr:$src))))]>, Sched<[sched.XMM.Folded]>; } @@ -4597,19 +4865,19 @@ multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt, (ins i256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, - (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>, + (vt (OpNode (load addr:$src))))]>, Sched<[sched.YMM.Folded]>; } let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU, - loadv2i64>, VEX, VEX_WIG; + load>, VEX, VEX_WIG; defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU, - loadv2i64>, VEX, VEX_WIG; + load>, VEX, VEX_WIG; } let Predicates = [HasAVX, NoVLX] in { defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU, - loadv2i64>, VEX, VEX_WIG; + load>, VEX, VEX_WIG; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>, @@ -4623,11 +4891,11 @@ let Predicates = [HasAVX2, NoVLX] in { } defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU, - memopv2i64>; + memop>; defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU, - memopv2i64>; + memop>; defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU, - memopv2i64>; + memop>; //===---------------------------------------------------------------------===// // SSSE3 - Packed Binary Operator Instructions @@ -4652,9 +4920,8 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, - (DstVT (OpNode (OpVT RC:$src1), - (bitconvert (memop_frag addr:$src2)))))]>, - Sched<[sched.Folded, ReadAfterLd]>; + (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>, + Sched<[sched.Folded, sched.ReadAfterFold]>; } /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. @@ -4675,9 +4942,8 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, - (IntId128 VR128:$src1, - (bitconvert (ld_frag addr:$src2))))]>, - Sched<[sched.Folded, ReadAfterLd]>; + (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>, + Sched<[sched.Folded, sched.ReadAfterFold]>; } multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, @@ -4693,83 +4959,83 @@ multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, - (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>, - Sched<[sched.Folded, ReadAfterLd]>; + (IntId256 VR256:$src1, (load addr:$src2)))]>, + Sched<[sched.Folded, sched.ReadAfterFold]>; } let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in { let isCommutable = 0 in { defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8, - VR128, loadv2i64, i128mem, + VR128, load, i128mem, SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG; defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16, - v16i8, VR128, loadv2i64, i128mem, + v16i8, VR128, load, i128mem, SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; } defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16, - VR128, loadv2i64, i128mem, + VR128, load, i128mem, SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; } let ImmT = NoImm, Predicates = [HasAVX] in { let isCommutable = 0 in { defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128, - loadv2i64, i128mem, + load, i128mem, SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128, - loadv2i64, i128mem, + load, i128mem, SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128, - loadv2i64, i128mem, + load, i128mem, SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128, - loadv2i64, i128mem, + load, i128mem, SchedWritePHAdd.XMM, 0>, VEX_4V; defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", int_x86_ssse3_psign_b_128, - SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; + SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw", int_x86_ssse3_psign_w_128, - SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; + SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", int_x86_ssse3_psign_d_128, - SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; + SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", int_x86_ssse3_phadd_sw_128, - SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; + SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", int_x86_ssse3_phsub_sw_128, - SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; + SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG; } } let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { let isCommutable = 0 in { defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8, - VR256, loadv4i64, i256mem, + VR256, load, i256mem, SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16, - v32i8, VR256, loadv4i64, i256mem, + v32i8, VR256, load, i256mem, SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; } defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16, - VR256, loadv4i64, i256mem, + VR256, load, i256mem, SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; } let ImmT = NoImm, Predicates = [HasAVX2] in { let isCommutable = 0 in { defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16, - VR256, loadv4i64, i256mem, + VR256, load, i256mem, SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256, - loadv4i64, i256mem, + load, i256mem, SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16, - VR256, loadv4i64, i256mem, + VR256, load, i256mem, SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256, - loadv4i64, i256mem, + load, i256mem, SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L; defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b, SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; @@ -4790,33 +5056,33 @@ let isCommutable = 0 in { let ImmT = NoImm, Constraints = "$src1 = $dst" in { let isCommutable = 0 in { defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128, - memopv2i64, i128mem, SchedWritePHAdd.XMM>; + memop, i128mem, SchedWritePHAdd.XMM>; defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128, - memopv2i64, i128mem, SchedWritePHAdd.XMM>; + memop, i128mem, SchedWritePHAdd.XMM>; defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128, - memopv2i64, i128mem, SchedWritePHAdd.XMM>; + memop, i128mem, SchedWritePHAdd.XMM>; defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128, - memopv2i64, i128mem, SchedWritePHAdd.XMM>; + memop, i128mem, SchedWritePHAdd.XMM>; defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128, - SchedWriteVecALU.XMM, memopv2i64>; + SchedWriteVecALU.XMM, memop>; defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128, - SchedWriteVecALU.XMM, memopv2i64>; + SchedWriteVecALU.XMM, memop>; defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128, - SchedWriteVecALU.XMM, memopv2i64>; + SchedWriteVecALU.XMM, memop>; defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128, - memopv2i64, i128mem, SchedWriteVarShuffle.XMM>; + memop, i128mem, SchedWriteVarShuffle.XMM>; defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", int_x86_ssse3_phadd_sw_128, - SchedWritePHAdd.XMM, memopv2i64>; + SchedWritePHAdd.XMM, memop>; defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", int_x86_ssse3_phsub_sw_128, - SchedWritePHAdd.XMM, memopv2i64>; + SchedWritePHAdd.XMM, memop>; defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16, - v16i8, VR128, memopv2i64, i128mem, + v16i8, VR128, memop, i128mem, SchedWriteVecIMul.XMM>; } defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16, - VR128, memopv2i64, i128mem, SchedWriteVecIMul.XMM>; + VR128, memop, i128mem, SchedWriteVecIMul.XMM>; } //===---------------------------------------------------------------------===// @@ -4843,20 +5109,20 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC, !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set RC:$dst, (VT (X86PAlignr RC:$src1, - (bitconvert (memop_frag addr:$src2)), + (memop_frag addr:$src2), (i8 imm:$src3))))]>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } } let Predicates = [HasAVX, NoVLX_Or_NoBWI] in - defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, loadv2i64, i128mem, + defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem, SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in - defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, loadv4i64, i256mem, + defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem, SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in - defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memopv2i64, i128mem, + defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem, SchedWriteShuffle.XMM>; //===---------------------------------------------------------------------===// @@ -4936,34 +5202,72 @@ defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>; defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>; -// AVX2 Patterns -multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> { +// Patterns that we also need for any_extend. +// Any_extend_vector_inreg is currently legalized to zero_extend_vector_inreg. +multiclass SS41I_pmovx_avx2_patterns_base<string OpcPrefix, SDNode ExtOp> { // Register-Register patterns - let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { - def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), - (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>; + let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { + def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>; } - let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))), + + let Predicates = [HasAVX2, NoVLX] in { + def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>; + + def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), + (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>; + } + + // AVX2 Register-Memory patterns + let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { + def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + } + + let Predicates = [HasAVX2, NoVLX] in { + def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + } +} + +// AVX2 Patterns +multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, + SDNode ExtOp, SDNode InVecOp> : + SS41I_pmovx_avx2_patterns_base<OpcPrefix, ExtOp> { + + // Register-Register patterns + let Predicates = [HasAVX2, NoVLX] in { + def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))), (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>; - def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))), + def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))), (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), - (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>; - def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))), + def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))), (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>; - - def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), - (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>; } // Simple Register-Memory patterns - let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { + let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; } - let Predicates = [HasAVX, NoVLX] in { + let Predicates = [HasAVX2, NoVLX] in { def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), @@ -4979,60 +5283,39 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO } // AVX2 Register-Memory patterns - let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { - def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; - } - let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + let Predicates = [HasAVX2, NoVLX] in { + def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))), (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))), (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), - (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; - - def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))), (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; - - def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), - (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; } } -defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>; -defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>; +defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>; +defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>; +defm : SS41I_pmovx_avx2_patterns_base<"VPMOVZX", anyext>; // SSE4.1/AVX patterns. multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, @@ -5082,7 +5365,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, (!cast<I>(OpcPrefix#BWrm) addr:$src)>; def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#BWrm) addr:$src)>; - def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))), (!cast<I>(OpcPrefix#BWrm) addr:$src)>; } let Predicates = [HasAVX, NoVLX] in { @@ -5092,7 +5375,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, (!cast<I>(OpcPrefix#BDrm) addr:$src)>; def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#BDrm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))), (!cast<I>(OpcPrefix#BDrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))), @@ -5101,7 +5384,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, (!cast<I>(OpcPrefix#BQrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#BQrm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))), (!cast<I>(OpcPrefix#BQrm) addr:$src)>; def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), @@ -5112,7 +5395,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, (!cast<I>(OpcPrefix#WDrm) addr:$src)>; def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#WDrm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))), (!cast<I>(OpcPrefix#WDrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), @@ -5121,7 +5404,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, (!cast<I>(OpcPrefix#WQrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#WQrm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))), (!cast<I>(OpcPrefix#WQrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), @@ -5132,7 +5415,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, (!cast<I>(OpcPrefix#DQrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#DQrm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))), (!cast<I>(OpcPrefix#DQrm) addr:$src)>; } } @@ -5298,8 +5581,8 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), - imm:$src3))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>; + (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), imm:$src3))]>, + Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; } let Predicates = [HasAVX, NoBWI] in @@ -5324,8 +5607,8 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), - imm:$src3)))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>; + (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>, + Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; } let Predicates = [HasAVX, NoDQI] in @@ -5350,8 +5633,8 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), - imm:$src3)))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>; + (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>, + Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; } let Predicates = [HasAVX, NoDQI] in @@ -5383,7 +5666,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { (X86insertps VR128:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), imm:$src3))]>, - Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>; + Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>; } let ExeDomain = SSEPackedSingle in { @@ -5446,7 +5729,7 @@ let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in { (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3), !strconcat(OpcodeStr, "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, Sched<[sched.Folded, ReadAfterLd]>; + []>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedSingle, hasSideEffects = 0 let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { @@ -5461,7 +5744,7 @@ let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3), !strconcat(OpcodeStr, "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, Sched<[sched.Folded, ReadAfterLd]>; + []>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedDouble, hasSideEffects = 0 } @@ -5479,7 +5762,7 @@ let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in { (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, Sched<[sched.Folded, ReadAfterLd]>; + []>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedSingle, hasSideEffects = 0 let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { @@ -5494,7 +5777,7 @@ let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, Sched<[sched.Folded, ReadAfterLd]>; + []>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedDouble, hasSideEffects = 0 } @@ -5522,7 +5805,7 @@ let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in { "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in { @@ -5545,7 +5828,7 @@ let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in { "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 } @@ -5846,7 +6129,7 @@ def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), "vptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>, - Sched<[SchedWriteVecTest.XMM.Folded, ReadAfterLd]>, + Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>, VEX, VEX_WIG; def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2), @@ -5856,7 +6139,7 @@ def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2), def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2), "vptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>, - Sched<[SchedWriteVecTest.YMM.Folded, ReadAfterLd]>, + Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>, VEX, VEX_L, VEX_WIG; } @@ -5868,7 +6151,7 @@ def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), "ptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>, - Sched<[SchedWriteVecTest.XMM.Folded, ReadAfterLd]>; + Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>; } // The bit test instructions below are AVX only @@ -5882,7 +6165,7 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>, - Sched<[sched.Folded, ReadAfterLd]>, VEX; + Sched<[sched.Folded, sched.ReadAfterFold]>, VEX; } let Defs = [EFLAGS], Predicates = [HasAVX] in { @@ -5950,7 +6233,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, (ins i128mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, - (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>, + (v8i16 (OpNode (ld_frag addr:$src))))]>, Sched<[Sched.Folded]>; } @@ -5958,10 +6241,10 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, // model, although the naming is misleading. let Predicates = [HasAVX] in defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw", - X86phminpos, loadv2i64, + X86phminpos, load, WritePHMINPOS>, VEX, VEX_WIG; defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw", - X86phminpos, memopv2i64, + X86phminpos, memop, WritePHMINPOS>; /// SS48I_binop_rm - Simple SSE41 binary operator. @@ -5983,118 +6266,118 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, - (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>, - Sched<[sched.Folded, ReadAfterLd]>; + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, + Sched<[sched.Folded, sched.ReadAfterFold]>; } let Predicates = [HasAVX, NoVLX] in { defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128, - loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>, + load, i128mem, SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; } let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; } let Predicates = [HasAVX2, NoVLX] in { defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256, - loadv4i64, i256mem, SchedWriteVecIMul.YMM, 0>, + load, i256mem, SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; } let Constraints = "$src1 = $dst" in { defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + memop, i128mem, SchedWriteVecALU.XMM, 1>; defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + memop, i128mem, SchedWriteVecALU.XMM, 1>; defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + memop, i128mem, SchedWriteVecALU.XMM, 1>; defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + memop, i128mem, SchedWriteVecALU.XMM, 1>; defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + memop, i128mem, SchedWriteVecALU.XMM, 1>; defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + memop, i128mem, SchedWriteVecALU.XMM, 1>; defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + memop, i128mem, SchedWriteVecALU.XMM, 1>; defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + memop, i128mem, SchedWriteVecALU.XMM, 1>; defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128, - memopv2i64, i128mem, SchedWriteVecIMul.XMM, 1>; + memop, i128mem, SchedWriteVecIMul.XMM, 1>; } let Predicates = [HasAVX, NoVLX] in defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, - loadv2i64, i128mem, SchedWritePMULLD.XMM, 0>, + load, i128mem, SchedWritePMULLD.XMM, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX] in defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2, NoVLX] in defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, - loadv4i64, i256mem, SchedWritePMULLD.YMM, 0>, + load, i256mem, SchedWritePMULLD.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; let Predicates = [HasAVX2] in defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in { defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128, - memopv2i64, i128mem, SchedWritePMULLD.XMM, 1>; + memop, i128mem, SchedWritePMULLD.XMM, 1>; defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + memop, i128mem, SchedWriteVecALU.XMM, 1>; } /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate @@ -6120,9 +6403,8 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set RC:$dst, - (IntId RC:$src1, - (bitconvert (memop_frag addr:$src2)), imm:$src3))]>, - Sched<[sched.Folded, ReadAfterLd]>; + (IntId RC:$src1, (memop_frag addr:$src2), imm:$src3))]>, + Sched<[sched.Folded, sched.ReadAfterFold]>; } /// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate @@ -6148,9 +6430,8 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set RC:$dst, - (OpVT (OpNode RC:$src1, - (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, - Sched<[sched.Folded, ReadAfterLd]>; + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>, + Sched<[sched.Folded, sched.ReadAfterFold]>; } def BlendCommuteImm2 : SDNodeXForm<imm, [{ @@ -6171,28 +6452,28 @@ def BlendCommuteImm8 : SDNodeXForm<imm, [{ let Predicates = [HasAVX] in { let isCommutable = 0 in { defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, - VR128, loadv2i64, i128mem, 0, + VR128, load, i128mem, 0, SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG; } let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, - VR128, loadv4f32, f128mem, 0, + VR128, load, f128mem, 0, SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG; let ExeDomain = SSEPackedDouble in defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, - VR128, loadv2f64, f128mem, 0, + VR128, load, f128mem, 0, SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG; let ExeDomain = SSEPackedSingle in defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, - VR256, loadv8f32, i256mem, 0, + VR256, load, i256mem, 0, SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG; } let Predicates = [HasAVX2] in { let isCommutable = 0 in { defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, - VR256, loadv4i64, i256mem, 0, + VR256, load, i256mem, 0, SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG; } } @@ -6200,17 +6481,17 @@ let Predicates = [HasAVX2] in { let Constraints = "$src1 = $dst" in { let isCommutable = 0 in { defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, - VR128, memopv2i64, i128mem, 1, + VR128, memop, i128mem, 1, SchedWriteMPSAD.XMM>; } let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, - VR128, memopv4f32, f128mem, 1, + VR128, memop, f128mem, 1, SchedWriteDPPS.XMM>; let ExeDomain = SSEPackedDouble in defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, - VR128, memopv2f64, f128mem, 1, + VR128, memop, f128mem, 1, SchedWriteDPPD.XMM>; } @@ -6238,56 +6519,54 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in { !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set RC:$dst, - (OpVT (OpNode RC:$src1, - (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, - Sched<[sched.Folded, ReadAfterLd]>; + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>, + Sched<[sched.Folded, sched.ReadAfterFold]>; } // Pattern to commute if load is in first source. - def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)), - RC:$src1, imm:$src3)), + def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, imm:$src3)), (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2, (commuteXForm imm:$src3))>; } let Predicates = [HasAVX] in { defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32, - VR128, loadv4f32, f128mem, 0, SSEPackedSingle, + VR128, load, f128mem, 0, SSEPackedSingle, SchedWriteFBlend.XMM, BlendCommuteImm4>, VEX_4V, VEX_WIG; defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32, - VR256, loadv8f32, f256mem, 0, SSEPackedSingle, + VR256, load, f256mem, 0, SSEPackedSingle, SchedWriteFBlend.YMM, BlendCommuteImm8>, VEX_4V, VEX_L, VEX_WIG; defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64, - VR128, loadv2f64, f128mem, 0, SSEPackedDouble, + VR128, load, f128mem, 0, SSEPackedDouble, SchedWriteFBlend.XMM, BlendCommuteImm2>, VEX_4V, VEX_WIG; defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64, - VR256, loadv4f64, f256mem, 0, SSEPackedDouble, + VR256, load, f256mem, 0, SSEPackedDouble, SchedWriteFBlend.YMM, BlendCommuteImm4>, VEX_4V, VEX_L, VEX_WIG; defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16, - VR128, loadv2i64, i128mem, 0, SSEPackedInt, + VR128, load, i128mem, 0, SSEPackedInt, SchedWriteBlend.XMM, BlendCommuteImm8>, VEX_4V, VEX_WIG; } let Predicates = [HasAVX2] in { defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16, - VR256, loadv4i64, i256mem, 0, SSEPackedInt, + VR256, load, i256mem, 0, SSEPackedInt, SchedWriteBlend.YMM, BlendCommuteImm8>, VEX_4V, VEX_L, VEX_WIG; } defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32, - VR128, memopv4f32, f128mem, 1, SSEPackedSingle, + VR128, memop, f128mem, 1, SSEPackedSingle, SchedWriteFBlend.XMM, BlendCommuteImm4>; defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64, - VR128, memopv2f64, f128mem, 1, SSEPackedDouble, + VR128, memop, f128mem, 1, SSEPackedDouble, SchedWriteFBlend.XMM, BlendCommuteImm2>; defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16, - VR128, memopv2i64, i128mem, 1, SSEPackedInt, + VR128, memop, i128mem, 1, SSEPackedInt, SchedWriteBlend.XMM, BlendCommuteImm8>; // For insertion into the zero index (low half) of a 256-bit vector, it is @@ -6321,20 +6600,20 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, - (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)), + (IntId RC:$src1, (mem_frag addr:$src2), RC:$src3))], SSEPackedInt>, TAPD, VEX_4V, - Sched<[sched.Folded, ReadAfterLd, + Sched<[sched.Folded, sched.ReadAfterFold, // x86memop:$src2 ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, // RC::$src3 - ReadAfterLd]>; + sched.ReadAfterFold]>; } let Predicates = [HasAVX] in { let ExeDomain = SSEPackedDouble in { defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem, - loadv2f64, int_x86_sse41_blendvpd, + load, int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>; defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem, loadv4f64, int_x86_avx_blendv_pd_256, @@ -6342,20 +6621,20 @@ defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem, } // ExeDomain = SSEPackedDouble let ExeDomain = SSEPackedSingle in { defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem, - loadv4f32, int_x86_sse41_blendvps, + load, int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>; defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem, loadv8f32, int_x86_avx_blendv_ps_256, SchedWriteFVarBlend.YMM>, VEX_L; } // ExeDomain = SSEPackedSingle defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, - loadv2i64, int_x86_sse41_pblendvb, + load, int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>; } let Predicates = [HasAVX2] in { defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem, - loadv4i64, int_x86_avx2_pblendvb, + load, int_x86_avx2_pblendvb, SchedWriteVarBlend.YMM>, VEX_L; } @@ -6486,18 +6765,18 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), [(set VR128:$dst, (IntId VR128:$src1, - (bitconvert (mem_frag addr:$src2)), XMM0))]>, - Sched<[sched.Folded, ReadAfterLd]>; + (mem_frag addr:$src2), XMM0))]>, + Sched<[sched.Folded, sched.ReadAfterFold]>; } } let ExeDomain = SSEPackedDouble in -defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem, +defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memop, f128mem, int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>; let ExeDomain = SSEPackedSingle in -defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem, +defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memop, f128mem, int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>; -defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, +defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memop, i128mem, int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>; // Aliases with the implicit xmm0 argument @@ -6553,6 +6832,12 @@ let Predicates = [HasAVX2, NoVLX] in { (VMOVNTDQAYrm addr:$src)>; def : Pat<(v4i64 (alignednontemporalload addr:$src)), (VMOVNTDQAYrm addr:$src)>; + def : Pat<(v8i32 (alignednontemporalload addr:$src)), + (VMOVNTDQAYrm addr:$src)>; + def : Pat<(v16i16 (alignednontemporalload addr:$src)), + (VMOVNTDQAYrm addr:$src)>; + def : Pat<(v32i8 (alignednontemporalload addr:$src)), + (VMOVNTDQAYrm addr:$src)>; } let Predicates = [HasAVX, NoVLX] in { @@ -6562,6 +6847,12 @@ let Predicates = [HasAVX, NoVLX] in { (VMOVNTDQArm addr:$src)>; def : Pat<(v2i64 (alignednontemporalload addr:$src)), (VMOVNTDQArm addr:$src)>; + def : Pat<(v4i32 (alignednontemporalload addr:$src)), + (VMOVNTDQArm addr:$src)>; + def : Pat<(v8i16 (alignednontemporalload addr:$src)), + (VMOVNTDQArm addr:$src)>; + def : Pat<(v16i8 (alignednontemporalload addr:$src)), + (VMOVNTDQArm addr:$src)>; } let Predicates = [UseSSE41] in { @@ -6571,6 +6862,12 @@ let Predicates = [UseSSE41] in { (MOVNTDQArm addr:$src)>; def : Pat<(v2i64 (alignednontemporalload addr:$src)), (MOVNTDQArm addr:$src)>; + def : Pat<(v4i32 (alignednontemporalload addr:$src)), + (MOVNTDQArm addr:$src)>; + def : Pat<(v8i16 (alignednontemporalload addr:$src)), + (MOVNTDQArm addr:$src)>; + def : Pat<(v16i8 (alignednontemporalload addr:$src)), + (MOVNTDQArm addr:$src)>; } } // AddedComplexity @@ -6598,22 +6895,22 @@ multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } let Predicates = [HasAVX] in defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2] in defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM>; + memop, i128mem, SchedWriteVecALU.XMM>; //===----------------------------------------------------------------------===// // SSE4.2 - String/text Processing Instructions @@ -6628,7 +6925,7 @@ multiclass pcmpistrm_SS42AI<string asm> { def rm :SS42AI<0x62, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), - []>, Sched<[WritePCmpIStrM.Folded, ReadAfterLd]>; + []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>; } let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in { @@ -6646,7 +6943,7 @@ multiclass SS42AI_pcmpestrm<string asm> { def rm : SS42AI<0x60, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src3, u8imm:$src5), !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), - []>, Sched<[WritePCmpEStrM.Folded, ReadAfterLd]>; + []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>; } let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { @@ -6664,7 +6961,7 @@ multiclass SS42AI_pcmpistri<string asm> { def rm : SS42AI<0x63, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), - []>, Sched<[WritePCmpIStrI.Folded, ReadAfterLd]>; + []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>; } let Defs = [ECX, EFLAGS], hasSideEffects = 0 in { @@ -6682,7 +6979,7 @@ multiclass SS42AI_pcmpestri<string asm> { def rm : SS42AI<0x61, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src3, u8imm:$src5), !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), - []>, Sched<[WritePCmpEStrI.Folded, ReadAfterLd]>; + []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>; } let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { @@ -6712,7 +7009,7 @@ class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut, SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2), !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"), [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>, - Sched<[WriteCRC32.Folded, ReadAfterLd]>; + Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>; let Constraints = "$src1 = $dst" in { def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem, @@ -6764,10 +7061,10 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), [!if(UsesXMM0, (set VR128:$dst, (IntId VR128:$src1, - (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)), + (memop addr:$src2), XMM0)), (set VR128:$dst, (IntId VR128:$src1, - (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8, - Sched<[sched.Folded, ReadAfterLd]>; + (memop addr:$src2))))]>, T8, + Sched<[sched.Folded, sched.ReadAfterFold]>; } let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { @@ -6783,9 +7080,10 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, - (bc_v4i32 (memopv2i64 addr:$src2)), + (memop addr:$src2), (i8 imm:$src3)))]>, TA, - Sched<[SchedWriteVecIMul.XMM.Folded, ReadAfterLd]>; + Sched<[SchedWriteVecIMul.XMM.Folded, + SchedWriteVecIMul.XMM.ReadAfterFold]>; defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte, SchedWriteVecIMul.XMM>; @@ -6828,46 +7126,46 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, MemOp:$src2), "", [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>, - Sched<[WriteAESDecEnc.Folded, ReadAfterLd]>; + Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>; } } // Perform One Round of an AES Encryption/Decryption Flow let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in { defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", - int_x86_aesni_aesenc, loadv2i64>, VEX_4V, VEX_WIG; + int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG; defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", - int_x86_aesni_aesenclast, loadv2i64>, VEX_4V, VEX_WIG; + int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG; defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", - int_x86_aesni_aesdec, loadv2i64>, VEX_4V, VEX_WIG; + int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG; defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", - int_x86_aesni_aesdeclast, loadv2i64>, VEX_4V, VEX_WIG; + int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG; } let Predicates = [NoVLX, HasVAES] in { defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc", - int_x86_aesni_aesenc_256, loadv4i64, 0, VR256, + int_x86_aesni_aesenc_256, load, 0, VR256, i256mem>, VEX_4V, VEX_L, VEX_WIG; defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast", - int_x86_aesni_aesenclast_256, loadv4i64, 0, VR256, + int_x86_aesni_aesenclast_256, load, 0, VR256, i256mem>, VEX_4V, VEX_L, VEX_WIG; defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec", - int_x86_aesni_aesdec_256, loadv4i64, 0, VR256, + int_x86_aesni_aesdec_256, load, 0, VR256, i256mem>, VEX_4V, VEX_L, VEX_WIG; defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast", - int_x86_aesni_aesdeclast_256, loadv4i64, 0, VR256, + int_x86_aesni_aesdeclast_256, load, 0, VR256, i256mem>, VEX_4V, VEX_L, VEX_WIG; } let Constraints = "$src1 = $dst" in { defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", - int_x86_aesni_aesenc, memopv2i64, 1>; + int_x86_aesni_aesenc, memop, 1>; defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast", - int_x86_aesni_aesenclast, memopv2i64, 1>; + int_x86_aesni_aesenclast, memop, 1>; defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", - int_x86_aesni_aesdec, memopv2i64, 1>; + int_x86_aesni_aesdec, memop, 1>; defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast", - int_x86_aesni_aesdeclast, memopv2i64, 1>; + int_x86_aesni_aesdeclast, memop, 1>; } // Perform the AES InvMixColumn Transformation @@ -6881,7 +7179,7 @@ let Predicates = [HasAVX, HasAES] in { def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1), "vaesimc\t{$src1, $dst|$dst, $src1}", - [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>, + [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>, Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG; } def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), @@ -6892,7 +7190,7 @@ def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1), "aesimc\t{$src1, $dst|$dst, $src1}", - [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>, + [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>, Sched<[WriteAESIMC.Folded]>; // AES Round Key Generation Assist @@ -6907,7 +7205,7 @@ let Predicates = [HasAVX, HasAES] in { (ins i128mem:$src1, u8imm:$src2), "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>, + (int_x86_aesni_aeskeygenassist (load addr:$src1), imm:$src2))]>, Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG; } def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), @@ -6920,7 +7218,7 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>, + (int_x86_aesni_aeskeygenassist (memop addr:$src1), imm:$src2))]>, Sched<[WriteAESKeyGen.Folded]>; //===----------------------------------------------------------------------===// @@ -6948,12 +7246,12 @@ let Predicates = [NoAVX, HasPCLMUL] in { (ins VR128:$src1, i128mem:$src2, u8imm:$src3), "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, - (int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2), + (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2), imm:$src3))]>, - Sched<[WriteCLMul.Folded, ReadAfterLd]>; + Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; } // Constraints = "$src1 = $dst" - def : Pat<(int_x86_pclmulqdq (memopv2i64 addr:$src2), VR128:$src1, + def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1, (i8 imm:$src3)), (PCLMULQDQrm VR128:$src1, addr:$src2, (PCLMULCommuteImm imm:$src3))>; @@ -6986,7 +7284,7 @@ multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp, "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set RC:$dst, (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>, - Sched<[WriteCLMul.Folded, ReadAfterLd]>; + Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; // We can commute a load in the first operand by swapping the sources and // rotating the immediate. @@ -6996,11 +7294,11 @@ multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp, } let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in -defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, loadv2i64, +defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load, int_x86_pclmulqdq>, VEX_4V, VEX_WIG; let Predicates = [NoVLX, HasVPCLMULQDQ] in -defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, loadv4i64, +defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load, int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG; multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC, @@ -7156,11 +7454,11 @@ def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), let Predicates = [HasAVX2, NoVLX] in { def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), (VBROADCASTI128 addr:$src)>; -def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))), +def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))), (VBROADCASTI128 addr:$src)>; -def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), +def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))), (VBROADCASTI128 addr:$src)>; -def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), +def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))), (VBROADCASTI128 addr:$src)>; } @@ -7174,11 +7472,11 @@ def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))), let Predicates = [HasAVX1Only] in { def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), (VBROADCASTF128 addr:$src)>; -def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))), +def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))), (VBROADCASTF128 addr:$src)>; -def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), +def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))), (VBROADCASTF128 addr:$src)>; -def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), +def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))), (VBROADCASTF128 addr:$src)>; } @@ -7194,7 +7492,7 @@ let mayLoad = 1 in def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f128mem:$src2, u8imm:$src3), "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, Sched<[WriteFShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; + []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; } // To create a 256-bit all ones value, we should produce VCMPTRUEPS @@ -7211,7 +7509,7 @@ multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To, (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2, (INSERT_get_vinsert128_imm VR256:$ins))>; def : Pat<(vinsert128_insert:$ins (To VR256:$src1), - (From (bitconvert (memop_frag addr:$src2))), + (From (memop_frag addr:$src2)), (iPTR imm)), (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, (INSERT_get_vinsert128_imm VR256:$ins))>; @@ -7224,9 +7522,9 @@ let Predicates = [HasAVX, NoVLX] in { let Predicates = [HasAVX1Only] in { defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>; - defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv2i64>; - defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv2i64>; - defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv2i64>; + defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv4i32>; + defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>; + defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv16i8>; } //===----------------------------------------------------------------------===// @@ -7315,7 +7613,7 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop_f, - X86MemOperand x86memop_i, PatFrag i_frag, + X86MemOperand x86memop_i, ValueType f_vt, ValueType i_vt, X86FoldableSchedWrite sched, X86FoldableSchedWrite varsched> { @@ -7329,8 +7627,8 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, (ins RC:$src1, x86memop_i:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, - (i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V, - Sched<[varsched.Folded, ReadAfterLd]>; + (i_vt (load addr:$src2)))))]>, VEX_4V, + Sched<[varsched.Folded, sched.ReadAfterFold]>; def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), @@ -7348,18 +7646,18 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, let ExeDomain = SSEPackedSingle in { defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, - loadv2i64, v4f32, v4i32, SchedWriteFShuffle.XMM, + v4f32, v4i32, SchedWriteFShuffle.XMM, SchedWriteFVarShuffle.XMM>; defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, - loadv4i64, v8f32, v8i32, SchedWriteFShuffle.YMM, + v8f32, v8i32, SchedWriteFShuffle.YMM, SchedWriteFVarShuffle.YMM>, VEX_L; } let ExeDomain = SSEPackedDouble in { defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem, - loadv2i64, v2f64, v2i64, SchedWriteFShuffle.XMM, + v2f64, v2i64, SchedWriteFShuffle.XMM, SchedWriteFVarShuffle.XMM>; defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem, - loadv4i64, v4f64, v4i64, SchedWriteFShuffle.YMM, + v4f64, v4i64, SchedWriteFShuffle.YMM, SchedWriteFVarShuffle.YMM>, VEX_L; } @@ -7380,7 +7678,7 @@ def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2), (i8 imm:$src3)))]>, VEX_4V, VEX_L, - Sched<[WriteFShuffle256Ld, ReadAfterLd]>; + Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>; } // Immediate transform to help with commuting. @@ -7440,8 +7738,7 @@ multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, let hasSideEffects = 0, mayLoad = 1 in def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", - [(set RC:$dst, (X86cvtph2ps (bc_v8i16 - (loadv2i64 addr:$src))))]>, + [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>, T8PD, VEX, Sched<[sched.Folded]>; } @@ -7515,7 +7812,7 @@ let Predicates = [HasF16C, NoVLX] in { /// AVX2_blend_rmi - AVX2 blend with 8-bit immediate multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, ValueType OpVT, X86FoldableSchedWrite sched, - RegisterClass RC, PatFrag memop_frag, + RegisterClass RC, X86MemOperand x86memop, SDNodeXForm commuteXForm> { let isCommutable = 1 in def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), @@ -7529,22 +7826,20 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, - (OpVT (OpNode RC:$src1, - (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, - Sched<[sched.Folded, ReadAfterLd]>, VEX_4V; + (OpVT (OpNode RC:$src1, (load addr:$src2), imm:$src3)))]>, + Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V; // Pattern to commute if load is in first source. - def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)), - RC:$src1, imm:$src3)), + def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, imm:$src3)), (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2, (commuteXForm imm:$src3))>; } defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32, - SchedWriteBlend.XMM, VR128, loadv2i64, i128mem, + SchedWriteBlend.XMM, VR128, i128mem, BlendCommuteImm4>; defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32, - SchedWriteBlend.YMM, VR256, loadv4i64, i256mem, + SchedWriteBlend.YMM, VR256, i256mem, BlendCommuteImm8>, VEX_L; // For insertion into the zero index (low half) of a 256-bit vector, it is @@ -7743,6 +8038,8 @@ let Predicates = [HasAVX, NoVLX] in { (VMOVDDUPrr VR128:$src)>; def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))), (VMOVDDUPrm addr:$src)>; + def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload addr:$src)))), + (VMOVDDUPrm addr:$src)>; } let Predicates = [HasAVX1Only] in { @@ -7778,7 +8075,7 @@ let Predicates = [HasAVX1Only] in { // VPERM - Permute instructions // -multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, +multiclass avx2_perm<bits<8> opc, string OpcodeStr, ValueType OpVT, X86FoldableSchedWrite Sched, X86MemOperand memOp> { let Predicates = [HasAVX2, NoVLX] in { @@ -7795,16 +8092,14 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OpVT (X86VPermv VR256:$src1, - (bitconvert (mem_frag addr:$src2)))))]>, - Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L; + (load addr:$src2))))]>, + Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L; } } -defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteVarShuffle256, - i256mem>; +defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>; let ExeDomain = SSEPackedSingle in -defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFVarShuffle256, - f256mem>; +defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>; multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, ValueType OpVT, X86FoldableSchedWrite Sched, @@ -7824,7 +8119,7 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, [(set VR256:$dst, (OpVT (X86VPermi (mem_frag addr:$src1), (i8 imm:$src2))))]>, - Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L; + Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L; } } @@ -7849,7 +8144,7 @@ def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2), (i8 imm:$src3)))]>, - Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; + Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; let Predicates = [HasAVX2] in def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2), @@ -7869,14 +8164,14 @@ let mayLoad = 1 in def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i128mem:$src2, u8imm:$src3), "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; + []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; } let Predicates = [HasAVX2, NoVLX] in { defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>; - defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv2i64>; - defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv2i64>; - defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv2i64>; + defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv4i32>; + defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>; + defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv16i8>; } //===----------------------------------------------------------------------===// @@ -7941,7 +8236,7 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT, ValueType MaskVT, string BlendStr, ValueType ZeroVT> { // masked store - def: Pat<(X86mstore addr:$ptr, (MaskVT RC:$mask), (VT RC:$src)), + def: Pat<(X86mstore (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)), (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>; // masked load def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)), @@ -8035,8 +8330,9 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, - (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, - VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded, ReadAfterLd]>; + (vt128 (load addr:$src2)))))]>, + VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded, + SchedWriteVarVecShift.XMM.ReadAfterFold]>; def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -8048,8 +8344,9 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode VR256:$src1, - (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>, - VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded, ReadAfterLd]>; + (vt256 (load addr:$src2)))))]>, + VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded, + SchedWriteVarVecShift.YMM.ReadAfterFold]>; } let Predicates = [HasAVX2, NoVLX] in { @@ -8061,13 +8358,11 @@ let Predicates = [HasAVX2, NoVLX] in { def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)), (VPSRAVDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4i32 (X86vsrav VR128:$src1, - (bitconvert (loadv2i64 addr:$src2)))), + def : Pat<(v4i32 (X86vsrav VR128:$src1, (load addr:$src2))), (VPSRAVDrm VR128:$src1, addr:$src2)>; def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)), (VPSRAVDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86vsrav VR256:$src1, - (bitconvert (loadv4i64 addr:$src2)))), + def : Pat<(v8i32 (X86vsrav VR256:$src1, (load addr:$src2))), (VPSRAVDYrm VR256:$src1, addr:$src2)>; } @@ -8132,51 +8427,6 @@ let Predicates = [UseAVX2] in { } //===----------------------------------------------------------------------===// -// Extra selection patterns for f128, f128mem - -// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2. -def : Pat<(alignedstore (f128 VR128:$src), addr:$dst), - (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>; -def : Pat<(store (f128 VR128:$src), addr:$dst), - (MOVUPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>; - -def : Pat<(alignedloadf128 addr:$src), - (COPY_TO_REGCLASS (MOVAPSrm addr:$src), VR128)>; -def : Pat<(loadf128 addr:$src), - (COPY_TO_REGCLASS (MOVUPSrm addr:$src), VR128)>; - -// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2 -def : Pat<(f128 (X86fand VR128:$src1, (memopf128 addr:$src2))), - (COPY_TO_REGCLASS - (ANDPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2), - VR128)>; - -def : Pat<(f128 (X86fand VR128:$src1, VR128:$src2)), - (COPY_TO_REGCLASS - (ANDPSrr (COPY_TO_REGCLASS VR128:$src1, VR128), - (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>; - -def : Pat<(f128 (X86for VR128:$src1, (memopf128 addr:$src2))), - (COPY_TO_REGCLASS - (ORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2), - VR128)>; - -def : Pat<(f128 (X86for VR128:$src1, VR128:$src2)), - (COPY_TO_REGCLASS - (ORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128), - (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>; - -def : Pat<(f128 (X86fxor VR128:$src1, (memopf128 addr:$src2))), - (COPY_TO_REGCLASS - (XORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2), - VR128)>; - -def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)), - (COPY_TO_REGCLASS - (XORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128), - (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>; - -//===----------------------------------------------------------------------===// // GFNI instructions //===----------------------------------------------------------------------===// @@ -8194,8 +8444,8 @@ multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT, def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "", [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, - (bitconvert (MemOpFrag addr:$src2)))))]>, - Sched<[SchedWriteVecALU.XMM.Folded, ReadAfterLd]>, T8PD; + (MemOpFrag addr:$src2))))]>, + Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD; } } @@ -8212,9 +8462,9 @@ multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT, def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "", [(set RC:$dst, (OpVT (OpNode RC:$src1, - (bitconvert (MemOpFrag addr:$src2)), + (MemOpFrag addr:$src2), imm:$src3)))], SSEPackedInt>, - Sched<[SchedWriteVecALU.XMM.Folded, ReadAfterLd]>; + Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>; } } @@ -8222,24 +8472,24 @@ multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> { let Constraints = "$src1 = $dst", Predicates = [HasGFNI, UseSSE2] in defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode, - VR128, loadv2i64, i128mem, 1>; + VR128, load, i128mem, 1>; let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in { defm V##NAME : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128, - loadv2i64, i128mem>, VEX_4V, VEX_W; + load, i128mem>, VEX_4V, VEX_W; defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256, - loadv4i64, i256mem>, VEX_4V, VEX_L, VEX_W; + load, i256mem>, VEX_4V, VEX_L, VEX_W; } } // GF2P8MULB let Constraints = "$src1 = $dst", Predicates = [HasGFNI, UseSSE2] in -defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memopv2i64, +defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop, i128mem, 1>; let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in { - defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, loadv2i64, + defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load, i128mem>, VEX_4V; - defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, loadv4i64, + defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load, i256mem>, VEX_4V, VEX_L; } // GF2P8AFFINEINVQB, GF2P8AFFINEQB |