author     Dimitry Andric <dim@FreeBSD.org>    2017-06-01 20:58:36 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2017-06-01 20:58:36 +0000
commit     f382538d471e38a9b98f016c4caebd24c8d60b62
tree       d30f3d58b1044b5355d50c17a6a96c6a0b35703a
parent     ee2f195dd3e40f49698ca4dc2666ec09c770e80d
Vendor import of llvm trunk r304460 (tag: vendor/llvm/llvm-trunk-r304460)
Notes:
svn path=/vendor/llvm/dist/; revision=319461
svn path=/vendor/llvm/llvm-trunk-r304460/; revision=319462; tag=vendor/llvm/llvm-trunk-r304460
378 files changed, 12474 insertions, 9898 deletions
diff --git a/docs/Vectorizers.rst b/docs/Vectorizers.rst
index 317271af4032..92d6200e169f 100644
--- a/docs/Vectorizers.rst
+++ b/docs/Vectorizers.rst
@@ -44,12 +44,12 @@ Users can control the vectorization SIMD width using the command line flag "-for

   $ clang -mllvm -force-vector-width=8 ...
   $ opt -loop-vectorize -force-vector-width=8 ...

-Users can control the unroll factor using the command line flag "-force-vector-unroll"
+Users can control the unroll factor using the command line flag "-force-vector-interleave"

 .. code-block:: console

-  $ clang -mllvm -force-vector-unroll=2 ...
-  $ opt -loop-vectorize -force-vector-unroll=2 ...
+  $ clang -mllvm -force-vector-interleave=2 ...
+  $ opt -loop-vectorize -force-vector-interleave=2 ...

 Pragma loop hint directives
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index 6cbe3a1f515e..7211508e975a 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -454,6 +454,9 @@ public:
   /// \brief Don't restrict interleaved unrolling to small loops.
   bool enableAggressiveInterleaving(bool LoopHasReductions) const;

+  /// \brief Enable inline expansion of memcmp
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) const;
+
   /// \brief Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;

@@ -828,6 +831,7 @@ public:
                                   unsigned VF) = 0;
   virtual bool supportsEfficientVectorElementLoadStore() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
+  virtual bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
   virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
@@ -1047,6 +1051,9 @@ public:
   bool enableAggressiveInterleaving(bool LoopHasReductions) override {
     return Impl.enableAggressiveInterleaving(LoopHasReductions);
   }
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) override {
+    return Impl.expandMemCmp(I, MaxLoadSize);
+  }
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
   }
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index ad1a7cb748fe..d73a60eba850 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -274,6 +274,8 @@ public:
   bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }

+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { return false; }
+
   bool enableInterleavedAccessVectorization() { return false; }

   bool isFPVectorizationPotentiallyUnsafe() { return false; }
diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h
index b1ee76159c4b..612779b1ce86 100644
--- a/include/llvm/Analysis/ValueTracking.h
+++ b/include/llvm/Analysis/ValueTracking.h
@@ -85,6 +85,8 @@ template <typename T> class ArrayRef;
                               const Instruction *CxtI = nullptr,
                               const DominatorTree *DT = nullptr);

+  bool isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI);
+
   /// Return true if the given value is known to be non-zero when defined. For
   /// vectors, return true if every element is known to be non-zero when
   /// defined.
For pointers, if the context instruction and dominator tree are diff --git a/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h b/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h index f32233b3a9e4..e3549d8988cd 100644 --- a/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h +++ b/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h @@ -396,7 +396,7 @@ protected: mutable DenseMap<unsigned, std::unique_ptr<const InstructionMapping>> MapOfInstructionMappings; - /// Create a RegisterBankInfo that can accomodate up to \p NumRegBanks + /// Create a RegisterBankInfo that can accommodate up to \p NumRegBanks /// RegisterBank instances. RegisterBankInfo(RegisterBank **RegBanks, unsigned NumRegBanks); diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index 2300a106c358..bc5d2353f63e 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -410,12 +410,22 @@ namespace ISD { /// then the result type must also be a vector type. SETCC, - /// Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but + /// Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, and /// op #2 is a *carry value*. This operator checks the result of /// "LHS - RHS - Carry", and can be used to compare two wide integers: /// (setcce lhshi rhshi (subc lhslo rhslo) cc). Only valid for integers. + /// FIXME: This node is deprecated in favor of SETCCCARRY. + /// It is kept around for now to provide a smooth transition path + /// toward the use of SETCCCARRY and will eventually be removed. SETCCE, + /// Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but + /// op #2 is a boolean indicating if there is an incoming carry. This + /// operator checks the result of "LHS - RHS - Carry", and can be used to + /// compare two wide integers: (setcce lhshi rhshi (subc lhslo rhslo) cc). + /// Only valid for integers. + SETCCCARRY, + /// SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded /// integer shift operations. The operation ordering is: /// [Lo,Hi] = op [LoLHS,HiLHS], Amt diff --git a/include/llvm/CodeGen/MIRYamlMapping.h b/include/llvm/CodeGen/MIRYamlMapping.h index 47b40de6fe1f..30e88fe38ac3 100644 --- a/include/llvm/CodeGen/MIRYamlMapping.h +++ b/include/llvm/CodeGen/MIRYamlMapping.h @@ -381,7 +381,6 @@ struct MachineFunction { StringRef Name; unsigned Alignment = 0; bool ExposesReturnsTwice = false; - bool NoVRegs; // GISel MachineFunctionProperties. bool Legalized = false; bool RegBankSelected = false; @@ -406,7 +405,6 @@ template <> struct MappingTraits<MachineFunction> { YamlIO.mapRequired("name", MF.Name); YamlIO.mapOptional("alignment", MF.Alignment); YamlIO.mapOptional("exposesReturnsTwice", MF.ExposesReturnsTwice); - YamlIO.mapOptional("noVRegs", MF.NoVRegs); YamlIO.mapOptional("legalized", MF.Legalized); YamlIO.mapOptional("regBankSelected", MF.RegBankSelected); YamlIO.mapOptional("selected", MF.Selected); diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index 8da48c379d00..26ed8bb487a2 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -335,6 +335,9 @@ public: return make_range(livein_begin(), livein_end()); } + /// Remove entry from the livein set and return iterator to the next. + livein_iterator removeLiveIn(livein_iterator I); + /// Get the clobber mask for the start of this basic block. Funclets use this /// to prevent register allocation across funclet transitions. 
const uint32_t *getBeginClobberMask(const TargetRegisterInfo *TRI) const; diff --git a/include/llvm/CodeGen/MachineConstantPool.h b/include/llvm/CodeGen/MachineConstantPool.h index d2036c4a29a5..1705a0f7e59b 100644 --- a/include/llvm/CodeGen/MachineConstantPool.h +++ b/include/llvm/CodeGen/MachineConstantPool.h @@ -1,4 +1,4 @@ -//===-- CodeGen/MachineConstantPool.h - Abstract Constant Pool --*- C++ -*-===// +//===- CodeGen/MachineConstantPool.h - Abstract Constant Pool ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -18,29 +18,28 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/MC/SectionKind.h" -#include <cassert> #include <climits> #include <vector> namespace llvm { class Constant; -class FoldingSetNodeID; class DataLayout; -class TargetMachine; -class Type; +class FoldingSetNodeID; class MachineConstantPool; class raw_ostream; +class Type; /// Abstract base class for all machine specific constantpool value subclasses. /// class MachineConstantPoolValue { virtual void anchor(); + Type *Ty; public: explicit MachineConstantPoolValue(Type *ty) : Ty(ty) {} - virtual ~MachineConstantPoolValue() {} + virtual ~MachineConstantPoolValue() = default; /// getType - get type of this MachineConstantPoolValue. /// @@ -81,6 +80,7 @@ public: : Alignment(A) { Val.ConstVal = V; } + MachineConstantPoolEntry(MachineConstantPoolValue *V, unsigned A) : Alignment(A) { Val.MachineCPVal = V; @@ -153,13 +153,12 @@ public: /// print - Used by the MachineFunction printer to print information about /// constant pool objects. Implemented in MachineFunction.cpp - /// void print(raw_ostream &OS) const; /// dump - Call print(cerr) to be called from the debugger. void dump() const; }; -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINECONSTANTPOOL_H diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h index 5859a4e61fdd..10125864cd90 100644 --- a/include/llvm/CodeGen/MachineFunction.h +++ b/include/llvm/CodeGen/MachineFunction.h @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/MachineFunction.h --------------------------*- C++ -*-===// +//===- llvm/CodeGen/MachineFunction.h ---------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -18,38 +18,61 @@ #ifndef LLVM_CODEGEN_MACHINEFUNCTION_H #define LLVM_CODEGEN_MACHINEFUNCTION_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/ilist.h" +#include "llvm/ADT/iterator.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/IR/DebugLoc.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Metadata.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/ArrayRecycler.h" +#include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Recycler.h" +#include <cassert> +#include <cstdint> +#include <memory> +#include <utility> +#include <vector> namespace llvm { -class Value; +class BasicBlock; +class BlockAddress; +class DataLayout; +class DIExpression; +class DILocalVariable; +class DILocation; class Function; -class GCModuleInfo; -class MachineRegisterInfo; -class MachineFrameInfo; +class GlobalValue; 
class MachineConstantPool; +class MachineFrameInfo; +class MachineFunction; class MachineJumpTableInfo; class MachineModuleInfo; +class MachineRegisterInfo; class MCContext; +class MCInstrDesc; class Pass; class PseudoSourceValueManager; +class raw_ostream; +class SlotIndexes; class TargetMachine; -class TargetSubtargetInfo; class TargetRegisterClass; -struct MachinePointerInfo; +class TargetSubtargetInfo; struct WinEHFuncInfo; template <> struct ilist_alloc_traits<MachineBasicBlock> { @@ -137,27 +160,33 @@ public: bool hasProperty(Property P) const { return Properties[static_cast<unsigned>(P)]; } + MachineFunctionProperties &set(Property P) { Properties.set(static_cast<unsigned>(P)); return *this; } + MachineFunctionProperties &reset(Property P) { Properties.reset(static_cast<unsigned>(P)); return *this; } + /// Reset all the properties. MachineFunctionProperties &reset() { Properties.reset(); return *this; } + MachineFunctionProperties &set(const MachineFunctionProperties &MFP) { Properties |= MFP.Properties; return *this; } + MachineFunctionProperties &reset(const MachineFunctionProperties &MFP) { Properties.reset(MFP.Properties); return *this; } + // Returns true if all properties set in V (i.e. required by a pass) are set // in this. bool verifyRequiredProperties(const MachineFunctionProperties &V) const { @@ -180,18 +209,17 @@ struct SEHHandler { const BlockAddress *RecoverBA; }; - /// This structure is used to retain landing pad info for the current function. struct LandingPadInfo { MachineBasicBlock *LandingPadBlock; // Landing pad block. SmallVector<MCSymbol *, 1> BeginLabels; // Labels prior to invoke. SmallVector<MCSymbol *, 1> EndLabels; // Labels after invoke. SmallVector<SEHHandler, 1> SEHHandlers; // SEH handlers active at this lpad. - MCSymbol *LandingPadLabel; // Label at beginning of landing pad. - std::vector<int> TypeIds; // List of type ids (filters negative). + MCSymbol *LandingPadLabel = nullptr; // Label at beginning of landing pad. + std::vector<int> TypeIds; // List of type ids (filters negative). explicit LandingPadInfo(MachineBasicBlock *MBB) - : LandingPadBlock(MBB), LandingPadLabel(nullptr) {} + : LandingPadBlock(MBB) {} }; class MachineFunction { @@ -239,7 +267,7 @@ class MachineFunction { Recycler<MachineBasicBlock> BasicBlockRecycler; // List of machine basic blocks in function - typedef ilist<MachineBasicBlock> BasicBlockListType; + using BasicBlockListType = ilist<MachineBasicBlock>; BasicBlockListType BasicBlocks; /// FunctionNumber - This provides a unique ID for each function emitted in @@ -281,7 +309,7 @@ class MachineFunction { std::vector<LandingPadInfo> LandingPads; /// Map a landing pad's EH symbol to the call site indexes. - DenseMap<MCSymbol*, SmallVector<unsigned, 4> > LPadToCallSiteMap; + DenseMap<MCSymbol*, SmallVector<unsigned, 4>> LPadToCallSiteMap; /// Map of invoke call site index values to associated begin EH_LABEL. DenseMap<MCSymbol*, unsigned> CallSiteMap; @@ -303,9 +331,6 @@ class MachineFunction { /// \} - MachineFunction(const MachineFunction &) = delete; - void operator=(const MachineFunction&) = delete; - /// Clear all the members of this MachineFunction, but the ones used /// to initialize again the MachineFunction. /// More specifically, this deallocates all the dynamically allocated @@ -316,8 +341,8 @@ class MachineFunction { /// In particular, the XXXInfo data structure. /// \pre Fn, Target, MMI, and FunctionNumber are properly set. 
void init(); -public: +public: struct VariableDbgInfo { const DILocalVariable *Var; const DIExpression *Expr; @@ -328,11 +353,13 @@ public: unsigned Slot, const DILocation *Loc) : Var(Var), Expr(Expr), Slot(Slot), Loc(Loc) {} }; - typedef SmallVector<VariableDbgInfo, 4> VariableDbgInfoMapTy; + using VariableDbgInfoMapTy = SmallVector<VariableDbgInfo, 4>; VariableDbgInfoMapTy VariableDbgInfos; MachineFunction(const Function *Fn, const TargetMachine &TM, unsigned FunctionNum, MachineModuleInfo &MMI); + MachineFunction(const MachineFunction &) = delete; + MachineFunction &operator=(const MachineFunction &) = delete; ~MachineFunction(); /// Reset the instance as if it was just created. @@ -350,19 +377,15 @@ public: const DataLayout &getDataLayout() const; /// getFunction - Return the LLVM function that this machine code represents - /// const Function *getFunction() const { return Fn; } /// getName - Return the name of the corresponding LLVM function. - /// StringRef getName() const; /// getFunctionNumber - Return a unique ID for the current function. - /// unsigned getFunctionNumber() const { return FunctionNumber; } /// getTarget - Return the target machine this machine code is compiled with - /// const TargetMachine &getTarget() const { return Target; } /// getSubtarget - Return the subtarget for which this machine code is being @@ -378,14 +401,12 @@ public: } /// getRegInfo - Return information about the registers currently in use. - /// MachineRegisterInfo &getRegInfo() { return *RegInfo; } const MachineRegisterInfo &getRegInfo() const { return *RegInfo; } /// getFrameInfo - Return the frame info object for the current function. /// This object contains information about objects allocated on the stack /// frame of the current function in an abstract way. - /// MachineFrameInfo &getFrameInfo() { return *FrameInfo; } const MachineFrameInfo &getFrameInfo() const { return *FrameInfo; } @@ -402,7 +423,6 @@ public: /// getConstantPool - Return the constant pool object for the current /// function. - /// MachineConstantPool *getConstantPool() { return ConstantPool; } const MachineConstantPool *getConstantPool() const { return ConstantPool; } @@ -413,11 +433,9 @@ public: WinEHFuncInfo *getWinEHFuncInfo() { return WinEHInfo; } /// getAlignment - Return the alignment (log2, not bytes) of the function. - /// unsigned getAlignment() const { return Alignment; } /// setAlignment - Set the alignment (log2, not bytes) of the function. - /// void setAlignment(unsigned A) { Alignment = A; } /// ensureAlignment - Make sure the function is at least 1 << A bytes aligned. @@ -487,7 +505,6 @@ public: bool shouldSplitStack() const; /// getNumBlockIDs - Return the number of MBB ID's allocated. - /// unsigned getNumBlockIDs() const { return (unsigned)MBBNumbering.size(); } /// RenumberBlocks - This discards all of the MachineBasicBlock numbers and @@ -499,7 +516,6 @@ public: /// print - Print out the MachineFunction in a format suitable for debugging /// to the specified stream. - /// void print(raw_ostream &OS, const SlotIndexes* = nullptr) const; /// viewCFG - This function is meant for use from the debugger. You can just @@ -507,7 +523,6 @@ public: /// program, displaying the CFG of the current function with the code for each /// basic block inside. This depends on there being a 'dot' and 'gv' program /// in your path. - /// void viewCFG() const; /// viewCFGOnly - This function is meant for use from the debugger. 
It works @@ -518,7 +533,6 @@ public: void viewCFGOnly() const; /// dump - Print the current MachineFunction to cerr, useful for debugger use. - /// void dump() const; /// Run the current MachineFunction through the machine code verifier, useful @@ -528,10 +542,10 @@ public: bool AbortOnError = true) const; // Provide accessors for the MachineBasicBlock list... - typedef BasicBlockListType::iterator iterator; - typedef BasicBlockListType::const_iterator const_iterator; - typedef BasicBlockListType::const_reverse_iterator const_reverse_iterator; - typedef BasicBlockListType::reverse_iterator reverse_iterator; + using iterator = BasicBlockListType::iterator; + using const_iterator = BasicBlockListType::const_iterator; + using const_reverse_iterator = BasicBlockListType::const_reverse_iterator; + using reverse_iterator = BasicBlockListType::reverse_iterator; /// Support for MachineBasicBlock::getNextNode(). static BasicBlockListType MachineFunction::* @@ -590,11 +604,9 @@ public: //===--------------------------------------------------------------------===// // Internal functions used to automatically number MachineBasicBlocks - // /// \brief Adds the MBB to the internal numbering. Returns the unique number /// assigned to the MBB. - /// unsigned addToMBBNumbering(MachineBasicBlock *MBB) { MBBNumbering.push_back(MBB); return (unsigned)MBBNumbering.size()-1; @@ -610,7 +622,6 @@ public: /// CreateMachineInstr - Allocate a new MachineInstr. Use this instead /// of `new MachineInstr'. - /// MachineInstr *CreateMachineInstr(const MCInstrDesc &MCID, const DebugLoc &DL, bool NoImp = false); @@ -623,16 +634,13 @@ public: MachineInstr *CloneMachineInstr(const MachineInstr *Orig); /// DeleteMachineInstr - Delete the given MachineInstr. - /// void DeleteMachineInstr(MachineInstr *MI); /// CreateMachineBasicBlock - Allocate a new MachineBasicBlock. Use this /// instead of `new MachineBasicBlock'. - /// MachineBasicBlock *CreateMachineBasicBlock(const BasicBlock *bb = nullptr); /// DeleteMachineBasicBlock - Delete the given MachineBasicBlock. - /// void DeleteMachineBasicBlock(MachineBasicBlock *MBB); /// getMachineMemOperand - Allocate a new MachineMemOperand. @@ -653,7 +661,7 @@ public: MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO, int64_t Offset, uint64_t Size); - typedef ArrayRecycler<MachineOperand>::Capacity OperandCapacity; + using OperandCapacity = ArrayRecycler<MachineOperand>::Capacity; /// Allocate an array of MachineOperands. This is only intended for use by /// internal MachineInstr functions. @@ -700,7 +708,6 @@ public: //===--------------------------------------------------------------------===// // Label Manipulation. - // /// getJTISymbol - Return the MCSymbol for the specified non-empty jump table. 
/// If isLinkerPrivate is specified, an 'l' label is returned, otherwise a @@ -858,13 +865,16 @@ template <> struct GraphTraits<MachineFunction*> : static NodeRef getEntryNode(MachineFunction *F) { return &F->front(); } // nodes_iterator/begin/end - Allow iteration over all nodes in the graph - typedef pointer_iterator<MachineFunction::iterator> nodes_iterator; + using nodes_iterator = pointer_iterator<MachineFunction::iterator>; + static nodes_iterator nodes_begin(MachineFunction *F) { return nodes_iterator(F->begin()); } + static nodes_iterator nodes_end(MachineFunction *F) { return nodes_iterator(F->end()); } + static unsigned size (MachineFunction *F) { return F->size(); } }; template <> struct GraphTraits<const MachineFunction*> : @@ -872,37 +882,39 @@ template <> struct GraphTraits<const MachineFunction*> : static NodeRef getEntryNode(const MachineFunction *F) { return &F->front(); } // nodes_iterator/begin/end - Allow iteration over all nodes in the graph - typedef pointer_iterator<MachineFunction::const_iterator> nodes_iterator; + using nodes_iterator = pointer_iterator<MachineFunction::const_iterator>; + static nodes_iterator nodes_begin(const MachineFunction *F) { return nodes_iterator(F->begin()); } + static nodes_iterator nodes_end (const MachineFunction *F) { return nodes_iterator(F->end()); } + static unsigned size (const MachineFunction *F) { return F->size(); } }; - // Provide specializations of GraphTraits to be able to treat a function as a // graph of basic blocks... and to walk it in inverse order. Inverse order for // a function is considered to be when traversing the predecessor edges of a BB // instead of the successor edges. // -template <> struct GraphTraits<Inverse<MachineFunction*> > : - public GraphTraits<Inverse<MachineBasicBlock*> > { +template <> struct GraphTraits<Inverse<MachineFunction*>> : + public GraphTraits<Inverse<MachineBasicBlock*>> { static NodeRef getEntryNode(Inverse<MachineFunction *> G) { return &G.Graph->front(); } }; -template <> struct GraphTraits<Inverse<const MachineFunction*> > : - public GraphTraits<Inverse<const MachineBasicBlock*> > { +template <> struct GraphTraits<Inverse<const MachineFunction*>> : + public GraphTraits<Inverse<const MachineBasicBlock*>> { static NodeRef getEntryNode(Inverse<const MachineFunction *> G) { return &G.Graph->front(); } }; -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINEFUNCTION_H diff --git a/include/llvm/CodeGen/MachineFunctionInitializer.h b/include/llvm/CodeGen/MachineFunctionInitializer.h index c644c9783e2f..0fbcb480b1ab 100644 --- a/include/llvm/CodeGen/MachineFunctionInitializer.h +++ b/include/llvm/CodeGen/MachineFunctionInitializer.h @@ -1,4 +1,4 @@ -//===- MachineFunctionInitializer.h - machine function initializer ---------===// +//=- MachineFunctionInitializer.h - machine function initializer --*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -25,7 +25,7 @@ class MachineFunctionInitializer { virtual void anchor(); public: - virtual ~MachineFunctionInitializer() {} + virtual ~MachineFunctionInitializer() = default; /// Initialize the machine function. 
/// @@ -35,4 +35,4 @@ public: } // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINEFUNCTIONINITIALIZER_H diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h index 8d040beff7a6..95401e98b297 100644 --- a/include/llvm/CodeGen/MachineInstr.h +++ b/include/llvm/CodeGen/MachineInstr.h @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/MachineInstr.h - MachineInstr class --------*- C++ -*-===// +//===- llvm/CodeGen/MachineInstr.h - MachineInstr class ---------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,7 +17,6 @@ #define LLVM_CODEGEN_MACHINEINSTR_H #include "llvm/ADT/DenseMapInfo.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" #include "llvm/ADT/iterator_range.h" @@ -28,19 +27,27 @@ #include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/ArrayRecycler.h" #include "llvm/Target/TargetOpcodes.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <utility> namespace llvm { -class StringRef; template <typename T> class ArrayRef; -template <typename T> class SmallVectorImpl; -class DILocalVariable; class DIExpression; +class DILocalVariable; +class MachineBasicBlock; +class MachineFunction; +class MachineMemOperand; +class MachineRegisterInfo; +class ModuleSlotTracker; +class raw_ostream; +template <typename T> class SmallVectorImpl; +class StringRef; class TargetInstrInfo; class TargetRegisterClass; class TargetRegisterInfo; -class MachineFunction; -class MachineMemOperand; //===----------------------------------------------------------------------===// /// Representation of each machine instruction. @@ -53,7 +60,7 @@ class MachineInstr : public ilist_node_with_parent<MachineInstr, MachineBasicBlock, ilist_sentinel_tracking<true>> { public: - typedef MachineMemOperand **mmo_iterator; + using mmo_iterator = MachineMemOperand **; /// Flags to specify different kinds of comments to output in /// assembly code. These flags carry semantic information not @@ -72,43 +79,39 @@ public: BundledPred = 1 << 2, // Instruction has bundled predecessors. BundledSucc = 1 << 3 // Instruction has bundled successors. }; + private: const MCInstrDesc *MCID; // Instruction descriptor. - MachineBasicBlock *Parent; // Pointer to the owning basic block. + MachineBasicBlock *Parent = nullptr; // Pointer to the owning basic block. // Operands are allocated by an ArrayRecycler. - MachineOperand *Operands; // Pointer to the first operand. - unsigned NumOperands; // Number of operands on instruction. - typedef ArrayRecycler<MachineOperand>::Capacity OperandCapacity; + MachineOperand *Operands = nullptr; // Pointer to the first operand. + unsigned NumOperands = 0; // Number of operands on instruction. + using OperandCapacity = ArrayRecycler<MachineOperand>::Capacity; OperandCapacity CapOperands; // Capacity of the Operands array. - uint8_t Flags; // Various bits of additional + uint8_t Flags = 0; // Various bits of additional // information about machine // instruction. - uint8_t AsmPrinterFlags; // Various bits of information used by + uint8_t AsmPrinterFlags = 0; // Various bits of information used by // the AsmPrinter to emit helpful // comments. This is *not* semantic // information. Do not use this for // anything other than to convey comment // information to AsmPrinter. - uint8_t NumMemRefs; // Information on memory references. + uint8_t NumMemRefs = 0; // Information on memory references. // Note that MemRefs == nullptr, means 'don't know', not 'no memory access'. 
// Calling code must treat missing information conservatively. If the number // of memory operands required to be precise exceeds the maximum value of // NumMemRefs - currently 256 - we remove the operands entirely. Note also // that this is a non-owning reference to a shared copy on write buffer owned // by the MachineFunction and created via MF.allocateMemRefsArray. - mmo_iterator MemRefs; + mmo_iterator MemRefs = nullptr; DebugLoc debugLoc; // Source line information. - MachineInstr(const MachineInstr&) = delete; - void operator=(const MachineInstr&) = delete; - // Use MachineFunction::DeleteMachineInstr() instead. - ~MachineInstr() = delete; - // Intrusive list support friend struct ilist_traits<MachineInstr>; friend struct ilist_callback_traits<MachineBasicBlock>; @@ -128,6 +131,11 @@ private: friend class MachineFunction; public: + MachineInstr(const MachineInstr &) = delete; + MachineInstr &operator=(const MachineInstr &) = delete; + // Use MachineFunction::DeleteMachineInstr() instead. + ~MachineInstr() = delete; + const MachineBasicBlock* getParent() const { return Parent; } MachineBasicBlock* getParent() { return Parent; } @@ -178,7 +186,6 @@ public: Flags &= ~((uint8_t)Flag); } - /// Return true if MI is in a bundle (but not the first MI in a bundle). /// /// A bundle looks like this before it's finalized: @@ -263,7 +270,6 @@ public: /// earlier. /// /// If this method returns, the caller should try to recover from the error. - /// void emitError(StringRef Msg) const; /// Returns the target instruction descriptor of this MachineInstr. @@ -273,7 +279,6 @@ public: unsigned getOpcode() const { return MCID->Opcode; } /// Access to explicit operands of the instruction. - /// unsigned getNumOperands() const { return NumOperands; } const MachineOperand& getOperand(unsigned i) const { @@ -289,8 +294,8 @@ public: unsigned getNumExplicitOperands() const; /// iterator/begin/end - Iterate over all operands of a machine instruction. - typedef MachineOperand *mop_iterator; - typedef const MachineOperand *const_mop_iterator; + using mop_iterator = MachineOperand *; + using const_mop_iterator = const MachineOperand *; mop_iterator operands_begin() { return Operands; } mop_iterator operands_end() { return Operands + NumOperands; } @@ -713,7 +718,6 @@ public: return hasProperty(MCID::ExtraDefRegAllocReq, Type); } - enum MICheckType { CheckDefs, // Check all operands for equality CheckKillDead, // Check all operands including kill / dead markers @@ -767,6 +771,7 @@ public: /// Returns true if the MachineInstr represents a label. bool isLabel() const { return isEHLabel() || isGCLabel(); } + bool isCFIInstruction() const { return getOpcode() == TargetOpcode::CFI_INSTRUCTION; } @@ -775,6 +780,7 @@ public: bool isPosition() const { return isLabel() || isCFIInstruction(); } bool isDebugValue() const { return getOpcode() == TargetOpcode::DBG_VALUE; } + /// A DBG_VALUE is indirect iff the first operand is a register and /// the second operand is an immediate. 
bool isIndirectDebugValue() const { @@ -787,29 +793,38 @@ public: bool isKill() const { return getOpcode() == TargetOpcode::KILL; } bool isImplicitDef() const { return getOpcode()==TargetOpcode::IMPLICIT_DEF; } bool isInlineAsm() const { return getOpcode() == TargetOpcode::INLINEASM; } + bool isMSInlineAsm() const { return getOpcode() == TargetOpcode::INLINEASM && getInlineAsmDialect(); } + bool isStackAligningInlineAsm() const; InlineAsm::AsmDialect getInlineAsmDialect() const; + bool isInsertSubreg() const { return getOpcode() == TargetOpcode::INSERT_SUBREG; } + bool isSubregToReg() const { return getOpcode() == TargetOpcode::SUBREG_TO_REG; } + bool isRegSequence() const { return getOpcode() == TargetOpcode::REG_SEQUENCE; } + bool isBundle() const { return getOpcode() == TargetOpcode::BUNDLE; } + bool isCopy() const { return getOpcode() == TargetOpcode::COPY; } + bool isFullCopy() const { return isCopy() && !getOperand(0).getSubReg() && !getOperand(1).getSubReg(); } + bool isExtractSubreg() const { return getOpcode() == TargetOpcode::EXTRACT_SUBREG; } @@ -978,7 +993,6 @@ public: /// /// The flag operand is an immediate that can be decoded with methods like /// InlineAsm::hasRegClassConstraint(). - /// int findInlineAsmFlagIdx(unsigned OpIdx, unsigned *GroupNo = nullptr) const; /// Compute the static register class constraint for operand OpIdx. @@ -987,7 +1001,6 @@ public: /// /// Returns NULL if the static register class constraint cannot be /// determined. - /// const TargetRegisterClass* getRegClassConstraint(unsigned OpIdx, const TargetInstrInfo *TII, @@ -1328,6 +1341,6 @@ inline raw_ostream& operator<<(raw_ostream &OS, const MachineInstr &MI) { return OS; } -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINEINSTR_H diff --git a/include/llvm/CodeGen/MachineInstrBundleIterator.h b/include/llvm/CodeGen/MachineInstrBundleIterator.h index 3104185385ea..5fe4964ff116 100644 --- a/include/llvm/CodeGen/MachineInstrBundleIterator.h +++ b/include/llvm/CodeGen/MachineInstrBundleIterator.h @@ -15,34 +15,37 @@ #define LLVM_CODEGEN_MACHINEINSTRBUNDLEITERATOR_H #include "llvm/ADT/ilist.h" +#include "llvm/ADT/simple_ilist.h" +#include <cassert> #include <iterator> +#include <type_traits> namespace llvm { template <class T, bool IsReverse> struct MachineInstrBundleIteratorTraits; template <class T> struct MachineInstrBundleIteratorTraits<T, false> { - typedef simple_ilist<T, ilist_sentinel_tracking<true>> list_type; - typedef typename list_type::iterator instr_iterator; - typedef typename list_type::iterator nonconst_instr_iterator; - typedef typename list_type::const_iterator const_instr_iterator; + using list_type = simple_ilist<T, ilist_sentinel_tracking<true>>; + using instr_iterator = typename list_type::iterator; + using nonconst_instr_iterator = typename list_type::iterator; + using const_instr_iterator = typename list_type::const_iterator; }; template <class T> struct MachineInstrBundleIteratorTraits<T, true> { - typedef simple_ilist<T, ilist_sentinel_tracking<true>> list_type; - typedef typename list_type::reverse_iterator instr_iterator; - typedef typename list_type::reverse_iterator nonconst_instr_iterator; - typedef typename list_type::const_reverse_iterator const_instr_iterator; + using list_type = simple_ilist<T, ilist_sentinel_tracking<true>>; + using instr_iterator = typename list_type::reverse_iterator; + using nonconst_instr_iterator = typename list_type::reverse_iterator; + using const_instr_iterator = typename list_type::const_reverse_iterator; 
}; template <class T> struct MachineInstrBundleIteratorTraits<const T, false> { - typedef simple_ilist<T, ilist_sentinel_tracking<true>> list_type; - typedef typename list_type::const_iterator instr_iterator; - typedef typename list_type::iterator nonconst_instr_iterator; - typedef typename list_type::const_iterator const_instr_iterator; + using list_type = simple_ilist<T, ilist_sentinel_tracking<true>>; + using instr_iterator = typename list_type::const_iterator; + using nonconst_instr_iterator = typename list_type::iterator; + using const_instr_iterator = typename list_type::const_iterator; }; template <class T> struct MachineInstrBundleIteratorTraits<const T, true> { - typedef simple_ilist<T, ilist_sentinel_tracking<true>> list_type; - typedef typename list_type::const_reverse_iterator instr_iterator; - typedef typename list_type::reverse_iterator nonconst_instr_iterator; - typedef typename list_type::const_reverse_iterator const_instr_iterator; + using list_type = simple_ilist<T, ilist_sentinel_tracking<true>>; + using instr_iterator = typename list_type::const_reverse_iterator; + using nonconst_instr_iterator = typename list_type::reverse_iterator; + using const_instr_iterator = typename list_type::const_reverse_iterator; }; template <bool IsReverse> struct MachineInstrBundleIteratorHelper; @@ -104,27 +107,27 @@ template <> struct MachineInstrBundleIteratorHelper<true> { /// inside bundles (i.e. walk top level MIs only). template <typename Ty, bool IsReverse = false> class MachineInstrBundleIterator : MachineInstrBundleIteratorHelper<IsReverse> { - typedef MachineInstrBundleIteratorTraits<Ty, IsReverse> Traits; - typedef typename Traits::instr_iterator instr_iterator; + using Traits = MachineInstrBundleIteratorTraits<Ty, IsReverse>; + using instr_iterator = typename Traits::instr_iterator; + instr_iterator MII; public: - typedef typename instr_iterator::value_type value_type; - typedef typename instr_iterator::difference_type difference_type; - typedef typename instr_iterator::pointer pointer; - typedef typename instr_iterator::reference reference; - typedef std::bidirectional_iterator_tag iterator_category; - - typedef typename instr_iterator::const_pointer const_pointer; - typedef typename instr_iterator::const_reference const_reference; + using value_type = typename instr_iterator::value_type; + using difference_type = typename instr_iterator::difference_type; + using pointer = typename instr_iterator::pointer; + using reference = typename instr_iterator::reference; + using const_pointer = typename instr_iterator::const_pointer; + using const_reference = typename instr_iterator::const_reference; + using iterator_category = std::bidirectional_iterator_tag; private: - typedef typename Traits::nonconst_instr_iterator nonconst_instr_iterator; - typedef typename Traits::const_instr_iterator const_instr_iterator; - typedef MachineInstrBundleIterator< - typename nonconst_instr_iterator::value_type, IsReverse> - nonconst_iterator; - typedef MachineInstrBundleIterator<Ty, !IsReverse> reverse_iterator; + using nonconst_instr_iterator = typename Traits::nonconst_instr_iterator; + using const_instr_iterator = typename Traits::const_instr_iterator; + using nonconst_iterator = + MachineInstrBundleIterator<typename nonconst_instr_iterator::value_type, + IsReverse>; + using reverse_iterator = MachineInstrBundleIterator<Ty, !IsReverse>; public: MachineInstrBundleIterator(instr_iterator MI) : MII(MI) { @@ -138,12 +141,14 @@ public: "MachineInstrBundleIterator with a " "bundled MI"); } + 
MachineInstrBundleIterator(pointer MI) : MII(MI) { // FIXME: This conversion should be explicit. assert((!MI || !MI->isBundledWithPred()) && "It's not legal to initialize " "MachineInstrBundleIterator " "with a bundled MI"); } + // Template allows conversion from const to nonconst. template <class OtherTy> MachineInstrBundleIterator( @@ -151,6 +156,7 @@ public: typename std::enable_if<std::is_convertible<OtherTy *, Ty *>::value, void *>::type = nullptr) : MII(I.getInstrIterator()) {} + MachineInstrBundleIterator() : MII(nullptr) {} /// Explicit conversion between forward/reverse iterators. @@ -280,4 +286,4 @@ public: } // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINEINSTRBUNDLEITERATOR_H diff --git a/include/llvm/CodeGen/MachineLoopInfo.h b/include/llvm/CodeGen/MachineLoopInfo.h index 5c814f22f99b..58cffaade9d2 100644 --- a/include/llvm/CodeGen/MachineLoopInfo.h +++ b/include/llvm/CodeGen/MachineLoopInfo.h @@ -33,6 +33,8 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Pass.h" namespace llvm { @@ -71,6 +73,7 @@ public: private: friend class LoopInfoBase<MachineBasicBlock, MachineLoop>; + explicit MachineLoop(MachineBasicBlock *MBB) : LoopBase<MachineBasicBlock, MachineLoop>(MBB) {} }; @@ -79,11 +82,9 @@ private: extern template class LoopInfoBase<MachineBasicBlock, MachineLoop>; class MachineLoopInfo : public MachineFunctionPass { - LoopInfoBase<MachineBasicBlock, MachineLoop> LI; friend class LoopBase<MachineBasicBlock, MachineLoop>; - void operator=(const MachineLoopInfo &) = delete; - MachineLoopInfo(const MachineLoopInfo &) = delete; + LoopInfoBase<MachineBasicBlock, MachineLoop> LI; public: static char ID; // Pass identification, replacement for typeid @@ -91,6 +92,8 @@ public: MachineLoopInfo() : MachineFunctionPass(ID) { initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry()); } + MachineLoopInfo(const MachineLoopInfo &) = delete; + MachineLoopInfo &operator=(const MachineLoopInfo &) = delete; LoopInfoBase<MachineBasicBlock, MachineLoop>& getBase() { return LI; } @@ -103,7 +106,7 @@ public: bool SpeculativePreheader = false) const; /// The iterator interface to the top-level loops in the current function. - typedef LoopInfoBase<MachineBasicBlock, MachineLoop>::iterator iterator; + using iterator = LoopInfoBase<MachineBasicBlock, MachineLoop>::iterator; inline iterator begin() const { return LI.begin(); } inline iterator end() const { return LI.end(); } bool empty() const { return LI.empty(); } @@ -166,11 +169,10 @@ public: } }; - // Allow clients to walk the list of nested loops... 
template <> struct GraphTraits<const MachineLoop*> { - typedef const MachineLoop *NodeRef; - typedef MachineLoopInfo::iterator ChildIteratorType; + using NodeRef = const MachineLoop *; + using ChildIteratorType = MachineLoopInfo::iterator; static NodeRef getEntryNode(const MachineLoop *L) { return L; } static ChildIteratorType child_begin(NodeRef N) { return N->begin(); } @@ -178,14 +180,14 @@ template <> struct GraphTraits<const MachineLoop*> { }; template <> struct GraphTraits<MachineLoop*> { - typedef MachineLoop *NodeRef; - typedef MachineLoopInfo::iterator ChildIteratorType; + using NodeRef = MachineLoop *; + using ChildIteratorType = MachineLoopInfo::iterator; static NodeRef getEntryNode(MachineLoop *L) { return L; } static ChildIteratorType child_begin(NodeRef N) { return N->begin(); } static ChildIteratorType child_end(NodeRef N) { return N->end(); } }; -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINELOOPINFO_H diff --git a/include/llvm/CodeGen/MachineModuleInfo.h b/include/llvm/CodeGen/MachineModuleInfo.h index f46ef41879d1..d4ac58c3bd22 100644 --- a/include/llvm/CodeGen/MachineModuleInfo.h +++ b/include/llvm/CodeGen/MachineModuleInfo.h @@ -31,35 +31,26 @@ #ifndef LLVM_CODEGEN_MACHINEMODULEINFO_H #define LLVM_CODEGEN_MACHINEMODULEINFO_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PointerIntPair.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/IR/DebugLoc.h" -#include "llvm/IR/ValueHandle.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MachineLocation.h" #include "llvm/Pass.h" -#include "llvm/Support/DataTypes.h" +#include <memory> +#include <utility> +#include <vector> namespace llvm { -//===----------------------------------------------------------------------===// -// Forward declarations. -class BlockAddress; +class BasicBlock; class CallInst; -class Constant; -class GlobalVariable; -class LandingPadInst; -class MDNode; -class MMIAddrLabelMap; -class MachineBasicBlock; +class Function; class MachineFunction; class MachineFunctionInitializer; +class MMIAddrLabelMap; class Module; -class PointerType; -class StructType; +class TargetMachine; //===----------------------------------------------------------------------===// /// This class can be derived from and used by targets to hold private @@ -69,11 +60,12 @@ class StructType; /// class MachineModuleInfoImpl { public: - typedef PointerIntPair<MCSymbol*, 1, bool> StubValueTy; + using StubValueTy = PointerIntPair<MCSymbol *, 1, bool>; + using SymbolListTy = std::vector<std::pair<MCSymbol *, StubValueTy>>; + virtual ~MachineModuleInfoImpl(); - typedef std::vector<std::pair<MCSymbol*, StubValueTy> > SymbolListTy; -protected: +protected: /// Return the entries from a DenseMap in a deterministic sorted orer. /// Clears the map. static SymbolListTy getSortedStubs(DenseMap<MCSymbol*, StubValueTy>&); @@ -252,6 +244,6 @@ public: /// which will link in MSVCRT's floating-point support. void computeUsesVAFloatArgument(const CallInst &I, MachineModuleInfo &MMI); -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINEMODULEINFO_H diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h index f3e04cffcda6..3bcfc1c4254b 100644 --- a/include/llvm/CodeGen/Passes.h +++ b/include/llvm/CodeGen/Passes.h @@ -133,6 +133,10 @@ namespace llvm { // instruction and update the MachineFunctionInfo with that information. 
extern char &ShrinkWrapID; + /// LiveRangeShrink pass. Move instruction close to its definition to shrink + /// the definition's live range. + extern char &LiveRangeShrinkID; + /// Greedy register allocator. extern char &RAGreedyID; diff --git a/include/llvm/CodeGen/TargetPassConfig.h b/include/llvm/CodeGen/TargetPassConfig.h index f0c826dc1d45..fcf1937c186e 100644 --- a/include/llvm/CodeGen/TargetPassConfig.h +++ b/include/llvm/CodeGen/TargetPassConfig.h @@ -22,7 +22,7 @@ namespace llvm { class PassConfigImpl; class ScheduleDAGInstrs; -class TargetMachine; +class LLVMTargetMachine; struct MachineSchedContext; // The old pass manager infrastructure is hidden in a legacy namespace now. @@ -103,7 +103,7 @@ private: bool AddingMachinePasses; protected: - TargetMachine *TM; + LLVMTargetMachine *TM; PassConfigImpl *Impl; // Internal data structures bool Initialized; // Flagged after all passes are configured. @@ -120,7 +120,7 @@ protected: bool RequireCodeGenSCCOrder; public: - TargetPassConfig(TargetMachine *tm, PassManagerBase &pm); + TargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &pm); // Dummy constructor. TargetPassConfig(); diff --git a/include/llvm/DebugInfo/CodeView/CodeView.h b/include/llvm/DebugInfo/CodeView/CodeView.h index 3316e71916ed..4e8c8feb7a12 100644 --- a/include/llvm/DebugInfo/CodeView/CodeView.h +++ b/include/llvm/DebugInfo/CodeView/CodeView.h @@ -6,6 +6,10 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +// +// Defines constants and basic types describing CodeView debug information. +// +//===----------------------------------------------------------------------===// #ifndef LLVM_DEBUGINFO_CODEVIEW_CODEVIEW_H #define LLVM_DEBUGINFO_CODEVIEW_CODEVIEW_H @@ -22,28 +26,28 @@ namespace codeview { /// documentation and headers talk about this as the "leaf" type. enum class TypeRecordKind : uint16_t { #define TYPE_RECORD(lf_ename, value, name) name = value, -#include "TypeRecords.def" +#include "CodeViewTypes.def" }; /// Duplicate copy of the above enum, but using the official CV names. Useful /// for reference purposes and when dealing with unknown record types. enum TypeLeafKind : uint16_t { #define CV_TYPE(name, val) name = val, -#include "TypeRecords.def" +#include "CodeViewTypes.def" }; /// Distinguishes individual records in the Symbols subsection of a .debug$S /// section. Equivalent to SYM_ENUM_e in cvinfo.h. enum class SymbolRecordKind : uint16_t { #define SYMBOL_RECORD(lf_ename, value, name) name = value, -#include "CVSymbolTypes.def" +#include "CodeViewSymbols.def" }; /// Duplicate copy of the above enum, but using the official CV names. Useful /// for reference purposes and when dealing with unknown record types. enum SymbolKind : uint16_t { #define CV_SYMBOL(name, val) name = val, -#include "CVSymbolTypes.def" +#include "CodeViewSymbols.def" }; #define CV_DEFINE_ENUM_CLASS_FLAGS_OPERATORS(Class) \ @@ -280,7 +284,7 @@ CV_DEFINE_ENUM_CLASS_FLAGS_OPERATORS(MethodOptions) /// Equivalent to CV_LABEL_TYPE_e. enum class LabelType : uint16_t { Near = 0x0, - Far = 0x4, + Far = 0x4, }; /// Equivalent to CV_modifier_t. 
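The TargetPassConfig hunk above replaces the TargetMachine pointer with an LLVMTargetMachine reference in the constructor, so anything deriving from TargetPassConfig has to follow suit. A minimal sketch of the new shape for a hypothetical out-of-tree backend (the MyTarget names are illustrative and not part of this import):

    // Sketch: adapting a backend pass config to the new constructor signature.
    // TargetPassConfig.h makes PassManagerBase visible via `using legacy::PassManagerBase;`.
    #include "llvm/CodeGen/TargetPassConfig.h"

    using namespace llvm;

    namespace {

    class MyTargetPassConfig : public TargetPassConfig {
    public:
      // Previously: MyTargetPassConfig(TargetMachine *TM, PassManagerBase &PM)
      MyTargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
          : TargetPassConfig(TM, PM) {}
      // Hooks such as addInstSelector() would be overridden here as before.
    };

    } // end anonymous namespace

Taking the machine by reference makes explicit that a pass config is always built for a concrete LLVMTargetMachine; the dummy default constructor shown in the diff context remains for the no-codegen case.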
diff --git a/include/llvm/DebugInfo/CodeView/CVSymbolTypes.def b/include/llvm/DebugInfo/CodeView/CodeViewSymbols.def index 32813d861d90..32813d861d90 100644 --- a/include/llvm/DebugInfo/CodeView/CVSymbolTypes.def +++ b/include/llvm/DebugInfo/CodeView/CodeViewSymbols.def diff --git a/include/llvm/DebugInfo/CodeView/TypeRecords.def b/include/llvm/DebugInfo/CodeView/CodeViewTypes.def index 8c193bb13cb7..8c193bb13cb7 100644 --- a/include/llvm/DebugInfo/CodeView/TypeRecords.def +++ b/include/llvm/DebugInfo/CodeView/CodeViewTypes.def diff --git a/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h b/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h index c1a5152930ff..428ff153d5d1 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h +++ b/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h @@ -34,6 +34,17 @@ class SymbolDeserializer : public SymbolVisitorCallbacks { }; public: + template <typename T> static Error deserializeAs(CVSymbol Symbol, T &Record) { + SymbolDeserializer S(nullptr); + if (auto EC = S.visitSymbolBegin(Symbol)) + return EC; + if (auto EC = S.visitKnownRecord(Symbol, Record)) + return EC; + if (auto EC = S.visitSymbolEnd(Symbol)) + return EC; + return Error::success(); + } + explicit SymbolDeserializer(SymbolVisitorDelegate *Delegate) : Delegate(Delegate) {} @@ -54,7 +65,7 @@ public: return visitKnownRecordImpl(CVR, Record); \ } #define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "CVSymbolTypes.def" +#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def" private: template <typename T> Error visitKnownRecordImpl(CVSymbol &CVR, T &Record) { diff --git a/include/llvm/DebugInfo/CodeView/SymbolRecord.h b/include/llvm/DebugInfo/CodeView/SymbolRecord.h index c5a5549bf818..a3e4dff647bd 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolRecord.h +++ b/include/llvm/DebugInfo/CodeView/SymbolRecord.h @@ -35,8 +35,6 @@ protected: public: SymbolRecordKind getKind() const { return Kind; } - -private: SymbolRecordKind Kind; }; diff --git a/include/llvm/DebugInfo/CodeView/SymbolRecordMapping.h b/include/llvm/DebugInfo/CodeView/SymbolRecordMapping.h index 0a1837a0d935..5d072a3b2723 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolRecordMapping.h +++ b/include/llvm/DebugInfo/CodeView/SymbolRecordMapping.h @@ -29,7 +29,7 @@ public: #define SYMBOL_RECORD(EnumName, EnumVal, Name) \ Error visitKnownRecord(CVSymbol &CVR, Name &Record) override; #define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "CVSymbolTypes.def" +#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def" private: Optional<SymbolKind> Kind; diff --git a/include/llvm/DebugInfo/CodeView/SymbolSerializer.h b/include/llvm/DebugInfo/CodeView/SymbolSerializer.h index f2e99bd83326..a8fe1a3ae1d0 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolSerializer.h +++ b/include/llvm/DebugInfo/CodeView/SymbolSerializer.h @@ -45,6 +45,17 @@ class SymbolSerializer : public SymbolVisitorCallbacks { } public: + template <typename SymType> + static CVSymbol writeOneSymbol(SymType &Sym, BumpPtrAllocator &Storage) { + CVSymbol Result; + Result.Type = static_cast<SymbolKind>(Sym.Kind); + SymbolSerializer Serializer(Storage); + consumeError(Serializer.visitSymbolBegin(Result)); + consumeError(Serializer.visitKnownRecord(Result, Sym)); + consumeError(Serializer.visitSymbolEnd(Result)); + return Result; + } + explicit SymbolSerializer(BumpPtrAllocator &Storage); virtual Error visitSymbolBegin(CVSymbol &Record) override; @@ -55,7 +66,7 @@ public: return visitKnownRecordImpl(CVR, Record); \ } 
#define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "CVSymbolTypes.def" +#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def" private: template <typename RecordKind> diff --git a/include/llvm/DebugInfo/CodeView/SymbolVisitorCallbackPipeline.h b/include/llvm/DebugInfo/CodeView/SymbolVisitorCallbackPipeline.h index 96a93bf7e576..5f4205bd6e08 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolVisitorCallbackPipeline.h +++ b/include/llvm/DebugInfo/CodeView/SymbolVisitorCallbackPipeline.h @@ -59,7 +59,7 @@ public: return Error::success(); \ } #define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/CVSymbolTypes.def" +#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def" private: std::vector<SymbolVisitorCallbacks *> Pipeline; diff --git a/include/llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h b/include/llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h index aaa9d2e85e13..2ef7eabdaa9d 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h +++ b/include/llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h @@ -39,7 +39,7 @@ public: return Error::success(); \ } #define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "CVSymbolTypes.def" +#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def" }; } // end namespace codeview diff --git a/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h b/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h index c064e19a7e90..77dbc91a7d38 100644 --- a/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h +++ b/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h @@ -39,7 +39,7 @@ public: Error visitKnownMember(CVMemberRecord &CVR, Name##Record &Record) override; #define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" private: StringRef getTypeName(TypeIndex Index) const; diff --git a/include/llvm/DebugInfo/CodeView/TypeDeserializer.h b/include/llvm/DebugInfo/CodeView/TypeDeserializer.h index a9c5cf42fc5b..965cdfd85f48 100644 --- a/include/llvm/DebugInfo/CodeView/TypeDeserializer.h +++ b/include/llvm/DebugInfo/CodeView/TypeDeserializer.h @@ -41,6 +41,7 @@ public: TypeDeserializer() = default; template <typename T> static Error deserializeAs(CVType &CVT, T &Record) { + Record.Kind = static_cast<TypeRecordKind>(CVT.kind()); MappingInfo I(CVT.content()); if (auto EC = I.Mapping.visitTypeBegin(CVT)) return EC; @@ -75,7 +76,7 @@ public: #define MEMBER_RECORD(EnumName, EnumVal, Name) #define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" private: template <typename RecordType> @@ -127,7 +128,7 @@ public: } #define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" private: template <typename RecordType> diff --git a/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h b/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h index 65b3a33e6548..afb8b3636361 100644 --- a/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h +++ b/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h @@ -58,7 +58,7 @@ public: Error visitKnownMember(CVMemberRecord &CVR, Name##Record &Record) override; #define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) 
#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" private: void printMemberAttributes(MemberAttributes Attrs); diff --git a/include/llvm/DebugInfo/CodeView/TypeRecord.h b/include/llvm/DebugInfo/CodeView/TypeRecord.h index 92745ebfcded..3a64a437aa4d 100644 --- a/include/llvm/DebugInfo/CodeView/TypeRecord.h +++ b/include/llvm/DebugInfo/CodeView/TypeRecord.h @@ -123,13 +123,13 @@ protected: public: TypeRecordKind getKind() const { return Kind; } -private: TypeRecordKind Kind; }; // LF_MODIFIER class ModifierRecord : public TypeRecord { public: + ModifierRecord() = default; explicit ModifierRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} ModifierRecord(TypeIndex ModifiedType, ModifierOptions Modifiers) : TypeRecord(TypeRecordKind::Modifier), ModifiedType(ModifiedType), @@ -145,6 +145,7 @@ public: // LF_PROCEDURE class ProcedureRecord : public TypeRecord { public: + ProcedureRecord() = default; explicit ProcedureRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} ProcedureRecord(TypeIndex ReturnType, CallingConvention CallConv, FunctionOptions Options, uint16_t ParameterCount, @@ -169,6 +170,7 @@ public: // LF_MFUNCTION class MemberFunctionRecord : public TypeRecord { public: + MemberFunctionRecord() = default; explicit MemberFunctionRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} MemberFunctionRecord(TypeIndex ReturnType, TypeIndex ClassType, @@ -203,6 +205,7 @@ public: // LF_LABEL class LabelRecord : public TypeRecord { public: + LabelRecord() = default; explicit LabelRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} LabelRecord(LabelType Mode) : TypeRecord(TypeRecordKind::Label), Mode(Mode) {} @@ -213,6 +216,7 @@ public: // LF_MFUNC_ID class MemberFuncIdRecord : public TypeRecord { public: + MemberFuncIdRecord() = default; explicit MemberFuncIdRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} MemberFuncIdRecord(TypeIndex ClassType, TypeIndex FunctionType, StringRef Name) @@ -230,6 +234,7 @@ public: // LF_ARGLIST class ArgListRecord : public TypeRecord { public: + ArgListRecord() = default; explicit ArgListRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} ArgListRecord(TypeRecordKind Kind, ArrayRef<TypeIndex> Indices) @@ -243,6 +248,7 @@ public: // LF_SUBSTR_LIST class StringListRecord : public TypeRecord { public: + StringListRecord() = default; explicit StringListRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} StringListRecord(TypeRecordKind Kind, ArrayRef<TypeIndex> Indices) @@ -267,6 +273,7 @@ public: static const uint32_t PointerSizeShift = 13; static const uint32_t PointerSizeMask = 0xFF; + PointerRecord() = default; explicit PointerRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} PointerRecord(TypeIndex ReferentType, uint32_t Attrs) @@ -341,6 +348,7 @@ private: // LF_NESTTYPE class NestedTypeRecord : public TypeRecord { public: + NestedTypeRecord() = default; explicit NestedTypeRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} NestedTypeRecord(TypeIndex Type, StringRef Name) : TypeRecord(TypeRecordKind::NestedType), Type(Type), Name(Name) {} @@ -355,6 +363,7 @@ public: // LF_FIELDLIST class FieldListRecord : public TypeRecord { public: + FieldListRecord() = default; explicit FieldListRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} explicit FieldListRecord(ArrayRef<uint8_t> Data) : TypeRecord(TypeRecordKind::FieldList), Data(Data) {} @@ -365,6 +374,7 @@ public: // LF_ARRAY class ArrayRecord : public TypeRecord { public: + ArrayRecord() = default; explicit ArrayRecord(TypeRecordKind 
Kind) : TypeRecord(Kind) {} ArrayRecord(TypeIndex ElementType, TypeIndex IndexType, uint64_t Size, StringRef Name) @@ -384,6 +394,7 @@ public: class TagRecord : public TypeRecord { protected: + TagRecord() = default; explicit TagRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} TagRecord(TypeRecordKind Kind, uint16_t MemberCount, ClassOptions Options, TypeIndex FieldList, StringRef Name, StringRef UniqueName) @@ -416,6 +427,7 @@ public: // LF_CLASS, LF_STRUCTURE, LF_INTERFACE class ClassRecord : public TagRecord { public: + ClassRecord() = default; explicit ClassRecord(TypeRecordKind Kind) : TagRecord(Kind) {} ClassRecord(TypeRecordKind Kind, uint16_t MemberCount, ClassOptions Options, TypeIndex FieldList, TypeIndex DerivationList, @@ -447,6 +459,7 @@ public: // LF_UNION struct UnionRecord : public TagRecord { + UnionRecord() = default; explicit UnionRecord(TypeRecordKind Kind) : TagRecord(Kind) {} UnionRecord(uint16_t MemberCount, ClassOptions Options, TypeIndex FieldList, uint64_t Size, StringRef Name, StringRef UniqueName) @@ -468,6 +481,7 @@ struct UnionRecord : public TagRecord { // LF_ENUM class EnumRecord : public TagRecord { public: + EnumRecord() = default; explicit EnumRecord(TypeRecordKind Kind) : TagRecord(Kind) {} EnumRecord(uint16_t MemberCount, ClassOptions Options, TypeIndex FieldList, StringRef Name, StringRef UniqueName, TypeIndex UnderlyingType) @@ -482,6 +496,7 @@ public: // LF_BITFIELD class BitFieldRecord : public TypeRecord { public: + BitFieldRecord() = default; explicit BitFieldRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} BitFieldRecord(TypeIndex Type, uint8_t BitSize, uint8_t BitOffset) : TypeRecord(TypeRecordKind::BitField), Type(Type), BitSize(BitSize), @@ -498,6 +513,7 @@ public: // LF_VTSHAPE class VFTableShapeRecord : public TypeRecord { public: + VFTableShapeRecord() = default; explicit VFTableShapeRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} explicit VFTableShapeRecord(ArrayRef<VFTableSlotKind> Slots) : TypeRecord(TypeRecordKind::VFTableShape), SlotsRef(Slots) {} @@ -518,6 +534,7 @@ public: // LF_TYPESERVER2 class TypeServer2Record : public TypeRecord { public: + TypeServer2Record() = default; explicit TypeServer2Record(TypeRecordKind Kind) : TypeRecord(Kind) {} TypeServer2Record(StringRef Guid, uint32_t Age, StringRef Name) : TypeRecord(TypeRecordKind::TypeServer2), Guid(Guid), Age(Age), @@ -537,6 +554,7 @@ public: // LF_STRING_ID class StringIdRecord : public TypeRecord { public: + StringIdRecord() = default; explicit StringIdRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} StringIdRecord(TypeIndex Id, StringRef String) : TypeRecord(TypeRecordKind::StringId), Id(Id), String(String) {} @@ -551,6 +569,7 @@ public: // LF_FUNC_ID class FuncIdRecord : public TypeRecord { public: + FuncIdRecord() = default; explicit FuncIdRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} FuncIdRecord(TypeIndex ParentScope, TypeIndex FunctionType, StringRef Name) : TypeRecord(TypeRecordKind::FuncId), ParentScope(ParentScope), @@ -570,6 +589,7 @@ public: // LF_UDT_SRC_LINE class UdtSourceLineRecord : public TypeRecord { public: + UdtSourceLineRecord() = default; explicit UdtSourceLineRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} UdtSourceLineRecord(TypeIndex UDT, TypeIndex SourceFile, uint32_t LineNumber) : TypeRecord(TypeRecordKind::UdtSourceLine), UDT(UDT), @@ -587,6 +607,7 @@ public: // LF_UDT_MOD_SRC_LINE class UdtModSourceLineRecord : public TypeRecord { public: + UdtModSourceLineRecord() = default; explicit UdtModSourceLineRecord(TypeRecordKind Kind) : 
TypeRecord(Kind) {} UdtModSourceLineRecord(TypeIndex UDT, TypeIndex SourceFile, uint32_t LineNumber, uint16_t Module) @@ -607,6 +628,7 @@ public: // LF_BUILDINFO class BuildInfoRecord : public TypeRecord { public: + BuildInfoRecord() = default; explicit BuildInfoRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} BuildInfoRecord(ArrayRef<TypeIndex> ArgIndices) : TypeRecord(TypeRecordKind::BuildInfo), @@ -619,6 +641,7 @@ public: // LF_VFTABLE class VFTableRecord : public TypeRecord { public: + VFTableRecord() = default; explicit VFTableRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} VFTableRecord(TypeIndex CompleteClass, TypeIndex OverriddenVFTable, uint32_t VFPtrOffset, StringRef Name, @@ -646,7 +669,7 @@ public: // LF_ONEMETHOD class OneMethodRecord : public TypeRecord { public: - OneMethodRecord() : TypeRecord(TypeRecordKind::OneMethod) {} + OneMethodRecord() = default; explicit OneMethodRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} OneMethodRecord(TypeIndex Type, MemberAttributes Attrs, int32_t VFTableOffset, StringRef Name) @@ -678,6 +701,7 @@ public: // LF_METHODLIST class MethodOverloadListRecord : public TypeRecord { public: + MethodOverloadListRecord() = default; explicit MethodOverloadListRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} MethodOverloadListRecord(ArrayRef<OneMethodRecord> Methods) : TypeRecord(TypeRecordKind::MethodOverloadList), Methods(Methods) {} @@ -689,6 +713,7 @@ public: /// For method overload sets. LF_METHOD class OverloadedMethodRecord : public TypeRecord { public: + OverloadedMethodRecord() = default; explicit OverloadedMethodRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} OverloadedMethodRecord(uint16_t NumOverloads, TypeIndex MethodList, StringRef Name) @@ -706,6 +731,7 @@ public: // LF_MEMBER class DataMemberRecord : public TypeRecord { public: + DataMemberRecord() = default; explicit DataMemberRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} DataMemberRecord(MemberAttributes Attrs, TypeIndex Type, uint64_t Offset, StringRef Name) @@ -730,6 +756,7 @@ public: // LF_STMEMBER class StaticDataMemberRecord : public TypeRecord { public: + StaticDataMemberRecord() = default; explicit StaticDataMemberRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} StaticDataMemberRecord(MemberAttributes Attrs, TypeIndex Type, StringRef Name) : TypeRecord(TypeRecordKind::StaticDataMember), Attrs(Attrs), Type(Type), @@ -750,6 +777,7 @@ public: // LF_ENUMERATE class EnumeratorRecord : public TypeRecord { public: + EnumeratorRecord() = default; explicit EnumeratorRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} EnumeratorRecord(MemberAttributes Attrs, APSInt Value, StringRef Name) : TypeRecord(TypeRecordKind::Enumerator), Attrs(Attrs), @@ -770,6 +798,7 @@ public: // LF_VFUNCTAB class VFPtrRecord : public TypeRecord { public: + VFPtrRecord() = default; explicit VFPtrRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} VFPtrRecord(TypeIndex Type) : TypeRecord(TypeRecordKind::VFPtr), Type(Type) {} @@ -782,6 +811,7 @@ public: // LF_BCLASS, LF_BINTERFACE class BaseClassRecord : public TypeRecord { public: + BaseClassRecord() = default; explicit BaseClassRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} BaseClassRecord(MemberAttributes Attrs, TypeIndex Type, uint64_t Offset) : TypeRecord(TypeRecordKind::BaseClass), Attrs(Attrs), Type(Type), @@ -802,6 +832,7 @@ public: // LF_VBCLASS, LF_IVBCLASS class VirtualBaseClassRecord : public TypeRecord { public: + VirtualBaseClassRecord() = default; explicit VirtualBaseClassRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} 
VirtualBaseClassRecord(TypeRecordKind Kind, MemberAttributes Attrs, TypeIndex BaseType, TypeIndex VBPtrType, @@ -831,6 +862,7 @@ public: /// together. The first will end in an LF_INDEX record that points to the next. class ListContinuationRecord : public TypeRecord { public: + ListContinuationRecord() = default; explicit ListContinuationRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} ListContinuationRecord(TypeIndex ContinuationIndex) : TypeRecord(TypeRecordKind::ListContinuation), diff --git a/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h b/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h index 924ca0470fad..6156223b2560 100644 --- a/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h +++ b/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h @@ -37,7 +37,7 @@ public: Error visitKnownMember(CVMemberRecord &CVR, Name##Record &Record) override; #define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" private: Optional<TypeLeafKind> TypeKind; diff --git a/include/llvm/DebugInfo/CodeView/TypeSerializer.h b/include/llvm/DebugInfo/CodeView/TypeSerializer.h index 435c43f7edcb..1dee86a1da79 100644 --- a/include/llvm/DebugInfo/CodeView/TypeSerializer.h +++ b/include/llvm/DebugInfo/CodeView/TypeSerializer.h @@ -106,7 +106,7 @@ public: return visitKnownMemberImpl<Name##Record>(CVR, Record); \ } #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" private: template <typename RecordKind> diff --git a/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h b/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h index 7bdc9ecb20cf..907ed1010e5b 100644 --- a/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h +++ b/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h @@ -13,8 +13,8 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" -#include "llvm/DebugInfo/CodeView/TypeSerializer.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/TypeSerializer.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Error.h" #include <algorithm> diff --git a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h index ed48df33249f..126fb8abb0da 100644 --- a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h +++ b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h @@ -94,7 +94,7 @@ public: } #define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" private: template <typename T> Error visitKnownRecordImpl(CVType &CVR, T &Record) { diff --git a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h index 0ea754deb425..d7a473306bc2 100644 --- a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h +++ b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h @@ -58,7 +58,11 @@ public: #define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" +#undef TYPE_RECORD +#undef 
TYPE_RECORD_ALIAS +#undef MEMBER_RECORD +#undef MEMBER_RECORD_ALIAS }; } // end namespace codeview diff --git a/include/llvm/DebugInfo/DIContext.h b/include/llvm/DebugInfo/DIContext.h index d51408122fc9..2ab1c9508522 100644 --- a/include/llvm/DebugInfo/DIContext.h +++ b/include/llvm/DebugInfo/DIContext.h @@ -146,6 +146,14 @@ enum DIDumpType { DIDT_TUIndex, }; +/// Container for dump options that control which debug information will be +/// dumped. +struct DIDumpOptions { + DIDumpType DumpType = DIDT_All; + bool DumpEH = false; + bool SummarizeTypes = false; +}; + class DIContext { public: enum DIContextKind { @@ -158,8 +166,7 @@ public: DIContextKind getKind() const { return Kind; } - virtual void dump(raw_ostream &OS, DIDumpType DumpType = DIDT_All, - bool DumpEH = false, bool SummarizeTypes = false) = 0; + virtual void dump(raw_ostream &OS, DIDumpOptions DumpOpts) = 0; virtual bool verify(raw_ostream &OS, DIDumpType DumpType = DIDT_All) { // No verifier? Just say things went well. diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h index 7fa68f3f2314..519ecf618558 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFContext.h +++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h @@ -105,8 +105,7 @@ public: return DICtx->getKind() == CK_DWARF; } - void dump(raw_ostream &OS, DIDumpType DumpType = DIDT_All, - bool DumpEH = false, bool SummarizeTypes = false) override; + void dump(raw_ostream &OS, DIDumpOptions DumpOpts) override; bool verify(raw_ostream &OS, DIDumpType DumpType = DIDT_All) override; diff --git a/include/llvm/DebugInfo/PDB/Native/RawConstants.h b/include/llvm/DebugInfo/PDB/Native/RawConstants.h index f5d4df8feb2e..e1bd86b2870b 100644 --- a/include/llvm/DebugInfo/PDB/Native/RawConstants.h +++ b/include/llvm/DebugInfo/PDB/Native/RawConstants.h @@ -12,7 +12,6 @@ #include "llvm/ADT/BitmaskEnum.h" #include "llvm/DebugInfo/CodeView/CodeView.h" - #include <cstdint> namespace llvm { diff --git a/include/llvm/DebugInfo/PDB/Native/TpiHashing.h b/include/llvm/DebugInfo/PDB/Native/TpiHashing.h index dd2698c354a2..156abb59a6be 100644 --- a/include/llvm/DebugInfo/PDB/Native/TpiHashing.h +++ b/include/llvm/DebugInfo/PDB/Native/TpiHashing.h @@ -38,7 +38,7 @@ public: #define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) #define MEMBER_RECORD(EnumName, EnumVal, Name) #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" private: template <typename RecordKind> diff --git a/include/llvm/DebugInfo/PDB/PDBContext.h b/include/llvm/DebugInfo/PDB/PDBContext.h index 84ab8ed173cb..0ce49f5ef922 100644 --- a/include/llvm/DebugInfo/PDB/PDBContext.h +++ b/include/llvm/DebugInfo/PDB/PDBContext.h @@ -41,8 +41,7 @@ namespace pdb { return DICtx->getKind() == CK_PDB; } - void dump(raw_ostream &OS, DIDumpType DumpType = DIDT_All, - bool DumpEH = false, bool SummarizeTypes = false) override; + void dump(raw_ostream &OS, DIDumpOptions DIDumpOpts) override; DILineInfo getLineInfoForAddress( uint64_t Address, diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h index ace309ed95a4..687863857698 100644 --- a/include/llvm/IR/Attributes.h +++ b/include/llvm/IR/Attributes.h @@ -228,34 +228,31 @@ public: bool operator==(const AttributeSet &O) { return SetNode == O.SetNode; } bool operator!=(const AttributeSet &O) { return !(*this == O); } - /// Add an argument attribute. Because - /// attribute sets are immutable, this returns a new set. 
- AttributeSet addAttribute(LLVMContext &C, - Attribute::AttrKind Kind) const; + /// Add an argument attribute. Returns a new set because attribute sets are + /// immutable. + AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const; - /// Add a target-dependent attribute. Because - /// attribute sets are immutable, this returns a new set. + /// Add a target-dependent attribute. Returns a new set because attribute sets + /// are immutable. AttributeSet addAttribute(LLVMContext &C, StringRef Kind, StringRef Value = StringRef()) const; - /// Add attributes to the attribute set. Because - /// attribute sets are immutable, this returns a new set. + /// Add attributes to the attribute set. Returns a new set because attribute + /// sets are immutable. AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const; - /// Remove the specified attribute from this set. Because - /// attribute sets are immutable, this returns a new set. - AttributeSet removeAttribute(LLVMContext &C, - Attribute::AttrKind Kind) const; + /// Remove the specified attribute from this set. Returns a new set because + /// attribute sets are immutable. + AttributeSet removeAttribute(LLVMContext &C, Attribute::AttrKind Kind) const; - /// Remove the specified attribute from this set. Because - /// attribute sets are immutable, this returns a new set. - AttributeSet removeAttribute(LLVMContext &C, - StringRef Kind) const; + /// Remove the specified attribute from this set. Returns a new set because + /// attribute sets are immutable. + AttributeSet removeAttribute(LLVMContext &C, StringRef Kind) const; - /// Remove the specified attributes from this set. Because - /// attribute sets are immutable, this returns a new set. + /// Remove the specified attributes from this set. Returns a new set because + /// attribute sets are immutable. AttributeSet removeAttributes(LLVMContext &C, - const AttrBuilder &AttrsToRemove) const; + const AttrBuilder &AttrsToRemove) const; /// Return the number of attributes in this set. unsigned getNumAttributes() const; @@ -377,73 +374,138 @@ public: static AttributeList get(LLVMContext &C, unsigned Index, const AttrBuilder &B); - /// Add an argument attribute to the list. Returns a new list because - /// attribute lists are immutable. - AttributeList addParamAttribute(LLVMContext &C, unsigned ArgNo, - Attribute::AttrKind Kind) const { - return addAttribute(C, ArgNo + FirstArgIndex, Kind); - } - - /// \brief Add an attribute to the attribute set at the given index. Because - /// attribute sets are immutable, this returns a new set. + /// \brief Add an attribute to the attribute set at the given index. + /// Returns a new list because attribute lists are immutable. AttributeList addAttribute(LLVMContext &C, unsigned Index, Attribute::AttrKind Kind) const; - /// \brief Add an attribute to the attribute set at the given index. Because - /// attribute sets are immutable, this returns a new set. + /// \brief Add an attribute to the attribute set at the given index. + /// Returns a new list because attribute lists are immutable. AttributeList addAttribute(LLVMContext &C, unsigned Index, StringRef Kind, StringRef Value = StringRef()) const; - /// Add an attribute to the attribute set at the given indices. Because - /// attribute sets are immutable, this returns a new set. - AttributeList addAttribute(LLVMContext &C, ArrayRef<unsigned> Indices, - Attribute A) const; + /// Add an attribute to the attribute set at the given index. 
+ /// Returns a new list because attribute lists are immutable. + AttributeList addAttribute(LLVMContext &C, unsigned Index, Attribute A) const; - /// \brief Add attributes to the attribute set at the given index. Because - /// attribute sets are immutable, this returns a new set. + /// \brief Add attributes to the attribute set at the given index. + /// Returns a new list because attribute lists are immutable. AttributeList addAttributes(LLVMContext &C, unsigned Index, const AttrBuilder &B) const; + /// Add an argument attribute to the list. Returns a new list because + /// attribute lists are immutable. + AttributeList addParamAttribute(LLVMContext &C, unsigned ArgNo, + Attribute::AttrKind Kind) const { + return addAttribute(C, ArgNo + FirstArgIndex, Kind); + } + + /// Add an argument attribute to the list. Returns a new list because + /// attribute lists are immutable. + AttributeList addParamAttribute(LLVMContext &C, unsigned ArgNo, + StringRef Kind, + StringRef Value = StringRef()) const { + return addAttribute(C, ArgNo + FirstArgIndex, Kind, Value); + } + + /// Add an attribute to the attribute list at the given arg indices. Returns a + /// new list because attribute lists are immutable. + AttributeList addParamAttribute(LLVMContext &C, ArrayRef<unsigned> ArgNos, + Attribute A) const; + + /// Add an argument attribute to the list. Returns a new list because + /// attribute lists are immutable. + AttributeList addParamAttributes(LLVMContext &C, unsigned ArgNo, + const AttrBuilder &B) const { + return addAttributes(C, ArgNo + FirstArgIndex, B); + } + /// \brief Remove the specified attribute at the specified index from this - /// attribute list. Because attribute lists are immutable, this returns the - /// new list. + /// attribute list. Returns a new list because attribute lists are immutable. AttributeList removeAttribute(LLVMContext &C, unsigned Index, Attribute::AttrKind Kind) const; /// \brief Remove the specified attribute at the specified index from this - /// attribute list. Because attribute lists are immutable, this returns the - /// new list. + /// attribute list. Returns a new list because attribute lists are immutable. AttributeList removeAttribute(LLVMContext &C, unsigned Index, StringRef Kind) const; /// \brief Remove the specified attributes at the specified index from this - /// attribute list. Because attribute lists are immutable, this returns the - /// new list. + /// attribute list. Returns a new list because attribute lists are immutable. AttributeList removeAttributes(LLVMContext &C, unsigned Index, const AttrBuilder &AttrsToRemove) const; /// \brief Remove all attributes at the specified index from this - /// attribute list. Because attribute lists are immutable, this returns the - /// new list. + /// attribute list. Returns a new list because attribute lists are immutable. AttributeList removeAttributes(LLVMContext &C, unsigned Index) const; - /// \brief Add the dereferenceable attribute to the attribute set at the given - /// index. Because attribute sets are immutable, this returns a new set. + /// \brief Remove the specified attribute at the specified arg index from this + /// attribute list. Returns a new list because attribute lists are immutable. + AttributeList removeParamAttribute(LLVMContext &C, unsigned ArgNo, + Attribute::AttrKind Kind) const { + return removeAttribute(C, ArgNo + FirstArgIndex, Kind); + } + + /// \brief Remove the specified attribute at the specified arg index from this + /// attribute list. 
Returns a new list because attribute lists are immutable. + AttributeList removeParamAttribute(LLVMContext &C, unsigned ArgNo, + StringRef Kind) const { + return removeAttribute(C, ArgNo + FirstArgIndex, Kind); + } + + /// \brief Remove the specified attribute at the specified arg index from this + /// attribute list. Returns a new list because attribute lists are immutable. + AttributeList removeParamAttributes(LLVMContext &C, unsigned ArgNo, + const AttrBuilder &AttrsToRemove) const { + return removeAttributes(C, ArgNo + FirstArgIndex, AttrsToRemove); + } + + /// \brief Remove all attributes at the specified arg index from this + /// attribute list. Returns a new list because attribute lists are immutable. + AttributeList removeParamAttributes(LLVMContext &C, unsigned ArgNo) const { + return removeAttributes(C, ArgNo + FirstArgIndex); + } + + /// \Brief Add the dereferenceable attribute to the attribute set at the given + /// index. Returns a new list because attribute lists are immutable. AttributeList addDereferenceableAttr(LLVMContext &C, unsigned Index, uint64_t Bytes) const; + /// \Brief Add the dereferenceable attribute to the attribute set at the given + /// arg index. Returns a new list because attribute lists are immutable. + AttributeList addDereferenceableParamAttr(LLVMContext &C, unsigned ArgNo, + uint64_t Bytes) const { + return addDereferenceableAttr(C, ArgNo + FirstArgIndex, Bytes); + } + /// \brief Add the dereferenceable_or_null attribute to the attribute set at - /// the given index. Because attribute sets are immutable, this returns a new - /// set. + /// the given index. Returns a new list because attribute lists are immutable. AttributeList addDereferenceableOrNullAttr(LLVMContext &C, unsigned Index, uint64_t Bytes) const; + /// \brief Add the dereferenceable_or_null attribute to the attribute set at + /// the given arg index. Returns a new list because attribute lists are + /// immutable. + AttributeList addDereferenceableOrNullParamAttr(LLVMContext &C, + unsigned ArgNo, + uint64_t Bytes) const { + return addDereferenceableOrNullAttr(C, ArgNo + FirstArgIndex, Bytes); + } + /// Add the allocsize attribute to the attribute set at the given index. - /// Because attribute sets are immutable, this returns a new set. + /// Returns a new list because attribute lists are immutable. AttributeList addAllocSizeAttr(LLVMContext &C, unsigned Index, unsigned ElemSizeArg, const Optional<unsigned> &NumElemsArg); + /// Add the allocsize attribute to the attribute set at the given arg index. + /// Returns a new list because attribute lists are immutable. + AttributeList addAllocSizeParamAttr(LLVMContext &C, unsigned ArgNo, + unsigned ElemSizeArg, + const Optional<unsigned> &NumElemsArg) { + return addAllocSizeAttr(C, ArgNo + FirstArgIndex, ElemSizeArg, NumElemsArg); + } + //===--------------------------------------------------------------------===// // AttributeList Accessors //===--------------------------------------------------------------------===// @@ -473,6 +535,21 @@ public: /// \brief Return true if attribute exists at the given index. 
bool hasAttributes(unsigned Index) const; + /// \brief Return true if the attribute exists for the given argument + bool hasParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const { + return hasAttribute(ArgNo + FirstArgIndex, Kind); + } + + /// \brief Return true if the attribute exists for the given argument + bool hasParamAttr(unsigned ArgNo, StringRef Kind) const { + return hasAttribute(ArgNo + FirstArgIndex, Kind); + } + + /// \brief Return true if attributes exists for the given argument + bool hasParamAttrs(unsigned ArgNo) const { + return hasAttributes(ArgNo + FirstArgIndex); + } + /// \brief Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but /// may be faster. bool hasFnAttribute(Attribute::AttrKind Kind) const; @@ -496,6 +573,16 @@ public: /// \brief Return the attribute object that exists at the given index. Attribute getAttribute(unsigned Index, StringRef Kind) const; + /// \brief Return the attribute object that exists at the arg index. + Attribute getParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const { + return getAttribute(ArgNo + FirstArgIndex, Kind); + } + + /// \brief Return the attribute object that exists at the given index. + Attribute getParamAttr(unsigned ArgNo, StringRef Kind) const { + return getAttribute(ArgNo + FirstArgIndex, Kind); + } + /// \brief Return the alignment of the return value. unsigned getRetAlignment() const; @@ -508,10 +595,22 @@ public: /// \brief Get the number of dereferenceable bytes (or zero if unknown). uint64_t getDereferenceableBytes(unsigned Index) const; + /// \brief Get the number of dereferenceable bytes (or zero if unknown) of an + /// arg. + uint64_t getParamDereferenceableBytes(unsigned ArgNo) const { + return getDereferenceableBytes(ArgNo + FirstArgIndex); + } + /// \brief Get the number of dereferenceable_or_null bytes (or zero if /// unknown). uint64_t getDereferenceableOrNullBytes(unsigned Index) const; + /// \brief Get the number of dereferenceable_or_null bytes (or zero if + /// unknown) of an arg. + uint64_t getParamDereferenceableOrNullBytes(unsigned ArgNo) const { + return getDereferenceableOrNullBytes(ArgNo + FirstArgIndex); + } + /// Get the allocsize argument numbers (or pair(0, 0) if unknown). std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs(unsigned Index) const; diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h index f27e5c50a47f..29f512ddd076 100644 --- a/include/llvm/IR/Function.h +++ b/include/llvm/IR/Function.h @@ -214,10 +214,6 @@ public: addAttribute(AttributeList::FunctionIndex, Attr); } - void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { - addAttribute(ArgNo + AttributeList::FirstArgIndex, Kind); - } - /// @brief Remove function attributes from this function. void removeFnAttr(Attribute::AttrKind Kind) { removeAttribute(AttributeList::FunctionIndex, Kind); @@ -229,10 +225,6 @@ public: getContext(), AttributeList::FunctionIndex, Kind)); } - void removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { - removeAttribute(ArgNo + AttributeList::FirstArgIndex, Kind); - } - /// \brief Set the entry count for this function. /// /// Entry count is the number of times this function was executed based on @@ -299,6 +291,15 @@ public: /// @brief adds the attributes to the list of attributes. void addAttributes(unsigned i, const AttrBuilder &Attrs); + /// @brief adds the attribute to the list of attributes for the given arg. 
+ void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind); + + /// @brief adds the attribute to the list of attributes for the given arg. + void addParamAttr(unsigned ArgNo, Attribute Attr); + + /// @brief adds the attributes to the list of attributes for the given arg. + void addParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs); + /// @brief removes the attribute from the list of attributes. void removeAttribute(unsigned i, Attribute::AttrKind Kind); @@ -308,6 +309,15 @@ public: /// @brief removes the attributes from the list of attributes. void removeAttributes(unsigned i, const AttrBuilder &Attrs); + /// @brief removes the attribute from the list of attributes. + void removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind); + + /// @brief removes the attribute from the list of attributes. + void removeParamAttr(unsigned ArgNo, StringRef Kind); + + /// @brief removes the attribute from the list of attributes. + void removeParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs); + /// @brief check if an attributes is in the list of attributes. bool hasAttribute(unsigned i, Attribute::AttrKind Kind) const { return getAttributes().hasAttribute(i, Kind); @@ -329,10 +339,18 @@ public: /// @brief adds the dereferenceable attribute to the list of attributes. void addDereferenceableAttr(unsigned i, uint64_t Bytes); + /// @brief adds the dereferenceable attribute to the list of attributes for + /// the given arg. + void addDereferenceableParamAttr(unsigned ArgNo, uint64_t Bytes); + /// @brief adds the dereferenceable_or_null attribute to the list of /// attributes. void addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes); + /// @brief adds the dereferenceable_or_null attribute to the list of + /// attributes for the given arg. + void addDereferenceableOrNullParamAttr(unsigned ArgNo, uint64_t Bytes); + /// @brief Extract the alignment for a call or parameter (0=unknown). unsigned getParamAlignment(unsigned ArgNo) const { return AttributeSets.getParamAlignment(ArgNo); @@ -345,6 +363,12 @@ public: return AttributeSets.getDereferenceableBytes(i); } + /// @brief Extract the number of dereferenceable bytes for a parameter. + /// @param ArgNo Index of an argument, with 0 being the first function arg. + uint64_t getParamDereferenceableBytes(unsigned ArgNo) const { + return AttributeSets.getParamDereferenceableBytes(ArgNo); + } + /// @brief Extract the number of dereferenceable_or_null bytes for a call or /// parameter (0=unknown). /// @param i AttributeList index, referring to a return value or argument. @@ -352,6 +376,13 @@ public: return AttributeSets.getDereferenceableOrNullBytes(i); } + /// @brief Extract the number of dereferenceable_or_null bytes for a + /// parameter. + /// @param ArgNo AttributeList ArgNo, referring to an argument. + uint64_t getParamDereferenceableOrNullBytes(unsigned ArgNo) const { + return AttributeSets.getParamDereferenceableOrNullBytes(ArgNo); + } + /// @brief Determine if the function does not access memory. 
bool doesNotAccessMemory() const { return hasFnAttribute(Attribute::ReadNone); diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h index 6fab59613dd6..1f7990b99ebe 100644 --- a/include/llvm/IR/Instructions.h +++ b/include/llvm/IR/Instructions.h @@ -1660,6 +1660,9 @@ public: /// Adds the attribute to the indicated argument void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind); + /// Adds the attribute to the indicated argument + void addParamAttr(unsigned ArgNo, Attribute Attr); + /// removes the attribute from the list of attributes. void removeAttribute(unsigned i, Attribute::AttrKind Kind); @@ -1669,6 +1672,9 @@ public: /// Removes the attribute from the given argument void removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind); + /// Removes the attribute from the given argument + void removeParamAttr(unsigned ArgNo, StringRef Kind); + /// adds the dereferenceable attribute to the list of attributes. void addDereferenceableAttr(unsigned i, uint64_t Bytes); @@ -1704,6 +1710,18 @@ public: return getAttributes().getAttribute(i, Kind); } + /// Get the attribute of a given kind from a given arg + Attribute getParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const { + assert(ArgNo < getNumArgOperands() && "Out of bounds"); + return getAttributes().getParamAttr(ArgNo, Kind); + } + + /// Get the attribute of a given kind from a given arg + Attribute getParamAttr(unsigned ArgNo, StringRef Kind) const { + assert(ArgNo < getNumArgOperands() && "Out of bounds"); + return getAttributes().getParamAttr(ArgNo, Kind); + } + /// Return true if the data operand at index \p i has the attribute \p /// A. /// diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 3ca21c15577b..5b9796d4fba6 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -188,6 +188,7 @@ void initializeLintPass(PassRegistry&); void initializeLiveDebugValuesPass(PassRegistry&); void initializeLiveDebugVariablesPass(PassRegistry&); void initializeLiveIntervalsPass(PassRegistry&); +void initializeLiveRangeShrinkPass(PassRegistry&); void initializeLiveRegMatrixPass(PassRegistry&); void initializeLiveStacksPass(PassRegistry&); void initializeLiveVariablesPass(PassRegistry&); diff --git a/include/llvm/Object/WindowsResource.h b/include/llvm/Object/WindowsResource.h index f94ad09ce0c6..2484f551aee0 100644 --- a/include/llvm/Object/WindowsResource.h +++ b/include/llvm/Object/WindowsResource.h @@ -30,11 +30,18 @@ #define LLVM_INCLUDE_LLVM_OBJECT_RESFILE_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Object/Binary.h" +#include "llvm/Object/Error.h" #include "llvm/Support/BinaryByteStream.h" #include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" +#include "llvm/Support/ScopedPrinter.h" + +#include <map> namespace llvm { namespace object { @@ -44,23 +51,44 @@ class WindowsResource; class ResourceEntryRef { public: Error moveNext(bool &End); + bool checkTypeString() const { return IsStringType; } + ArrayRef<UTF16> getTypeString() const { return Type; } + uint16_t getTypeID() const { return TypeID; } + bool checkNameString() const { return IsStringName; } + ArrayRef<UTF16> getNameString() const { return Name; } + uint16_t getNameID() const { return NameID; } + uint16_t getLanguage() const { return Suffix->Language; } private: friend class WindowsResource; ResourceEntryRef(BinaryStreamRef Ref, 
const WindowsResource *Owner, Error &Err); + Error loadNext(); + struct HeaderSuffix { + support::ulittle32_t DataVersion; + support::ulittle16_t MemoryFlags; + support::ulittle16_t Language; + support::ulittle32_t Version; + support::ulittle32_t Characteristics; + }; + BinaryStreamReader Reader; - BinaryStreamRef HeaderBytes; - BinaryStreamRef DataBytes; + bool IsStringType; + ArrayRef<UTF16> Type; + uint16_t TypeID; + bool IsStringName; + ArrayRef<UTF16> Name; + uint16_t NameID; + const HeaderSuffix *Suffix = nullptr; + ArrayRef<uint8_t> Data; const WindowsResource *OwningRes = nullptr; }; class WindowsResource : public Binary { public: - ~WindowsResource() override; Expected<ResourceEntryRef> getHeadEntry(); static bool classof(const Binary *V) { return V->isWinRes(); } @@ -76,6 +104,36 @@ private: BinaryByteStream BBS; }; +class WindowsResourceParser { +public: + WindowsResourceParser(); + + Error parse(WindowsResource *WR); + + void printTree() const; + +private: + class TreeNode { + public: + TreeNode() = default; + explicit TreeNode(ArrayRef<UTF16> Ref); + void addEntry(const ResourceEntryRef &Entry); + void print(ScopedPrinter &Writer, StringRef Name) const; + + private: + TreeNode &addTypeNode(const ResourceEntryRef &Entry); + TreeNode &addNameNode(const ResourceEntryRef &Entry); + TreeNode &addLanguageNode(const ResourceEntryRef &Entry); + TreeNode &addChild(uint32_t ID); + TreeNode &addChild(ArrayRef<UTF16> NameRef); + std::vector<UTF16> Name; + std::map<uint32_t, std::unique_ptr<TreeNode>> IDChildren; + std::map<std::string, std::unique_ptr<TreeNode>> StringChildren; + }; + + TreeNode Root; +}; + } // namespace object } // namespace llvm diff --git a/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h b/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h new file mode 100644 index 000000000000..6ddae2e2b41c --- /dev/null +++ b/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h @@ -0,0 +1,91 @@ +//===- CodeViewYAMLDebugSections.h - CodeView YAMLIO debug sections -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines classes for handling the YAML representation of CodeView +// Debug Info. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OBJECTYAML_CODEVIEWYAMLDEBUGSECTIONS_H +#define LLVM_OBJECTYAML_CODEVIEWYAMLDEBUGSECTIONS_H + +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/DebugSubsection.h" +#include "llvm/ObjectYAML/YAML.h" + +namespace llvm { +namespace CodeViewYAML { +namespace detail { +struct C13FragmentBase; +} + +struct SourceLineEntry { + uint32_t Offset; + uint32_t LineStart; + uint32_t EndDelta; + bool IsStatement; +}; + +struct SourceColumnEntry { + uint16_t StartColumn; + uint16_t EndColumn; +}; + +struct SourceLineBlock { + StringRef FileName; + std::vector<SourceLineEntry> Lines; + std::vector<SourceColumnEntry> Columns; +}; + +struct HexFormattedString { + std::vector<uint8_t> Bytes; +}; + +struct SourceFileChecksumEntry { + StringRef FileName; + codeview::FileChecksumKind Kind; + HexFormattedString ChecksumBytes; +}; + +struct SourceLineInfo { + uint32_t RelocOffset; + uint32_t RelocSegment; + codeview::LineFlags Flags; + uint32_t CodeSize; + + std::vector<SourceLineBlock> Blocks; +}; + +struct InlineeSite { + uint32_t Inlinee; + StringRef FileName; + uint32_t SourceLineNum; + std::vector<StringRef> ExtraFiles; +}; + +struct InlineeInfo { + bool HasExtraFiles; + std::vector<InlineeSite> Sites; +}; + +struct SourceFileInfo { + std::vector<SourceFileChecksumEntry> FileChecksums; + std::vector<SourceLineInfo> LineFragments; + std::vector<InlineeInfo> Inlinees; +}; + +struct C13DebugSection { + std::vector<detail::C13FragmentBase> Fragments; +}; +} // namespace CodeViewYAML +} // namespace llvm + +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceFileInfo) + +#endif diff --git a/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h b/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h new file mode 100644 index 000000000000..ee4e2ac9d404 --- /dev/null +++ b/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h @@ -0,0 +1,41 @@ +//===- CodeViewYAMLSymbols.h - CodeView YAMLIO Symbol implementation ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines classes for handling the YAML representation of CodeView +// Debug Info. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OBJECTYAML_CODEVIEWYAMLSYMBOLS_H +#define LLVM_OBJECTYAML_CODEVIEWYAMLSYMBOLS_H + +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/SymbolRecord.h" +#include "llvm/ObjectYAML/YAML.h" + +namespace llvm { +namespace CodeViewYAML { +namespace detail { +struct SymbolRecordBase; +} + +struct SymbolRecord { + std::shared_ptr<detail::SymbolRecordBase> Symbol; + + codeview::CVSymbol toCodeViewSymbol(BumpPtrAllocator &Allocator) const; + static Expected<SymbolRecord> fromCodeViewSymbol(codeview::CVSymbol Symbol); +}; + +} // namespace CodeViewYAML +} // namespace llvm + +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SymbolRecord) +LLVM_YAML_IS_SEQUENCE_VECTOR(CodeViewYAML::SymbolRecord) + +#endif diff --git a/include/llvm/ObjectYAML/CodeViewYAMLTypes.h b/include/llvm/ObjectYAML/CodeViewYAMLTypes.h new file mode 100644 index 000000000000..a57ada34a4fa --- /dev/null +++ b/include/llvm/ObjectYAML/CodeViewYAMLTypes.h @@ -0,0 +1,48 @@ +//===- CodeViewYAMLTypes.h - CodeView YAMLIO Type Record implementation ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines classes for handling the YAML representation of CodeView +// Debug Info. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OBJECTYAML_CODEVIEWYAMLTYPES_H +#define LLVM_OBJECTYAML_CODEVIEWYAMLTYPES_H + +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/ObjectYAML/YAML.h" + +namespace llvm { +namespace CodeViewYAML { +namespace detail { +struct LeafRecordBase; +struct MemberRecordBase; +} + +struct MemberRecord { + std::shared_ptr<detail::MemberRecordBase> Member; +}; + +struct LeafRecord { + std::shared_ptr<detail::LeafRecordBase> Leaf; + + codeview::CVType toCodeViewRecord(BumpPtrAllocator &Allocator) const; + static Expected<LeafRecord> fromCodeViewRecord(codeview::CVType Type); +}; +} // namespace CodeViewYAML +} // namespace llvm + +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::LeafRecord) +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::MemberRecord) + +LLVM_YAML_IS_SEQUENCE_VECTOR(CodeViewYAML::LeafRecord) +LLVM_YAML_IS_SEQUENCE_VECTOR(CodeViewYAML::MemberRecord) + +#endif diff --git a/include/llvm/Passes/PassBuilder.h b/include/llvm/Passes/PassBuilder.h index efa36d957fbd..12b05e4ff0c5 100644 --- a/include/llvm/Passes/PassBuilder.h +++ b/include/llvm/Passes/PassBuilder.h @@ -192,6 +192,39 @@ public: buildFunctionSimplificationPipeline(OptimizationLevel Level, bool DebugLogging = false); + /// Construct the core LLVM module canonicalization and simplification + /// pipeline. + /// + /// This pipeline focuses on canonicalizing and simplifying the entire module + /// of IR. Much like the function simplification pipeline above, it is + /// suitable to run repeatedly over the IR and is not expected to destroy + /// important information. It does, however, perform inlining and other + /// heuristic based simplifications that are not strictly reversible. + /// + /// Note that \p Level cannot be `O0` here. The pipelines produced are + /// only intended for use when attempting to optimize code. 
If frontends + /// require some transformations for semantic reasons, they should explicitly + /// build them. + ModulePassManager + buildModuleSimplificationPipeline(OptimizationLevel Level, + bool DebugLogging = false); + + /// Construct the core LLVM module optimization pipeline. + /// + /// This pipeline focuses on optimizing the execution speed of the IR. It + /// uses cost modeling and thresholds to balance code growth against runtime + /// improvements. It includes vectorization and other information destroying + /// transformations. It also cannot generally be run repeatedly on a module + /// without potentially seriously regressing either runtime performance of + /// the code or serious code size growth. + /// + /// Note that \p Level cannot be `O0` here. The pipelines produced are + /// only intended for use when attempting to optimize code. If frontends + /// require some transformations for semantic reasons, they should explicitly + /// build them. + ModulePassManager buildModuleOptimizationPipeline(OptimizationLevel Level, + bool DebugLogging = false); + /// Build a per-module default optimization pipeline. /// /// This provides a good default optimization pipeline for per-module @@ -206,6 +239,36 @@ public: ModulePassManager buildPerModuleDefaultPipeline(OptimizationLevel Level, bool DebugLogging = false); + /// Build a pre-link, ThinLTO-targeting default optimization pipeline to + /// a pass manager. + /// + /// This adds the pre-link optimizations tuned to prepare a module for + /// a ThinLTO run. It works to minimize the IR which needs to be analyzed + /// without making irreversible decisions which could be made better during + /// the LTO run. + /// + /// Note that \p Level cannot be `O0` here. The pipelines produced are + /// only intended for use when attempting to optimize code. If frontends + /// require some transformations for semantic reasons, they should explicitly + /// build them. + ModulePassManager + buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level, + bool DebugLogging = false); + + /// Build an ThinLTO default optimization pipeline to a pass manager. + /// + /// This provides a good default optimization pipeline for link-time + /// optimization and code generation. It is particularly tuned to fit well + /// when IR coming into the LTO phase was first run through \c + /// addPreLinkLTODefaultPipeline, and the two coordinate closely. + /// + /// Note that \p Level cannot be `O0` here. The pipelines produced are + /// only intended for use when attempting to optimize code. If frontends + /// require some transformations for semantic reasons, they should explicitly + /// build them. + ModulePassManager buildThinLTODefaultPipeline(OptimizationLevel Level, + bool DebugLogging = false); + /// Build a pre-link, LTO-targeting default optimization pipeline to a pass /// manager. 
/// diff --git a/include/llvm/Support/ARMTargetParser.def b/include/llvm/Support/ARMTargetParser.def index 32dc57a0fedf..65cb2715a6a5 100644 --- a/include/llvm/Support/ARMTargetParser.def +++ b/include/llvm/Support/ARMTargetParser.def @@ -206,7 +206,7 @@ ARM_CPU_NAME("cortex-a5", AK_ARMV7A, FK_NEON_VFPV4, false, ARM_CPU_NAME("cortex-a7", AK_ARMV7A, FK_NEON_VFPV4, false, (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM | ARM::AEK_HWDIVTHUMB)) -ARM_CPU_NAME("cortex-a8", AK_ARMV7A, FK_NEON, true, ARM::AEK_SEC) +ARM_CPU_NAME("cortex-a8", AK_ARMV7A, FK_NEON, false, ARM::AEK_SEC) ARM_CPU_NAME("cortex-a9", AK_ARMV7A, FK_NEON_FP16, false, (ARM::AEK_SEC | ARM::AEK_MP)) ARM_CPU_NAME("cortex-a12", AK_ARMV7A, FK_NEON_VFPV4, false, (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM | @@ -236,7 +236,7 @@ ARM_CPU_NAME("cortex-m23", AK_ARMV8MBaseline, FK_NONE, false, ARM::AEK_NONE) ARM_CPU_NAME("cortex-m33", AK_ARMV8MMainline, FK_FPV5_SP_D16, false, ARM::AEK_DSP) ARM_CPU_NAME("cortex-a32", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC) ARM_CPU_NAME("cortex-a35", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC) -ARM_CPU_NAME("cortex-a53", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, true, ARM::AEK_CRC) +ARM_CPU_NAME("cortex-a53", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC) ARM_CPU_NAME("cortex-a57", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC) ARM_CPU_NAME("cortex-a72", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC) ARM_CPU_NAME("cortex-a73", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC) diff --git a/include/llvm/Support/BinaryStreamReader.h b/include/llvm/Support/BinaryStreamReader.h index 56375f41d2c0..29e8a2ab08aa 100644 --- a/include/llvm/Support/BinaryStreamReader.h +++ b/include/llvm/Support/BinaryStreamReader.h @@ -14,6 +14,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Support/BinaryStreamArray.h" #include "llvm/Support/BinaryStreamRef.h" +#include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include "llvm/Support/type_traits.h" @@ -104,6 +105,13 @@ public: /// returns an appropriate error code. Error readCString(StringRef &Dest); + /// Similar to readCString, however read a null-terminated UTF16 string + /// instead. + /// + /// \returns a success error code if the data was successfully read, otherwise + /// returns an appropriate error code. + Error readWideString(ArrayRef<UTF16> &Dest); + /// Read a \p Length byte string into \p Dest. Whether a copy occurs depends /// on the implementation of the underlying stream. Updates the stream's /// offset to point after the newly read data. 
diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h index ffea679fab82..8949d69ce724 100644 --- a/include/llvm/Support/YAMLTraits.h +++ b/include/llvm/Support/YAMLTraits.h @@ -1606,6 +1606,44 @@ template <typename T> struct StdMapStringCustomMappingTraitsImpl { } \ } +#define LLVM_YAML_DECLARE_MAPPING_TRAITS(Type) \ + namespace llvm { \ + namespace yaml { \ + template <> struct MappingTraits<Type> { \ + static void mapping(IO &IO, Type &Obj); \ + }; \ + } \ + } + +#define LLVM_YAML_DECLARE_ENUM_TRAITS(Type) \ + namespace llvm { \ + namespace yaml { \ + template <> struct ScalarEnumerationTraits<Type> { \ + static void enumeration(IO &io, Type &Value); \ + }; \ + } \ + } + +#define LLVM_YAML_DECLARE_BITSET_TRAITS(Type) \ + namespace llvm { \ + namespace yaml { \ + template <> struct ScalarBitSetTraits<Type> { \ + static void bitset(IO &IO, Type &Options); \ + }; \ + } \ + } + +#define LLVM_YAML_DECLARE_SCALAR_TRAITS(Type, MustQuote) \ + namespace llvm { \ + namespace yaml { \ + template <> struct ScalarTraits<Type> { \ + static void output(const Type &Value, void *ctx, llvm::raw_ostream &Out); \ + static StringRef input(StringRef Scalar, void *ctxt, Type &Value); \ + static bool mustQuote(StringRef) { return MustQuote; } \ + }; \ + } \ + } + /// Utility for declaring that a std::vector of a particular type /// should be considered a YAML document list. #define LLVM_YAML_IS_DOCUMENT_LIST_VECTOR(_type) \ diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h index 437f68b24e57..3c181f0e511b 100644 --- a/include/llvm/TableGen/Record.h +++ b/include/llvm/TableGen/Record.h @@ -1237,7 +1237,6 @@ class RecordVal { public: RecordVal(Init *N, RecTy *T, bool P); - RecordVal(StringRef N, RecTy *T, bool P); StringRef getName() const; Init *getNameInit() const { return Name; } @@ -1340,7 +1339,6 @@ public: } void setName(Init *Name); // Also updates RecordKeeper. - void setName(StringRef Name); // Also updates RecordKeeper. ArrayRef<SMLoc> getLoc() const { return Locs; } @@ -1378,13 +1376,11 @@ public: } RecordVal *getValue(const Init *Name) { - for (RecordVal &Val : Values) - if (Val.Name == Name) return &Val; - return nullptr; + return const_cast<RecordVal *>(static_cast<const Record *>(this)->getValue(Name)); } RecordVal *getValue(StringRef Name) { - return getValue(StringInit::get(Name)); + return const_cast<RecordVal *>(static_cast<const Record *>(this)->getValue(Name)); } void addTemplateArg(Init *Name) { @@ -1492,7 +1488,7 @@ public: /// its value as a string, throwing an exception if the field does not exist /// or if the value is not a string. /// - std::string getValueAsString(StringRef FieldName) const; + StringRef getValueAsString(StringRef FieldName) const; /// This method looks up the specified field and returns /// its value as a BitsInit, throwing an exception if the field does not exist @@ -1522,7 +1518,7 @@ public: /// returns its value as a vector of strings, throwing an exception if the /// field does not exist or if the value is not the right type. 
/// - std::vector<std::string> getValueAsListOfStrings(StringRef FieldName) const; + std::vector<StringRef> getValueAsListOfStrings(StringRef FieldName) const; /// This method looks up the specified field and returns its /// value as a Record, throwing an exception if the field does not exist or if diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 17182b958ecb..7258a5cc2d89 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -1143,6 +1143,16 @@ public: return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy; } + /// Get maximum # of load operations permitted for memcmp + /// + /// This function returns the maximum number of load operations permitted + /// to replace a call to memcmp. The value is set by the target at the + /// performance threshold for such a replacement. If OptSize is true, + /// return the limit for functions that have OptSize attribute. + unsigned getMaxExpandSizeMemcmp(bool OptSize) const { + return OptSize ? MaxLoadsPerMemcmpOptSize : MaxLoadsPerMemcmp; + } + /// \brief Get maximum # of store operations permitted for llvm.memmove /// /// This function returns the maximum number of store operations permitted @@ -2330,6 +2340,8 @@ protected: /// Maximum number of store operations that may be substituted for a call to /// memcpy, used for functions with OptSize attribute. unsigned MaxStoresPerMemcpyOptSize; + unsigned MaxLoadsPerMemcmp; + unsigned MaxLoadsPerMemcmpOptSize; /// \brief Specify maximum bytes of store instructions per memmove call. /// diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h index 73ae2ad12988..ed390799cfc3 100644 --- a/include/llvm/Target/TargetMachine.h +++ b/include/llvm/Target/TargetMachine.h @@ -299,6 +299,12 @@ public: bool addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx, raw_pwrite_stream &OS, bool DisableVerify = true) override; + + /// Returns true if the target is expected to pass all machine verifier + /// checks. This is a stopgap measure to fix targets one by one. We will + /// remove this at some point and always enable the verifier when + /// EXPENSIVE_CHECKS is enabled. + virtual bool isMachineVerifierClean() const { return true; } }; } // end namespace llvm diff --git a/include/llvm/Transforms/IPO/ThinLTOBitcodeWriter.h b/include/llvm/Transforms/IPO/ThinLTOBitcodeWriter.h new file mode 100644 index 000000000000..bf04bbfe92d8 --- /dev/null +++ b/include/llvm/Transforms/IPO/ThinLTOBitcodeWriter.h @@ -0,0 +1,41 @@ +//===- ThinLTOBitcodeWriter.h - Bitcode writing pass for ThinLTO ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass prepares a module containing type metadata for ThinLTO by splitting +// it into regular and thin LTO parts if possible, and writing both parts to +// a multi-module bitcode file. Modules that do not contain type metadata are +// written unmodified as a single module. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_IPO_THINLTOBITCODEWRITER_H +#define LLVM_TRANSFORMS_IPO_THINLTOBITCODEWRITER_H + +#include <llvm/IR/PassManager.h> +#include <llvm/Support/raw_ostream.h> + +namespace llvm { + +class ThinLTOBitcodeWriterPass + : public PassInfoMixin<ThinLTOBitcodeWriterPass> { + raw_ostream &OS; + raw_ostream *ThinLinkOS; + +public: + // Writes bitcode to OS. Also write thin link file to ThinLinkOS, if + // it's not nullptr. + ThinLTOBitcodeWriterPass(raw_ostream &OS, raw_ostream *ThinLinkOS) + : OS(OS), ThinLinkOS(ThinLinkOS) {} + + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + +} // namespace llvm + +#endif diff --git a/include/llvm/Transforms/Scalar/GVN.h b/include/llvm/Transforms/Scalar/GVN.h index 3f97789cabbc..589aaaca02fe 100644 --- a/include/llvm/Transforms/Scalar/GVN.h +++ b/include/llvm/Transforms/Scalar/GVN.h @@ -68,24 +68,6 @@ public: class ValueTable { DenseMap<Value *, uint32_t> valueNumbering; DenseMap<Expression, uint32_t> expressionNumbering; - - // Expressions is the vector of Expression. ExprIdx is the mapping from - // value number to the index of Expression in Expressions. We use it - // instead of a DenseMap because filling such mapping is faster than - // filling a DenseMap and the compile time is a little better. - uint32_t nextExprNumber; - std::vector<Expression> Expressions; - std::vector<uint32_t> ExprIdx; - // Value number to PHINode mapping. Used for phi-translate in scalarpre. - DenseMap<uint32_t, PHINode *> NumberingPhi; - // Cache for phi-translate in scalarpre. - typedef DenseMap<std::pair<uint32_t, const BasicBlock *>, uint32_t> - PhiTranslateMap; - PhiTranslateMap PhiTranslateTable; - // Map the block to reversed postorder traversal number. It is used to - // find back edge easily. - DenseMap<const BasicBlock *, uint32_t> BlockRPONumber; - AliasAnalysis *AA; MemoryDependenceResults *MD; DominatorTree *DT; @@ -97,10 +79,6 @@ public: Value *LHS, Value *RHS); Expression createExtractvalueExpr(ExtractValueInst *EI); uint32_t lookupOrAddCall(CallInst *C); - uint32_t phiTranslateImpl(const BasicBlock *BB, const BasicBlock *PhiBlock, - uint32_t Num, GVN &Gvn); - std::pair<uint32_t, bool> assignExpNewValueNum(Expression &exp); - bool areAllValsInBB(uint32_t num, const BasicBlock *BB, GVN &Gvn); public: ValueTable(); @@ -109,12 +87,9 @@ public: ~ValueTable(); uint32_t lookupOrAdd(Value *V); - uint32_t lookup(Value *V, bool Verify = true) const; + uint32_t lookup(Value *V) const; uint32_t lookupOrAddCmp(unsigned Opcode, CmpInst::Predicate Pred, Value *LHS, Value *RHS); - uint32_t phiTranslate(const BasicBlock *BB, const BasicBlock *PhiBlock, - uint32_t Num, GVN &Gvn); - void assignBlockRPONumber(Function &F); bool exists(Value *V) const; void add(Value *V, uint32_t num); void clear(); diff --git a/include/llvm/Transforms/Utils/CodeExtractor.h b/include/llvm/Transforms/Utils/CodeExtractor.h index a602498e5f22..7e23544af1ab 100644 --- a/include/llvm/Transforms/Utils/CodeExtractor.h +++ b/include/llvm/Transforms/Utils/CodeExtractor.h @@ -25,6 +25,7 @@ template <typename T> class ArrayRef; class BranchProbabilityInfo; class DominatorTree; class Function; + class Instruction; class Loop; class Module; class RegionNode; @@ -103,7 +104,17 @@ template <typename T> class ArrayRef; /// a code sequence, that sequence is modified, including changing these /// sets, before extraction occurs. 
These modifications won't have any /// significant impact on the cost however. - void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs) const; + void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, + const ValueSet &Allocas) const; + /// Find the set of allocas whose live ranges are contained within the + /// outlined region. + /// + /// Allocas which have lifetime markers contained in the outlined region + /// should be pushed to the outlined function. The address bitcasts that + /// are used by the lifetime markers are also candidates for shrink- + /// wrapping. The instructions that need to be sunk are collected in + /// 'Allocas'. + void findAllocas(ValueSet &Allocas) const; private: void severSplitPHINodes(BasicBlock *&Header); diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap index 5e15e8d49802..e0780885d159 100644 --- a/include/llvm/module.modulemap +++ b/include/llvm/module.modulemap @@ -95,8 +95,8 @@ module LLVM_DebugInfo_CodeView { module * { export * } // These are intended for (repeated) textual inclusion. - textual header "DebugInfo/CodeView/TypeRecords.def" - textual header "DebugInfo/CodeView/CVSymbolTypes.def" + textual header "DebugInfo/CodeView/CodeViewTypes.def" + textual header "DebugInfo/CodeView/CodeViewSymbols.def" } module LLVM_ExecutionEngine { diff --git a/lib/Analysis/CFLGraph.h b/lib/Analysis/CFLGraph.h index a8fb12b72568..54782b6bd4ad 100644 --- a/lib/Analysis/CFLGraph.h +++ b/lib/Analysis/CFLGraph.h @@ -210,6 +210,11 @@ template <typename CFLAA> class CFLGraphBuilder { void addDerefEdge(Value *From, Value *To, bool IsRead) { assert(From != nullptr && To != nullptr); + // FIXME: This is subtly broken, due to how we model some instructions + // (e.g. extractvalue, extractelement) as loads. Since those take + // non-pointer operands, we'll entirely skip adding edges for those. + // + // addAssignEdge seems to have a similar issue with insertvalue, etc.
if (!From->getType()->isPointerTy() || !To->getType()->isPointerTy()) return; addNode(From); @@ -540,6 +545,7 @@ template <typename CFLAA> class CFLGraphBuilder { case Instruction::ExtractValue: { auto *Ptr = CE->getOperand(0); addLoadEdge(Ptr, CE); + break; } case Instruction::ShuffleVector: { auto *From1 = CE->getOperand(0); diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 79517ec6a3a8..6a1af87450c9 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -1739,6 +1739,7 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, if ((Name == "round" && TLI->has(LibFunc_round)) || (Name == "roundf" && TLI->has(LibFunc_roundf))) return ConstantFoldFP(round, V, Ty); + break; case 's': if ((Name == "sin" && TLI->has(LibFunc_sin)) || (Name == "sinf" && TLI->has(LibFunc_sinf))) @@ -1807,6 +1808,7 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U))) return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(), /*roundTowardZero=*/false, Ty); + LLVM_FALLTHROUGH; case Intrinsic::x86_sse_cvttss2si: case Intrinsic::x86_sse_cvttss2si64: case Intrinsic::x86_sse2_cvttsd2si: diff --git a/lib/Analysis/EHPersonalities.cpp b/lib/Analysis/EHPersonalities.cpp index ebf0a370b0b0..b12ae9884e3d 100644 --- a/lib/Analysis/EHPersonalities.cpp +++ b/lib/Analysis/EHPersonalities.cpp @@ -27,8 +27,10 @@ EHPersonality llvm::classifyEHPersonality(const Value *Pers) { return StringSwitch<EHPersonality>(F->getName()) .Case("__gnat_eh_personality", EHPersonality::GNU_Ada) .Case("__gxx_personality_v0", EHPersonality::GNU_CXX) + .Case("__gxx_personality_seh0",EHPersonality::GNU_CXX) .Case("__gxx_personality_sj0", EHPersonality::GNU_CXX_SjLj) .Case("__gcc_personality_v0", EHPersonality::GNU_C) + .Case("__gcc_personality_seh0",EHPersonality::GNU_C) .Case("__gcc_personality_sj0", EHPersonality::GNU_C_SjLj) .Case("__objc_personality_v0", EHPersonality::GNU_ObjC) .Case("_except_handler3", EHPersonality::MSVC_X86SEH) diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 122442bafb11..66ac847455cd 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -103,13 +103,8 @@ static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) { return false; // If we have a DominatorTree then do a precise test. - if (DT) { - if (!DT->isReachableFromEntry(P->getParent())) - return true; - if (!DT->isReachableFromEntry(I->getParent())) - return false; + if (DT) return DT->dominates(I, P); - } // Otherwise, if the instruction is in the entry block and is not an invoke, // then it obviously dominates all phi nodes. diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index 66a0d145dcd8..188885063b39 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -691,6 +691,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( // load query, we can safely ignore it (scan past it). if (isLoad) continue; + LLVM_FALLTHROUGH; default: // Otherwise, there is a potential dependence. Return a clobber. 
return MemDepResult::getClobber(Inst); diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index f55ce202bcbb..d96697cafbe9 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -8182,6 +8182,7 @@ bool ScalarEvolution::isKnownPredicateViaNoOverflow(ICmpInst::Predicate Pred, case ICmpInst::ICMP_SGE: std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ICmpInst::ICMP_SLE: // X s<= (X + C)<nsw> if C >= 0 if (MatchBinaryAddToConst(RHS, LHS, C, SCEV::FlagNSW) && C.isNonNegative()) @@ -8195,6 +8196,7 @@ bool ScalarEvolution::isKnownPredicateViaNoOverflow(ICmpInst::Predicate Pred, case ICmpInst::ICMP_SGT: std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ICmpInst::ICMP_SLT: // X s< (X + C)<nsw> if C > 0 if (MatchBinaryAddToConst(RHS, LHS, C, SCEV::FlagNSW) && @@ -8552,6 +8554,7 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(SharperMin))) return true; + LLVM_FALLTHROUGH; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_UGT: @@ -8566,6 +8569,7 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(Min))) return true; + LLVM_FALLTHROUGH; default: // No change diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 7a8d4f3be24f..ac646716476b 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -215,6 +215,10 @@ bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) c return TTIImpl->enableAggressiveInterleaving(LoopHasReductions); } +bool TargetTransformInfo::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) const { + return TTIImpl->expandMemCmp(I, MaxLoadSize); +} + bool TargetTransformInfo::enableInterleavedAccessVectorization() const { return TTIImpl->enableInterleavedAccessVectorization(); } diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index bd79cd56a18b..a5dceb6c2271 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -172,6 +172,18 @@ bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS, } +bool llvm::isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI) { + for (const User *U : CxtI->users()) { + if (const ICmpInst *IC = dyn_cast<ICmpInst>(U)) + if (IC->isEquality()) + if (Constant *C = dyn_cast<Constant>(IC->getOperand(1))) + if (C->isNullValue()) + continue; + return false; + } + return true; +} + static bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth, const Query &Q); @@ -2327,6 +2339,7 @@ bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple, case Instruction::SExt: if (!LookThroughSExt) return false; // otherwise fall through to ZExt + LLVM_FALLTHROUGH; case Instruction::ZExt: return ComputeMultiple(I->getOperand(0), Base, Multiple, LookThroughSExt, Depth+1); diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index c1d81ac203a1..a402b4ddd462 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -378,18 +378,10 @@ private: ModuleToSummariesForIndex->count(ModulePath); } - bool hasValueId(GlobalValue::GUID ValGUID) { - const auto &VMI = GUIDToValueIdMap.find(ValGUID); - return VMI != GUIDToValueIdMap.end(); - } - void assignValueId(GlobalValue::GUID ValGUID) { - unsigned &ValueId = GUIDToValueIdMap[ValGUID]; - if (ValueId == 0) - ValueId = ++GlobalValueId; - } - 
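The contract of the isOnlyUsedInZeroEqualityComparison helper added above is narrow: it answers whether every user of the given instruction is an integer equality compare against the constant zero. A hedged sketch of a call site; the function name here is invented for illustration:

    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // A memcmp-like call whose result is only tested as "== 0" / "!= 0"
    // never needs the full -1/0/+1 ordering, so an expansion may take a
    // cheaper path.
    static bool onlyTestedForEquality(const CallInst *CI) {
      return isOnlyUsedInZeroEqualityComparison(CI);
    }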
unsigned getValueId(GlobalValue::GUID ValGUID) { + Optional<unsigned> getValueId(GlobalValue::GUID ValGUID) { auto VMI = GUIDToValueIdMap.find(ValGUID); - assert(VMI != GUIDToValueIdMap.end()); + if (VMI == GUIDToValueIdMap.end()) + return None; return VMI->second; } std::map<GlobalValue::GUID, unsigned> &valueIds() { return GUIDToValueIdMap; } @@ -3413,12 +3405,6 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { Stream.EnterSubblock(bitc::GLOBALVAL_SUMMARY_BLOCK_ID, 3); Stream.EmitRecord(bitc::FS_VERSION, ArrayRef<uint64_t>{INDEX_VERSION}); - // Create value IDs for undefined references. - forEachSummary([&](GVInfo I) { - for (auto &RI : I.second->refs()) - assignValueId(RI.getGUID()); - }); - for (const auto &GVI : valueIds()) { Stream.EmitRecord(bitc::FS_VALUE_GUID, ArrayRef<uint64_t>{GVI.second, GVI.first}); @@ -3492,9 +3478,9 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { GlobalValueSummary *S = I.second; assert(S); - assert(hasValueId(I.first)); - unsigned ValueId = getValueId(I.first); - SummaryToValueIdMap[S] = ValueId; + auto ValueId = getValueId(I.first); + assert(ValueId); + SummaryToValueIdMap[S] = *ValueId; if (auto *AS = dyn_cast<AliasSummary>(S)) { // Will process aliases as a post-pass because the reader wants all @@ -3504,11 +3490,14 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { } if (auto *VS = dyn_cast<GlobalVarSummary>(S)) { - NameVals.push_back(ValueId); + NameVals.push_back(*ValueId); NameVals.push_back(Index.getModuleId(VS->modulePath())); NameVals.push_back(getEncodedGVSummaryFlags(VS->flags())); for (auto &RI : VS->refs()) { - NameVals.push_back(getValueId(RI.getGUID())); + auto RefValueId = getValueId(RI.getGUID()); + if (!RefValueId) + continue; + NameVals.push_back(*RefValueId); } // Emit the finished record. @@ -3522,15 +3511,22 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { auto *FS = cast<FunctionSummary>(S); writeFunctionTypeMetadataRecords(Stream, FS); - NameVals.push_back(ValueId); + NameVals.push_back(*ValueId); NameVals.push_back(Index.getModuleId(FS->modulePath())); NameVals.push_back(getEncodedGVSummaryFlags(FS->flags())); NameVals.push_back(FS->instCount()); - NameVals.push_back(FS->refs().size()); + // Fill in below + NameVals.push_back(0); + unsigned Count = 0; for (auto &RI : FS->refs()) { - NameVals.push_back(getValueId(RI.getGUID())); + auto RefValueId = getValueId(RI.getGUID()); + if (!RefValueId) + continue; + NameVals.push_back(*RefValueId); + Count++; } + NameVals[4] = Count; bool HasProfileData = false; for (auto &EI : FS->calls()) { @@ -3543,15 +3539,19 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { // If this GUID doesn't have a value id, it doesn't have a function // summary and we don't need to record any calls to it. GlobalValue::GUID GUID = EI.first.getGUID(); - if (!hasValueId(GUID)) { + auto CallValueId = getValueId(GUID); + if (!CallValueId) { // For SamplePGO, the indirect call targets for local functions will // have its original name annotated in profile. We try to find the // corresponding PGOFuncName as the GUID. 
GUID = Index.getGUIDFromOriginalID(GUID); - if (GUID == 0 || !hasValueId(GUID)) + if (GUID == 0) + continue; + CallValueId = getValueId(GUID); + if (!CallValueId) continue; } - NameVals.push_back(getValueId(GUID)); + NameVals.push_back(*CallValueId); if (HasProfileData) NameVals.push_back(static_cast<uint8_t>(EI.second.Hotness)); } diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 43b245c66400..5abf50e5bd10 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -165,7 +165,7 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I; ++I) { unsigned Reg = *I; - if (!IsReturnBlock && !(Pristine.test(Reg) || BB->isLiveIn(Reg))) + if (!IsReturnBlock && !Pristine.test(Reg)) continue; for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { unsigned AliasReg = *AI; diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index d72cf5922987..e61e22abe82a 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -949,6 +949,19 @@ void AsmPrinter::emitFrameAlloc(const MachineInstr &MI) { MCConstantExpr::create(FrameOffset, OutContext)); } +static bool needFuncLabelsForEHOrDebugInfo(const MachineFunction &MF, + MachineModuleInfo *MMI) { + if (!MF.getLandingPads().empty() || MF.hasEHFunclets() || MMI->hasDebugInfo()) + return true; + + // We might emit an EH table that uses function begin and end labels even if + // we don't have any landingpads. + if (!MF.getFunction()->hasPersonalityFn()) + return false; + return !isNoOpWithoutInvoke( + classifyEHPersonality(MF.getFunction()->getPersonalityFn())); +} + /// EmitFunctionBody - This method emits the body and trailer for a /// function. void AsmPrinter::EmitFunctionBody() { @@ -1076,8 +1089,8 @@ void AsmPrinter::EmitFunctionBody() { // Emit target-specific gunk after the function body. EmitFunctionBodyEnd(); - if (!MF->getLandingPads().empty() || MMI->hasDebugInfo() || - MF->hasEHFunclets() || MAI->hasDotTypeDotSizeDirective()) { + if (needFuncLabelsForEHOrDebugInfo(*MF, MMI) || + MAI->hasDotTypeDotSizeDirective()) { // Create a symbol for the end of function. CurrentFnEnd = createTempSymbol("func_end"); OutStreamer->EmitLabel(CurrentFnEnd); @@ -1402,8 +1415,7 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { CurrentFnBegin = nullptr; CurExceptionSym = nullptr; bool NeedsLocalForSize = MAI->needsLocalForSize(); - if (!MF.getLandingPads().empty() || MMI->hasDebugInfo() || - MF.hasEHFunclets() || NeedsLocalForSize) { + if (needFuncLabelsForEHOrDebugInfo(MF, MMI) || NeedsLocalForSize) { CurrentFnBegin = createTempSymbol("func_begin"); if (NeedsLocalForSize) CurrentFnSymForSize = CurrentFnBegin; diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/lib/CodeGen/AsmPrinter/EHStreamer.cpp index 0a4a7a06cb2e..e14d5be1177a 100644 --- a/lib/CodeGen/AsmPrinter/EHStreamer.cpp +++ b/lib/CodeGen/AsmPrinter/EHStreamer.cpp @@ -309,7 +309,7 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites, // If some instruction between the previous try-range and the end of the // function may throw, create a call-site entry with no landing pad for the // region following the try-range. 
- if (SawPotentiallyThrowing && !IsSJLJ && LastLabel != nullptr) { + if (SawPotentiallyThrowing && !IsSJLJ) { CallSiteEntry Site = { LastLabel, nullptr, nullptr, 0 }; CallSites.push_back(Site); } diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 2b5863aa5800..55a27e2fb79e 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -49,6 +49,7 @@ add_llvm_library(LLVMCodeGen LivePhysRegs.cpp LiveRangeCalc.cpp LiveRangeEdit.cpp + LiveRangeShrink.cpp LiveRegMatrix.cpp LiveRegUnits.cpp LiveStackAnalysis.cpp diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp index 2a2715beaadc..4d30c6574b12 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -43,6 +43,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeLiveDebugValuesPass(Registry); initializeLiveDebugVariablesPass(Registry); initializeLiveIntervalsPass(Registry); + initializeLiveRangeShrinkPass(Registry); initializeLiveStacksPass(Registry); initializeLiveVariablesPass(Registry); initializeLocalStackSlotPassPass(Registry); diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 4e85708efafc..568b278dd47c 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -24,12 +24,13 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -60,6 +61,7 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" #include "llvm/Transforms/Utils/ValueMapper.h" + using namespace llvm; using namespace llvm::PatternMatch; @@ -84,6 +86,12 @@ STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed"); +STATISTIC(NumMemCmpCalls, "Number of memcmp calls"); +STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size"); +STATISTIC(NumMemCmpGreaterThanMax, + "Number of memcmp calls with size greater than max size"); +STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls"); + static cl::opt<bool> DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), cl::desc("Disable branch optimizations in CodeGenPrepare")); @@ -144,6 +152,11 @@ EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden, cl::desc("Enable merging of redundant sexts when one is dominating" " the other."), cl::init(true)); +static cl::opt<unsigned> MemCmpNumLoadsPerBlock( + "memcmp-num-loads-per-block", cl::Hidden, cl::init(1), + cl::desc("The number of loads per basic block for inline expansion of " + "memcmp that is only being compared against zero.")); + namespace { typedef SmallPtrSet<Instruction *, 16> SetOfInstrs; typedef PointerIntPair<Type *, 1, bool> TypeIsSExt; @@ -1629,6 +1642,593 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros, return true; } +// This class provides helper functions to expand a memcmp library call into an +// inline expansion. 
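Before the expansion helper itself, a sketch of how a backend would opt in may help: the TTI hook reports willingness to expand and the widest load to use, while the new TargetLowering fields bound the total number of loads. The target name and constants below are invented placeholders; this patch leaves the default hook returning false.

    #include "llvm/IR/Instruction.h"
    using namespace llvm;

    // Hypothetical target's TTI implementation.
    struct MyTargetTTIImpl {
      bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
        MaxLoadSize = 8; // widest single load the expansion may emit
        return true;     // ask CodeGenPrepare to expand this memcmp
      }
    };

    // And in the hypothetical target's TargetLowering constructor:
    //   MaxLoadsPerMemcmp = 8;        // returned by getMaxExpandSizeMemcmp(false)
    //   MaxLoadsPerMemcmpOptSize = 4; // returned by getMaxExpandSizeMemcmp(true)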
+class MemCmpExpansion { + struct ResultBlock { + BasicBlock *BB; + PHINode *PhiSrc1; + PHINode *PhiSrc2; + ResultBlock(); + }; + + CallInst *CI; + ResultBlock ResBlock; + unsigned MaxLoadSize; + unsigned NumBlocks; + unsigned NumBlocksNonOneByte; + unsigned NumLoadsPerBlock; + std::vector<BasicBlock *> LoadCmpBlocks; + BasicBlock *EndBlock; + PHINode *PhiRes; + bool IsUsedForZeroCmp; + int calculateNumBlocks(unsigned Size); + void createLoadCmpBlocks(); + void createResultBlock(); + void setupResultBlockPHINodes(); + void setupEndBlockPHINodes(); + void emitLoadCompareBlock(unsigned Index, int LoadSize, int GEPIndex, + bool IsLittleEndian); + void emitLoadCompareBlockMultipleLoads(unsigned Index, unsigned Size, + unsigned &NumBytesProcessed); + void emitLoadCompareByteBlock(unsigned Index, int GEPIndex); + void emitMemCmpResultBlock(bool IsLittleEndian); + Value *getMemCmpExpansionZeroCase(unsigned Size, bool IsLittleEndian); + unsigned getLoadSize(unsigned Size); + unsigned getNumLoads(unsigned Size); + +public: + MemCmpExpansion(CallInst *CI, unsigned MaxLoadSize, + unsigned NumLoadsPerBlock); + Value *getMemCmpExpansion(bool IsLittleEndian); +}; + +MemCmpExpansion::ResultBlock::ResultBlock() + : BB(nullptr), PhiSrc1(nullptr), PhiSrc2(nullptr) {} + +// Initialize the basic block structure required for expansion of memcmp call +// with given maximum load size and memcmp size parameter. +// This structure includes: +// 1. A list of load compare blocks - LoadCmpBlocks. +// 2. An EndBlock, split from original instruction point, which is the block to +// return from. +// 3. ResultBlock, block to branch to for early exit when a +// LoadCmpBlock finds a difference. +MemCmpExpansion::MemCmpExpansion(CallInst *CI, unsigned MaxLoadSize, + unsigned NumLoadsPerBlock) + : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(NumLoadsPerBlock) { + + IRBuilder<> Builder(CI->getContext()); + + BasicBlock *StartBlock = CI->getParent(); + EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); + setupEndBlockPHINodes(); + IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); + + ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2)); + uint64_t Size = SizeCast->getZExtValue(); + + // Calculate how many load compare blocks are required for an expansion of + // given Size. + NumBlocks = calculateNumBlocks(Size); + createResultBlock(); + + // If return value of memcmp is not used in a zero equality, we need to + // calculate which source was larger. The calculation requires the + // two loaded source values of each load compare block. + // These will be saved in the phi nodes created by setupResultBlockPHINodes. + if (!IsUsedForZeroCmp) + setupResultBlockPHINodes(); + + // Create the number of required load compare basic blocks. + createLoadCmpBlocks(); + + // Update the terminator added by splitBasicBlock to branch to the first + // LoadCmpBlock. + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); +} + +void MemCmpExpansion::createLoadCmpBlocks() { + for (unsigned i = 0; i < NumBlocks; i++) { + BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb", + EndBlock->getParent(), EndBlock); + LoadCmpBlocks.push_back(BB); + } +} + +void MemCmpExpansion::createResultBlock() { + ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block", + EndBlock->getParent(), EndBlock); +} + +// This function creates the IR instructions for loading and comparing 1 byte. 
+// It loads 1 byte from each source of the memcmp parameters with the given +// GEPIndex. It then subtracts the two loaded values and adds this result to the +// final phi node for selecting the memcmp result. +void MemCmpExpansion::emitLoadCompareByteBlock(unsigned Index, int GEPIndex) { + IRBuilder<> Builder(CI->getContext()); + + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + Type *LoadSizeType = Type::getInt8Ty(CI->getContext()); + // Cast source to LoadSizeType* + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using the GEPIndex + if (GEPIndex != 0) { + Source1 = Builder.CreateGEP(LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, GEPIndex)); + Source2 = Builder.CreateGEP(LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, GEPIndex)); + } + + Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext())); + LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext())); + Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); + + PhiRes->addIncoming(Diff, LoadCmpBlocks[Index]); + + if (Index < (LoadCmpBlocks.size() - 1)) { + // Early exit branch to EndBlock if a difference is found; otherwise + // continue to the next LoadCmpBlock. + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, + ConstantInt::get(Diff->getType(), 0)); + BranchInst *CmpBr = + BranchInst::Create(EndBlock, LoadCmpBlocks[Index + 1], Cmp); + Builder.Insert(CmpBr); + } else { + // The last block has an unconditional branch to EndBlock + BranchInst *CmpBr = BranchInst::Create(EndBlock); + Builder.Insert(CmpBr); + } +} + +unsigned MemCmpExpansion::getNumLoads(unsigned Size) { + return (Size / MaxLoadSize) + countPopulation(Size % MaxLoadSize); +} + +unsigned MemCmpExpansion::getLoadSize(unsigned Size) { + return MinAlign(PowerOf2Floor(Size), MaxLoadSize); +} + +void MemCmpExpansion::emitLoadCompareBlockMultipleLoads( + unsigned Index, unsigned Size, unsigned &NumBytesProcessed) { + + IRBuilder<> Builder(CI->getContext()); + + std::vector<Value *> XorList, OrList; + Value *Diff; + + unsigned RemainingBytes = Size - NumBytesProcessed; + unsigned NumLoadsRemaining = getNumLoads(RemainingBytes); + unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock); + + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + + for (unsigned i = 0; i < NumLoads; ++i) { + unsigned LoadSize = getLoadSize(RemainingBytes); + unsigned GEPIndex = NumBytesProcessed / LoadSize; + NumBytesProcessed += LoadSize; + RemainingBytes -= LoadSize; + + Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8); + Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + // Cast source to LoadSizeType* + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using the GEPIndex + if (GEPIndex != 0) { + Source1 = Builder.CreateGEP(LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, GEPIndex));
+ Source2 = Builder.CreateGEP(LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, GEPIndex)); + } + + // Load LoadSizeType from the base address + Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + if (LoadSizeType != MaxLoadType) { + LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType); + LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType); + } + Diff = Builder.CreateXor(LoadSrc1, LoadSrc2); + Diff = Builder.CreateZExtOrTrunc(Diff, MaxLoadType); + XorList.push_back(Diff); + } + + auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> { + std::vector<Value *> OutList; + for (unsigned i = 0; i < InList.size() - 1; i = i + 2) { + Value *Or = Builder.CreateOr(InList[i], InList[i + 1]); + OutList.push_back(Or); + } + if (InList.size() % 2 != 0) + OutList.push_back(InList.back()); + return OutList; + }; + + // Pairwise OR the XOR results + OrList = pairWiseOr(XorList); + + // Pairwise OR the OR results until one result is left + while (OrList.size() != 1) { + OrList = pairWiseOr(OrList); + } + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, OrList[0], + ConstantInt::get(Diff->getType(), 0)); + BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1)) + ? EndBlock + : LoadCmpBlocks[Index + 1]; + // Early exit branch to ResultBlock if a difference is found; otherwise + // continue to the next LoadCmpBlock or EndBlock. + BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp); + Builder.Insert(CmpBr); + + // Add a phi edge for the last LoadCmpBlock to EndBlock with a value of 0, + // since the early exit to ResultBlock was not taken (no difference was found + // in any of the bytes). + if (Index == LoadCmpBlocks.size() - 1) { + Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); + PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]); + } +} + +// This function creates the IR instructions for loading and comparing using the +// given LoadSize. It loads the number of bytes specified by LoadSize from each +// source of the memcmp parameters. It then does a subtract to see if there was +// a difference in the loaded values. If a difference is found, it branches +// with an early exit to the ResultBlock for calculating which source was +// larger. Otherwise, it falls through to either the next LoadCmpBlock or +// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with +// a special case through emitLoadCompareByteBlock. The special handling can +// simply subtract the loaded values and add the difference to the result phi node.
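The pairwise-OR reduction above is easier to follow on plain integers. A standalone model, with uint64_t values standing in for the IR values produced by CreateXor and CreateOr:

    #include <cstdint>
    #include <utility>
    #include <vector>

    // Fold the per-load XOR differences into one value that is nonzero
    // iff any compared chunk differed, using a balanced OR tree.
    // Expects at least one element, as the expansion always emits a load.
    static uint64_t reduceDiffs(std::vector<uint64_t> Xors) {
      while (Xors.size() > 1) {
        std::vector<uint64_t> Out;
        for (size_t I = 0; I + 1 < Xors.size(); I += 2)
          Out.push_back(Xors[I] | Xors[I + 1]);
        if (Xors.size() % 2 != 0)
          Out.push_back(Xors.back()); // odd element carries over
        Xors = std::move(Out);
      }
      return Xors[0];
    }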
+void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, int LoadSize, + int GEPIndex, bool IsLittleEndian) { + if (LoadSize == 1) { + MemCmpExpansion::emitLoadCompareByteBlock(Index, GEPIndex); + return; + } + + IRBuilder<> Builder(CI->getContext()); + + Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8); + Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + // Cast source to LoadSizeType* + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using the GEPIndex + if (GEPIndex != 0) { + Source1 = Builder.CreateGEP(LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, GEPIndex)); + Source2 = Builder.CreateGEP(LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, GEPIndex)); + } + + // Load LoadSizeType from the base address + Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + if (IsLittleEndian) { + Function *F = LoadCmpBlocks[Index]->getParent(); + + Function *Bswap = Intrinsic::getDeclaration(F->getParent(), + Intrinsic::bswap, LoadSizeType); + LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); + LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); + } + + if (LoadSizeType != MaxLoadType) { + LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType); + LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType); + } + + // Add the loaded values to the phi nodes for calculating the memcmp result + // only if the result is not used in a zero equality comparison. + if (!IsUsedForZeroCmp) { + ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[Index]); + ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[Index]); + } + + Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, + ConstantInt::get(Diff->getType(), 0)); + BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1)) + ? EndBlock + : LoadCmpBlocks[Index + 1]; + // Early exit branch to ResultBlock if a difference is found; otherwise + // continue to the next LoadCmpBlock or EndBlock. + BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp); + Builder.Insert(CmpBr); + + // Add a phi edge for the last LoadCmpBlock to EndBlock with a value of 0, + // since the early exit to ResultBlock was not taken (no difference was found + // in any of the bytes). + if (Index == LoadCmpBlocks.size() - 1) { + Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); + PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]); + } +} + +// This function populates the ResultBlock with a sequence to calculate the +// memcmp result. It compares the two loaded source values and returns -1 if +// src1 < src2 and 1 if src1 > src2. +void MemCmpExpansion::emitMemCmpResultBlock(bool IsLittleEndian) { + IRBuilder<> Builder(CI->getContext()); + + // Special case: if the memcmp result is used in a zero equality comparison, + // the result does not need to be calculated and can simply be 1.
+ if (IsUsedForZeroCmp) { + BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.BB, InsertPt); + Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1); + PhiRes->addIncoming(Res, ResBlock.BB); + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + return; + } + BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.BB, InsertPt); + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1, + ResBlock.PhiSrc2); + + Value *Res = + Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1), + ConstantInt::get(Builder.getInt32Ty(), 1)); + + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + PhiRes->addIncoming(Res, ResBlock.BB); +} + +int MemCmpExpansion::calculateNumBlocks(unsigned Size) { + int NumBlocks = 0; + bool haveOneByteLoad = false; + unsigned RemainingSize = Size; + unsigned LoadSize = MaxLoadSize; + while (RemainingSize) { + if (LoadSize == 1) + haveOneByteLoad = true; + NumBlocks += RemainingSize / LoadSize; + RemainingSize = RemainingSize % LoadSize; + LoadSize = LoadSize / 2; + } + NumBlocksNonOneByte = haveOneByteLoad ? (NumBlocks - 1) : NumBlocks; + + if (IsUsedForZeroCmp) + NumBlocks = NumBlocks / NumLoadsPerBlock + + (NumBlocks % NumLoadsPerBlock != 0 ? 1 : 0); + + return NumBlocks; +} + +void MemCmpExpansion::setupResultBlockPHINodes() { + IRBuilder<> Builder(CI->getContext()); + Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + Builder.SetInsertPoint(ResBlock.BB); + ResBlock.PhiSrc1 = + Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src1"); + ResBlock.PhiSrc2 = + Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src2"); +} + +void MemCmpExpansion::setupEndBlockPHINodes() { + IRBuilder<> Builder(CI->getContext()); + + Builder.SetInsertPoint(&EndBlock->front()); + PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); +} + +Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size, + bool IsLittleEndian) { + unsigned NumBytesProcessed = 0; + // This loop populates each of the LoadCmpBlocks with the IR sequence to + // handle multiple loads per block. + for (unsigned i = 0; i < NumBlocks; ++i) { + emitLoadCompareBlockMultipleLoads(i, Size, NumBytesProcessed); + } + + emitMemCmpResultBlock(IsLittleEndian); + return PhiRes; +} + +// This function expands the memcmp call into an inline expansion and returns +// the memcmp result. +Value *MemCmpExpansion::getMemCmpExpansion(bool IsLittleEndian) { + + ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2)); + uint64_t Size = SizeCast->getZExtValue(); + + int LoadSize = MaxLoadSize; + int NumBytesToBeProcessed = Size; + + if (IsUsedForZeroCmp) { + return getMemCmpExpansionZeroCase(Size, IsLittleEndian); + } + + unsigned Index = 0; + // This loop calls emitLoadCompareBlock for comparing SizeVal bytes of the two + // memcmp sources. It starts with loading using the maximum load size set by + // the target. It processes any remaining bytes using a load size which is the + // next smallest power of 2.
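A worked example of the block-count math may help here: mirroring the greedy decomposition in calculateNumBlocks above (and ignoring the regrouping for the zero-compare case), Size = 15 with MaxLoadSize = 8 is covered by loads of 8, 4, 2, and 1 bytes, i.e. four blocks, matching the loadbb/loadbb1/loadbb2/loadbb3 chain in the IR example further below. A standalone model:

    // Greedily cover Size bytes with power-of-two loads no wider than
    // LoadSize; returns how many load/compare blocks that takes.
    static unsigned numLoadBlocks(unsigned Size, unsigned LoadSize) {
      unsigned Blocks = 0;
      while (Size) {
        Blocks += Size / LoadSize;
        Size %= LoadSize;
        LoadSize /= 2;
      }
      return Blocks; // numLoadBlocks(15, 8) == 4
    }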
+ while (NumBytesToBeProcessed) { + // Calculate how many blocks we can create with the current load size + int NumBlocks = NumBytesToBeProcessed / LoadSize; + int GEPIndex = (Size - NumBytesToBeProcessed) / LoadSize; + NumBytesToBeProcessed = NumBytesToBeProcessed % LoadSize; + + // For each NumBlocks, populate the instruction sequence for loading and + // comparing LoadSize bytes + while (NumBlocks--) { + emitLoadCompareBlock(Index, LoadSize, GEPIndex, IsLittleEndian); + Index++; + GEPIndex++; + } + // Get the next LoadSize to use + LoadSize = LoadSize / 2; + } + + emitMemCmpResultBlock(IsLittleEndian); + return PhiRes; +} + +// This function checks to see if an expansion of memcmp can be generated. +// It checks for a constant compare size that is less than the max inline size. +// If an expansion cannot occur, returns false to leave it as a library call. +// Otherwise, the library call is replaced with a new IR instruction sequence. +/// We want to transform: +/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15) +/// To: +/// loadbb: +/// %0 = bitcast i32* %buffer2 to i8* +/// %1 = bitcast i32* %buffer1 to i8* +/// %2 = bitcast i8* %1 to i64* +/// %3 = bitcast i8* %0 to i64* +/// %4 = load i64, i64* %2 +/// %5 = load i64, i64* %3 +/// %6 = call i64 @llvm.bswap.i64(i64 %4) +/// %7 = call i64 @llvm.bswap.i64(i64 %5) +/// %8 = sub i64 %6, %7 +/// %9 = icmp ne i64 %8, 0 +/// br i1 %9, label %res_block, label %loadbb1 +/// res_block: ; preds = %loadbb2, +/// %loadbb1, %loadbb +/// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ] +/// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ] +/// %10 = icmp ult i64 %phi.src1, %phi.src2 +/// %11 = select i1 %10, i32 -1, i32 1 +/// br label %endblock +/// loadbb1: ; preds = %loadbb +/// %12 = bitcast i32* %buffer2 to i8* +/// %13 = bitcast i32* %buffer1 to i8* +/// %14 = bitcast i8* %13 to i32* +/// %15 = bitcast i8* %12 to i32* +/// %16 = getelementptr i32, i32* %14, i32 2 +/// %17 = getelementptr i32, i32* %15, i32 2 +/// %18 = load i32, i32* %16 +/// %19 = load i32, i32* %17 +/// %20 = call i32 @llvm.bswap.i32(i32 %18) +/// %21 = call i32 @llvm.bswap.i32(i32 %19) +/// %22 = zext i32 %20 to i64 +/// %23 = zext i32 %21 to i64 +/// %24 = sub i64 %22, %23 +/// %25 = icmp ne i64 %24, 0 +/// br i1 %25, label %res_block, label %loadbb2 +/// loadbb2: ; preds = %loadbb1 +/// %26 = bitcast i32* %buffer2 to i8* +/// %27 = bitcast i32* %buffer1 to i8* +/// %28 = bitcast i8* %27 to i16* +/// %29 = bitcast i8* %26 to i16* +/// %30 = getelementptr i16, i16* %28, i16 6 +/// %31 = getelementptr i16, i16* %29, i16 6 +/// %32 = load i16, i16* %30 +/// %33 = load i16, i16* %31 +/// %34 = call i16 @llvm.bswap.i16(i16 %32) +/// %35 = call i16 @llvm.bswap.i16(i16 %33) +/// %36 = zext i16 %34 to i64 +/// %37 = zext i16 %35 to i64 +/// %38 = sub i64 %36, %37 +/// %39 = icmp ne i64 %38, 0 +/// br i1 %39, label %res_block, label %loadbb3 +/// loadbb3: ; preds = %loadbb2 +/// %40 = bitcast i32* %buffer2 to i8* +/// %41 = bitcast i32* %buffer1 to i8* +/// %42 = getelementptr i8, i8* %41, i8 14 +/// %43 = getelementptr i8, i8* %40, i8 14 +/// %44 = load i8, i8* %42 +/// %45 = load i8, i8* %43 +/// %46 = zext i8 %44 to i32 +/// %47 = zext i8 %45 to i32 +/// %48 = sub i32 %46, %47 +/// br label %endblock +/// endblock: ; preds = %res_block, +/// %loadbb3 +/// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ] +/// ret i32 %phi.res +static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, + const TargetLowering
*TLI, const DataLayout *DL) { + NumMemCmpCalls++; + IRBuilder<> Builder(CI->getContext()); + + // TTI call to check if target would like to expand memcmp and get the + // MaxLoadSize + unsigned MaxLoadSize; + if (!TTI->expandMemCmp(CI, MaxLoadSize)) + return false; + + // Early exit from expansion if -Oz + if (CI->getParent()->getParent()->optForMinSize()) { + return false; + } + + // Early exit from expansion if size is not a constant + ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2)); + if (!SizeCast) { + NumMemCmpNotConstant++; + return false; + } + + // Early exit from expansion if size is greater than max bytes to load + uint64_t SizeVal = SizeCast->getZExtValue(); + + unsigned NumLoads = 0; + unsigned RemainingSize = SizeVal; + unsigned LoadSize = MaxLoadSize; + while (RemainingSize) { + NumLoads += RemainingSize / LoadSize; + RemainingSize = RemainingSize % LoadSize; + LoadSize = LoadSize / 2; + } + + if (NumLoads > + TLI->getMaxExpandSizeMemcmp(CI->getParent()->getParent()->optForSize())) { + NumMemCmpGreaterThanMax++; + return false; + } + + NumMemCmpInlined++; + + // The MemCmpHelper object creates and sets up the basic blocks required for + // expanding memcmp with size SizeVal. + unsigned NumLoadsPerBlock = MemCmpNumLoadsPerBlock; + MemCmpExpansion MemCmpHelper(CI, MaxLoadSize, NumLoadsPerBlock); + + Value *Res = MemCmpHelper.getMemCmpExpansion(DL->isLittleEndian()); + + // Replace the call with the result of the expansion and erase the call. + CI->replaceAllUsesWith(Res); + CI->eraseFromParent(); + + return true; +} + bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { BasicBlock *BB = CI->getParent(); @@ -1780,6 +2380,15 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { CI->eraseFromParent(); return true; } + + LibFunc Func; + if (TLInfo->getLibFunc(*CI->getCalledFunction(), Func) && + Func == LibFunc_memcmp) { + if (expandMemCmp(CI, TTI, TLI, DL)) { + ModifiedDT = true; + return true; + } + } return false; } @@ -4927,6 +5536,7 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) { return true; } + namespace { /// \brief Helper class to promote a scalar operation to a vector one. /// This class is used to move downward extractelement transition. diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp index b2d6652b075e..a3cf2846d2f5 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -74,7 +74,7 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I; ++I) { unsigned Reg = *I; - if (!IsReturnBlock && !(Pristine.test(Reg) || BB->isLiveIn(Reg))) + if (!IsReturnBlock && !Pristine.test(Reg)) continue; for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { unsigned Reg = *AI; diff --git a/lib/CodeGen/GlobalISel/Localizer.cpp b/lib/CodeGen/GlobalISel/Localizer.cpp index c2a568e4b452..c5d0999fe438 100644 --- a/lib/CodeGen/GlobalISel/Localizer.cpp +++ b/lib/CodeGen/GlobalISel/Localizer.cpp @@ -98,12 +98,10 @@ bool Localizer::runOnMachineFunction(MachineFunction &MF) { // Create the localized instruction. MachineInstr *LocalizedMI = MF.CloneMachineInstr(&MI); LocalizedInstrs.insert(LocalizedMI); - // Move it at the right place. - MachineInstr &MIUse = *MOUse.getParent(); - if (MIUse.getParent() == InsertMBB) - InsertMBB->insert(MIUse, LocalizedMI); - else - InsertMBB->insert(InsertMBB->getFirstNonPHI(), LocalizedMI); + // Don't try to be smart for the insertion point.
+ // There is no guarantee that the first seen use is the first + // use in the block. + InsertMBB->insert(InsertMBB->getFirstNonPHI(), LocalizedMI); // Set a new register for the definition. unsigned NewReg = diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp index 24e289dd4f1b..444416a77008 100644 --- a/lib/CodeGen/ImplicitNullChecks.cpp +++ b/lib/CodeGen/ImplicitNullChecks.cpp @@ -607,8 +607,20 @@ MachineInstr *ImplicitNullChecks::insertFaultingInstr( .addMBB(HandlerMBB) .addImm(MI->getOpcode()); - for (auto &MO : MI->uses()) - MIB.add(MO); + for (auto &MO : MI->uses()) { + if (MO.isReg()) { + MachineOperand NewMO = MO; + if (MO.isUse()) { + NewMO.setIsKill(false); + } else { + assert(MO.isDef() && "Expected def or use"); + NewMO.setIsDead(false); + } + MIB.add(NewMO); + } else { + MIB.add(MO); + } + } MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); diff --git a/lib/CodeGen/LiveRangeShrink.cpp b/lib/CodeGen/LiveRangeShrink.cpp new file mode 100644 index 000000000000..552f4b5393fe --- /dev/null +++ b/lib/CodeGen/LiveRangeShrink.cpp @@ -0,0 +1,231 @@ +//===-- LiveRangeShrink.cpp - Move instructions to shrink live range ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +///===---------------------------------------------------------------------===// +/// +/// \file +/// This pass moves instructions close to the definition of their operands to +/// shrink the live range of the def instruction. The code motion is limited +/// within the basic block. The moved instruction should have a single def and +/// more than one use operand, each of which is the only use of its def. +/// +///===---------------------------------------------------------------------===// +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "lrshrink" + +STATISTIC(NumInstrsHoistedToShrinkLiveRange, + "Number of instructions hoisted to shrink live range."); + +using namespace llvm; + +namespace { +class LiveRangeShrink : public MachineFunctionPass { +public: + static char ID; + + LiveRangeShrink() : MachineFunctionPass(ID) { + initializeLiveRangeShrinkPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { return "Live Range Shrink"; } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} // End anonymous namespace. + +char LiveRangeShrink::ID = 0; +char &llvm::LiveRangeShrinkID = LiveRangeShrink::ID; + +INITIALIZE_PASS(LiveRangeShrink, "lrshrink", "Live Range Shrink Pass", false, + false) +namespace { +typedef DenseMap<MachineInstr *, unsigned> InstOrderMap; + +/// Returns \p New if it's dominated by \p Old, otherwise returns \p Old. +/// \p M maintains a map from instruction to its dominating order, such that +/// M[A] > M[B] guarantees that A is dominated by B. +/// If \p New is not in \p M, return \p Old. Otherwise if \p Old is null, return +/// \p New.
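The ordering rule just described can be modeled without MachineInstr. In the toy version below a singly linked list stands in for the instruction stream and an explicit map carries the order numbers; the equal-order forward walk at the end corresponds to the tie-break in the real implementation that follows:

    #include <unordered_map>

    struct Inst { Inst *Next = nullptr; }; // stand-in for MachineInstr

    // Return &New if Old dominates it (per the order map M), else Old.
    static Inst *findDominated(Inst &New, Inst *Old,
                               const std::unordered_map<const Inst *, unsigned> &M) {
      auto NewIt = M.find(&New);
      if (NewIt == M.end())
        return Old;
      if (!Old)
        return &New;
      unsigned OrderOld = M.find(Old)->second;
      if (OrderOld != NewIt->second)
        return OrderOld < NewIt->second ? &New : Old;
      // Equal orders: walk forward from Old; reaching New means Old came
      // first in the block, i.e. Old dominates New.
      for (Inst *I = Old->Next; I; I = I->Next) {
        auto It = M.find(I);
        if (It == M.end() || It->second != NewIt->second)
          break;
        if (I == &New)
          return &New;
      }
      return Old;
    }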
+MachineInstr *FindDominatedInstruction(MachineInstr &New, MachineInstr *Old, + const InstOrderMap &M) { + auto NewIter = M.find(&New); + if (NewIter == M.end()) + return Old; + if (Old == nullptr) + return &New; + unsigned OrderOld = M.find(Old)->second; + unsigned OrderNew = NewIter->second; + if (OrderOld != OrderNew) + return OrderOld < OrderNew ? &New : Old; + // OrderOld == OrderNew, so we need to iterate down from Old to see if it + // can reach New; if it can, New is dominated by Old. + for (MachineInstr *I = Old->getNextNode(); M.find(I)->second == OrderNew; + I = I->getNextNode()) + if (I == &New) + return &New; + return Old; +} + +/// Builds the instruction-to-dominating-order-number map \p M by traversing +/// from instruction \p Start. +void BuildInstOrderMap(MachineBasicBlock::iterator Start, InstOrderMap &M) { + M.clear(); + unsigned i = 0; + for (MachineInstr &I : make_range(Start, Start->getParent()->end())) + M[&I] = i++; +} +} // end anonymous namespace + +bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + + MachineRegisterInfo &MRI = MF.getRegInfo(); + + DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n'); + + InstOrderMap IOM; + // Map from register to instruction order (value of IOM) where the + // register is used last. When moving instructions up, we need to + // make sure all its defs (including dead defs) will not cross its + // last use when moving up. + DenseMap<unsigned, std::pair<unsigned, MachineInstr *>> UseMap; + + for (MachineBasicBlock &MBB : MF) { + if (MBB.empty()) + continue; + bool SawStore = false; + BuildInstOrderMap(MBB.begin(), IOM); + UseMap.clear(); + + for (MachineBasicBlock::iterator Next = MBB.begin(); Next != MBB.end();) { + MachineInstr &MI = *Next; + ++Next; + if (MI.isPHI() || MI.isDebugValue()) + continue; + if (MI.mayStore()) + SawStore = true; + + unsigned CurrentOrder = IOM[&MI]; + unsigned Barrier = 0; + MachineInstr *BarrierMI = nullptr; + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || MO.isDebug()) + continue; + if (MO.isUse()) + UseMap[MO.getReg()] = std::make_pair(CurrentOrder, &MI); + else if (MO.isDead() && UseMap.count(MO.getReg())) + // Barrier is the last instruction where MO gets used. MI should not + // be moved above Barrier. + if (Barrier < UseMap[MO.getReg()].first) { + Barrier = UseMap[MO.getReg()].first; + BarrierMI = UseMap[MO.getReg()].second; + } + } + + if (!MI.isSafeToMove(nullptr, SawStore)) { + // If MI has side effects, it should become a barrier for code motion. + // IOM is rebuilt from the next instruction to prevent later + // instructions from being moved before this MI. + if (MI.hasUnmodeledSideEffects() && Next != MBB.end()) { + BuildInstOrderMap(Next, IOM); + SawStore = false; + } + continue; + } + + const MachineOperand *DefMO = nullptr; + MachineInstr *Insert = nullptr; + + // Number of live-ranges that will be shortened. We do not count + // live-ranges that are defined by a COPY as it could be coalesced later. + unsigned NumEligibleUse = 0; + + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || MO.isDead() || MO.isDebug()) + continue; + unsigned Reg = MO.getReg(); + // Do not move the instruction if it def/uses a physical register, + // unless it is a constant physical register or a noreg.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + if (!Reg || MRI.isConstantPhysReg(Reg)) + continue; + Insert = nullptr; + break; + } + if (MO.isDef()) { + // Do not move if there is more than one def. + if (DefMO) { + Insert = nullptr; + break; + } + DefMO = &MO; + } else if (MRI.hasOneNonDBGUse(Reg) && MRI.hasOneDef(Reg) && DefMO && + MRI.getRegClass(DefMO->getReg()) == + MRI.getRegClass(MO.getReg())) { + // The heuristic does not handle different register classes yet + // (registers of different sizes, looser/tighter constraints). This + // is because it needs a more accurate model to handle register + // pressure correctly. + MachineInstr &DefInstr = *MRI.def_instr_begin(Reg); + if (!DefInstr.isCopy()) + NumEligibleUse++; + Insert = FindDominatedInstruction(DefInstr, Insert, IOM); + } else { + Insert = nullptr; + break; + } + } + + // If Barrier equals IOM[I], traverse forward to find if BarrierMI is + // after Insert; if it is, we should not hoist. + for (MachineInstr *I = Insert; I && IOM[I] == Barrier; + I = I->getNextNode()) + if (I == BarrierMI) { + Insert = nullptr; + break; + } + // Move the instruction when # of shrunk live ranges > 1. + if (DefMO && Insert && NumEligibleUse > 1 && Barrier <= IOM[Insert]) { + MachineBasicBlock::iterator I = std::next(Insert->getIterator()); + // Skip all the PHI and debug instructions. + while (I != MBB.end() && (I->isPHI() || I->isDebugValue())) + I = std::next(I); + if (I == MI.getIterator()) + continue; + + // Update the dominator order to be the same as the insertion point. + // We do this to maintain a non-decreasing order without needing to + // update all instruction orders after the insertion point. + unsigned NewOrder = IOM[&*I]; + IOM[&MI] = NewOrder; + NumInstrsHoistedToShrinkLiveRange++; + + // Find MI's debug value following MI.
+ MachineBasicBlock::iterator EndIter = std::next(MI.getIterator()); + if (MI.getOperand(0).isReg()) + for (; EndIter != MBB.end() && EndIter->isDebugValue() && + EndIter->getOperand(0).isReg() && + EndIter->getOperand(0).getReg() == MI.getOperand(0).getReg(); + ++EndIter, ++Next) + IOM[&*EndIter] = NewOrder; + MBB.splice(I, &MBB, MI.getIterator(), EndIter); + } + } + } + return false; +} diff --git a/lib/CodeGen/MIRParser/MIRParser.cpp b/lib/CodeGen/MIRParser/MIRParser.cpp index bd04acd049db..ff12297e3fc6 100644 --- a/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/lib/CodeGen/MIRParser/MIRParser.cpp @@ -332,8 +332,6 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) { MF.setAlignment(YamlMF.Alignment); MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice); - if (YamlMF.NoVRegs) - MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); if (YamlMF.Legalized) MF.getProperties().set(MachineFunctionProperties::Property::Legalized); if (YamlMF.RegBankSelected) diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp index 6f6a67d81b0f..293fc7358b8e 100644 --- a/lib/CodeGen/MIRPrinter.cpp +++ b/lib/CodeGen/MIRPrinter.cpp @@ -183,8 +183,6 @@ void MIRPrinter::print(const MachineFunction &MF) { YamlMF.Alignment = MF.getAlignment(); YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice(); - YamlMF.NoVRegs = MF.getProperties().hasProperty( - MachineFunctionProperties::Property::NoVRegs); YamlMF.Legalized = MF.getProperties().hasProperty( MachineFunctionProperties::Property::Legalized); YamlMF.RegBankSelected = MF.getProperties().hasProperty( diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 06112723497b..590acc01008a 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -350,6 +350,13 @@ void MachineBasicBlock::removeLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) { LiveIns.erase(I); } +MachineBasicBlock::livein_iterator +MachineBasicBlock::removeLiveIn(MachineBasicBlock::livein_iterator I) { + // Get non-const version of iterator. 
+ LiveInVector::iterator LI = LiveIns.begin() + (I - LiveIns.begin()); + return LiveIns.erase(LI); +} + bool MachineBasicBlock::isLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) const { livein_iterator I = find_if( LiveIns, [Reg](const RegisterMaskPair &LI) { return LI.PhysReg == Reg; }); diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index d665201a5d17..306b75dbbae7 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -1,4 +1,4 @@ -//===-- lib/CodeGen/MachineInstr.cpp --------------------------------------===// +//===- lib/CodeGen/MachineInstr.cpp ---------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -11,21 +11,34 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/Analysis/MemoryLocation.h" +#include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" @@ -35,9 +48,13 @@ #include "llvm/IR/Value.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LowLevelTypeImpl.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -45,6 +62,14 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <iterator> +#include <utility> + using namespace llvm; static cl::opt<bool> PrintWholeRegMask( @@ -256,7 +281,7 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { case MachineOperand::MO_GlobalAddress: return getGlobal() == Other.getGlobal() && getOffset() == Other.getOffset(); case MachineOperand::MO_ExternalSymbol: - return !strcmp(getSymbolName(), Other.getSymbolName()) && + return strcmp(getSymbolName(), Other.getSymbolName()) == 0 && getOffset() == Other.getOffset(); case MachineOperand::MO_BlockAddress: return getBlockAddress() == Other.getBlockAddress() && @@ -723,9 +748,7 @@ void MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) { /// the MCInstrDesc. 
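Returning briefly to the MachineBasicBlock change above: the iterator-returning overload of removeLiveIn exists so callers can erase live-ins while iterating. A hedged sketch of the intended pattern, where pruneLiveIns and shouldDrop are invented names:

    #include "llvm/CodeGen/MachineBasicBlock.h"
    using namespace llvm;

    // Remove every live-in register that the predicate rejects.
    static void pruneLiveIns(MachineBasicBlock &MBB,
                             bool (*shouldDrop)(unsigned PhysReg)) {
      for (auto I = MBB.livein_begin(); I != MBB.livein_end();) {
        if (shouldDrop(I->PhysReg))
          I = MBB.removeLiveIn(I); // erase and advance safely
        else
          ++I;
      }
    }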
MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid, DebugLoc dl, bool NoImp) - : MCID(&tid), Parent(nullptr), Operands(nullptr), NumOperands(0), Flags(0), - AsmPrinterFlags(0), NumMemRefs(0), MemRefs(nullptr), - debugLoc(std::move(dl)) { + : MCID(&tid), debugLoc(std::move(dl)) { assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor"); // Reserve space for the expected number of operands. @@ -742,9 +765,8 @@ MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid, /// MachineInstr ctor - Copies MachineInstr arg exactly /// MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI) - : MCID(&MI.getDesc()), Parent(nullptr), Operands(nullptr), NumOperands(0), - Flags(0), AsmPrinterFlags(0), NumMemRefs(MI.NumMemRefs), - MemRefs(MI.MemRefs), debugLoc(MI.getDebugLoc()) { + : MCID(&MI.getDesc()), NumMemRefs(MI.NumMemRefs), MemRefs(MI.MemRefs), + debugLoc(MI.getDebugLoc()) { assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor"); CapOperands = OperandCapacity::get(MI.getNumOperands()); @@ -1633,8 +1655,8 @@ bool MachineInstr::mayAlias(AliasAnalysis *AA, MachineInstr &Other, // memory objects. It can save compile time, and possibly catch some // corner cases not currently covered. - assert ((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset"); - assert ((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset"); + assert((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset"); + assert((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset"); int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset()); int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset; @@ -1667,7 +1689,7 @@ bool MachineInstr::hasOrderedMemoryRef() const { return true; // Check if any of our memory operands are ordered. - return any_of(memoperands(), [](const MachineMemOperand *MMO) { + return llvm::any_of(memoperands(), [](const MachineMemOperand *MMO) { return !MMO->isUnordered(); }); } @@ -1841,7 +1863,6 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, return; // Print the rest of the operands. - bool OmittedAnyCallClobbers = false; bool FirstOp = true; unsigned AsmDescOp = ~0u; unsigned AsmOpCount = 0; @@ -1878,31 +1899,6 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) VirtRegs.push_back(MO.getReg()); - // Omit call-clobbered registers which aren't used anywhere. This makes - // call instructions much less noisy on targets where calls clobber lots - // of registers. Don't rely on MO.isDead() because we may be called before - // LiveVariables is run, or we may be looking at a non-allocatable reg. - if (MRI && isCall() && - MO.isReg() && MO.isImplicit() && MO.isDef()) { - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - if (MRI->use_empty(Reg)) { - bool HasAliasLive = false; - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { - unsigned AliasReg = *AI; - if (!MRI->use_empty(AliasReg)) { - HasAliasLive = true; - break; - } - } - if (!HasAliasLive) { - OmittedAnyCallClobbers = true; - continue; - } - } - } - } - if (FirstOp) FirstOp = false; else OS << ","; OS << " "; if (i < getDesc().NumOperands) { @@ -1984,12 +1980,6 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, MO.print(OS, MST, TRI); } - // Briefly indicate whether any call clobbers were omitted. 
- if (OmittedAnyCallClobbers) { - if (!FirstOp) OS << ","; - OS << " ..."; - } - bool HaveSemi = false; const unsigned PrintableFlags = FrameSetup | FrameDestroy; if (Flags & PrintableFlags) { @@ -2255,8 +2245,8 @@ void MachineInstr::setPhysRegsDeadExcept(ArrayRef<unsigned> UsedRegs, unsigned Reg = MO.getReg(); if (!TargetRegisterInfo::isPhysicalRegister(Reg)) continue; // If there are no uses, including partial uses, the def is dead. - if (none_of(UsedRegs, - [&](unsigned Use) { return TRI.regsOverlap(Use, Reg); })) + if (llvm::none_of(UsedRegs, + [&](unsigned Use) { return TRI.regsOverlap(Use, Reg); })) MO.setIsDead(); } diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp index 6cf751d34e26..c1b72430e605 100644 --- a/lib/CodeGen/MachineModuleInfo.cpp +++ b/lib/CodeGen/MachineModuleInfo.cpp @@ -7,27 +7,34 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/ADT/PointerUnion.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/TinyPtrVector.h" -#include "llvm/Analysis/EHPersonalities.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionInitializer.h" -#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/IR/Constants.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" -#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Dwarf.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" +#include <algorithm> +#include <cassert> +#include <memory> +#include <utility> +#include <vector> + using namespace llvm; using namespace llvm::dwarf; @@ -37,14 +44,16 @@ INITIALIZE_PASS(MachineModuleInfo, "machinemoduleinfo", char MachineModuleInfo::ID = 0; // Out of line virtual method. -MachineModuleInfoImpl::~MachineModuleInfoImpl() {} +MachineModuleInfoImpl::~MachineModuleInfoImpl() = default; namespace llvm { + class MMIAddrLabelMapCallbackPtr final : CallbackVH { - MMIAddrLabelMap *Map; + MMIAddrLabelMap *Map = nullptr; + public: - MMIAddrLabelMapCallbackPtr() : Map(nullptr) {} - MMIAddrLabelMapCallbackPtr(Value *V) : CallbackVH(V), Map(nullptr) {} + MMIAddrLabelMapCallbackPtr() = default; + MMIAddrLabelMapCallbackPtr(Value *V) : CallbackVH(V) {} void setPtr(BasicBlock *BB) { ValueHandleBase::operator=(BB); @@ -75,11 +84,12 @@ class MMIAddrLabelMap { /// This is a per-function list of symbols whose corresponding BasicBlock got /// deleted. These symbols need to be emitted at some point in the file, so /// AsmPrinter emits them after the function body. 
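The MMIAddrLabelMapCallbackPtr above is an instance of LLVM's CallbackVH pattern: a value handle whose hooks fire when the watched Value is destroyed or replaced. A minimal standalone sketch of that pattern (BlockWatcher is an illustrative class; the real one inherits privately and forwards to MMIAddrLabelMap):

  #include "llvm/IR/ValueHandle.h"
  using namespace llvm;

  // Watch one Value: get notified when it is deleted or RAUW'd.
  class BlockWatcher final : public CallbackVH {
    void deleted() override {
      // The tracked value was destroyed; stop pointing at it.
      setValPtr(nullptr);
    }
    void allUsesReplacedWith(Value *New) override {
      // The tracked value was replaced; follow the replacement.
      setValPtr(New);
    }

  public:
    BlockWatcher(Value *V) : CallbackVH(V) {}
  };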
- DenseMap<AssertingVH<Function>, std::vector<MCSymbol*> > + DenseMap<AssertingVH<Function>, std::vector<MCSymbol*>> DeletedAddrLabelsNeedingEmission; -public: +public: MMIAddrLabelMap(MCContext &context) : Context(context) {} + ~MMIAddrLabelMap() { assert(DeletedAddrLabelsNeedingEmission.empty() && "Some labels for deleted blocks never got emitted"); @@ -93,7 +103,8 @@ public: void UpdateForDeletedBlock(BasicBlock *BB); void UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New); }; -} + +} // end namespace llvm ArrayRef<MCSymbol *> MMIAddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) { assert(BB->hasAddressTaken() && @@ -119,7 +130,7 @@ ArrayRef<MCSymbol *> MMIAddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) { /// If we have any deleted symbols for F, return them. void MMIAddrLabelMap:: takeDeletedSymbolsForFunction(Function *F, std::vector<MCSymbol*> &Result) { - DenseMap<AssertingVH<Function>, std::vector<MCSymbol*> >::iterator I = + DenseMap<AssertingVH<Function>, std::vector<MCSymbol*>>::iterator I = DeletedAddrLabelsNeedingEmission.find(F); // If there are no entries for the function, just return. @@ -130,7 +141,6 @@ takeDeletedSymbolsForFunction(Function *F, std::vector<MCSymbol*> &Result) { DeletedAddrLabelsNeedingEmission.erase(I); } - void MMIAddrLabelMap::UpdateForDeletedBlock(BasicBlock *BB) { // If the block got deleted, there is no need for the symbol. If the symbol // was already emitted, we can just forget about it, otherwise we need to @@ -177,7 +187,6 @@ void MMIAddrLabelMap::UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New) { OldEntry.Symbols.end()); } - void MMIAddrLabelMapCallbackPtr::deleted() { Map->UpdateForDeletedBlock(cast<BasicBlock>(getValPtr())); } @@ -186,9 +195,6 @@ void MMIAddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) { Map->UpdateForRAUWBlock(cast<BasicBlock>(getValPtr()), cast<BasicBlock>(V2)); } - -//===----------------------------------------------------------------------===// - MachineModuleInfo::MachineModuleInfo(const TargetMachine *TM) : ImmutablePass(ID), TM(*TM), Context(TM->getMCAsmInfo(), TM->getMCRegisterInfo(), @@ -196,11 +202,9 @@ MachineModuleInfo::MachineModuleInfo(const TargetMachine *TM) initializeMachineModuleInfoPass(*PassRegistry::getPassRegistry()); } -MachineModuleInfo::~MachineModuleInfo() { -} +MachineModuleInfo::~MachineModuleInfo() = default; bool MachineModuleInfo::doInitialization(Module &M) { - ObjFileMMI = nullptr; CurCallSite = 0; DbgInfoAvailable = UsesVAFloatArgument = UsesMorestackAddr = false; @@ -211,7 +215,6 @@ bool MachineModuleInfo::doInitialization(Module &M) { } bool MachineModuleInfo::doFinalization(Module &M) { - Personalities.clear(); delete AddrLabelSymbols; @@ -290,10 +293,12 @@ void MachineModuleInfo::deleteMachineFunctionFor(Function &F) { } namespace { + /// This pass frees the MachineFunction object associated with a Function. 
class FreeMachineFunction : public FunctionPass { public: static char ID; + FreeMachineFunction() : FunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -311,14 +316,14 @@ public: return "Free MachineFunction"; } }; -char FreeMachineFunction::ID; + } // end anonymous namespace -namespace llvm { -FunctionPass *createFreeMachineFunctionPass() { +char FreeMachineFunction::ID; + +FunctionPass *llvm::createFreeMachineFunctionPass() { return new FreeMachineFunction(); } -} // end namespace llvm //===- MMI building helpers -----------------------------------------------===// diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ab36bc1417ae..fb51a4eb1421 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -280,6 +280,7 @@ namespace { SDValue visitSELECT_CC(SDNode *N); SDValue visitSETCC(SDNode *N); SDValue visitSETCCE(SDNode *N); + SDValue visitSETCCCARRY(SDNode *N); SDValue visitSIGN_EXTEND(SDNode *N); SDValue visitZERO_EXTEND(SDNode *N); SDValue visitANY_EXTEND(SDNode *N); @@ -1457,6 +1458,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::SELECT_CC: return visitSELECT_CC(N); case ISD::SETCC: return visitSETCC(N); case ISD::SETCCE: return visitSETCCE(N); + case ISD::SETCCCARRY: return visitSETCCCARRY(N); case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N); case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N); case ISD::ANY_EXTEND: return visitANY_EXTEND(N); @@ -1958,7 +1960,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) { // fold (a+b) -> (a|b) iff a and b share no bits. if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) && - VT.isInteger() && DAG.haveNoCommonBitsSet(N0, N1)) + DAG.haveNoCommonBitsSet(N0, N1)) return DAG.getNode(ISD::OR, DL, VT, N0, N1); if (SDValue Combined = visitADDLike(N0, N1, N)) @@ -1970,6 +1972,44 @@ SDValue DAGCombiner::visitADD(SDNode *N) { return SDValue(); } +static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) { + bool Masked = false; + + // First, peel away TRUNCATE/ZERO_EXTEND/AND nodes due to legalization. + while (true) { + if (V.getOpcode() == ISD::TRUNCATE || V.getOpcode() == ISD::ZERO_EXTEND) { + V = V.getOperand(0); + continue; + } + + if (V.getOpcode() == ISD::AND && isOneConstant(V.getOperand(1))) { + Masked = true; + V = V.getOperand(0); + continue; + } + + break; + } + + // If this is not a carry, return. + if (V.getResNo() != 1) + return SDValue(); + + if (V.getOpcode() != ISD::ADDCARRY && V.getOpcode() != ISD::SUBCARRY && + V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO) + return SDValue(); + + // If the result is masked, then no matter what kind of bool it is we can + // return. If it isn't, then we need to make sure the bool type is either 0 or + // 1 and not other values. 
+ if (Masked || + TLI.getBooleanContents(V.getValueType()) == + TargetLoweringBase::ZeroOrOneBooleanContent) + return V; + + return SDValue(); +} + SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) { EVT VT = N0.getValueType(); SDLoc DL(LocReference); @@ -2017,6 +2057,13 @@ SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(), N0, N1.getOperand(0), N1.getOperand(2)); + // (add X, Carry) -> (addcarry X, 0, Carry) + if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) + if (SDValue Carry = getAsCarry(TLI, N1)) + return DAG.getNode(ISD::ADDCARRY, DL, + DAG.getVTList(VT, Carry.getValueType()), N0, + DAG.getConstant(0, DL, VT), Carry); + return SDValue(); } @@ -2090,6 +2137,8 @@ SDValue DAGCombiner::visitUADDO(SDNode *N) { } SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) { + auto VT = N0.getValueType(); + // (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry) // If Y + 1 cannot overflow. if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) { @@ -2100,6 +2149,12 @@ SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) { N1.getOperand(2)); } + // (uaddo X, Carry) -> (addcarry X, 0, Carry) + if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) + if (SDValue Carry = getAsCarry(TLI, N1)) + return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, + DAG.getConstant(0, SDLoc(N), VT), Carry); + return SDValue(); } @@ -2167,6 +2222,41 @@ SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0.getOperand(0), N0.getOperand(1), CarryIn); + /** + * When one of the addcarry argument is itself a carry, we may be facing + * a diamond carry propagation. In which case we try to transform the DAG + * to ensure linear carry propagation if that is possible. + * + * We are trying to get: + * (addcarry X, 0, (addcarry A, B, Z):Carry) + */ + if (auto Y = getAsCarry(TLI, N1)) { + /** + * (uaddo A, B) + * / \ + * Carry Sum + * | \ + * | (addcarry *, 0, Z) + * | / + * \ Carry + * | / + * (addcarry X, *, *) + */ + if (Y.getOpcode() == ISD::UADDO && + CarryIn.getResNo() == 1 && + CarryIn.getOpcode() == ISD::ADDCARRY && + isNullConstant(CarryIn.getOperand(1)) && + CarryIn.getOperand(0) == Y.getValue(0)) { + auto NewY = DAG.getNode(ISD::ADDCARRY, SDLoc(N), Y->getVTList(), + Y.getOperand(0), Y.getOperand(1), + CarryIn.getOperand(2)); + AddToWorklist(NewY.getNode()); + return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, + DAG.getConstant(0, SDLoc(N), N0.getValueType()), + NewY.getValue(1)); + } + } + return SDValue(); } @@ -6754,6 +6844,19 @@ SDValue DAGCombiner::visitSETCCE(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue Carry = N->getOperand(2); + SDValue Cond = N->getOperand(3); + + // If Carry is false, fold to a regular SETCC. + if (isNullConstant(Carry)) + return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond); + + return SDValue(); +} + /// Try to fold a sext/zext/aext dag node into a ConstantSDNode or /// a build_vector of constants. 
/// This function is called by the DAGCombiner when visiting sext/zext/aext @@ -7124,12 +7227,11 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); - CombineTo(N, ExtLoad); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); - CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); + return CombineTo(N, ExtLoad); // Return N so it doesn't get rechecked! } } @@ -7185,10 +7287,9 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0.getOperand(0)), N0.getOperand(0).getValueType(), ExtLoad); - CombineTo(N, And); - CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); + return CombineTo(N, And); // Return N so it doesn't get rechecked! } } } @@ -7427,12 +7528,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); + ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ZERO_EXTEND); CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); - - ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), - ISD::ZERO_EXTEND); - CombineTo(N, ExtLoad); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + return CombineTo(N, ExtLoad); // Return N so it doesn't get rechecked! } } @@ -7482,11 +7580,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0.getOperand(0)), N0.getOperand(0).getValueType(), ExtLoad); - CombineTo(N, And); + ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::ZERO_EXTEND); CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); - ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, - ISD::ZERO_EXTEND); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + return CombineTo(N, And); // Return N so it doesn't get rechecked! } } } @@ -12777,10 +12873,10 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { } // If we have load/store pair instructions and we only have two values, - // don't bother. + // don't bother merging. 
unsigned RequiredAlignment; if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) && - St->getAlignment() >= RequiredAlignment) { + StoreNodes[0].MemNode->getAlignment() >= RequiredAlignment) { StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2); continue; } diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 92b0d2ae4015..0d5e07ded25c 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -2875,6 +2875,7 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::SELECT_CC: Res = ExpandIntOp_SELECT_CC(N); break; case ISD::SETCC: Res = ExpandIntOp_SETCC(N); break; case ISD::SETCCE: Res = ExpandIntOp_SETCCE(N); break; + case ISD::SETCCCARRY: Res = ExpandIntOp_SETCCCARRY(N); break; case ISD::SINT_TO_FP: Res = ExpandIntOp_SINT_TO_FP(N); break; case ISD::STORE: Res = ExpandIntOp_STORE(cast<StoreSDNode>(N), OpNo); break; case ISD::TRUNCATE: Res = ExpandIntOp_TRUNCATE(N); break; @@ -3009,14 +3010,16 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS, return; } - // Lower with SETCCE if the target supports it. + // Lower with SETCCE or SETCCCARRY if the target supports it. + EVT HiVT = LHSHi.getValueType(); + EVT ExpandVT = TLI.getTypeToExpandTo(*DAG.getContext(), HiVT); + bool HasSETCCCARRY = TLI.isOperationLegalOrCustom(ISD::SETCCCARRY, ExpandVT); + // FIXME: Make all targets support this, then remove the other lowering. - if (TLI.getOperationAction( - ISD::SETCCE, - TLI.getTypeToExpandTo(*DAG.getContext(), LHSLo.getValueType())) == - TargetLowering::Custom) { - // SETCCE can detect < and >= directly. For > and <=, flip operands and - // condition code. + if (HasSETCCCARRY || + TLI.getOperationAction(ISD::SETCCE, ExpandVT) == TargetLowering::Custom) { + // SETCCE/SETCCCARRY can detect < and >= directly. For > and <=, flip + // operands and condition code. bool FlipOperands = false; switch (CCCode) { case ISD::SETGT: CCCode = ISD::SETLT; FlipOperands = true; break; @@ -3030,27 +3033,28 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS, std::swap(LHSHi, RHSHi); } // Perform a wide subtraction, feeding the carry from the low part into - // SETCCE. The SETCCE operation is essentially looking at the high part of - // the result of LHS - RHS. It is negative iff LHS < RHS. It is zero or - // positive iff LHS >= RHS. - SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), MVT::Glue); - SDValue LowCmp = DAG.getNode(ISD::SUBC, dl, VTList, LHSLo, RHSLo); - SDValue Res = - DAG.getNode(ISD::SETCCE, dl, getSetCCResultType(LHSLo.getValueType()), - LHSHi, RHSHi, LowCmp.getValue(1), DAG.getCondCode(CCCode)); + // SETCCE/SETCCCARRY. The SETCCE/SETCCCARRY operation is essentially + // looking at the high part of the result of LHS - RHS. It is negative + // iff LHS < RHS. It is zero or positive iff LHS >= RHS. + EVT LoVT = LHSLo.getValueType(); + SDVTList VTList = DAG.getVTList( + LoVT, HasSETCCCARRY ? getSetCCResultType(LoVT) : MVT::Glue); + SDValue LowCmp = DAG.getNode(HasSETCCCARRY ? ISD::USUBO : ISD::SUBC, dl, + VTList, LHSLo, RHSLo); + SDValue Res = DAG.getNode(HasSETCCCARRY ? 
ISD::SETCCCARRY : ISD::SETCCE, dl, + getSetCCResultType(HiVT), LHSHi, RHSHi, + LowCmp.getValue(1), DAG.getCondCode(CCCode)); NewLHS = Res; NewRHS = SDValue(); return; } - NewLHS = TLI.SimplifySetCC(getSetCCResultType(LHSHi.getValueType()), - LHSHi, RHSHi, ISD::SETEQ, false, - DagCombineInfo, dl); + NewLHS = TLI.SimplifySetCC(getSetCCResultType(HiVT), LHSHi, RHSHi, ISD::SETEQ, + false, DagCombineInfo, dl); if (!NewLHS.getNode()) - NewLHS = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()), - LHSHi, RHSHi, ISD::SETEQ); - NewLHS = DAG.getSelect(dl, LoCmp.getValueType(), - NewLHS, LoCmp, HiCmp); + NewLHS = + DAG.getSetCC(dl, getSetCCResultType(HiVT), LHSHi, RHSHi, ISD::SETEQ); + NewLHS = DAG.getSelect(dl, LoCmp.getValueType(), NewLHS, LoCmp, HiCmp); NewRHS = SDValue(); } @@ -3103,8 +3107,8 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) { } // Otherwise, update N to have the operands specified. - return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, - DAG.getCondCode(CCCode)), 0); + return SDValue( + DAG.UpdateNodeOperands(N, NewLHS, NewRHS, DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SETCCE(SDNode *N) { @@ -3125,6 +3129,24 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SETCCE(SDNode *N) { LowCmp.getValue(1), Cond); } +SDValue DAGTypeLegalizer::ExpandIntOp_SETCCCARRY(SDNode *N) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue Carry = N->getOperand(2); + SDValue Cond = N->getOperand(3); + SDLoc dl = SDLoc(N); + + SDValue LHSLo, LHSHi, RHSLo, RHSHi; + GetExpandedInteger(LHS, LHSLo, LHSHi); + GetExpandedInteger(RHS, RHSLo, RHSHi); + + // Expand to a SUBE for the low part and a smaller SETCCCARRY for the high. + SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), Carry.getValueType()); + SDValue LowCmp = DAG.getNode(ISD::SUBCARRY, dl, VTList, LHSLo, RHSLo, Carry); + return DAG.getNode(ISD::SETCCCARRY, dl, N->getValueType(0), LHSHi, RHSHi, + LowCmp.getValue(1), Cond); +} + SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) { // The value being shifted is legal, but the shift amount is too big. // It follows that either the result of the shift is undefined, or the diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 4c3b514856b7..8e999188d8e1 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -381,6 +381,7 @@ private: SDValue ExpandIntOp_SELECT_CC(SDNode *N); SDValue ExpandIntOp_SETCC(SDNode *N); SDValue ExpandIntOp_SETCCE(SDNode *N); + SDValue ExpandIntOp_SETCCCARRY(SDNode *N); SDValue ExpandIntOp_Shift(SDNode *N); SDValue ExpandIntOp_SINT_TO_FP(SDNode *N); SDValue ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 4f4025d8ae6a..579112c9bfc8 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -226,6 +226,7 @@ private: void UnscheduleNodeBottomUp(SUnit*); void RestoreHazardCheckerBottomUp(); void BacktrackBottomUp(SUnit*, SUnit*); + SUnit *TryUnfoldSU(SUnit *); SUnit *CopyAndMoveSuccessors(SUnit*); void InsertCopiesAndMoveSuccs(SUnit*, unsigned, const TargetRegisterClass*, @@ -780,7 +781,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { } /// CapturePred - This does the opposite of ReleasePred. Since SU is being -/// unscheduled, incrcease the succ left count of its predecessors. 
Remove +/// unscheduled, increase the succ left count of its predecessors. Remove /// them from AvailableQueue if necessary. void ScheduleDAGRRList::CapturePred(SDep *PredEdge) { SUnit *PredSU = PredEdge->getSUnit(); @@ -934,6 +935,146 @@ static bool isOperandOf(const SUnit *SU, SDNode *N) { return false; } +/// TryUnfold - Attempt to unfold +SUnit *ScheduleDAGRRList::TryUnfoldSU(SUnit *SU) { + SDNode *N = SU->getNode(); + // Use while over if to ease fall through. + SmallVector<SDNode *, 2> NewNodes; + if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes)) + return nullptr; + + // unfolding an x86 DEC64m operation results in store, dec, load which + // can't be handled here so quit + if (NewNodes.size() == 3) + return nullptr; + + assert(NewNodes.size() == 2 && "Expected a load folding node!"); + + N = NewNodes[1]; + SDNode *LoadNode = NewNodes[0]; + unsigned NumVals = N->getNumValues(); + unsigned OldNumVals = SU->getNode()->getNumValues(); + + // LoadNode may already exist. This can happen when there is another + // load from the same location and producing the same type of value + // but it has different alignment or volatileness. + bool isNewLoad = true; + SUnit *LoadSU; + if (LoadNode->getNodeId() != -1) { + LoadSU = &SUnits[LoadNode->getNodeId()]; + // If LoadSU has already been scheduled, we should clone it but + // this would negate the benefit to unfolding so just return SU. + if (LoadSU->isScheduled) + return SU; + isNewLoad = false; + } else { + LoadSU = CreateNewSUnit(LoadNode); + LoadNode->setNodeId(LoadSU->NodeNum); + + InitNumRegDefsLeft(LoadSU); + computeLatency(LoadSU); + } + + DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n"); + + // Now that we are committed to unfolding replace DAG Uses. + for (unsigned i = 0; i != NumVals; ++i) + DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i)); + DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals - 1), + SDValue(LoadNode, 1)); + + SUnit *NewSU = CreateNewSUnit(N); + assert(N->getNodeId() == -1 && "Node already inserted!"); + N->setNodeId(NewSU->NodeNum); + + const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); + for (unsigned i = 0; i != MCID.getNumOperands(); ++i) { + if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) { + NewSU->isTwoAddress = true; + break; + } + } + if (MCID.isCommutable()) + NewSU->isCommutable = true; + + InitNumRegDefsLeft(NewSU); + computeLatency(NewSU); + + // Record all the edges to and from the old SU, by category. + SmallVector<SDep, 4> ChainPreds; + SmallVector<SDep, 4> ChainSuccs; + SmallVector<SDep, 4> LoadPreds; + SmallVector<SDep, 4> NodePreds; + SmallVector<SDep, 4> NodeSuccs; + for (SDep &Pred : SU->Preds) { + if (Pred.isCtrl()) + ChainPreds.push_back(Pred); + else if (isOperandOf(Pred.getSUnit(), LoadNode)) + LoadPreds.push_back(Pred); + else + NodePreds.push_back(Pred); + } + for (SDep &Succ : SU->Succs) { + if (Succ.isCtrl()) + ChainSuccs.push_back(Succ); + else + NodeSuccs.push_back(Succ); + } + + // Now assign edges to the newly-created nodes. + for (const SDep &Pred : ChainPreds) { + RemovePred(SU, Pred); + if (isNewLoad) + AddPred(LoadSU, Pred); + } + for (const SDep &Pred : LoadPreds) { + RemovePred(SU, Pred); + if (isNewLoad) + AddPred(LoadSU, Pred); + } + for (const SDep &Pred : NodePreds) { + RemovePred(SU, Pred); + AddPred(NewSU, Pred); + } + for (SDep D : NodeSuccs) { + SUnit *SuccDep = D.getSUnit(); + D.setSUnit(SU); + RemovePred(SuccDep, D); + D.setSUnit(NewSU); + AddPred(SuccDep, D); + // Balance register pressure. 
+ if (AvailableQueue->tracksRegPressure() && SuccDep->isScheduled && + !D.isCtrl() && NewSU->NumRegDefsLeft > 0) + --NewSU->NumRegDefsLeft; + } + for (SDep D : ChainSuccs) { + SUnit *SuccDep = D.getSUnit(); + D.setSUnit(SU); + RemovePred(SuccDep, D); + if (isNewLoad) { + D.setSUnit(LoadSU); + AddPred(SuccDep, D); + } + } + + // Add a data dependency to reflect that NewSU reads the value defined + // by LoadSU. + SDep D(LoadSU, SDep::Data, 0); + D.setLatency(LoadSU->Latency); + AddPred(NewSU, D); + + if (isNewLoad) + AvailableQueue->addNode(LoadSU); + AvailableQueue->addNode(NewSU); + + ++NumUnfolds; + + if (NewSU->NumSuccsLeft == 0) + NewSU->isAvailable = true; + + return NewSU; +} + /// CopyAndMoveSuccessors - Clone the specified node and move its scheduled /// successors to the newly created node. SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) { @@ -959,135 +1100,16 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) { return nullptr; } + // If possible unfold instruction. if (TryUnfold) { - SmallVector<SDNode*, 2> NewNodes; - if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes)) + SUnit *UnfoldSU = TryUnfoldSU(SU); + if (!UnfoldSU) return nullptr; - - // unfolding an x86 DEC64m operation results in store, dec, load which - // can't be handled here so quit - if (NewNodes.size() == 3) - return nullptr; - - DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n"); - assert(NewNodes.size() == 2 && "Expected a load folding node!"); - - N = NewNodes[1]; - SDNode *LoadNode = NewNodes[0]; - unsigned NumVals = N->getNumValues(); - unsigned OldNumVals = SU->getNode()->getNumValues(); - for (unsigned i = 0; i != NumVals; ++i) - DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i)); - DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals-1), - SDValue(LoadNode, 1)); - - // LoadNode may already exist. This can happen when there is another - // load from the same location and producing the same type of value - // but it has different alignment or volatileness. - bool isNewLoad = true; - SUnit *LoadSU; - if (LoadNode->getNodeId() != -1) { - LoadSU = &SUnits[LoadNode->getNodeId()]; - isNewLoad = false; - } else { - LoadSU = CreateNewSUnit(LoadNode); - LoadNode->setNodeId(LoadSU->NodeNum); - - InitNumRegDefsLeft(LoadSU); - computeLatency(LoadSU); - } - - SUnit *NewSU = CreateNewSUnit(N); - assert(N->getNodeId() == -1 && "Node already inserted!"); - N->setNodeId(NewSU->NodeNum); - - const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); - for (unsigned i = 0; i != MCID.getNumOperands(); ++i) { - if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) { - NewSU->isTwoAddress = true; - break; - } - } - if (MCID.isCommutable()) - NewSU->isCommutable = true; - - InitNumRegDefsLeft(NewSU); - computeLatency(NewSU); - - // Record all the edges to and from the old SU, by category. - SmallVector<SDep, 4> ChainPreds; - SmallVector<SDep, 4> ChainSuccs; - SmallVector<SDep, 4> LoadPreds; - SmallVector<SDep, 4> NodePreds; - SmallVector<SDep, 4> NodeSuccs; - for (SDep &Pred : SU->Preds) { - if (Pred.isCtrl()) - ChainPreds.push_back(Pred); - else if (isOperandOf(Pred.getSUnit(), LoadNode)) - LoadPreds.push_back(Pred); - else - NodePreds.push_back(Pred); - } - for (SDep &Succ : SU->Succs) { - if (Succ.isCtrl()) - ChainSuccs.push_back(Succ); - else - NodeSuccs.push_back(Succ); - } - - // Now assign edges to the newly-created nodes. 
- for (const SDep &Pred : ChainPreds) { - RemovePred(SU, Pred); - if (isNewLoad) - AddPred(LoadSU, Pred); - } - for (const SDep &Pred : LoadPreds) { - RemovePred(SU, Pred); - if (isNewLoad) - AddPred(LoadSU, Pred); - } - for (const SDep &Pred : NodePreds) { - RemovePred(SU, Pred); - AddPred(NewSU, Pred); - } - for (SDep D : NodeSuccs) { - SUnit *SuccDep = D.getSUnit(); - D.setSUnit(SU); - RemovePred(SuccDep, D); - D.setSUnit(NewSU); - AddPred(SuccDep, D); - // Balance register pressure. - if (AvailableQueue->tracksRegPressure() && SuccDep->isScheduled - && !D.isCtrl() && NewSU->NumRegDefsLeft > 0) - --NewSU->NumRegDefsLeft; - } - for (SDep D : ChainSuccs) { - SUnit *SuccDep = D.getSUnit(); - D.setSUnit(SU); - RemovePred(SuccDep, D); - if (isNewLoad) { - D.setSUnit(LoadSU); - AddPred(SuccDep, D); - } - } - - // Add a data dependency to reflect that NewSU reads the value defined - // by LoadSU. - SDep D(LoadSU, SDep::Data, 0); - D.setLatency(LoadSU->Latency); - AddPred(NewSU, D); - - if (isNewLoad) - AvailableQueue->addNode(LoadSU); - AvailableQueue->addNode(NewSU); - - ++NumUnfolds; - - if (NewSU->NumSuccsLeft == 0) { - NewSU->isAvailable = true; - return NewSU; - } - SU = NewSU; + SU = UnfoldSU; + N = SU->getNode(); + // If this can be scheduled don't bother duplicating and just return + if (SU->NumSuccsLeft == 0) + return SU; } DEBUG(dbgs() << " Duplicating SU #" << SU->NodeNum << "\n"); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index c37d7080f2c5..0dbd9e846aa6 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -214,6 +214,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FPOWI: return "fpowi"; case ISD::SETCC: return "setcc"; case ISD::SETCCE: return "setcce"; + case ISD::SETCCCARRY: return "setcccarry"; case ISD::SELECT: return "select"; case ISD::VSELECT: return "vselect"; case ISD::SELECT_CC: return "select_cc"; diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index 0def5ae6d0d0..900c0318b179 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -842,9 +842,10 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { initActions(); // Perform these initializations only once. - MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8; - MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize - = MaxStoresPerMemmoveOptSize = 4; + MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = + MaxLoadsPerMemcmp = 8; + MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize = + MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4; UseUnderscoreSetJmp = false; UseUnderscoreLongJmp = false; HasMultipleConditionRegisters = false; @@ -926,6 +927,7 @@ void TargetLoweringBase::initActions() { // ADDCARRY operations default to expand setOperationAction(ISD::ADDCARRY, VT, Expand); setOperationAction(ISD::SUBCARRY, VT, Expand); + setOperationAction(ISD::SETCCCARRY, VT, Expand); // These default to Expand so they will be expanded to CTLZ/CTTZ by default. 
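A hedged sketch of how a backend would opt in to the pieces initialized above, as a fragment from a hypothetical target's TargetLowering constructor (MyTargetLowering is illustrative, and the limits simply mirror the generic defaults set in TargetLoweringBase):

  // In a hypothetical target's TargetLowering constructor:
  MyTargetLowering::MyTargetLowering(const TargetMachine &TM)
      : TargetLowering(TM) {
    // Lower wide integer compares through the new carry-based node rather
    // than the glued SETCCE path; everything defaults to Expand, per above.
    setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);

    // Budget for inline memcmp expansion: up to 8 loads, 4 when optimizing
    // for size (matching the generic defaults initialized above).
    MaxLoadsPerMemcmp = 8;
    MaxLoadsPerMemcmpOptSize = 4;
  }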
setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp index 83348058eca9..72d5e995ac22 100644 --- a/lib/CodeGen/TargetPassConfig.cpp +++ b/lib/CodeGen/TargetPassConfig.cpp @@ -261,9 +261,9 @@ TargetPassConfig::~TargetPassConfig() { // Out of line constructor provides default values for pass options and // registers all common codegen passes. -TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) +TargetPassConfig::TargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &pm) : ImmutablePass(ID), PM(&pm), Started(true), Stopped(false), - AddingMachinePasses(false), TM(tm), Impl(nullptr), Initialized(false), + AddingMachinePasses(false), TM(&TM), Impl(nullptr), Initialized(false), DisableVerify(false), EnableTailMerge(true), RequireCodeGenSCCOrder(false) { @@ -282,9 +282,9 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) substitutePass(&PostRAMachineLICMID, &MachineLICMID); if (StringRef(PrintMachineInstrs.getValue()).equals("")) - TM->Options.PrintMachineCode = true; + TM.Options.PrintMachineCode = true; - if (TM->Options.EnableIPRA) + if (TM.Options.EnableIPRA) setRequiresCodeGenSCCOrder(); } @@ -310,7 +310,7 @@ void TargetPassConfig::insertPass(AnalysisID TargetPassID, /// /// Targets may override this to extend TargetPassConfig. TargetPassConfig *LLVMTargetMachine::createPassConfig(PassManagerBase &PM) { - return new TargetPassConfig(this, PM); + return new TargetPassConfig(*this, PM); } TargetPassConfig::TargetPassConfig() @@ -430,7 +430,12 @@ void TargetPassConfig::addPrintPass(const std::string &Banner) { } void TargetPassConfig::addVerifyPass(const std::string &Banner) { - if (VerifyMachineCode) + bool Verify = VerifyMachineCode; +#ifdef EXPENSIVE_CHECKS + if (VerifyMachineCode == cl::BOU_UNSET) + Verify = TM->isMachineVerifierClean(); +#endif + if (Verify) PM->add(createMachineVerifierPass(Banner)); } diff --git a/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp b/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp index 4c78caf03477..d058f4864975 100644 --- a/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp +++ b/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp @@ -46,7 +46,7 @@ Error CVSymbolVisitor::visitSymbolRecord(CVSymbol &Record) { } #define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) \ SYMBOL_RECORD(EnumVal, EnumVal, AliasName) -#include "llvm/DebugInfo/CodeView/CVSymbolTypes.def" +#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def" } if (auto EC = Callbacks.visitSymbolEnd(Record)) diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp index 705b548141b0..f0debd9e9702 100644 --- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp +++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp @@ -71,7 +71,7 @@ static Error visitMemberRecord(CVMemberRecord &Record, MEMBER_RECORD(EnumVal, EnumVal, AliasName) #define TYPE_RECORD(EnumName, EnumVal, Name) #define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" } if (auto EC = Callbacks.visitMemberEnd(Record)) @@ -155,7 +155,7 @@ Error CVTypeVisitor::finishVisitation(CVType &Record) { TYPE_RECORD(EnumVal, EnumVal, AliasName) #define MEMBER_RECORD(EnumName, EnumVal, Name) #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" } if (auto EC = Callbacks.visitTypeEnd(Record)) 
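The TargetPassConfig change above replaces the TargetMachine pointer with an LLVMTargetMachine reference. A short sketch of what that means for a backend's override, assuming hypothetical MyTargetMachine/MyTargetPassConfig names (the pattern mirrors createPassConfig above):

  #include "llvm/CodeGen/TargetPassConfig.h"
  using namespace llvm;

  namespace {
  class MyTargetPassConfig : public TargetPassConfig {
  public:
    // The base constructor now takes the LLVMTargetMachine by reference.
    MyTargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
        : TargetPassConfig(TM, PM) {}
  };
  } // end anonymous namespace

  TargetPassConfig *MyTargetMachine::createPassConfig(PassManagerBase &PM) {
    return new MyTargetPassConfig(*this, PM); // was: new TargetPassConfig(this, PM)
  }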
diff --git a/lib/DebugInfo/CodeView/EnumTables.cpp b/lib/DebugInfo/CodeView/EnumTables.cpp index 0441110c85ef..01d8ccf2d31e 100644 --- a/lib/DebugInfo/CodeView/EnumTables.cpp +++ b/lib/DebugInfo/CodeView/EnumTables.cpp @@ -20,13 +20,13 @@ using namespace codeview; static const EnumEntry<SymbolKind> SymbolTypeNames[] = { #define CV_SYMBOL(enum, val) {#enum, enum}, -#include "llvm/DebugInfo/CodeView/CVSymbolTypes.def" +#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def" #undef CV_SYMBOL }; static const EnumEntry<TypeLeafKind> TypeLeafNames[] = { #define CV_TYPE(name, val) {#name, name}, -#include "llvm/DebugInfo/CodeView/TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" #undef CV_TYPE }; diff --git a/lib/DebugInfo/CodeView/SymbolDumper.cpp b/lib/DebugInfo/CodeView/SymbolDumper.cpp index 2f5a7d256c60..3d49a7198d1a 100644 --- a/lib/DebugInfo/CodeView/SymbolDumper.cpp +++ b/lib/DebugInfo/CodeView/SymbolDumper.cpp @@ -41,7 +41,7 @@ public: #define SYMBOL_RECORD(EnumName, EnumVal, Name) \ Error visitKnownRecord(CVSymbol &CVR, Name &Record) override; #define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/CVSymbolTypes.def" +#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def" Error visitSymbolBegin(CVSymbol &Record) override; Error visitSymbolEnd(CVSymbol &Record) override; diff --git a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp index 84f52a055815..04b0384d8190 100644 --- a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp +++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp @@ -26,7 +26,7 @@ using namespace llvm::codeview; static const EnumEntry<TypeLeafKind> LeafTypeNames[] = { #define CV_TYPE(enum, val) {#enum, enum}, -#include "llvm/DebugInfo/CodeView/TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" }; #define ENUM_ENTRY(enum_class, enum) \ @@ -155,7 +155,7 @@ static StringRef getLeafTypeName(TypeLeafKind LT) { #define TYPE_RECORD(ename, value, name) \ case ename: \ return #name; -#include "llvm/DebugInfo/CodeView/TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" default: break; } diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index 5ed55ce4c0dc..1be156d6ea9b 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -84,8 +84,12 @@ static void dumpAccelSection(raw_ostream &OS, StringRef Name, Accel.dump(OS); } -void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType, bool DumpEH, - bool SummarizeTypes) { +void DWARFContext::dump(raw_ostream &OS, DIDumpOptions DumpOpts){ + + DIDumpType DumpType = DumpOpts.DumpType; + bool DumpEH = DumpOpts.DumpEH; + bool SummarizeTypes = DumpOpts.SummarizeTypes; + if (DumpType == DIDT_All || DumpType == DIDT_Abbrev) { OS << ".debug_abbrev contents:\n"; getDebugAbbrev()->dump(OS); diff --git a/lib/DebugInfo/PDB/Native/InfoStream.cpp b/lib/DebugInfo/PDB/Native/InfoStream.cpp index 2a1d12e82390..7c6069652da6 100644 --- a/lib/DebugInfo/PDB/Native/InfoStream.cpp +++ b/lib/DebugInfo/PDB/Native/InfoStream.cpp @@ -79,6 +79,7 @@ Error InfoStream::reload() { break; case uint32_t(PdbRaw_FeatureSig::MinimalDebugInfo): Features |= PdbFeatureMinimalDebugInfo; + break; default: continue; } diff --git a/lib/DebugInfo/PDB/PDBContext.cpp b/lib/DebugInfo/PDB/PDBContext.cpp index 94b81ecf561e..f6b6b951ebe1 100644 --- a/lib/DebugInfo/PDB/PDBContext.cpp +++ b/lib/DebugInfo/PDB/PDBContext.cpp @@ -29,8 +29,7 @@ PDBContext::PDBContext(const COFFObjectFile 
&Object, Session->setLoadAddress(ImageBase.get()); } -void PDBContext::dump(raw_ostream &OS, DIDumpType DumpType, bool DumpEH, - bool SummarizeTypes) {} +void PDBContext::dump(raw_ostream &OS, DIDumpOptions DumpOpts){} DILineInfo PDBContext::getLineInfoForAddress(uint64_t Address, DILineInfoSpecifier Specifier) { diff --git a/lib/Fuzzer/test/dump_coverage.test b/lib/Fuzzer/test/dump_coverage.test index 8acc8304fc60..bd85ed718e19 100644 --- a/lib/Fuzzer/test/dump_coverage.test +++ b/lib/Fuzzer/test/dump_coverage.test @@ -4,11 +4,11 @@ RUN: sancov -covered-functions LLVMFuzzer-NullDerefTest* %t_workdir/*.sancov | F RUN: env ASAN_OPTIONS=coverage_dir='"%t_workdir"' LLVMFuzzer-DSOTest -dump_coverage=1 -runs=0 2>&1 | FileCheck %s --check-prefix=DSO RUN: env ASAN_OPTIONS=coverage_dir='"%t_workdir"' not LLVMFuzzer-NullDerefTest -dump_coverage=0 2>&1 | FileCheck %s --check-prefix=NOCOV -CHECK: SanitizerCoverage: {{.*}}LLVMFuzzer-NullDerefTest.{{.*}}.sancov {{.*}} PCs written +CHECK: SanitizerCoverage: {{.*}}LLVMFuzzer-NullDerefTest.{{.*}}.sancov: {{.*}} PCs written SANCOV: LLVMFuzzerTestOneInput -DSO: SanitizerCoverage: {{.*}}LLVMFuzzer-DSOTest.{{.*}}.sancov {{.*}} PCs written -DSO-DAG: SanitizerCoverage: {{.*}}LLVMFuzzer-DSO1.{{.*}}.sancov {{.*}} PCs written -DSO-DAG: SanitizerCoverage: {{.*}}LLVMFuzzer-DSO2.{{.*}}.sancov {{.*}} PCs written +DSO: SanitizerCoverage: {{.*}}LLVMFuzzer-DSOTest.{{.*}}.sancov: {{.*}} PCs written +DSO-DAG: SanitizerCoverage: {{.*}}LLVMFuzzer-DSO1.{{.*}}.sancov: {{.*}} PCs written +DSO-DAG: SanitizerCoverage: {{.*}}LLVMFuzzer-DSO2.{{.*}}.sancov: {{.*}} PCs written NOCOV-NOT: SanitizerCoverage: {{.*}} PCs written diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp index 19b7c3027232..a76c944f0005 100644 --- a/lib/IR/Attributes.cpp +++ b/lib/IR/Attributes.cpp @@ -1006,6 +1006,10 @@ AttributeList AttributeList::get(LLVMContext &C, for (AttributeList List : Attrs) MaxSize = std::max(MaxSize, List.getNumAttrSets()); + // If every list was empty, there is no point in merging the lists. 
+ if (MaxSize == 0) + return AttributeList(); + SmallVector<AttributeSet, 8> NewAttrSets(MaxSize); for (unsigned I = 0; I < MaxSize; ++I) { AttrBuilder CurBuilder; @@ -1033,24 +1037,11 @@ AttributeList AttributeList::addAttribute(LLVMContext &C, unsigned Index, return addAttributes(C, Index, B); } -AttributeList AttributeList::addAttribute(LLVMContext &C, - ArrayRef<unsigned> Indices, +AttributeList AttributeList::addAttribute(LLVMContext &C, unsigned Index, Attribute A) const { - assert(std::is_sorted(Indices.begin(), Indices.end())); - - SmallVector<AttributeSet, 4> AttrSets(this->begin(), this->end()); - unsigned MaxIndex = attrIdxToArrayIdx(Indices.back()); - if (MaxIndex >= AttrSets.size()) - AttrSets.resize(MaxIndex + 1); - - for (unsigned Index : Indices) { - Index = attrIdxToArrayIdx(Index); - AttrBuilder B(AttrSets[Index]); - B.addAttribute(A); - AttrSets[Index] = AttributeSet::get(C, B); - } - - return getImpl(C, AttrSets); + AttrBuilder B; + B.addAttribute(A); + return addAttributes(C, Index, B); } AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index, @@ -1082,6 +1073,26 @@ AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index, return getImpl(C, AttrSets); } +AttributeList AttributeList::addParamAttribute(LLVMContext &C, + ArrayRef<unsigned> ArgNos, + Attribute A) const { + assert(std::is_sorted(ArgNos.begin(), ArgNos.end())); + + SmallVector<AttributeSet, 4> AttrSets(this->begin(), this->end()); + unsigned MaxIndex = attrIdxToArrayIdx(ArgNos.back() + FirstArgIndex); + if (MaxIndex >= AttrSets.size()) + AttrSets.resize(MaxIndex + 1); + + for (unsigned ArgNo : ArgNos) { + unsigned Index = attrIdxToArrayIdx(ArgNo + FirstArgIndex); + AttrBuilder B(AttrSets[Index]); + B.addAttribute(A); + AttrSets[Index] = AttributeSet::get(C, B); + } + + return getImpl(C, AttrSets); +} + AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index, Attribute::AttrKind Kind) const { if (!hasAttribute(Index, Kind)) return *this; diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp index 39de4b0a97fa..fc68c0e3cad9 100644 --- a/lib/IR/Function.cpp +++ b/lib/IR/Function.cpp @@ -118,15 +118,13 @@ unsigned Argument::getParamAlignment() const { uint64_t Argument::getDereferenceableBytes() const { assert(getType()->isPointerTy() && "Only pointers have dereferenceable bytes"); - return getParent()->getDereferenceableBytes(getArgNo() + - AttributeList::FirstArgIndex); + return getParent()->getParamDereferenceableBytes(getArgNo()); } uint64_t Argument::getDereferenceableOrNullBytes() const { assert(getType()->isPointerTy() && "Only pointers have dereferenceable bytes"); - return getParent()->getDereferenceableOrNullBytes( - getArgNo() + AttributeList::FirstArgIndex); + return getParent()->getParamDereferenceableOrNullBytes(getArgNo()); } bool Argument::hasNestAttr() const { @@ -169,21 +167,20 @@ bool Argument::onlyReadsMemory() const { void Argument::addAttrs(AttrBuilder &B) { AttributeList AL = getParent()->getAttributes(); - AL = AL.addAttributes(Parent->getContext(), - getArgNo() + AttributeList::FirstArgIndex, B); + AL = AL.addParamAttributes(Parent->getContext(), getArgNo(), B); getParent()->setAttributes(AL); } void Argument::addAttr(Attribute::AttrKind Kind) { - getParent()->addAttribute(getArgNo() + AttributeList::FirstArgIndex, Kind); + getParent()->addParamAttr(getArgNo(), Kind); } void Argument::addAttr(Attribute Attr) { - getParent()->addAttribute(getArgNo() + AttributeList::FirstArgIndex, Attr); + getParent()->addParamAttr(getArgNo(), 
Attr); } void Argument::removeAttr(Attribute::AttrKind Kind) { - getParent()->removeAttribute(getArgNo() + AttributeList::FirstArgIndex, Kind); + getParent()->removeParamAttr(getArgNo(), Kind); } bool Argument::hasAttribute(Attribute::AttrKind Kind) const { @@ -365,6 +362,24 @@ void Function::addAttributes(unsigned i, const AttrBuilder &Attrs) { setAttributes(PAL); } +void Function::addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { + AttributeList PAL = getAttributes(); + PAL = PAL.addParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); +} + +void Function::addParamAttr(unsigned ArgNo, Attribute Attr) { + AttributeList PAL = getAttributes(); + PAL = PAL.addParamAttribute(getContext(), ArgNo, Attr); + setAttributes(PAL); +} + +void Function::addParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs) { + AttributeList PAL = getAttributes(); + PAL = PAL.addParamAttributes(getContext(), ArgNo, Attrs); + setAttributes(PAL); +} + void Function::removeAttribute(unsigned i, Attribute::AttrKind Kind) { AttributeList PAL = getAttributes(); PAL = PAL.removeAttribute(getContext(), i, Kind); @@ -383,18 +398,49 @@ void Function::removeAttributes(unsigned i, const AttrBuilder &Attrs) { setAttributes(PAL); } +void Function::removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { + AttributeList PAL = getAttributes(); + PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); +} + +void Function::removeParamAttr(unsigned ArgNo, StringRef Kind) { + AttributeList PAL = getAttributes(); + PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); +} + +void Function::removeParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs) { + AttributeList PAL = getAttributes(); + PAL = PAL.removeParamAttributes(getContext(), ArgNo, Attrs); + setAttributes(PAL); +} + void Function::addDereferenceableAttr(unsigned i, uint64_t Bytes) { AttributeList PAL = getAttributes(); PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes); setAttributes(PAL); } +void Function::addDereferenceableParamAttr(unsigned ArgNo, uint64_t Bytes) { + AttributeList PAL = getAttributes(); + PAL = PAL.addDereferenceableParamAttr(getContext(), ArgNo, Bytes); + setAttributes(PAL); +} + void Function::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) { AttributeList PAL = getAttributes(); PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes); setAttributes(PAL); } +void Function::addDereferenceableOrNullParamAttr(unsigned ArgNo, + uint64_t Bytes) { + AttributeList PAL = getAttributes(); + PAL = PAL.addDereferenceableOrNullParamAttr(getContext(), ArgNo, Bytes); + setAttributes(PAL); +} + const std::string &Function::getGC() const { assert(hasGC() && "Function has no collector"); return getContext().getGC(*this); diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp index d7baa9ebc223..46c27331ff95 100644 --- a/lib/IR/Instructions.cpp +++ b/lib/IR/Instructions.cpp @@ -393,7 +393,17 @@ void CallInst::addAttribute(unsigned i, Attribute Attr) { } void CallInst::addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { - addAttribute(ArgNo + AttributeList::FirstArgIndex, Kind); + assert(ArgNo < getNumArgOperands() && "Out of bounds"); + AttributeList PAL = getAttributes(); + PAL = PAL.addParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); +} + +void CallInst::addParamAttr(unsigned ArgNo, Attribute Attr) { + assert(ArgNo < getNumArgOperands() && "Out of bounds"); + AttributeList PAL = getAttributes(); + PAL = PAL.addParamAttribute(getContext(), ArgNo, Attr); 
+ setAttributes(PAL); } void CallInst::removeAttribute(unsigned i, Attribute::AttrKind Kind) { @@ -409,7 +419,17 @@ void CallInst::removeAttribute(unsigned i, StringRef Kind) { } void CallInst::removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { - removeAttribute(ArgNo + AttributeList::FirstArgIndex, Kind); + assert(ArgNo < getNumArgOperands() && "Out of bounds"); + AttributeList PAL = getAttributes(); + PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); +} + +void CallInst::removeParamAttr(unsigned ArgNo, StringRef Kind) { + assert(ArgNo < getNumArgOperands() && "Out of bounds"); + AttributeList PAL = getAttributes(); + PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); } void CallInst::addDereferenceableAttr(unsigned i, uint64_t Bytes) { @@ -808,7 +828,9 @@ void InvokeInst::addAttribute(unsigned i, Attribute Attr) { } void InvokeInst::addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { - addAttribute(ArgNo + AttributeList::FirstArgIndex, Kind); + AttributeList PAL = getAttributes(); + PAL = PAL.addParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); } void InvokeInst::removeAttribute(unsigned i, Attribute::AttrKind Kind) { @@ -824,7 +846,9 @@ void InvokeInst::removeAttribute(unsigned i, StringRef Kind) { } void InvokeInst::removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { - removeAttribute(ArgNo + AttributeList::FirstArgIndex, Kind); + AttributeList PAL = getAttributes(); + PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); } void InvokeInst::addDereferenceableAttr(unsigned i, uint64_t Bytes) { diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp index 668667a53562..f9c41f5c9744 100644 --- a/lib/LTO/LTOBackend.cpp +++ b/lib/LTO/LTOBackend.cpp @@ -136,7 +136,8 @@ createTargetMachine(Config &Conf, const Target *TheTarget, Module &M) { Conf.CodeModel, Conf.CGOptLevel)); } -static void runNewPMPasses(Module &Mod, TargetMachine *TM, unsigned OptLevel) { +static void runNewPMPasses(Module &Mod, TargetMachine *TM, unsigned OptLevel, + bool IsThinLTO) { PassBuilder PB(TM); AAManager AA; @@ -180,7 +181,10 @@ static void runNewPMPasses(Module &Mod, TargetMachine *TM, unsigned OptLevel) { break; } - MPM = PB.buildLTODefaultPipeline(OL, false /* DebugLogging */); + if (IsThinLTO) + MPM = PB.buildThinLTODefaultPipeline(OL, false /* DebugLogging */); + else + MPM = PB.buildLTODefaultPipeline(OL, false /* DebugLogging */); MPM.run(Mod, MAM); // FIXME (davide): verify the output. @@ -258,17 +262,12 @@ static void runOldPMPasses(Config &Conf, Module &Mod, TargetMachine *TM, bool opt(Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod, bool IsThinLTO, ModuleSummaryIndex *ExportSummary, const ModuleSummaryIndex *ImportSummary) { - // There's still no ThinLTO pipeline hooked up in the new pass manager, - // once there is one, we can just remove this. - if (LTOUseNewPM && IsThinLTO) - report_fatal_error("ThinLTO not supported with the new PM yet!"); - // FIXME: Plumb the combined index into the new pass manager. 
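With the fatal error removed below, ThinLTO can now run through the new pass manager via buildThinLTODefaultPipeline. A minimal driver sketch of the pipeline selection that runNewPMPasses performs (the standalone function and fixed O2 level are illustrative assumptions):

  #include "llvm/IR/Module.h"
  #include "llvm/Passes/PassBuilder.h"
  #include "llvm/Target/TargetMachine.h"
  using namespace llvm;

  static void runDefaultLTOPipeline(Module &Mod, TargetMachine *TM,
                                    bool IsThinLTO) {
    PassBuilder PB(TM);
    LoopAnalysisManager LAM;
    FunctionAnalysisManager FAM;
    CGSCCAnalysisManager CGAM;
    ModuleAnalysisManager MAM;
    // Register all analyses and wire the managers together.
    PB.registerModuleAnalyses(MAM);
    PB.registerCGSCCAnalyses(CGAM);
    PB.registerFunctionAnalyses(FAM);
    PB.registerLoopAnalyses(LAM);
    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

    // Pick the ThinLTO or full-LTO default pipeline, as the code above does.
    ModulePassManager MPM =
        IsThinLTO
            ? PB.buildThinLTODefaultPipeline(PassBuilder::O2,
                                             /*DebugLogging=*/false)
            : PB.buildLTODefaultPipeline(PassBuilder::O2,
                                         /*DebugLogging=*/false);
    MPM.run(Mod, MAM);
  }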
if (!Conf.OptPipeline.empty()) runNewPMCustomPasses(Mod, TM, Conf.OptPipeline, Conf.AAPipeline, Conf.DisableVerify); else if (LTOUseNewPM) - runNewPMPasses(Mod, TM, Conf.OptLevel); + runNewPMPasses(Mod, TM, Conf.OptLevel, IsThinLTO); else runOldPMPasses(Conf, Mod, TM, IsThinLTO, ExportSummary, ImportSummary); return !Conf.PostOptModuleHook || Conf.PostOptModuleHook(Task, Mod); diff --git a/lib/MC/MCCodeView.cpp b/lib/MC/MCCodeView.cpp index a0a0ef312276..6c9a4f9f982d 100644 --- a/lib/MC/MCCodeView.cpp +++ b/lib/MC/MCCodeView.cpp @@ -12,11 +12,11 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCCodeView.h" -#include "llvm/MC/MCAsmLayout.h" #include "llvm/ADT/STLExtras.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/Line.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" +#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCValue.h" diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index bfb8875f47d4..084159a61f55 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -3269,7 +3269,6 @@ void MachOBindEntry::moveNext() { if (ImmValue) { SignExtended = MachO::BIND_OPCODE_MASK | ImmValue; Ordinal = SignExtended; - LibraryOrdinalSet = true; if (Ordinal < MachO::BIND_SPECIAL_DYLIB_FLAT_LOOKUP) { *E = malformedError("for BIND_OPCODE_SET_DYLIB_SPECIAL_IMM unknown " "special ordinal: " + Twine((int)Ordinal) + " for opcode at: " @@ -3279,6 +3278,7 @@ void MachOBindEntry::moveNext() { } } else Ordinal = 0; + LibraryOrdinalSet = true; DEBUG_WITH_TYPE( "mach-o-bind", dbgs() << "BIND_OPCODE_SET_DYLIB_SPECIAL_IMM: " diff --git a/lib/Object/WindowsResource.cpp b/lib/Object/WindowsResource.cpp index b52563469094..e46d38e466a0 100644 --- a/lib/Object/WindowsResource.cpp +++ b/lib/Object/WindowsResource.cpp @@ -12,20 +12,23 @@ //===----------------------------------------------------------------------===// #include "llvm/Object/WindowsResource.h" -#include "llvm/Object/Error.h" +#include "llvm/Support/COFF.h" +#include <sstream> #include <system_error> namespace llvm { namespace object { -static const size_t ResourceMagicSize = 16; - -static const size_t NullEntrySize = 16; - #define RETURN_IF_ERROR(X) \ if (auto EC = X) \ return EC; +const uint32_t MIN_HEADER_SIZE = 7 * sizeof(uint32_t) + 2 * sizeof(uint16_t); + +static const size_t ResourceMagicSize = 16; + +static const size_t NullEntrySize = 16; + WindowsResource::WindowsResource(MemoryBufferRef Source) : Binary(Binary::ID_WinRes, Source) { size_t LeadingSize = ResourceMagicSize + NullEntrySize; @@ -33,8 +36,6 @@ WindowsResource::WindowsResource(MemoryBufferRef Source) support::little); } -WindowsResource::~WindowsResource() = default; - Expected<std::unique_ptr<WindowsResource>> WindowsResource::createWindowsResource(MemoryBufferRef Source) { if (Source.getBufferSize() < ResourceMagicSize + NullEntrySize) @@ -72,19 +73,150 @@ Error ResourceEntryRef::moveNext(bool &End) { return Error::success(); } +static Error readStringOrId(BinaryStreamReader &Reader, uint16_t &ID, + ArrayRef<UTF16> &Str, bool &IsString) { + uint16_t IDFlag; + RETURN_IF_ERROR(Reader.readInteger(IDFlag)); + IsString = IDFlag != 0xffff; + + if (IsString) { + Reader.setOffset( + Reader.getOffset() - + sizeof(uint16_t)); // Re-read the bytes which we used to check the flag. 
+ RETURN_IF_ERROR(Reader.readWideString(Str)); + } else + RETURN_IF_ERROR(Reader.readInteger(ID)); + + return Error::success(); +} + Error ResourceEntryRef::loadNext() { uint32_t DataSize; RETURN_IF_ERROR(Reader.readInteger(DataSize)); uint32_t HeaderSize; RETURN_IF_ERROR(Reader.readInteger(HeaderSize)); - // The data and header size ints are themselves part of the header, so we must - // subtract them from the size. - RETURN_IF_ERROR( - Reader.readStreamRef(HeaderBytes, HeaderSize - 2 * sizeof(uint32_t))); - RETURN_IF_ERROR(Reader.readStreamRef(DataBytes, DataSize)); + + if (HeaderSize < MIN_HEADER_SIZE) + return make_error<GenericBinaryError>("Header size is too small.", + object_error::parse_failed); + + RETURN_IF_ERROR(readStringOrId(Reader, TypeID, Type, IsStringType)); + + RETURN_IF_ERROR(readStringOrId(Reader, NameID, Name, IsStringName)); + RETURN_IF_ERROR(Reader.padToAlignment(sizeof(uint32_t))); + + RETURN_IF_ERROR(Reader.readObject(Suffix)); + + RETURN_IF_ERROR(Reader.readArray(Data, DataSize)); + + RETURN_IF_ERROR(Reader.padToAlignment(sizeof(uint32_t))); + + return Error::success(); +} + +WindowsResourceParser::WindowsResourceParser() {} + +Error WindowsResourceParser::parse(WindowsResource *WR) { + auto EntryOrErr = WR->getHeadEntry(); + if (!EntryOrErr) + return EntryOrErr.takeError(); + + ResourceEntryRef Entry = EntryOrErr.get(); + bool End = false; + + while (!End) { + + Root.addEntry(Entry); + + RETURN_IF_ERROR(Entry.moveNext(End)); + } + return Error::success(); } +void WindowsResourceParser::printTree() const { + ScopedPrinter Writer(outs()); + Root.print(Writer, "Resource Tree"); +} + +void WindowsResourceParser::TreeNode::addEntry(const ResourceEntryRef &Entry) { + TreeNode &TypeNode = addTypeNode(Entry); + TreeNode &NameNode = TypeNode.addNameNode(Entry); + NameNode.addLanguageNode(Entry); +} + +WindowsResourceParser::TreeNode::TreeNode(ArrayRef<UTF16> NameRef) + : Name(NameRef) {} + +WindowsResourceParser::TreeNode & +WindowsResourceParser::TreeNode::addTypeNode(const ResourceEntryRef &Entry) { + if (Entry.checkTypeString()) + return addChild(Entry.getTypeString()); + else + return addChild(Entry.getTypeID()); +} + +WindowsResourceParser::TreeNode & +WindowsResourceParser::TreeNode::addNameNode(const ResourceEntryRef &Entry) { + if (Entry.checkNameString()) + return addChild(Entry.getNameString()); + else + return addChild(Entry.getNameID()); +} + +WindowsResourceParser::TreeNode & +WindowsResourceParser::TreeNode::addLanguageNode( + const ResourceEntryRef &Entry) { + return addChild(Entry.getLanguage()); +} + +WindowsResourceParser::TreeNode & +WindowsResourceParser::TreeNode::addChild(uint32_t ID) { + auto Child = IDChildren.find(ID); + if (Child == IDChildren.end()) { + auto NewChild = llvm::make_unique<WindowsResourceParser::TreeNode>(ID); + WindowsResourceParser::TreeNode &Node = *NewChild; + IDChildren.emplace(ID, std::move(NewChild)); + return Node; + } else + return *(Child->second); +} + +WindowsResourceParser::TreeNode & +WindowsResourceParser::TreeNode::addChild(ArrayRef<UTF16> NameRef) { + std::string NameString; + ArrayRef<UTF16> CorrectedName; + std::vector<UTF16> EndianCorrectedName; + if (llvm::sys::IsBigEndianHost) { + EndianCorrectedName.resize(NameRef.size() + 1); + std::copy(NameRef.begin(), NameRef.end(), EndianCorrectedName.begin() + 1); + EndianCorrectedName[0] = UNI_UTF16_BYTE_ORDER_MARK_SWAPPED; + CorrectedName = makeArrayRef(EndianCorrectedName); + } else + CorrectedName = NameRef; + llvm::convertUTF16ToUTF8String(CorrectedName, 
NameString); + + auto Child = StringChildren.find(NameString); + if (Child == StringChildren.end()) { + auto NewChild = llvm::make_unique<WindowsResourceParser::TreeNode>(NameRef); + WindowsResourceParser::TreeNode &Node = *NewChild; + StringChildren.emplace(NameString, std::move(NewChild)); + return Node; + } else + return *(Child->second); +} + +void WindowsResourceParser::TreeNode::print(ScopedPrinter &Writer, + StringRef Name) const { + ListScope NodeScope(Writer, Name); + for (auto const &Child : StringChildren) { + Child.second->print(Writer, Child.first); + } + for (auto const &Child : IDChildren) { + Child.second->print(Writer, to_string(Child.first)); + } +} + } // namespace object } // namespace llvm diff --git a/lib/ObjectYAML/CMakeLists.txt b/lib/ObjectYAML/CMakeLists.txt index 37f8fd7bce1a..7af0b9c194e6 100644 --- a/lib/ObjectYAML/CMakeLists.txt +++ b/lib/ObjectYAML/CMakeLists.txt @@ -1,4 +1,7 @@ add_llvm_library(LLVMObjectYAML + CodeViewYAMLTypes.cpp + CodeViewYAMLSymbols.cpp + CodeViewYAMLDebugSections.cpp COFFYAML.cpp DWARFEmitter.cpp DWARFVisitor.cpp diff --git a/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp b/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp new file mode 100644 index 000000000000..f652ff57f30d --- /dev/null +++ b/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp @@ -0,0 +1,127 @@ +//===- CodeViewYAMLDebugSections.cpp - CodeView YAMLIO debug sections -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines classes for handling the YAML representation of CodeView +// Debug Info. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ObjectYAML/CodeViewYAMLDebugSections.h" + +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/DebugInfo/CodeView/CodeViewError.h" +#include "llvm/DebugInfo/CodeView/EnumTables.h" +#include "llvm/DebugInfo/CodeView/SymbolRecord.h" + +using namespace llvm; +using namespace llvm::codeview; +using namespace llvm::CodeViewYAML; +using namespace llvm::CodeViewYAML::detail; +using namespace llvm::yaml; + +LLVM_YAML_IS_SEQUENCE_VECTOR(SourceFileChecksumEntry) +LLVM_YAML_IS_SEQUENCE_VECTOR(SourceLineEntry) +LLVM_YAML_IS_SEQUENCE_VECTOR(SourceColumnEntry) +LLVM_YAML_IS_SEQUENCE_VECTOR(SourceLineBlock) +LLVM_YAML_IS_SEQUENCE_VECTOR(SourceLineInfo) +LLVM_YAML_IS_SEQUENCE_VECTOR(InlineeSite) +LLVM_YAML_IS_SEQUENCE_VECTOR(InlineeInfo) +LLVM_YAML_IS_SEQUENCE_VECTOR(StringRef) + +LLVM_YAML_DECLARE_SCALAR_TRAITS(HexFormattedString, false) +LLVM_YAML_DECLARE_ENUM_TRAITS(FileChecksumKind) +LLVM_YAML_DECLARE_BITSET_TRAITS(LineFlags) + +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceLineEntry) +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceColumnEntry) +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceFileChecksumEntry) +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceLineInfo) +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceLineBlock) +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::InlineeInfo) +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::InlineeSite) + +void ScalarBitSetTraits<LineFlags>::bitset(IO &io, LineFlags &Flags) { + io.bitSetCase(Flags, "HasColumnInfo", LF_HaveColumns); + io.enumFallback<Hex16>(Flags); +} + +void ScalarEnumerationTraits<FileChecksumKind>::enumeration( + IO &io, FileChecksumKind &Kind) { + 
io.enumCase(Kind, "None", FileChecksumKind::None); + io.enumCase(Kind, "MD5", FileChecksumKind::MD5); + io.enumCase(Kind, "SHA1", FileChecksumKind::SHA1); + io.enumCase(Kind, "SHA256", FileChecksumKind::SHA256); +} + +void ScalarTraits<HexFormattedString>::output(const HexFormattedString &Value, + void *ctx, raw_ostream &Out) { + StringRef Bytes(reinterpret_cast<const char *>(Value.Bytes.data()), + Value.Bytes.size()); + Out << toHex(Bytes); +} + +StringRef ScalarTraits<HexFormattedString>::input(StringRef Scalar, void *ctxt, + HexFormattedString &Value) { + std::string H = fromHex(Scalar); + Value.Bytes.assign(H.begin(), H.end()); + return StringRef(); +} + +void MappingTraits<SourceLineEntry>::mapping(IO &IO, SourceLineEntry &Obj) { + IO.mapRequired("Offset", Obj.Offset); + IO.mapRequired("LineStart", Obj.LineStart); + IO.mapRequired("IsStatement", Obj.IsStatement); + IO.mapRequired("EndDelta", Obj.EndDelta); +} + +void MappingTraits<SourceColumnEntry>::mapping(IO &IO, SourceColumnEntry &Obj) { + IO.mapRequired("StartColumn", Obj.StartColumn); + IO.mapRequired("EndColumn", Obj.EndColumn); +} + +void MappingTraits<SourceLineBlock>::mapping(IO &IO, SourceLineBlock &Obj) { + IO.mapRequired("FileName", Obj.FileName); + IO.mapRequired("Lines", Obj.Lines); + IO.mapRequired("Columns", Obj.Columns); +} + +void MappingTraits<SourceFileChecksumEntry>::mapping( + IO &IO, SourceFileChecksumEntry &Obj) { + IO.mapRequired("FileName", Obj.FileName); + IO.mapRequired("Kind", Obj.Kind); + IO.mapRequired("Checksum", Obj.ChecksumBytes); +} + +void MappingTraits<SourceLineInfo>::mapping(IO &IO, SourceLineInfo &Obj) { + IO.mapRequired("CodeSize", Obj.CodeSize); + + IO.mapRequired("Flags", Obj.Flags); + IO.mapRequired("RelocOffset", Obj.RelocOffset); + IO.mapRequired("RelocSegment", Obj.RelocSegment); + IO.mapRequired("Blocks", Obj.Blocks); +} + +void MappingTraits<SourceFileInfo>::mapping(IO &IO, SourceFileInfo &Obj) { + IO.mapOptional("Checksums", Obj.FileChecksums); + IO.mapOptional("Lines", Obj.LineFragments); + IO.mapOptional("InlineeLines", Obj.Inlinees); +} + +void MappingTraits<InlineeSite>::mapping(IO &IO, InlineeSite &Obj) { + IO.mapRequired("FileName", Obj.FileName); + IO.mapRequired("LineNum", Obj.SourceLineNum); + IO.mapRequired("Inlinee", Obj.Inlinee); + IO.mapOptional("ExtraFiles", Obj.ExtraFiles); +} + +void MappingTraits<InlineeInfo>::mapping(IO &IO, InlineeInfo &Obj) { + IO.mapRequired("HasExtraFiles", Obj.HasExtraFiles); + IO.mapRequired("Sites", Obj.Sites); +} diff --git a/lib/ObjectYAML/CodeViewYAMLSymbols.cpp b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp new file mode 100644 index 000000000000..6e8bb5c7372c --- /dev/null +++ b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp @@ -0,0 +1,496 @@ +//===- CodeViewYAMLSymbols.cpp - CodeView YAMLIO Symbol implementation ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines classes for handling the YAML representation of CodeView +// Debug Info. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ObjectYAML/CodeViewYAMLSymbols.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/DebugInfo/CodeView/CodeViewError.h" +#include "llvm/DebugInfo/CodeView/EnumTables.h" +#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" +#include "llvm/DebugInfo/CodeView/SymbolSerializer.h" + +using namespace llvm; +using namespace llvm::codeview; +using namespace llvm::CodeViewYAML; +using namespace llvm::CodeViewYAML::detail; +using namespace llvm::yaml; + +LLVM_YAML_IS_SEQUENCE_VECTOR(StringRef) +LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(TypeIndex) + +// We only need to declare these, the definitions are in CodeViewYAMLTypes.cpp +LLVM_YAML_DECLARE_SCALAR_TRAITS(APSInt, false) +LLVM_YAML_DECLARE_SCALAR_TRAITS(TypeIndex, false) + +LLVM_YAML_DECLARE_ENUM_TRAITS(SymbolKind) + +LLVM_YAML_DECLARE_BITSET_TRAITS(CompileSym2Flags) +LLVM_YAML_DECLARE_BITSET_TRAITS(CompileSym3Flags) +LLVM_YAML_DECLARE_BITSET_TRAITS(ExportFlags) +LLVM_YAML_DECLARE_BITSET_TRAITS(LocalSymFlags) +LLVM_YAML_DECLARE_BITSET_TRAITS(ProcSymFlags) +LLVM_YAML_DECLARE_BITSET_TRAITS(FrameProcedureOptions) +LLVM_YAML_DECLARE_ENUM_TRAITS(CPUType) +LLVM_YAML_DECLARE_ENUM_TRAITS(RegisterId) +LLVM_YAML_DECLARE_ENUM_TRAITS(TrampolineType) +LLVM_YAML_DECLARE_ENUM_TRAITS(ThunkOrdinal) + +void ScalarEnumerationTraits<SymbolKind>::enumeration(IO &io, + SymbolKind &Value) { + auto SymbolNames = getSymbolTypeNames(); + for (const auto &E : SymbolNames) + io.enumCase(Value, E.Name.str().c_str(), E.Value); +} + +void ScalarBitSetTraits<CompileSym2Flags>::bitset(IO &io, + CompileSym2Flags &Flags) { + auto FlagNames = getCompileSym2FlagNames(); + for (const auto &E : FlagNames) { + io.bitSetCase(Flags, E.Name.str().c_str(), + static_cast<CompileSym2Flags>(E.Value)); + } +} + +void ScalarBitSetTraits<CompileSym3Flags>::bitset(IO &io, + CompileSym3Flags &Flags) { + auto FlagNames = getCompileSym3FlagNames(); + for (const auto &E : FlagNames) { + io.bitSetCase(Flags, E.Name.str().c_str(), + static_cast<CompileSym3Flags>(E.Value)); + } +} + +void ScalarBitSetTraits<ExportFlags>::bitset(IO &io, ExportFlags &Flags) { + auto FlagNames = getExportSymFlagNames(); + for (const auto &E : FlagNames) { + io.bitSetCase(Flags, E.Name.str().c_str(), + static_cast<ExportFlags>(E.Value)); + } +} + +void ScalarBitSetTraits<LocalSymFlags>::bitset(IO &io, LocalSymFlags &Flags) { + auto FlagNames = getLocalFlagNames(); + for (const auto &E : FlagNames) { + io.bitSetCase(Flags, E.Name.str().c_str(), + static_cast<LocalSymFlags>(E.Value)); + } +} + +void ScalarBitSetTraits<ProcSymFlags>::bitset(IO &io, ProcSymFlags &Flags) { + auto FlagNames = getProcSymFlagNames(); + for (const auto &E : FlagNames) { + io.bitSetCase(Flags, E.Name.str().c_str(), + static_cast<ProcSymFlags>(E.Value)); + } +} + +void ScalarBitSetTraits<FrameProcedureOptions>::bitset( + IO &io, FrameProcedureOptions &Flags) { + auto FlagNames = getFrameProcSymFlagNames(); + for (const auto &E : FlagNames) { + io.bitSetCase(Flags, E.Name.str().c_str(), + static_cast<FrameProcedureOptions>(E.Value)); + } +} + +void ScalarEnumerationTraits<CPUType>::enumeration(IO &io, CPUType &Cpu) { + auto CpuNames = getCPUTypeNames(); + for (const auto &E : CpuNames) { + io.enumCase(Cpu, E.Name.str().c_str(), static_cast<CPUType>(E.Value)); + } +} + +void ScalarEnumerationTraits<RegisterId>::enumeration(IO &io, RegisterId &Reg) { + auto RegNames = 
getRegisterNames(); + for (const auto &E : RegNames) { + io.enumCase(Reg, E.Name.str().c_str(), static_cast<RegisterId>(E.Value)); + } + io.enumFallback<Hex16>(Reg); +} + +void ScalarEnumerationTraits<TrampolineType>::enumeration( + IO &io, TrampolineType &Tramp) { + auto TrampNames = getTrampolineNames(); + for (const auto &E : TrampNames) { + io.enumCase(Tramp, E.Name.str().c_str(), + static_cast<TrampolineType>(E.Value)); + } +} + +void ScalarEnumerationTraits<ThunkOrdinal>::enumeration(IO &io, + ThunkOrdinal &Ord) { + auto ThunkNames = getThunkOrdinalNames(); + for (const auto &E : ThunkNames) { + io.enumCase(Ord, E.Name.str().c_str(), static_cast<ThunkOrdinal>(E.Value)); + } +} + +namespace llvm { +namespace CodeViewYAML { +namespace detail { + +struct SymbolRecordBase { + codeview::SymbolKind Kind; + explicit SymbolRecordBase(codeview::SymbolKind K) : Kind(K) {} + + virtual ~SymbolRecordBase() {} + virtual void map(yaml::IO &io) = 0; + virtual codeview::CVSymbol + toCodeViewSymbol(BumpPtrAllocator &Allocator) const = 0; + virtual Error fromCodeViewSymbol(codeview::CVSymbol Type) = 0; +}; + +template <typename T> struct SymbolRecordImpl : public SymbolRecordBase { + explicit SymbolRecordImpl(codeview::SymbolKind K) + : SymbolRecordBase(K), Symbol(static_cast<SymbolRecordKind>(K)) {} + + void map(yaml::IO &io) override; + + codeview::CVSymbol + toCodeViewSymbol(BumpPtrAllocator &Allocator) const override { + return SymbolSerializer::writeOneSymbol(Symbol, Allocator); + } + Error fromCodeViewSymbol(codeview::CVSymbol CVS) override { + return SymbolDeserializer::deserializeAs<T>(CVS, Symbol); + } + + mutable T Symbol; +}; + +template <> void SymbolRecordImpl<ScopeEndSym>::map(IO &IO) {} + +template <> void SymbolRecordImpl<Thunk32Sym>::map(IO &IO) { + IO.mapRequired("Parent", Symbol.Parent); + IO.mapRequired("End", Symbol.End); + IO.mapRequired("Next", Symbol.Next); + IO.mapRequired("Off", Symbol.Offset); + IO.mapRequired("Seg", Symbol.Segment); + IO.mapRequired("Len", Symbol.Length); + IO.mapRequired("Ordinal", Symbol.Thunk); +} + +template <> void SymbolRecordImpl<TrampolineSym>::map(IO &IO) { + IO.mapRequired("Type", Symbol.Type); + IO.mapRequired("Size", Symbol.Size); + IO.mapRequired("ThunkOff", Symbol.ThunkOffset); + IO.mapRequired("TargetOff", Symbol.TargetOffset); + IO.mapRequired("ThunkSection", Symbol.ThunkSection); + IO.mapRequired("TargetSection", Symbol.TargetSection); +} + +template <> void SymbolRecordImpl<SectionSym>::map(IO &IO) { + IO.mapRequired("SectionNumber", Symbol.SectionNumber); + IO.mapRequired("Alignment", Symbol.Alignment); + IO.mapRequired("Rva", Symbol.Rva); + IO.mapRequired("Length", Symbol.Length); + IO.mapRequired("Characteristics", Symbol.Characteristics); + IO.mapRequired("Name", Symbol.Name); +} + +template <> void SymbolRecordImpl<CoffGroupSym>::map(IO &IO) { + IO.mapRequired("Size", Symbol.Size); + IO.mapRequired("Characteristics", Symbol.Characteristics); + IO.mapRequired("Offset", Symbol.Offset); + IO.mapRequired("Segment", Symbol.Segment); + IO.mapRequired("Name", Symbol.Name); +} + +template <> void SymbolRecordImpl<ExportSym>::map(IO &IO) { + IO.mapRequired("Ordinal", Symbol.Ordinal); + IO.mapRequired("Flags", Symbol.Flags); + IO.mapRequired("Name", Symbol.Name); +} + +template <> void SymbolRecordImpl<ProcSym>::map(IO &IO) { + // TODO: Print the linkage name + + IO.mapRequired("PtrParent", Symbol.Parent); + IO.mapRequired("PtrEnd", Symbol.End); + IO.mapRequired("PtrNext", Symbol.Next); + IO.mapRequired("CodeSize", Symbol.CodeSize); + 
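SymbolRecordBase and SymbolRecordImpl above follow a classic type-erasure shape: a small per-family virtual interface, a class template that keeps the concrete record fully typed, and a single pointer-sized handle for the YAML layer to pass around. A reduced, self-contained sketch of that shape, with all names invented and std::ostream standing in for yaml::IO:

#include <iostream>
#include <memory>
#include <string>

struct RecordBase {                 // erases the concrete record type
  int Kind;
  explicit RecordBase(int K) : Kind(K) {}
  virtual ~RecordBase() = default;
  virtual void map(std::ostream &OS) = 0;
};

template <typename T> struct RecordImpl : RecordBase {
  explicit RecordImpl(int K) : RecordBase(K) {}
  void map(std::ostream &OS) override { Rec.map(OS); }
  T Rec;                            // still fully typed inside the impl
};

struct FuncRecord {                 // one concrete record kind
  std::string Name;
  void map(std::ostream &OS) { OS << "Name: " << Name << '\n'; }
};

int main() {
  auto Impl = std::make_shared<RecordImpl<FuncRecord>>(/*Kind=*/1);
  Impl->Rec.Name = "main";
  std::shared_ptr<RecordBase> Erased = Impl; // any kind behind one handle
  Erased->map(std::cout);                    // dispatches to FuncRecord::map
}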
IO.mapRequired("DbgStart", Symbol.DbgStart); + IO.mapRequired("DbgEnd", Symbol.DbgEnd); + IO.mapRequired("FunctionType", Symbol.FunctionType); + IO.mapRequired("Segment", Symbol.Segment); + IO.mapRequired("Flags", Symbol.Flags); + IO.mapRequired("DisplayName", Symbol.Name); +} + +template <> void SymbolRecordImpl<RegisterSym>::map(IO &IO) { + IO.mapRequired("Type", Symbol.Index); + IO.mapRequired("Seg", Symbol.Register); + IO.mapRequired("Name", Symbol.Name); +} + +template <> void SymbolRecordImpl<PublicSym32>::map(IO &IO) { + IO.mapRequired("Type", Symbol.Index); + IO.mapRequired("Seg", Symbol.Segment); + IO.mapRequired("Off", Symbol.Offset); + IO.mapRequired("Name", Symbol.Name); +} + +template <> void SymbolRecordImpl<ProcRefSym>::map(IO &IO) { + IO.mapRequired("SumName", Symbol.SumName); + IO.mapRequired("SymOffset", Symbol.SymOffset); + IO.mapRequired("Mod", Symbol.Module); + IO.mapRequired("Name", Symbol.Name); +} + +template <> void SymbolRecordImpl<EnvBlockSym>::map(IO &IO) { + IO.mapRequired("Entries", Symbol.Fields); +} + +template <> void SymbolRecordImpl<InlineSiteSym>::map(IO &IO) { + IO.mapRequired("PtrParent", Symbol.Parent); + IO.mapRequired("PtrEnd", Symbol.End); + IO.mapRequired("Inlinee", Symbol.Inlinee); + // TODO: The binary annotations +} + +template <> void SymbolRecordImpl<LocalSym>::map(IO &IO) { + IO.mapRequired("Type", Symbol.Type); + IO.mapRequired("Flags", Symbol.Flags); + IO.mapRequired("VarName", Symbol.Name); +} + +template <> void SymbolRecordImpl<DefRangeSym>::map(IO &IO) { + // TODO: Print the subfields +} + +template <> void SymbolRecordImpl<DefRangeSubfieldSym>::map(IO &IO) { + // TODO: Print the subfields +} + +template <> void SymbolRecordImpl<DefRangeRegisterSym>::map(IO &IO) { + // TODO: Print the subfields +} + +template <> void SymbolRecordImpl<DefRangeFramePointerRelSym>::map(IO &IO) { + // TODO: Print the subfields +} + +template <> void SymbolRecordImpl<DefRangeSubfieldRegisterSym>::map(IO &IO) { + // TODO: Print the subfields +} + +template <> +void SymbolRecordImpl<DefRangeFramePointerRelFullScopeSym>::map(IO &IO) { + // TODO: Print the subfields +} + +template <> void SymbolRecordImpl<DefRangeRegisterRelSym>::map(IO &IO) { + // TODO: Print the subfields +} + +template <> void SymbolRecordImpl<BlockSym>::map(IO &IO) { + // TODO: Print the linkage name + IO.mapRequired("PtrParent", Symbol.Parent); + IO.mapRequired("PtrEnd", Symbol.End); + IO.mapRequired("CodeSize", Symbol.CodeSize); + IO.mapRequired("Segment", Symbol.Segment); + IO.mapRequired("BlockName", Symbol.Name); +} + +template <> void SymbolRecordImpl<LabelSym>::map(IO &IO) { + // TODO: Print the linkage name + IO.mapRequired("Segment", Symbol.Segment); + IO.mapRequired("Flags", Symbol.Flags); + IO.mapRequired("Flags", Symbol.Flags); + IO.mapRequired("DisplayName", Symbol.Name); +} + +template <> void SymbolRecordImpl<ObjNameSym>::map(IO &IO) { + IO.mapRequired("Signature", Symbol.Signature); + IO.mapRequired("ObjectName", Symbol.Name); +} + +template <> void SymbolRecordImpl<Compile2Sym>::map(IO &IO) { + IO.mapRequired("Flags", Symbol.Flags); + IO.mapRequired("Machine", Symbol.Machine); + IO.mapRequired("FrontendMajor", Symbol.VersionFrontendMajor); + IO.mapRequired("FrontendMinor", Symbol.VersionFrontendMinor); + IO.mapRequired("FrontendBuild", Symbol.VersionFrontendBuild); + IO.mapRequired("BackendMajor", Symbol.VersionBackendMajor); + IO.mapRequired("BackendMinor", Symbol.VersionBackendMinor); + IO.mapRequired("BackendBuild", Symbol.VersionBackendBuild); + 
IO.mapRequired("Version", Symbol.Version); +} + +template <> void SymbolRecordImpl<Compile3Sym>::map(IO &IO) { + IO.mapRequired("Flags", Symbol.Flags); + IO.mapRequired("Machine", Symbol.Machine); + IO.mapRequired("FrontendMajor", Symbol.VersionFrontendMajor); + IO.mapRequired("FrontendMinor", Symbol.VersionFrontendMinor); + IO.mapRequired("FrontendBuild", Symbol.VersionFrontendBuild); + IO.mapRequired("FrontendQFE", Symbol.VersionFrontendQFE); + IO.mapRequired("BackendMajor", Symbol.VersionBackendMajor); + IO.mapRequired("BackendMinor", Symbol.VersionBackendMinor); + IO.mapRequired("BackendBuild", Symbol.VersionBackendBuild); + IO.mapRequired("BackendQFE", Symbol.VersionBackendQFE); + IO.mapRequired("Version", Symbol.Version); +} + +template <> void SymbolRecordImpl<FrameProcSym>::map(IO &IO) { + IO.mapRequired("TotalFrameBytes", Symbol.TotalFrameBytes); + IO.mapRequired("PaddingFrameBytes", Symbol.PaddingFrameBytes); + IO.mapRequired("OffsetToPadding", Symbol.OffsetToPadding); + IO.mapRequired("BytesOfCalleeSavedRegisters", + Symbol.BytesOfCalleeSavedRegisters); + IO.mapRequired("OffsetOfExceptionHandler", Symbol.OffsetOfExceptionHandler); + IO.mapRequired("SectionIdOfExceptionHandler", + Symbol.SectionIdOfExceptionHandler); + IO.mapRequired("Flags", Symbol.Flags); +} + +template <> void SymbolRecordImpl<CallSiteInfoSym>::map(IO &IO) { + // TODO: Map Linkage Name + IO.mapRequired("Segment", Symbol.Segment); + IO.mapRequired("Type", Symbol.Type); +} + +template <> void SymbolRecordImpl<FileStaticSym>::map(IO &IO) { + IO.mapRequired("Index", Symbol.Index); + IO.mapRequired("ModFilenameOffset", Symbol.ModFilenameOffset); + IO.mapRequired("Flags", Symbol.Flags); + IO.mapRequired("Name", Symbol.Name); +} + +template <> void SymbolRecordImpl<HeapAllocationSiteSym>::map(IO &IO) { + // TODO: Map Linkage Name + IO.mapRequired("Segment", Symbol.Segment); + IO.mapRequired("CallInstructionSize", Symbol.CallInstructionSize); + IO.mapRequired("Type", Symbol.Type); +} + +template <> void SymbolRecordImpl<FrameCookieSym>::map(IO &IO) { + // TODO: Map Linkage Name + IO.mapRequired("Register", Symbol.Register); + IO.mapRequired("CookieKind", Symbol.CookieKind); + IO.mapRequired("Flags", Symbol.Flags); +} + +template <> void SymbolRecordImpl<CallerSym>::map(IO &IO) { + IO.mapRequired("FuncID", Symbol.Indices); +} + +template <> void SymbolRecordImpl<UDTSym>::map(IO &IO) { + IO.mapRequired("Type", Symbol.Type); + IO.mapRequired("UDTName", Symbol.Name); +} + +template <> void SymbolRecordImpl<BuildInfoSym>::map(IO &IO) { + IO.mapRequired("BuildId", Symbol.BuildId); +} + +template <> void SymbolRecordImpl<BPRelativeSym>::map(IO &IO) { + IO.mapRequired("Offset", Symbol.Offset); + IO.mapRequired("Type", Symbol.Type); + IO.mapRequired("VarName", Symbol.Name); +} + +template <> void SymbolRecordImpl<RegRelativeSym>::map(IO &IO) { + IO.mapRequired("Offset", Symbol.Offset); + IO.mapRequired("Type", Symbol.Type); + IO.mapRequired("Register", Symbol.Register); + IO.mapRequired("VarName", Symbol.Name); +} + +template <> void SymbolRecordImpl<ConstantSym>::map(IO &IO) { + IO.mapRequired("Type", Symbol.Type); + IO.mapRequired("Value", Symbol.Value); + IO.mapRequired("Name", Symbol.Name); +} + +template <> void SymbolRecordImpl<DataSym>::map(IO &IO) { + // TODO: Map linkage name + IO.mapRequired("Type", Symbol.Type); + IO.mapRequired("DisplayName", Symbol.Name); +} + +template <> void SymbolRecordImpl<ThreadLocalDataSym>::map(IO &IO) { + // TODO: Map linkage name + IO.mapRequired("Type", Symbol.Type); + 
IO.mapRequired("DisplayName", Symbol.Name); +} +} +} +} + +CVSymbol CodeViewYAML::SymbolRecord::toCodeViewSymbol( + BumpPtrAllocator &Allocator) const { + return Symbol->toCodeViewSymbol(Allocator); +} + +namespace llvm { +namespace yaml { +template <> struct MappingTraits<SymbolRecordBase> { + static void mapping(IO &io, SymbolRecordBase &Record) { Record.map(io); } +}; +} +} + +template <typename SymbolType> +static inline Expected<CodeViewYAML::SymbolRecord> +fromCodeViewSymbolImpl(CVSymbol Symbol) { + CodeViewYAML::SymbolRecord Result; + + auto Impl = std::make_shared<SymbolRecordImpl<SymbolType>>(Symbol.kind()); + if (auto EC = Impl->fromCodeViewSymbol(Symbol)) + return std::move(EC); + Result.Symbol = Impl; + return Result; +} + +Expected<CodeViewYAML::SymbolRecord> +CodeViewYAML::SymbolRecord::fromCodeViewSymbol(CVSymbol Symbol) { +#define SYMBOL_RECORD(EnumName, EnumVal, ClassName) \ + case EnumName: \ + return fromCodeViewSymbolImpl<ClassName>(Symbol); +#define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName) \ + SYMBOL_RECORD(EnumName, EnumVal, ClassName) + switch (Symbol.kind()) { +#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def" + default: { llvm_unreachable("Unknown symbol kind!"); } + } + return make_error<CodeViewError>(cv_error_code::corrupt_record); +} + +template <typename ConcreteType> +static void mapSymbolRecordImpl(IO &IO, const char *Class, SymbolKind Kind, + CodeViewYAML::SymbolRecord &Obj) { + if (!IO.outputting()) + Obj.Symbol = std::make_shared<SymbolRecordImpl<ConcreteType>>(Kind); + + IO.mapRequired(Class, *Obj.Symbol); +} + +void MappingTraits<CodeViewYAML::SymbolRecord>::mapping( + IO &IO, CodeViewYAML::SymbolRecord &Obj) { + SymbolKind Kind; + if (IO.outputting()) + Kind = Obj.Symbol->Kind; + IO.mapRequired("Kind", Kind); + +#define SYMBOL_RECORD(EnumName, EnumVal, ClassName) \ + case EnumName: \ + mapSymbolRecordImpl<ClassName>(IO, #ClassName, Kind, Obj); \ + break; +#define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName) \ + SYMBOL_RECORD(EnumName, EnumVal, ClassName) + switch (Kind) { +#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def" + default: { llvm_unreachable("Unknown symbol kind!"); } + } +} diff --git a/lib/ObjectYAML/CodeViewYAMLTypes.cpp b/lib/ObjectYAML/CodeViewYAMLTypes.cpp new file mode 100644 index 000000000000..4e82a299a672 --- /dev/null +++ b/lib/ObjectYAML/CodeViewYAMLTypes.cpp @@ -0,0 +1,712 @@ +//===- CodeViewYAMLTypes.cpp - CodeView YAMLIO types implementation -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines classes for handling the YAML representation of CodeView +// Debug Info. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ObjectYAML/CodeViewYAMLTypes.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/DebugInfo/CodeView/CodeViewError.h" +#include "llvm/DebugInfo/CodeView/EnumTables.h" +#include "llvm/DebugInfo/CodeView/TypeDeserializer.h" +#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h" + +using namespace llvm; +using namespace llvm::codeview; +using namespace llvm::CodeViewYAML; +using namespace llvm::CodeViewYAML::detail; +using namespace llvm::yaml; + +LLVM_YAML_IS_SEQUENCE_VECTOR(OneMethodRecord) +LLVM_YAML_IS_SEQUENCE_VECTOR(StringRef) +LLVM_YAML_IS_SEQUENCE_VECTOR(VFTableSlotKind) +LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(TypeIndex) + +LLVM_YAML_DECLARE_SCALAR_TRAITS(TypeIndex, false) +LLVM_YAML_DECLARE_SCALAR_TRAITS(APSInt, false) + +LLVM_YAML_DECLARE_ENUM_TRAITS(TypeLeafKind) +LLVM_YAML_DECLARE_ENUM_TRAITS(PointerToMemberRepresentation) +LLVM_YAML_DECLARE_ENUM_TRAITS(VFTableSlotKind) +LLVM_YAML_DECLARE_ENUM_TRAITS(CallingConvention) +LLVM_YAML_DECLARE_ENUM_TRAITS(PointerKind) +LLVM_YAML_DECLARE_ENUM_TRAITS(PointerMode) +LLVM_YAML_DECLARE_ENUM_TRAITS(HfaKind) +LLVM_YAML_DECLARE_ENUM_TRAITS(MemberAccess) +LLVM_YAML_DECLARE_ENUM_TRAITS(MethodKind) +LLVM_YAML_DECLARE_ENUM_TRAITS(WindowsRTClassKind) +LLVM_YAML_DECLARE_ENUM_TRAITS(LabelType) + +LLVM_YAML_DECLARE_BITSET_TRAITS(PointerOptions) +LLVM_YAML_DECLARE_BITSET_TRAITS(ModifierOptions) +LLVM_YAML_DECLARE_BITSET_TRAITS(FunctionOptions) +LLVM_YAML_DECLARE_BITSET_TRAITS(ClassOptions) +LLVM_YAML_DECLARE_BITSET_TRAITS(MethodOptions) + +LLVM_YAML_DECLARE_MAPPING_TRAITS(OneMethodRecord) +LLVM_YAML_DECLARE_MAPPING_TRAITS(MemberPointerInfo) + +namespace llvm { +namespace CodeViewYAML { +namespace detail { + +struct LeafRecordBase { + TypeLeafKind Kind; + explicit LeafRecordBase(TypeLeafKind K) : Kind(K) {} + + virtual ~LeafRecordBase() {} + virtual void map(yaml::IO &io) = 0; + virtual CVType toCodeViewRecord(BumpPtrAllocator &Allocator) const = 0; + virtual Error fromCodeViewRecord(CVType Type) = 0; +}; + +template <typename T> struct LeafRecordImpl : public LeafRecordBase { + explicit LeafRecordImpl(TypeLeafKind K) + : LeafRecordBase(K), Record(static_cast<TypeRecordKind>(K)) {} + + void map(yaml::IO &io) override; + + Error fromCodeViewRecord(CVType Type) override { + return TypeDeserializer::deserializeAs<T>(Type, Record); + } + + CVType toCodeViewRecord(BumpPtrAllocator &Allocator) const override { + TypeTableBuilder Table(Allocator); + Table.writeKnownType(Record); + return CVType(Kind, Table.records().front()); + } + + mutable T Record; +}; + +template <> struct LeafRecordImpl<FieldListRecord> : public LeafRecordBase { + explicit LeafRecordImpl(TypeLeafKind K) : LeafRecordBase(K) {} + + void map(yaml::IO &io) override; + CVType toCodeViewRecord(BumpPtrAllocator &Allocator) const override; + Error fromCodeViewRecord(CVType Type) override; + + std::vector<MemberRecord> Members; +}; + +struct MemberRecordBase { + TypeLeafKind Kind; + explicit MemberRecordBase(TypeLeafKind K) : Kind(K) {} + + virtual ~MemberRecordBase() {} + virtual void map(yaml::IO &io) = 0; + virtual void writeTo(FieldListRecordBuilder &FLRB) = 0; +}; + +template <typename T> struct MemberRecordImpl : public MemberRecordBase { + explicit MemberRecordImpl(TypeLeafKind K) + : MemberRecordBase(K), Record(static_cast<TypeRecordKind>(K)) {} + void map(yaml::IO &io) override; + + void 
writeTo(FieldListRecordBuilder &FLRB) override { + FLRB.writeMemberType(Record); + } + + mutable T Record; +}; +} +} +} + +void ScalarTraits<TypeIndex>::output(const TypeIndex &S, void *, + llvm::raw_ostream &OS) { + OS << S.getIndex(); +} + +StringRef ScalarTraits<TypeIndex>::input(StringRef Scalar, void *Ctx, + TypeIndex &S) { + uint32_t I; + StringRef Result = ScalarTraits<uint32_t>::input(Scalar, Ctx, I); + S.setIndex(I); + return Result; +} + +void ScalarTraits<APSInt>::output(const APSInt &S, void *, + llvm::raw_ostream &OS) { + S.print(OS, true); +} + +StringRef ScalarTraits<APSInt>::input(StringRef Scalar, void *Ctx, APSInt &S) { + S = APSInt(Scalar); + return ""; +} + +void ScalarEnumerationTraits<TypeLeafKind>::enumeration(IO &io, + TypeLeafKind &Value) { +#define CV_TYPE(name, val) io.enumCase(Value, #name, name); +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" +#undef CV_TYPE +} + +void ScalarEnumerationTraits<PointerToMemberRepresentation>::enumeration( + IO &IO, PointerToMemberRepresentation &Value) { + IO.enumCase(Value, "Unknown", PointerToMemberRepresentation::Unknown); + IO.enumCase(Value, "SingleInheritanceData", + PointerToMemberRepresentation::SingleInheritanceData); + IO.enumCase(Value, "MultipleInheritanceData", + PointerToMemberRepresentation::MultipleInheritanceData); + IO.enumCase(Value, "VirtualInheritanceData", + PointerToMemberRepresentation::VirtualInheritanceData); + IO.enumCase(Value, "GeneralData", PointerToMemberRepresentation::GeneralData); + IO.enumCase(Value, "SingleInheritanceFunction", + PointerToMemberRepresentation::SingleInheritanceFunction); + IO.enumCase(Value, "MultipleInheritanceFunction", + PointerToMemberRepresentation::MultipleInheritanceFunction); + IO.enumCase(Value, "VirtualInheritanceFunction", + PointerToMemberRepresentation::VirtualInheritanceFunction); + IO.enumCase(Value, "GeneralFunction", + PointerToMemberRepresentation::GeneralFunction); +} + +void ScalarEnumerationTraits<VFTableSlotKind>::enumeration( + IO &IO, VFTableSlotKind &Kind) { + IO.enumCase(Kind, "Near16", VFTableSlotKind::Near16); + IO.enumCase(Kind, "Far16", VFTableSlotKind::Far16); + IO.enumCase(Kind, "This", VFTableSlotKind::This); + IO.enumCase(Kind, "Outer", VFTableSlotKind::Outer); + IO.enumCase(Kind, "Meta", VFTableSlotKind::Meta); + IO.enumCase(Kind, "Near", VFTableSlotKind::Near); + IO.enumCase(Kind, "Far", VFTableSlotKind::Far); +} + +void ScalarEnumerationTraits<CallingConvention>::enumeration( + IO &IO, CallingConvention &Value) { + IO.enumCase(Value, "NearC", CallingConvention::NearC); + IO.enumCase(Value, "FarC", CallingConvention::FarC); + IO.enumCase(Value, "NearPascal", CallingConvention::NearPascal); + IO.enumCase(Value, "FarPascal", CallingConvention::FarPascal); + IO.enumCase(Value, "NearFast", CallingConvention::NearFast); + IO.enumCase(Value, "FarFast", CallingConvention::FarFast); + IO.enumCase(Value, "NearStdCall", CallingConvention::NearStdCall); + IO.enumCase(Value, "FarStdCall", CallingConvention::FarStdCall); + IO.enumCase(Value, "NearSysCall", CallingConvention::NearSysCall); + IO.enumCase(Value, "FarSysCall", CallingConvention::FarSysCall); + IO.enumCase(Value, "ThisCall", CallingConvention::ThisCall); + IO.enumCase(Value, "MipsCall", CallingConvention::MipsCall); + IO.enumCase(Value, "Generic", CallingConvention::Generic); + IO.enumCase(Value, "AlphaCall", CallingConvention::AlphaCall); + IO.enumCase(Value, "PpcCall", CallingConvention::PpcCall); + IO.enumCase(Value, "SHCall", CallingConvention::SHCall); + IO.enumCase(Value, 
"ArmCall", CallingConvention::ArmCall); + IO.enumCase(Value, "AM33Call", CallingConvention::AM33Call); + IO.enumCase(Value, "TriCall", CallingConvention::TriCall); + IO.enumCase(Value, "SH5Call", CallingConvention::SH5Call); + IO.enumCase(Value, "M32RCall", CallingConvention::M32RCall); + IO.enumCase(Value, "ClrCall", CallingConvention::ClrCall); + IO.enumCase(Value, "Inline", CallingConvention::Inline); + IO.enumCase(Value, "NearVector", CallingConvention::NearVector); +} + +void ScalarEnumerationTraits<PointerKind>::enumeration(IO &IO, + PointerKind &Kind) { + IO.enumCase(Kind, "Near16", PointerKind::Near16); + IO.enumCase(Kind, "Far16", PointerKind::Far16); + IO.enumCase(Kind, "Huge16", PointerKind::Huge16); + IO.enumCase(Kind, "BasedOnSegment", PointerKind::BasedOnSegment); + IO.enumCase(Kind, "BasedOnValue", PointerKind::BasedOnValue); + IO.enumCase(Kind, "BasedOnSegmentValue", PointerKind::BasedOnSegmentValue); + IO.enumCase(Kind, "BasedOnAddress", PointerKind::BasedOnAddress); + IO.enumCase(Kind, "BasedOnSegmentAddress", + PointerKind::BasedOnSegmentAddress); + IO.enumCase(Kind, "BasedOnType", PointerKind::BasedOnType); + IO.enumCase(Kind, "BasedOnSelf", PointerKind::BasedOnSelf); + IO.enumCase(Kind, "Near32", PointerKind::Near32); + IO.enumCase(Kind, "Far32", PointerKind::Far32); + IO.enumCase(Kind, "Near64", PointerKind::Near64); +} + +void ScalarEnumerationTraits<PointerMode>::enumeration(IO &IO, + PointerMode &Mode) { + IO.enumCase(Mode, "Pointer", PointerMode::Pointer); + IO.enumCase(Mode, "LValueReference", PointerMode::LValueReference); + IO.enumCase(Mode, "PointerToDataMember", PointerMode::PointerToDataMember); + IO.enumCase(Mode, "PointerToMemberFunction", + PointerMode::PointerToMemberFunction); + IO.enumCase(Mode, "RValueReference", PointerMode::RValueReference); +} + +void ScalarEnumerationTraits<HfaKind>::enumeration(IO &IO, HfaKind &Value) { + IO.enumCase(Value, "None", HfaKind::None); + IO.enumCase(Value, "Float", HfaKind::Float); + IO.enumCase(Value, "Double", HfaKind::Double); + IO.enumCase(Value, "Other", HfaKind::Other); +} + +void ScalarEnumerationTraits<MemberAccess>::enumeration(IO &IO, + MemberAccess &Access) { + IO.enumCase(Access, "None", MemberAccess::None); + IO.enumCase(Access, "Private", MemberAccess::Private); + IO.enumCase(Access, "Protected", MemberAccess::Protected); + IO.enumCase(Access, "Public", MemberAccess::Public); +} + +void ScalarEnumerationTraits<MethodKind>::enumeration(IO &IO, + MethodKind &Kind) { + IO.enumCase(Kind, "Vanilla", MethodKind::Vanilla); + IO.enumCase(Kind, "Virtual", MethodKind::Virtual); + IO.enumCase(Kind, "Static", MethodKind::Static); + IO.enumCase(Kind, "Friend", MethodKind::Friend); + IO.enumCase(Kind, "IntroducingVirtual", MethodKind::IntroducingVirtual); + IO.enumCase(Kind, "PureVirtual", MethodKind::PureVirtual); + IO.enumCase(Kind, "PureIntroducingVirtual", + MethodKind::PureIntroducingVirtual); +} + +void ScalarEnumerationTraits<WindowsRTClassKind>::enumeration( + IO &IO, WindowsRTClassKind &Value) { + IO.enumCase(Value, "None", WindowsRTClassKind::None); + IO.enumCase(Value, "Ref", WindowsRTClassKind::RefClass); + IO.enumCase(Value, "Value", WindowsRTClassKind::ValueClass); + IO.enumCase(Value, "Interface", WindowsRTClassKind::Interface); +} + +void ScalarEnumerationTraits<LabelType>::enumeration(IO &IO, LabelType &Value) { + IO.enumCase(Value, "Near", LabelType::Near); + IO.enumCase(Value, "Far", LabelType::Far); +} + +void ScalarBitSetTraits<PointerOptions>::bitset(IO &IO, + PointerOptions &Options) { + 
IO.bitSetCase(Options, "None", PointerOptions::None); + IO.bitSetCase(Options, "Flat32", PointerOptions::Flat32); + IO.bitSetCase(Options, "Volatile", PointerOptions::Volatile); + IO.bitSetCase(Options, "Const", PointerOptions::Const); + IO.bitSetCase(Options, "Unaligned", PointerOptions::Unaligned); + IO.bitSetCase(Options, "Restrict", PointerOptions::Restrict); + IO.bitSetCase(Options, "WinRTSmartPointer", + PointerOptions::WinRTSmartPointer); +} + +void ScalarBitSetTraits<ModifierOptions>::bitset(IO &IO, + ModifierOptions &Options) { + IO.bitSetCase(Options, "None", ModifierOptions::None); + IO.bitSetCase(Options, "Const", ModifierOptions::Const); + IO.bitSetCase(Options, "Volatile", ModifierOptions::Volatile); + IO.bitSetCase(Options, "Unaligned", ModifierOptions::Unaligned); +} + +void ScalarBitSetTraits<FunctionOptions>::bitset(IO &IO, + FunctionOptions &Options) { + IO.bitSetCase(Options, "None", FunctionOptions::None); + IO.bitSetCase(Options, "CxxReturnUdt", FunctionOptions::CxxReturnUdt); + IO.bitSetCase(Options, "Constructor", FunctionOptions::Constructor); + IO.bitSetCase(Options, "ConstructorWithVirtualBases", + FunctionOptions::ConstructorWithVirtualBases); +} + +void ScalarBitSetTraits<ClassOptions>::bitset(IO &IO, ClassOptions &Options) { + IO.bitSetCase(Options, "None", ClassOptions::None); + IO.bitSetCase(Options, "HasConstructorOrDestructor", + ClassOptions::HasConstructorOrDestructor); + IO.bitSetCase(Options, "HasOverloadedOperator", + ClassOptions::HasOverloadedOperator); + IO.bitSetCase(Options, "Nested", ClassOptions::Nested); + IO.bitSetCase(Options, "ContainsNestedClass", + ClassOptions::ContainsNestedClass); + IO.bitSetCase(Options, "HasOverloadedAssignmentOperator", + ClassOptions::HasOverloadedAssignmentOperator); + IO.bitSetCase(Options, "HasConversionOperator", + ClassOptions::HasConversionOperator); + IO.bitSetCase(Options, "ForwardReference", ClassOptions::ForwardReference); + IO.bitSetCase(Options, "Scoped", ClassOptions::Scoped); + IO.bitSetCase(Options, "HasUniqueName", ClassOptions::HasUniqueName); + IO.bitSetCase(Options, "Sealed", ClassOptions::Sealed); + IO.bitSetCase(Options, "Intrinsic", ClassOptions::Intrinsic); +} + +void ScalarBitSetTraits<MethodOptions>::bitset(IO &IO, MethodOptions &Options) { + IO.bitSetCase(Options, "None", MethodOptions::None); + IO.bitSetCase(Options, "Pseudo", MethodOptions::Pseudo); + IO.bitSetCase(Options, "NoInherit", MethodOptions::NoInherit); + IO.bitSetCase(Options, "NoConstruct", MethodOptions::NoConstruct); + IO.bitSetCase(Options, "CompilerGenerated", MethodOptions::CompilerGenerated); + IO.bitSetCase(Options, "Sealed", MethodOptions::Sealed); +} + +void MappingTraits<MemberPointerInfo>::mapping(IO &IO, MemberPointerInfo &MPI) { + IO.mapRequired("ContainingType", MPI.ContainingType); + IO.mapRequired("Representation", MPI.Representation); +} + +namespace llvm { +namespace CodeViewYAML { +namespace detail { +template <> void LeafRecordImpl<ModifierRecord>::map(IO &IO) { + IO.mapRequired("ModifiedType", Record.ModifiedType); + IO.mapRequired("Modifiers", Record.Modifiers); +} + +template <> void LeafRecordImpl<ProcedureRecord>::map(IO &IO) { + IO.mapRequired("ReturnType", Record.ReturnType); + IO.mapRequired("CallConv", Record.CallConv); + IO.mapRequired("Options", Record.Options); + IO.mapRequired("ParameterCount", Record.ParameterCount); + IO.mapRequired("ArgumentList", Record.ArgumentList); +} + +template <> void LeafRecordImpl<MemberFunctionRecord>::map(IO &IO) { + IO.mapRequired("ReturnType", 
Record.ReturnType); + IO.mapRequired("ClassType", Record.ClassType); + IO.mapRequired("ThisType", Record.ThisType); + IO.mapRequired("CallConv", Record.CallConv); + IO.mapRequired("Options", Record.Options); + IO.mapRequired("ParameterCount", Record.ParameterCount); + IO.mapRequired("ArgumentList", Record.ArgumentList); + IO.mapRequired("ThisPointerAdjustment", Record.ThisPointerAdjustment); +} + +template <> void LeafRecordImpl<LabelRecord>::map(IO &IO) { + IO.mapRequired("Mode", Record.Mode); +} + +template <> void LeafRecordImpl<MemberFuncIdRecord>::map(IO &IO) { + IO.mapRequired("ClassType", Record.ClassType); + IO.mapRequired("FunctionType", Record.FunctionType); + IO.mapRequired("Name", Record.Name); +} + +template <> void LeafRecordImpl<ArgListRecord>::map(IO &IO) { + IO.mapRequired("ArgIndices", Record.ArgIndices); +} + +template <> void LeafRecordImpl<StringListRecord>::map(IO &IO) { + IO.mapRequired("StringIndices", Record.StringIndices); +} + +template <> void LeafRecordImpl<PointerRecord>::map(IO &IO) { + IO.mapRequired("ReferentType", Record.ReferentType); + IO.mapRequired("Attrs", Record.Attrs); + IO.mapOptional("MemberInfo", Record.MemberInfo); +} + +template <> void LeafRecordImpl<ArrayRecord>::map(IO &IO) { + IO.mapRequired("ElementType", Record.ElementType); + IO.mapRequired("IndexType", Record.IndexType); + IO.mapRequired("Size", Record.Size); + IO.mapRequired("Name", Record.Name); +} + +void LeafRecordImpl<FieldListRecord>::map(IO &IO) { + IO.mapRequired("FieldList", Members); +} +} +} +} + +namespace { +class MemberRecordConversionVisitor : public TypeVisitorCallbacks { +public: + explicit MemberRecordConversionVisitor(std::vector<MemberRecord> &Records) + : Records(Records) {} + +#define TYPE_RECORD(EnumName, EnumVal, Name) +#define MEMBER_RECORD(EnumName, EnumVal, Name) \ + Error visitKnownMember(CVMemberRecord &CVR, Name##Record &Record) override { \ + return visitKnownMemberImpl(Record); \ + } +#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) +#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" +private: + template <typename T> Error visitKnownMemberImpl(T &Record) { + TypeLeafKind K = static_cast<TypeLeafKind>(Record.getKind()); + auto Impl = std::make_shared<MemberRecordImpl<T>>(K); + Impl->Record = Record; + Records.push_back(MemberRecord{Impl}); + return Error::success(); + } + + std::vector<MemberRecord> &Records; +}; +} + +Error LeafRecordImpl<FieldListRecord>::fromCodeViewRecord(CVType Type) { + MemberRecordConversionVisitor V(Members); + return visitMemberRecordStream(Type.content(), V); +} + +CVType LeafRecordImpl<FieldListRecord>::toCodeViewRecord( + BumpPtrAllocator &Allocator) const { + TypeTableBuilder TTB(Allocator); + FieldListRecordBuilder FLRB(TTB); + FLRB.begin(); + for (const auto &Member : Members) { + Member.Member->writeTo(FLRB); + } + FLRB.end(true); + return CVType(Kind, TTB.records().front()); +} + +void MappingTraits<OneMethodRecord>::mapping(IO &io, OneMethodRecord &Record) { + io.mapRequired("Type", Record.Type); + io.mapRequired("Attrs", Record.Attrs.Attrs); + io.mapRequired("VFTableOffset", Record.VFTableOffset); + io.mapRequired("Name", Record.Name); +} + +namespace llvm { +namespace CodeViewYAML { +namespace detail { +template <> void LeafRecordImpl<ClassRecord>::map(IO &IO) { + IO.mapRequired("MemberCount", Record.MemberCount); + IO.mapRequired("Options", Record.Options); + IO.mapRequired("FieldList", Record.FieldList); + IO.mapRequired("Name", 
Record.Name); + IO.mapRequired("UniqueName", Record.UniqueName); + + IO.mapRequired("DerivationList", Record.DerivationList); + IO.mapRequired("VTableShape", Record.VTableShape); + IO.mapRequired("Size", Record.Size); +} + +template <> void LeafRecordImpl<UnionRecord>::map(IO &IO) { + IO.mapRequired("MemberCount", Record.MemberCount); + IO.mapRequired("Options", Record.Options); + IO.mapRequired("FieldList", Record.FieldList); + IO.mapRequired("Name", Record.Name); + IO.mapRequired("UniqueName", Record.UniqueName); + + IO.mapRequired("Size", Record.Size); +} + +template <> void LeafRecordImpl<EnumRecord>::map(IO &IO) { + IO.mapRequired("NumEnumerators", Record.MemberCount); + IO.mapRequired("Options", Record.Options); + IO.mapRequired("FieldList", Record.FieldList); + IO.mapRequired("Name", Record.Name); + IO.mapRequired("UniqueName", Record.UniqueName); + + IO.mapRequired("UnderlyingType", Record.UnderlyingType); +} + +template <> void LeafRecordImpl<BitFieldRecord>::map(IO &IO) { + IO.mapRequired("Type", Record.Type); + IO.mapRequired("BitSize", Record.BitSize); + IO.mapRequired("BitOffset", Record.BitOffset); +} + +template <> void LeafRecordImpl<VFTableShapeRecord>::map(IO &IO) { + IO.mapRequired("Slots", Record.Slots); +} + +template <> void LeafRecordImpl<TypeServer2Record>::map(IO &IO) { + IO.mapRequired("Guid", Record.Guid); + IO.mapRequired("Age", Record.Age); + IO.mapRequired("Name", Record.Name); +} + +template <> void LeafRecordImpl<StringIdRecord>::map(IO &IO) { + IO.mapRequired("Id", Record.Id); + IO.mapRequired("String", Record.String); +} + +template <> void LeafRecordImpl<FuncIdRecord>::map(IO &IO) { + IO.mapRequired("ParentScope", Record.ParentScope); + IO.mapRequired("FunctionType", Record.FunctionType); + IO.mapRequired("Name", Record.Name); +} + +template <> void LeafRecordImpl<UdtSourceLineRecord>::map(IO &IO) { + IO.mapRequired("UDT", Record.UDT); + IO.mapRequired("SourceFile", Record.SourceFile); + IO.mapRequired("LineNumber", Record.LineNumber); +} + +template <> void LeafRecordImpl<UdtModSourceLineRecord>::map(IO &IO) { + IO.mapRequired("UDT", Record.UDT); + IO.mapRequired("SourceFile", Record.SourceFile); + IO.mapRequired("LineNumber", Record.LineNumber); + IO.mapRequired("Module", Record.Module); +} + +template <> void LeafRecordImpl<BuildInfoRecord>::map(IO &IO) { + IO.mapRequired("ArgIndices", Record.ArgIndices); +} + +template <> void LeafRecordImpl<VFTableRecord>::map(IO &IO) { + IO.mapRequired("CompleteClass", Record.CompleteClass); + IO.mapRequired("OverriddenVFTable", Record.OverriddenVFTable); + IO.mapRequired("VFPtrOffset", Record.VFPtrOffset); + IO.mapRequired("MethodNames", Record.MethodNames); +} + +template <> void LeafRecordImpl<MethodOverloadListRecord>::map(IO &IO) { + IO.mapRequired("Methods", Record.Methods); +} + +template <> void MemberRecordImpl<OneMethodRecord>::map(IO &IO) { + MappingTraits<OneMethodRecord>::mapping(IO, Record); +} + +template <> void MemberRecordImpl<OverloadedMethodRecord>::map(IO &IO) { + IO.mapRequired("NumOverloads", Record.NumOverloads); + IO.mapRequired("MethodList", Record.MethodList); + IO.mapRequired("Name", Record.Name); +} + +template <> void MemberRecordImpl<NestedTypeRecord>::map(IO &IO) { + IO.mapRequired("Type", Record.Type); + IO.mapRequired("Name", Record.Name); +} + +template <> void MemberRecordImpl<DataMemberRecord>::map(IO &IO) { + IO.mapRequired("Attrs", Record.Attrs.Attrs); + IO.mapRequired("Type", Record.Type); + IO.mapRequired("FieldOffset", Record.FieldOffset); + IO.mapRequired("Name", 
Record.Name); +} + +template <> void MemberRecordImpl<StaticDataMemberRecord>::map(IO &IO) { + IO.mapRequired("Attrs", Record.Attrs.Attrs); + IO.mapRequired("Type", Record.Type); + IO.mapRequired("Name", Record.Name); +} + +template <> void MemberRecordImpl<EnumeratorRecord>::map(IO &IO) { + IO.mapRequired("Attrs", Record.Attrs.Attrs); + IO.mapRequired("Value", Record.Value); + IO.mapRequired("Name", Record.Name); +} + +template <> void MemberRecordImpl<VFPtrRecord>::map(IO &IO) { + IO.mapRequired("Type", Record.Type); +} + +template <> void MemberRecordImpl<BaseClassRecord>::map(IO &IO) { + IO.mapRequired("Attrs", Record.Attrs.Attrs); + IO.mapRequired("Type", Record.Type); + IO.mapRequired("Offset", Record.Offset); +} + +template <> void MemberRecordImpl<VirtualBaseClassRecord>::map(IO &IO) { + IO.mapRequired("Attrs", Record.Attrs.Attrs); + IO.mapRequired("BaseType", Record.BaseType); + IO.mapRequired("VBPtrType", Record.VBPtrType); + IO.mapRequired("VBPtrOffset", Record.VBPtrOffset); + IO.mapRequired("VTableIndex", Record.VTableIndex); +} + +template <> void MemberRecordImpl<ListContinuationRecord>::map(IO &IO) { + IO.mapRequired("ContinuationIndex", Record.ContinuationIndex); +} +} +} +} + +template <typename T> +static inline Expected<LeafRecord> fromCodeViewRecordImpl(CVType Type) { + LeafRecord Result; + + auto Impl = std::make_shared<LeafRecordImpl<T>>(Type.kind()); + if (auto EC = Impl->fromCodeViewRecord(Type)) + return std::move(EC); + Result.Leaf = Impl; + return Result; +} + +Expected<LeafRecord> LeafRecord::fromCodeViewRecord(CVType Type) { +#define TYPE_RECORD(EnumName, EnumVal, ClassName) \ + case EnumName: \ + return fromCodeViewRecordImpl<ClassName##Record>(Type); +#define TYPE_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName) \ + TYPE_RECORD(EnumName, EnumVal, ClassName) +#define MEMBER_RECORD(EnumName, EnumVal, ClassName) +#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName) + switch (Type.kind()) { +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" + default: { llvm_unreachable("Unknown leaf kind!"); } + } + return make_error<CodeViewError>(cv_error_code::corrupt_record); +} + +CVType LeafRecord::toCodeViewRecord(BumpPtrAllocator &Allocator) const { + return Leaf->toCodeViewRecord(Allocator); +} + +namespace llvm { +namespace yaml { +template <> struct MappingTraits<LeafRecordBase> { + static void mapping(IO &io, LeafRecordBase &Record) { Record.map(io); } +}; + +template <> struct MappingTraits<MemberRecordBase> { + static void mapping(IO &io, MemberRecordBase &Record) { Record.map(io); } +}; +} +} + +template <typename ConcreteType> +static void mapLeafRecordImpl(IO &IO, const char *Class, TypeLeafKind Kind, + LeafRecord &Obj) { + if (!IO.outputting()) + Obj.Leaf = std::make_shared<LeafRecordImpl<ConcreteType>>(Kind); + + if (Kind == LF_FIELDLIST) + Obj.Leaf->map(IO); + else + IO.mapRequired(Class, *Obj.Leaf); +} + +void MappingTraits<LeafRecord>::mapping(IO &IO, LeafRecord &Obj) { + TypeLeafKind Kind; + if (IO.outputting()) + Kind = Obj.Leaf->Kind; + IO.mapRequired("Kind", Kind); + +#define TYPE_RECORD(EnumName, EnumVal, ClassName) \ + case EnumName: \ + mapLeafRecordImpl<ClassName##Record>(IO, #ClassName, Kind, Obj); \ + break; +#define TYPE_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName) \ + TYPE_RECORD(EnumName, EnumVal, ClassName) +#define MEMBER_RECORD(EnumName, EnumVal, ClassName) +#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName) + switch (Kind) { +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" + default: { 
llvm_unreachable("Unknown leaf kind!"); } + } +} + +template <typename ConcreteType> +static void mapMemberRecordImpl(IO &IO, const char *Class, TypeLeafKind Kind, + MemberRecord &Obj) { + if (!IO.outputting()) + Obj.Member = std::make_shared<MemberRecordImpl<ConcreteType>>(Kind); + + IO.mapRequired(Class, *Obj.Member); +} + +void MappingTraits<MemberRecord>::mapping(IO &IO, MemberRecord &Obj) { + TypeLeafKind Kind; + if (IO.outputting()) + Kind = Obj.Member->Kind; + IO.mapRequired("Kind", Kind); + +#define MEMBER_RECORD(EnumName, EnumVal, ClassName) \ + case EnumName: \ + mapMemberRecordImpl<ClassName##Record>(IO, #ClassName, Kind, Obj); \ + break; +#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName) \ + MEMBER_RECORD(EnumName, EnumVal, ClassName) +#define TYPE_RECORD(EnumName, EnumVal, ClassName) +#define TYPE_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName) + switch (Kind) { +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" + default: { llvm_unreachable("Unknown member kind!"); } + } +} diff --git a/lib/ObjectYAML/LLVMBuild.txt b/lib/ObjectYAML/LLVMBuild.txt index b8d1d2f1779e..44657e916a91 100644 --- a/lib/ObjectYAML/LLVMBuild.txt +++ b/lib/ObjectYAML/LLVMBuild.txt @@ -11,4 +11,4 @@ type = Library name = ObjectYAML parent = Libraries -required_libraries = Support +required_libraries = Support DebugInfoCodeView diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index abc53e97aa72..eb81e58b9b0e 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -164,7 +164,8 @@ static cl::opt<bool> EnableGVNHoist( "enable-npm-gvn-hoist", cl::init(false), cl::Hidden, cl::desc("Enable the GVN hoisting pass for the new PM (default = off)")); -static Regex DefaultAliasRegex("^(default|lto-pre-link|lto)<(O[0123sz])>$"); +static Regex DefaultAliasRegex( + "^(default|thinlto-pre-link|thinlto|lto-pre-link|lto)<(O[0123sz])>$"); static bool isOptimizingForSize(PassBuilder::OptimizationLevel Level) { switch (Level) { @@ -345,6 +346,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, LPM2.addPass(IndVarSimplifyPass()); LPM2.addPass(LoopIdiomRecognizePass()); LPM2.addPass(LoopDeletionPass()); + // FIXME: The old pass manager has a hack to disable loop unrolling during + // ThinLTO when using sample PGO. Need to either fix it or port some + // workaround. LPM2.addPass(LoopUnrollPass::createFull(Level)); // We provide the opt remark emitter pass for LICM to use. We only need to do @@ -454,14 +458,10 @@ static void addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging, } ModulePassManager -PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, - bool DebugLogging) { - assert(Level != O0 && "Must request optimizations for the default pipeline!"); +PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, + bool DebugLogging) { ModulePassManager MPM(DebugLogging); - // Force any function attributes we want the rest of the pipeline te observe. - MPM.addPass(ForceFunctionAttrsPass()); - // Do basic inference of function attributes from known properties of system // libraries and other oracles. MPM.addPass(InferFunctionAttrsPass()); @@ -504,16 +504,16 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, GlobalCleanupPM.addPass(SimplifyCFGPass()); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM))); - // Add all the requested passes for PGO Instrumentation, if requested. + // Add all the requested passes for PGO, if requested. 
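The widened DefaultAliasRegex above carries the whole alias grammar: capture group 1 selects which pipeline builder runs and group 2 carries the optimization level. A small sketch of how the captures come back from llvm::Regex; the input string is just a plausible example:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Regex.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::Regex Alias(
      "^(default|thinlto-pre-link|thinlto|lto-pre-link|lto)<(O[0123sz])>$");
  llvm::SmallVector<llvm::StringRef, 3> Matches;
  if (Alias.match("thinlto-pre-link<O2>", &Matches))
    // Matches[0] is the full match; [1] and [2] are the capture groups.
    llvm::outs() << Matches[1] << " / " << Matches[2] << "\n";
}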
if (PGOOpt) {
     assert(PGOOpt->RunProfileGen || PGOOpt->SamplePGO ||
            !PGOOpt->ProfileUseFile.empty());
     addPGOInstrPasses(MPM, DebugLogging, Level, PGOOpt->RunProfileGen,
                       PGOOpt->ProfileGenFile, PGOOpt->ProfileUseFile);
-  }
-  // Indirect call promotion that promotes intra-module targes only.
-  MPM.addPass(PGOIndirectCallPromotion(false, PGOOpt && PGOOpt->SamplePGO));
+    // Indirect call promotion that promotes intra-module targets only.
+    MPM.addPass(PGOIndirectCallPromotion(false, PGOOpt && PGOOpt->SamplePGO));
+  }
   // Require the GlobalsAA analysis for the module so we can query it within
   // the CGSCC pipeline.
@@ -562,17 +562,30 @@
     createModuleToPostOrderCGSCCPassAdaptor(createDevirtSCCRepeatedPass(
         std::move(MainCGPipeline), MaxDevirtIterations, DebugLogging)));
-  // This ends the canonicalization and simplification phase of the pipeline.
-  // At this point, we expect to have canonical and simple IR which we begin
-  // *optimizing* for efficient execution going forward.
+  return MPM;
+}
+
+ModulePassManager
+PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
+                                             bool DebugLogging) {
+  ModulePassManager MPM(DebugLogging);
+
+  // Optimize globals now that the module is fully simplified.
+  MPM.addPass(GlobalOptPass());
   // Run partial inlining pass to partially inline functions that have
   // large bodies.
   if (RunPartialInlining)
     MPM.addPass(PartialInlinerPass());
-  // Eliminate externally available functions now that inlining is over -- we
-  // won't emit these anyways.
+  // Remove available externally functions and global definitions since we
+  // aren't compiling an object file for later LTO. For LTO we want to
+  // preserve these so they are eligible for inlining at link-time. Note if
+  // they are unreferenced they will be removed by GlobalDCE later, so this
+  // only impacts referenced available externally globals. Eventually they
+  // will be suppressed during codegen, but eliminating here enables more
+  // opportunity for GlobalDCE as it may make globals referenced by available
+  // external functions dead and saves running remaining passes on the
+  // eliminated functions.
   MPM.addPass(EliminateAvailableExternallyPass());
   // Do RPO function attribute inference across the module to forward-propagate
@@ -671,6 +684,87 @@
 }
 
 ModulePassManager
+PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
+                                           bool DebugLogging) {
+  assert(Level != O0 && "Must request optimizations for the default pipeline!");
+
+  ModulePassManager MPM(DebugLogging);
+
+  // Force any function attributes we want the rest of the pipeline to observe.
+  MPM.addPass(ForceFunctionAttrsPass());
+
+  // Add the core simplification pipeline.
+  MPM.addPass(buildModuleSimplificationPipeline(Level, DebugLogging));
+
+  // Now add the optimization pipeline.
+  MPM.addPass(buildModuleOptimizationPipeline(Level, DebugLogging));
+
+  return MPM;
+}
+
+ModulePassManager
+PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level,
+                                                bool DebugLogging) {
+  assert(Level != O0 && "Must request optimizations for the default pipeline!");
+
+  ModulePassManager MPM(DebugLogging);
+
+  // Force any function attributes we want the rest of the pipeline to observe.
+  MPM.addPass(ForceFunctionAttrsPass());
+
+  // If we are planning to perform ThinLTO later, we don't bloat the code with
+  // unrolling/vectorization/... now. Just simplify the module as much as we
+  // can. 
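With the aliases hooked into parseModulePass further down, the new pipelines become addressable from any new-PM pipeline string. Assuming the usual opt driver plumbing, an illustrative (not prescriptive) split run would look roughly like:

  $ opt -passes='thinlto-pre-link<O2>' in.bc -o prelink.bc
  $ opt -passes='thinlto<O2>' prelink.bc -o out.bc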
+ MPM.addPass(buildModuleSimplificationPipeline(Level, DebugLogging)); + + // Run partial inlining pass to partially inline functions that have + // large bodies. + // FIXME: It isn't clear whether this is really the right place to run this + // in ThinLTO. Because there is another canonicalization and simplification + // phase that will run after the thin link, running this here ends up with + // less information than will be available later and it may grow functions in + // ways that aren't beneficial. + if (RunPartialInlining) + MPM.addPass(PartialInlinerPass()); + + // Reduce the size of the IR as much as possible. + MPM.addPass(GlobalOptPass()); + + // Rename anon globals to be able to export them in the summary. + MPM.addPass(NameAnonGlobalPass()); + + return MPM; +} + +ModulePassManager +PassBuilder::buildThinLTODefaultPipeline(OptimizationLevel Level, + bool DebugLogging) { + // FIXME: The summary index is not hooked in the new pass manager yet. + // When it's going to be hooked, enable WholeProgramDevirt and LowerTypeTest + // here. + + ModulePassManager MPM(DebugLogging); + + // Force any function attributes we want the rest of the pipeline to observe. + MPM.addPass(ForceFunctionAttrsPass()); + + // During the ThinLTO backend phase we perform early indirect call promotion + // here, before globalopt. Otherwise imported available_externally functions + // look unreferenced and are removed. + MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */, + PGOOpt && PGOOpt->SamplePGO && + !PGOOpt->ProfileUseFile.empty())); + + // Add the core simplification pipeline. + MPM.addPass(buildModuleSimplificationPipeline(Level, DebugLogging)); + + // Now add the optimization pipeline. + MPM.addPass(buildModuleOptimizationPipeline(Level, DebugLogging)); + + return MPM; +} + +ModulePassManager PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level, bool DebugLogging) { assert(Level != O0 && "Must request optimizations for the default pipeline!"); @@ -893,9 +987,16 @@ static Optional<int> parseDevirtPassName(StringRef Name) { return Count; } +/// Tests whether a pass name starts with a valid prefix for a default pipeline +/// alias. +static bool startsWithDefaultPipelineAliasPrefix(StringRef Name) { + return Name.startswith("default") || Name.startswith("thinlto") || + Name.startswith("lto"); +} + static bool isModulePassName(StringRef Name) { // Manually handle aliases for pre-configured pipeline fragments. - if (Name.startswith("default") || Name.startswith("lto")) + if (startsWithDefaultPipelineAliasPrefix(Name)) return DefaultAliasRegex.match(Name); // Explicitly handle pass manager names. @@ -1090,7 +1191,7 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM, } // Manually handle aliases for pre-configured pipeline fragments. 
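Both of the new default pipelines lean on pass-manager nesting: in the new pass manager a ModulePassManager is itself a pass, so addPass splices whole prebuilt sub-pipelines in by value, exactly as buildThinLTODefaultPipeline does above. A compressed sketch of that composition, assuming the simplification and optimization builders are exposed on PassBuilder as their definitions above suggest:

#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"

using namespace llvm;

// Hypothetical composed pipeline: each builder returns a ModulePassManager,
// and addPass accepts the whole sub-pipeline as a single pass.
ModulePassManager buildMyPipeline(PassBuilder &PB) {
  ModulePassManager MPM;
  MPM.addPass(PB.buildModuleSimplificationPipeline(PassBuilder::O2,
                                                   /*DebugLogging=*/false));
  MPM.addPass(PB.buildModuleOptimizationPipeline(PassBuilder::O2,
                                                 /*DebugLogging=*/false));
  return MPM;
}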
- if (Name.startswith("default") || Name.startswith("lto")) { + if (startsWithDefaultPipelineAliasPrefix(Name)) { SmallVector<StringRef, 3> Matches; if (!DefaultAliasRegex.match(Name, &Matches)) return false; @@ -1109,6 +1210,10 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM, if (Matches[1] == "default") { MPM.addPass(buildPerModuleDefaultPipeline(L, DebugLogging)); + } else if (Matches[1] == "thinlto-pre-link") { + MPM.addPass(buildThinLTOPreLinkDefaultPipeline(L, DebugLogging)); + } else if (Matches[1] == "thinlto") { + MPM.addPass(buildThinLTODefaultPipeline(L, DebugLogging)); } else if (Matches[1] == "lto-pre-link") { MPM.addPass(buildLTOPreLinkDefaultPipeline(L, DebugLogging)); } else { diff --git a/lib/Support/BinaryStreamReader.cpp b/lib/Support/BinaryStreamReader.cpp index 862232971162..bfb658cfa0b7 100644 --- a/lib/Support/BinaryStreamReader.cpp +++ b/lib/Support/BinaryStreamReader.cpp @@ -69,6 +69,26 @@ Error BinaryStreamReader::readCString(StringRef &Dest) { return Error::success(); } +Error BinaryStreamReader::readWideString(ArrayRef<UTF16> &Dest) { + uint32_t Length = 0; + uint32_t OriginalOffset = getOffset(); + const UTF16 *C; + while (true) { + if (auto EC = readObject(C)) + return EC; + if (*C == 0x0000) + break; + ++Length; + } + uint32_t NewOffset = getOffset(); + setOffset(OriginalOffset); + + if (auto EC = readArray(Dest, Length)) + return EC; + setOffset(NewOffset); + return Error::success(); +} + Error BinaryStreamReader::readFixedString(StringRef &Dest, uint32_t Length) { ArrayRef<uint8_t> Bytes; if (auto EC = readBytes(Bytes, Length)) diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc index fa28ba1b6ab6..ce638d453c19 100644 --- a/lib/Support/Unix/Path.inc +++ b/lib/Support/Unix/Path.inc @@ -381,6 +381,11 @@ static bool is_local_impl(struct STATVFS &Vfs) { #elif defined(__CYGWIN__) // Cygwin doesn't expose this information; would need to use Win32 API. return false; +#elif defined(__sun) + // statvfs::f_basetype contains a null-terminated FSType name of the mounted target + StringRef fstype(Vfs.f_basetype); + // NFS is the only non-local fstype?? + return !fstype.equals("nfs"); #else return !!(STATVFS_F_FLAG(Vfs) & MNT_LOCAL); #endif diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp index f07208b1fb90..83f7147dc9f6 100644 --- a/lib/TableGen/Record.cpp +++ b/lib/TableGen/Record.cpp @@ -1572,12 +1572,6 @@ RecordVal::RecordVal(Init *N, RecTy *T, bool P) assert(Value && "Cannot create unset value for current type!"); } -RecordVal::RecordVal(StringRef N, RecTy *T, bool P) - : Name(StringInit::get(N)), TyAndPrefix(T, P) { - Value = UnsetInit::get()->convertInitializerTo(T); - assert(Value && "Cannot create unset value for current type!"); -} - StringRef RecordVal::getName() const { return cast<StringInit>(getNameInit())->getValue(); } @@ -1603,8 +1597,7 @@ void Record::init() { // Every record potentially has a def at the top. This value is // replaced with the top-level def name at instantiation time. - RecordVal DN("NAME", StringRecTy::get(), false); - addValue(DN); + addValue(RecordVal(StringInit::get("NAME"), StringRecTy::get(), false)); } void Record::checkName() { @@ -1640,10 +1633,6 @@ void Record::setName(Init *NewName) { // this. See TGParser::ParseDef and TGParser::ParseDefm. 
} -void Record::setName(StringRef Name) { - setName(StringInit::get(Name)); -} - void Record::resolveReferencesTo(const RecordVal *RV) { for (RecordVal &Value : Values) { if (RV == &Value) // Skip resolve the same field as the given one @@ -1714,7 +1703,7 @@ Init *Record::getValueInit(StringRef FieldName) const { return R->getValue(); } -std::string Record::getValueAsString(StringRef FieldName) const { +StringRef Record::getValueAsString(StringRef FieldName) const { const RecordVal *R = getValue(FieldName); if (!R || !R->getValue()) PrintFatalError(getLoc(), "Record `" + getName() + @@ -1793,10 +1782,10 @@ Record::getValueAsListOfInts(StringRef FieldName) const { return Ints; } -std::vector<std::string> +std::vector<StringRef> Record::getValueAsListOfStrings(StringRef FieldName) const { ListInit *List = getValueAsListInit(FieldName); - std::vector<std::string> Strings; + std::vector<StringRef> Strings; for (Init *I : List->getValues()) { if (StringInit *SI = dyn_cast<StringInit>(I)) Strings.push_back(SI->getValue()); diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp index 96015b06d798..b492cf9495c0 100644 --- a/lib/TableGen/TGParser.cpp +++ b/lib/TableGen/TGParser.cpp @@ -339,7 +339,7 @@ bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){ if (!IVal) return Error(Loc, "foreach iterator value is untyped"); - IterRec->addValue(RecordVal(IterVar->getName(), IVal->getType(), false)); + IterRec->addValue(RecordVal(IterVar->getNameInit(), IVal->getType(), false)); if (SetValue(IterRec.get(), Loc, IterVar->getNameInit(), None, IVal)) return Error(Loc, "when instantiating this def"); @@ -378,8 +378,8 @@ static bool isObjectStart(tgtok::TokKind K) { /// GetNewAnonymousName - Generate a unique anonymous name that can be used as /// an identifier. -std::string TGParser::GetNewAnonymousName() { - return "anonymous_" + utostr(AnonCounter++); +Init *TGParser::GetNewAnonymousName() { + return StringInit::get("anonymous_" + utostr(AnonCounter++)); } /// ParseObjectName - If an object name is specified, return it. Otherwise, @@ -2350,7 +2350,7 @@ Record *TGParser::InstantiateMulticlassDef(MultiClass &MC, Record *DefProto, bool IsAnonymous = false; if (!DefmPrefix) { - DefmPrefix = StringInit::get(GetNewAnonymousName()); + DefmPrefix = GetNewAnonymousName(); IsAnonymous = true; } diff --git a/lib/TableGen/TGParser.h b/lib/TableGen/TGParser.h index 76f7d8fe5026..1b2966c9f6c9 100644 --- a/lib/TableGen/TGParser.h +++ b/lib/TableGen/TGParser.h @@ -110,7 +110,7 @@ private: // Semantic analysis methods. bool AddSubMultiClass(MultiClass *CurMC, SubMultiClassReference &SubMultiClass); - std::string GetNewAnonymousName(); + Init *GetNewAnonymousName(); // IterRecord: Map an iterator name to a value. 
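Since getValueAsString and getValueAsListOfStrings now return StringRef values that borrow storage owned by the record's initializers, callers must copy explicitly when a result has to outlive the RecordKeeper. A short usage sketch (variable names hypothetical):

StringRef Name = Rec->getValueAsString("Name"); // borrowed, not owned
std::string Owned = Name.str();                 // explicit copy when needed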
struct IterRecord { diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index 4af5fef4287c..abe28460c83a 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -190,6 +190,7 @@ def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", FeatureCrypto, FeatureCustomCheapAsMoveHandling, FeatureFPARMv8, + FeatureFuseAES, FeatureNEON, FeaturePerfMon, FeaturePostRAScheduler, @@ -226,6 +227,7 @@ def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", FeatureCRC, FeatureCrypto, FeatureFPARMv8, + FeatureFuseAES, FeatureNEON, FeaturePerfMon ]>; diff --git a/lib/Target/AArch64/AArch64SchedM1.td b/lib/Target/AArch64/AArch64SchedM1.td index 3fbbc0be682d..3b71cf8399a0 100644 --- a/lib/Target/AArch64/AArch64SchedM1.td +++ b/lib/Target/AArch64/AArch64SchedM1.td @@ -23,7 +23,7 @@ def ExynosM1Model : SchedMachineModel { let LoopMicroOpBufferSize = 24; // Based on the instruction queue size. let LoadLatency = 4; // Optimistic load cases. let MispredictPenalty = 14; // Minimum branch misprediction penalty. - let CompleteModel = 0; // Use the default model otherwise. + let CompleteModel = 1; // The model now covers all instructions. } //===----------------------------------------------------------------------===// @@ -72,14 +72,14 @@ def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; } def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; } def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; } -def M1WriteLA : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteL5, +def M1WriteLX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteL5, M1WriteA1]>, SchedVar<NoSchedPred, [M1WriteL5]>]>; def M1WriteS1 : SchedWriteRes<[M1UnitS]> { let Latency = 1; } def M1WriteS2 : SchedWriteRes<[M1UnitS]> { let Latency = 2; } def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; } -def M1WriteSA : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteS2, +def M1WriteSX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteS2, M1WriteA1]>, SchedVar<NoSchedPred, [M1WriteS1]>]>; @@ -125,13 +125,13 @@ def : WriteRes<WriteAdr, []> { let Latency = 0; } // Load instructions. def : WriteRes<WriteLD, [M1UnitL]> { let Latency = 4; } def : WriteRes<WriteLDHi, [M1UnitALU]> { let Latency = 4; } -def : SchedAlias<WriteLDIdx, M1WriteLA>; +def : SchedAlias<WriteLDIdx, M1WriteLX>; // Store instructions. def : WriteRes<WriteST, [M1UnitS]> { let Latency = 1; } def : WriteRes<WriteSTP, [M1UnitS]> { let Latency = 1; } def : WriteRes<WriteSTX, [M1UnitS]> { let Latency = 1; } -def : SchedAlias<WriteSTIdx, M1WriteSA>; +def : SchedAlias<WriteSTIdx, M1WriteSX>; // FP data instructions.
def : WriteRes<WriteF, [M1UnitFADD]> { let Latency = 3; } @@ -231,6 +231,111 @@ def M1WriteNMISC3 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 3; } def M1WriteNMISC4 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 4; } def M1WriteTB : SchedWriteRes<[M1UnitC, M1UnitALU]> { let Latency = 2; } +def M1WriteVLDA : SchedWriteRes<[M1UnitL, + M1UnitL]> { let Latency = 6; } +def M1WriteVLDB : SchedWriteRes<[M1UnitL, + M1UnitL, + M1UnitL]> { let Latency = 7; } +def M1WriteVLDC : SchedWriteRes<[M1UnitL, + M1UnitL, + M1UnitL, + M1UnitL]> { let Latency = 8; } +def M1WriteVLDD : SchedWriteRes<[M1UnitL, + M1UnitNALU]> { let Latency = 7; + let ResourceCycles = [2]; } +def M1WriteVLDE : SchedWriteRes<[M1UnitL, + M1UnitNALU]> { let Latency = 6; } +def M1WriteVLDF : SchedWriteRes<[M1UnitL, + M1UnitL]> { let Latency = 10; + let ResourceCycles = [5]; } +def M1WriteVLDG : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU]> { let Latency = 7; + let ResourceCycles = [2]; } +def M1WriteVLDH : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU]> { let Latency = 6; } +def M1WriteVLDI : SchedWriteRes<[M1UnitL, + M1UnitL, + M1UnitL]> { let Latency = 12; + let ResourceCycles = [6]; } +def M1WriteVLDJ : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU]> { let Latency = 9; + let ResourceCycles = [4]; } +def M1WriteVLDK : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU]> { let Latency = 9; + let ResourceCycles = [4]; } +def M1WriteVLDL : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU]> { let Latency = 7; + let ResourceCycles = [2]; } +def M1WriteVLDM : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU]> { let Latency = 7; + let ResourceCycles = [2]; } +def M1WriteVLDN : SchedWriteRes<[M1UnitL, + M1UnitL, + M1UnitL, + M1UnitL]> { let Latency = 14; + let ResourceCycles = [7]; } + +def M1WriteVSTA : WriteSequence<[WriteVST], 2>; +def M1WriteVSTB : WriteSequence<[WriteVST], 3>; +def M1WriteVSTC : WriteSequence<[WriteVST], 4>; +def M1WriteVSTD : SchedWriteRes<[M1UnitS, + M1UnitFST, + M1UnitFST]> { let Latency = 7; + let ResourceCycles = [7]; } +def M1WriteVSTE : SchedWriteRes<[M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitFST]> { let Latency = 8; + let ResourceCycles = [8]; } +def M1WriteVSTF : SchedWriteRes<[M1UnitNALU, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitFST, + M1UnitFST]> { let Latency = 15; + let ResourceCycles = [15]; } +def M1WriteVSTG : SchedWriteRes<[M1UnitNALU, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitFST, + M1UnitFST]> { let Latency = 16; + let ResourceCycles = [16]; } +def M1WriteVSTH : SchedWriteRes<[M1UnitNALU, + M1UnitS, + M1UnitFST, + M1UnitFST, + M1UnitFST]> { let Latency = 14; + let ResourceCycles = [14]; } +def M1WriteVSTI : SchedWriteRes<[M1UnitNALU, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitFST, + M1UnitFST]> { let Latency = 17; + let ResourceCycles = [17]; } // Branch instructions def : InstRW<[M1WriteB1], (instrs Bcc)>; @@ -360,8 +465,233 @@ def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)[12](v16i8|v8i16|v4i32|v2i64 def : InstRW<[M1WriteNALU1], (instregex "^ZIP[12]v")>; // ASIMD load instructions. 
+def : InstRW<[M1WriteVLDD], (instregex "LD1i(8|16|32)$")>; +def : InstRW<[M1WriteVLDD, + WriteAdr], (instregex "LD1i(8|16|32)_POST$")>; +def : InstRW<[M1WriteVLDE], (instregex "LD1i(64)$")>; +def : InstRW<[M1WriteVLDE, + WriteAdr], (instregex "LD1i(64)_POST$")>; + +def : InstRW<[M1WriteL5], (instregex "LD1Rv(8b|4h|2s)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Rv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteL5], (instregex "LD1Rv(1d)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Rv(1d)_POST$")>; +def : InstRW<[M1WriteL5], (instregex "LD1Rv(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[M1WriteL5], (instregex "LD1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteL5], (instregex "LD1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVLDA], (instregex "LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVLDA], (instregex "LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[M1WriteVLDG], (instregex "LD2i(8|16)$")>; +def : InstRW<[M1WriteVLDG, + WriteAdr], (instregex "LD2i(8|16)_POST$")>; +def : InstRW<[M1WriteVLDG], (instregex "LD2i(32)$")>; +def : InstRW<[M1WriteVLDG, + WriteAdr], (instregex "LD2i(32)_POST$")>; +def : InstRW<[M1WriteVLDH], (instregex "LD2i(64)$")>; +def : InstRW<[M1WriteVLDH, + WriteAdr], (instregex "LD2i(64)_POST$")>; + +def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD2Rv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(1d)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD2Rv(1d)_POST$")>; +def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDF, + WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDF, + WriteAdr], (instregex "LD2Twov(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(2d)$")>; +def : InstRW<[M1WriteVLDF, + WriteAdr], (instregex "LD2Twov(2d)_POST$")>; + +def : InstRW<[M1WriteVLDJ], (instregex "LD3i(8|16)$")>; +def : InstRW<[M1WriteVLDJ, + WriteAdr], (instregex "LD3i(8|16)_POST$")>; +def : InstRW<[M1WriteVLDJ], (instregex "LD3i(32)$")>; +def : InstRW<[M1WriteVLDJ, + WriteAdr], (instregex "LD3i(32)_POST$")>; +def : InstRW<[M1WriteVLDL], (instregex "LD3i(64)$")>; +def : InstRW<[M1WriteVLDL, + WriteAdr], (instregex "LD3i(64)_POST$")>; + +def : InstRW<[M1WriteVLDB], 
(instregex "LD3Rv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD3Rv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(1d)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD3Rv(1d)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD3Rv(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(2d)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD3Rv(2d)_POST$")>; + +def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDI, + WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDI, + WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(2d)$")>; +def : InstRW<[M1WriteVLDI, + WriteAdr], (instregex "LD3Threev(2d)_POST$")>; + +def : InstRW<[M1WriteVLDK], (instregex "LD4i(8|16)$")>; +def : InstRW<[M1WriteVLDK, + WriteAdr], (instregex "LD4i(8|16)_POST$")>; +def : InstRW<[M1WriteVLDK], (instregex "LD4i(32)$")>; +def : InstRW<[M1WriteVLDK, + WriteAdr], (instregex "LD4i(32)_POST$")>; +def : InstRW<[M1WriteVLDM], (instregex "LD4i(64)$")>; +def : InstRW<[M1WriteVLDM, + WriteAdr], (instregex "LD4i(64)_POST$")>; + +def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD4Rv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(1d)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD4Rv(1d)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD4Rv(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(2d)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD4Rv(2d)_POST$")>; + +def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDN, + WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDN, + WriteAdr], (instregex "LD4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(2d)$")>; +def : InstRW<[M1WriteVLDN, + WriteAdr], (instregex "LD4Fourv(2d)_POST$")>; // ASIMD store instructions. 
+def : InstRW<[M1WriteVSTD], (instregex "ST1i(8|16|32)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST1i(8|16|32)_POST$")>; +def : InstRW<[M1WriteVSTD], (instregex "ST1i(64)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST1i(64)_POST$")>; + +def : InstRW<[WriteVST], (instregex "ST1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteVST, + WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[WriteVST], (instregex "ST1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVST, + WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVSTA], (instregex "ST1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVSTA, + WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVSTA], (instregex "ST1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVSTA, + WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVSTB], (instregex "ST1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVSTB, + WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVSTB], (instregex "ST1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVSTB, + WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVSTC], (instregex "ST1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVSTC, + WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVSTC], (instregex "ST1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVSTC, + WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[M1WriteVSTD], (instregex "ST2i(8|16|32)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST2i(8|16|32)_POST$")>; +def : InstRW<[M1WriteVSTD], (instregex "ST2i(64)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST2i(64)_POST$")>; + +def : InstRW<[M1WriteVSTD], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVSTE], (instregex "ST2Twov(16b|8h|4s)$")>; +def : InstRW<[M1WriteVSTE, + WriteAdr], (instregex "ST2Twov(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVSTE], (instregex "ST2Twov(2d)$")>; +def : InstRW<[M1WriteVSTE, + WriteAdr], (instregex "ST2Twov(2d)_POST$")>; + +def : InstRW<[M1WriteVSTH], (instregex "ST3i(8|16)$")>; +def : InstRW<[M1WriteVSTH, + WriteAdr], (instregex "ST3i(8|16)_POST$")>; +def : InstRW<[M1WriteVSTH], (instregex "ST3i(32)$")>; +def : InstRW<[M1WriteVSTH, + WriteAdr], (instregex "ST3i(32)_POST$")>; +def : InstRW<[M1WriteVSTF], (instregex "ST3i(64)$")>; +def : InstRW<[M1WriteVSTF, + WriteAdr], (instregex "ST3i(64)_POST$")>; + +def : InstRW<[M1WriteVSTF], (instregex "ST3Threev(8b|4h|2s)$")>; +def : InstRW<[M1WriteVSTF, + WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVSTG], (instregex "ST3Threev(16b|8h|4s)$")>; +def : InstRW<[M1WriteVSTG, + WriteAdr], (instregex "ST3Threev(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVSTG], (instregex "ST3Threev(2d)$")>; +def : InstRW<[M1WriteVSTG, + WriteAdr], (instregex "ST3Threev(2d)_POST$")>; + +def : InstRW<[M1WriteVSTH], (instregex "ST4i(8|16)$")>; +def : InstRW<[M1WriteVSTH, + WriteAdr], (instregex "ST4i(8|16)_POST$")>; +def : InstRW<[M1WriteVSTH], (instregex "ST4i(32)$")>; +def : InstRW<[M1WriteVSTH, + WriteAdr], (instregex "ST4i(32)_POST$")>; +def : InstRW<[M1WriteVSTF], (instregex "ST4i(64)$")>; +def : InstRW<[M1WriteVSTF, + WriteAdr], (instregex "ST4i(64)_POST$")>; + +def : InstRW<[M1WriteVSTF], (instregex "ST4Fourv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVSTF, + WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>; +def : 
InstRW<[M1WriteVSTI], (instregex "ST4Fourv(16b|8h|4s)$")>; +def : InstRW<[M1WriteVSTI, + WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVSTI], (instregex "ST4Fourv(2d)$")>; +def : InstRW<[M1WriteVSTI, + WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; // Cryptography instructions. def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; } diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index cb3f72a524f5..d4a8cecdb29f 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -256,9 +256,9 @@ namespace { /// AArch64 Code Generator Pass Configuration Options. class AArch64PassConfig : public TargetPassConfig { public: - AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM) + AArch64PassConfig(AArch64TargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) { - if (TM->getOptLevel() != CodeGenOpt::None) + if (TM.getOptLevel() != CodeGenOpt::None) substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); } @@ -317,7 +317,7 @@ TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() { } TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) { - return new AArch64PassConfig(this, PM); + return new AArch64PassConfig(*this, PM); } void AArch64PassConfig::addIRPasses() { diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index f473944cd528..0959014812d8 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -503,40 +503,37 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects(); Info.PrivateSegmentSize = FrameInfo.getStackSize(); - if (!FrameInfo.hasCalls()) { - Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || - MRI.isPhysRegUsed(AMDGPU::VCC_HI); - - // If there are no calls, MachineRegisterInfo can tell us the used register - // count easily. - - MCPhysReg HighestVGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestVGPRReg = Reg; - break; - } - } - MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestSGPRReg = Reg; - break; - } - } + Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || + MRI.isPhysRegUsed(AMDGPU::VCC_HI); - // We found the maximum register index. They start at 0, so add one to get the - // number of registers. - Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 : - TRI.getHWRegIndex(HighestVGPRReg) + 1; - Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 : - TRI.getHWRegIndex(HighestSGPRReg) + 1; + // If there are no calls, MachineRegisterInfo can tell us the used register + // count easily. - return Info; + MCPhysReg HighestVGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestVGPRReg = Reg; + break; + } } - llvm_unreachable("calls not implemented"); + MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestSGPRReg = Reg; + break; + } + } + + // We found the maximum register index. They start at 0, so add one to get the + // number of registers. + Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 
0 : + TRI.getHWRegIndex(HighestVGPRReg) + 1; + Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 : + TRI.getHWRegIndex(HighestSGPRReg) + 1; + + return Info; } void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 48827f463997..596f02ae4a64 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -456,7 +456,7 @@ namespace { class AMDGPUPassConfig : public TargetPassConfig { public: - AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) + AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) { // Exceptions and StackMaps are not supported, so these passes will never do // anything. @@ -487,7 +487,7 @@ public: class R600PassConfig final : public AMDGPUPassConfig { public: - R600PassConfig(TargetMachine *TM, PassManagerBase &PM) + R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) : AMDGPUPassConfig(TM, PM) {} ScheduleDAGInstrs *createMachineScheduler( @@ -503,7 +503,7 @@ public: class GCNPassConfig final : public AMDGPUPassConfig { public: - GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) + GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) : AMDGPUPassConfig(TM, PM) {} GCNTargetMachine &getGCNTargetMachine() const { @@ -682,7 +682,7 @@ void R600PassConfig::addPreEmitPass() { } TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { - return new R600PassConfig(this, PM); + return new R600PassConfig(*this, PM); } //===----------------------------------------------------------------------===// @@ -844,6 +844,6 @@ void GCNPassConfig::addPreEmitPass() { } TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { - return new GCNPassConfig(this, PM); + return new GCNPassConfig(*this, PM); } diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 934bf7f31bab..a3c7c1982d0a 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -69,7 +69,6 @@ public: return -1; return 0; } - }; //===----------------------------------------------------------------------===// @@ -89,6 +88,10 @@ public: TargetPassConfig *createPassConfig(PassManagerBase &PM) override; const R600Subtarget *getSubtargetImpl(const Function &) const override; + + bool isMachineVerifierClean() const override { + return false; + } }; //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index f5541e08e1b7..cc68c971b249 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -161,7 +161,8 @@ public: ImmTyOpSel, ImmTyOpSelHi, ImmTyNegLo, - ImmTyNegHi + ImmTyNegHi, + ImmTySwizzle }; struct TokOp { @@ -474,6 +475,7 @@ public: bool isSWaitCnt() const; bool isHwreg() const; bool isSendMsg() const; + bool isSwizzle() const; bool isSMRDOffset8() const; bool isSMRDOffset20() const; bool isSMRDLiteralOffset() const; @@ -659,6 +661,7 @@ public: case ImmTyOpSelHi: OS << "OpSelHi"; break; case ImmTyNegLo: OS << "NegLo"; break; case ImmTyNegHi: OS << "NegHi"; break; + case ImmTySwizzle: OS << "Swizzle"; break; } } @@ -994,6 +997,12 @@ private: bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; + bool trySkipId(const 
StringRef Id); + bool trySkipToken(const AsmToken::TokenKind Kind); + bool skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg); + bool parseString(StringRef &Val, const StringRef ErrMsg = "expected a string"); + bool parseExpr(int64_t &Imm); + public: OperandMatchResultTy parseOptionalOperand(OperandVector &Operands); @@ -1003,6 +1012,19 @@ public: OperandMatchResultTy parseInterpAttr(OperandVector &Operands); OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); + bool parseSwizzleOperands(const unsigned OpNum, int64_t* Op, + const unsigned MinVal, + const unsigned MaxVal, + const StringRef ErrMsg); + OperandMatchResultTy parseSwizzleOp(OperandVector &Operands); + bool parseSwizzleOffset(int64_t &Imm); + bool parseSwizzleMacro(int64_t &Imm); + bool parseSwizzleQuadPerm(int64_t &Imm); + bool parseSwizzleBitmaskPerm(int64_t &Imm); + bool parseSwizzleBroadcast(int64_t &Imm); + bool parseSwizzleSwap(int64_t &Imm); + bool parseSwizzleReverse(int64_t &Imm); + void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); } void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); } void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); } @@ -2785,7 +2807,13 @@ void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands, OptionalIdx[Op.getImmTy()] = i; } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); + AMDGPUOperand::ImmTy OffsetType = + (Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_si || + Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_vi) ? AMDGPUOperand::ImmTySwizzle : + AMDGPUOperand::ImmTyOffset; + + addOptionalImmOperand(Inst, Operands, OptionalIdx, OffsetType); + if (!IsGdsHardcoded) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS); } @@ -3384,6 +3412,298 @@ bool AMDGPUOperand::isSendMsg() const { } //===----------------------------------------------------------------------===// +// parser helpers +//===----------------------------------------------------------------------===// + +bool +AMDGPUAsmParser::trySkipId(const StringRef Id) { + if (getLexer().getKind() == AsmToken::Identifier && + Parser.getTok().getString() == Id) { + Parser.Lex(); + return true; + } + return false; +} + +bool +AMDGPUAsmParser::trySkipToken(const AsmToken::TokenKind Kind) { + if (getLexer().getKind() == Kind) { + Parser.Lex(); + return true; + } + return false; +} + +bool +AMDGPUAsmParser::skipToken(const AsmToken::TokenKind Kind, + const StringRef ErrMsg) { + if (!trySkipToken(Kind)) { + Error(Parser.getTok().getLoc(), ErrMsg); + return false; + } + return true; +} + +bool +AMDGPUAsmParser::parseExpr(int64_t &Imm) { + return !getParser().parseAbsoluteExpression(Imm); +} + +bool +AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) { + SMLoc S = Parser.getTok().getLoc(); + if (getLexer().getKind() == AsmToken::String) { + Val = Parser.getTok().getStringContents(); + Parser.Lex(); + return true; + } else { + Error(S, ErrMsg); + return false; + } +} + +//===----------------------------------------------------------------------===// +// swizzle +//===----------------------------------------------------------------------===// + +LLVM_READNONE +static unsigned +encodeBitmaskPerm(const unsigned AndMask, + const unsigned OrMask, + const unsigned XorMask) { + using namespace llvm::AMDGPU::Swizzle; + + return BITMASK_PERM_ENC | + (AndMask << 
BITMASK_AND_SHIFT) | + (OrMask << BITMASK_OR_SHIFT) | + (XorMask << BITMASK_XOR_SHIFT); +} + +bool +AMDGPUAsmParser::parseSwizzleOperands(const unsigned OpNum, int64_t* Op, + const unsigned MinVal, + const unsigned MaxVal, + const StringRef ErrMsg) { + for (unsigned i = 0; i < OpNum; ++i) { + if (!skipToken(AsmToken::Comma, "expected a comma")){ + return false; + } + SMLoc ExprLoc = Parser.getTok().getLoc(); + if (!parseExpr(Op[i])) { + return false; + } + if (Op[i] < MinVal || Op[i] > MaxVal) { + Error(ExprLoc, ErrMsg); + return false; + } + } + + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleQuadPerm(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + int64_t Lane[LANE_NUM]; + if (parseSwizzleOperands(LANE_NUM, Lane, 0, LANE_MAX, + "expected a 2-bit lane id")) { + Imm = QUAD_PERM_ENC; + for (auto i = 0; i < LANE_NUM; ++i) { + Imm |= Lane[i] << (LANE_SHIFT * i); + } + return true; + } + return false; +} + +bool +AMDGPUAsmParser::parseSwizzleBroadcast(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + SMLoc S = Parser.getTok().getLoc(); + int64_t GroupSize; + int64_t LaneIdx; + + if (!parseSwizzleOperands(1, &GroupSize, + 2, 32, + "group size must be in the interval [2,32]")) { + return false; + } + if (!isPowerOf2_64(GroupSize)) { + Error(S, "group size must be a power of two"); + return false; + } + if (parseSwizzleOperands(1, &LaneIdx, + 0, GroupSize - 1, + "lane id must be in the interval [0,group size - 1]")) { + Imm = encodeBitmaskPerm(BITMASK_MAX - GroupSize + 1, LaneIdx, 0); + return true; + } + return false; +} + +bool +AMDGPUAsmParser::parseSwizzleReverse(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + SMLoc S = Parser.getTok().getLoc(); + int64_t GroupSize; + + if (!parseSwizzleOperands(1, &GroupSize, + 2, 32, "group size must be in the interval [2,32]")) { + return false; + } + if (!isPowerOf2_64(GroupSize)) { + Error(S, "group size must be a power of two"); + return false; + } + + Imm = encodeBitmaskPerm(BITMASK_MAX, 0, GroupSize - 1); + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleSwap(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + SMLoc S = Parser.getTok().getLoc(); + int64_t GroupSize; + + if (!parseSwizzleOperands(1, &GroupSize, + 1, 16, "group size must be in the interval [1,16]")) { + return false; + } + if (!isPowerOf2_64(GroupSize)) { + Error(S, "group size must be a power of two"); + return false; + } + + Imm = encodeBitmaskPerm(BITMASK_MAX, 0, GroupSize); + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleBitmaskPerm(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + if (!skipToken(AsmToken::Comma, "expected a comma")) { + return false; + } + + StringRef Ctl; + SMLoc StrLoc = Parser.getTok().getLoc(); + if (!parseString(Ctl)) { + return false; + } + if (Ctl.size() != BITMASK_WIDTH) { + Error(StrLoc, "expected a 5-character mask"); + return false; + } + + unsigned AndMask = 0; + unsigned OrMask = 0; + unsigned XorMask = 0; + + for (size_t i = 0; i < Ctl.size(); ++i) { + unsigned Mask = 1 << (BITMASK_WIDTH - 1 - i); + switch(Ctl[i]) { + default: + Error(StrLoc, "invalid mask"); + return false; + case '0': + break; + case '1': + OrMask |= Mask; + break; + case 'p': + AndMask |= Mask; + break; + case 'i': + AndMask |= Mask; + XorMask |= Mask; + break; + } + } + + Imm = encodeBitmaskPerm(AndMask, OrMask, XorMask); + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleOffset(int64_t &Imm) { + + SMLoc OffsetLoc = Parser.getTok().getLoc(); + + if (!parseExpr(Imm)) { + return false; + } + if 
(!isUInt<16>(Imm)) { + Error(OffsetLoc, "expected a 16-bit offset"); + return false; + } + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleMacro(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + if (skipToken(AsmToken::LParen, "expected a left parenthesis")) { + + SMLoc ModeLoc = Parser.getTok().getLoc(); + bool Ok = false; + + if (trySkipId(IdSymbolic[ID_QUAD_PERM])) { + Ok = parseSwizzleQuadPerm(Imm); + } else if (trySkipId(IdSymbolic[ID_BITMASK_PERM])) { + Ok = parseSwizzleBitmaskPerm(Imm); + } else if (trySkipId(IdSymbolic[ID_BROADCAST])) { + Ok = parseSwizzleBroadcast(Imm); + } else if (trySkipId(IdSymbolic[ID_SWAP])) { + Ok = parseSwizzleSwap(Imm); + } else if (trySkipId(IdSymbolic[ID_REVERSE])) { + Ok = parseSwizzleReverse(Imm); + } else { + Error(ModeLoc, "expected a swizzle mode"); + } + + return Ok && skipToken(AsmToken::RParen, "expected a closing parenthesis"); + } + + return false; +} + +OperandMatchResultTy +AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + int64_t Imm = 0; + + if (trySkipId("offset")) { + + bool Ok = false; + if (skipToken(AsmToken::Colon, "expected a colon")) { + if (trySkipId("swizzle")) { + Ok = parseSwizzleMacro(Imm); + } else { + Ok = parseSwizzleOffset(Imm); + } + } + + Operands.push_back(AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTySwizzle)); + + return Ok? MatchOperand_Success : MatchOperand_ParseFail; + } else { + return MatchOperand_NoMatch; + } +} + +bool +AMDGPUOperand::isSwizzle() const { + return isImmTy(ImmTySwizzle); +} + +//===----------------------------------------------------------------------===// // sopp branch targets //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td index 357e18108e7e..fc516c3b39c2 100644 --- a/lib/Target/AMDGPU/DSInstructions.td +++ b/lib/Target/AMDGPU/DSInstructions.td @@ -145,10 +145,10 @@ class DS_1A2D_Off8_RET<string opName, let hasPostISelHook = 1; } -class DS_1A_RET<string opName, RegisterClass rc = VGPR_32> +class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, Operand ofs = offset> : DS_Pseudo<opName, (outs rc:$vdst), - (ins VGPR_32:$addr, offset:$offset, gds:$gds), + (ins VGPR_32:$addr, ofs:$offset, gds:$gds), "$vdst, $addr$offset$gds"> { let has_data0 = 0; @@ -440,7 +440,7 @@ def DS_WRITE_SRC2_B32 : DS_1A<"ds_write_src2_b32">; def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">; let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in { -def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32">; +def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, SwizzleImm>; } let mayStore = 0 in { diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index a817ff3cbaf0..523eea41897e 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -1160,6 +1160,112 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, O << SImm16; // Unknown simm16 code.
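Two hand-worked encodings, using the shift and mask constants from SIDefines.h further below (register operands chosen arbitrarily):

ds_swizzle_b32 v5, v2 offset:swizzle(SWAP,1)
  ; AndMask = BITMASK_MAX = 0x1f, OrMask = 0, XorMask = 1
  ; Imm = 0x1f | (0 << 5) | (1 << 10) = 0x041f
ds_swizzle_b32 v5, v2 offset:swizzle(BROADCAST,8,3)
  ; AndMask = BITMASK_MAX - 8 + 1 = 0x18, OrMask = 3, XorMask = 0
  ; Imm = 0x18 | (3 << 5) = 0x0078

printSwizzle below decodes these immediates back to the same macro forms, so the syntax round-trips through assembly and disassembly.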
} +static void printSwizzleBitmask(const uint16_t AndMask, + const uint16_t OrMask, + const uint16_t XorMask, + raw_ostream &O) { + using namespace llvm::AMDGPU::Swizzle; + + uint16_t Probe0 = ((0 & AndMask) | OrMask) ^ XorMask; + uint16_t Probe1 = ((BITMASK_MASK & AndMask) | OrMask) ^ XorMask; + + O << "\""; + + for (unsigned Mask = 1 << (BITMASK_WIDTH - 1); Mask > 0; Mask >>= 1) { + uint16_t p0 = Probe0 & Mask; + uint16_t p1 = Probe1 & Mask; + + if (p0 == p1) { + if (p0 == 0) { + O << "0"; + } else { + O << "1"; + } + } else { + if (p0 == 0) { + O << "p"; + } else { + O << "i"; + } + } + } + + O << "\""; +} + +void AMDGPUInstPrinter::printSwizzle(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + using namespace llvm::AMDGPU::Swizzle; + + uint16_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm == 0) { + return; + } + + O << " offset:"; + + if ((Imm & QUAD_PERM_ENC_MASK) == QUAD_PERM_ENC) { + + O << "swizzle(" << IdSymbolic[ID_QUAD_PERM]; + for (auto i = 0; i < LANE_NUM; ++i) { + O << ","; + O << formatDec(Imm & LANE_MASK); + Imm >>= LANE_SHIFT; + } + O << ")"; + + } else if ((Imm & BITMASK_PERM_ENC_MASK) == BITMASK_PERM_ENC) { + + uint16_t AndMask = (Imm >> BITMASK_AND_SHIFT) & BITMASK_MASK; + uint16_t OrMask = (Imm >> BITMASK_OR_SHIFT) & BITMASK_MASK; + uint16_t XorMask = (Imm >> BITMASK_XOR_SHIFT) & BITMASK_MASK; + + if (AndMask == BITMASK_MAX && + OrMask == 0 && + countPopulation(XorMask) == 1) { + + O << "swizzle(" << IdSymbolic[ID_SWAP]; + O << ","; + O << formatDec(XorMask); + O << ")"; + + } else if (AndMask == BITMASK_MAX && + OrMask == 0 && XorMask > 0 && + isPowerOf2_64(XorMask + 1)) { + + O << "swizzle(" << IdSymbolic[ID_REVERSE]; + O << ","; + O << formatDec(XorMask + 1); + O << ")"; + + } else { + + uint16_t GroupSize = BITMASK_MAX - AndMask + 1; + if (GroupSize > 1 && + isPowerOf2_64(GroupSize) && + OrMask < GroupSize && + XorMask == 0) { + + O << "swizzle(" << IdSymbolic[ID_BROADCAST]; + O << ","; + O << formatDec(GroupSize); + O << ","; + O << formatDec(OrMask); + O << ")"; + + } else { + O << "swizzle(" << IdSymbolic[ID_BITMASK_PERM]; + O << ","; + printSwizzleBitmask(AndMask, OrMask, XorMask, O); + O << ")"; + } + } + } else { + printU16ImmDecOperand(MI, OpNo, O); + } +} + void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index c0b8e5c51089..c8094c4b840a 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -193,6 +193,8 @@ private: raw_ostream &O); void printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printSwizzle(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printWaitFlag(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index 80967edee0ab..5cd90323ff67 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -281,6 +281,46 @@ enum WidthMinusOne { // WidthMinusOne, (5) [15:11] } // namespace Hwreg +namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32. 
+ +enum Id { // id of symbolic names + ID_QUAD_PERM = 0, + ID_BITMASK_PERM, + ID_SWAP, + ID_REVERSE, + ID_BROADCAST +}; + +enum EncBits { + + // swizzle mode encodings + + QUAD_PERM_ENC = 0x8000, + QUAD_PERM_ENC_MASK = 0xFF00, + + BITMASK_PERM_ENC = 0x0000, + BITMASK_PERM_ENC_MASK = 0x8000, + + // QUAD_PERM encodings + + LANE_MASK = 0x3, + LANE_MAX = LANE_MASK, + LANE_SHIFT = 2, + LANE_NUM = 4, + + // BITMASK_PERM encodings + + BITMASK_MASK = 0x1F, + BITMASK_MAX = BITMASK_MASK, + BITMASK_WIDTH = 5, + + BITMASK_AND_SHIFT = 0, + BITMASK_OR_SHIFT = 5, + BITMASK_XOR_SHIFT = 10 +}; + +} // namespace Swizzle + namespace SDWA { enum SdwaSel { diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index b5e3ce3dfe3e..e22166d03e9a 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -826,7 +826,8 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( // NOTE: this could be improved with knowledge of all call sites or // with knowledge of the called routines. if (MI.getOpcode() == AMDGPU::RETURN || - MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { + MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || + MI.getOpcode() == AMDGPU::S_SETPC_B64_return) { for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; T = (enum InstCounterType)(T + 1)) { if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) { @@ -1149,8 +1150,10 @@ void SIInsertWaitcnts::updateEventWaitCntAfter( // instruction, update the upper-bound of the appropriate counter's // bracket and the destination operand scores. // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere. - if (TII->isDS(Inst) && (Inst.mayLoad() || Inst.mayStore())) { - if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) { + uint64_t TSFlags = Inst.getDesc().TSFlags; + if (TII->isDS(Inst) && (TSFlags & SIInstrFlags::LGKM_CNT)) { + if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds) && + TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) { ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst); ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst); } else { @@ -1183,7 +1186,7 @@ void SIInsertWaitcnts::updateEventWaitCntAfter( Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) { ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() && - (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()))) { + (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) { ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst); } } else if (TII->isSMRD(Inst)) { @@ -1715,6 +1718,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); MLI = &getAnalysis<MachineLoopInfo>(); IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits()); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); AMDGPUASI = ST->getAMDGPUAS(); HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV); @@ -1859,5 +1863,19 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { } } + if (!MFI->isEntryFunction()) { + // Wait for any outstanding memory operations that the input registers may + // depend on. We can't track them and it's better to do the wait after the + // costly call sequence. + + // TODO: Could insert earlier and schedule more liberally with operations + // that only use caller preserved registers.
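The BuildMI call that follows should make every non-entry (callable) function begin with a full wait, roughly as in this sketch of the emitted assembly (label and body assumed):

callee:                                      ; reached via a call
    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)  ; the inserted S_WAITCNT with imm 0
    ; ... body may now safely read the input registers ...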
+ MachineBasicBlock &EntryBB = MF.front(); + BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + + Modified = true; + } + return Modified; } diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index c5287c7f64ba..445bf79a7814 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -383,6 +383,14 @@ def SendMsgMatchClass : AsmOperandClass { let RenderMethod = "addImmOperands"; } +def SwizzleMatchClass : AsmOperandClass { + let Name = "Swizzle"; + let PredicateMethod = "isSwizzle"; + let ParserMethod = "parseSwizzleOp"; + let RenderMethod = "addImmOperands"; + let IsOptional = 1; +} + def ExpTgtMatchClass : AsmOperandClass { let Name = "ExpTgt"; let PredicateMethod = "isExpTgt"; @@ -395,6 +403,11 @@ def SendMsgImm : Operand<i32> { let ParserMatchClass = SendMsgMatchClass; } +def SwizzleImm : Operand<i16> { + let PrintMethod = "printSwizzle"; + let ParserMatchClass = SwizzleMatchClass; +} + def SWaitMatchClass : AsmOperandClass { let Name = "SWaitCnt"; let RenderMethod = "addImmOperands"; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index b91cdddc5520..a648c178101a 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -66,6 +66,12 @@ public: const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; + // Stack access is very expensive. CSRs are also the high registers, and we + // want to minimize the number of used registers. + unsigned getCSRFirstUseCost() const override { + return 100; + } + unsigned getFrameRegister(const MachineFunction &MF) const override; bool requiresRegisterScavenging(const MachineFunction &Fn) const override; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index b6868de6a74e..03b11ae80500 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -65,5 +65,18 @@ const char* const IdSymbolic[] = { }; } // namespace Hwreg + +namespace Swizzle { + +// This must be in sync with llvm::AMDGPU::Swizzle::Id enum members, see SIDefines.h. +const char* const IdSymbolic[] = { + "QUAD_PERM", + "BITMASK_PERM", + "SWAP", + "REVERSE", + "BROADCAST", +}; + +} // namespace Swizzle } // namespace AMDGPU } // namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h index b2dc2c0e364c..ebb2be22b487 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h @@ -25,6 +25,12 @@ namespace Hwreg { // Symbolic names for the hwreg(...) syntax. extern const char* const IdSymbolic[]; } // namespace Hwreg + +namespace Swizzle { // Symbolic names for the swizzle(...) syntax. + +extern const char* const IdSymbolic[]; + +} // namespace Swizzle } // namespace AMDGPU } // namespace llvm diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 90baabcdb652..ec49f0d37af4 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -757,14 +757,9 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, MI.eraseFromParent(); } -static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) { - for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) - MBB->addLiveIn(*I); -} - /// Expand a CMP_SWAP pseudo-inst to an ldrex/strex loop as simply as -/// possible. 
This only gets used at -O0 so we don't care about efficiency of the -/// generated code. +/// possible. This only gets used at -O0 so we don't care about efficiency of +/// the generated code. bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdrexOp, unsigned StrexOp, @@ -773,16 +768,15 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, bool IsThumb = STI->isThumb(); MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); - MachineOperand &Dest = MI.getOperand(0); + const MachineOperand &Dest = MI.getOperand(0); unsigned StatusReg = MI.getOperand(1).getReg(); - MachineOperand &Addr = MI.getOperand(2); - MachineOperand &Desired = MI.getOperand(3); - MachineOperand &New = MI.getOperand(4); - - LivePhysRegs LiveRegs(TII->getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - for (auto I = std::prev(MBB.end()); I != MBBI; --I) - LiveRegs.stepBackward(*I); + bool StatusDead = MI.getOperand(1).isDead(); + // Duplicating undef operands into 2 instructions does not guarantee the same + // value on both; However undef should be replaced by xzr anyway. + assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); + unsigned AddrReg = MI.getOperand(2).getReg(); + unsigned DesiredReg = MI.getOperand(3).getReg(); + unsigned NewReg = MI.getOperand(4).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -795,25 +789,35 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, if (UxtOp) { MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, DL, TII->get(UxtOp), Desired.getReg()) - .addReg(Desired.getReg(), RegState::Kill); + BuildMI(MBB, MBBI, DL, TII->get(UxtOp), DesiredReg) + .addReg(DesiredReg, RegState::Kill); if (!IsThumb) MIB.addImm(0); MIB.add(predOps(ARMCC::AL)); } // .Lloadcmp: + // mov wStatus, #0 // ldrex rDest, [rAddr] // cmp rDest, rDesired // bne .Ldone - LoadCmpBB->addLiveIn(Addr.getReg()); - LoadCmpBB->addLiveIn(Dest.getReg()); - LoadCmpBB->addLiveIn(Desired.getReg()); - addPostLoopLiveIns(LoadCmpBB, LiveRegs); + if (!StatusDead) { + if (IsThumb) { + BuildMI(LoadCmpBB, DL, TII->get(ARM::tMOVi8), StatusReg) + .addDef(ARM::CPSR, RegState::Dead) + .addImm(0) + .add(predOps(ARMCC::AL)); + } else { + BuildMI(LoadCmpBB, DL, TII->get(ARM::MOVi), StatusReg) + .addImm(0) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + } + } MachineInstrBuilder MIB; MIB = BuildMI(LoadCmpBB, DL, TII->get(LdrexOp), Dest.getReg()); - MIB.addReg(Addr.getReg()); + MIB.addReg(AddrReg); if (LdrexOp == ARM::t2LDREX) MIB.addImm(0); // a 32-bit Thumb ldrex (only) allows an offset. MIB.add(predOps(ARMCC::AL)); @@ -821,7 +825,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr; BuildMI(LoadCmpBB, DL, TII->get(CMPrr)) .addReg(Dest.getReg(), getKillRegState(Dest.isDead())) - .add(Desired) + .addReg(DesiredReg) .add(predOps(ARMCC::AL)); unsigned Bcc = IsThumb ? 
ARM::tBcc : ARM::Bcc; BuildMI(LoadCmpBB, DL, TII->get(Bcc)) @@ -835,21 +839,16 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, // strex rStatus, rNew, [rAddr] // cmp rStatus, #0 // bne .Lloadcmp - StoreBB->addLiveIn(Addr.getReg()); - StoreBB->addLiveIn(New.getReg()); - addPostLoopLiveIns(StoreBB, LiveRegs); - - - MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), StatusReg); - MIB.add(New); - MIB.add(Addr); + MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), StatusReg) + .addReg(NewReg) + .addReg(AddrReg); if (StrexOp == ARM::t2STREX) MIB.addImm(0); // a 32-bit Thumb strex (only) allows an offset. MIB.add(predOps(ARMCC::AL)); unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri; BuildMI(StoreBB, DL, TII->get(CMPri)) - .addReg(StatusReg, RegState::Kill) + .addReg(StatusReg, getKillRegState(StatusDead)) .addImm(0) .add(predOps(ARMCC::AL)); BuildMI(StoreBB, DL, TII->get(Bcc)) @@ -861,12 +860,24 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); DoneBB->transferSuccessors(&MBB); - addPostLoopLiveIns(DoneBB, LiveRegs); MBB.addSuccessor(LoadCmpBB); NextMBBI = MBB.end(); MI.eraseFromParent(); + + // Recompute livein lists. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + LivePhysRegs LiveRegs; + computeLiveIns(LiveRegs, MRI, *DoneBB); + computeLiveIns(LiveRegs, MRI, *StoreBB); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + // Do an extra pass around the loop to get loop carried registers right. + StoreBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *StoreBB); + LoadCmpBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + return true; } @@ -894,19 +905,19 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, DebugLoc DL = MI.getDebugLoc(); MachineOperand &Dest = MI.getOperand(0); unsigned StatusReg = MI.getOperand(1).getReg(); - MachineOperand &Addr = MI.getOperand(2); - MachineOperand &Desired = MI.getOperand(3); - MachineOperand &New = MI.getOperand(4); + bool StatusDead = MI.getOperand(1).isDead(); + // Duplicating undef operands into 2 instructions does not guarantee the same + // value on both; However undef should be replaced by xzr anyway. + assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); + unsigned AddrReg = MI.getOperand(2).getReg(); + unsigned DesiredReg = MI.getOperand(3).getReg(); + MachineOperand New = MI.getOperand(4); + New.setIsKill(false); unsigned DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0); unsigned DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1); - unsigned DesiredLo = TRI->getSubReg(Desired.getReg(), ARM::gsub_0); - unsigned DesiredHi = TRI->getSubReg(Desired.getReg(), ARM::gsub_1); - - LivePhysRegs LiveRegs(TII->getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - for (auto I = std::prev(MBB.end()); I != MBBI; --I) - LiveRegs.stepBackward(*I); + unsigned DesiredLo = TRI->getSubReg(DesiredReg, ARM::gsub_0); + unsigned DesiredHi = TRI->getSubReg(DesiredReg, ARM::gsub_1); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -922,26 +933,21 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, // cmp rDestLo, rDesiredLo // sbcs rStatus<dead>, rDestHi, rDesiredHi // bne .Ldone - LoadCmpBB->addLiveIn(Addr.getReg()); - LoadCmpBB->addLiveIn(Dest.getReg()); - LoadCmpBB->addLiveIn(Desired.getReg()); - addPostLoopLiveIns(LoadCmpBB, LiveRegs); - unsigned LDREXD = IsThumb ? 
ARM::t2LDREXD : ARM::LDREXD; MachineInstrBuilder MIB; MIB = BuildMI(LoadCmpBB, DL, TII->get(LDREXD)); addExclusiveRegPair(MIB, Dest, RegState::Define, IsThumb, TRI); - MIB.addReg(Addr.getReg()).add(predOps(ARMCC::AL)); + MIB.addReg(AddrReg).add(predOps(ARMCC::AL)); unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr; BuildMI(LoadCmpBB, DL, TII->get(CMPrr)) .addReg(DestLo, getKillRegState(Dest.isDead())) - .addReg(DesiredLo, getKillRegState(Desired.isDead())) + .addReg(DesiredLo) .add(predOps(ARMCC::AL)); BuildMI(LoadCmpBB, DL, TII->get(CMPrr)) .addReg(DestHi, getKillRegState(Dest.isDead())) - .addReg(DesiredHi, getKillRegState(Desired.isDead())) + .addReg(DesiredHi) .addImm(ARMCC::EQ).addReg(ARM::CPSR, RegState::Kill); unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc; @@ -956,18 +962,14 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, // strexd rStatus, rNewLo, rNewHi, [rAddr] // cmp rStatus, #0 // bne .Lloadcmp - StoreBB->addLiveIn(Addr.getReg()); - StoreBB->addLiveIn(New.getReg()); - addPostLoopLiveIns(StoreBB, LiveRegs); - unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD; MIB = BuildMI(StoreBB, DL, TII->get(STREXD), StatusReg); addExclusiveRegPair(MIB, New, 0, IsThumb, TRI); - MIB.add(Addr).add(predOps(ARMCC::AL)); + MIB.addReg(AddrReg).add(predOps(ARMCC::AL)); unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri; BuildMI(StoreBB, DL, TII->get(CMPri)) - .addReg(StatusReg, RegState::Kill) + .addReg(StatusReg, getKillRegState(StatusDead)) .addImm(0) .add(predOps(ARMCC::AL)); BuildMI(StoreBB, DL, TII->get(Bcc)) @@ -979,12 +981,24 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); DoneBB->transferSuccessors(&MBB); - addPostLoopLiveIns(DoneBB, LiveRegs); MBB.addSuccessor(LoadCmpBB); NextMBBI = MBB.end(); MI.eraseFromParent(); + + // Recompute livein lists. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + LivePhysRegs LiveRegs; + computeLiveIns(LiveRegs, MRI, *DoneBB); + computeLiveIns(LiveRegs, MRI, *StoreBB); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + // Do an extra pass around the loop to get loop carried registers right. + StoreBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *StoreBB); + LoadCmpBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + return true; } diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 4f7a0ab4e220..c2b2502843c0 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -968,8 +968,9 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs) continue; - bool isLiveIn = MF.getRegInfo().isLiveIn(Reg); - if (!isLiveIn) + const MachineRegisterInfo &MRI = MF.getRegInfo(); + bool isLiveIn = MRI.isLiveIn(Reg); + if (!isLiveIn && !MRI.isReserved(Reg)) MBB.addLiveIn(Reg); // If NoGap is true, push consecutive registers and then leave the rest // for other instructions. e.g. diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index bee83dfb6f63..423f97ccacd6 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -1413,7 +1413,8 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd), // Thumb-1 doesn't have the TBB or TBH instructions, but we can synthesize them // and make use of the same compressed jump table format as Thumb-2. 
-let Size = 2 in { +let Size = 2, isBranch = 1, isTerminator = 1, isBarrier = 1, + isIndirectBranch = 1 in { def tTBB_JT : tPseudoInst<(outs), (ins tGPR:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>, Sched<[WriteBr]>; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index bf3d820e7b7d..45471a4e95b3 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -3494,7 +3494,8 @@ def t2B : T2I<(outs), (ins thumb_br_target:$target), IIC_Br, let AsmMatchConverter = "cvtThumbBranches"; } -let Size = 4, isNotDuplicable = 1, isIndirectBranch = 1 in { +let Size = 4, isNotDuplicable = 1, isBranch = 1, isTerminator = 1, + isBarrier = 1, isIndirectBranch = 1 in { // available in both v8-M.Baseline and Thumb2 targets def t2BR_JT : t2basePseudoInst<(outs), diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index c4f23c66e4ea..f5e4043882ff 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -382,7 +382,7 @@ namespace { /// ARM Code Generator Pass Configuration Options. class ARMPassConfig : public TargetPassConfig { public: - ARMPassConfig(ARMBaseTargetMachine *TM, PassManagerBase &PM) + ARMPassConfig(ARMBaseTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} ARMBaseTargetMachine &getARMTargetMachine() const { @@ -419,7 +419,7 @@ INITIALIZE_PASS(ARMExecutionDepsFix, "arm-execution-deps-fix", "ARM Execution Dependency Fix", false, false) TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) { - return new ARMPassConfig(this, PM); + return new ARMPassConfig(*this, PM); } void ARMPassConfig::addIRPasses() { diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index e5eb27114c72..2fcee73228fe 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -60,6 +60,10 @@ public: TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + + bool isMachineVerifierClean() const override { + return false; + } }; /// ARM/Thumb little endian target machine. diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 40bf545e8322..b0d1d3fb9ef0 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -729,6 +729,15 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, // linker can handle it. GNU AS produces an error in this case. if (Sym->isExternal() || Value >= 0x400004) IsResolved = false; + // When an ARM function is called from a Thumb function, produce a + // relocation so the linker will use the correct branch instruction for ELF + // binaries. 
+ if (Sym->isELF()) { + unsigned Type = dyn_cast<MCSymbolELF>(Sym)->getType(); + if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC) && + !Asm.isThumbFunc(Sym)) + IsResolved = false; + } } // We must always generate a relocation for BL/BLX instructions if we have // a symbol to reference, as the linker relies on knowing the destination diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index f917c35b9ceb..f10427e2ed57 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -698,13 +698,14 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, CopyRegs.insert(ArgReg); // Push the low registers and lr + const MachineRegisterInfo &MRI = MF.getRegInfo(); if (!LoRegsToSave.empty()) { MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL)); for (unsigned Reg : {ARM::R4, ARM::R5, ARM::R6, ARM::R7, ARM::LR}) { if (LoRegsToSave.count(Reg)) { - bool isKill = !MF.getRegInfo().isLiveIn(Reg); - if (isKill) + bool isKill = !MRI.isLiveIn(Reg); + if (isKill && !MRI.isReserved(Reg)) MBB.addLiveIn(Reg); MIB.addReg(Reg, getKillRegState(isKill)); @@ -746,8 +747,8 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, SmallVector<unsigned, 4> RegsToPush; while (HiRegToSave != AllHighRegsEnd && CopyReg != AllCopyRegsEnd) { if (HiRegsToSave.count(*HiRegToSave)) { - bool isKill = !MF.getRegInfo().isLiveIn(*HiRegToSave); - if (isKill) + bool isKill = !MRI.isLiveIn(*HiRegToSave); + if (isKill && !MRI.isReserved(*HiRegToSave)) MBB.addLiveIn(*HiRegToSave); // Emit a MOV from the high reg to the low reg. diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp index ef9c00e4b784..7d3faac1dcc2 100644 --- a/lib/Target/AVR/AVRISelLowering.cpp +++ b/lib/Target/AVR/AVRISelLowering.cpp @@ -1500,9 +1500,9 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI, unsigned DstReg = MI.getOperand(0).getReg(); // BB: - // cp 0, N + // cpi N, 0 // breq RemBB - BuildMI(BB, dl, TII.get(AVR::CPRdRr)).addReg(ShiftAmtSrcReg).addReg(AVR::R0); + BuildMI(BB, dl, TII.get(AVR::CPIRdK)).addReg(ShiftAmtSrcReg).addImm(0); BuildMI(BB, dl, TII.get(AVR::BREQk)).addMBB(RemBB); // LoopBB: diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td index f10ca394f36c..5dd8b2c27b21 100644 --- a/lib/Target/AVR/AVRInstrInfo.td +++ b/lib/Target/AVR/AVRInstrInfo.td @@ -904,7 +904,7 @@ let Defs = [SREG] in // Compares a register with an 8 bit immediate. def CPIRdK : FRdK<0b0011, (outs), - (ins GPR8:$rd, imm_ldi8:$k), + (ins LD8:$rd, imm_ldi8:$k), "cpi\t$rd, $k", [(AVRcmp i8:$rd, imm:$k), (implicit SREG)]>; } diff --git a/lib/Target/AVR/AVRTargetMachine.cpp b/lib/Target/AVR/AVRTargetMachine.cpp index fb3262916b4f..2ab0b1080c6a 100644 --- a/lib/Target/AVR/AVRTargetMachine.cpp +++ b/lib/Target/AVR/AVRTargetMachine.cpp @@ -57,7 +57,7 @@ namespace { /// AVR Code Generator Pass Configuration Options. 
class AVRPassConfig : public TargetPassConfig { public: - AVRPassConfig(AVRTargetMachine *TM, PassManagerBase &PM) + AVRPassConfig(AVRTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} AVRTargetMachine &getAVRTargetMachine() const { @@ -71,7 +71,7 @@ public: } // namespace TargetPassConfig *AVRTargetMachine::createPassConfig(PassManagerBase &PM) { - return new AVRPassConfig(this, PM); + return new AVRPassConfig(*this, PM); } extern "C" void LLVMInitializeAVRTarget() { diff --git a/lib/Target/AVR/AVRTargetMachine.h b/lib/Target/AVR/AVRTargetMachine.h index 10345193d14a..795e94e6af03 100644 --- a/lib/Target/AVR/AVRTargetMachine.h +++ b/lib/Target/AVR/AVRTargetMachine.h @@ -41,6 +41,10 @@ public: TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + bool isMachineVerifierClean() const override { + return false; + } + private: std::unique_ptr<TargetLoweringObjectFile> TLOF; AVRSubtarget SubTarget; diff --git a/lib/Target/BPF/BPFTargetMachine.cpp b/lib/Target/BPF/BPFTargetMachine.cpp index 897695633e46..cf8e73540904 100644 --- a/lib/Target/BPF/BPFTargetMachine.cpp +++ b/lib/Target/BPF/BPFTargetMachine.cpp @@ -58,7 +58,7 @@ namespace { // BPF Code Generator Pass Configuration Options. class BPFPassConfig : public TargetPassConfig { public: - BPFPassConfig(BPFTargetMachine *TM, PassManagerBase &PM) + BPFPassConfig(BPFTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} BPFTargetMachine &getBPFTargetMachine() const { @@ -70,7 +70,7 @@ public: } TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) { - return new BPFPassConfig(this, PM); + return new BPFPassConfig(*this, PM); } // Install an instruction selector pass using diff --git a/lib/Target/BPF/CMakeLists.txt b/lib/Target/BPF/CMakeLists.txt index e2654b0465df..4918653ff19d 100644 --- a/lib/Target/BPF/CMakeLists.txt +++ b/lib/Target/BPF/CMakeLists.txt @@ -4,7 +4,7 @@ tablegen(LLVM BPFGenRegisterInfo.inc -gen-register-info) tablegen(LLVM BPFGenInstrInfo.inc -gen-instr-info) tablegen(LLVM BPFGenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM BPFGenAsmWriter.inc -gen-asm-writer) -tablegen(LLVM X86GenAsmMatcher.inc -gen-asm-matcher) +tablegen(LLVM BPFGenAsmMatcher.inc -gen-asm-matcher) tablegen(LLVM BPFGenDAGISel.inc -gen-dag-isel) tablegen(LLVM BPFGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM BPFGenCallingConv.inc -gen-callingconv) diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 8e10c521a77d..e4434136bf86 100644 --- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -71,6 +71,9 @@ public: return true; } + bool ComplexPatternFuncMutatesDAG() const override { + return true; + } void PreprocessISelDAG() override; void EmitFunctionEntryCode() override; @@ -81,6 +84,7 @@ public: inline bool SelectAddrGP(SDValue &N, SDValue &R); bool SelectGlobalAddress(SDValue &N, SDValue &R, bool UseGP); bool SelectAddrFI(SDValue &N, SDValue &R); + bool DetectUseSxtw(SDValue &N, SDValue &R); StringRef getPassName() const override { return "Hexagon DAG->DAG Pattern Instruction Selection"; @@ -106,7 +110,6 @@ public: void SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl); void SelectStore(SDNode *N); void SelectSHL(SDNode *N); - void SelectMul(SDNode *N); void SelectZeroExtend(SDNode *N); void SelectIntrinsicWChain(SDNode *N); void SelectIntrinsicWOChain(SDNode *N); @@ -118,7 +121,7 @@ public: #include "HexagonGenDAGISel.inc" private: - bool isValueExtension(const SDValue &Val, 
unsigned FromBits, SDValue &Src); + bool keepsLowBits(const SDValue &Val, unsigned NumBits, SDValue &Src); bool isOrEquivalentToAdd(const SDNode *N) const; bool isAlignedMemNode(const MemSDNode *N) const; bool isPositiveHalfWord(const SDNode *N) const; @@ -597,90 +600,6 @@ void HexagonDAGToDAGISel::SelectStore(SDNode *N) { SelectCode(ST); } -void HexagonDAGToDAGISel::SelectMul(SDNode *N) { - SDLoc dl(N); - - // %conv.i = sext i32 %tmp1 to i64 - // %conv2.i = sext i32 %add to i64 - // %mul.i = mul nsw i64 %conv2.i, %conv.i - // - // --- match with the following --- - // - // %mul.i = mpy (%tmp1, %add) - // - - if (N->getValueType(0) == MVT::i64) { - // Shifting a i64 signed multiply. - SDValue MulOp0 = N->getOperand(0); - SDValue MulOp1 = N->getOperand(1); - - SDValue OP0; - SDValue OP1; - - // Handle sign_extend and sextload. - if (MulOp0.getOpcode() == ISD::SIGN_EXTEND) { - SDValue Sext0 = MulOp0.getOperand(0); - if (Sext0.getNode()->getValueType(0) != MVT::i32) { - SelectCode(N); - return; - } - OP0 = Sext0; - } else if (MulOp0.getOpcode() == ISD::LOAD) { - LoadSDNode *LD = cast<LoadSDNode>(MulOp0.getNode()); - if (LD->getMemoryVT() != MVT::i32 || - LD->getExtensionType() != ISD::SEXTLOAD || - LD->getAddressingMode() != ISD::UNINDEXED) { - SelectCode(N); - return; - } - SDValue Chain = LD->getChain(); - SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32); - OP0 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32, - MVT::Other, - LD->getBasePtr(), TargetConst0, - Chain), 0); - } else { - SelectCode(N); - return; - } - - // Same goes for the second operand. - if (MulOp1.getOpcode() == ISD::SIGN_EXTEND) { - SDValue Sext1 = MulOp1.getOperand(0); - if (Sext1.getNode()->getValueType(0) != MVT::i32) { - SelectCode(N); - return; - } - OP1 = Sext1; - } else if (MulOp1.getOpcode() == ISD::LOAD) { - LoadSDNode *LD = cast<LoadSDNode>(MulOp1.getNode()); - if (LD->getMemoryVT() != MVT::i32 || - LD->getExtensionType() != ISD::SEXTLOAD || - LD->getAddressingMode() != ISD::UNINDEXED) { - SelectCode(N); - return; - } - SDValue Chain = LD->getChain(); - SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32); - OP1 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32, - MVT::Other, - LD->getBasePtr(), TargetConst0, - Chain), 0); - } else { - SelectCode(N); - return; - } - - // Generate a mpy instruction. - SDNode *Result = CurDAG->getMachineNode(Hexagon::M2_dpmpyss_s0, dl, - MVT::i64, OP0, OP1); - ReplaceNode(N, Result); - return; - } - - SelectCode(N); -} - void HexagonDAGToDAGISel::SelectSHL(SDNode *N) { SDLoc dl(N); SDValue Shl_0 = N->getOperand(0); @@ -843,7 +762,7 @@ void HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) { SDValue V = N->getOperand(1); SDValue U; - if (isValueExtension(V, Bits, U)) { + if (keepsLowBits(V, Bits, U)) { SDValue R = CurDAG->getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), N->getOperand(0), U); ReplaceNode(N, R.getNode()); @@ -949,7 +868,6 @@ void HexagonDAGToDAGISel::Select(SDNode *N) { case ISD::SHL: return SelectSHL(N); case ISD::LOAD: return SelectLoad(N); case ISD::STORE: return SelectStore(N); - case ISD::MUL: return SelectMul(N); case ISD::ZERO_EXTEND: return SelectZeroExtend(N); case ISD::INTRINSIC_W_CHAIN: return SelectIntrinsicWChain(N); case ISD::INTRINSIC_WO_CHAIN: return SelectIntrinsicWOChain(N); @@ -1327,7 +1245,7 @@ void HexagonDAGToDAGISel::EmitFunctionEntryCode() { } // Match a frame index that can be used in an addressing mode. 
-bool HexagonDAGToDAGISel::SelectAddrFI(SDValue& N, SDValue &R) { +bool HexagonDAGToDAGISel::SelectAddrFI(SDValue &N, SDValue &R) { if (N.getOpcode() != ISD::FrameIndex) return false; auto &HFI = *HST->getFrameLowering(); @@ -1388,16 +1306,83 @@ bool HexagonDAGToDAGISel::SelectGlobalAddress(SDValue &N, SDValue &R, return false; } -bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val, - unsigned FromBits, SDValue &Src) { +bool HexagonDAGToDAGISel::DetectUseSxtw(SDValue &N, SDValue &R) { + // This (complex pattern) function is meant to detect a sign-extension + // i32->i64 on a per-operand basis. This would allow writing single + // patterns that would cover a number of combinations of different ways + // a sign-extension could be written. For example: + // (mul (DetectUseSxtw x) (DetectUseSxtw y)) -> (M2_dpmpyss_s0 x y) + // could match either one of these: + // (mul (sext x) (sext_inreg y)) + // (mul (sext-load *p) (sext_inreg y)) + // (mul (sext_inreg x) (sext y)) + // etc. + // + // The returned value will have type i64 and its low word will + // contain the value being extended. The high bits are not specified. + // The returned type is i64 because the original type of N was i64, + // but the users of this function should only use the low-word of the + // result, e.g. + // (mul sxtw:x, sxtw:y) -> (M2_dpmpyss_s0 (LoReg sxtw:x), (LoReg sxtw:y)) + + if (N.getValueType() != MVT::i64) + return false; + EVT SrcVT; + unsigned Opc = N.getOpcode(); + switch (Opc) { + case ISD::SIGN_EXTEND: + case ISD::SIGN_EXTEND_INREG: { + // sext_inreg has the source type as a separate operand. + EVT T = Opc == ISD::SIGN_EXTEND + ? N.getOperand(0).getValueType() + : cast<VTSDNode>(N.getOperand(1))->getVT(); + if (T.getSizeInBits() != 32) + return false; + R = N.getOperand(0); + break; + } + case ISD::LOAD: { + LoadSDNode *L = cast<LoadSDNode>(N); + if (L->getExtensionType() != ISD::SEXTLOAD) + return false; + // All extending loads extend to i32, so even if the value in + // memory is shorter than 32 bits, it will be i32 after the load. + if (L->getMemoryVT().getSizeInBits() > 32) + return false; + R = N; + break; + } + default: + return false; + } + EVT RT = R.getValueType(); + if (RT == MVT::i64) + return true; + assert(RT == MVT::i32); + // This is only to produce a value of type i64. Do not rely on the + // high bits produced by this.
+ const SDLoc &dl(N); + SDValue Ops[] = { + CurDAG->getTargetConstant(Hexagon::DoubleRegsRegClassID, dl, MVT::i32), + R, CurDAG->getTargetConstant(Hexagon::isub_hi, dl, MVT::i32), + R, CurDAG->getTargetConstant(Hexagon::isub_lo, dl, MVT::i32) + }; + SDNode *T = CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, + MVT::i64, Ops); + R = SDValue(T, 0); + return true; +} + +bool HexagonDAGToDAGISel::keepsLowBits(const SDValue &Val, unsigned NumBits, + SDValue &Src) { unsigned Opc = Val.getOpcode(); switch (Opc) { case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: { - SDValue const &Op0 = Val.getOperand(0); + const SDValue &Op0 = Val.getOperand(0); EVT T = Op0.getValueType(); - if (T.isInteger() && T.getSizeInBits() == FromBits) { + if (T.isInteger() && T.getSizeInBits() == NumBits) { Src = Op0; return true; } @@ -1408,23 +1393,23 @@ bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val, case ISD::AssertZext: if (Val.getOperand(0).getValueType().isInteger()) { VTSDNode *T = cast<VTSDNode>(Val.getOperand(1)); - if (T->getVT().getSizeInBits() == FromBits) { + if (T->getVT().getSizeInBits() == NumBits) { Src = Val.getOperand(0); return true; } } break; case ISD::AND: { - // Check if this is an AND with "FromBits" of lower bits set to 1. - uint64_t FromMask = (1 << FromBits) - 1; + // Check if this is an AND with NumBits of lower bits set to 1. + uint64_t Mask = (1 << NumBits) - 1; if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(0))) { - if (C->getZExtValue() == FromMask) { + if (C->getZExtValue() == Mask) { Src = Val.getOperand(1); return true; } } if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(1))) { - if (C->getZExtValue() == FromMask) { + if (C->getZExtValue() == Mask) { Src = Val.getOperand(0); return true; } @@ -1433,16 +1418,16 @@ bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val, } case ISD::OR: case ISD::XOR: { - // OR/XOR with the lower "FromBits" bits set to 0. - uint64_t FromMask = (1 << FromBits) - 1; + // OR/XOR with the lower NumBits bits set to 0. + uint64_t Mask = (1 << NumBits) - 1; if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(0))) { - if ((C->getZExtValue() & FromMask) == 0) { + if ((C->getZExtValue() & Mask) == 0) { Src = Val.getOperand(1); return true; } } if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(1))) { - if ((C->getZExtValue() & FromMask) == 0) { + if ((C->getZExtValue() & Mask) == 0) { Src = Val.getOperand(0); return true; } diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 5ecf9320d5c2..4c6c6eeafbe0 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1928,11 +1928,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); setOperationAction(ISD::BSWAP, MVT::i32, Legal); setOperationAction(ISD::BSWAP, MVT::i64, Legal); - - // We custom lower i64 to i64 mul, so that it is not considered as a legal - // operation. There is a pattern that will match i64 mul and transform it - // to a series of instructions. 
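The DetectUseSxtw comment above is ultimately about one arithmetic identity: a 64x64 multiply whose operands are i32 values sign-extended to i64, whether via sext, sext_inreg, or a sextload, equals one 32x32->64 widening multiply of the low words, which is what M2_dpmpyss_s0 computes. A standalone sketch of that identity in plain C++ follows; the helper names are illustrative only, not LLVM code.

#include <cassert>
#include <cstdint>

// (mul (sext x) (sext y)) as IR: one widening signed multiply.
int64_t mul_sext_sext(int32_t a, int32_t b) {
  return (int64_t)a * (int64_t)b;
}

// (mul (sext_inreg x, i32) (sext_inreg y, i32)): only the low words of
// the i64 operands matter, sign-extended, which is the same multiply.
int64_t mul_sext_inreg(int64_t a, int64_t b) {
  return (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}

int main() {
  assert(mul_sext_sext(-3, 100000) == -300000);
  // Garbage in the upper words does not change the result, so both
  // forms can select to the same low-word multiply.
  int64_t a = (int64_t)0xdeadbeef00000000 + (uint32_t)-3;
  int64_t b = (int64_t)0x1234567800000000 + 100000;
  assert(mul_sext_inreg(a, b) == -300000);
  return 0;
}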
- setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::MUL, MVT::i64, Legal); for (unsigned IntExpOp : { ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index 66e07c67958e..0fef91ec4d3e 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1769,161 +1769,6 @@ bool HexagonInstrInfo::isCompoundBranchInstr(const MachineInstr &MI) const { return getType(MI) == HexagonII::TypeCJ && MI.isBranch(); } -bool HexagonInstrInfo::isCondInst(const MachineInstr &MI) const { - return (MI.isBranch() && isPredicated(MI)) || - isConditionalTransfer(MI) || - isConditionalALU32(MI) || - isConditionalLoad(MI) || - // Predicated stores which don't have a .new on any operands. - (MI.mayStore() && isPredicated(MI) && !isNewValueStore(MI) && - !isPredicatedNew(MI)); -} - -bool HexagonInstrInfo::isConditionalALU32(const MachineInstr &MI) const { - switch (MI.getOpcode()) { - case Hexagon::A2_paddf: - case Hexagon::A2_paddfnew: - case Hexagon::A2_paddif: - case Hexagon::A2_paddifnew: - case Hexagon::A2_paddit: - case Hexagon::A2_padditnew: - case Hexagon::A2_paddt: - case Hexagon::A2_paddtnew: - case Hexagon::A2_pandf: - case Hexagon::A2_pandfnew: - case Hexagon::A2_pandt: - case Hexagon::A2_pandtnew: - case Hexagon::A2_porf: - case Hexagon::A2_porfnew: - case Hexagon::A2_port: - case Hexagon::A2_portnew: - case Hexagon::A2_psubf: - case Hexagon::A2_psubfnew: - case Hexagon::A2_psubt: - case Hexagon::A2_psubtnew: - case Hexagon::A2_pxorf: - case Hexagon::A2_pxorfnew: - case Hexagon::A2_pxort: - case Hexagon::A2_pxortnew: - case Hexagon::A4_paslhf: - case Hexagon::A4_paslhfnew: - case Hexagon::A4_paslht: - case Hexagon::A4_paslhtnew: - case Hexagon::A4_pasrhf: - case Hexagon::A4_pasrhfnew: - case Hexagon::A4_pasrht: - case Hexagon::A4_pasrhtnew: - case Hexagon::A4_psxtbf: - case Hexagon::A4_psxtbfnew: - case Hexagon::A4_psxtbt: - case Hexagon::A4_psxtbtnew: - case Hexagon::A4_psxthf: - case Hexagon::A4_psxthfnew: - case Hexagon::A4_psxtht: - case Hexagon::A4_psxthtnew: - case Hexagon::A4_pzxtbf: - case Hexagon::A4_pzxtbfnew: - case Hexagon::A4_pzxtbt: - case Hexagon::A4_pzxtbtnew: - case Hexagon::A4_pzxthf: - case Hexagon::A4_pzxthfnew: - case Hexagon::A4_pzxtht: - case Hexagon::A4_pzxthtnew: - case Hexagon::C2_ccombinewf: - case Hexagon::C2_ccombinewt: - return true; - } - return false; -} - -// FIXME - Function name and it's functionality don't match. -// It should be renamed to hasPredNewOpcode() -bool HexagonInstrInfo::isConditionalLoad(const MachineInstr &MI) const { - if (!MI.getDesc().mayLoad() || !isPredicated(MI)) - return false; - - int PNewOpcode = Hexagon::getPredNewOpcode(MI.getOpcode()); - // Instruction with valid predicated-new opcode can be promoted to .new. - return PNewOpcode >= 0; -} - -// Returns true if an instruction is a conditional store. -// -// Note: It doesn't include conditional new-value stores as they can't be -// converted to .new predicate. 
-bool HexagonInstrInfo::isConditionalStore(const MachineInstr &MI) const { - switch (MI.getOpcode()) { - default: return false; - case Hexagon::S4_storeirbt_io: - case Hexagon::S4_storeirbf_io: - case Hexagon::S4_pstorerbt_rr: - case Hexagon::S4_pstorerbf_rr: - case Hexagon::S2_pstorerbt_io: - case Hexagon::S2_pstorerbf_io: - case Hexagon::S2_pstorerbt_pi: - case Hexagon::S2_pstorerbf_pi: - case Hexagon::S2_pstorerdt_io: - case Hexagon::S2_pstorerdf_io: - case Hexagon::S4_pstorerdt_rr: - case Hexagon::S4_pstorerdf_rr: - case Hexagon::S2_pstorerdt_pi: - case Hexagon::S2_pstorerdf_pi: - case Hexagon::S2_pstorerht_io: - case Hexagon::S2_pstorerhf_io: - case Hexagon::S4_storeirht_io: - case Hexagon::S4_storeirhf_io: - case Hexagon::S4_pstorerht_rr: - case Hexagon::S4_pstorerhf_rr: - case Hexagon::S2_pstorerht_pi: - case Hexagon::S2_pstorerhf_pi: - case Hexagon::S2_pstorerit_io: - case Hexagon::S2_pstorerif_io: - case Hexagon::S4_storeirit_io: - case Hexagon::S4_storeirif_io: - case Hexagon::S4_pstorerit_rr: - case Hexagon::S4_pstorerif_rr: - case Hexagon::S2_pstorerit_pi: - case Hexagon::S2_pstorerif_pi: - - // V4 global address store before promoting to dot new. - case Hexagon::S4_pstorerdt_abs: - case Hexagon::S4_pstorerdf_abs: - case Hexagon::S4_pstorerbt_abs: - case Hexagon::S4_pstorerbf_abs: - case Hexagon::S4_pstorerht_abs: - case Hexagon::S4_pstorerhf_abs: - case Hexagon::S4_pstorerit_abs: - case Hexagon::S4_pstorerif_abs: - return true; - - // Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded - // from the "Conditional Store" list. Because a predicated new value store - // would NOT be promoted to a double dot new store. - // This function returns yes for those stores that are predicated but not - // yet promoted to predicate dot new instructions. - } -} - -bool HexagonInstrInfo::isConditionalTransfer(const MachineInstr &MI) const { - switch (MI.getOpcode()) { - case Hexagon::A2_tfrt: - case Hexagon::A2_tfrf: - case Hexagon::C2_cmoveit: - case Hexagon::C2_cmoveif: - case Hexagon::A2_tfrtnew: - case Hexagon::A2_tfrfnew: - case Hexagon::C2_cmovenewit: - case Hexagon::C2_cmovenewif: - case Hexagon::A2_tfrpt: - case Hexagon::A2_tfrpf: - return true; - - default: - return false; - } - return false; -} - // TODO: In order to have isExtendable for fpimm/f32Ext, we need to handle // isFPImm and later getFPImm as well. bool HexagonInstrInfo::isConstExtended(const MachineInstr &MI) const { @@ -3474,6 +3319,8 @@ int HexagonInstrInfo::getDotNewOp(const MachineInstr &MI) const { // Returns the opcode to use when converting MI, which is a conditional jump, // into a conditional instruction which uses the .new value of the predicate. // We also use branch probabilities to add a hint to the jump. +// If MBPI is null, all edges will be treated as equally likely for the +// purposes of establishing a predication hint. int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI, const MachineBranchProbabilityInfo *MBPI) const { // We assume that block can have at most two successors. 
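The new comment above notes the MBPI == nullptr fallback, and the lambda in the hunk that follows implements it. Below is a minimal standalone model of the same fallback rule; Prob is an illustrative stand-in for llvm::BranchProbability, not the real API.

#include <cstdio>

// Exact fractional compare without floating point.
struct Prob {
  unsigned Num, Den;
  bool operator>=(Prob O) const {
    return 1ULL * Num * O.Den >= 1ULL * O.Num * Den;
  }
};

// With profile data, use the recorded edge probability; without it,
// treat all NumSuccs outgoing edges as equally likely.
static Prob edgeProbability(const Prob *ProfileInfo, unsigned NumSuccs) {
  return ProfileInfo ? *ProfileInfo : Prob{1, NumSuccs};
}

int main() {
  const Prob OneHalf{1, 2};
  const Prob Hot{9, 10};
  std::printf("profiled: taken=%d\n", edgeProbability(&Hot, 2) >= OneHalf);
  // Uniform 1/2 still satisfies ">= OneHalf", so a two-successor branch
  // defaults to a taken hint when no profile is available.
  std::printf("uniform:  taken=%d\n", edgeProbability(nullptr, 2) >= OneHalf);
  return 0;
}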
@@ -3482,9 +3329,16 @@ int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI, bool Taken = false; const BranchProbability OneHalf(1, 2); + auto getEdgeProbability = [MBPI] (const MachineBasicBlock *Src, + const MachineBasicBlock *Dst) { + if (MBPI) + return MBPI->getEdgeProbability(Src, Dst); + return BranchProbability(1, Src->succ_size()); + }; + if (BrTarget.isMBB()) { const MachineBasicBlock *Dst = BrTarget.getMBB(); - Taken = MBPI->getEdgeProbability(Src, Dst) >= OneHalf; + Taken = getEdgeProbability(Src, Dst) >= OneHalf; } else { // The branch target is not a basic block (most likely a function). // Since BPI only gives probabilities for targets that are basic blocks, @@ -3521,7 +3375,7 @@ int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI, for (const MachineBasicBlock *SB : B.successors()) { if (!B.isLayoutSuccessor(SB)) continue; - Taken = MBPI->getEdgeProbability(Src, SB) < OneHalf; + Taken = getEdgeProbability(Src, SB) < OneHalf; break; } } else { @@ -3534,7 +3388,7 @@ int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI, BT = Op.getMBB(); break; } - Taken = BT && MBPI->getEdgeProbability(Src, BT) < OneHalf; + Taken = BT && getEdgeProbability(Src, BT) < OneHalf; } } // if (!Bad) } diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h index 97b9bc954688..944d0161a7c8 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/lib/Target/Hexagon/HexagonInstrInfo.h @@ -314,11 +314,6 @@ public: bool isAccumulator(const MachineInstr &MI) const; bool isComplex(const MachineInstr &MI) const; bool isCompoundBranchInstr(const MachineInstr &MI) const; - bool isCondInst(const MachineInstr &MI) const; - bool isConditionalALU32 (const MachineInstr &MI) const; - bool isConditionalLoad(const MachineInstr &MI) const; - bool isConditionalStore(const MachineInstr &MI) const; - bool isConditionalTransfer(const MachineInstr &MI) const; bool isConstExtended(const MachineInstr &MI) const; bool isDeallocRet(const MachineInstr &MI) const; bool isDependent(const MachineInstr &ProdMI, diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index e6ea67d55b43..9aa185fc85a6 100644 --- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -59,6 +59,9 @@ cl::opt<bool> HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy", cl::Hidden, cl::init(false), cl::desc("Enable Hexagon-specific memcpy for volatile destination.")); +static cl::opt<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(10000), + cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR")); + static const char *HexagonVolatileMemcpyName = "hexagon_memcpy_forward_vp4cp4n2"; @@ -477,7 +480,7 @@ Value *Simplifier::simplify(Context &C) { WorkListType Q; Q.push_back(C.Root); unsigned Count = 0; - const unsigned Limit = 100000; + const unsigned Limit = SimplifyLimit; while (!Q.empty()) { if (Count++ >= Limit) @@ -501,8 +504,7 @@ Value *Simplifier::simplify(Context &C) { Q.push_back(Op); } } - assert(Count < Limit && "Infinite loop in HLIR/simplify?"); - return C.Root; + return Count < Limit ? 
C.Root : nullptr; } diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index 81b5e10c1173..70ed123bc898 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -382,48 +382,42 @@ def: T_MType_acc_pat3 <M4_or_andn, and, or>; def: T_MType_acc_pat3 <M4_and_andn, and, and>; def: T_MType_acc_pat3 <M4_xor_andn, and, xor>; +// This complex pattern is really only to detect various forms of +// sign-extension i32->i64. The selected value will be of type i64 +// whose low word is the value being extended. The high word is +// unspecified. +def Usxtw : ComplexPattern<i64, 1, "DetectUseSxtw", [], []>; + def Aext64: PatFrag<(ops node:$Rs), (i64 (anyext node:$Rs))>; -def Sext64: PatFrag<(ops node:$Rs), (i64 (sext node:$Rs))>; def Zext64: PatFrag<(ops node:$Rs), (i64 (zext node:$Rs))>; +def Sext64: PatLeaf<(i64 Usxtw:$Rs)>; -// Return true if for a 32 to 64-bit sign-extended load. -def Sext64Ld : PatLeaf<(i64 DoubleRegs:$src1), [{ - LoadSDNode *LD = dyn_cast<LoadSDNode>(N); - if (!LD) - return false; - return LD->getExtensionType() == ISD::SEXTLOAD && - LD->getMemoryVT().getScalarType() == MVT::i32; -}]>; - -def: Pat<(mul (Aext64 I32:$src1), (Aext64 I32:$src2)), - (M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2)>; - -def: Pat<(mul (Sext64 I32:$src1), (Sext64 I32:$src2)), - (M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2)>; +def: Pat<(mul (Aext64 I32:$Rs), (Aext64 I32:$Rt)), + (M2_dpmpyuu_s0 I32:$Rs, I32:$Rt)>; -def: Pat<(mul Sext64Ld:$src1, Sext64Ld:$src2), - (M2_dpmpyss_s0 (LoReg DoubleRegs:$src1), (LoReg DoubleRegs:$src2))>; +def: Pat<(mul Sext64:$Rs, Sext64:$Rt), + (M2_dpmpyss_s0 (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>; // Multiply and accumulate, use full result. // Rxx[+-]=mpy(Rs,Rt) -def: Pat<(add I64:$src1, (mul (Sext64 I32:$src2), (Sext64 I32:$src3))), - (M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; +def: Pat<(add I64:$Rx, (mul Sext64:$Rs, Sext64:$Rt)), + (M2_dpmpyss_acc_s0 I64:$Rx, (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>; -def: Pat<(sub I64:$src1, (mul (Sext64 I32:$src2), (Sext64 I32:$src3))), - (M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; +def: Pat<(sub I64:$Rx, (mul Sext64:$Rs, Sext64:$Rt)), + (M2_dpmpyss_nac_s0 I64:$Rx, (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>; -def: Pat<(add I64:$src1, (mul (Aext64 I32:$src2), (Aext64 I32:$src3))), - (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; +def: Pat<(add I64:$Rx, (mul (Aext64 I32:$Rs), (Aext64 I32:$Rt))), + (M2_dpmpyuu_acc_s0 I64:$Rx, I32:$Rs, I32:$Rt)>; -def: Pat<(add I64:$src1, (mul (Zext64 I32:$src2), (Zext64 I32:$src3))), - (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; +def: Pat<(add I64:$Rx, (mul (Zext64 I32:$Rs), (Zext64 I32:$Rt))), + (M2_dpmpyuu_acc_s0 I64:$Rx, I32:$Rs, I32:$Rt)>; -def: Pat<(sub I64:$src1, (mul (Aext64 I32:$src2), (Aext64 I32:$src3))), - (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; +def: Pat<(sub I64:$Rx, (mul (Aext64 I32:$Rs), (Aext64 I32:$Rt))), + (M2_dpmpyuu_nac_s0 I64:$Rx, I32:$Rs, I32:$Rt)>; -def: Pat<(sub I64:$src1, (mul (Zext64 I32:$src2), (Zext64 I32:$src3))), - (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; +def: Pat<(sub I64:$Rx, (mul (Zext64 I32:$Rs), (Zext64 I32:$Rt))), + (M2_dpmpyuu_nac_s0 I64:$Rx, I32:$Rs, I32:$Rt)>; class Storepi_pat<PatFrag Store, PatFrag Value, PatFrag Offset, InstHexagon MI> @@ -545,7 +539,8 @@ def: Storexm_simple_pat<truncstorei8, I64, LoReg, S2_storerb_io>; def: 
Storexm_simple_pat<truncstorei16, I64, LoReg, S2_storerh_io>; def: Storexm_simple_pat<truncstorei32, I64, LoReg, S2_storeri_io>; -def: Pat <(Sext64 I32:$src), (A2_sxtw I32:$src)>; +def: Pat <(i64 (sext I32:$src)), (A2_sxtw I32:$src)>; +def: Pat <(i64 (sext_inreg I64:$src, i32)), (A2_sxtw (LoReg I64:$src))>; def: Pat<(select (i1 (setlt I32:$src, 0)), (sub 0, I32:$src), I32:$src), (A2_abs IntRegs:$src)>; @@ -1159,8 +1154,8 @@ multiclass MinMax_pats_p<PatFrag Op, InstHexagon Inst, InstHexagon SwapInst> { defm: T_MinMax_pats<Op, I64, Inst, SwapInst>; } -def: Pat<(add (Sext64 I32:$Rs), I64:$Rt), - (A2_addsp IntRegs:$Rs, DoubleRegs:$Rt)>; +def: Pat<(add Sext64:$Rs, I64:$Rt), + (A2_addsp (LoReg Sext64:$Rs), DoubleRegs:$Rt)>; let AddedComplexity = 200 in { defm: MinMax_pats_p<setge, A2_maxp, A2_minp>; diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index 8e93df6201ae..14ecf297d351 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -223,7 +223,7 @@ namespace { /// Hexagon Code Generator Pass Configuration Options. class HexagonPassConfig : public TargetPassConfig { public: - HexagonPassConfig(HexagonTargetMachine *TM, PassManagerBase &PM) + HexagonPassConfig(HexagonTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} HexagonTargetMachine &getHexagonTargetMachine() const { @@ -245,7 +245,7 @@ public: } // namespace TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) { - return new HexagonPassConfig(this, PM); + return new HexagonPassConfig(*this, PM); } void HexagonPassConfig::addIRPasses() { diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index cd474921d4bc..fa08afe4019d 100644 --- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -273,25 +273,17 @@ bool HexagonPacketizerList::isCallDependent(const MachineInstr &MI, if (DepReg == HRI->getFrameRegister() || DepReg == HRI->getStackRegister()) return true; - // Check if this is a predicate dependence. - const TargetRegisterClass* RC = HRI->getMinimalPhysRegClass(DepReg); - if (RC == &Hexagon::PredRegsRegClass) - return true; - - // Assumes that the first operand of the CALLr is the function address. - if (HII->isIndirectCall(MI) && (DepType == SDep::Data)) { - const MachineOperand MO = MI.getOperand(0); - if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg)) - return true; + // Call-like instructions can be packetized with preceding instructions + // that define registers implicitly used or modified by the call. Explicit + // uses are still prohibited, as in the case of indirect calls: + // r0 = ... + // J2_jumpr r0 + if (DepType == SDep::Data) { + for (const MachineOperand MO : MI.operands()) + if (MO.isReg() && MO.getReg() == DepReg && !MO.isImplicit()) + return true; } - if (HII->isJumpR(MI)) { - const MachineOperand &MO = HII->isPredicated(MI) ? MI.getOperand(1) - : MI.getOperand(0); - assert(MO.isReg() && MO.isUse()); - if (MO.getReg() == DepReg) - return true; - } return false; } @@ -333,11 +325,13 @@ bool HexagonPacketizerList::isNewifiable(const MachineInstr &MI, const TargetRegisterClass *NewRC) { // Vector stores can be predicated, and can be new-value stores, but // they cannot be predicated on a .new predicate value. 
- if (NewRC == &Hexagon::PredRegsRegClass) + if (NewRC == &Hexagon::PredRegsRegClass) { if (HII->isHVXVec(MI) && MI.mayStore()) return false; - return HII->isCondInst(MI) || HII->isJumpR(MI) || MI.isReturn() || - HII->mayBeNewStore(MI); + return HII->isPredicated(MI) && HII->getDotNewPredOp(MI, nullptr) > 0; + } + // If the class is not PredRegs, it could only apply to new-value stores. + return HII->mayBeNewStore(MI); } // Promote an instruction to its .cur form. @@ -760,11 +754,14 @@ bool HexagonPacketizerList::canPromoteToNewValue(const MachineInstr &MI, return false; } -static bool isImplicitDependency(const MachineInstr &I, unsigned DepReg) { +static bool isImplicitDependency(const MachineInstr &I, bool CheckDef, + unsigned DepReg) { for (auto &MO : I.operands()) { - if (MO.isRegMask() && MO.clobbersPhysReg(DepReg)) + if (CheckDef && MO.isRegMask() && MO.clobbersPhysReg(DepReg)) return true; - if (MO.isReg() && MO.isDef() && (MO.getReg() == DepReg) && MO.isImplicit()) + if (!MO.isReg() || MO.getReg() != DepReg || !MO.isImplicit()) + continue; + if (CheckDef == MO.isDef()) return true; } return false; } @@ -798,7 +795,8 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI, // If dependency is through an implicitly defined register, we should not // newify the use. - if (isImplicitDependency(PI, DepReg)) + if (isImplicitDependency(PI, true, DepReg) || + isImplicitDependency(MI, false, DepReg)) return false; const MCInstrDesc& MCID = PI.getDesc(); @@ -808,8 +806,7 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI, // predicate .new if (RC == &Hexagon::PredRegsRegClass) - if (HII->isCondInst(MI) || HII->isJumpR(MI) || MI.isReturn()) - return HII->predCanBeUsedAsDotNew(PI, DepReg); + return HII->predCanBeUsedAsDotNew(PI, DepReg); if (RC != &Hexagon::PredRegsRegClass && !HII->mayBeNewStore(MI)) return false; diff --git a/lib/Target/Lanai/LanaiTargetMachine.cpp b/lib/Target/Lanai/LanaiTargetMachine.cpp index 2a9bc25d7fad..a2f005ce445a 100644 --- a/lib/Target/Lanai/LanaiTargetMachine.cpp +++ b/lib/Target/Lanai/LanaiTargetMachine.cpp @@ -76,7 +76,7 @@ namespace { // Lanai Code Generator Pass Configuration Options. class LanaiPassConfig : public TargetPassConfig { public: - LanaiPassConfig(LanaiTargetMachine *TM, PassManagerBase *PassManager) + LanaiPassConfig(LanaiTargetMachine &TM, PassManagerBase *PassManager) : TargetPassConfig(TM, *PassManager) {} LanaiTargetMachine &getLanaiTargetMachine() const { @@ -91,7 +91,7 @@ public: TargetPassConfig * LanaiTargetMachine::createPassConfig(PassManagerBase &PassManager) { - return new LanaiPassConfig(this, &PassManager); + return new LanaiPassConfig(*this, &PassManager); } // Install an instruction selector pass. diff --git a/lib/Target/Lanai/LanaiTargetMachine.h b/lib/Target/Lanai/LanaiTargetMachine.h index 5278c70d909d..083ba6fdf841 100644 --- a/lib/Target/Lanai/LanaiTargetMachine.h +++ b/lib/Target/Lanai/LanaiTargetMachine.h @@ -49,6 +49,10 @@ public: TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + + bool isMachineVerifierClean() const override { + return false; + } }; } // namespace llvm diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index bebe5fa35ad4..d8fdc8ba674e 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -52,7 +52,7 @@ namespace { /// MSP430 Code Generator Pass Configuration Options.
class MSP430PassConfig : public TargetPassConfig { public: - MSP430PassConfig(MSP430TargetMachine *TM, PassManagerBase &PM) + MSP430PassConfig(MSP430TargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} MSP430TargetMachine &getMSP430TargetMachine() const { @@ -65,7 +65,7 @@ public: } // namespace TargetPassConfig *MSP430TargetMachine::createPassConfig(PassManagerBase &PM) { - return new MSP430PassConfig(this, PM); + return new MSP430PassConfig(*this, PM); } bool MSP430PassConfig::addInstSelector() { diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp index e7ceca9612a9..a222080f6b81 100644 --- a/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/lib/Target/Mips/Mips16FrameLowering.cpp @@ -1,4 +1,4 @@ -//===-- Mips16FrameLowering.cpp - Mips16 Frame Information ----------------===// +//===- Mips16FrameLowering.cpp - Mips16 Frame Information -----------------===// // // The LLVM Compiler Infrastructure // @@ -11,20 +11,29 @@ // //===----------------------------------------------------------------------===// -#include "Mips16FrameLowering.h" #include "MCTargetDesc/MipsBaseInfo.h" +#include "Mips16FrameLowering.h" #include "Mips16InstrInfo.h" #include "MipsInstrInfo.h" #include "MipsRegisterInfo.h" #include "MipsSubtarget.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/Target/TargetOptions.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDwarf.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetFrameLowering.h" +#include <cassert> +#include <cstdint> +#include <vector> using namespace llvm; @@ -63,7 +72,7 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); - if (CSI.size()) { + if (!CSI.empty()) { const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), @@ -80,7 +89,6 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF, if (hasFP(MF)) BuildMI(MBB, MBBI, dl, TII.get(Mips::MoveR3216), Mips::S0) .addReg(Mips::SP).setMIFlag(MachineInstr::FrameSetup); - } void Mips16FrameLowering::emitEpilogue(MachineFunction &MF, diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 092de216e9b8..a9d6ab055892 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -201,7 +201,7 @@ namespace { /// Mips Code Generator Pass Configuration Options. class MipsPassConfig : public TargetPassConfig { public: - MipsPassConfig(MipsTargetMachine *TM, PassManagerBase &PM) + MipsPassConfig(MipsTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) { // The current implementation of long branch pass requires a scratch // register ($at) to be available before branch instructions. 
Tail merging @@ -227,7 +227,7 @@ public: } // end anonymous namespace TargetPassConfig *MipsTargetMachine::createPassConfig(PassManagerBase &PM) { - return new MipsPassConfig(this, PM); + return new MipsPassConfig(*this, PM); } void MipsPassConfig::addIRPasses() { diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index 140d7133f879..a3462868cb11 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -66,6 +66,10 @@ public: bool isLittleEndian() const { return isLittle; } const MipsABIInfo &getABI() const { return ABI; } + + bool isMachineVerifierClean() const override { + return false; + } }; /// Mips32/64 big endian target machine. diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index ab5298d0dcfd..8dfbfece9b8e 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -132,7 +132,7 @@ namespace { class NVPTXPassConfig : public TargetPassConfig { public: - NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM) + NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} NVPTXTargetMachine &getNVPTXTargetMachine() const { @@ -163,7 +163,7 @@ private: } // end anonymous namespace TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { - return new NVPTXPassConfig(this, PM); + return new NVPTXPassConfig(*this, PM); } void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h index 1ed8e3b1e935..2f3981be22f8 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -65,6 +65,9 @@ public: TargetIRAnalysis getTargetIRAnalysis() override; + bool isMachineVerifierClean() const override { + return false; + } }; // NVPTXTargetMachine. class NVPTXTargetMachine32 : public NVPTXTargetMachine { diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 5fa7b2c6bfb1..54414457388d 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -77,6 +77,11 @@ STATISTIC(SignExtensionsAdded, "Number of sign extensions for compare inputs added."); STATISTIC(ZeroExtensionsAdded, "Number of zero extensions for compare inputs added."); +STATISTIC(NumLogicOpsOnComparison, + "Number of logical ops on i1 values calculated in GPR."); +STATISTIC(OmittedForNonExtendUses, + "Number of compares not eliminated as they have non-extending uses."); + // FIXME: Remove this once the bug has been fixed! 
cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug", cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden); @@ -275,6 +280,8 @@ private: bool trySETCC(SDNode *N); bool tryEXTEND(SDNode *N); + bool tryLogicOpOfCompares(SDNode *N); + SDValue computeLogicOpInGPR(SDValue LogicOp); SDValue signExtendInputIfNeeded(SDValue Input); SDValue zeroExtendInputIfNeeded(SDValue Input); SDValue addExtOrTrunc(SDValue NatWidthRes, ExtOrTruncConversion Conv); @@ -282,6 +289,10 @@ private: int64_t RHSValue, SDLoc dl); SDValue get32BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, int64_t RHSValue, SDLoc dl); + SDValue get64BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); + SDValue get64BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); SDValue getSETCCInGPR(SDValue Compare, SetccInGPROpts ConvOpts); void PeepholePPC64(); @@ -2501,6 +2512,11 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) { return true; } +// Is this opcode a bitwise logical operation? +static bool isLogicOp(unsigned Opc) { + return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR; +} + /// If this node is a sign/zero extension of an integer comparison, /// it can usually be computed in GPR's rather than using comparison /// instructions and ISEL. We only do this on 64-bit targets for now @@ -2513,13 +2529,20 @@ bool PPCDAGToDAGISel::tryEXTEND(SDNode *N) { N->getOpcode() == ISD::SIGN_EXTEND) && "Expecting a zero/sign extend node!"); - if (N->getOperand(0).getOpcode() != ISD::SETCC) + SDValue WideRes; + // If we are zero-extending the result of a logical operation on i1 + // values, we can keep the values in GPRs. + if (isLogicOp(N->getOperand(0).getOpcode()) && + N->getOperand(0).getValueType() == MVT::i1 && + N->getOpcode() == ISD::ZERO_EXTEND) + WideRes = computeLogicOpInGPR(N->getOperand(0)); + else if (N->getOperand(0).getOpcode() != ISD::SETCC) return false; - - SDValue WideRes = - getSETCCInGPR(N->getOperand(0), - N->getOpcode() == ISD::SIGN_EXTEND ? - SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig); + else + WideRes = + getSETCCInGPR(N->getOperand(0), + N->getOpcode() == ISD::SIGN_EXTEND ? + SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig); if (!WideRes) return false; @@ -2540,6 +2563,159 @@ bool PPCDAGToDAGISel::tryEXTEND(SDNode *N) { return true; } +// Lower a logical operation on i1 values into a GPR sequence if possible. +// The result can be kept in a GPR if requested. +// Three types of inputs can be handled: +// - SETCC +// - TRUNCATE +// - Logical operation (AND/OR/XOR) +// There is also a special case that is handled (namely a complement operation +// achieved with xor %a, -1). +SDValue PPCDAGToDAGISel::computeLogicOpInGPR(SDValue LogicOp) { + assert(isLogicOp(LogicOp.getOpcode()) && + "Can only handle logic operations here."); + assert(LogicOp.getValueType() == MVT::i1 && + "Can only handle logic operations on i1 values here."); + SDLoc dl(LogicOp); + SDValue LHS, RHS; + + // Special case: xor %a, -1 + bool IsBitwiseNegation = isBitwiseNot(LogicOp); + + // Produces a GPR sequence for each operand of the binary logic operation. + // For SETCC, it produces the respective comparison, for TRUNCATE it truncates + // the value in a GPR and for logic operations, it will recursively produce + // a GPR sequence for the operation. 
+ auto getLogicOperand = [&] (SDValue Operand) -> SDValue { + unsigned OperandOpcode = Operand.getOpcode(); + if (OperandOpcode == ISD::SETCC) + return getSETCCInGPR(Operand, SetccInGPROpts::ZExtOrig); + else if (OperandOpcode == ISD::TRUNCATE) { + SDValue InputOp = Operand.getOperand(0); + EVT InVT = InputOp.getValueType(); + return + SDValue(CurDAG->getMachineNode(InVT == MVT::i32 ? PPC::RLDICL_32 : + PPC::RLDICL, dl, InVT, InputOp, + getI64Imm(0, dl), getI64Imm(63, dl)), 0); + } else if (isLogicOp(OperandOpcode)) + return computeLogicOpInGPR(Operand); + return SDValue(); + }; + LHS = getLogicOperand(LogicOp.getOperand(0)); + RHS = getLogicOperand(LogicOp.getOperand(1)); + + // If a GPR sequence can't be produced for the LHS we can't proceed. + // Not producing a GPR sequence for the RHS is only a problem if this isn't + // a bitwise negation operation. + if (!LHS || (!RHS && !IsBitwiseNegation)) + return SDValue(); + + NumLogicOpsOnComparison++; + + // We will use the inputs as 64-bit values. + if (LHS.getValueType() == MVT::i32) + LHS = addExtOrTrunc(LHS, ExtOrTruncConversion::Ext); + if (!IsBitwiseNegation && RHS.getValueType() == MVT::i32) + RHS = addExtOrTrunc(RHS, ExtOrTruncConversion::Ext); + + unsigned NewOpc; + switch (LogicOp.getOpcode()) { + default: llvm_unreachable("Unknown logic operation."); + case ISD::AND: NewOpc = PPC::AND8; break; + case ISD::OR: NewOpc = PPC::OR8; break; + case ISD::XOR: NewOpc = PPC::XOR8; break; + } + + if (IsBitwiseNegation) { + RHS = getI64Imm(1, dl); + NewOpc = PPC::XORI8; + } + + return SDValue(CurDAG->getMachineNode(NewOpc, dl, MVT::i64, LHS, RHS), 0); + +} + +/// Try performing logical operations on results of comparisons in GPRs. +/// It is typically preferred from a performance perspective over performing +/// the operations on individual bits in the CR. We only do this on 64-bit +/// targets for now as the code is specialized for 64-bit (it uses 64-bit +/// instructions and assumes 64-bit registers). +bool PPCDAGToDAGISel::tryLogicOpOfCompares(SDNode *N) { + if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64()) + return false; + if (N->getValueType(0) != MVT::i1) + return false; + assert(isLogicOp(N->getOpcode()) && + "Expected a logic operation on setcc results."); + SDValue LoweredLogical = computeLogicOpInGPR(SDValue(N, 0)); + if (!LoweredLogical) + return false; + + SDLoc dl(N); + bool IsBitwiseNegate = LoweredLogical.getMachineOpcode() == PPC::XORI8; + unsigned SubRegToExtract = IsBitwiseNegate ? PPC::sub_eq : PPC::sub_gt; + SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32); + SDValue LHS = LoweredLogical.getOperand(0); + SDValue RHS = LoweredLogical.getOperand(1); + SDValue WideOp; + SDValue OpToConvToRecForm; + + // Look through any 32-bit to 64-bit implicit extend nodes to find the opcode + // that is input to the XORI. + if (IsBitwiseNegate && + LoweredLogical.getOperand(0).getMachineOpcode() == PPC::INSERT_SUBREG) + OpToConvToRecForm = LoweredLogical.getOperand(0).getOperand(1); + else if (IsBitwiseNegate) + // If the input to the XORI isn't an extension, that's what we're after. + OpToConvToRecForm = LoweredLogical.getOperand(0); + else + // If this is not an XORI, it is a reg-reg logical op and we can convert it + // to record-form. + OpToConvToRecForm = LoweredLogical; + + // Get the record-form version of the node we're looking to use to get the + // CR result from. 
+ uint16_t NonRecOpc = OpToConvToRecForm.getMachineOpcode(); + int NewOpc = PPCInstrInfo::getRecordFormOpcode(NonRecOpc); + + // Convert the right node to record-form. This is either the logical we're + // looking at or it is the input node to the negation (if we're looking at + // a bitwise negation). + if (NewOpc != -1 && IsBitwiseNegate) { + // The input to the XORI has a record-form. Use it. + assert(LoweredLogical.getConstantOperandVal(1) == 1 && + "Expected a PPC::XORI8 only for bitwise negation."); + // Emit the record-form instruction. + std::vector<SDValue> Ops; + for (int i = 0, e = OpToConvToRecForm.getNumOperands(); i < e; i++) + Ops.push_back(OpToConvToRecForm.getOperand(i)); + + WideOp = + SDValue(CurDAG->getMachineNode(NewOpc, dl, + OpToConvToRecForm.getValueType(), + MVT::Glue, Ops), 0); + } else { + assert((NewOpc != -1 || !IsBitwiseNegate) && + "No record form available for AND8/OR8/XOR8?"); + WideOp = + SDValue(CurDAG->getMachineNode(NewOpc == -1 ? PPC::ANDIo8 : NewOpc, dl, + MVT::i64, MVT::Glue, LHS, RHS), 0); + } + + // Select this node to a single bit from CR0 set by the record-form node + // just created. For bitwise negation, use the EQ bit which is the equivalent + // of negating the result (i.e. it is a bit set when the result of the + // operation is zero). + SDValue SRIdxVal = + CurDAG->getTargetConstant(SubRegToExtract, dl, MVT::i32); + SDValue CRBit = + SDValue(CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, + MVT::i1, CR0Reg, SRIdxVal, + WideOp.getValue(1)), 0); + ReplaceNode(N, CRBit.getNode()); + return true; +} + /// If the value isn't guaranteed to be sign-extended to 64-bits, extend it. /// Useful when emitting comparison code for 32-bit values without using /// the compare instruction (which only considers the lower 32-bits). @@ -2677,6 +2853,77 @@ SDValue PPCDAGToDAGISel::get32BitSExtCompare(SDValue LHS, SDValue RHS, } } +/// Produces a zero-extended result of comparing two 64-bit values according to +/// the passed condition code. +SDValue PPCDAGToDAGISel::get64BitZExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + bool IsRHSZero = RHSValue == 0; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // (zext (setcc %a, %b, seteq)) -> (lshr (ctlz (xor %a, %b)), 6) + // (zext (setcc %a, 0, seteq)) -> (lshr (ctlz %a), 6) + SDValue Xor = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); + SDValue Clz = + SDValue(CurDAG->getMachineNode(PPC::CNTLZD, dl, MVT::i64, Xor), 0); + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Clz, + getI64Imm(58, dl), getI64Imm(63, dl)), + 0); + } + } +} + +/// Produces a sign-extended result of comparing two 64-bit values according to +/// the passed condition code. +SDValue PPCDAGToDAGISel::get64BitSExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + bool IsRHSZero = RHSValue == 0; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // {addc.reg, addc.CA} = (addcarry (xor %a, %b), -1) + // (sext (setcc %a, %b, seteq)) -> (sube addc.reg, addc.reg, addc.CA) + // {addcz.reg, addcz.CA} = (addcarry %a, -1) + // (sext (setcc %a, 0, seteq)) -> (sube addcz.reg, addcz.reg, addcz.CA) + SDValue AddInput = IsRHSZero ? 
LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); + SDValue Addic = + SDValue(CurDAG->getMachineNode(PPC::ADDIC8, dl, MVT::i64, MVT::Glue, + AddInput, getI32Imm(~0U, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, Addic, + Addic, Addic.getValue(1)), 0); + } + } +} + +/// Does this SDValue have any uses for which keeping the value in a GPR is +/// appropriate? This is meant to be used on values that have type i1 since +/// it is somewhat meaningless to ask if values of other types can be kept in +/// GPR's. +static bool allUsesExtend(SDValue Compare, SelectionDAG *CurDAG) { + assert(Compare.getOpcode() == ISD::SETCC && + "An ISD::SETCC node required here."); + + // For values that have a single use, the caller should obviously already have + // checked if that use is an extending use. We check the other uses here. + if (Compare.hasOneUse()) + return true; + // We want the value in a GPR if it is being extended, used for a select, or + // used in logical operations. + for (auto CompareUse : Compare.getNode()->uses()) + if (CompareUse->getOpcode() != ISD::SIGN_EXTEND && + CompareUse->getOpcode() != ISD::ZERO_EXTEND && + CompareUse->getOpcode() != ISD::SELECT && + !isLogicOp(CompareUse->getOpcode())) { + OmittedForNonExtendUses++; + return false; + } + return true; +} + /// Returns an equivalent of a SETCC node but with the result the same width as /// the inputs. This can also be used for SELECT_CC if either the true or false /// value is a power of two while the other is zero. @@ -2686,6 +2933,11 @@ SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare, Compare.getOpcode() == ISD::SELECT_CC) && "An ISD::SETCC node required here."); + // Don't convert this comparison to a GPR sequence because there are uses + // of the i1 result (i.e. uses that require the result in the CR). + if ((Compare.getOpcode() == ISD::SETCC) && !allUsesExtend(Compare, CurDAG)) + return SDValue(); + SDValue LHS = Compare.getOperand(0); SDValue RHS = Compare.getOperand(1); @@ -2694,30 +2946,35 @@ SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare, ISD::CondCode CC = cast<CondCodeSDNode>(Compare.getOperand(CCOpNum))->get(); EVT InputVT = LHS.getValueType(); - if (InputVT != MVT::i32) + if (InputVT != MVT::i32 && InputVT != MVT::i64) return SDValue(); - SDLoc dl(Compare); - ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS); - int64_t RHSValue = RHSConst ? RHSConst->getSExtValue() : INT64_MAX; - if (ConvOpts == SetccInGPROpts::ZExtInvert || ConvOpts == SetccInGPROpts::SExtInvert) CC = ISD::getSetCCInverse(CC, true); - if (ISD::isSignedIntSetCC(CC)) { + bool Inputs32Bit = InputVT == MVT::i32; + if (ISD::isSignedIntSetCC(CC) && Inputs32Bit) { LHS = signExtendInputIfNeeded(LHS); RHS = signExtendInputIfNeeded(RHS); - } else if (ISD::isUnsignedIntSetCC(CC)) { + } else if (ISD::isUnsignedIntSetCC(CC) && Inputs32Bit) { LHS = zeroExtendInputIfNeeded(LHS); RHS = zeroExtendInputIfNeeded(RHS); } + SDLoc dl(Compare); + ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS); + int64_t RHSValue = RHSConst ?
RHSConst->getSExtValue() : INT64_MAX; bool IsSext = ConvOpts == SetccInGPROpts::SExtOrig || ConvOpts == SetccInGPROpts::SExtInvert; - if (IsSext) + + if (IsSext && Inputs32Bit) return get32BitSExtCompare(LHS, RHS, CC, RHSValue, dl); - return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl); + else if (Inputs32Bit) + return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl); + else if (IsSext) + return get64BitSExtCompare(LHS, RHS, CC, RHSValue, dl); + return get64BitZExtCompare(LHS, RHS, CC, RHSValue, dl); } void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) { @@ -2906,6 +3163,9 @@ void PPCDAGToDAGISel::Select(SDNode *N) { } case ISD::AND: { + if (tryLogicOpOfCompares(N)) + return; + unsigned Imm, Imm2, SH, MB, ME; uint64_t Imm64; @@ -3025,6 +3285,9 @@ void PPCDAGToDAGISel::Select(SDNode *N) { if (tryBitfieldInsert(N)) return; + if (tryLogicOpOfCompares(N)) + return; + short Imm; if (N->getOperand(0)->getOpcode() == ISD::FrameIndex && isIntS16Immediate(N->getOperand(1), Imm)) { @@ -3042,6 +3305,11 @@ void PPCDAGToDAGISel::Select(SDNode *N) { // Other cases are autogenerated. break; } + case ISD::XOR: { + if (tryLogicOpOfCompares(N)) + return; + break; + } case ISD::ADD: { short Imm; if (N->getOperand(0)->getOpcode() == ISD::FrameIndex && diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 216efcc4a1ee..41ff9d903aa0 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1041,6 +1041,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, MaxStoresPerMemset = 128; MaxStoresPerMemcpy = 128; MaxStoresPerMemmove = 128; + MaxLoadsPerMemcmp = 128; + } else { + MaxLoadsPerMemcmp = 8; + MaxLoadsPerMemcmpOptSize = 4; } } @@ -1112,6 +1116,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::VPERM: return "PPCISD::VPERM"; case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; case PPCISD::XXINSERT: return "PPCISD::XXINSERT"; + case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI"; case PPCISD::VECSHL: return "PPCISD::VECSHL"; case PPCISD::CMPB: return "PPCISD::CMPB"; case PPCISD::Hi: return "PPCISD::Hi"; @@ -1593,17 +1598,25 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { return true; } - // Check that the mask is shuffling words -static bool isWordShuffleMask(ShuffleVectorSDNode *N) { - for (unsigned i = 0; i < 4; ++i) { - unsigned B0 = N->getMaskElt(i*4); - unsigned B1 = N->getMaskElt(i*4+1); - unsigned B2 = N->getMaskElt(i*4+2); - unsigned B3 = N->getMaskElt(i*4+3); - if (B0 % 4) - return false; - if (B1 != B0+1 || B2 != B1+1 || B3 != B2+1) +// Check that the mask is shuffling N byte elements. 
+static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width) { + assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) && + "Unexpected element width."); + + unsigned NumOfElem = 16 / Width; + unsigned MaskVal[16]; // Width is never greater than 16 + for (unsigned i = 0; i < NumOfElem; ++i) { + MaskVal[0] = N->getMaskElt(i * Width); + if (MaskVal[0] % Width) { return false; + } + + for (unsigned int j = 1; j < Width; ++j) { + MaskVal[j] = N->getMaskElt(i * Width + j); + if (MaskVal[j] != MaskVal[j-1] + 1) { + return false; + } + } } return true; @@ -1611,7 +1624,7 @@ static bool isWordShuffleMask(ShuffleVectorSDNode *N) { bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, unsigned &InsertAtByte, bool &Swap, bool IsLE) { - if (!isWordShuffleMask(N)) + if (!isNByteElemShuffleMask(N, 4)) return false; // Now we look at mask elements 0,4,8,12 @@ -1688,7 +1701,7 @@ bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, bool &Swap, bool IsLE) { assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); // Ensure each byte index of the word is consecutive. - if (!isWordShuffleMask(N)) + if (!isNByteElemShuffleMask(N, 4)) return false; // Now we look at mask elements 0,4,8,12, which are the beginning of words. @@ -1746,6 +1759,66 @@ bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, } } +/// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap +/// if the inputs to the instruction should be swapped and set \p DM to the +/// value for the immediate. +/// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI +/// AND element 0 of the result comes from the first input (LE) or second input +/// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered. +/// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle +/// mask. +bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM, + bool &Swap, bool IsLE) { + assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); + + // Ensure each byte index of the double word is consecutive. + if (!isNByteElemShuffleMask(N, 8)) + return false; + + unsigned M0 = N->getMaskElt(0) / 8; + unsigned M1 = N->getMaskElt(8) / 8; + assert(((M0 | M1) < 4) && "A mask element out of bounds?"); + + // If both vector operands for the shuffle are the same vector, the mask will + // contain only elements from the first one and the second one will be undef. + if (N->getOperand(1).isUndef()) { + if ((M0 | M1) < 2) { + DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1); + Swap = false; + return true; + } else + return false; + } + + if (IsLE) { + if (M0 > 1 && M1 < 2) { + Swap = false; + } else if (M0 < 2 && M1 > 1) { + M0 = (M0 + 2) % 4; + M1 = (M1 + 2) % 4; + Swap = true; + } else + return false; + + // Note: if control flow comes here that means Swap is already set above + DM = (((~M1) & 1) << 1) + ((~M0) & 1); + return true; + } else { // BE + if (M0 < 2 && M1 > 1) { + Swap = false; + } else if (M0 > 1 && M1 < 2) { + M0 = (M0 + 2) % 4; + M1 = (M1 + 2) % 4; + Swap = true; + } else + return false; + + // Note: if control flow comes here that means Swap is already set above + DM = (M0 << 1) + (M1 & 1); + return true; + } +} + /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. 
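To make the little-endian DM encoding above concrete, here is a small standalone C++ sketch (an illustration only, not part of the patch; the helper name and the worked example are invented here) that mirrors the non-swapped LE branch of isXXPERMDIShuffleMask:

// Standalone model of the little-endian DM computation above. M0 and M1
// are the doubleword indices chosen by the shuffle (mask elements 0 and 8,
// each divided by 8): values 0-1 select from the first input vector,
// values 2-3 from the second.
#include <cassert>
#include <cstdio>

static unsigned computeLEPermDI(unsigned M0, unsigned M1) {
  // Non-swapped LE case: element 0 comes from the second input and
  // element 1 from the first, i.e. the (M0 > 1 && M1 < 2) branch.
  assert(M0 > 1 && M1 < 2 && "operands would need swapping otherwise");
  // DM = (~M1 & 1) << 1 | (~M0 & 1), as in the patch.
  return (((~M1) & 1) << 1) + ((~M0) & 1);
}

int main() {
  // Shuffle picking doubleword 0 of the second input, then doubleword 0
  // of the first (mask elements 2 and 0): ((~0 & 1) << 1) + (~2 & 1) = 3.
  std::printf("DM = %u\n", computeLEPermDI(/*M0=*/2, /*M1=*/0));
  return 0;
}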
@@ -7760,6 +7833,19 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl); } + if (Subtarget.hasVSX() && + PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { + if (Swap) + std::swap(V1, V2); + SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1); + SDValue Conv2 = + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? V1 : V2); + + SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2, + DAG.getConstant(ShiftElts, dl, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI); + } + if (Subtarget.hasVSX()) { if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 2f9eb95f6de6..7982a4a9e9fb 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -90,6 +90,10 @@ namespace llvm { /// VECSHL, + /// XXPERMDI - The PPC XXPERMDI instruction + /// + XXPERMDI, + /// The CMPB instruction (takes two operands of i32 or i64). CMPB, @@ -454,6 +458,10 @@ namespace llvm { /// for a XXSLDWI instruction. bool isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, bool &Swap, bool IsLE); + /// isXXPERMDIShuffleMask - Return true if this is a shuffle mask suitable + /// for a XXPERMDI instruction. + bool isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, + bool &Swap, bool IsLE); /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the /// shift amount, otherwise return -1. diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 165970f9678c..295590b2acf6 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -735,12 +735,12 @@ def RLDICL_32_64 : MDForm_1<30, 0, "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, []>, isPPC64; // End fast-isel. 
-let isCodeGenOnly = 1 in -def RLDICL_32 : MDForm_1<30, 0, - (outs gprc:$rA), - (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), - "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, - []>, isPPC64; +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +defm RLDICL_32 : MDForm_1r<30, 0, + (outs gprc:$rA), + (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicl", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; defm RLDICR : MDForm_1r<30, 1, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE), "rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index fd6785e963a6..f3c68c443b1b 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1983,3 +1983,7 @@ PPCInstrInfo::updatedRC(const TargetRegisterClass *RC) const { return &PPC::VSRCRegClass; return RC; } + +int PPCInstrInfo::getRecordFormOpcode(unsigned Opcode) { + return PPC::getRecordFormOpcode(Opcode); +} diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index b30d09e03ec4..8dd4dbb60879 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -290,6 +290,7 @@ public: return Reg >= PPC::V0 && Reg <= PPC::V31; } const TargetRegisterClass *updatedRC(const TargetRegisterClass *RC) const; + static int getRecordFormOpcode(unsigned Opcode); }; } diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 26b99eced23c..8223aa655e38 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -53,6 +53,10 @@ def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3> ]>; +def SDT_PPCxxpermdi: SDTypeProfile<1, 3, [ SDTCisVec<0>, + SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3> +]>; + def SDT_PPCvcmp : SDTypeProfile<1, 3, [ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32> ]>; @@ -170,6 +174,7 @@ def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>; def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>; def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>; def PPCxxinsert : SDNode<"PPCISD::XXINSERT", SDT_PPCVecInsert, []>; +def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>; def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>; def PPCqvfperm : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>; diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 1589ab03e507..c4139ca8b7bd 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -843,7 +843,9 @@ let Uses = [RM] in { def XXPERMDI : XX3Form_2<60, 10, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$DM), - "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, []>; + "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, + [(set v2i64:$XT, (PPCxxpermdi v2i64:$XA, v2i64:$XB, + imm32SExt16:$DM))]>; let isCodeGenOnly = 1 in def XXPERMDIs : XX3Form_2s<60, 10, (outs vsrc:$XT), (ins vsfrc:$XA, u2imm:$DM), "xxpermdi $XT, $XA, $XA, $DM", IIC_VecPerm, []>; diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index ddae5befee3e..b9004cc8a9f5 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -296,7 +296,7 @@ namespace { /// PPC Code Generator Pass Configuration Options. 
class PPCPassConfig : public TargetPassConfig { public: - PPCPassConfig(PPCTargetMachine *TM, PassManagerBase &PM) + PPCPassConfig(PPCTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} PPCTargetMachine &getPPCTargetMachine() const { @@ -316,7 +316,7 @@ public: } // end anonymous namespace TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) { - return new PPCPassConfig(this, PM); + return new PPCPassConfig(*this, PM); } void PPCPassConfig::addIRPasses() { diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index f2838351cee5..b8f5a2083d80 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -55,6 +55,10 @@ public: const Triple &TT = getTargetTriple(); return (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le); }; + + bool isMachineVerifierClean() const override { + return false; + } }; /// PowerPC 32-bit target machine. diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 7ee1317bf72f..5559cdc5fe46 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -215,6 +215,11 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { return LoopHasReductions; } +bool PPCTTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { + MaxLoadSize = 8; + return true; +} + bool PPCTTIImpl::enableInterleavedAccessVectorization() { return true; } @@ -239,9 +244,18 @@ unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) { } unsigned PPCTTIImpl::getCacheLineSize() { - // This is currently only used for the data prefetch pass which is only - // enabled for BG/Q by default. - return CacheLineSize; + // Check first if the user specified a custom line size. + if (CacheLineSize.getNumOccurrences() > 0) + return CacheLineSize; + + // On P7, P8 or P9 we have a cache line size of 128. + unsigned Directive = ST->getDarwinDirective(); + if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 || + Directive == PPC::DIR_PWR9) + return 128; + + // On other processors return a default of 64 bytes. 
+ return 64; } unsigned PPCTTIImpl::getPrefetchDistance() { diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h index 6ce70fbd8778..2e0116fee04c 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -60,6 +60,7 @@ public: /// @{ bool enableAggressiveInterleaving(bool LoopHasReductions); + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize); bool enableInterleavedAccessVectorization(); unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); diff --git a/lib/Target/RISCV/RISCVTargetMachine.cpp b/lib/Target/RISCV/RISCVTargetMachine.cpp index a20331cd0a3e..efdde04c582d 100644 --- a/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -56,5 +56,5 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT, } TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) { - return new TargetPassConfig(this, PM); + return new TargetPassConfig(*this, PM); } diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp index 1da4d3604304..49c67e0819f7 100644 --- a/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -114,7 +114,7 @@ namespace { /// Sparc Code Generator Pass Configuration Options. class SparcPassConfig : public TargetPassConfig { public: - SparcPassConfig(SparcTargetMachine *TM, PassManagerBase &PM) + SparcPassConfig(SparcTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} SparcTargetMachine &getSparcTargetMachine() const { @@ -128,7 +128,7 @@ public: } // namespace TargetPassConfig *SparcTargetMachine::createPassConfig(PassManagerBase &PM) { - return new SparcPassConfig(this, PM); + return new SparcPassConfig(*this, PM); } void SparcPassConfig::addIRPasses() { diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h index 48193fe095be..faf714cbe2c9 100644 --- a/lib/Target/Sparc/SparcTargetMachine.h +++ b/lib/Target/Sparc/SparcTargetMachine.h @@ -40,6 +40,10 @@ public: TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + + bool isMachineVerifierClean() const override { + return false; + } }; /// Sparc 32-bit target machine diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index ede5005fa491..f30d52f859d7 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -119,7 +119,7 @@ namespace { /// SystemZ Code Generator Pass Configuration Options. 
class SystemZPassConfig : public TargetPassConfig { public: - SystemZPassConfig(SystemZTargetMachine *TM, PassManagerBase &PM) + SystemZPassConfig(SystemZTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} SystemZTargetMachine &getSystemZTargetMachine() const { @@ -212,7 +212,7 @@ void SystemZPassConfig::addPreEmitPass() { } TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) { - return new SystemZPassConfig(this, PM); + return new SystemZPassConfig(*this, PM); } TargetIRAnalysis SystemZTargetMachine::getTargetIRAnalysis() { diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h index a10ca64fa632..eb2f17a2091c 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.h +++ b/lib/Target/SystemZ/SystemZTargetMachine.h @@ -51,6 +51,8 @@ public: } bool targetSchedulesPostRAScheduling() const override { return true; }; + + bool isMachineVerifierClean() const override { return false; } }; } // end namespace llvm diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index b974681fb6af..d9b2b8743649 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -129,7 +129,7 @@ namespace { /// WebAssembly Code Generator Pass Configuration Options. class WebAssemblyPassConfig final : public TargetPassConfig { public: - WebAssemblyPassConfig(WebAssemblyTargetMachine *TM, PassManagerBase &PM) + WebAssemblyPassConfig(WebAssemblyTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} WebAssemblyTargetMachine &getWebAssemblyTargetMachine() const { @@ -154,7 +154,7 @@ TargetIRAnalysis WebAssemblyTargetMachine::getTargetIRAnalysis() { TargetPassConfig * WebAssemblyTargetMachine::createPassConfig(PassManagerBase &PM) { - return new WebAssemblyPassConfig(this, PM); + return new WebAssemblyPassConfig(*this, PM); } FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) { diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index 6e062ec59347..b5a926f915af 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -587,6 +587,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VPSLLDQZ256rr: case X86::VPSLLDQZ512rr: Src1Name = getRegName(MI->getOperand(1).getReg()); + LLVM_FALLTHROUGH; case X86::VPSLLDQZ128rm: case X86::VPSLLDQZ256rm: case X86::VPSLLDQZ512rm: @@ -604,6 +605,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VPSRLDQZ256rr: case X86::VPSRLDQZ512rr: Src1Name = getRegName(MI->getOperand(1).getReg()); + LLVM_FALLTHROUGH; case X86::VPSRLDQZ128rm: case X86::VPSRLDQZ256rm: case X86::VPSRLDQZ512rm: @@ -1091,6 +1093,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r) CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, r) Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, m) CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, m) DecodeSubVectorBroadcast(MVT::v8f32, MVT::v2f32, ShuffleMask); @@ -1099,6 +1102,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, r) CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, r) Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; 
CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, m) CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, m) DecodeSubVectorBroadcast(MVT::v16f32, MVT::v2f32, ShuffleMask); diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 313920e02c3e..5582526541ba 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -123,18 +123,26 @@ namespace { EdgeBundles *Bundles; // Return a bitmask of FP registers in block's live-in list. - static unsigned calcLiveInMask(MachineBasicBlock *MBB) { + static unsigned calcLiveInMask(MachineBasicBlock *MBB, bool RemoveFPs) { unsigned Mask = 0; - for (const auto &LI : MBB->liveins()) { - if (LI.PhysReg < X86::FP0 || LI.PhysReg > X86::FP6) - continue; - Mask |= 1 << (LI.PhysReg - X86::FP0); + for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(); + I != MBB->livein_end(); ) { + MCPhysReg Reg = I->PhysReg; + static_assert(X86::FP6 - X86::FP0 == 6, "sequential regnums"); + if (Reg >= X86::FP0 && Reg <= X86::FP6) { + Mask |= 1 << (Reg - X86::FP0); + if (RemoveFPs) { + I = MBB->removeLiveIn(I); + continue; + } + } + ++I; } return Mask; } // Partition all the CFG edges into LiveBundles. - void bundleCFG(MachineFunction &MF); + void bundleCFGRecomputeKillFlags(MachineFunction &MF); MachineBasicBlock *MBB; // Current basic block @@ -327,7 +335,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); // Prepare cross-MBB liveness. - bundleCFG(MF); + bundleCFGRecomputeKillFlags(MF); StackTop = 0; @@ -375,13 +383,15 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { /// registers live-out from a block is identical to the live-in set of all /// successors. This is not enforced by the normal live-in lists since /// registers may be implicitly defined, or not used by all successors. -void FPS::bundleCFG(MachineFunction &MF) { +void FPS::bundleCFGRecomputeKillFlags(MachineFunction &MF) { assert(LiveBundles.empty() && "Stale data in LiveBundles"); LiveBundles.resize(Bundles->getNumBundles()); // Gather the actual live-in masks for all MBBs. for (MachineBasicBlock &MBB : MF) { - const unsigned Mask = calcLiveInMask(&MBB); + setKillFlags(MBB); + + const unsigned Mask = calcLiveInMask(&MBB, false); if (!Mask) continue; // Update MBB ingoing bundle mask. @@ -396,7 +406,6 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { bool Changed = false; MBB = &BB; - setKillFlags(BB); setupBlockStack(); for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { @@ -453,6 +462,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { unsigned Reg = DeadRegs[i]; // Check if Reg is live on the stack. An inline-asm register operand that // is in the clobber list and marked dead might not be live on the stack. + static_assert(X86::FP7 - X86::FP0 == 7, "sequential FP regnumbers"); if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) { DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n"); freeStackSlotAfter(I, Reg-X86::FP0); @@ -506,7 +516,6 @@ void FPS::setupBlockStack() { // Push the fixed live-in registers. for (unsigned i = Bundle.FixCount; i > 0; --i) { - MBB->addLiveIn(X86::ST0+i-1); DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %FP" << unsigned(Bundle.FixStack[i-1]) << '\n'); pushReg(Bundle.FixStack[i-1]); @@ -515,7 +524,8 @@ void FPS::setupBlockStack() { // Kill off unwanted live-ins. This can happen with a critical edge. // FIXME: We could keep these live registers around as zombies. 
They may need // to be revived at the end of a short block. It might save a few instrs. - adjustLiveRegs(calcLiveInMask(MBB), MBB->begin()); + unsigned Mask = calcLiveInMask(MBB, /*RemoveFPs=*/true); + adjustLiveRegs(Mask, MBB->begin()); DEBUG(MBB->dump()); } diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 331e56976db7..328a80304602 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -1062,6 +1062,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, } if (HasFP) { + assert(MF.getRegInfo().isReserved(MachineFramePtr) && "FP reserved"); + // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; // If required, include space for extra hidden slot for stashing base pointer. @@ -1124,13 +1126,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, nullptr, DwarfFramePtr)); } } - - // Mark the FramePtr as live-in in every block. Don't do this again for - // funclet prologues. - if (!IsFunclet) { - for (MachineBasicBlock &EveryMBB : MF) - EveryMBB.addLiveIn(MachineFramePtr); - } } else { assert(!IsFunclet && "funclets without FPs not yet implemented"); NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index c899f0fd5100..2a1633de0a23 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -418,8 +418,6 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { case X86ISD::XOR: case X86ISD::OR: case ISD::ADD: - case ISD::ADDC: - case ISD::ADDE: case ISD::ADDCARRY: case ISD::AND: case ISD::OR: diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 8d78308afe9d..0a41f35f9320 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1,3 +1,4 @@ + //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// // // The LLVM Compiler Infrastructure @@ -80,12 +81,6 @@ static cl::opt<int> ExperimentalPrefLoopAlignment( " of the loop header PC will be 0)."), cl::Hidden); -static cl::opt<bool> MulConstantOptimization( - "mul-constant-optimization", cl::init(true), - cl::desc("Replace 'mul x, Const' with more effective instructions like " - "SHIFT, LEA, etc."), - cl::Hidden); - /// Call this when the user attempts to do something unsupported, like /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike /// report_fatal_error, so calling code should attempt to recover without @@ -317,16 +312,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UREM, VT, Expand); } - for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { - if (VT == MVT::i64 && !Subtarget.is64Bit()) - continue; - // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. 
- setOperationAction(ISD::ADDC, VT, Custom); - setOperationAction(ISD::ADDE, VT, Custom); - setOperationAction(ISD::SUBC, VT, Custom); - setOperationAction(ISD::SUBE, VT, Custom); - } - setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128, @@ -428,7 +413,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, continue; setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); - setOperationAction(ISD::SETCCE, VT, Custom); } setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support @@ -1583,6 +1567,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Support carry in as value rather than glue. setOperationAction(ISD::ADDCARRY, VT, Custom); setOperationAction(ISD::SUBCARRY, VT, Custom); + setOperationAction(ISD::SETCCCARRY, VT, Custom); } if (!Subtarget.is64Bit()) { @@ -16304,6 +16289,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case ISD::SHL: if (Op.getNode()->getFlags().hasNoSignedWrap()) break; + LLVM_FALLTHROUGH; default: NeedOF = true; break; @@ -17167,17 +17153,17 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); - case ISD::SETNE: Invert = true; + case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; - case ISD::SETLT: Swap = true; + case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETGT: Opc = X86ISD::PCMPGT; break; - case ISD::SETGE: Swap = true; + case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break; - case ISD::SETULT: Swap = true; + case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break; - case ISD::SETUGE: Swap = true; + case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break; } @@ -17398,19 +17384,24 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SetCC; } -SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue Carry = Op.getOperand(2); SDValue Cond = Op.getOperand(3); SDLoc DL(Op); - assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only."); + assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get()); - assert(Carry.getOpcode() != ISD::CARRY_FALSE); + // Recreate the carry if needed. 
+ EVT CarryVT = Carry.getValueType(); + APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); + Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), + Carry, DAG.getConstant(NegOne, DL, CarryVT)); + SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); - SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); + SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1)); SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG); if (Op.getSimpleValueType() == MVT::i1) return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); @@ -23269,32 +23260,6 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { return Op; } -static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getNode()->getSimpleValueType(0); - - // Let legalize expand this if it isn't a legal type yet. - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) - return SDValue(); - - SDVTList VTs = DAG.getVTList(VT, MVT::i32); - - unsigned Opc; - bool ExtraOp = false; - switch (Op.getOpcode()) { - default: llvm_unreachable("Invalid code"); - case ISD::ADDC: Opc = X86ISD::ADD; break; - case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; - case ISD::SUBC: Opc = X86ISD::SUB; break; - case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; - } - - if (!ExtraOp) - return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), - Op.getOperand(1)); - return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), - Op.getOperand(1), Op.getOperand(2)); -} - static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { SDNode *N = Op.getNode(); MVT VT = N->getSimpleValueType(0); @@ -23785,7 +23750,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); - case ISD::SETCCE: return LowerSETCCE(Op, DAG); + case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); @@ -23830,10 +23795,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); - case ISD::ADDC: - case ISD::ADDE: - case ISD::SUBC: - case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ADDCARRY: case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); case ISD::ADD: @@ -28946,12 +28907,118 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, EltNo); } +// Try to match patterns such as +// (i16 bitcast (v16i1 x)) +// -> +// (i16 movmsk (16i8 sext (v16i1 x))) +// before the illegal vector is scalarized on subtargets that don't have legal +// vxi1 types. +static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, + const X86Subtarget &Subtarget) { + EVT VT = BitCast.getValueType(); + SDValue N0 = BitCast.getOperand(0); + EVT VecVT = N0->getValueType(0); + + if (!VT.isScalarInteger() || !VecVT.isSimple()) + return SDValue(); + + // With AVX512 vxi1 types are legal and we prefer using k-regs. + // MOVMSK is supported in SSE2 or later. + if (Subtarget.hasAVX512() || !Subtarget.hasSSE2()) + return SDValue(); + + // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and + // v8f64. 
So all legal 128-bit and 256-bit vectors are covered except for
+  // v8i16 and v16i16.
+  // For these two cases, we can shuffle the upper element bytes to a
+  // consecutive sequence at the start of the vector and treat the results as
+  // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
+  // for v16i16 this is not the case, because the shuffle is expensive, so we
+  // avoid sign-extending to this type entirely.
+  // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
+  // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
+  MVT SExtVT;
+  MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+  switch (VecVT.getSimpleVT().SimpleTy) {
+  default:
+    return SDValue();
+  case MVT::v2i1:
+    SExtVT = MVT::v2i64;
+    FPCastVT = MVT::v2f64;
+    break;
+  case MVT::v4i1:
+    SExtVT = MVT::v4i32;
+    FPCastVT = MVT::v4f32;
+    // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
+    // sign-extend to a 256-bit operation to avoid truncation.
+    if (N0->getOpcode() == ISD::SETCC &&
+        N0->getOperand(0)->getValueType(0).is256BitVector() &&
+        Subtarget.hasInt256()) {
+      SExtVT = MVT::v4i64;
+      FPCastVT = MVT::v4f64;
+    }
+    break;
+  case MVT::v8i1:
+    SExtVT = MVT::v8i16;
+    // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
+    // sign-extend to a 256-bit operation to match the compare.
+    // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
+    // 256-bit because the shuffle is cheaper than sign extending the result of
+    // the compare.
+    if (N0->getOpcode() == ISD::SETCC &&
+        N0->getOperand(0)->getValueType(0).is256BitVector() &&
+        Subtarget.hasInt256()) {
+      SExtVT = MVT::v8i32;
+      FPCastVT = MVT::v8f32;
+    }
+    break;
+  case MVT::v16i1:
+    SExtVT = MVT::v16i8;
+    // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
+    // it is not profitable to sign-extend to 256-bit because this will
+    // require an extra cross-lane shuffle which is more expensive than
+    // truncating the result of the compare to 128-bits.
+    break;
+  case MVT::v32i1:
+    // TODO: Handle pre-AVX2 cases by splitting to two v16i1's.
+    if (!Subtarget.hasInt256())
+      return SDValue();
+    SExtVT = MVT::v32i8;
+    break;
+  };
+
+  SDLoc DL(BitCast);
+  SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
+  if (SExtVT == MVT::v8i16) {
+    V = DAG.getBitcast(MVT::v16i8, V);
+    V = DAG.getVectorShuffle(
+        MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8),
+        {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
+  } else
+    assert(SExtVT.getScalarType() != MVT::i16 &&
+           "Vectors of i16 must be shuffled");
+  if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+    V = DAG.getBitcast(FPCastVT, V);
+  V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
+  return DAG.getZExtOrTrunc(V, DL, VT);
+}
+
 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
+                              TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
   EVT SrcVT = N0.getValueType();
 
+  // Try to match patterns such as
+  // (i16 bitcast (v16i1 x))
+  // ->
+  // (i16 movmsk (16i8 sext (v16i1 x)))
+  // before the setcc result is scalarized on subtargets that don't have legal
+  // vxi1 types.
+  if (DCI.isBeforeLegalize())
+    if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
+      return V;
   // Since MMX types are special and don't usually play with other vector types,
   // it's better to handle them early to be sure we emit efficient code by
   // avoiding store-load conversions.
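As a scalar sanity check on the pattern introduced above, the following standalone sketch (ours, not from the patch; pmovmskb semantics paraphrased in the comments) models why bitcasting <16 x i1> to i16 matches MOVMSK applied to the sign-extended v16i8 vector:

// Scalar model: sign-extending each i1 to an i8 yields 0x00 or 0xFF, and
// x86 pmovmskb then gathers the sign bit of every byte lane, so the
// result reproduces the original 16 bits as an i16 mask.
#include <cstdint>
#include <cstdio>

static uint16_t movmskModel(const bool (&Bits)[16]) {
  uint16_t Mask = 0;
  for (int i = 0; i < 16; ++i) {
    uint8_t SExt = Bits[i] ? 0xFF : 0x00;                 // sext i1 -> i8
    Mask |= static_cast<uint16_t>((SExt >> 7) & 1) << i;  // pmovmskb lane i
  }
  return Mask;
}

int main() {
  bool Bits[16] = {true, false, true};  // remaining lanes default to false
  std::printf("0x%04x\n", movmskModel(Bits));  // prints 0x0005
  return 0;
}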
@@ -29944,6 +30011,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: @@ -29974,6 +30042,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: @@ -30008,6 +30077,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: @@ -30036,6 +30106,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: @@ -30933,75 +31004,6 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, } } -static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, - EVT VT, SDLoc DL) { - - auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) { - SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), - DAG.getConstant(Mult, DL, VT)); - Result = DAG.getNode(ISD::SHL, DL, VT, Result, - DAG.getConstant(Shift, DL, MVT::i8)); - Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, N->getOperand(0), - Result); - return Result; - }; - - auto combineMulMulAddOrSub = [&](bool isAdd) { - SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), - DAG.getConstant(9, DL, VT)); - Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT)); - Result = DAG.getNode(isAdd ? 
ISD::ADD : ISD::SUB, DL, VT, N->getOperand(0), - Result); - return Result; - }; - - switch (MulAmt) { - default: - break; - case 11: - // mul x, 11 => add ((shl (mul x, 5), 1), x) - return combineMulShlAddOrSub(5, 1, /*isAdd*/ true); - case 21: - // mul x, 21 => add ((shl (mul x, 5), 2), x) - return combineMulShlAddOrSub(5, 2, /*isAdd*/ true); - case 22: - // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x) - return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), - combineMulShlAddOrSub(5, 2, /*isAdd*/ true)); - case 19: - // mul x, 19 => sub ((shl (mul x, 5), 2), x) - return combineMulShlAddOrSub(5, 2, /*isAdd*/ false); - case 13: - // mul x, 13 => add ((shl (mul x, 3), 2), x) - return combineMulShlAddOrSub(3, 2, /*isAdd*/ true); - case 23: - // mul x, 13 => sub ((shl (mul x, 3), 3), x) - return combineMulShlAddOrSub(3, 3, /*isAdd*/ false); - case 14: - // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x) - return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), - combineMulShlAddOrSub(3, 2, /*isAdd*/ true)); - case 26: - // mul x, 26 => sub ((mul (mul x, 9), 3), x) - return combineMulMulAddOrSub(/*isAdd*/ false); - case 28: - // mul x, 28 => add ((mul (mul x, 9), 3), x) - return combineMulMulAddOrSub(/*isAdd*/ true); - case 29: - // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x) - return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), - combineMulMulAddOrSub(/*isAdd*/ true)); - case 30: - // mul x, 30 => sub (sub ((shl x, 5), x), x) - return DAG.getNode( - ISD::SUB, DL, VT, N->getOperand(0), - DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), - DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), - DAG.getConstant(5, DL, MVT::i8)))); - } - return SDValue(); -} - /// Optimize a single multiply with constant into two operations in order to /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA. static SDValue combineMul(SDNode *N, SelectionDAG &DAG, @@ -31011,8 +31013,6 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalize() && VT.isVector()) return reduceVMULWidth(N, DAG, Subtarget); - if (!MulConstantOptimization) - return SDValue(); // An imul is usually smaller than the alternative sequence. 
if (DAG.getMachineFunction().getFunction()->optForMinSize()) return SDValue(); @@ -31068,8 +31068,7 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, DL, VT)); - } else if (!Subtarget.slowLEA()) - NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL); + } if (!NewMul) { assert(MulAmt != 0 && @@ -34558,8 +34557,7 @@ static SDValue combineX86ADD(SDNode *N, SelectionDAG &DAG, isOneConstant(Carry.getOperand(1)))) Carry = Carry.getOperand(0); - if (Carry.getOpcode() == ISD::SETCC || - Carry.getOpcode() == X86ISD::SETCC || + if (Carry.getOpcode() == X86ISD::SETCC || Carry.getOpcode() == X86ISD::SETCC_CARRY) { if (Carry.getConstantOperandVal(0) == X86::COND_B) return DCI.CombineTo(N, SDValue(N, 0), Carry.getOperand(1)); @@ -35126,7 +35124,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::VSELECT: case ISD::SELECT: case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget); - case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget); + case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget); case ISD::ADD: return combineAdd(N, DAG, Subtarget); case ISD::SUB: return combineSub(N, DAG, Subtarget); @@ -35510,6 +35508,7 @@ TargetLowering::ConstraintWeight switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + LLVM_FALLTHROUGH; case 'R': case 'q': case 'Q': @@ -35861,6 +35860,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &X86::GR64RegClass); break; } + LLVM_FALLTHROUGH; // 32-bit fallthrough case 'Q': // Q_REGS if (VT == MVT::i32 || VT == MVT::f32) diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 18106c2eb394..f51b6641db2f 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -1163,7 +1163,7 @@ namespace llvm { SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 33fbd41bb631..0aee30081a35 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -195,6 +195,7 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, // It's not always legal to reference the low 8-bit of the larger // register in 32-bit mode. return false; + LLVM_FALLTHROUGH; case X86::MOVSX32rr16: case X86::MOVZX32rr16: case X86::MOVSX64rr16: diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 53a8e83b36fc..cb21f1bd7706 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -323,7 +323,7 @@ namespace { /// X86 Code Generator Pass Configuration Options. 
class X86PassConfig : public TargetPassConfig { public: - X86PassConfig(X86TargetMachine *TM, PassManagerBase &PM) + X86PassConfig(X86TargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} X86TargetMachine &getX86TargetMachine() const { @@ -369,7 +369,7 @@ INITIALIZE_PASS(X86ExecutionDepsFix, "x86-execution-deps-fix", "X86 Execution Dependency Fix", false, false) TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { - return new X86PassConfig(this, PM); + return new X86PassConfig(*this, PM); } void X86PassConfig::addIRPasses() { @@ -433,6 +433,7 @@ bool X86PassConfig::addPreISel() { void X86PassConfig::addPreRegAlloc() { if (getOptLevel() != CodeGenOpt::None) { + addPass(&LiveRangeShrinkID); addPass(createX86FixupSetCC()); addPass(createX86OptimizeLEAs()); addPass(createX86CallFrameOptimization()); diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index cf933f52604e..1bf267d34ec2 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -49,6 +49,10 @@ public: TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + + bool isMachineVerifierClean() const override { + return false; + } }; } // end namespace llvm diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp index 2950e2efbea3..1a1cbd474888 100644 --- a/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/lib/Target/XCore/XCoreTargetMachine.cpp @@ -54,7 +54,7 @@ namespace { /// XCore Code Generator Pass Configuration Options. class XCorePassConfig : public TargetPassConfig { public: - XCorePassConfig(XCoreTargetMachine *TM, PassManagerBase &PM) + XCorePassConfig(XCoreTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} XCoreTargetMachine &getXCoreTargetMachine() const { @@ -70,7 +70,7 @@ public: } // end anonymous namespace TargetPassConfig *XCoreTargetMachine::createPassConfig(PassManagerBase &PM) { - return new XCorePassConfig(this, PM); + return new XCorePassConfig(*this, PM); } void XCorePassConfig::addIRPasses() { diff --git a/lib/Transforms/Coroutines/CoroCleanup.cpp b/lib/Transforms/Coroutines/CoroCleanup.cpp index 5cf2a8c25d83..359876627fce 100644 --- a/lib/Transforms/Coroutines/CoroCleanup.cpp +++ b/lib/Transforms/Coroutines/CoroCleanup.cpp @@ -101,7 +101,9 @@ namespace { struct CoroCleanup : FunctionPass { static char ID; // Pass identification, replacement for typeid - CoroCleanup() : FunctionPass(ID) {} + CoroCleanup() : FunctionPass(ID) { + initializeCoroCleanupPass(*PassRegistry::getPassRegistry()); + } std::unique_ptr<Lowerer> L; diff --git a/lib/Transforms/Coroutines/CoroEarly.cpp b/lib/Transforms/Coroutines/CoroEarly.cpp index b52989186165..ba05896af150 100644 --- a/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/lib/Transforms/Coroutines/CoroEarly.cpp @@ -183,7 +183,9 @@ namespace { struct CoroEarly : public FunctionPass { static char ID; // Pass identification, replacement for typeid. 
-  CoroEarly() : FunctionPass(ID) {}
+  CoroEarly() : FunctionPass(ID) {
+    initializeCoroEarlyPass(*PassRegistry::getPassRegistry());
+  }
 
   std::unique_ptr<Lowerer> L;
diff --git a/lib/Transforms/Coroutines/CoroElide.cpp b/lib/Transforms/Coroutines/CoroElide.cpp
index acb22449142b..42fd6d746145 100644
--- a/lib/Transforms/Coroutines/CoroElide.cpp
+++ b/lib/Transforms/Coroutines/CoroElide.cpp
@@ -258,7 +258,9 @@ static bool replaceDevirtTrigger(Function &F) {
 namespace {
 struct CoroElide : FunctionPass {
   static char ID;
-  CoroElide() : FunctionPass(ID) {}
+  CoroElide() : FunctionPass(ID) {
+    initializeCoroElidePass(*PassRegistry::getPassRegistry());
+  }
 
   std::unique_ptr<Lowerer> L;
diff --git a/lib/Transforms/Coroutines/CoroSplit.cpp b/lib/Transforms/Coroutines/CoroSplit.cpp
index cd549e4be282..613b4a7f03e9 100644
--- a/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -681,7 +681,9 @@ namespace {
 struct CoroSplit : public CallGraphSCCPass {
   static char ID; // Pass identification, replacement for typeid
-  CoroSplit() : CallGraphSCCPass(ID) {}
+  CoroSplit() : CallGraphSCCPass(ID) {
+    initializeCoroSplitPass(*PassRegistry::getPassRegistry());
+  }
 
   bool Run = false;
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index 4c417f1c55eb..bc0967448cdd 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -652,12 +652,21 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
   // only split block when necessary:
   PHINode *FirstPhi = getFirstPHI(PreReturn);
   unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size();
+  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
+    Value *CommonValue = PN->getIncomingValue(0);
+    if (all_of(PN->incoming_values(),
+               [&](Value *V) { return V == CommonValue; }))
+      return CommonValue;
+    return nullptr;
+  };
+
   if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) {
     NewReturnBlock = NewReturnBlock->splitBasicBlock(
         NewReturnBlock->getFirstNonPHI()->getIterator());
     BasicBlock::iterator I = PreReturn->begin();
     Instruction *Ins = &NewReturnBlock->front();
+    SmallVector<Instruction *, 4> DeadPhis;
     while (I != PreReturn->end()) {
       PHINode *OldPhi = dyn_cast<PHINode>(I);
       if (!OldPhi)
@@ -674,8 +683,22 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
         RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE);
         OldPhi->removeIncomingValue(NewE);
       }
+
+      // After splitting the incoming values, the old phi may become trivial.
+      // Keeping the trivial phi can introduce a definition inside the outline
+      // region which is live-out, causing unnecessary overhead (load, store,
+      // arg passing, etc.).
+      if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
+        OldPhi->replaceAllUsesWith(OldPhiVal);
+        DeadPhis.push_back(OldPhi);
+      }
+
       ++I;
     }
+
+    for (auto *DP : DeadPhis)
+      DP->eraseFromParent();
+
     for (auto E : OI->ReturnBlockPreds) {
       BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
       NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index 659cb9df00a2..9dede4cedd1d 100644
--- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -6,14 +6,8 @@
 // License. See LICENSE.TXT for details.
// //===----------------------------------------------------------------------===// -// -// This pass prepares a module containing type metadata for ThinLTO by splitting -// it into regular and thin LTO parts if possible, and writing both parts to -// a multi-module bitcode file. Modules that do not contain type metadata are -// written unmodified as a single module. -// -//===----------------------------------------------------------------------===// +#include "llvm/Transforms/IPO/ThinLTOBitcodeWriter.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -436,3 +430,15 @@ ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str, raw_ostream *ThinLinkOS) { return new WriteThinLTOBitcode(Str, ThinLinkOS); } + +PreservedAnalyses +llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) { + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + writeThinLTOBitcode(OS, ThinLinkOS, + [&FAM](Function &F) -> AAResults & { + return FAM.getResult<AAManager>(F); + }, + M, &AM.getResult<ModuleSummaryIndexAnalysis>(M)); + return PreservedAnalyses::all(); +} diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 92a38f26dde7..b44499ec4be9 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3838,24 +3838,24 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { // Mark any parameters that are known to be non-null with the nonnull // attribute. This is helpful for inlining calls to functions with null // checks on their arguments. - SmallVector<unsigned, 4> Indices; + SmallVector<unsigned, 4> ArgNos; unsigned ArgNo = 0; for (Value *V : CS.args()) { if (V->getType()->isPointerTy() && !CS.paramHasAttr(ArgNo, Attribute::NonNull) && isKnownNonNullAt(V, CS.getInstruction(), &DT)) - Indices.push_back(ArgNo + AttributeList::FirstArgIndex); + ArgNos.push_back(ArgNo); ArgNo++; } assert(ArgNo == CS.arg_size() && "sanity check"); - if (!Indices.empty()) { + if (!ArgNos.empty()) { AttributeList AS = CS.getAttributes(); LLVMContext &Ctx = CS.getInstruction()->getContext(); - AS = AS.addAttribute(Ctx, Indices, - Attribute::get(Ctx, Attribute::NonNull)); + AS = AS.addParamAttribute(Ctx, ArgNos, + Attribute::get(Ctx, Attribute::NonNull)); CS.setAttributes(AS); Changed = true; } diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 1e30dbf6b55a..b2d95271479c 100644 --- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -182,6 +182,14 @@ static cl::opt<bool> cl::desc("Use this option to turn on/off " "memory intrinsic size profiling.")); +// Emit branch probability as optimization remarks. +static cl::opt<bool> + EmitBranchProbability("pgo-emit-branch-prob", cl::init(false), cl::Hidden, + cl::desc("When this option is on, the annotated " + "branch probability will be emitted as " + " optimization remarks: -Rpass-analysis=" + "pgo-instr-use")); + // Command line option to turn on CFG dot dump after profile annotation. 
// Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts extern cl::opt<bool> PGOViewCounts; @@ -192,6 +200,39 @@ extern cl::opt<std::string> ViewBlockFreqFuncName; namespace { +// Return a string describing the branch condition that can be +// used in static branch probability heuristics: +std::string getBranchCondString(Instruction *TI) { + BranchInst *BI = dyn_cast<BranchInst>(TI); + if (!BI || !BI->isConditional()) + return std::string(); + + Value *Cond = BI->getCondition(); + ICmpInst *CI = dyn_cast<ICmpInst>(Cond); + if (!CI) + return std::string(); + + std::string result; + raw_string_ostream OS(result); + OS << CmpInst::getPredicateName(CI->getPredicate()) << "_"; + CI->getOperand(0)->getType()->print(OS, true); + + Value *RHS = CI->getOperand(1); + ConstantInt *CV = dyn_cast<ConstantInt>(RHS); + if (CV) { + if (CV->isZero()) + OS << "_Zero"; + else if (CV->isOne()) + OS << "_One"; + else if (CV->isAllOnesValue()) + OS << "_MinusOne"; + else + OS << "_Const"; + } + OS.flush(); + return result; +} + /// The select instruction visitor plays three roles specified /// by the mode. In \c VM_counting mode, it simply counts the number of /// select instructions. In \c VM_instrument mode, it inserts code to count @@ -1424,6 +1465,29 @@ void setProfMetadata(Module *M, Instruction *TI, ArrayRef<uint64_t> EdgeCounts, for (const auto &W : Weights) { dbgs() << W << " "; } dbgs() << "\n";); TI->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); + if (EmitBranchProbability) { + std::string BrCondStr = getBranchCondString(TI); + if (BrCondStr.empty()) + return; + + unsigned WSum = + std::accumulate(Weights.begin(), Weights.end(), 0, + [](unsigned w1, unsigned w2) { return w1 + w2; }); + uint64_t TotalCount = + std::accumulate(EdgeCounts.begin(), EdgeCounts.end(), 0, + [](uint64_t c1, uint64_t c2) { return c1 + c2; }); + BranchProbability BP(Weights[0], WSum); + std::string BranchProbStr; + raw_string_ostream OS(BranchProbStr); + OS << BP; + OS << " (total count : " << TotalCount << ")"; + OS.flush(); + Function *F = TI->getParent()->getParent(); + emitOptimizationRemarkAnalysis( + F->getContext(), "pgo-use-annot", *F, TI->getDebugLoc(), + Twine(BrCondStr) + + " is true with probability : " + Twine(BranchProbStr)); + } } template <> struct GraphTraits<PGOUseFunc *> { diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 300085eccb0c..325b64cd8b43 100644 --- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -7,24 +7,7 @@ // //===----------------------------------------------------------------------===// // -// Coverage instrumentation that works with AddressSanitizer -// and potentially with other Sanitizers. -// -// We create a Guard variable with the same linkage -// as the function and inject this code into the entry block (SCK_Function) -// or all blocks (SCK_BB): -// if (Guard < 0) { -// __sanitizer_cov(&Guard); -// } -// The accesses to Guard are atomic. The rest of the logic is -// in __sanitizer_cov (it's fine to call it more than once). -// -// With SCK_Edge we also split critical edges this effectively -// instrumenting all edges. -// -// This coverage implementation provides very limited data: -// it only tells if a given function (block) was ever executed. No counters. -// But for many use cases this is what we need and the added slowdown small. +// Coverage instrumentation done on LLVM IR level, works with Sanitizers. 
// //===----------------------------------------------------------------------===// @@ -56,9 +39,6 @@ using namespace llvm; #define DEBUG_TYPE "sancov" -static const char *const SanCovModuleInitName = "__sanitizer_cov_module_init"; -static const char *const SanCovName = "__sanitizer_cov"; -static const char *const SanCovWithCheckName = "__sanitizer_cov_with_check"; static const char *const SanCovTracePCIndirName = "__sanitizer_cov_trace_pc_indir"; static const char *const SanCovTracePCName = "__sanitizer_cov_trace_pc"; @@ -84,12 +64,6 @@ static cl::opt<int> ClCoverageLevel( "3: all blocks and critical edges"), cl::Hidden, cl::init(0)); -static cl::opt<unsigned> ClCoverageBlockThreshold( - "sanitizer-coverage-block-threshold", - cl::desc("Use a callback with a guard check inside it if there are" - " more than this number of blocks."), - cl::Hidden, cl::init(0)); - static cl::opt<bool> ClExperimentalTracePC("sanitizer-coverage-trace-pc", cl::desc("Experimental pc tracing"), cl::Hidden, cl::init(false)); @@ -151,6 +125,8 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) { Options.TraceGep |= ClGEPTracing; Options.TracePC |= ClExperimentalTracePC; Options.TracePCGuard |= ClTracePCGuard; + if (!Options.TracePCGuard && !Options.TracePC) + Options.TracePCGuard = true; // TracePCGuard is default. Options.NoPrune |= !ClPruneBlocks; return Options; } @@ -184,18 +160,10 @@ private: ArrayRef<Instruction *> SwitchTraceTargets); bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks); void CreateFunctionGuardArray(size_t NumGuards, Function &F); - void SetNoSanitizeMetadata(Instruction *I); - void InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx, - bool UseCalls); - unsigned NumberOfInstrumentedBlocks() { - return SanCovFunction->getNumUses() + - SanCovWithCheckFunction->getNumUses(); - } + void InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx); StringRef getSanCovTracePCGuardSection() const; StringRef getSanCovTracePCGuardSectionStart() const; StringRef getSanCovTracePCGuardSectionEnd() const; - Function *SanCovFunction; - Function *SanCovWithCheckFunction; Function *SanCovTracePCIndir; Function *SanCovTracePC, *SanCovTracePCGuard; Function *SanCovTraceCmpFunction[4]; @@ -209,7 +177,6 @@ private: LLVMContext *C; const DataLayout *DL; - GlobalVariable *GuardArray; GlobalVariable *FunctionGuardArray; // for trace-pc-guard. bool HasSancovGuardsSection; @@ -230,16 +197,11 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { IntptrPtrTy = PointerType::getUnqual(IntptrTy); Type *VoidTy = Type::getVoidTy(*C); IRBuilder<> IRB(*C); - Type *Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty()); Int64PtrTy = PointerType::getUnqual(IRB.getInt64Ty()); Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); Int64Ty = IRB.getInt64Ty(); Int32Ty = IRB.getInt32Ty(); - SanCovFunction = checkSanitizerInterfaceFunction( - M.getOrInsertFunction(SanCovName, VoidTy, Int32PtrTy)); - SanCovWithCheckFunction = checkSanitizerInterfaceFunction( - M.getOrInsertFunction(SanCovWithCheckName, VoidTy, Int32PtrTy)); SanCovTracePCIndir = checkSanitizerInterfaceFunction( M.getOrInsertFunction(SanCovTracePCIndirName, VoidTy, IntptrTy)); SanCovTraceCmpFunction[0] = @@ -278,41 +240,10 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { SanCovTracePCGuard = checkSanitizerInterfaceFunction(M.getOrInsertFunction( SanCovTracePCGuardName, VoidTy, Int32PtrTy)); - // At this point we create a dummy array of guards because we don't - // know how many elements we will need. 
- Type *Int32Ty = IRB.getInt32Ty(); - - if (!Options.TracePCGuard) - GuardArray = - new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage, - nullptr, "__sancov_gen_cov_tmp"); - for (auto &F : M) runOnFunction(F); - auto N = NumberOfInstrumentedBlocks(); - - GlobalVariable *RealGuardArray = nullptr; - if (!Options.TracePCGuard) { - // Now we know how many elements we need. Create an array of guards - // with one extra element at the beginning for the size. - Type *Int32ArrayNTy = ArrayType::get(Int32Ty, N + 1); - RealGuardArray = new GlobalVariable( - M, Int32ArrayNTy, false, GlobalValue::PrivateLinkage, - Constant::getNullValue(Int32ArrayNTy), "__sancov_gen_cov"); - - // Replace the dummy array with the real one. - GuardArray->replaceAllUsesWith( - IRB.CreatePointerCast(RealGuardArray, Int32PtrTy)); - GuardArray->eraseFromParent(); - } - // Create variable for module (compilation unit) name - Constant *ModNameStrConst = - ConstantDataArray::getString(M.getContext(), M.getName(), true); - GlobalVariable *ModuleName = new GlobalVariable( - M, ModNameStrConst->getType(), true, GlobalValue::PrivateLinkage, - ModNameStrConst, "__sancov_gen_modname"); if (Options.TracePCGuard) { if (HasSancovGuardsSection) { Function *CtorFunc; @@ -339,18 +270,7 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority); } } - } else if (!Options.TracePC) { - Function *CtorFunc; - std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions( - M, SanCovModuleCtorName, SanCovModuleInitName, - {Int32PtrTy, IntptrTy, Int8PtrTy, Int8PtrTy}, - {IRB.CreatePointerCast(RealGuardArray, Int32PtrTy), - ConstantInt::get(IntptrTy, N), Constant::getNullValue(Int8PtrTy), - IRB.CreatePointerCast(ModuleName, Int8PtrTy)}); - - appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority); } - return true; } @@ -494,13 +414,12 @@ bool SanitizerCoverageModule::InjectCoverage(Function &F, return false; case SanitizerCoverageOptions::SCK_Function: CreateFunctionGuardArray(1, F); - InjectCoverageAtBlock(F, F.getEntryBlock(), 0, false); + InjectCoverageAtBlock(F, F.getEntryBlock(), 0); return true; default: { - bool UseCalls = ClCoverageBlockThreshold < AllBlocks.size(); CreateFunctionGuardArray(AllBlocks.size(), F); for (size_t i = 0, N = AllBlocks.size(); i < N; i++) - InjectCoverageAtBlock(F, *AllBlocks[i], i, UseCalls); + InjectCoverageAtBlock(F, *AllBlocks[i], i); return true; } } @@ -517,8 +436,7 @@ void SanitizerCoverageModule::InjectCoverageForIndirectCalls( Function &F, ArrayRef<Instruction *> IndirCalls) { if (IndirCalls.empty()) return; - if (!Options.TracePC && !Options.TracePCGuard) - return; + assert(Options.TracePC || Options.TracePCGuard); for (auto I : IndirCalls) { IRBuilder<> IRB(I); CallSite CS(I); @@ -625,13 +543,8 @@ void SanitizerCoverageModule::InjectTraceForCmp( } } -void SanitizerCoverageModule::SetNoSanitizeMetadata(Instruction *I) { - I->setMetadata(I->getModule()->getMDKindID("nosanitize"), - MDNode::get(*C, None)); -} - void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, - size_t Idx, bool UseCalls) { + size_t Idx) { BasicBlock::iterator IP = BB.getFirstInsertionPt(); bool IsEntryBB = &BB == &F.getEntryBlock(); DebugLoc EntryLoc; @@ -651,47 +564,14 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, if (Options.TracePC) { IRB.CreateCall(SanCovTracePC); // gets the PC using GET_CALLER_PC. IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge. 
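// (Editorial note: the empty inline-asm call above acts as a barrier so
// that identical callback calls emitted in different blocks are not
// tail-merged by later passes, which would collapse distinct coverage PCs
// into a single call site.)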
- } else if (Options.TracePCGuard) { + } else { + assert(Options.TracePCGuard); auto GuardPtr = IRB.CreateIntToPtr( IRB.CreateAdd(IRB.CreatePointerCast(FunctionGuardArray, IntptrTy), ConstantInt::get(IntptrTy, Idx * 4)), Int32PtrTy); - if (!UseCalls) { - auto GuardLoad = IRB.CreateLoad(GuardPtr); - GuardLoad->setAtomic(AtomicOrdering::Monotonic); - GuardLoad->setAlignment(8); - SetNoSanitizeMetadata(GuardLoad); // Don't instrument with e.g. asan. - auto Cmp = IRB.CreateICmpNE( - GuardLoad, Constant::getNullValue(GuardLoad->getType())); - auto Ins = SplitBlockAndInsertIfThen( - Cmp, &*IP, false, MDBuilder(*C).createBranchWeights(1, 100000)); - IRB.SetInsertPoint(Ins); - IRB.SetCurrentDebugLocation(EntryLoc); - } IRB.CreateCall(SanCovTracePCGuard, GuardPtr); IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge. - } else { - Value *GuardP = IRB.CreateAdd( - IRB.CreatePointerCast(GuardArray, IntptrTy), - ConstantInt::get(IntptrTy, (1 + NumberOfInstrumentedBlocks()) * 4)); - GuardP = IRB.CreateIntToPtr(GuardP, Int32PtrTy); - if (UseCalls) { - IRB.CreateCall(SanCovWithCheckFunction, GuardP); - } else { - LoadInst *Load = IRB.CreateLoad(GuardP); - Load->setAtomic(AtomicOrdering::Monotonic); - Load->setAlignment(4); - SetNoSanitizeMetadata(Load); - Value *Cmp = - IRB.CreateICmpSGE(Constant::getNullValue(Load->getType()), Load); - Instruction *Ins = SplitBlockAndInsertIfThen( - Cmp, &*IP, false, MDBuilder(*C).createBranchWeights(1, 100000)); - IRB.SetInsertPoint(Ins); - IRB.SetCurrentDebugLocation(EntryLoc); - // __sanitizer_cov gets the PC of the instruction using GET_CALLER_PC. - IRB.CreateCall(SanCovFunction, GuardP); - IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge. - } } } diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index ee493a8ec7e1..7b625b9b136e 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -305,7 +305,7 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) { /// Infer nonnull attributes for the arguments at the specified callsite. 
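/// (Editorial example, not from the patch: for a call such as
///   free(i8* %p)
/// reached only on paths where LVI can prove %p != null, the argument's
/// index is collected and the call is rewritten to
///   free(i8* nonnull %p)
/// via addParamAttribute below.)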
static bool processCallSite(CallSite CS, LazyValueInfo *LVI) { - SmallVector<unsigned, 4> Indices; + SmallVector<unsigned, 4> ArgNos; unsigned ArgNo = 0; for (Value *V : CS.args()) { @@ -318,18 +318,19 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) { LVI->getPredicateAt(ICmpInst::ICMP_EQ, V, ConstantPointerNull::get(Type), CS.getInstruction()) == LazyValueInfo::False) - Indices.push_back(ArgNo + AttributeList::FirstArgIndex); + ArgNos.push_back(ArgNo); ArgNo++; } assert(ArgNo == CS.arg_size() && "sanity check"); - if (Indices.empty()) + if (ArgNos.empty()) return false; AttributeList AS = CS.getAttributes(); LLVMContext &Ctx = CS.getInstruction()->getContext(); - AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull)); + AS = AS.addParamAttribute(Ctx, ArgNos, + Attribute::get(Ctx, Attribute::NonNull)); CS.setAttributes(AS); return true; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 0d6e0538261d..0490d93f6455 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -80,10 +80,9 @@ MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore, struct llvm::GVN::Expression { uint32_t opcode; Type *type; - bool commutative; SmallVector<uint32_t, 4> varargs; - Expression(uint32_t o = ~2U) : opcode(o), commutative(false) {} + Expression(uint32_t o = ~2U) : opcode(o) {} bool operator==(const Expression &other) const { if (opcode != other.opcode) @@ -247,7 +246,6 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!"); if (e.varargs[0] > e.varargs[1]) std::swap(e.varargs[0], e.varargs[1]); - e.commutative = true; } if (CmpInst *C = dyn_cast<CmpInst>(I)) { @@ -258,7 +256,6 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { Predicate = CmpInst::getSwappedPredicate(Predicate); } e.opcode = (C->getOpcode() << 8) | Predicate; - e.commutative = true; } else if (InsertValueInst *E = dyn_cast<InsertValueInst>(I)) { for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); II != IE; ++II) @@ -284,7 +281,6 @@ GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode, Predicate = CmpInst::getSwappedPredicate(Predicate); } e.opcode = (Opcode << 8) | Predicate; - e.commutative = true; return e; } @@ -352,25 +348,25 @@ GVN::ValueTable::~ValueTable() = default; /// add - Insert a value into the table with a specified value number. 
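// (Editorial aside: createExpr above canonicalizes commutative operations
// by sorting the two operand value numbers, so for example
//   %x = add i32 %a, %b
//   %y = add i32 %b, %a
// produce identical Expressions and receive the same value number;
// comparisons get the analogous treatment via getSwappedPredicate.)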
void GVN::ValueTable::add(Value *V, uint32_t num) { valueNumbering.insert(std::make_pair(V, num)); - if (PHINode *PN = dyn_cast<PHINode>(V)) - NumberingPhi[num] = PN; } uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) { if (AA->doesNotAccessMemory(C)) { Expression exp = createExpr(C); - uint32_t e = assignExpNewValueNum(exp).first; + uint32_t &e = expressionNumbering[exp]; + if (!e) e = nextValueNumber++; valueNumbering[C] = e; return e; } else if (AA->onlyReadsMemory(C)) { Expression exp = createExpr(C); - auto ValNum = assignExpNewValueNum(exp); - if (ValNum.second) { - valueNumbering[C] = ValNum.first; - return ValNum.first; + uint32_t &e = expressionNumbering[exp]; + if (!e) { + e = nextValueNumber++; + valueNumbering[C] = e; + return e; } if (!MD) { - uint32_t e = assignExpNewValueNum(exp).first; + e = nextValueNumber++; valueNumbering[C] = e; return e; } @@ -526,29 +522,23 @@ uint32_t GVN::ValueTable::lookupOrAdd(Value *V) { case Instruction::ExtractValue: exp = createExtractvalueExpr(cast<ExtractValueInst>(I)); break; - case Instruction::PHI: - valueNumbering[V] = nextValueNumber; - NumberingPhi[nextValueNumber] = cast<PHINode>(V); - return nextValueNumber++; default: valueNumbering[V] = nextValueNumber; return nextValueNumber++; } - uint32_t e = assignExpNewValueNum(exp).first; + uint32_t& e = expressionNumbering[exp]; + if (!e) e = nextValueNumber++; valueNumbering[V] = e; return e; } /// Returns the value number of the specified value. Fails if /// the value has not yet been numbered. -uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const { +uint32_t GVN::ValueTable::lookup(Value *V) const { DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V); - if (Verify) { - assert(VI != valueNumbering.end() && "Value not numbered?"); - return VI->second; - } - return (VI != valueNumbering.end()) ? VI->second : 0; + assert(VI != valueNumbering.end() && "Value not numbered?"); + return VI->second; } /// Returns the value number of the given comparison, @@ -559,29 +549,21 @@ uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode, CmpInst::Predicate Predicate, Value *LHS, Value *RHS) { Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS); - return assignExpNewValueNum(exp).first; + uint32_t& e = expressionNumbering[exp]; + if (!e) e = nextValueNumber++; + return e; } /// Remove all entries from the ValueTable. void GVN::ValueTable::clear() { valueNumbering.clear(); expressionNumbering.clear(); - NumberingPhi.clear(); - PhiTranslateTable.clear(); - BlockRPONumber.clear(); nextValueNumber = 1; - Expressions.clear(); - ExprIdx.clear(); - nextExprNumber = 0; } /// Remove a value from the value numbering. void GVN::ValueTable::erase(Value *V) { - uint32_t Num = valueNumbering.lookup(V); valueNumbering.erase(V); - // If V is PHINode, V <--> value number is an one-to-one mapping. - if (isa<PHINode>(V)) - NumberingPhi.erase(Num); } /// verifyRemoved - Verify that the value is removed from all internal data @@ -1469,104 +1451,6 @@ bool GVN::processLoad(LoadInst *L) { return false; } -/// Return a pair the first field showing the value number of \p Exp and the -/// second field showing whether it is a value number newly created. 
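// (Editorial note: assignExpNewValueNum and the phi-translation machinery
// below are removed wholesale; callers go back to the direct
//   uint32_t &e = expressionNumbering[exp];
//   if (!e) e = nextValueNumber++;
// idiom restored in the hunks above.)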
-std::pair<uint32_t, bool> -GVN::ValueTable::assignExpNewValueNum(Expression &Exp) { - uint32_t &e = expressionNumbering[Exp]; - bool CreateNewValNum = !e; - if (CreateNewValNum) { - Expressions.push_back(Exp); - if (ExprIdx.size() < nextValueNumber + 1) - ExprIdx.resize(nextValueNumber * 2); - e = nextValueNumber; - ExprIdx[nextValueNumber++] = nextExprNumber++; - } - return {e, CreateNewValNum}; -} - -void GVN::ValueTable::assignBlockRPONumber(Function &F) { - uint32_t NextBlockNumber = 1; - ReversePostOrderTraversal<Function *> RPOT(&F); - for (BasicBlock *BB : RPOT) - BlockRPONumber[BB] = NextBlockNumber++; -} - -/// Return whether all the values related with the same \p num are -/// defined in \p BB. -bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB, - GVN &Gvn) { - LeaderTableEntry *Vals = &Gvn.LeaderTable[Num]; - while (Vals && Vals->BB == BB) - Vals = Vals->Next; - return !Vals; -} - -/// Wrap phiTranslateImpl to provide caching functionality. -uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred, - const BasicBlock *PhiBlock, uint32_t Num, - GVN &Gvn) { - auto FindRes = PhiTranslateTable.find({Num, Pred}); - if (FindRes != PhiTranslateTable.end()) - return FindRes->second; - uint32_t NewNum = phiTranslateImpl(Pred, PhiBlock, Num, Gvn); - PhiTranslateTable.insert({{Num, Pred}, NewNum}); - return NewNum; -} - -/// Translate value number \p Num using phis, so that it has the values of -/// the phis in BB. -uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred, - const BasicBlock *PhiBlock, - uint32_t Num, GVN &Gvn) { - if (PHINode *PN = NumberingPhi[Num]) { - if (BlockRPONumber[Pred] >= BlockRPONumber[PhiBlock]) - return Num; - for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) { - if (PN->getParent() == PhiBlock && PN->getIncomingBlock(i) == Pred) - if (uint32_t TransVal = lookup(PN->getIncomingValue(i), false)) - return TransVal; - } - return Num; - } - - // If there is any value related with Num is defined in a BB other than - // PhiBlock, it cannot depend on a phi in PhiBlock without going through - // a backedge. We can do an early exit in that case to save compile time. - if (!areAllValsInBB(Num, PhiBlock, Gvn)) - return Num; - - if (ExprIdx[Num] == 0 || Num >= ExprIdx.size()) - return Num; - Expression Exp = Expressions[ExprIdx[Num]]; - - for (unsigned i = 0; i < Exp.varargs.size(); i++) { - // For InsertValue and ExtractValue, some varargs are index numbers - // instead of value numbers. Those index numbers should not be - // translated. 
- if ((i > 1 && Exp.opcode == Instruction::InsertValue) || - (i > 0 && Exp.opcode == Instruction::ExtractValue)) - continue; - Exp.varargs[i] = phiTranslate(Pred, PhiBlock, Exp.varargs[i], Gvn); - } - - if (Exp.commutative) { - assert(Exp.varargs.size() == 2 && "Unsupported commutative expression!"); - if (Exp.varargs[0] > Exp.varargs[1]) { - std::swap(Exp.varargs[0], Exp.varargs[1]); - uint32_t Opcode = Exp.opcode >> 8; - if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) - Exp.opcode = (Opcode << 8) | - CmpInst::getSwappedPredicate( - static_cast<CmpInst::Predicate>(Exp.opcode & 255)); - } - } - - if (uint32_t NewNum = expressionNumbering[Exp]) - return NewNum; - return Num; -} - // In order to find a leader for a given value number at a // specific basic block, we first obtain the list of all Values for that number, // and then scan the list to find one whose block dominates the block in @@ -1972,7 +1856,6 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, // Fabricate val-num for dead-code in order to suppress assertion in // performPRE(). assignValNumForDeadCode(); - VN.assignBlockRPONumber(F); bool PREChanged = true; while (PREChanged) { PREChanged = performPRE(F); @@ -2062,9 +1945,7 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, success = false; break; } - uint32_t TValNo = - VN.phiTranslate(Pred, Instr->getParent(), VN.lookup(Op), *this); - if (Value *V = findLeader(Pred, TValNo)) { + if (Value *V = findLeader(Pred, VN.lookup(Op))) { Instr->setOperand(i, V); } else { success = false; @@ -2081,12 +1962,10 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, Instr->insertBefore(Pred->getTerminator()); Instr->setName(Instr->getName() + ".pre"); Instr->setDebugLoc(Instr->getDebugLoc()); - - unsigned Num = VN.lookupOrAdd(Instr); - VN.add(Instr, Num); + VN.add(Instr, ValNo); // Update the availability map to include the new instruction. 
- addToLeaderTable(Num, Instr, Pred); + addToLeaderTable(ValNo, Instr, Pred); return true; } @@ -2135,8 +2014,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { break; } - uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this); - Value *predV = findLeader(P, TValNo); + Value *predV = findLeader(P, ValNo); if (!predV) { predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P)); PREPred = P; diff --git a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index a143b9a3c645..930696b036c0 100644 --- a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -98,11 +98,20 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) { CallInst *CI; ICmpInst *CmpI = dyn_cast<ICmpInst>(BSI.getCondition()); + CmpInst::Predicate Predicate; + uint64_t ValueComparedTo = 0; if (!CmpI) { CI = dyn_cast<CallInst>(BSI.getCondition()); + Predicate = CmpInst::ICMP_NE; + ValueComparedTo = 0; } else { - if (CmpI->getPredicate() != CmpInst::ICMP_NE) + Predicate = CmpI->getPredicate(); + if (Predicate != CmpInst::ICMP_NE && Predicate != CmpInst::ICMP_EQ) return false; + ConstantInt *CmpConstOperand = dyn_cast<ConstantInt>(CmpI->getOperand(1)); + if (!CmpConstOperand) + return false; + ValueComparedTo = CmpConstOperand->getZExtValue(); CI = dyn_cast<CallInst>(CmpI->getOperand(0)); } @@ -121,9 +130,8 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) { MDBuilder MDB(CI->getContext()); MDNode *Node; - // If expect value is equal to 1 it means that we are more likely to take - // branch 0, in other case more likely is branch 1. - if (ExpectedValue->isOne()) + if ((ExpectedValue->getZExtValue() == ValueComparedTo) == + (Predicate == CmpInst::ICMP_EQ)) Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight); else Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight); diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 5e9f40019ce8..27809f5b6f66 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -613,7 +613,7 @@ private: return CClass; } void initializeCongruenceClasses(Function &F); - const Expression *makePossiblePhiOfOps(Instruction *, bool, + const Expression *makePossiblePhiOfOps(Instruction *, SmallPtrSetImpl<Value *> &); void addPhiOfOps(PHINode *Op, BasicBlock *BB, Instruction *ExistingValue); @@ -1937,7 +1937,8 @@ void NewGVN::touchAndErase(Map &M, const KeyType &Key) { } void NewGVN::addAdditionalUsers(Value *To, Value *User) const { - AdditionalUsers[To].insert(User); + if (isa<Instruction>(To)) + AdditionalUsers[To].insert(User); } void NewGVN::markUsersTouched(Value *V) { @@ -2423,7 +2424,7 @@ static bool okayForPHIOfOps(const Instruction *I) { // When we see an instruction that is an op of phis, generate the equivalent phi // of ops form. const Expression * -NewGVN::makePossiblePhiOfOps(Instruction *I, bool HasBackedge, +NewGVN::makePossiblePhiOfOps(Instruction *I, SmallPtrSetImpl<Value *> &Visited) { if (!okayForPHIOfOps(I)) return nullptr; @@ -2438,24 +2439,6 @@ NewGVN::makePossiblePhiOfOps(Instruction *I, bool HasBackedge, return nullptr; unsigned IDFSNum = InstrToDFSNum(I); - // Pretty much all of the instructions we can convert to phi of ops over a - // backedge that are adds, are really induction variables, and those are - // pretty much pointless to convert. 
This is very coarse-grained for a - // test, so if we do find some value, we can change it later. - // But otherwise, what can happen is we convert the induction variable from - // - // i = phi (0, tmp) - // tmp = i + 1 - // - // to - // i = phi (0, tmpphi) - // tmpphi = phi(1, tmpphi+1) - // - // Which we don't want to happen. We could just avoid this for all non-cycle - // free phis, and we made go that route. - if (HasBackedge && I->getOpcode() == Instruction::Add) - return nullptr; - SmallPtrSet<const Value *, 8> ProcessedPHIs; // TODO: We don't do phi translation on memory accesses because it's // complicated. For a load, we'd need to be able to simulate a new memoryuse, @@ -2470,6 +2453,16 @@ NewGVN::makePossiblePhiOfOps(Instruction *I, bool HasBackedge, // Convert op of phis to phi of ops for (auto &Op : I->operands()) { + // TODO: We can't handle expressions that must be recursively translated + // IE + // a = phi (b, c) + // f = use a + // g = f + phi of something + // To properly make a phi of ops for g, we'd have to properly translate and + // use the instruction for f. We should add this by splitting out the + // instruction creation we do below. + if (isa<Instruction>(Op) && PHINodeUses.count(cast<Instruction>(Op))) + return nullptr; if (!isa<PHINode>(Op)) continue; auto *OpPHI = cast<PHINode>(Op); @@ -2782,8 +2775,7 @@ void NewGVN::valueNumberInstruction(Instruction *I) { // Make a phi of ops if necessary if (Symbolized && !isa<ConstantExpression>(Symbolized) && !isa<VariableExpression>(Symbolized) && PHINodeUses.count(I)) { - // FIXME: Backedge argument - auto *PHIE = makePossiblePhiOfOps(I, false, Visited); + auto *PHIE = makePossiblePhiOfOps(I, Visited); if (PHIE) Symbolized = PHIE; } diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index ed72099ec3ed..24d28a6c2831 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" @@ -141,16 +142,77 @@ static bool definedInCaller(const SetVector<BasicBlock *> &Blocks, Value *V) { return false; } -void CodeExtractor::findInputsOutputs(ValueSet &Inputs, - ValueSet &Outputs) const { +void CodeExtractor::findAllocas(ValueSet &SinkCands) const { + Function *Func = (*Blocks.begin())->getParent(); + for (BasicBlock &BB : *Func) { + if (Blocks.count(&BB)) + continue; + for (Instruction &II : BB) { + auto *AI = dyn_cast<AllocaInst>(&II); + if (!AI) + continue; + + // Returns true if matching life time markers are found within + // the outlined region. 
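// (Editorial sketch of the IR pattern being matched, assuming the usual
// lifetime intrinsics; not verbatim from any test:
//   %buf = alloca [64 x i8]
//   %p = bitcast [64 x i8]* %buf to i8*
//   call void @llvm.lifetime.start(i64 64, i8* %p) ; inside the region
//   ...
//   call void @llvm.lifetime.end(i64 64, i8* %p)   ; inside the region
// When both markers live inside the extracted region, the alloca is a
// sinking candidate.)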
+ auto GetLifeTimeMarkers = [&](Instruction *Addr) { + Instruction *LifeStart = nullptr, *LifeEnd = nullptr; + for (User *U : Addr->users()) { + if (!definedInRegion(Blocks, U)) + return false; + + IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(U); + if (IntrInst) { + if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start) + LifeStart = IntrInst; + if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) + LifeEnd = IntrInst; + } + } + return LifeStart && LifeEnd; + }; + + if (GetLifeTimeMarkers(AI)) { + SinkCands.insert(AI); + continue; + } + + // Follow the bitcast: + Instruction *MarkerAddr = nullptr; + for (User *U : AI->users()) { + if (U->stripPointerCasts() == AI) { + Instruction *Bitcast = cast<Instruction>(U); + if (GetLifeTimeMarkers(Bitcast)) { + MarkerAddr = Bitcast; + continue; + } + } + if (!definedInRegion(Blocks, U)) { + MarkerAddr = nullptr; + break; + } + } + if (MarkerAddr) { + if (!definedInRegion(Blocks, MarkerAddr)) + SinkCands.insert(MarkerAddr); + SinkCands.insert(AI); + } + } + } +} + +void CodeExtractor::findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, + const ValueSet &SinkCands) const { + for (BasicBlock *BB : Blocks) { // If a used value is defined outside the region, it's an input. If an // instruction is used outside the region, it's an output. for (Instruction &II : *BB) { for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE; - ++OI) - if (definedInCaller(Blocks, *OI)) - Inputs.insert(*OI); + ++OI) { + Value *V = *OI; + if (!SinkCands.count(V) && definedInCaller(Blocks, V)) + Inputs.insert(V); + } for (User *U : II.users()) if (!definedInRegion(Blocks, U)) { @@ -718,7 +780,7 @@ Function *CodeExtractor::extractCodeRegion() { if (!isEligible()) return nullptr; - ValueSet inputs, outputs; + ValueSet inputs, outputs, SinkingCands; // Assumption: this is a single-entry code region, and the header is the first // block in the region. @@ -757,8 +819,15 @@ Function *CodeExtractor::extractCodeRegion() { "newFuncRoot"); newFuncRoot->getInstList().push_back(BranchInst::Create(header)); + findAllocas(SinkingCands); + // Find inputs to, outputs from the code region. - findInputsOutputs(inputs, outputs); + findInputsOutputs(inputs, outputs, SinkingCands); + + // Now sink all instructions which only have non-phi uses inside the region + for (auto *II : SinkingCands) + cast<Instruction>(II)->moveBefore(*newFuncRoot, + newFuncRoot->getFirstInsertionPt()); // Calculate the exit blocks for the extracted region and the total exit // weights for each of those blocks. diff --git a/lib/Transforms/Utils/PredicateInfo.cpp b/lib/Transforms/Utils/PredicateInfo.cpp index 8877aeafecde..9e71cba4f1b7 100644 --- a/lib/Transforms/Utils/PredicateInfo.cpp +++ b/lib/Transforms/Utils/PredicateInfo.cpp @@ -541,7 +541,40 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter, // // TODO: Use this algorithm to perform fast single-variable renaming in // promotememtoreg and memoryssa. -void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpsToRename) { +void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) { + // Sort OpsToRename since we are going to iterate it. + SmallVector<Value *, 8> OpsToRename(OpSet.begin(), OpSet.end()); + std::sort(OpsToRename.begin(), OpsToRename.end(), [&](const Value *A, + const Value *B) { + auto *ArgA = dyn_cast_or_null<Argument>(A); + auto *ArgB = dyn_cast_or_null<Argument>(B); + + // If A and B are args, order them based on their arg no. 
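// (Editorial sketch of the resulting total order: arguments first, by
// argument number; then instructions in different blocks, by dominance of
// their parent blocks; then instructions within one block, by local
// dominance via a lazily created OrderedBasicBlock.)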
+ if (ArgA && !ArgB) + return true; + if (ArgB && !ArgA) + return false; + if (ArgA && ArgB) + return ArgA->getArgNo() < ArgB->getArgNo(); + + // Else, A are B are instructions. + // If they belong to different BBs, order them by the dominance of BBs. + auto *AInst = cast<Instruction>(A); + auto *BInst = cast<Instruction>(B); + if (AInst->getParent() != BInst->getParent()) + return DT.dominates(AInst->getParent(), BInst->getParent()); + + // Else, A and B belong to the same BB. + // Order A and B by their dominance. + auto *BB = AInst->getParent(); + auto LookupResult = OBBMap.find(BB); + if (LookupResult != OBBMap.end()) + return LookupResult->second->dominates(AInst, BInst); + + auto Result = OBBMap.insert({BB, make_unique<OrderedBasicBlock>(BB)}); + return Result.first->second->dominates(AInst, BInst); + }); + ValueDFS_Compare Compare(OBBMap); // Compute liveness, and rename in O(uses) per Op. for (auto *Op : OpsToRename) { diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 49effda5d833..cc6c47e8f978 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -85,20 +85,6 @@ static bool isCallingConvCCompatible(CallInst *CI) { return false; } -/// Return true if it only matters that the value is equal or not-equal to zero. -static bool isOnlyUsedInZeroEqualityComparison(Value *V) { - for (User *U : V->users()) { - if (ICmpInst *IC = dyn_cast<ICmpInst>(U)) - if (IC->isEquality()) - if (Constant *C = dyn_cast<Constant>(IC->getOperand(1))) - if (C->isNullValue()) - continue; - // Unknown instruction. - return false; - } - return true; -} - /// Return true if it is only used in equality comparisons with With. static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) { for (User *U : V->users()) { diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 2b83b8426d14..8b9a64c220cc 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7170,10 +7170,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { Type *VectorTy; unsigned C = getInstructionCost(I, VF, VectorTy); - // Note: Even if all instructions are scalarized, return true if any memory - // accesses appear in the loop to get benefits from address folding etc. bool TypeNotScalarized = - VF > 1 && !VectorTy->isVoidTy() && TTI.getNumberOfParts(VectorTy) < VF; + VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; return VectorizationCostTy(C, TypeNotScalarized); } @@ -7312,7 +7310,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); - VectorTy = ToVectorTy(RetTy, VF); + VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); auto SE = PSE.getSE(); // TODO: We need to estimate the cost of intrinsic calls. @@ -7445,9 +7443,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } else if (Legal->isUniform(Op2)) { Op2VK = TargetTransformInfo::OK_UniformValue; } - SmallVector<const Value *, 4> Operands(I->operand_values()); - return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + SmallVector<const Value *, 4> Operands(I->operand_values()); + unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; + return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, + Op2VK, Op1VP, Op2VP, Operands); } case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); @@ -7470,7 +7469,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } case Instruction::Store: case Instruction::Load: { - VectorTy = ToVectorTy(getMemInstValueType(I), VF); + unsigned Width = VF; + if (Width > 1) { + InstWidening Decision = getWideningDecision(I, Width); + assert(Decision != CM_Unknown && + "CM decision should be taken at this point"); + if (Decision == CM_Scalarize) + Width = 1; + } + VectorTy = ToVectorTy(getMemInstValueType(I), Width); return getMemoryInstructionCost(I, VF); } case Instruction::ZExt: @@ -7495,7 +7502,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } Type *SrcScalarTy = I->getOperand(0)->getType(); - Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF); + Type *SrcVecTy = + VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; if (canTruncateToMinimalBitwidth(I, VF)) { // This cast is going to be shrunk. This may remove the cast or it might // turn it into slightly different cast. For example, if MinBW == 16, @@ -7515,7 +7523,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } } - return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); + unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; + return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); } case Instruction::Call: { bool NeedToScalarize; diff --git a/test/Analysis/CFLAliasAnalysis/Andersen/struct.ll b/test/Analysis/CFLAliasAnalysis/Andersen/struct.ll new file mode 100644 index 000000000000..c1d25c1e3c21 --- /dev/null +++ b/test/Analysis/CFLAliasAnalysis/Andersen/struct.ll @@ -0,0 +1,18 @@ +; Ensures that our struct ops are sane. + +; RUN: opt < %s -disable-basicaa -cfl-anders-aa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -aa-pipeline=cfl-anders-aa -passes=aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s + +; Since we ignore non-pointer values, we effectively ignore extractvalue +; instructions. This means that %c "doesn't exist" in test_structure's graph, +; so we currently get MayAlias. +; XFAIL: * + +; CHECK-LABEL: Function: test_structure +; CHECK: NoAlias: i64** %c, { i64**, i64** }* %a +define void @test_structure() { + %a = alloca {i64**, i64**}, align 8 + %b = load {i64**, i64**}, {i64**, i64**}* %a + %c = extractvalue {i64**, i64**} %b, 0 + ret void +} diff --git a/test/Bitcode/thinlto-function-summary-callgraph.ll b/test/Bitcode/thinlto-function-summary-callgraph.ll index 8cc60ad63362..566f3a077e7b 100644 --- a/test/Bitcode/thinlto-function-summary-callgraph.ll +++ b/test/Bitcode/thinlto-function-summary-callgraph.ll @@ -11,20 +11,23 @@ ; RUN: llvm-lto -thinlto-index-stats %p/Inputs/thinlto-function-summary-callgraph-combined.1.bc | FileCheck %s --check-prefix=OLD-COMBINED ; CHECK: <SOURCE_FILENAME +; CHECK-NEXT: <GLOBALVAR ; CHECK-NEXT: <FUNCTION ; "func" -; CHECK-NEXT: <FUNCTION op0=4 op1=4 +; CHECK-NEXT: <FUNCTION op0=17 op1=4 ; CHECK: <GLOBALVAL_SUMMARY_BLOCK ; CHECK-NEXT: <VERSION ; See if the call to func is registered. 
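; (Editorial note: op0/op1 in the FUNCTION record above are an offset and
;  length into the string table blob 'undefinedglobmainfunc' checked below:
;  "undefinedglob" is 13 characters and "main" is 4, so "func" starts at
;  offset 13 + 4 = 17 with length 4 — hence op0=17 op1=4.)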
-; CHECK-NEXT: <PERMODULE {{.*}} op4=1/> +; CHECK-NEXT: <PERMODULE {{.*}} op3=1 ; CHECK-NEXT: </GLOBALVAL_SUMMARY_BLOCK> ; CHECK: <STRTAB_BLOCK -; CHECK-NEXT: blob data = 'mainfunc' +; CHECK-NEXT: blob data = 'undefinedglobmainfunc' ; COMBINED: <GLOBALVAL_SUMMARY_BLOCK ; COMBINED-NEXT: <VERSION +; Only 2 VALUE_GUID since reference to undefinedglob should not be included in +; combined index. ; COMBINED-NEXT: <VALUE_GUID op0=[[FUNCID:[0-9]+]] op1=7289175272376759421/> ; COMBINED-NEXT: <VALUE_GUID ; COMBINED-NEXT: <COMBINED @@ -40,10 +43,12 @@ target triple = "x86_64-unknown-linux-gnu" define i32 @main() #0 { entry: call void (...) @func() - ret i32 0 + %u = load i32, i32* @undefinedglob + ret i32 %u } declare void @func(...) #1 +@undefinedglob = external global i32 ; OLD: Index {{.*}} contains 1 nodes (1 functions, 0 alias, 0 globals) and 1 edges (0 refs and 1 calls) ; OLD-COMBINED: Index {{.*}} contains 2 nodes (2 functions, 0 alias, 0 globals) and 1 edges (0 refs and 1 calls) diff --git a/test/CodeGen/AArch64/GlobalISel/localizer.mir b/test/CodeGen/AArch64/GlobalISel/localizer.mir index 8fbb2040157e..5bf8dac79860 100644 --- a/test/CodeGen/AArch64/GlobalISel/localizer.mir +++ b/test/CodeGen/AArch64/GlobalISel/localizer.mir @@ -12,6 +12,7 @@ define void @non_local_phi_use_followed_by_use() { ret void } define void @non_local_phi_use_followed_by_use_fi() { ret void } define void @float_non_local_phi_use_followed_by_use_fi() { ret void } + define void @non_local_phi() { ret void } ... --- @@ -310,3 +311,51 @@ body: | %3(s32) = PHI %0(s32), %bb.1 %2(s32) = G_FADD %3, %0 ... + +--- +# Make sure we don't insert a constant before PHIs. +# This used to happen for loops of one basic block. +# CHECK-LABEL: name: non_local_phi +name: non_local_phi +legalized: true +regBankSelected: true +tracksRegLiveness: true + +# CHECK: registers: +# Existing registers should be left untouched +# CHECK: - { id: 0, class: fpr } +#CHECK-NEXT: - { id: 1, class: fpr } +#CHECK-NEXT: - { id: 2, class: fpr } +#CHECK-NEXT: - { id: 3, class: fpr } +# The newly created reg should be on the same regbank/regclass as its origin. +#CHECK-NEXT: - { id: 4, class: fpr } + +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } + +# CHECK: body: +# CHECK: %0(s32) = G_FCONSTANT float 1.0 +# CHECK-NEXT: %1(s32) = G_FADD %0, %0 + +# CHECK: bb.1: +# CHECK: %3(s32) = PHI %1(s32), %bb.0, %4(s32), %bb.1 +# CHECK: %4(s32) = G_FCONSTANT float 1.0 + +# CHECK-NEXT: %2(s32) = G_FADD %3, %1 +body: | + bb.0: + successors: %bb.1 + + %0(s32) = G_FCONSTANT float 1.0 + %1(s32) = G_FADD %0, %0 + + bb.1: + successors: %bb.1 + + %3(s32) = PHI %1(s32), %bb.0, %0(s32), %bb.1 + %2(s32) = G_FADD %3, %1 + G_BR %bb.1 +... 
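Editorial aside: the localizer test above guards the invariant that PHI nodes
stay contiguous at the top of a basic block, so a pass that materializes a
constant into a block containing PHIs must skip past them. A minimal sketch of
the usual insertion pattern follows (MachineBasicBlock::getFirstNonPHI and
insert are real APIs; MBB and NewMI stand in for the pass's own state, and
this is illustrative rather than the literal fix):

    // Place a rematerialized instruction after any leading PHIs so the
    // block stays valid MIR.
    MachineBasicBlock::iterator IP = MBB.getFirstNonPHI();
    MBB.insert(IP, NewMI);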
diff --git a/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir b/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir index 96436209451b..c35d1719f84c 100644 --- a/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir +++ b/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir @@ -13,7 +13,6 @@ name: main alignment: 2 exposesReturnsTwice: false -noVRegs: false legalized: true regBankSelected: true selected: false diff --git a/test/CodeGen/AArch64/addcarry-crash.ll b/test/CodeGen/AArch64/addcarry-crash.ll new file mode 100644 index 000000000000..ba833e0b5873 --- /dev/null +++ b/test/CodeGen/AArch64/addcarry-crash.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s | FileCheck %s +target triple = "arm64-apple-ios7.0" + +define i64 @foo(i64* nocapture readonly %ptr, i64 %a, i64 %b, i64 %c) local_unnamed_addr #0 { +; CHECK: ldr w8, [x0, #4] +; CHECK: lsr x9, x1, #32 +; CHECK: cmn x3, x2 +; CHECK: mul x8, x8, x9 +; CHECK: cinc x0, x8, hs +; CHECK: ret +entry: + %0 = lshr i64 %a, 32 + %1 = load i64, i64* %ptr, align 8 + %2 = lshr i64 %1, 32 + %3 = mul nuw i64 %2, %0 + %4 = add i64 %c, %b + %5 = icmp ult i64 %4, %c + %6 = zext i1 %5 to i64 + %7 = add i64 %3, %6 + ret i64 %7 +} + +attributes #0 = { norecurse nounwind readonly } diff --git a/test/CodeGen/AArch64/misched-fusion-aes.ll b/test/CodeGen/AArch64/misched-fusion-aes.ll index 1d8787212579..bd7c69c910c0 100644 --- a/test/CodeGen/AArch64/misched-fusion-aes.ll +++ b/test/CodeGen/AArch64/misched-fusion-aes.ll @@ -1,5 +1,7 @@ -; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57A72 -; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57A72 +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a53 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKCORTEX +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKCORTEX +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKCORTEX +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a73 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKCORTEX ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKM1 declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d, <16 x i8> %k) @@ -72,22 +74,22 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, ret void ; CHECK-LABEL: aesea: -; CHECKA57A72: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VA]] -; CHECKA57A72: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VB]] -; CHECKA57A72: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VC]] -; CHECKA57A72: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VD]] -; CHECKA57A72: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VE]] -; CHECKA57A72: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VF]] -; CHECKA57A72: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VG]] -; CHECKA57A72: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VH]] +; CHECKCORTEX: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VA]] +; CHECKCORTEX: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} +; 
CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VB]] +; CHECKCORTEX: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VC]] +; CHECKCORTEX: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VD]] +; CHECKCORTEX: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VE]] +; CHECKCORTEX: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VF]] +; CHECKCORTEX: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VG]] +; CHECKCORTEX: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VH]] ; CHECKM1: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VA]] @@ -173,22 +175,22 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, ret void ; CHECK-LABEL: aesda: -; CHECKA57A72: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VA]] -; CHECKA57A72: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VB]] -; CHECKA57A72: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VC]] -; CHECKA57A72: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VD]] -; CHECKA57A72: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VE]] -; CHECKA57A72: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VF]] -; CHECKA57A72: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VG]] -; CHECKA57A72: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VH]] +; CHECKCORTEX: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VA]] +; CHECKCORTEX: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VB]] +; CHECKCORTEX: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VC]] +; CHECKCORTEX: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VD]] +; CHECKCORTEX: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VE]] +; CHECKCORTEX: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VF]] +; CHECKCORTEX: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VG]] +; CHECKCORTEX: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VH]] ; CHECKM1: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VA]] diff --git a/test/CodeGen/AArch64/pr33172.ll b/test/CodeGen/AArch64/pr33172.ll new file mode 100644 index 000000000000..1e1da78b28ff --- /dev/null +++ b/test/CodeGen/AArch64/pr33172.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s | FileCheck %s + +; CHECK-LABEL: pr33172 +; CHECK: ldp +; CHECK: stp + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios10.3.0" + +@main.b = external global [200 x float], align 8 +@main.x = external global [200 x float], align 8 + +; Function Attrs: nounwind ssp +define void @pr33172() local_unnamed_addr { +entry: + %wide.load8281058.3 = load i64, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.b, i64 0, i64 12) to i64*), align 8 + %wide.load8291059.3 = load i64, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.b, i64 0, i64 
14) to i64*), align 8 + store i64 %wide.load8281058.3, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.x, i64 0, i64 12) to i64*), align 8 + store i64 %wide.load8291059.3, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.x, i64 0, i64 14) to i64*), align 8 + %wide.load8281058.4 = load i64, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.b, i64 0, i64 16) to i64*), align 8 + %wide.load8291059.4 = load i64, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.b, i64 0, i64 18) to i64*), align 8 + store i64 %wide.load8281058.4, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.x, i64 0, i64 16) to i64*), align 8 + store i64 %wide.load8291059.4, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.x, i64 0, i64 18) to i64*), align 8 + tail call void @llvm.memset.p0i8.i64(i8* bitcast ([200 x float]* @main.b to i8*), i8 0, i64 undef, i32 8, i1 false) #2 + unreachable +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #1 + +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll index a3a78d326a62..02642142ae2c 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #0 ; FUNC-LABEL: {{^}}ds_swizzle: -; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:100 +; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:swizzle(BITMASK_PERM,"00p11") ; CHECK: s_waitcnt lgkmcnt define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind { %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 diff --git a/test/CodeGen/AMDGPU/merge-m0.mir b/test/CodeGen/AMDGPU/merge-m0.mir index 064db49924e1..720642ad1ddb 100644 --- a/test/CodeGen/AMDGPU/merge-m0.mir +++ b/test/CodeGen/AMDGPU/merge-m0.mir @@ -50,7 +50,6 @@ name: test alignment: 0 exposesReturnsTwice: false -noVRegs: false legalized: false regBankSelected: false selected: false diff --git a/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir b/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir index cd50e01032c3..cd0d410368c7 100644 --- a/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir +++ b/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir @@ -86,7 +86,6 @@ name: sdwa_imm_operand alignment: 0 exposesReturnsTwice: false -noVRegs: false legalized: false regBankSelected: false selected: false @@ -248,7 +247,6 @@ body: | name: sdwa_sgpr_operand alignment: 0 exposesReturnsTwice: false -noVRegs: false legalized: false regBankSelected: false selected: false diff --git a/test/CodeGen/AMDGPU/waitcnt-permute.mir b/test/CodeGen/AMDGPU/waitcnt-permute.mir new file mode 100644 index 000000000000..44dbd38f2d30 --- /dev/null +++ b/test/CodeGen/AMDGPU/waitcnt-permute.mir @@ -0,0 +1,33 @@ +# RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s + +--- | + define float @waitcnt-permute(i32 %x, i32 %y) { + entry: + %0 = call i32 @llvm.amdgcn.ds.bpermute(i32 %x, i32 %y) + %1 = bitcast i32 %0 to float + %2 = fadd float 1.000000e+00, %1 + ret float %2 + } + + declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) + +... 
+--- +# CHECK-LABEL: name: waitcnt-permute{{$}} +# CHECK: DS_BPERMUTE_B32 +# CHECK-NEXT: S_WAITCNT 127 + +name: waitcnt-permute +liveins: + - { reg: '%vgpr0' } + - { reg: '%vgpr1' } + - { reg: '%sgpr30_sgpr31' } +body: | + bb.0: + liveins: %vgpr0, %vgpr1, %sgpr30_sgpr31 + + %vgpr0 = DS_BPERMUTE_B32 killed %vgpr0, killed %vgpr1, 0, implicit %exec + %vgpr0 = V_ADD_F32_e32 1065353216, killed %vgpr0, implicit %exec + S_SETPC_B64_return killed %sgpr30_sgpr31, implicit killed %vgpr0 + +... diff --git a/test/CodeGen/ARM/cmpxchg-O0.ll b/test/CodeGen/ARM/cmpxchg-O0.ll index f8ad2bbbbe0e..a3be72112c76 100644 --- a/test/CodeGen/ARM/cmpxchg-O0.ll +++ b/test/CodeGen/ARM/cmpxchg-O0.ll @@ -10,10 +10,11 @@ define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind { ; CHECK: dmb ish ; CHECK: uxtb [[DESIRED:r[0-9]+]], [[DESIRED]] ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: mov{{s?}} [[STATUS:r[0-9]+]], #0 ; CHECK: ldrexb [[OLD:r[0-9]+]], [r0] ; CHECK: cmp [[OLD]], [[DESIRED]] ; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: strexb [[STATUS:r[0-9]+]], r2, [r0] +; CHECK: strexb [[STATUS]], r2, [r0] ; CHECK: cmp{{(\.w)?}} [[STATUS]], #0 ; CHECK: bne [[RETRY]] ; CHECK: [[DONE]]: @@ -29,10 +30,11 @@ define { i16, i1 } @test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind ; CHECK: dmb ish ; CHECK: uxth [[DESIRED:r[0-9]+]], [[DESIRED]] ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: mov{{s?}} [[STATUS:r[0-9]+]], #0 ; CHECK: ldrexh [[OLD:r[0-9]+]], [r0] ; CHECK: cmp [[OLD]], [[DESIRED]] ; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: strexh [[STATUS:r[0-9]+]], r2, [r0] +; CHECK: strexh [[STATUS]], r2, [r0] ; CHECK: cmp{{(\.w)?}} [[STATUS]], #0 ; CHECK: bne [[RETRY]] ; CHECK: [[DONE]]: @@ -48,10 +50,11 @@ define { i32, i1 } @test_cmpxchg_32(i32* %addr, i32 %desired, i32 %new) nounwind ; CHECK: dmb ish ; CHECK-NOT: uxt ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: mov{{s?}} [[STATUS:r[0-9]+]], #0 ; CHECK: ldrex [[OLD:r[0-9]+]], [r0] ; CHECK: cmp [[OLD]], [[DESIRED]] ; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: strex [[STATUS:r[0-9]+]], r2, [r0] +; CHECK: strex [[STATUS]], r2, [r0] ; CHECK: cmp{{(\.w)?}} [[STATUS]], #0 ; CHECK: bne [[RETRY]] ; CHECK: [[DONE]]: diff --git a/test/CodeGen/ARM/v6-jumptable-clobber.mir b/test/CodeGen/ARM/v6-jumptable-clobber.mir index 0e9bc42565f3..6577ef848671 100644 --- a/test/CodeGen/ARM/v6-jumptable-clobber.mir +++ b/test/CodeGen/ARM/v6-jumptable-clobber.mir @@ -190,7 +190,6 @@ name: foo alignment: 1 exposesReturnsTwice: false -noVRegs: true legalized: false regBankSelected: false selected: false @@ -289,7 +288,6 @@ body: | name: bar alignment: 1 exposesReturnsTwice: false -noVRegs: true legalized: false regBankSelected: false selected: false diff --git a/test/CodeGen/AVR/rot.ll b/test/CodeGen/AVR/rot.ll index e43daf3e6aa8..a7b77d97ba69 100644 --- a/test/CodeGen/AVR/rot.ll +++ b/test/CodeGen/AVR/rot.ll @@ -6,7 +6,7 @@ define i8 @rol8(i8 %val, i8 %amt) { ; CHECK: andi r22, 7 - ; CHECK-NEXT: cp r22, r0 + ; CHECK-NEXT: cpi r22, 0 ; CHECK-NEXT: breq LBB0_2 ; CHECK-NEXT: LBB0_1: @@ -32,7 +32,7 @@ define i8 @rol8(i8 %val, i8 %amt) { define i8 @ror8(i8 %val, i8 %amt) { ; CHECK: andi r22, 7 - ; CHECK-NEXT: cp r22, r0 + ; CHECK-NEXT: cpi r22, 0 ; CHECK-NEXT: breq LBB1_2 ; CHECK-NEXT: LBB1_1: diff --git a/test/CodeGen/Hexagon/invalid-dotnew-attempt.mir b/test/CodeGen/Hexagon/invalid-dotnew-attempt.mir new file mode 100644 index 000000000000..2233e3289f11 --- /dev/null +++ b/test/CodeGen/Hexagon/invalid-dotnew-attempt.mir @@ -0,0 +1,17 @@ +# RUN: llc 
-march=hexagon -start-after if-converter %s -o - | FileCheck %s +# CHECK: p0 = r0 +# CHECK-NEXT: jumpr r31 + +# Make sure that the packetizer does not attempt to newify the J2_jumpr +# only because of the def-use of p0. + +--- +name: fred +tracksRegLiveness: true +body: | + bb.0: + liveins: %d0 + %p0 = C2_tfrrp %r0 + J2_jumpr %r31, implicit-def %pc, implicit %p0 +... + diff --git a/test/CodeGen/Hexagon/loop-idiom/pmpy-long-loop.ll b/test/CodeGen/Hexagon/loop-idiom/pmpy-long-loop.ll new file mode 100644 index 000000000000..b25010f2a90f --- /dev/null +++ b/test/CodeGen/Hexagon/loop-idiom/pmpy-long-loop.ll @@ -0,0 +1,62 @@ +; RUN: opt -march=hexagon -hexagon-loop-idiom -S < %s | FileCheck %s +; +; The number of nested selects caused the simplification loop to take +; more than the maximum number of iterations. This caused the compiler +; to crash under suspicion of an infinite loop. This (still reduced) +; testcase shows a legitimate case where this limit was exceeded. +; Instead of crashing, gracefully abort the simplification. +; +; Check for sane output. +; CHECK: define void @fred + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +define void @fred() unnamed_addr #0 { +b0: + %v1 = select i1 false, i32 undef, i32 2 + br label %b2 + +b2: ; preds = %b2, %b0 + %v3 = sext i16 undef to i32 + %v4 = add nsw i32 %v1, %v3 + %v5 = select i1 undef, i32 undef, i32 %v4 + %v6 = icmp slt i32 %v5, undef + %v7 = select i1 %v6, i32 %v5, i32 undef + %v8 = icmp slt i32 %v7, 0 + %v9 = select i1 %v8, i32 %v7, i32 0 + %v10 = sub i32 undef, undef + %v11 = add i32 %v10, %v9 + %v12 = sext i16 undef to i32 + %v13 = sext i16 undef to i32 + %v14 = add nsw i32 %v1, %v13 + %v15 = select i1 undef, i32 undef, i32 %v14 + %v16 = icmp slt i32 %v15, undef + %v17 = select i1 %v16, i32 %v15, i32 undef + %v18 = select i1 undef, i32 %v17, i32 %v12 + %v19 = add i32 undef, %v18 + %v20 = sext i16 undef to i32 + %v21 = sext i16 0 to i32 + %v22 = add nsw i32 %v1, %v21 + %v23 = sext i16 undef to i32 + %v24 = add nsw i32 %v1, %v23 + %v25 = select i1 undef, i32 undef, i32 %v24 + %v26 = icmp slt i32 %v25, %v22 + %v27 = select i1 %v26, i32 %v25, i32 %v22 + %v28 = icmp slt i32 %v27, %v20 + %v29 = select i1 %v28, i32 %v27, i32 %v20 + %v30 = add i32 undef, %v29 + %v31 = add i32 %v11, undef + %v32 = add i32 %v31, undef + %v33 = add i32 %v32, %v19 + %v34 = add i32 %v33, %v30 + %v35 = add nsw i32 %v34, 32768 + %v36 = icmp ult i32 %v35, 65536 + %v37 = select i1 %v36, i32 %v34, i32 undef + br i1 undef, label %b2, label %b38 + +b38: ; preds = %b2 + unreachable +} + +attributes #0 = { "target-cpu"="hexagonv60" } diff --git a/test/CodeGen/Hexagon/mul64-sext.ll b/test/CodeGen/Hexagon/mul64-sext.ll new file mode 100644 index 000000000000..8bbe6649a1fb --- /dev/null +++ b/test/CodeGen/Hexagon/mul64-sext.ll @@ -0,0 +1,93 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +target triple = "hexagon-unknown--elf" + +; CHECK-LABEL: mul_1 +; CHECK: r1:0 = mpy(r2,r0) +define i64 @mul_1(i64 %a0, i64 %a1) #0 { +b2: + %v3 = shl i64 %a0, 32 + %v4 = ashr exact i64 %v3, 32 + %v5 = shl i64 %a1, 32 + %v6 = ashr exact i64 %v5, 32 + %v7 = mul nsw i64 %v6, %v4 + ret i64 %v7 +} + +; CHECK-LABEL: mul_2 +; CHECK: r0 = memb(r0+#0) +; CHECK: r1:0 = mpy(r2,r0) +; CHECK: jumpr r31 +define i64 @mul_2(i8* %a0, i64 %a1) #0 { +b2: + %v3 = load i8, i8* %a0 + %v4 = sext i8 %v3 to i64 + %v5 = shl i64 %a1, 32 + %v6 = ashr exact i64 
%v5, 32 + %v7 = mul nsw i64 %v6, %v4 + ret i64 %v7 +} + +; CHECK-LABEL: mul_acc_1 +; CHECK: r5:4 += mpy(r2,r0) +; CHECK: r1:0 = combine(r5,r4) +; CHECK: jumpr r31 +define i64 @mul_acc_1(i64 %a0, i64 %a1, i64 %a2) #0 { +b3: + %v4 = shl i64 %a0, 32 + %v5 = ashr exact i64 %v4, 32 + %v6 = shl i64 %a1, 32 + %v7 = ashr exact i64 %v6, 32 + %v8 = mul nsw i64 %v7, %v5 + %v9 = add i64 %a2, %v8 + ret i64 %v9 +} + +; CHECK-LABEL: mul_acc_2 +; CHECK: r2 = memw(r2+#0) +; CHECK: r5:4 += mpy(r2,r0) +; CHECK: r1:0 = combine(r5,r4) +; CHECK: jumpr r31 +define i64 @mul_acc_2(i64 %a0, i32* %a1, i64 %a2) #0 { +b3: + %v4 = shl i64 %a0, 32 + %v5 = ashr exact i64 %v4, 32 + %v6 = load i32, i32* %a1 + %v7 = sext i32 %v6 to i64 + %v8 = mul nsw i64 %v7, %v5 + %v9 = add i64 %a2, %v8 + ret i64 %v9 +} + +; CHECK-LABEL: mul_nac_1 +; CHECK: r5:4 -= mpy(r2,r0) +; CHECK: r1:0 = combine(r5,r4) +; CHECK: jumpr r31 +define i64 @mul_nac_1(i64 %a0, i64 %a1, i64 %a2) #0 { +b3: + %v4 = shl i64 %a0, 32 + %v5 = ashr exact i64 %v4, 32 + %v6 = shl i64 %a1, 32 + %v7 = ashr exact i64 %v6, 32 + %v8 = mul nsw i64 %v7, %v5 + %v9 = sub i64 %a2, %v8 + ret i64 %v9 +} + +; CHECK-LABEL: mul_nac_2 +; CHECK: r0 = memw(r0+#0) +; CHECK: r5:4 -= mpy(r2,r0) +; CHECK: r1:0 = combine(r5,r4) +; CHECK: jumpr r31 +define i64 @mul_nac_2(i32* %a0, i64 %a1, i64 %a2) #0 { +b3: + %v4 = load i32, i32* %a0 + %v5 = sext i32 %v4 to i64 + %v6 = shl i64 %a1, 32 + %v7 = ashr exact i64 %v6, 32 + %v8 = mul nsw i64 %v7, %v5 + %v9 = sub i64 %a2, %v8 + ret i64 %v9 +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/MIR/Generic/multiRunPass.mir b/test/CodeGen/MIR/Generic/multiRunPass.mir index bca007de80b7..bd1c0d0b458e 100644 --- a/test/CodeGen/MIR/Generic/multiRunPass.mir +++ b/test/CodeGen/MIR/Generic/multiRunPass.mir @@ -7,8 +7,8 @@ # This test ensures that the command line accepts # several run passes on the same command line and # actually create the proper pipeline for it. -# PSEUDO_PEEPHOLE: -expand-isel-pseudos -peephole-opt -# PEEPHOLE_PSEUDO: -peephole-opt -expand-isel-pseudos +# PSEUDO_PEEPHOLE: -expand-isel-pseudos {{(-machineverifier )?}}-peephole-opt +# PEEPHOLE_PSEUDO: -peephole-opt {{(-machineverifier )?}}-expand-isel-pseudos # Make sure there are no other passes happening after what we asked. 
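# (Editorial note: the {{(-machineverifier )?}} group added above lets these
#  checks tolerate configurations that schedule -machineverifier between the
#  two requested passes, instead of hard-coding the exact pipeline string.)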
# CHECK-NEXT: --- | diff --git a/test/CodeGen/Mips/compactbranches/empty-block.mir b/test/CodeGen/Mips/compactbranches/empty-block.mir index 7831e51e3157..7fb1afae9121 100644 --- a/test/CodeGen/Mips/compactbranches/empty-block.mir +++ b/test/CodeGen/Mips/compactbranches/empty-block.mir @@ -39,7 +39,6 @@ name: l5 alignment: 2 exposesReturnsTwice: false -noVRegs: true legalized: false regBankSelected: false selected: false diff --git a/test/CodeGen/PowerPC/expand-isel.ll b/test/CodeGen/PowerPC/expand-isel.ll index 553cc3c372e5..c8707bda8e84 100644 --- a/test/CodeGen/PowerPC/expand-isel.ll +++ b/test/CodeGen/PowerPC/expand-isel.ll @@ -212,13 +212,14 @@ cleanup: ret i32 %retval.0 ; CHECK-LABEL: @testComplexISEL -; CHECK: bc 12, 2, [[TRUE:.LBB[0-9]+]] -; CHECK-NEXT: b [[SUCCESSOR:.LBB[0-9]+]] -; CHECK-NEXT: [[TRUE]] -; CHECK-NEXT: addi r3, r12, 0 -; CHECK-NEXT: [[SUCCESSOR]] -; CHECK-NEXT: clrldi r3, r3, 32 -; CHECK-NEXT: blr +; CHECK-DAG: [[LI:r[0-9]+]], 1 +; CHECK-DAG: cmplwi [[LD:r[0-9]+]], 0 +; CHECK: beq cr0, [[EQ:.LBB[0-9_]+]] +; CHECK: blr +; CHECK: [[EQ]] +; CHECK: xor [[XOR:r[0-9]+]] +; CHECK: cntlzd [[CZ:r[0-9]+]], [[XOR]] +; CHECK: rldicl [[SH:r[0-9]+]], [[CZ]], 58, 63 } !1 = !{!2, !2, i64 0} diff --git a/test/CodeGen/PowerPC/logic-ops-on-compares.ll b/test/CodeGen/PowerPC/logic-ops-on-compares.ll new file mode 100644 index 000000000000..df021c20ea86 --- /dev/null +++ b/test/CodeGen/PowerPC/logic-ops-on-compares.ll @@ -0,0 +1,130 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +; Function Attrs: nounwind +define signext i32 @logic_ne_32(i32 signext %a, i32 signext %b, i32 signext %c) { +; CHECK-LABEL: logic_ne_32: +; CHECK: xor r7, r3, r4 +; CHECK-NEXT: li r6, 55 +; CHECK-NEXT: xor r5, r5, r6 +; CHECK-NEXT: or r7, r7, r4 +; CHECK-NEXT: cntlzw r5, r5 +; CHECK-NEXT: cntlzw r6, r7 +; CHECK-NEXT: srwi r6, r6, 5 +; CHECK-NEXT: srwi r5, r5, 5 +; CHECK-NEXT: or. r5, r6, r5 +; CHECK-NEXT: bc 4, 1 +entry: + %tobool = icmp eq i32 %a, %b + %tobool1 = icmp eq i32 %b, 0 + %or.cond = and i1 %tobool, %tobool1 + %tobool3 = icmp eq i32 %c, 55 + %or.cond5 = or i1 %or.cond, %tobool3 + br i1 %or.cond5, label %if.end, label %if.then + +if.then: ; preds = %entry + %call = tail call signext i32 @foo(i32 signext %a) #2 + br label %return + +if.end: ; preds = %entry + %call4 = tail call signext i32 @bar(i32 signext %b) #2 + br label %return + +return: ; preds = %if.end, %if.then + %retval.0 = phi i32 [ %call4, %if.end ], [ %call, %if.then ] + ret i32 %retval.0 +} + +define void @neg_truncate_i32(i32 *%ptr) { +; CHECK-LABEL: neg_truncate_i32: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: lwz r3, 0(r3) +; CHECK-NEXT: rldicl. 
r3, r3, 0, 63 +; CHECK-NEXT: bclr 12, 2, 0 +; CHECK-NEXT: # BB#1: # %if.end29.thread136 +; CHECK-NEXT: .LBB1_2: # %if.end29 +entry: + %0 = load i32, i32* %ptr, align 4 + %rem17127 = and i32 %0, 1 + %cmp18 = icmp eq i32 %rem17127, 0 + br label %if.else + +if.else: ; preds = %entry + br i1 %cmp18, label %if.end29, label %if.end29.thread136 + +if.end29.thread136: ; preds = %if.else + unreachable + +if.end29: ; preds = %if.else + ret void + +} + +; Function Attrs: nounwind +define i64 @logic_ne_64(i64 %a, i64 %b, i64 %c) { +; CHECK-LABEL: logic_ne_64: +; CHECK: xor r7, r3, r4 +; CHECK-NEXT: li r6, 55 +; CHECK-NEXT: xor r5, r5, r6 +; CHECK-NEXT: or r7, r7, r4 +; CHECK-NEXT: cntlzd r6, r7 +; CHECK-NEXT: cntlzd r5, r5 +; CHECK-NEXT: rldicl r6, r6, 58, 63 +; CHECK-NEXT: rldicl r5, r5, 58, 63 +; CHECK-NEXT: or. r5, r6, r5 +; CHECK-NEXT: bc 4, 1 +entry: + %tobool = icmp eq i64 %a, %b + %tobool1 = icmp eq i64 %b, 0 + %or.cond = and i1 %tobool, %tobool1 + %tobool3 = icmp eq i64 %c, 55 + %or.cond5 = or i1 %or.cond, %tobool3 + br i1 %or.cond5, label %if.end, label %if.then + +if.then: ; preds = %entry + %call = tail call i64 @foo64(i64 %a) #2 + br label %return + +if.end: ; preds = %entry + %call4 = tail call i64 @bar64(i64 %b) #2 + br label %return + +return: ; preds = %if.end, %if.then + %retval.0 = phi i64 [ %call4, %if.end ], [ %call, %if.then ] + ret i64 %retval.0 +} + +define void @neg_truncate_i64(i64 *%ptr) { +; CHECK-LABEL: neg_truncate_i64: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: ld r3, 0(r3) +; CHECK-NEXT: rldicl. r3, r3, 0, 63 +; CHECK-NEXT: bclr 12, 2, 0 +; CHECK-NEXT: # BB#1: # %if.end29.thread136 +; CHECK-NEXT: .LBB3_2: # %if.end29 +entry: + %0 = load i64, i64* %ptr, align 4 + %rem17127 = and i64 %0, 1 + %cmp18 = icmp eq i64 %rem17127, 0 + br label %if.else + +if.else: ; preds = %entry + br i1 %cmp18, label %if.end29, label %if.end29.thread136 + +if.end29.thread136: ; preds = %if.else + unreachable + +if.end29: ; preds = %if.else + ret void + +} + +declare signext i32 @foo(i32 signext) +declare signext i32 @bar(i32 signext) +declare i64 @foo64(i64) +declare i64 @bar64(i64) diff --git a/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll b/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll new file mode 100644 index 000000000000..3095429758f6 --- /dev/null +++ b/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll @@ -0,0 +1,121 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +@zeroEqualityTest01.buffer1 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 4], align 4 +@zeroEqualityTest01.buffer2 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 3], align 4 +@zeroEqualityTest02.buffer1 = private unnamed_addr constant [4 x i32] [i32 4, i32 0, i32 0, i32 0], align 4 +@zeroEqualityTest02.buffer2 = private unnamed_addr constant [4 x i32] [i32 3, i32 0, i32 0, i32 0], align 4 +@zeroEqualityTest03.buffer1 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 3], align 4 +@zeroEqualityTest03.buffer2 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 4], align 4 +@zeroEqualityTest04.buffer1 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14], align 4 +@zeroEqualityTest04.buffer2 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, 
i32 13, i32 13], align 4 + +; Function Attrs: nounwind readonly +declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1 + +; Validate with if(memcmp()) +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest01() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer1 to i8*), i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer2 to i8*), i64 16) + %not.tobool = icmp ne i32 %call, 0 + %. = zext i1 %not.tobool to i32 + ret i32 %. + + ; CHECK-LABEL: @zeroEqualityTest01 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr + ; CHECK: li 3, 0 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr +} + +; Validate with if(memcmp() == 0) +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest02() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16) + %not.cmp = icmp ne i32 %call, 0 + %. = zext i1 %not.cmp to i32 + ret i32 %. + + ; CHECK-LABEL: @zeroEqualityTest02 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr + ; CHECK: li 3, 0 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr +} + +; Validate with > 0 +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest03() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16) + %not.cmp = icmp slt i32 %call, 1 + %. = zext i1 %not.cmp to i32 + ret i32 %. + + ; CHECK-LABEL: @zeroEqualityTest03 + ; CHECK-LABEL: %res_block + ; CHECK: cmpld + ; CHECK-NEXT: li [[LI:[0-9]+]], 1 + ; CHECK-NEXT: li [[LI2:[0-9]+]], -1 + ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0 +} + +; Validate with < 0 +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest04() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer2 to i8*), i64 16) + %call.lobit = lshr i32 %call, 31 + %call.lobit.not = xor i32 %call.lobit, 1 + ret i32 %call.lobit.not + + ; CHECK-LABEL: @zeroEqualityTest04 + ; CHECK-LABEL: %res_block + ; CHECK: cmpld + ; CHECK-NEXT: li [[LI:[0-9]+]], 1 + ; CHECK-NEXT: li [[LI2:[0-9]+]], -1 + ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0 +} + +; Validate with memcmp()?: +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest05() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16) + %not.tobool = icmp eq i32 %call, 0 + %cond = zext i1 %not.tobool to i32 + ret i32 %cond + + ; CHECK-LABEL: @zeroEqualityTest05 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK: li 3, 0 +} + +; Validate with !memcmp()?: +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest06() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16) + %not.lnot = icmp ne i32 %call, 0 + %cond = zext i1 %not.lnot to i32 + ret i32 %cond + + ; CHECK-LABEL: @zeroEqualityTest06 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 
+ ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr + ; CHECK: li 3, 0 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr +} diff --git a/test/CodeGen/PowerPC/memcmp.ll b/test/CodeGen/PowerPC/memcmp.ll new file mode 100644 index 000000000000..bae713cb2072 --- /dev/null +++ b/test/CodeGen/PowerPC/memcmp.ll @@ -0,0 +1,87 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s -check-prefix=CHECK + +; Check size 8 +; Function Attrs: nounwind readonly +define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 8) #2 + ret i32 %call + +; CHECK-LABEL: @test1 +; CHECK: ldbrx [[LOAD1:[0-9]+]] +; CHECK-NEXT: ldbrx [[LOAD2:[0-9]+]] +; CHECK-NEXT: li [[LI:[0-9]+]], 1 +; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: li [[LI2:[0-9]+]], -1 +; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4 +; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2 +; CHECK-NEXT: extsw 3, [[ISEL2]] +; CHECK-NEXT: blr +} + +; Check size 4 +; Function Attrs: nounwind readonly +define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) #2 + ret i32 %call + +; CHECK-LABEL: @test2 +; CHECK: lwbrx [[LOAD1:[0-9]+]] +; CHECK-NEXT: lwbrx [[LOAD2:[0-9]+]] +; CHECK-NEXT: li [[LI:[0-9]+]], 1 +; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: li [[LI2:[0-9]+]], -1 +; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4 +; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2 +; CHECK-NEXT: extsw 3, [[ISEL2]] +; CHECK-NEXT: blr +} + +; Check size 2 +; Function Attrs: nounwind readonly +define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 2) #2 + ret i32 %call + +; CHECK-LABEL: @test3 +; CHECK: lhbrx [[LOAD1:[0-9]+]] +; CHECK-NEXT: lhbrx [[LOAD2:[0-9]+]] +; CHECK-NEXT: li [[LI:[0-9]+]], 1 +; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: subf. 
[[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: li [[LI2:[0-9]+]], -1 +; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4 +; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2 +; CHECK-NEXT: extsw 3, [[ISEL2]] +; CHECK-NEXT: blr +} + +; Check size 1 +; Function Attrs: nounwind readonly +define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 1) #2 + ret i32 %call + +; CHECK-LABEL: @test4 +; CHECK: lbz [[LOAD1:[0-9]+]] +; CHECK-NEXT: lbz [[LOAD2:[0-9]+]] +; CHECK-NEXT: subf [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: extsw 3, [[SUB]] +; CHECK-NEXT: blr +} + +; Function Attrs: nounwind readonly +declare signext i32 @memcmp(i8*, i8*, i64) #1 diff --git a/test/CodeGen/PowerPC/memcmpIR.ll b/test/CodeGen/PowerPC/memcmpIR.ll new file mode 100644 index 000000000000..f052cc258df8 --- /dev/null +++ b/test/CodeGen/PowerPC/memcmpIR.ll @@ -0,0 +1,194 @@ +; RUN: llc -o - -mtriple=powerpc64le-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s +; RUN: llc -o - -mtriple=powerpc64-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s --check-prefix=CHECK-BE + +define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) { +entry: + ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]]) + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-LABEL: res_block:{{.*}} + ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-NEXT: br label %endblock + + ; CHECK: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]] + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]] + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]]) + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %endblock + + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE-LABEL: res_block:{{.*}} + ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-BE-NEXT: br label %endblock + + ; CHECK-BE: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]] + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]] + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], 
label %res_block, label %endblock + + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 16) + ret i32 %call +} + +declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1 + +define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) { + ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]]) + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %endblock + + ; CHECK-LABEL: res_block:{{.*}} + ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-NEXT: br label %endblock + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %endblock + + ; CHECK-BE-LABEL: res_block:{{.*}} + ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-BE-NEXT: br label %endblock + +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) + ret i32 %call +} + +define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) { + ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]]) + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-LABEL: res_block:{{.*}} + ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-NEXT: br label %endblock + + ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]]) + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK: [[LOAD1:%[0-9]+]] = load i16, i16* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD2]]) + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[BSWAP1]] to i64 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext 
i16 [[BSWAP2]] to i64 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK: [[LOAD1:%[0-9]+]] = load i8, i8* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8* + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: br label %endblock + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE-LABEL: res_block:{{.*}} + ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-BE-NEXT: br label %endblock + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i16, i16* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[LOAD1]] to i64 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[LOAD2]] to i64 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i8, i8* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: br label %endblock + +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 15) + ret i32 %call +} + ; CHECK: call = tail call signext i32 @memcmp + ; CHECK-BE: call = tail call signext i32 @memcmp +define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) { + +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 65) + ret i32 %call +} + +define signext i32 @test5(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2, i32 signext %SIZE) { + ; CHECK: call = tail call signext i32 @memcmp + ; CHECK-BE: call = tail call signext i32 @memcmp +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %conv = sext i32 %SIZE to i64 + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 %conv) + ret i32 %call +} diff --git a/test/CodeGen/PowerPC/ppc64-get-cache-line-size.ll b/test/CodeGen/PowerPC/ppc64-get-cache-line-size.ll new file mode 100644 index 000000000000..7ca5332865ca --- /dev/null +++ b/test/CodeGen/PowerPC/ppc64-get-cache-line-size.ll @@ -0,0 +1,49 @@ +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-ppc-prefetching=true | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 
-enable-ppc-prefetching=true -ppc-loop-prefetch-cache-line=64 | FileCheck %s -check-prefix=CHECK-DCBT +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -enable-ppc-prefetching=true | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -enable-ppc-prefetching=true -ppc-loop-prefetch-cache-line=64 | FileCheck %s -check-prefix=CHECK-DCBT +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -enable-ppc-prefetching=true | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -enable-ppc-prefetching=true -ppc-loop-prefetch-cache-line=64 | FileCheck %s -check-prefix=CHECK-DCBT +; RUN: llc < %s -march=ppc64 -mcpu=a2 -enable-ppc-prefetching=true | FileCheck %s -check-prefix=CHECK-DCBT + +; Function Attrs: nounwind +define signext i32 @check_cache_line() local_unnamed_addr { +entry: + %call = tail call i32* bitcast (i32* (...)* @magici to i32* ()*)() + %call115 = tail call signext i32 bitcast (i32 (...)* @iter to i32 ()*)() + %cmp16 = icmp sgt i32 %call115, 0 + br i1 %cmp16, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %add5, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %res.017 = phi i32 [ %add5, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %call, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, %res.017 + %1 = add nuw nsw i64 %indvars.iv, 16 + %arrayidx4 = getelementptr inbounds i32, i32* %call, i64 %1 + %2 = load i32, i32* %arrayidx4, align 4 + %add5 = add nsw i32 %add, %2 + %indvars.iv.next = add nuw i64 %indvars.iv, 1 + %call1 = tail call signext i32 bitcast (i32 (...)* @iter to i32 ()*)() + %3 = sext i32 %call1 to i64 + %cmp = icmp slt i64 %indvars.iv.next, %3 + br i1 %cmp, label %for.body, label %for.cond.cleanup +; CHECK-LABEL: check_cache_line +; CHECK: dcbt +; CHECK-NOT: dcbt +; CHECK: blr +; CHECK-DCBT-LABEL: check_cache_line +; CHECK-DCBT: dcbt +; CHECK-DCBT: dcbt +; CHECK-DCBT: blr +} + +declare i32* @magici(...) local_unnamed_addr + +declare signext i32 @iter(...) 
local_unnamed_addr + diff --git a/test/CodeGen/PowerPC/pristine-and-livein.mir b/test/CodeGen/PowerPC/pristine-and-livein.mir deleted file mode 100644 index 6d93bb68c102..000000000000 --- a/test/CodeGen/PowerPC/pristine-and-livein.mir +++ /dev/null @@ -1,330 +0,0 @@ -# RUN: llc -run-pass=post-RA-sched %s -o - | FileCheck %s - -# CHECK: callee-saved-register: '[[REG:%x[0-9]+]]' -# CHECK: callee-saved-register: '{{%x[0-9]+}}' -# CHECK-NOT: [[REG]] = LI8 0 -# CHECK: STD killed [[REG]], ---- | - ; ModuleID = '<stdin>' - source_filename = "bugpoint-output-4d91ae2.bc" - target datalayout = "e-m:e-i64:64-n32:64" - target triple = "powerpc64le--linux-gnu" - - ; Function Attrs: norecurse nounwind readonly - define i64 @adler32_z(i64 %adler, i8* readonly %buf, i64 %len) local_unnamed_addr #0 { - entry: - %shr = lshr i64 %adler, 16 - %and = and i64 %shr, 65535 - %and1 = and i64 %adler, 65535 - br i1 undef, label %if.then, label %if.end15 - - if.then: ; preds = %entry - %add5 = add nsw i64 %and1, %and - %sub9 = add nsw i64 %add5, 281474976645135 - %shl = shl i64 %add5, 16 - %or = or i64 %shl, %and1 - br label %cleanup - - if.end15: ; preds = %entry - br i1 undef, label %while.cond.preheader, label %while.cond30.preheader - - while.cond30.preheader: ; preds = %if.end15 - br i1 undef, label %while.body33.preheader, label %while.body109.preheader - - while.body33.preheader: ; preds = %while.cond30.preheader - br label %while.body33 - - while.cond.preheader: ; preds = %if.end15 - %sub25 = add i64 %and1, -65521 - %rem = urem i64 %and, 65521 - %shl27 = shl nuw nsw i64 %rem, 16 - %or28 = or i64 %shl27, %and1 - br label %cleanup - - while.body33: ; preds = %do.end, %while.body33.preheader - %indvar = phi i64 [ %indvar.next, %do.end ], [ 0, %while.body33.preheader ] - %sum2.2385 = phi i64 [ %rem102, %do.end ], [ %and, %while.body33.preheader ] - %len.addr.1384 = phi i64 [ %sub34, %do.end ], [ %len, %while.body33.preheader ] - %buf.addr.1383 = phi i8* [ %scevgep390, %do.end ], [ %buf, %while.body33.preheader ] - %adler.addr.3382 = phi i64 [ %rem101, %do.end ], [ %and1, %while.body33.preheader ] - %0 = mul i64 %indvar, 5552 - %1 = add i64 %0, -13 - %scevgep2 = getelementptr i8, i8* %buf, i64 %1 - %sub34 = add i64 %len.addr.1384, -5552 - call void @llvm.ppc.mtctr.i64(i64 347) - br label %do.body - - do.body: ; preds = %do.body, %while.body33 - %adler.addr.4 = phi i64 [ %adler.addr.3382, %while.body33 ], [ %add49, %do.body ] - %sum2.3 = phi i64 [ %sum2.2385, %while.body33 ], [ %add98, %do.body ] - %tmp15.phi = phi i8* [ %scevgep2, %while.body33 ], [ %tmp15.inc, %do.body ] - %tmp15.inc = getelementptr i8, i8* %tmp15.phi, i64 16 - %add38 = add i64 %adler.addr.4, %sum2.3 - %add42 = add i64 %add38, %adler.addr.4 - %add46 = add i64 %add42, %adler.addr.4 - %tmp15 = load i8, i8* %tmp15.inc, align 1, !tbaa !1 - %conv48 = zext i8 %tmp15 to i64 - %add49 = add i64 %adler.addr.4, %conv48 - %add50 = add i64 %add46, %add49 - %add54 = add i64 %add50, %add49 - %add58 = add i64 %add54, %add49 - %add62 = add i64 %add58, %add49 - %add66 = add i64 %add62, %add49 - %add70 = add i64 %add66, %add49 - %add74 = add i64 %add70, %add49 - %add78 = add i64 %add74, %add49 - %add82 = add i64 %add78, %add49 - %add86 = add i64 %add82, %add49 - %add90 = add i64 %add86, %add49 - %add94 = add i64 %add90, %add49 - %add98 = add i64 %add94, %add49 - %2 = call i1 @llvm.ppc.is.decremented.ctr.nonzero() - br i1 %2, label %do.body, label %do.end - - do.end: ; preds = %do.body - %scevgep390 = getelementptr i8, i8* %buf.addr.1383, i64 5552 - %rem101 = urem 
i64 %add49, 65521 - %rem102 = urem i64 %add98, 65521 - %cmp31 = icmp ugt i64 %sub34, 5551 - %indvar.next = add i64 %indvar, 1 - br i1 %cmp31, label %while.body33, label %while.end103 - - while.end103: ; preds = %do.end - br i1 undef, label %if.end188, label %while.body109.preheader - - while.body109.preheader: ; preds = %while.end103, %while.cond30.preheader - %buf.addr.1.lcssa394400 = phi i8* [ %buf, %while.cond30.preheader ], [ %scevgep390, %while.end103 ] - %arrayidx151 = getelementptr inbounds i8, i8* %buf.addr.1.lcssa394400, i64 10 - %tmp45 = load i8, i8* %arrayidx151, align 1, !tbaa !1 - %conv152 = zext i8 %tmp45 to i64 - br label %while.body109 - - while.body109: ; preds = %while.body109, %while.body109.preheader - %adler.addr.5373 = phi i64 [ %add153, %while.body109 ], [ undef, %while.body109.preheader ] - %add153 = add i64 %adler.addr.5373, %conv152 - br label %while.body109 - - if.end188: ; preds = %while.end103 - %shl189 = shl nuw nsw i64 %rem102, 16 - %or190 = or i64 %shl189, %rem101 - br label %cleanup - - cleanup: ; preds = %if.end188, %while.cond.preheader, %if.then - %retval.0 = phi i64 [ %or, %if.then ], [ %or28, %while.cond.preheader ], [ %or190, %if.end188 ] - ret i64 %retval.0 - } - - ; Function Attrs: nounwind - declare void @llvm.ppc.mtctr.i64(i64) #1 - - ; Function Attrs: nounwind - declare i1 @llvm.ppc.is.decremented.ctr.nonzero() #1 - - ; Function Attrs: nounwind - declare void @llvm.stackprotector(i8*, i8**) #1 - - attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { nounwind } - - !llvm.ident = !{!0} - - !0 = !{!"clang version 5.0.0 "} - !1 = !{!2, !2, i64 0} - !2 = !{!"omnipotent char", !3, i64 0} - !3 = !{!"Simple C/C++ TBAA"} - -... 
---- -name: adler32_z -alignment: 4 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true -liveins: - - { reg: '%x3' } - - { reg: '%x4' } - - { reg: '%x5' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -fixedStack: - - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16, callee-saved-register: '%x30' } - - { id: 1, type: spill-slot, offset: -24, size: 8, alignment: 8, callee-saved-register: '%x29' } - - { id: 2, offset: -8, size: 8, alignment: 8, isImmutable: true, isAliased: false } -body: | - bb.0.entry: - successors: %bb.1.if.then(0x40000000), %bb.3.if.end15(0x40000000) - liveins: %x3, %x4, %x5, %x29, %x30 - - %x6 = RLWINM8 %x3, 16, 16, 31 - %x3 = RLDICL killed %x3, 0, 48 - BC undef %cr5lt, %bb.3.if.end15 - - bb.1.if.then: - successors: %bb.2.if.then(0x80000000) - liveins: %x3, %x6, %x29, %x30 - - %x4 = ADD8 %x3, killed %x6 - - bb.2.if.then: - liveins: %lr8, %rm, %x3, %x4 - - %x4 = RLDICR killed %x4, 16, 47 - %x3 = OR8 killed %x4, killed %x3 - BLR8 implicit %lr8, implicit %rm, implicit %x3 - - bb.3.if.end15: - successors: %bb.6.while.cond.preheader(0x40000000), %bb.4.while.cond30.preheader(0x40000000) - liveins: %x3, %x4, %x5, %x6, %x29, %x30 - - BC undef %cr5lt, %bb.6.while.cond.preheader - - bb.4.while.cond30.preheader: - successors: %bb.7.while.body33.preheader(0x40000000), %bb.5(0x40000000) - liveins: %x3, %x4, %x5, %x6, %x29, %x30 - - BCn undef %cr5lt, %bb.7.while.body33.preheader - - bb.5: - successors: %bb.12.while.body109.preheader(0x80000000) - liveins: %x4, %x29, %x30 - - %x7 = OR8 %x4, killed %x4 - B %bb.12.while.body109.preheader - - bb.6.while.cond.preheader: - successors: %bb.2.if.then(0x80000000) - liveins: %x3, %x6, %x29, %x30 - - %x4 = LIS8 15 - %x4 = ORI8 killed %x4, 225 - %x4 = RLDICR killed %x4, 32, 31 - %x4 = ORIS8 killed %x4, 3375 - %x4 = ORI8 killed %x4, 50637 - %x4 = MULHDU %x6, killed %x4 - %x5 = SUBF8 %x4, %x6 - %x5 = RLDICL killed %x5, 63, 1 - %x4 = ADD8 killed %x5, killed %x4 - %x5 = LI8 0 - %x4 = RLDICL killed %x4, 49, 15 - %x5 = ORI8 killed %x5, 65521 - %x4 = MULLD killed %x4, killed %x5 - %x4 = SUBF8 killed %x4, killed %x6 - B %bb.2.if.then - - bb.7.while.body33.preheader: - successors: %bb.8.while.body33(0x80000000) - liveins: %x3, %x4, %x5, %x6, %x29, %x30 - - STD killed %x29, -24, %x1 :: (store 8 into %fixed-stack.1) - STD killed %x30, -16, %x1 :: (store 8 into %fixed-stack.0, align 16) - %x7 = LIS8 15 - %x7 = ORI8 killed %x7, 225 - %x7 = RLDICR killed %x7, 32, 31 - %x8 = LI8 0 - %x7 = ORIS8 killed %x7, 3375 - %x9 = LI8 347 - %x10 = ORI8 killed %x7, 50637 - %x11 = ORI8 %x8, 65521 - %x7 = OR8 %x4, %x4 - - bb.8.while.body33: - successors: %bb.9.do.body(0x80000000) - liveins: %x3, %x4, %x5, %x6, %x7, %x8, %x9, %x10, %x11 - - %x12 = MULLI8 %x8, 5552 - %x12 = ADD8 %x4, killed %x12 - %x12 = ADDI8 killed %x12, -13 - %x5 = ADDI8 killed %x5, -5552 - MTCTR8loop %x9, implicit-def dead %ctr8 - - bb.9.do.body: - successors: %bb.9.do.body(0x7c000000), %bb.10.do.end(0x04000000) - liveins: %x3, %x4, %x5, %x6, %x7, %x8, %x9, %x10, %x11, %x12 - - %x0, %x12 = LBZU8 16, killed %x12 :: (load 1 from %ir.tmp15.inc, !tbaa !1) - %x6 = ADD8 %x3, killed %x6 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x3 = ADD8 killed %x3, killed 
%x0 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - BDNZ8 %bb.9.do.body, implicit-def %ctr8, implicit %ctr8 - - bb.10.do.end: - successors: %bb.8.while.body33(0x7c000000), %bb.11.while.end103(0x04000000) - liveins: %x3, %x4, %x5, %x6, %x7, %x8, %x9, %x10, %x11 - - %x12 = MULHDU %x3, %x10 - %x0 = MULHDU %x6, %x10 - %x30 = SUBF8 %x12, %x3 - %x29 = SUBF8 %x0, %x6 - %x30 = RLDICL killed %x30, 63, 1 - %x29 = RLDICL killed %x29, 63, 1 - %x12 = ADD8 killed %x30, killed %x12 - %x0 = ADD8 killed %x29, killed %x0 - %cr0 = CMPLDI %x5, 5551 - %x12 = RLDICL killed %x12, 49, 15 - %x0 = RLDICL killed %x0, 49, 15 - %x12 = MULLD killed %x12, %x11 - %x0 = MULLD killed %x0, %x11 - %x7 = ADDI8 killed %x7, 5552 - %x3 = SUBF8 killed %x12, killed %x3 - %x6 = SUBF8 killed %x0, killed %x6 - %x8 = ADDI8 killed %x8, 1 - BCC 44, killed %cr0, %bb.8.while.body33 - - bb.11.while.end103: - successors: %bb.14.if.end188(0x40000000), %bb.12.while.body109.preheader(0x40000000) - liveins: %x3, %x6, %x7 - - %x30 = LD -16, %x1 :: (load 8 from %fixed-stack.0, align 16) - %x29 = LD -24, %x1 :: (load 8 from %fixed-stack.1) - BC undef %cr5lt, %bb.14.if.end188 - - bb.12.while.body109.preheader: - successors: %bb.13.while.body109(0x80000000) - liveins: %x7, %x29, %x30 - - %x3 = LBZ8 10, killed %x7 :: (load 1 from %ir.arrayidx151, !tbaa !1) - %x4 = IMPLICIT_DEF - - bb.13.while.body109: - successors: %bb.13.while.body109(0x80000000) - liveins: %x3, %x4, %x29, %x30 - - %x4 = ADD8 killed %x4, %x3 - B %bb.13.while.body109 - - bb.14.if.end188: - liveins: %x3, %x6, %x29, %x30 - - %x4 = RLDICR killed %x6, 16, 47 - %x3 = OR8 killed %x4, killed %x3 - BLR8 implicit %lr8, implicit %rm, implicit %x3 - -... 
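The four new testCompares*.ll files that follow (testComparesieqsll, testComparesiequll, testCompareslleqsll, testComparesllequll) all pin down the same branch-free PowerPC lowering of a 64-bit equality compare: xor the operands, count leading zeros (cntlzd yields 64 only when its input is zero), then extract bit 6 of the count with rldicl r, r, 58, 63, which is equivalent to a logical shift right by 6. A minimal C sketch of the idiom being checked, for orientation only; the helper name ieq64 and the use of the GCC/Clang builtin __builtin_clzll are illustrative assumptions, not part of the imported tests:

    /* zext(a == b) for i64, mirroring the xor + cntlzd + rldicl 58,63 sequence
       the CHECK lines expect: only a leading-zero count of 64 (a ^ b == 0)
       has bit 6 set, so shifting right by 6 gives 1 for equal, 0 otherwise. */
    static inline int ieq64(long long a, long long b) {
        unsigned long long x = (unsigned long long)a ^ (unsigned long long)b;
        /* __builtin_clzll is undefined for 0, so the all-equal case is explicit. */
        return (x == 0 ? 64 : __builtin_clzll(x)) >> 6;
    }

The sext variants checked in the same files (addic r, r, -1 followed by subfe r, r, r) produce the predicate as 0 or -1 instead: addic sets the carry exactly when the xor result is nonzero, and subfe r, r, r then materializes carry - 1.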
diff --git a/test/CodeGen/PowerPC/testComparesieqsll.ll b/test/CodeGen/PowerPC/testComparesieqsll.ll new file mode 100644 index 000000000000..57c7365eff03 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesieqsll.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesieqsll.c' + +@glob = common local_unnamed_addr global i64 0, align 8 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsll(i64 %a, i64 %b) { +; CHECK-LABEL: test_ieqsll: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsll_sext(i64 %a, i64 %b) { +; CHECK-LABEL: test_ieqsll_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsll_z(i64 %a) { +; CHECK-LABEL: test_ieqsll_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsll_sext_z(i64 %a) { +; CHECK-LABEL: test_ieqsll_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsll_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_ieqsll_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsll_sext_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_ieqsll_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsll_z_store(i64 %a) { +; CHECK-LABEL: test_ieqsll_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; 
CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsll_sext_z_store(i64 %a) { +; CHECK-LABEL: test_ieqsll_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesiequll.ll b/test/CodeGen/PowerPC/testComparesiequll.ll new file mode 100644 index 000000000000..c28929071845 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesiequll.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesiequll.c' + +@glob = common local_unnamed_addr global i64 0, align 8 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequll(i64 %a, i64 %b) { +; CHECK-LABEL: test_iequll: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequll_sext(i64 %a, i64 %b) { +; CHECK-LABEL: test_iequll_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequll_z(i64 %a) { +; CHECK-LABEL: test_iequll_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequll_sext_z(i64 %a) { +; CHECK-LABEL: test_iequll_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_iequll_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_iequll_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequll_sext_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_iequll_sext_store: +; CHECK: # BB#0: # %entry +; 
CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequll_z_store(i64 %a) { +; CHECK-LABEL: test_iequll_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequll_sext_z_store(i64 %a) { +; CHECK-LABEL: test_iequll_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} diff --git a/test/CodeGen/PowerPC/testCompareslleqsll.ll b/test/CodeGen/PowerPC/testCompareslleqsll.ll new file mode 100644 index 000000000000..4797ddfbfe97 --- /dev/null +++ b/test/CodeGen/PowerPC/testCompareslleqsll.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i64 0, align 8 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsll(i64 %a, i64 %b) { +; CHECK-LABEL: test_lleqsll: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsll_sext(i64 %a, i64 %b) { +; CHECK-LABEL: test_lleqsll_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsll_z(i64 %a) { +; CHECK-LABEL: test_lleqsll_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsll_sext_z(i64 %a) { +; CHECK-LABEL: test_lleqsll_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsll_store(i64 %a, i64 %b) { +; 
CHECK-LABEL: test_lleqsll_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsll_sext_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_lleqsll_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsll_z_store(i64 %a) { +; CHECK-LABEL: test_lleqsll_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsll_sext_z_store(i64 %a) { +; CHECK-LABEL: test_lleqsll_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesllequll.ll b/test/CodeGen/PowerPC/testComparesllequll.ll new file mode 100644 index 000000000000..4dc7be69d2c8 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesllequll.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i64 0, align 8 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequll(i64 %a, i64 %b) { +; CHECK-LABEL: test_llequll: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequll_sext(i64 %a, i64 %b) { +; CHECK-LABEL: test_llequll_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequll_z(i64 %a) { +; CHECK-LABEL: test_llequll_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: 
cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequll_sext_z(i64 %a) { +; CHECK-LABEL: test_llequll_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind +define void @test_llequll_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_llequll_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequll_sext_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_llequll_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequll_z_store(i64 %a) { +; CHECK-LABEL: test_llequll_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequll_sext_z_store(i64 %a) { +; CHECK-LABEL: test_llequll_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} diff --git a/test/CodeGen/PowerPC/vec_xxpermdi.ll b/test/CodeGen/PowerPC/vec_xxpermdi.ll new file mode 100644 index 000000000000..9be2a1864a04 --- /dev/null +++ b/test/CodeGen/PowerPC/vec_xxpermdi.ll @@ -0,0 +1,307 @@ +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | \ +; RUN: FileCheck %s -check-prefix=CHECK-LE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | \ +; RUN: FileCheck %s -check-prefix=CHECK-BE + +; Possible LE ShuffleVector masks (Case 1): +; ShuffleVector((vector double)a, (vector double)b, 3, 1) +; ShuffleVector((vector double)a, (vector double)b, 2, 1) +; ShuffleVector((vector double)a, (vector double)b, 3, 0) +; ShuffleVector((vector double)a, (vector double)b, 2, 0) +; which targets at: +; xxpermdi a, b, 0 +; xxpermdi a, b, 1 +; xxpermdi a, b, 2 +; xxpermdi a, b, 3 +; Possible LE Swap ShuffleVector masks (Case 2): +; ShuffleVector((vector double)a, (vector double)b, 1, 3) +; ShuffleVector((vector double)a, (vector double)b, 0, 3) +; ShuffleVector((vector double)a, (vector double)b, 1, 2) +; ShuffleVector((vector double)a, 
(vector double)b, 0, 2) +; which targets at: +; xxpermdi b, a, 0 +; xxpermdi b, a, 1 +; xxpermdi b, a, 2 +; xxpermdi b, a, 3 +; Possible LE ShuffleVector masks when a == b, b is undef (Case 3): +; ShuffleVector((vector double)a, (vector double)a, 1, 1) +; ShuffleVector((vector double)a, (vector double)a, 0, 1) +; ShuffleVector((vector double)a, (vector double)a, 1, 0) +; ShuffleVector((vector double)a, (vector double)a, 0, 0) +; which targets at: +; xxpermdi a, a, 0 +; xxpermdi a, a, 1 +; xxpermdi a, a, 2 +; xxpermdi a, a, 3 + +; Possible BE ShuffleVector masks (Case 4): +; ShuffleVector((vector double)a, (vector double)b, 0, 2) +; ShuffleVector((vector double)a, (vector double)b, 0, 3) +; ShuffleVector((vector double)a, (vector double)b, 1, 2) +; ShuffleVector((vector double)a, (vector double)b, 1, 3) +; which targets at: +; xxpermdi a, b, 0 +; xxpermdi a, b, 1 +; xxpermdi a, b, 2 +; xxpermdi a, b, 3 +; Possible BE Swap ShuffleVector masks (Case 5): +; ShuffleVector((vector double)a, (vector double)b, 2, 0) +; ShuffleVector((vector double)a, (vector double)b, 3, 0) +; ShuffleVector((vector double)a, (vector double)b, 2, 1) +; ShuffleVector((vector double)a, (vector double)b, 3, 1) +; which targets at: +; xxpermdi b, a, 0 +; xxpermdi b, a, 1 +; xxpermdi b, a, 2 +; xxpermdi b, a, 3 +; Possible BE ShuffleVector masks when a == b, b is undef (Case 6): +; ShuffleVector((vector double)a, (vector double)a, 0, 0) +; ShuffleVector((vector double)a, (vector double)a, 0, 1) +; ShuffleVector((vector double)a, (vector double)a, 1, 0) +; ShuffleVector((vector double)a, (vector double)a, 1, 1) +; which targets at: +; xxpermdi a, a, 0 +; xxpermdi a, a, 1 +; xxpermdi a, a, 2 +; xxpermdi a, a, 3 + +define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 3, i32 1> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_0 +; CHECK-LE: xxmrghd 34, 34, 35 +; CHECK-LE: blr +} + +define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 2, i32 1> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_1 +; CHECK-LE: xxpermdi 34, 34, 35, 1 +; CHECK-LE: blr +} + +define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 3, i32 0> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_2 +; CHECK-LE: xxpermdi 34, 34, 35, 2 +; CHECK-LE: blr +} + +define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 2, i32 0> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_3 +; CHECK-LE: xxmrgld 34, 34, 35 +; CHECK-LE: blr +} + +define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 1, i32 3> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v2f64_v2f64_0 +; CHECK-LE: xxmrghd 34, 35, 34 +; CHECK-LE: blr +} + +define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 0, i32 3> + ret <2 x double> %0 +; CHECK-LE-LABEL: 
@test_le_swap_vec_xxpermdi_v2f64_v2f64_1 +; CHECK-LE: xxpermdi 34, 35, 34, 1 +; CHECK-LE: blr +} + +define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 1, i32 2> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v2f64_v2f64_2 +; CHECK-LE: xxpermdi 34, 35, 34, 2 +; CHECK-LE: blr +} + +define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 0, i32 2> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v2f64_v2f64_3 +; CHECK-LE: xxmrgld 34, 35, 34 +; CHECK-LE: blr +} + +define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_0(<2 x double> %VA) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 1> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_0 +; CHECK-LE: xxspltd 34, 34, 0 +; CHECK-LE: blr +} + +define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_1(<2 x double> %VA) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 1> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_1 +; CHECK-LE: blr +} + +define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_2(<2 x double> %VA) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 0> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_2 +; CHCECK-LE: xxswapd 34, 34 +} + +define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_3(<2 x double> %VA) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 0> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_3 +; CHECK-LE: xxspltd 34, 34, 1 +; CHECK-LE: blr +} + +; Start testing BE +define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 0, i32 2> + ret <2 x double> %0 +; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_0 +; CHECK-BE: xxmrghd 34, 34, 35 +; CHECK-BE: blr +} + +define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 0, i32 3> + ret <2 x double> %0 +; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_1 +; CHECK-BE: xxpermdi 34, 34, 35, 1 +; CHECK-BE: blr +} + +define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 1, i32 2> + ret <2 x double> %0 +; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_2 +; CHECK-BE: xxpermdi 34, 34, 35, 2 +; CHECK-BE: blr +} + +define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 1, i32 3> + ret <2 x double> %0 +; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_3 +; CHECK-BE: xxmrgld 34, 34, 35 +; CHECK-BE: blr +} + +define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 2, i32 0> + ret <2 x double> %0 +; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_0 +; CHECK-BE: xxmrghd 34, 35, 34 +; CHECK-BE: blr +} + +define <2 x double> 
+; Start testing BE
+define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 0, i32 2>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_0
+; CHECK-BE: xxmrghd 34, 34, 35
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 0, i32 3>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_1
+; CHECK-BE: xxpermdi 34, 34, 35, 1
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 1, i32 2>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_2
+; CHECK-BE: xxpermdi 34, 34, 35, 2
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 1, i32 3>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_3
+; CHECK-BE: xxmrgld 34, 34, 35
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 2, i32 0>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_0
+; CHECK-BE: xxmrghd 34, 35, 34
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 2, i32 1>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_1
+; CHECK-BE: xxpermdi 34, 35, 34, 1
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 3, i32 0>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_2
+; CHECK-BE: xxpermdi 34, 35, 34, 2
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 3, i32 1>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_3
+; CHECK-BE: xxmrgld 34, 35, 34
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_0(<2 x double> %VA) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_0
+; CHECK-BE: xxspltd 34, 34, 0
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_1(<2 x double> %VA) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 1>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_1
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_2(<2 x double> %VA) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_2
+; CHECK-BE: xxswapd 34, 34
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_3(<2 x double> %VA) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_3
+; CHECK-BE: xxspltd 34, 34, 1
+; CHECK-BE: blr
+}
+
+; More test cases to test different types of vector inputs
+define <16 x i8> @test_be_vec_xxpermdi_v16i8_v16i8(<16 x i8> %VA, <16 x i8> %VB) {
+ entry:
+ %0 = shufflevector <16 x i8> %VA, <16 x i8> %VB,<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+ ret <16 x i8> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v16i8_v16i8
+; CHECK-BE: xxpermdi 34, 34, 35, 1
+; CHECK-BE: blr
+}
+
+define <8 x i16> @test_le_swap_vec_xxpermdi_v8i16_v8i16(<8 x i16> %VA, <8 x i16> %VB) {
+ entry:
+ %0 = shufflevector <8 x i16> %VA, <8 x i16> %VB,<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x i16> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v8i16_v8i16
+; CHECK-LE: xxpermdi 34, 35, 34, 1
+; CHECK-LE: blr
+}
+
+define <4 x i32> @test_le_swap_vec_xxpermdi_v4i32_v4i32(<4 x i32> %VA, <4 x i32> %VB) {
+ entry:
+ %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB,<4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x i32> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v4i32_v4i32
+; CHECK-LE: xxpermdi 34, 35, 34, 1
+; CHECK-LE: blr
+}
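The RUN lines that drive these CHECK-LE/CHECK-BE expectations sit at the top of the file and are outside this hunk; a typical pair of invocations for such a test looks roughly like the following (the file name and -mcpu value are assumptions for illustration, not copied from the patch):

    $ llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < vec_xxpermdi.ll \
        | FileCheck --check-prefix=CHECK-LE vec_xxpermdi.ll
    $ llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < vec_xxpermdi.ll \
        | FileCheck --check-prefix=CHECK-BE vec_xxpermdi.ll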
diff --git a/test/CodeGen/Thumb2/tbb-removeadd.mir b/test/CodeGen/Thumb2/tbb-removeadd.mir
index 89ed98720539..106066791343 100644
--- a/test/CodeGen/Thumb2/tbb-removeadd.mir
+++ b/test/CodeGen/Thumb2/tbb-removeadd.mir
@@ -39,7 +39,6 @@ name: Func
alignment: 1
exposesReturnsTwice: false
-noVRegs: true
legalized: false
regBankSelected: false
selected: false
diff --git a/test/CodeGen/X86/2007-01-08-InstrSched.ll b/test/CodeGen/X86/2007-01-08-InstrSched.ll
index 4ec703921e29..24aa5b98d0bb 100644
--- a/test/CodeGen/X86/2007-01-08-InstrSched.ll
+++ b/test/CodeGen/X86/2007-01-08-InstrSched.ll
@@ -13,10 +13,10 @@ define float @foo(float %x) nounwind {
; CHECK: mulss
; CHECK: mulss
-; CHECK: mulss
-; CHECK: mulss
; CHECK: addss
+; CHECK: mulss
; CHECK: addss
+; CHECK: mulss
; CHECK: addss
; CHECK: ret
}
diff --git a/test/CodeGen/X86/GlobalISel/irtranslator-call.ll b/test/CodeGen/X86/GlobalISel/irtranslator-call.ll
index bc394f6e156f..6c60aed67a7b 100644
--- a/test/CodeGen/X86/GlobalISel/irtranslator-call.ll
+++ b/test/CodeGen/X86/GlobalISel/irtranslator-call.ll
@@ -5,7 +5,6 @@ define void @test_void_return() {
; CHECK-LABEL: name: test_void_return
; CHECK: alignment: 4
; CHECK-NEXT: exposesReturnsTwice: false
-; CHECK-NEXT: noVRegs: false
; CHECK-NEXT: legalized: false
; CHECK-NEXT: regBankSelected: false
; CHECK-NEXT: selected: false
diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll
index b9f7fc68cf68..ad82b8cfb775 100644
--- a/test/CodeGen/X86/add-of-carry.ll
+++ b/test/CodeGen/X86/add-of-carry.ll
@@ -9,9 +9,11 @@ define i32 @test1(i32 %sum, i32 %x) nounwind readnone ssp {
; CHECK-LABEL: test1:
; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: adcl $0, %eax
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: addl %ecx, %edx
+; CHECK-NEXT: adcl %ecx, %eax
; CHECK-NEXT: retl
%add4 = add i32 %x, %sum
%cmp = icmp ult i32 %add4, %x
diff --git a/test/CodeGen/X86/addcarry.ll b/test/CodeGen/X86/addcarry.ll
index 3f4ee362e230..3c84af4aa9ec 100644
--- a/test/CodeGen/X86/addcarry.ll
+++ b/test/CodeGen/X86/addcarry.ll
@@ -86,21 +86,14 @@ entry:
define %scalar @pr31719(%scalar* nocapture readonly %this, %scalar %arg.b) {
; CHECK-LABEL: pr31719:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: xorl %r10d, %r10d
-; CHECK-NEXT: addq 8(%rsi), %rcx
-; CHECK-NEXT: setb %r10b
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: addq 16(%rsi), %r8
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: addq 24(%rsi), %r9
; CHECK-NEXT: addq (%rsi), %rdx
-; CHECK-NEXT: adcq $0, %rcx
-; CHECK-NEXT: adcq %r8, %r10
-; CHECK-NEXT: adcq %r9, %rax
+; CHECK-NEXT: adcq 8(%rsi), %rcx
+; CHECK-NEXT: adcq 16(%rsi), %r8
+; CHECK-NEXT: adcq 24(%rsi), %r9
; CHECK-NEXT: movq %rdx, (%rdi)
; CHECK-NEXT: movq %rcx, 8(%rdi)
-; CHECK-NEXT: movq %r10, 16(%rdi)
-; CHECK-NEXT: movq %rax, 24(%rdi)
+; CHECK-NEXT: movq %r8, 16(%rdi)
+; CHECK-NEXT: movq %r9, 24(%rdi)
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: retq
entry:
@@ -190,9 +183,9 @@ entry:
define i64 @shiftadd(i64 %a, i64 %b, i64 %c, i64 %d) {
; CHECK-LABEL: shiftadd:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: leaq (%rdx,%rcx), %rax
; CHECK-NEXT: addq %rsi, %rdi
-; CHECK-NEXT: adcq $0, %rax
+; CHECK-NEXT: adcq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
; CHECK-NEXT: retq
entry:
%0 = zext i64 %a to i128
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index 2aaf14001758..aa28ef5175ed 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -135,88 +135,87 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
; SSE2-LABEL: avg_v32i8:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm8
-; SSE2-NEXT: movdqa 16(%rdi), %xmm11
+; SSE2-NEXT: movdqa (%rdi), %xmm3
+; SSE2-NEXT: movdqa 16(%rdi), %xmm8
; 
SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm12 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm11, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm2, %xmm9 ; SSE2-NEXT: punpckhwd {{.*#+}} 
xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm6, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: paddd %xmm11, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: paddd %xmm10, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: paddd %xmm9, %xmm13 -; SSE2-NEXT: paddd %xmm15, %xmm2 -; SSE2-NEXT: paddd %xmm14, %xmm5 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm3 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd 
%xmm4, %xmm9 ; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm13 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm7 ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm7, %xmm3 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm3 ; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm0 ; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm9 ; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm4, %xmm13 +; SSE2-NEXT: packuswb %xmm9, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm7 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm13, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm7, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -259,198 +258,183 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; SSE2-LABEL: avg_v64i8: ; SSE2: # BB#0: -; SSE2-NEXT: subq $152, %rsp -; SSE2-NEXT: .Lcfi0: -; SSE2-NEXT: .cfi_def_cfa_offset 160 -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm4 -; SSE2-NEXT: movdqa 32(%rdi), %xmm5 -; SSE2-NEXT: movdqa 48(%rdi), %xmm6 +; SSE2-NEXT: movdqa (%rdi), %xmm6 +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa (%rsi), %xmm5 +; SSE2-NEXT: movdqa 16(%rsi), %xmm13 +; SSE2-NEXT: movdqa 32(%rsi), %xmm11 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm6, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm2, %xmm15 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm15, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = 
xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm10, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm10 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm3 +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm5 ; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa (%rsi), %xmm14 -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm7, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm13, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm4, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: paddd %xmm14, %xmm12 +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = 
xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm14, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE2-NEXT: movdqa 16(%rsi), %xmm12 -; SSE2-NEXT: movdqa %xmm12, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: paddd %xmm15, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm13, %xmm15 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: paddd %xmm8, %xmm15 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm13 +; SSE2-NEXT: movdqa %xmm11, %xmm6 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm6, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm12, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm5, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: paddd %xmm7, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm11, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; SSE2-NEXT: paddd %xmm2, 
%xmm14 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: paddd %xmm1, %xmm11 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movdqa 48(%rsi), %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm1, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movdqa 48(%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm8, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd 
-{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload -; SSE2-NEXT: paddd (%rsp), %xmm11 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm14 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: paddd %xmm0, %xmm10 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm0, %xmm12 +; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm15 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: paddd %xmm0, %xmm14 ; SSE2-NEXT: paddd %xmm0, %xmm13 +; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm10 -; SSE2-NEXT: paddd %xmm0, %xmm12 +; SSE2-NEXT: paddd %xmm0, %xmm14 ; SSE2-NEXT: paddd %xmm0, %xmm11 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: psrld $1, %xmm15 +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm0, %xmm7 +; SSE2-NEXT: psrld $1, %xmm10 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm10 +; SSE2-NEXT: packuswb %xmm1, %xmm10 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: packuswb %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm12 +; SSE2-NEXT: pand %xmm0, %xmm12 +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: packuswb %xmm12, %xmm4 +; SSE2-NEXT: psrld $1, %xmm13 +; SSE2-NEXT: psrld $1, %xmm15 ; SSE2-NEXT: pand %xmm0, %xmm15 -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: packuswb %xmm15, %xmm7 -; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: pand %xmm0, %xmm13 +; SSE2-NEXT: packuswb %xmm15, %xmm13 +; SSE2-NEXT: packuswb %xmm4, %xmm13 +; SSE2-NEXT: psrld $1, %xmm6 ; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: pand %xmm0, %xmm9 -; SSE2-NEXT: pand %xmm0, %xmm14 -; SSE2-NEXT: packuswb %xmm9, %xmm14 -; SSE2-NEXT: packuswb %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, 
%xmm6 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm0, %xmm13 ; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: packuswb %xmm13, %xmm6 -; SSE2-NEXT: psrld $1, %xmm12 -; SSE2-NEXT: psrld $1, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm12 -; SSE2-NEXT: packuswb %xmm10, %xmm12 -; SSE2-NEXT: packuswb %xmm6, %xmm12 -; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: packuswb %xmm9, %xmm6 ; SSE2-NEXT: psrld $1, %xmm11 +; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: pand %xmm0, %xmm14 ; SSE2-NEXT: pand %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: packuswb %xmm11, %xmm5 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: packuswb %xmm6, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm5 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: packuswb %xmm5, %xmm4 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: packuswb %xmm14, %xmm11 +; SSE2-NEXT: packuswb %xmm6, %xmm11 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: packuswb %xmm8, %xmm3 +; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm5, %xmm1 -; SSE2-NEXT: packuswb %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: packuswb %xmm5, %xmm7 +; SSE2-NEXT: packuswb %xmm3, %xmm7 +; SSE2-NEXT: movdqu %xmm7, (%rax) +; SSE2-NEXT: movdqu %xmm11, (%rax) +; SSE2-NEXT: movdqu %xmm13, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm2, (%rax) -; SSE2-NEXT: movdqu %xmm12, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) -; SSE2-NEXT: addq $152, %rsp ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v64i8: @@ -464,21 +448,21 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm15, %ymm7, %ymm7 -; AVX2-NEXT: vpaddd %ymm14, %ymm6, %ymm6 -; AVX2-NEXT: vpaddd %ymm13, %ymm5, %ymm5 -; AVX2-NEXT: vpaddd %ymm12, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm11, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm10, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm9, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm8 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm9 ; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm10 @@ -540,13 +524,13 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpaddd %zmm7, %zmm3, %zmm3 -; AVX512F-NEXT: vpaddd %zmm6, %zmm2, %zmm2 -; AVX512F-NEXT: vpaddd %zmm5, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm4 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1 @@ -673,27 +657,27 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) { define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) { ; SSE2-LABEL: avg_v16i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm5 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm4 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm0 @@ -755,80 +739,79 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) { define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-LABEL: avg_v32i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm10 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 -; SSE2-NEXT: movdqa 32(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm4 +; SSE2-NEXT: movdqa 16(%rdi), 
%xmm11 +; SSE2-NEXT: movdqa 32(%rdi), %xmm10 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm14 +; SSE2-NEXT: movdqa (%rsi), %xmm9 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE2-NEXT: movdqa %xmm8, %xmm13 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm14, %xmm7 +; SSE2-NEXT: movdqa %xmm9, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: paddd %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: paddd %xmm10, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: paddd %xmm13, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm15, %xmm5 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm14 
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm14 +; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: paddd %xmm0, %xmm6 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: pslld $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm14 -; SSE2-NEXT: psrad $16, %xmm14 -; SSE2-NEXT: packssdw %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: pslld $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: packssdw %xmm7, %xmm9 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: packssdw %xmm6, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm4 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm3 @@ -837,7 +820,7 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) +; SSE2-NEXT: movdqu %xmm9, (%rax) ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v32i16: @@ -847,13 +830,13 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 @@ -884,9 +867,9 @@ define 
void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 @@ -1047,88 +1030,87 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) { define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) { ; SSE2-LABEL: avg_v32i8_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm8 -; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm3 +; SSE2-NEXT: movdqa 16(%rdi), %xmm8 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm12 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 
+; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm11, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm2, %xmm9 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm6, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; 
SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: paddd %xmm11, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: paddd %xmm10, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: paddd %xmm9, %xmm13 -; SSE2-NEXT: paddd %xmm15, %xmm2 -; SSE2-NEXT: paddd %xmm14, %xmm5 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm3 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm13 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm7 ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm7, %xmm3 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm3 ; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm0 ; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm9 ; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm4, %xmm13 +; SSE2-NEXT: packuswb %xmm9, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm7 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm13, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm7, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 ; 
SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -1512,27 +1494,27 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) { define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) { ; SSE2-LABEL: avg_v16i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm5 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm4 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm0 @@ -1594,80 +1576,79 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) { define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-LABEL: avg_v32i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm10 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 -; SSE2-NEXT: movdqa 32(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm4 +; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa 32(%rdi), %xmm10 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm14 +; SSE2-NEXT: movdqa (%rsi), %xmm9 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; 
SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE2-NEXT: movdqa %xmm8, %xmm13 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm14, %xmm7 +; SSE2-NEXT: movdqa %xmm9, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: paddd %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: paddd %xmm10, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: paddd %xmm13, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm15, %xmm5 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm14 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm14 +; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: paddd %xmm0, %xmm6 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psrld $1, 
%xmm14 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: pslld $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm14 -; SSE2-NEXT: psrad $16, %xmm14 -; SSE2-NEXT: packssdw %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: pslld $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: packssdw %xmm7, %xmm9 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: packssdw %xmm6, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm4 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm3 @@ -1676,7 +1657,7 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) +; SSE2-NEXT: movdqu %xmm9, (%rax) ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v32i16_2: @@ -1686,13 +1667,13 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 @@ -1723,9 +1704,9 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll index 341dd867e4ff..647b7a8f4dfc 100644 --- a/test/CodeGen/X86/avx.ll +++ b/test/CodeGen/X86/avx.ll @@ -113,11 +113,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; CHECK-NOT: mov ; CHECK: insertps $48 ; CHECK: insertps $48 +; CHECK: vaddps ; CHECK: insertps $48 ; CHECK: insertps $48 ; CHECK: vaddps ; CHECK: vaddps -; CHECK: vaddps ; CHECK-NEXT: ret %1 = getelementptr inbounds float, float* %fb, i64 %index %2 = load float, float* %1, align 4 diff --git a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll index 63b0281a7339..e29cf09718ad 100644 --- a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll +++ b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll @@ -13,10 +13,10 @@ define zeroext i16 @cmp_kor_seq_16(<16 x float> %a, <16 x float> %b, <16 x float ; CHECK: # BB#0: # %entry ; CHECK-NEXT: vcmpgeps %zmm4, %zmm0, %k0 ; CHECK-NEXT: vcmpgeps %zmm4, %zmm1, %k1 -; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k2 -; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k3 ; CHECK-NEXT: korw %k1, %k0, %k0 -; CHECK-NEXT: korw %k3, %k2, %k1 +; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k1 +; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k2 +; CHECK-NEXT: korw %k2, %k1, %k1 ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll index 4890afec2164..c03623a2f035 100644 --- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -852,16 +852,16 @@ define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %b ; CHECK-NEXT: kxorw %k0, %k0, %k1 ; CHECK-NEXT: vmovaps %zmm1, %zmm3 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: movw $1, %ax ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vmovaps %zmm1, %zmm4 -; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} ; CHECK-NEXT: movw $220, %ax ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} -; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0 -; CHECK-NEXT: vaddps %zmm4, %zmm1, %zmm1 -; CHECK-NEXT: 
vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4) %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4) diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 32da0a70218e..431223611fae 100644 --- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -9,8 +9,8 @@ define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> ; CHECK-NEXT: vbroadcastss %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq @@ -30,8 +30,8 @@ define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq @@ -51,8 +51,8 @@ define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1) @@ -71,8 +71,8 @@ define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1) @@ -91,8 +91,8 @@ define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] -; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) @@ -111,8 +111,8 @@ define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] -; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = 
zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) @@ -131,8 +131,8 @@ define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x ; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6] -; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2) @@ -671,9 +671,9 @@ define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i6 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) @@ -1616,9 +1616,9 @@ define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x ; CHECK-NEXT: vshufpd {{.*#+}} zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] -; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4) %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) @@ -2031,8 +2031,8 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3) @@ -2051,8 +2051,8 @@ define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpsrld 
$4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3) @@ -2651,8 +2651,8 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15] ; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3>, <16 x float> %x2, i16 %x3) @@ -2989,9 +2989,9 @@ define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, < ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) @@ -3010,9 +3010,9 @@ define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) @@ -3030,9 +3030,9 @@ define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, < ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) @@ -3050,9 +3050,9 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i6 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: 
vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index 563cad04b8c2..b04c1ab38e55 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -479,11 +479,11 @@ declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2usi64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2usi %xmm0, %rcx -; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtsd2usi %xmm0, %rax +; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 4) @@ -498,11 +498,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) nounwind readnone define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2si64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2si %xmm0, %rcx -; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtsd2si %xmm0, %rax +; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 4) @@ -517,11 +517,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32) nounwind readnone define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2usi64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2usi %xmm0, %rcx -; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtss2usi %xmm0, %rax +; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 4) @@ -536,11 +536,11 @@ declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32) nounwind readnone define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2si64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2si %xmm0, %rcx -; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtss2si %xmm0, %rax +; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 4) @@ -555,11 +555,11 @@ declare i64 @llvm.x86.avx512.vcvtss2si64(<4 x float>, i32) nounwind readnone define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2usi32: ; CHECK: ## BB#0: -; CHECK-NEXT: 
vcvtsd2usi %xmm0, %ecx -; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %eax -; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtsd2usi %xmm0, %eax +; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %eax ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4) @@ -574,11 +574,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2si32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2si %xmm0, %ecx -; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %eax -; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtsd2si %xmm0, %eax +; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %eax ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4) @@ -593,11 +593,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2usi32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2usi %xmm0, %ecx -; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %eax -; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtss2usi %xmm0, %eax +; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %eax ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4) @@ -612,11 +612,11 @@ declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2si32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2si %xmm0, %ecx -; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %eax -; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtss2si %xmm0, %eax +; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %eax ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4) @@ -685,8 +685,9 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, (%rsi) -; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1) %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask) @@ -4398,8 +4399,8 @@ define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, < ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vprold $3, %zmm0, %zmm0 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vprold $3, %zmm0, %zmm0 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x 
i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) @@ -4418,8 +4419,8 @@ define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vprolq $3, %zmm0, %zmm0 ; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vprolq $3, %zmm0, %zmm0 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) @@ -4520,9 +4521,9 @@ define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, < ; CHECK-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1} ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 ; CHECK-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z} +; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm3 ; CHECK-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0 -; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm1 -; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4) %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4) @@ -4543,9 +4544,9 @@ define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 ; CHECK-NEXT: vmovapd %zmm0, %zmm5 ; CHECK-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z} +; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm3 ; CHECK-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0 -; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm1 -; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4) %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4) @@ -4612,9 +4613,9 @@ define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, < ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 ; CHECK-NEXT: vmovaps %zmm0, %zmm5 ; CHECK-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1} +; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm3 ; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 -; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm1 -; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4) %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4) diff --git a/test/CodeGen/X86/avx512-mask-spills.ll b/test/CodeGen/X86/avx512-mask-spills.ll index 4ef88ac495c3..96aefdb10584 100644 --- a/test/CodeGen/X86/avx512-mask-spills.ll +++ b/test/CodeGen/X86/avx512-mask-spills.ll @@ -9,13 +9,11 @@ define <4 x i1> @test_4i1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: Lcfi0: ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 +; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovw 
{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload -; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: vpmovm2d %k0, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -34,14 +32,12 @@ define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) { ; CHECK-NEXT: Lcfi1: ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 +; CHECK-NEXT: korb %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload -; CHECK-NEXT: korb %k1, %k0, %k0 ; CHECK-NEXT: vpmovm2w %k0, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -60,14 +56,12 @@ define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) { ; CHECK-NEXT: Lcfi2: ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill -; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload -; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: vpmovm2b %k0, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -85,14 +79,12 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) { ; CHECK-NEXT: Lcfi3: ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 +; CHECK-NEXT: kord %k1, %k0, %k0 ; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill -; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, (%rsp) ## 4-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload -; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Reload -; CHECK-NEXT: kord %k1, %k0, %k0 ; CHECK-NEXT: vpmovm2b %k0, %ymm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -106,20 +98,18 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) { define <64 x i1> @test_64i1(<64 x i8> %a, <64 x i8> %b) { ; CHECK-LABEL: test_64i1: ; CHECK: ## BB#0: -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rax ; CHECK-NEXT: Lcfi4: -; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill -; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kmovq %k0, (%rsp) ## 8-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f -; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload -; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload -; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kmovq (%rsp), %k0 ## 8-byte Reload ; CHECK-NEXT: vpmovm2b %k0, %zmm0 -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq %cmp_res = icmp ugt <64 x i8> %a, %b diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll index 9b4e73a18fc2..faa055dfbbf3 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ 
b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -796,9 +796,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16> ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_w_512: @@ -806,9 +806,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16> ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1) @@ -826,8 +826,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1, ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm2 ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; @@ -836,8 +836,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1, ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm2 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} ; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1 +; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} ; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3) diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 51f9a382ccbf..ca01033bf78b 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -2159,9 +2159,9 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z} +; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_dbpsadbw_512: @@ -2169,9 +2169,9 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z} +; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; 
AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4) %res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4) @@ -2411,9 +2411,9 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z} +; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_permvar_hi_512: @@ -2421,9 +2421,9 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z} +; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3) diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll index 7df07b0413ed..571f345d4616 100644 --- a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -9,8 +9,8 @@ define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8] -; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0] ; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9] +; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0] ; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1) @@ -29,8 +29,8 @@ define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8] -; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0] ; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9] +; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0] ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xf9,0xfc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
@@ -49,8 +49,8 @@ define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16>
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9]
+; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1)
@@ -69,8 +69,8 @@ define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9]
+; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
@@ -89,8 +89,8 @@ define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x
; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0]
; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8]
-; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9]
+; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1)
@@ -109,8 +109,8 @@ define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16>
; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9]
+; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1)
@@ -1476,9 +1476,9 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_w_128(<8 x i16> %x0, <8 x i16> %x
; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xd3]
; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xc1]
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xcb]
-; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -1496,9 +1496,9 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_w_256(<16 x i16> %x0, <8 x i16>
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xd3]
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xc1]
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xcb]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -1596,8 +1596,8 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i32 %x1, <8
; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03]
-; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03]
; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca]
+; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03]
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
@@ -1616,8 +1616,8 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i32 %x1,
; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03]
-; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03]
; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca]
+; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03]
; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
index 8f528394f5bd..f8f47c87100a 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
@@ -9,8 +9,8 @@ define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32>
; CHECK-NEXT: vplzcntd %xmm0, %xmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
index 37aea45e6107..96254f7c95b0 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
@@ -7,8 +7,8 @@ define <4 x i32> @test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32
; CHECK-NEXT: vplzcntd %xmm0, %xmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%1 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false)
diff --git a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
index cf79819734a2..636358fb91cb 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
@@ -39,8 +39,8 @@ define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1}
-; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3)
diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll
index 06ee237593e7..d54208c00987 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics.ll
@@ -404,8 +404,8 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x2_512(<4 x float> %x0,
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 %x3)
@@ -424,8 +424,8 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x2_512(<4 x i32> %x0, <16
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3)
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
index 52a84deebf51..595b3e0ebb86 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
@@ -1568,8 +1568,8 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0,
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01]
-; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01]
; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
+; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01]
; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
@@ -1588,9 +1588,9 @@ define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01]
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xd3]
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01]
-; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xcb]
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
+; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1)
@@ -1608,9 +1608,9 @@ define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i6
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3]
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01]
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1)
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
index ad9ea93c2031..1bfdfd0e634d 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
@@ -635,8 +635,8 @@ define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0,
; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x19,0xd0]
; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0]
; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0]
; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3)
@@ -680,8 +680,8 @@ define <4 x i32>@test_int_x86_avx512_mask_broadcasti32x2_128(<4 x i32> %x0, <4 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x59,0xc8]
; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x59,0xd0]
-; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0]
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3)
diff --git a/test/CodeGen/X86/avx512ifma-intrinsics.ll b/test/CodeGen/X86/avx512ifma-intrinsics.ll
index 30ecc0d2e49e..9659dc6d455a 100644
--- a/test/CodeGen/X86/avx512ifma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512ifma-intrinsics.ll
@@ -13,8 +13,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1}
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
-; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -41,8 +41,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z}
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -69,8 +69,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1}
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
-; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -97,8 +97,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z}
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
index 3ca686cef3bf..b2fe6eba88ab 100644
--- a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
@@ -14,8 +14,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -42,8 +42,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1}
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -98,8 +98,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z}
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -126,8 +126,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -154,8 +154,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1}
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -182,8 +182,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -210,8 +210,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z}
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index 4d906a4fd29a..c2d8df6476b3 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -30,8 +30,8 @@ define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x
; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8]
-; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0]
; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
+; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
@@ -50,8 +50,8 @@ define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x
; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8]
-; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0]
; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc9]
+; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0]
; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1)
@@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x
; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8]
-; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0]
; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc9]
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0]
; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1)
@@ -90,8 +90,8 @@ define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8]
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0]
; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9]
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0]
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1)
@@ -110,8 +110,8 @@ define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %
; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8]
-; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0]
; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9]
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0]
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1)
@@ -130,8 +130,8 @@ define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %
; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8]
-; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0]
; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9]
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0]
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
@@ -152,9 +152,9 @@ define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x f
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8]
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0,2,2]
+; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; CHECK-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0,2,2]
-; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
@@ -175,9 +175,9 @@ define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x f
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8]
; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vmovsldup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xc0]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
@@ -198,9 +198,9 @@ define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x f
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8]
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,1,3,3]
+; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; CHECK-NEXT: vmovshdup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xc0]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,1,3,3]
-; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
@@ -221,9 +221,9 @@ define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x f
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8]
; CHECK-NEXT: ## ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vmovshdup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xc0]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
@@ -243,9 +243,9 @@ define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8]
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0]
+; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
; CHECK-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0]
-; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2)
@@ -266,9 +266,9 @@ define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8]
; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2]
+; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca]
; CHECK-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2]
-; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca]
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2)
@@ -3209,10 +3209,10 @@ define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x01]
; CHECK-NEXT: ## xmm2 {%k1} = xmm0[1],xmm1[0]
+; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xd3]
; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1],xmm1[0]
-; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xcb]
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1]
+; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 %x4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 -1)
@@ -3540,9 +3540,9 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x
; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd3,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xd3,0xd1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xd3]
; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xd3,0xc1]
-; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xcb]
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -3560,9 +3560,9 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xd3,0xd1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3]
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xd3,0xc1]
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -3580,9 +3580,9 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x
; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd2,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd2,0xd1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3]
; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd2,0xc1]
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -3600,9 +3600,9 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd2,0xd1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3]
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd2,0xc1]
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -3720,8 +3720,8 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i32 %x1, <2
; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x73,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x73,0xd0,0x03]
-; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03]
; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
+; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03]
; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
@@ -3740,8 +3740,8 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i32 %x1, <4
; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x73,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x73,0xd0,0x03]
-; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03]
; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
+; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03]
; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
@@ -3760,8 +3760,8 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i32 %x1, <4
; CHECK-NEXT: vpsrld $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x72,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrld $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xd0,0x03]
-; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03]
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
@@ -3780,8 +3780,8 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i32 %x1, <8
; CHECK-NEXT: vpsrld $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x72,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrld $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xd0,0x03]
-; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03]
; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
+; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03]
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
@@ -4642,10 +4642,10 @@ define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32>
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02]
; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3],xmm0[0,1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3]
; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1]
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 %x4)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 -1)
@@ -4817,9 +4817,9 @@ define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01]
+; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xd3]
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01]
-; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb]
-; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
+; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4)
%res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1)
@@ -4837,9 +4837,9 @@ define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i3
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3]
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01]
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4)
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index 1f324d679564..684b0468cf51 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -4368,8 +4368,8 @@ define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc8,0x03]
; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc8,0x03]
-; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
@@ -4388,8 +4388,8 @@ define <8 x i32>@test_int_x86_avx512_mask_prol_d_256(<8 x i32> %x0, i32 %x1, <8
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc8,0x03]
; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc8,0x03]
-; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
+; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
@@ -4408,8 +4408,8 @@ define <2 x i64>@test_int_x86_avx512_mask_prol_q_128(<2 x i64> %x0, i32 %x1, <2
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc8,0x03]
; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc8,0x03]
-; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
+; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
@@ -4428,8 +4428,8 @@ define <4 x i64>@test_int_x86_avx512_mask_prol_q_256(<4 x i64> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc8,0x03]
; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc8,0x03]
-; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
+; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
@@ -4528,8 +4528,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pror_d_128(<4 x i32> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc0,0x03]
; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc0,0x03]
-; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
@@ -4548,8 +4548,8 @@ define <8 x i32>@test_int_x86_avx512_mask_pror_d_256(<8 x i32> %x0, i32 %x1, <8
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc0,0x03]
; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc0,0x03]
-; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
+; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
@@ -4568,8 +4568,8 @@ define <2 x i64>@test_int_x86_avx512_mask_pror_q_128(<2 x i64> %x0, i32 %x1, <2
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc0,0x03]
; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc0,0x03]
-; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
+; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
@@ -4588,8 +4588,8 @@ define <4 x i64>@test_int_x86_avx512_mask_pror_q_256(<4 x i64> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc0,0x03]
; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc0,0x03]
-; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
+; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
@@ -4690,9 +4690,9 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, <
; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x54,0xda,0x05]
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vfixupimmpd $4, %xmm2, %xmm1, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xe2,0x04]
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xdc]
; CHECK-NEXT: vfixupimmpd $3, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x54,0xc2,0x03]
-; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xcc]
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1,<2 x i64> %x2, i32 5, i8 %x4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 %x4)
@@ -4732,9 +4732,9 @@ define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, <
; CHECK-NEXT: vfixupimmpd $4, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x54,0xda,0x04]
; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe2,0x05]
+; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdc]
; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
-; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcc]
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 4, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> %x1, <4 x i64> %x2 , i32 5, i8 %x4)
@@ -4755,9 +4755,9 @@ define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0,
; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
; CHECK-NEXT: vmovapd %ymm0, %ymm5 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xe8]
; CHECK-NEXT: vfixupimmpd $4, %ymm4, %ymm1, %ymm5 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xec,0x04]
+; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdd]
; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
-; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcd]
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 5, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> zeroinitializer, i32 4, i8 %x4)
diff --git a/test/CodeGen/X86/bitcast-and-setcc-128.ll b/test/CodeGen/X86/bitcast-and-setcc-128.ll
index a681c3b0aa42..092b139fca2f 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-128.ll
@@ -6,68 +6,35 @@
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefixes=AVX512
define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
-; SSE2-SSSE3-LABEL: v8i16:
-; SSE2-SSSE3: ## BB#0:
-; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pextrw $7, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $6, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $5, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $4, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $3, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $2, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $1, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movd %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: retq
+; SSE2-LABEL: v8i16:
+; SSE2: ## BB#0:
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm2
+; SSE2-NEXT: pmovmskb %xmm2, %eax
+; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: v8i16:
+; SSSE3: ## BB#0:
+; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
+; SSSE3-NEXT: pand %xmm0, %xmm2
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pmovmskb %xmm2, %eax
+; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: retq
;
; AVX12-LABEL: v8i16:
; AVX12: ## BB#0:
; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrw $7, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $6, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $5, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $4, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $3, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $2, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $1, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovd %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX12-NEXT: vpmovmskb %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v8i16:
@@ -90,22 +57,8 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: movd %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i32:
@@ -113,19 +66,8 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrd $3, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $2, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $1, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovd %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vmovmskps %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i32:
@@ -149,22 +91,8 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d)
; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm1
; SSE2-SSSE3-NEXT: cmpltps %xmm2, %xmm3
; SSE2-SSSE3-NEXT: andps %xmm1, %xmm3
-; SSE2-SSSE3-NEXT: movd %xmm3, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskps %xmm3, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4f32:
@@ -172,19 +100,8 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d)
; AVX12-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
; AVX12-NEXT: vcmpltps %xmm2, %xmm3, %xmm1
; AVX12-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrd $3, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $2, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $1, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovd %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vmovmskps %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4f32:
@@ -208,56 +125,8 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-SSSE3-NEXT: andb $1, %cl
-; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v16i8:
@@ -265,55 +134,8 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrb $15, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $14, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $13, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $12, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $11, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $10, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $9, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $8, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $7, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $6, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $5, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $4, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $3, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $2, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $1, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $0, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: vpmovmskb %xmm0, %eax
+; AVX12-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v16i8:
@@ -383,14 +205,8 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+;
SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i8: @@ -405,26 +221,21 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) { ; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 ; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3 ; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovmskpd %xmm0, %eax +; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i8: @@ -439,26 +250,21 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) { ; AVX2-NEXT: vpsrad $24, %xmm2, %xmm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3] +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm4 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3 ; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] ; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm4 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3 ; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovmskpd %xmm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: retq ; ; AVX512-LABEL: v2i8: @@ -537,14 +343,8 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) { ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; 
SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i16: @@ -559,26 +359,21 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) { ; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 ; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3 ; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovmskpd %xmm0, %eax +; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i16: @@ -593,26 +388,21 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) { ; AVX2-NEXT: vpsrad $16, %xmm2, %xmm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3] +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm4 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3 ; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] ; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm4 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3 ; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovmskpd %xmm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: retq ; ; AVX512-LABEL: v2i16: @@ -683,14 
+473,8 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i32: @@ -703,24 +487,19 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovmskpd %xmm0, %eax +; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i32: @@ -733,24 +512,19 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; AVX2-NEXT: vpsrad $31, %xmm2, %xmm4 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3] +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm4 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] ; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm4 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; 
AVX2-NEXT: vmovmskpd %xmm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: retq ; ; AVX512-LABEL: v2i32: @@ -801,14 +575,8 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v2i64: @@ -816,13 +584,8 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { ; AVX12-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1 ; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpextrq $1, %xmm0, %rax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vmovq %xmm0, %rax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v2i64: @@ -846,14 +609,8 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> ; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: cmpltpd %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: andpd %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: movq %xmm3, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm3, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v2f64: @@ -861,13 +618,8 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> ; AVX12-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 ; AVX12-NEXT: vcmpltpd %xmm2, %xmm3, %xmm1 ; AVX12-NEXT: vandpd %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpextrq $1, %xmm0, %rax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vmovq %xmm0, %rax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v2f64: @@ -892,29 +644,15 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { ; SSE2-SSSE3-NEXT: psrad $24, %xmm3 ; SSE2-SSSE3-NEXT: pslld $24, %xmm2 ; SSE2-SSSE3-NEXT: psrad $24, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pslld $24, %xmm1 ; SSE2-SSSE3-NEXT: psrad $24, %xmm1 ; SSE2-SSSE3-NEXT: pslld $24, %xmm0 ; SSE2-SSSE3-NEXT: psrad $24, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: movd %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; 
SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4i8: @@ -923,26 +661,15 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { ; AVX12-NEXT: vpsrad $24, %xmm3, %xmm3 ; AVX12-NEXT: vpslld $24, %xmm2, %xmm2 ; AVX12-NEXT: vpsrad $24, %xmm2, %xmm2 +; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 ; AVX12-NEXT: vpslld $24, %xmm1, %xmm1 ; AVX12-NEXT: vpsrad $24, %xmm1, %xmm1 ; AVX12-NEXT: vpslld $24, %xmm0, %xmm0 ; AVX12-NEXT: vpsrad $24, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1 -; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpextrd $3, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrd $2, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrd $1, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vmovd %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i8: @@ -975,29 +702,15 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) { ; SSE2-SSSE3-NEXT: psrad $16, %xmm3 ; SSE2-SSSE3-NEXT: pslld $16, %xmm2 ; SSE2-SSSE3-NEXT: psrad $16, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pslld $16, %xmm1 ; SSE2-SSSE3-NEXT: psrad $16, %xmm1 ; SSE2-SSSE3-NEXT: pslld $16, %xmm0 ; SSE2-SSSE3-NEXT: psrad $16, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: movd %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4i16: @@ -1006,26 +719,15 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) { ; AVX12-NEXT: vpsrad $16, %xmm3, %xmm3 ; AVX12-NEXT: vpslld $16, %xmm2, %xmm2 ; AVX12-NEXT: vpsrad $16, %xmm2, %xmm2 +; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 ; AVX12-NEXT: vpslld $16, %xmm1, %xmm1 ; AVX12-NEXT: vpsrad $16, %xmm1, %xmm1 ; 
AVX12-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX12-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1 -; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpextrd $3, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrd $2, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrd $1, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vmovd %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i16: @@ -1052,45 +754,42 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) { } define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { -; SSE2-SSSE3-LABEL: v8i8: -; SSE2-SSSE3: ## BB#0: -; SSE2-SSSE3-NEXT: psllw $8, %xmm3 -; SSE2-SSSE3-NEXT: psraw $8, %xmm3 -; SSE2-SSSE3-NEXT: psllw $8, %xmm2 -; SSE2-SSSE3-NEXT: psraw $8, %xmm2 -; SSE2-SSSE3-NEXT: psllw $8, %xmm1 -; SSE2-SSSE3-NEXT: psraw $8, %xmm1 -; SSE2-SSSE3-NEXT: psllw $8, %xmm0 -; SSE2-SSSE3-NEXT: psraw $8, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pextrw $7, %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $6, %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $5, %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $4, %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $3, %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $2, %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $1, %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movd %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: retq +; SSE2-LABEL: v8i8: +; SSE2: ## BB#0: +; SSE2-NEXT: psllw $8, %xmm3 +; SSE2-NEXT: psraw $8, %xmm3 +; SSE2-NEXT: psllw $8, %xmm2 +; SSE2-NEXT: psraw $8, %xmm2 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 +; SSE2-NEXT: psllw $8, %xmm1 +; SSE2-NEXT: psraw $8, %xmm1 +; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v8i8: +; SSSE3: ## BB#0: +; SSSE3-NEXT: psllw $8, %xmm3 +; SSSE3-NEXT: psraw $8, %xmm3 +; SSSE3-NEXT: psllw $8, %xmm2 +; SSSE3-NEXT: psraw $8, %xmm2 +; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 +; SSSE3-NEXT: psllw $8, %xmm1 +; SSSE3-NEXT: psraw $8, %xmm1 +; SSSE3-NEXT: psllw $8, %xmm0 +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = 
xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSSE3-NEXT: retq ; ; AVX12-LABEL: v8i8: ; AVX12: ## BB#0: @@ -1098,38 +797,16 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { ; AVX12-NEXT: vpsraw $8, %xmm3, %xmm3 ; AVX12-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX12-NEXT: vpsraw $8, %xmm2, %xmm2 +; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2 ; AVX12-NEXT: vpsllw $8, %xmm1, %xmm1 ; AVX12-NEXT: vpsraw $8, %xmm1, %xmm1 ; AVX12-NEXT: vpsllw $8, %xmm0, %xmm0 ; AVX12-NEXT: vpsraw $8, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1 -; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpextrw $7, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrw $6, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrw $5, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrw $4, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrw $3, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrw $2, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrw $1, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vmovd %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v8i8: diff --git a/test/CodeGen/X86/bitcast-and-setcc-256.ll b/test/CodeGen/X86/bitcast-and-setcc-256.ll index 06b1a76f6bae..a6d6ca155302 100644 --- a/test/CodeGen/X86/bitcast-and-setcc-256.ll +++ b/test/CodeGen/X86/bitcast-and-setcc-256.ll @@ -1,8 +1,83 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX2 +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+SSE2 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2 +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+SSSE3 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3 +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=AVX12,AVX1 +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=AVX12,AVX2 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefix=AVX512 define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { +; SSE2-SSSE3-LABEL: v4i64: +; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0] +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm3 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm9 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm10, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3] +; 
SSE2-SSSE3-NEXT: por %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSE2-SSSE3-NEXT: pslld $31, %xmm0 +; SSE2-SSSE3-NEXT: psrad $31, %xmm0 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm7 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE2-SSSE3-NEXT: pslld $31, %xmm2 +; SSE2-SSSE3-NEXT: psrad $31, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: v4i64: +; AVX1: ## BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovmskps %xmm0, %eax +; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: v4i64: ; AVX2: ## BB#0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 @@ -12,19 +87,8 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrd $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vmovmskps %xmm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -45,30 +109,36 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { } define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) { -; 
AVX2-LABEL: v4f64: -; AVX2: ## BB#0: -; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vcmpltpd %ymm2, %ymm3, %ymm1 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrd $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; SSE2-SSSE3-LABEL: v4f64: +; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3-NEXT: cmpltpd %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE2-SSSE3-NEXT: pslld $31, %xmm2 +; SSE2-SSSE3-NEXT: psrad $31, %xmm2 +; SSE2-SSSE3-NEXT: cmpltpd %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: cmpltpd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2] +; SSE2-SSSE3-NEXT: pslld $31, %xmm6 +; SSE2-SSSE3-NEXT: psrad $31, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm6 +; SSE2-SSSE3-NEXT: movmskps %xmm6, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: retq +; +; AVX12-LABEL: v4f64: +; AVX12: ## BB#0: +; AVX12-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 +; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vcmpltpd %ymm2, %ymm3, %ymm1 +; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX12-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: vzeroupper +; AVX12-NEXT: retq ; ; AVX512-LABEL: v4f64: ; AVX512: ## BB#0: @@ -87,6 +157,78 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> } define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) { +; SSE2-LABEL: v16i16: +; SSE2: ## BB#0: +; SSE2-NEXT: pcmpgtw %xmm3, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtw %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtw %xmm7, %xmm5 +; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: packuswb %xmm5, %xmm4 +; SSE2-NEXT: psllw $7, %xmm4 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pmovmskb %xmm2, %eax +; SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i16: +; SSSE3: ## BB#0: +; SSSE3-NEXT: pcmpgtw %xmm3, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtw %xmm2, %xmm0 +; SSSE3-NEXT: pshufb %xmm3, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: psllw $7, %xmm0 +; SSSE3-NEXT: movdqa 
{{.*#+}} xmm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtb %xmm0, %xmm1 +; SSSE3-NEXT: pcmpgtw %xmm7, %xmm5 +; SSSE3-NEXT: pshufb %xmm3, %xmm5 +; SSSE3-NEXT: pcmpgtw %xmm6, %xmm4 +; SSSE3-NEXT: pshufb %xmm3, %xmm4 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSSE3-NEXT: psllw $7, %xmm4 +; SSSE3-NEXT: pand %xmm8, %xmm4 +; SSSE3-NEXT: pcmpgtb %xmm4, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: pmovmskb %xmm2, %eax +; SSSE3-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; SSSE3-NEXT: retq +; +; AVX1-LABEL: v16i16: +; AVX1: ## BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpcmpgtw %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: v16i16: ; AVX2: ## BB#0: ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 @@ -96,55 +238,8 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) { ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $13, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $12, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $11, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $10, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $9, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $8, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $7, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $5, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $2, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $1, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -164,6 +259,79 @@ define i16 
@v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) { } define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { +; SSE2-LABEL: v8i32: +; SSE2: ## BB#0: +; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: psraw $15, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: psllw $15, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: pmovmskb %xmm2, %eax +; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v8i32: +; SSSE3: ## BB#0: +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT: pshufb %xmm3, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: psllw $15, %xmm0 +; SSSE3-NEXT: psraw $15, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 +; SSSE3-NEXT: pshufb %xmm3, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 +; SSSE3-NEXT: pshufb %xmm3, %xmm4 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSSE3-NEXT: psllw $15, %xmm4 +; SSSE3-NEXT: psraw $15, %xmm4 +; SSSE3-NEXT: pand %xmm0, %xmm4 +; SSSE3-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm4, %eax +; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSSE3-NEXT: retq +; +; AVX1-LABEL: v8i32: +; AVX1: ## BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: v8i32: ; AVX2: ## BB#0: ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 @@ -173,31 +341,9 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; 
AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -217,42 +363,74 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { } define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) { -; AVX2-LABEL: v8f32: -; AVX2: ## BB#0: -; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vcmpltps %ymm2, %ymm3, %ymm1 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; SSE2-LABEL: v8f32: +; SSE2: ## BB#0: +; SSE2-NEXT: cmpltps %xmm1, %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: cmpltps %xmm0, %xmm2 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: psraw $15, %xmm0 +; SSE2-NEXT: cmpltps %xmm5, %xmm7 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: cmpltps %xmm4, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: psllw $15, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: 
pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: pmovmskb %xmm2, %eax +; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v8f32: +; SSSE3: ## BB#0: +; SSSE3-NEXT: cmpltps %xmm1, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm1, %xmm3 +; SSSE3-NEXT: cmpltps %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSSE3-NEXT: psllw $15, %xmm2 +; SSSE3-NEXT: psraw $15, %xmm2 +; SSSE3-NEXT: cmpltps %xmm5, %xmm7 +; SSSE3-NEXT: pshufb %xmm1, %xmm7 +; SSSE3-NEXT: cmpltps %xmm4, %xmm6 +; SSSE3-NEXT: pshufb %xmm1, %xmm6 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSSE3-NEXT: psllw $15, %xmm6 +; SSSE3-NEXT: psraw $15, %xmm6 +; SSSE3-NEXT: pand %xmm2, %xmm6 +; SSSE3-NEXT: pshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm6, %eax +; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSSE3-NEXT: retq +; +; AVX12-LABEL: v8f32: +; AVX12: ## BB#0: +; AVX12-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 +; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vcmpltps %ymm2, %ymm3, %ymm1 +; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX12-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: vzeroupper +; AVX12-NEXT: retq ; ; AVX512-LABEL: v8f32: ; AVX512: ## BB#0: @@ -270,121 +448,250 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) } define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) { +; SSE2-SSSE3-LABEL: v32i8: +; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3-NEXT: pcmpgtb %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtb %xmm6, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtb %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb 
-{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-SSSE3-NEXT: andb $1, %cl +; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-SSSE3-NEXT: andb $1, %cl +; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; SSE2-SSSE3-NEXT: shll $16, %ecx +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: orl %ecx, %eax +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: v32i8: +; AVX1: ## BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: Lcfi0: +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: Lcfi1: +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: Lcfi2: +; AVX1-NEXT: .cfi_def_cfa_register %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $32, %rsp +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, 
%ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrb $15, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $14, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $13, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $12, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $11, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $10, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $9, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $8, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $7, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $6, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $5, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $4, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $3, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $2, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $1, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $0, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: movl (%rsp), %eax +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: 
v32i8: ; AVX2: ## BB#0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: Lcfi0: -; AVX2-NEXT: .cfi_def_cfa_offset 16 -; AVX2-NEXT: Lcfi1: -; AVX2-NEXT: .cfi_offset %rbp, -16 -; AVX2-NEXT: movq %rsp, %rbp -; AVX2-NEXT: Lcfi2: -; AVX2-NEXT: .cfi_def_cfa_register %rbp -; AVX2-NEXT: andq $-32, %rsp -; AVX2-NEXT: subq $32, %rsp ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $13, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $10, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $6, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $5, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $4, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $2, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $0, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $13, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $12, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $11, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $10, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $9, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $8, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $7, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $5, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $2, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $1, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb 
%al, (%rsp) -; AVX2-NEXT: movl (%rsp), %eax -; AVX2-NEXT: movq %rbp, %rsp -; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll index d1508f99fc71..9bf7b41a4f26 100644 --- a/test/CodeGen/X86/bitcast-setcc-128.ll +++ b/test/CodeGen/X86/bitcast-setcc-128.ll @@ -1,69 +1,35 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=CHECK,SSE2-SSSE3,SSE2 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+ssse3 < %s | FileCheck %s --check-prefixes=CHECK,SSE2-SSSE3,SSSE3 -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=CHECK,AVX1 +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=CHECK,AVX12,AVX1 +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=CHECK,AVX12,AVX2 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefixes=CHECK,AVX512 define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) { -; SSE2-SSSE3-LABEL: v8i16: -; SSE2-SSSE3: ## BB#0: -; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pextrw $7, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $6, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $5, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $4, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $3, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $2, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $1, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: retq +; SSE2-LABEL: v8i16: +; SSE2: ## BB#0: +; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-NEXT: retq ; -; AVX1-LABEL: v8i16: -; AVX1: ## BB#0: -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrw $7, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $6, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $5, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $4, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; 
AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; SSSE3-LABEL: v8i16: +; SSSE3: ## BB#0: +; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSSE3-NEXT: retq +; +; AVX12-LABEL: v8i16: +; AVX12: ## BB#0: +; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: retq ; ; AVX512-LABEL: v8i16: ; AVX512: ## BB#0: @@ -80,41 +46,16 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b) { ; SSE2-SSSE3-LABEL: v4i32: ; SSE2-SSSE3: ## BB#0: ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v4i32: -; AVX1: ## BB#0: -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; AVX12-LABEL: v4i32: +; AVX12: ## BB#0: +; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i32: ; AVX512: ## BB#0: @@ -132,42 +73,16 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b) { ; SSE2-SSSE3-LABEL: v4f32: ; SSE2-SSSE3: ## BB#0: ; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movaps %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; 
SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v4f32: -; AVX1: ## BB#0: -; AVX1-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vextractps $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vextractps $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vextractps $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vextractps $0, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; AVX12-LABEL: v4f32: +; AVX12: ## BB#0: +; AVX12-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: retq ; ; AVX512-LABEL: v4f32: ; AVX512: ## BB#0: @@ -185,111 +100,16 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b) { ; SSE2-SSSE3-LABEL: v16i8: ; SSE2-SSSE3: ## BB#0: ; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-SSSE3-NEXT: andb $1, %cl -; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v16i8: -; AVX1: ## BB#0: -; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $15, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $14, %xmm0, %eax -; AVX1-NEXT: 
andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $13, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $12, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $11, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $10, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $9, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $8, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $7, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $6, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $5, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $4, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $3, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $2, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $1, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax -; AVX1-NEXT: retq +; AVX12-LABEL: v16i8: +; AVX12: ## BB#0: +; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; AVX12-NEXT: retq ; ; AVX512-LABEL: v16i8: ; AVX512: ## BB#0: @@ -330,14 +150,8 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) { ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movq %xmm1, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i8: @@ -353,15 +167,27 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vmovmskpd %xmm0, %eax +; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX1-NEXT: retq ; +; AVX2-LABEL: v2i8: +; AVX2: ## BB#0: +; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 +; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 +; AVX2-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovmskpd %xmm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: retq +; ; AVX512-LABEL: v2i8: ; AVX512: ## BB#0: ; AVX512-NEXT: vpsllq $56, %xmm1, %xmm1 @@ -406,14 +232,8 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) { ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movq %xmm1, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i16: @@ -429,15 +249,27 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vmovmskpd %xmm0, %eax +; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX1-NEXT: retq ; +; AVX2-LABEL: v2i16: +; AVX2: ## BB#0: +; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 +; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovmskpd %xmm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: retq +; ; AVX512-LABEL: v2i16: ; AVX512: ## BB#0: ; AVX512-NEXT: vpsllq $48, %xmm1, %xmm1 @@ -478,14 +310,8 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movq %xmm1, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i32: @@ -499,15 +325,25 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb 
-{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vmovmskpd %xmm0, %eax +; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX1-NEXT: retq ; +; AVX2-LABEL: v2i32: +; AVX2: ## BB#0: +; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovmskpd %xmm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: retq +; ; AVX512-LABEL: v2i32: ; AVX512: ## BB#0: ; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1 @@ -538,27 +374,16 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movq %xmm1, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v2i64: -; AVX1: ## BB#0: -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; AVX12-LABEL: v2i64: +; AVX12: ## BB#0: +; AVX12-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: retq ; ; AVX512-LABEL: v2i64: ; AVX512: ## BB#0: @@ -576,27 +401,16 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b) { ; SSE2-SSSE3-LABEL: v2f64: ; SSE2-SSSE3: ## BB#0: ; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movq %xmm1, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v2f64: -; AVX1: ## BB#0: -; AVX1-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; AVX12-LABEL: v2f64: +; AVX12: ## BB#0: +; AVX12-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: retq ; ; AVX512-LABEL: v2f64: ; AVX512: ## BB#0: @@ -618,45 +432,20 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) { ; SSE2-SSSE3-NEXT: pslld $24, %xmm0 ; SSE2-SSSE3-NEXT: psrad $24, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; 
SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v4i8: -; AVX1: ## BB#0: -; AVX1-NEXT: vpslld $24, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX1-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; AVX12-LABEL: v4i8: +; AVX12: ## BB#0: +; AVX12-NEXT: vpslld $24, %xmm1, %xmm1 +; AVX12-NEXT: vpsrad $24, %xmm1, %xmm1 +; AVX12-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX12-NEXT: vpsrad $24, %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i8: ; AVX512: ## BB#0: @@ -682,45 +471,20 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) { ; SSE2-SSSE3-NEXT: pslld $16, %xmm0 ; SSE2-SSSE3-NEXT: psrad $16, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v4i16: -; AVX1: ## BB#0: -; AVX1-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX1-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: 
movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; AVX12-LABEL: v4i16: +; AVX12: ## BB#0: +; AVX12-NEXT: vpslld $16, %xmm1, %xmm1 +; AVX12-NEXT: vpsrad $16, %xmm1, %xmm1 +; AVX12-NEXT: vpslld $16, %xmm0, %xmm0 +; AVX12-NEXT: vpsrad $16, %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i16: ; AVX512: ## BB#0: @@ -739,73 +503,42 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) { } define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) { -; SSE2-SSSE3-LABEL: v8i8: -; SSE2-SSSE3: ## BB#0: -; SSE2-SSSE3-NEXT: psllw $8, %xmm1 -; SSE2-SSSE3-NEXT: psraw $8, %xmm1 -; SSE2-SSSE3-NEXT: psllw $8, %xmm0 -; SSE2-SSSE3-NEXT: psraw $8, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pextrw $7, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $6, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $5, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $4, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $3, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $2, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $1, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: retq +; SSE2-LABEL: v8i8: +; SSE2: ## BB#0: +; SSE2-NEXT: psllw $8, %xmm1 +; SSE2-NEXT: psraw $8, %xmm1 +; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-NEXT: retq ; -; AVX1-LABEL: v8i8: -; AVX1: ## BB#0: -; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrw $7, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $6, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $5, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $4, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; SSSE3-LABEL: v8i8: +; SSSE3: ## BB#0: +; SSSE3-NEXT: psllw $8, %xmm1 +; SSSE3-NEXT: psraw $8, %xmm1 +; SSSE3-NEXT: psllw $8, %xmm0 +; SSSE3-NEXT: psraw 
$8, %xmm0 +; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSSE3-NEXT: retq +; +; AVX12-LABEL: v8i8: +; AVX12: ## BB#0: +; AVX12-NEXT: vpsllw $8, %xmm1, %xmm1 +; AVX12-NEXT: vpsraw $8, %xmm1, %xmm1 +; AVX12-NEXT: vpsllw $8, %xmm0, %xmm0 +; AVX12-NEXT: vpsraw $8, %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: retq ; ; AVX512-LABEL: v8i8: ; AVX512: ## BB#0: diff --git a/test/CodeGen/X86/bitcast-setcc-256.ll b/test/CodeGen/X86/bitcast-setcc-256.ll index 51c6ad7c7f9e..b2c619c48d4d 100644 --- a/test/CodeGen/X86/bitcast-setcc-256.ll +++ b/test/CodeGen/X86/bitcast-setcc-256.ll @@ -8,55 +8,8 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $13, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $12, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $11, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $10, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $9, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $8, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $7, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $5, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $2, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $1, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -76,33 +29,8 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b) { ; AVX2-LABEL: v8i32: ; AVX2: ## BB#0: ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: 
movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vmovmskps %ymm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -122,33 +50,8 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b) { ; AVX2-LABEL: v8f32: ; AVX2: ## BB#0: ; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vmovmskps %ymm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -167,117 +70,8 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b) { define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX2-LABEL: v32i8: ; AVX2: ## BB#0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: Lcfi0: -; AVX2-NEXT: .cfi_def_cfa_offset 16 -; AVX2-NEXT: Lcfi1: -; AVX2-NEXT: .cfi_offset %rbp, -16 -; AVX2-NEXT: movq %rsp, %rbp -; AVX2-NEXT: Lcfi2: -; AVX2-NEXT: .cfi_def_cfa_register %rbp -; AVX2-NEXT: andq $-32, %rsp -; AVX2-NEXT: subq $32, %rsp ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $13, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $10, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $6, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $5, %xmm1, %eax -; 
AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $4, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $2, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $0, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $13, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $12, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $11, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $10, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $9, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $8, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $7, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $5, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $2, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $1, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: movl (%rsp), %eax -; AVX2-NEXT: movq %rbp, %rsp -; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -296,21 +90,8 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) { ; AVX2-LABEL: v4i64: ; AVX2: ## BB#0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrd $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vmovmskpd %ymm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -331,21 +112,8 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b) { ; AVX2-LABEL: v4f64: ; AVX2: ## BB#0: ; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrd $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: 
vpextrd $1, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT: vmovmskpd %ymm0, %eax
+; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
diff --git a/test/CodeGen/X86/bswap_tree2.ll b/test/CodeGen/X86/bswap_tree2.ll
index a9c74df9d0d9..1340b7662a7a 100644
--- a/test/CodeGen/X86/bswap_tree2.ll
+++ b/test/CodeGen/X86/bswap_tree2.ll
@@ -9,31 +9,32 @@ define i32 @test1(i32 %x) nounwind {
 ; CHECK-LABEL: test1:
 ; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl %ecx, %edx
-; CHECK-NEXT: andl $16711680, %edx # imm = 0xFF0000
-; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: orl $-16777216, %eax # imm = 0xFF000000
-; CHECK-NEXT: shll $8, %edx
-; CHECK-NEXT: shrl $8, %eax
-; CHECK-NEXT: bswapl %ecx
-; CHECK-NEXT: shrl $16, %ecx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: andl $16711680, %ecx # imm = 0xFF0000
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: orl $-16777216, %edx # imm = 0xFF000000
+; CHECK-NEXT: shll $8, %ecx
+; CHECK-NEXT: shrl $8, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: bswapl %eax
+; CHECK-NEXT: shrl $16, %eax
 ; CHECK-NEXT: orl %edx, %eax
-; CHECK-NEXT: orl %ecx, %eax
 ; CHECK-NEXT: retl
 ;
 ; CHECK64-LABEL: test1:
 ; CHECK64: # BB#0:
-; CHECK64-NEXT: movl %edi, %ecx
-; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000
 ; CHECK64-NEXT: movl %edi, %eax
-; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000
-; CHECK64-NEXT: shll $8, %ecx
-; CHECK64-NEXT: shrl $8, %eax
+; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000
+; CHECK64-NEXT: movl %edi, %ecx
+; CHECK64-NEXT: orl $-16777216, %ecx # imm = 0xFF000000
+; CHECK64-NEXT: shll $8, %eax
+; CHECK64-NEXT: shrl $8, %ecx
+; CHECK64-NEXT: orl %eax, %ecx
 ; CHECK64-NEXT: bswapl %edi
 ; CHECK64-NEXT: shrl $16, %edi
-; CHECK64-NEXT: orl %ecx, %eax
-; CHECK64-NEXT: orl %edi, %eax
+; CHECK64-NEXT: orl %ecx, %edi
+; CHECK64-NEXT: movl %edi, %eax
 ; CHECK64-NEXT: retq
 %byte0 = and i32 %x, 255 ; 0x000000ff
 %byte1 = and i32 %x, 65280 ; 0x0000ff00
diff --git a/test/CodeGen/X86/eh-unknown.ll b/test/CodeGen/X86/eh-unknown.ll
new file mode 100644
index 000000000000..7c495bdadc67
--- /dev/null
+++ b/test/CodeGen/X86/eh-unknown.ll
@@ -0,0 +1,32 @@
+; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s
+
+; An unknown personality forces us to emit an Itanium LSDA. Make sure that the
+; Itanium call site table actually tells the personality to keep unwinding,
+; i.e. we have an entry and it says "has no landing pad".
+
+declare void @throwit()
+declare void @__unknown_ehpersonality(...)
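; (Annotation, not part of this commit's test: a minimal sketch of the
; opposite case, with a hypothetical function name. If the throwing call
; were reached through an invoke with a landingpad, the Itanium call-site
; entry would carry a non-zero landing-pad offset instead of the
; "has no landing pad" record checked below.)
;
;   define void @with_landingpad()
;       personality void (...)* @__unknown_ehpersonality {
;   entry:
;     invoke void @throwit() to label %cont unwind label %lpad
;   cont:
;     ret void
;   lpad:
;     %ehvals = landingpad { i8*, i32 } cleanup
;     resume { i8*, i32 } %ehvals
;   }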
+
+define void @use_unknown_ehpersonality()
+    personality void (...)* @__unknown_ehpersonality {
+entry:
+  call void @throwit()
+  unreachable
+}
+
+; CHECK-LABEL: use_unknown_ehpersonality:
+; CHECK: .Lfunc_begin0:
+; CHECK: .seh_handler __unknown_ehpersonality, @unwind, @except
+; CHECK: callq throwit
+; CHECK: .Lfunc_end0:
+; CHECK: .seh_handlerdata
+; CHECK: .Lexception0:
+; CHECK: .byte 255 # @LPStart Encoding = omit
+; CHECK: .byte 0 # @TType Encoding = absptr
+; CHECK: .asciz "\217\200" # @TType base offset
+; CHECK: .byte 3 # Call site Encoding = udata4
+; CHECK: .byte 13 # Call site table length
+; CHECK: .long .Lfunc_begin0-.Lfunc_begin0 # >> Call Site 1 <<
+; CHECK: .long .Lfunc_end0-.Lfunc_begin0 # Call between .Lfunc_begin0 and .Lfunc_end0
+; CHECK: .long 0 # has no landing pad
+; CHECK: .byte 0 # On action: cleanup
diff --git a/test/CodeGen/X86/fmsubadd-combine.ll b/test/CodeGen/X86/fmsubadd-combine.ll
index bd8888966cf2..338a95f6a80c 100644
--- a/test/CodeGen/X86/fmsubadd-combine.ll
+++ b/test/CodeGen/X86/fmsubadd-combine.ll
@@ -117,9 +117,9 @@ define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x dou
 ; FMA3_256-NEXT: vsubpd %ymm5, %ymm1, %ymm2
 ; FMA3_256-NEXT: vsubpd %ymm4, %ymm0, %ymm3
 ; FMA3_256-NEXT: vaddpd %ymm5, %ymm1, %ymm1
+; FMA3_256-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
 ; FMA3_256-NEXT: vaddpd %ymm4, %ymm0, %ymm0
 ; FMA3_256-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
-; FMA3_256-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
 ; FMA3_256-NEXT: retq
 ;
 ; FMA3_512-LABEL: mul_subadd_pd512:
@@ -137,9 +137,9 @@ define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x dou
 ; FMA4-NEXT: vsubpd %ymm5, %ymm1, %ymm2
 ; FMA4-NEXT: vsubpd %ymm4, %ymm0, %ymm3
 ; FMA4-NEXT: vaddpd %ymm5, %ymm1, %ymm1
+; FMA4-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
 ; FMA4-NEXT: vaddpd %ymm4, %ymm0, %ymm0
 ; FMA4-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
-; FMA4-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
 ; FMA4-NEXT: retq
 entry:
 %AB = fmul <8 x double> %A, %B
@@ -157,9 +157,9 @@ define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x fl
 ; FMA3_256-NEXT: vsubps %ymm5, %ymm1, %ymm2
 ; FMA3_256-NEXT: vsubps %ymm4, %ymm0, %ymm3
 ; FMA3_256-NEXT: vaddps %ymm5, %ymm1, %ymm1
+; FMA3_256-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; FMA3_256-NEXT: vaddps %ymm4, %ymm0, %ymm0
 ; FMA3_256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
-; FMA3_256-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; FMA3_256-NEXT: retq
 ;
 ; FMA3_512-LABEL: mul_subadd_ps512:
@@ -178,9 +178,9 @@ define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x fl
 ; FMA4-NEXT: vsubps %ymm5, %ymm1, %ymm2
 ; FMA4-NEXT: vsubps %ymm4, %ymm0, %ymm3
 ; FMA4-NEXT: vaddps %ymm5, %ymm1, %ymm1
+; FMA4-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; FMA4-NEXT: vaddps %ymm4, %ymm0, %ymm0
 ; FMA4-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
-; FMA4-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; FMA4-NEXT: retq
 entry:
 %AB = fmul <16 x float> %A, %B
diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll
index d68236e9d250..eb06eb75a4d7 100644
---
a/test/CodeGen/X86/fold-tied-op.ll
+++ b/test/CodeGen/X86/fold-tied-op.ll
@@ -6,9 +6,10 @@ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
 target triple = "i386--netbsd"
 
 ; CHECK-LABEL: fn1
-; CHECK: addl {{.*#+}} 4-byte Folded Reload
-; CHECK: imull {{.*#+}} 4-byte Folded Reload
-; CHECK: orl {{.*#+}} 4-byte Folded Reload
+; CHECK: orl {{.*#+}} 4-byte Folded Reload
+; CHECK: addl {{.*#+}} 4-byte Folded Reload
+; CHECK: xorl {{.*#+}} 4-byte Folded Reload
+; CHECK: xorl {{.*#+}} 4-byte Folded Reload
 ; CHECK: retl
 
 %struct.XXH_state64_t = type { i32, i32, i64, i64, i64 }
diff --git a/test/CodeGen/X86/fp128-i128.ll b/test/CodeGen/X86/fp128-i128.ll
index 98082ec611d4..6c6bc8bdc1d1 100644
--- a/test/CodeGen/X86/fp128-i128.ll
+++ b/test/CodeGen/X86/fp128-i128.ll
@@ -50,8 +50,8 @@ define void @TestUnionLD1(fp128 %s, i64 %n) #0 {
 ; CHECK-NEXT: andq %rdi, %rcx
 ; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000
 ; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx
-; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT: orq %rcx, %rdx
+; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
 ; CHECK-NEXT: jmp foo # TAILCALL
diff --git a/test/CodeGen/X86/gnu-seh-nolpads.ll b/test/CodeGen/X86/gnu-seh-nolpads.ll
new file mode 100644
index 000000000000..311f4d522b1d
--- /dev/null
+++ b/test/CodeGen/X86/gnu-seh-nolpads.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=x86_64-windows-gnu < %s | FileCheck %s
+
+declare void @throwit()
+declare void @__gxx_personality_seh0(...)
+declare void @__gcc_personality_seh0(...)
+
+define void @use_gxx_seh()
+    personality void (...)* @__gxx_personality_seh0 {
+entry:
+  call void @throwit()
+  unreachable
+}
+
+; CHECK-LABEL: use_gxx_seh:
+; CHECK: .seh_proc use_gxx_seh
+; CHECK-NOT: .seh_handler __gxx_personality_seh0
+; CHECK: callq throwit
+; CHECK: .seh_handlerdata
+; CHECK: .seh_endproc
+
+define void @use_gcc_seh()
+    personality void (...)* @__gcc_personality_seh0 {
+entry:
+  call void @throwit()
+  unreachable
+}
+
+; CHECK-LABEL: use_gcc_seh:
+; CHECK: .seh_proc use_gcc_seh
+; CHECK-NOT: .seh_handler __gcc_personality_seh0
+; CHECK: callq throwit
+; CHECK: .seh_handlerdata
+; CHECK: .seh_endproc
+
diff --git a/test/CodeGen/X86/implicit-null-checks.mir b/test/CodeGen/X86/implicit-null-checks.mir
index d0ba057fa009..b05c4467d309 100644
--- a/test/CodeGen/X86/implicit-null-checks.mir
+++ b/test/CodeGen/X86/implicit-null-checks.mir
@@ -379,7 +379,7 @@ liveins:
   - { reg: '%esi' }
 # CHECK: bb.0.entry:
 # CHECK: %eax = MOV32ri 2200000
-# CHECK-NEXT: %eax = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, killed %eax, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+# CHECK-NEXT: %eax = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, %eax, %rdi, 1, _, 0, _, implicit-def %eflags :: (load 4 from %ir.x)
 # CHECK-NEXT: JMP_1 %bb.1.not_null
 
 body: |
@@ -544,7 +544,7 @@ liveins:
   - { reg: '%rsi' }
 # CHECK: bb.0.entry:
 # CHECK: %rbx = MOV64rr %rdx
-# CHECK-NEXT: %rdi = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, killed %rbx, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+# CHECK-NEXT: %rdi = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, %rbx, %rdi, 1, _, 0, _, implicit-def %eflags :: (load 4 from %ir.x)
 
 body: |
   bb.0.entry:
@@ -656,7 +656,7 @@ body: |
 name: use_alternate_load_op
 # CHECK-LABEL: name: use_alternate_load_op
 # CHECK: bb.0.entry:
-# CHECK: %rax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _
+# CHECK: %rax = FAULTING_OP 1,
%bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _ # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null @@ -689,7 +689,7 @@ body: | name: imp_null_check_gep_load_with_use_dep # CHECK-LABEL: name: imp_null_check_gep_load_with_use_dep # CHECK: bb.0.entry: -# CHECK: %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _, implicit-def %rax :: (load 4 from %ir.x) +# CHECK: %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _, implicit-def %rax :: (load 4 from %ir.x) # CHECK-NEXT: JMP_1 %bb.1.not_null alignment: 4 tracksRegLiveness: true @@ -721,7 +721,7 @@ name: imp_null_check_load_with_base_sep # CHECK-LABEL: name: imp_null_check_load_with_base_sep # CHECK: bb.0.entry: # CHECK: %rsi = ADD64rr %rsi, %rdi, implicit-def dead %eflags -# CHECK-NEXT: %esi = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %esi, %rdi, 1, _, 0, _, implicit-def dead %eflags +# CHECK-NEXT: %esi = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %esi, %rdi, 1, _, 0, _, implicit-def %eflags # CHECK-NEXT: JMP_1 %bb.1.not_null alignment: 4 tracksRegLiveness: true @@ -752,7 +752,7 @@ body: | name: inc_store # CHECK-LABEL: name: inc_store # CHECK: bb.0.entry: -# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _, killed %rsi +# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _, %rsi # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null @@ -782,7 +782,7 @@ body: | name: inc_store_plus_offset # CHECK-LABEL: inc_store_plus_offset # CHECK: bb.0.entry: -# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 16, _, killed %rsi +# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 16, _, %rsi # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null @@ -813,7 +813,7 @@ name: inc_store_with_dep # CHECK-LABEL: inc_store_with_dep # CHECK: bb.0.entry: # CHECK: %esi = ADD32rr killed %esi, killed %esi, implicit-def dead %eflags -# CHECK-NEXT: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 16, _, killed %esi +# CHECK-NEXT: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 16, _, %esi # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null @@ -972,7 +972,7 @@ body: | name: inc_store_with_reused_base # CHECK-LABEL: inc_store_with_reused_base # CHECK: bb.0.entry: -# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 16, _, killed %esi +# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 16, _, %esi # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null @@ -1174,7 +1174,7 @@ body: | name: inc_store_with_load_and_store # CHECK-LABEL: inc_store_with_load_and_store # CHECK: bb.0.entry: -# CHECK: _ = FAULTING_OP 2, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _, killed %esi, implicit-def dead %eflags +# CHECK: _ = FAULTING_OP 2, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _, %esi, implicit-def %eflags # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null @@ -1205,7 +1205,7 @@ body: | name: inc_store_and_load_no_alias # CHECK-LABEL: inc_store_and_load_no_alias # CHECK: bb.0.entry: -# CHECK: %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr) +# CHECK: %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr) # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null diff --git a/test/CodeGen/X86/lrshrink.ll b/test/CodeGen/X86/lrshrink.ll new file mode 100644 index 000000000000..a9cf086dbd90 --- /dev/null +++ b/test/CodeGen/X86/lrshrink.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +; Checks if "%7 = add nuw nsw i64 %4, %2" is moved before the last call +; to minimize live-range. + +define i64 @test(i1 %a, i64 %r1, i64 %r2, i64 %s1, i64 %s2, i64 %t1, i64 %t2) { +entry: + br i1 %a, label %then, label %else + +then: + br label %else + +else: + %0 = phi i64 [ 4, %entry ], [ 10, %then ] + %r = phi i64 [ %r1, %entry ], [ %r2, %then ] + %s = phi i64 [ %s1, %entry ], [ %s2, %then ] + %t = phi i64 [ %t1, %entry ], [ %t2, %then ] +; CHECK-LABEL: test: +; CHECK: add +; CHECK: add +; CHECK: call +; CHECK: add +; CHECK: call +; CHECK: add +; CHECK: call +; CHECK: add + %1 = tail call i32 @_Z3foov() + %2 = zext i32 %1 to i64 + %3 = tail call i32 @_Z3foov() + %4 = zext i32 %3 to i64 + %5 = tail call i32 @_Z3foov() + %6 = zext i32 %5 to i64 + %7 = add nuw nsw i64 %0, %r + tail call void @llvm.dbg.value(metadata i64 %7, i64 0, metadata !5, metadata !DIExpression()), !dbg !6 + %8 = add nuw nsw i64 %2, %7 + %9 = add nuw nsw i64 %4, %8 + %10 = add nuw nsw i64 %6, %9 + %11 = add nuw nsw i64 %s, %t + tail call void @llvm.dbg.value(metadata i64 %11, i64 0, metadata !5, metadata !DIExpression()), !dbg !6 + %12 = add nuw nsw i64 %10, %11 + ret i64 %12 +} + +declare i32 @_Z3foov() +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!1, !2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, emissionKind: FullDebug) +!1 = !{i32 2, !"Dwarf Version", i32 4} +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !DIFile(filename: "a.c", directory: "./") +!4 = distinct !DISubprogram(name: "test", scope: !3, unit: !0) +!5 = !DILocalVariable(name: "x", scope: !4) +!6 = !DILocation(line: 4, scope: !4) diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll index d332b2f3169f..af86df510016 100644 --- a/test/CodeGen/X86/madd.ll +++ b/test/CodeGen/X86/madd.ll @@ -129,9 +129,9 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly ; SSE2-NEXT: pmullw %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: addq $16, %rsi ; SSE2-NEXT: addq $16, %rdi ; SSE2-NEXT: addq $-8, %rax @@ -246,23 +246,23 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3 ; SSE2-NEXT: pmullw %xmm4, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: movq {{.*#+}} xmm6 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: movq {{.*#+}} xmm7 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm7 -; SSE2-NEXT: pmullw %xmm6, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm6, %xmm3 -; SSE2-NEXT: paddd 
diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll
index d332b2f3169f..af86df510016 100644
--- a/test/CodeGen/X86/madd.ll
+++ b/test/CodeGen/X86/madd.ll
@@ -129,9 +129,9 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly
 ; SSE2-NEXT: pmullw %xmm2, %xmm3
 ; SSE2-NEXT: movdqa %xmm3, %xmm2
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
 ; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm0
 ; SSE2-NEXT: addq $16, %rsi
 ; SSE2-NEXT: addq $16, %rdi
 ; SSE2-NEXT: addq $-8, %rax
@@ -246,23 +246,23 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3
 ; SSE2-NEXT: pmullw %xmm4, %xmm5
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
 ; SSE2-NEXT: psrad $16, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $16, %xmm5
-; SSE2-NEXT: movq {{.*#+}} xmm6 = mem[0],zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psraw $8, %xmm6
-; SSE2-NEXT: movq {{.*#+}} xmm7 = mem[0],zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psraw $8, %xmm7
-; SSE2-NEXT: pmullw %xmm6, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; SSE2-NEXT: psrad $16, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: paddd %xmm7, %xmm2
-; SSE2-NEXT: paddd %xmm6, %xmm3
-; SSE2-NEXT: paddd %xmm5, %xmm1
 ; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSE2-NEXT: psrad $16, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm4
+; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm5
+; SSE2-NEXT: pmullw %xmm4, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE2-NEXT: psrad $16, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSE2-NEXT: psrad $16, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm2
 ; SSE2-NEXT: addq $16, %rsi
 ; SSE2-NEXT: addq $16, %rdi
 ; SSE2-NEXT: addq $-16, %rax
diff --git a/test/CodeGen/X86/misched-matrix.ll b/test/CodeGen/X86/misched-matrix.ll
index e62a1d04dad6..94bbe75702cb 100644
--- a/test/CodeGen/X86/misched-matrix.ll
+++ b/test/CodeGen/X86/misched-matrix.ll
@@ -17,9 +17,9 @@
 ;
 ; TOPDOWN-LABEL: %for.body
 ; TOPDOWN: movl %{{.*}}, (
-; TOPDOWN: imull {{[0-9]*}}(
+; TOPDOWN-NOT: imull {{[0-9]*}}(
 ; TOPDOWN: movl %{{.*}}, 4(
-; TOPDOWN: imull {{[0-9]*}}(
+; TOPDOWN-NOT: imull {{[0-9]*}}(
 ; TOPDOWN: movl %{{.*}}, 8(
 ; TOPDOWN: movl %{{.*}}, 12(
 ; TOPDOWN-LABEL: %for.end
diff --git a/test/CodeGen/X86/mul-constant-i16.ll b/test/CodeGen/X86/mul-constant-i16.ll
index 6d2465ddd3a8..e3e2737cf3e6 100644
--- a/test/CodeGen/X86/mul-constant-i16.ll
+++ b/test/CodeGen/X86/mul-constant-i16.ll
@@ -188,16 +188,13 @@ define i16 @test_mul_by_11(i16 %x) {
 ; X86-LABEL: test_mul_by_11:
 ; X86: # BB#0:
 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,4), %ecx
-; X86-NEXT: leal (%eax,%ecx,2), %eax
+; X86-NEXT: imull $11, %eax, %eax
 ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: test_mul_by_11:
 ; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,4), %eax
-; X64-NEXT: leal (%rdi,%rax,2), %eax
+; X64-NEXT: imull $11, %edi, %eax
 ; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT: retq
 %mul = mul nsw i16 %x, 11
@@ -228,16 +225,13 @@ define i16 @test_mul_by_13(i16 %x) {
 ; X86-LABEL: test_mul_by_13:
 ; X86: # BB#0:
 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,2), %ecx
-; X86-NEXT: leal (%eax,%ecx,4), %eax
+; X86-NEXT: imull $13, %eax, %eax
 ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: test_mul_by_13:
 ; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,2), %eax
-; X64-NEXT: leal (%rdi,%rax,4), %eax
+; X64-NEXT: imull $13, %edi, %eax
 ; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT: retq
 %mul = mul nsw i16 %x, 13
@@ -247,19 +241,14 @@ define i16 @test_mul_by_13(i16 %x) {
 define i16 @test_mul_by_14(i16 %x) {
 ; X86-LABEL: test_mul_by_14:
 ; X86: # BB#0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: leal (%ecx,%ecx,2), %eax
-; X86-NEXT: leal (%ecx,%eax,4), %eax
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull $14, %eax, %eax
 ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: test_mul_by_14:
 ; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,2), %eax
-; X64-NEXT: leal (%rdi,%rax,4), %eax
-; X64-NEXT: addl %edi, %eax
+; X64-NEXT: imull $14, %edi, %eax
 ; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT: retq
 %mul = mul nsw i16 %x, 14
@@ -349,19 +338,14 @@ define i16 @test_mul_by_19(i16 %x) {
 ; X86-LABEL: test_mul_by_19:
 ; X86: # BB#0:
 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,4), %ecx
-; X86-NEXT: shll $2, %ecx
-; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: imull $19, %eax, %eax
 ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: test_mul_by_19:
 ; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,4), %eax
-; X64-NEXT: shll $2, %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: imull $19, %edi, %eax
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT: retq
 %mul = mul nsw i16 %x, 19
 ret i16 %mul
@@ -391,16 +375,13 @@ define i16 @test_mul_by_21(i16 %x) {
 ; X86-LABEL: test_mul_by_21:
 ; X86: # BB#0:
 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,4), %ecx
-; X86-NEXT: leal (%eax,%ecx,4), %eax
+; X86-NEXT: imull $21, %eax, %eax
 ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: test_mul_by_21:
 ; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,4), %eax
-; X64-NEXT: leal (%rdi,%rax,4), %eax
+; X64-NEXT: imull $21, %edi, %eax
 ; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT: retq
 %mul = mul nsw i16 %x, 21
@@ -410,19 +391,14 @@ define i16 @test_mul_by_21(i16 %x) {
 define i16 @test_mul_by_22(i16 %x) {
 ; X86-LABEL: test_mul_by_22:
 ; X86: # BB#0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: leal (%ecx,%ecx,4), %eax
-; X86-NEXT: leal (%ecx,%eax,4), %eax
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull $22, %eax, %eax
 ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: test_mul_by_22:
 ; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,4), %eax
-; X64-NEXT: leal (%rdi,%rax,4), %eax
-; X64-NEXT: addl %edi, %eax
+; X64-NEXT: imull $22, %edi, %eax
 ; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT: retq
 %mul = mul nsw i16 %x, 22
@@ -433,19 +409,14 @@ define i16 @test_mul_by_23(i16 %x) {
 ; X86-LABEL: test_mul_by_23:
 ; X86: # BB#0:
 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,2), %ecx
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: imull $23, %eax, %eax
 ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: test_mul_by_23:
 ; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,2), %eax
-; X64-NEXT: shll $3, %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: imull $23, %edi, %eax
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT: retq
 %mul = mul nsw i16 %x, 23
 ret i16 %mul
@@ -495,19 +466,14 @@ define i16 @test_mul_by_26(i16 %x) {
 ; X86-LABEL: test_mul_by_26:
 ; X86: # BB#0:
 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,8), %ecx
-; X86-NEXT: leal (%ecx,%ecx,2), %ecx
-; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: imull $26, %eax, %eax
 ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: test_mul_by_26:
 ; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,8), %eax
-; X64-NEXT: leal (%rax,%rax,2), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: imull $26, %edi, %eax
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT: retq
 %mul = mul nsw i16 %x, 26
 ret i16 %mul
@@ -536,19 +502,14 @@ define i16 @test_mul_by_27(i16 %x) {
 define i16 @test_mul_by_28(i16 %x) {
 ; X86-LABEL: test_mul_by_28:
 ; X86: # BB#0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: leal (%ecx,%ecx,8), %eax
-; X86-NEXT: leal (%eax,%eax,2), %eax
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull $28, %eax, %eax
 ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: test_mul_by_28:
 ; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,8), %eax
-; X64-NEXT: leal (%rax,%rax,2), %eax
-; X64-NEXT: addl %edi, %eax
+; X64-NEXT: imull $28, %edi, %eax
 ; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT: retq
 %mul = mul nsw i16 %x, 28
@@ -558,21 +519,14 @@ define i16 @test_mul_by_28(i16 %x) {
 define i16 @test_mul_by_29(i16 %x) {
 ; X86-LABEL: test_mul_by_29:
 ; X86: # BB#0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: leal (%ecx,%ecx,8), %eax
-; X86-NEXT: leal (%eax,%eax,2), %eax
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull $29, %eax, %eax
 ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: test_mul_by_29:
 ; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal (%rdi,%rdi,8), %eax
-; X64-NEXT: leal (%rax,%rax,2), %eax
-; X64-NEXT: addl %edi, %eax
-; X64-NEXT: addl %edi, %eax
+; X64-NEXT: imull $29, %edi, %eax
 ; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT: retq
 %mul = mul nsw i16 %x, 29
@@ -583,22 +537,14 @@ define i16 @test_mul_by_30(i16 %x) {
 ; X86-LABEL: test_mul_by_30:
 ; X86: # BB#0:
 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: shll $5, %ecx
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: subl %ecx, %edx
-; X86-NEXT: subl %edx, %eax
+; X86-NEXT: imull $30, %eax, %eax
 ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: test_mul_by_30:
 ; X64: # BB#0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shll $5, %eax
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: subl %eax, %ecx
-; X64-NEXT: subl %ecx, %edi
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: imull $30, %edi, %eax
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT: retq
 %mul = mul nsw i16 %x, 30
 ret i16 %mul
@@ -641,30 +587,3 @@ define i16 @test_mul_by_32(i16 %x) {
 %mul = mul nsw i16 %x, 32
 ret i16 %mul
 }
-
-; (x*9+42)*(x*5+2)
-define i16 @test_mul_spec(i16 %x) nounwind {
-; X86-LABEL: test_mul_spec:
-; X86: # BB#0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal 42(%eax,%eax,8), %ecx
-; X86-NEXT: leal 2(%eax,%eax,4), %eax
-; X86-NEXT: imull %ecx, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; X86-NEXT: retl
-;
-; X64-LABEL: test_mul_spec:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: leal 42(%rdi,%rdi,8), %ecx
-; X64-NEXT: leal 2(%rdi,%rdi,4), %eax
-; X64-NEXT: imull %ecx, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; X64-NEXT: retq
- %mul = mul nsw i16 %x, 9
- %add = add nsw i16 %mul, 42
- %mul2 = mul nsw i16 %x, 5
- %add2 = add nsw i16 %mul2, 2
- %mul3 = mul nsw i16 %add, %add2
- ret i16 %mul3
-}
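The mul-constant hunks above and below replace LEA-based expansions with a single imull. The deleted sequences build small multiples out of LEA's base + index*scale addressing form; for 11, for example, the two LEAs compute 5*x and then x + 2*(5*x) = 11*x. The same decomposition written as shifts and adds, as a hypothetical llc-compilable sketch (function and value names invented):

define i32 @mul_by_11_sketch(i32 %x) {
entry:
  %x4 = shl i32 %x, 2       ; 4*x
  %x5 = add i32 %x4, %x     ; 5*x, what leal (%eax,%eax,4), %ecx computed
  %x10 = shl i32 %x5, 1     ; 10*x
  %x11 = add i32 %x10, %x   ; 11*x, what leal (%eax,%ecx,2), %eax computed
  ret i32 %x11
}

The sched: [latency:throughput] annotations removed in the i32 file below record the trade-off involved: on the modeled CPUs an LEA is roughly [1:0.50] while an imull is [3:1.00] to [4:1.00], but the imull form needs only one instruction and no scratch register.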
diff --git a/test/CodeGen/X86/mul-constant-i32.ll b/test/CodeGen/X86/mul-constant-i32.ll
index b1e9a929b7f2..76e46e1f1b09 100644
--- a/test/CodeGen/X86/mul-constant-i32.ll
+++ b/test/CodeGen/X86/mul-constant-i32.ll
@@ -1,12 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW
-; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=X64-JAG
-; RUN: llc < %s -mtriple=i686-unknown -mul-constant-optimization=false | FileCheck %s --check-prefix=X86-NOOPT
-; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=HSW-NOOPT
-; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=JAG-NOOPT
-; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=X64-SLM
-; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=SLM-NOOPT
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64

 define i32 @test_mul_by_1(i32 %x) {
 ; X86-LABEL: test_mul_by_1:
@@ -14,40 +8,10 @@ define i32 @test_mul_by_1(i32 %x) {
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: retl
 ;
-; X64-HSW-LABEL: test_mul_by_1:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_1:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_1:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_1:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_1:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.17]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_1:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_1:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_1:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
 %mul = mul nsw i32 %x, 1
 ret i32 %mul
 }
@@ -59,47 +23,11 @@ define i32 @test_mul_by_2(i32 %x) {
 ; X86-NEXT: addl %eax, %eax
 ; X86-NEXT: retl
 ;
-; X64-HSW-LABEL: test_mul_by_2:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_2:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_2:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: addl %eax, %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_2:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; HSW-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_2:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; JAG-NOOPT-NEXT:
leal (%rdi,%rdi), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_2: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (%rdi,%rdi), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_2: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SLM-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_2: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (%rdi,%rdi), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 2 ret i32 %mul } @@ -110,46 +38,11 @@ define i32 @test_mul_by_3(i32 %x) { ; X86-NEXT: imull $3, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_3: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_3: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_3: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $3, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_3: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; HSW-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_3: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; JAG-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_3: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_3: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SLM-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_3: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (%rdi,%rdi,2), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 3 ret i32 %mul } @@ -161,47 +54,11 @@ define i32 @test_mul_by_4(i32 %x) { ; X86-NEXT: shll $2, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_4: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_4: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_4: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: shll $2, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_4: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; HSW-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_4: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; JAG-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # 
sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_4: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_4: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SLM-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_4: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (,%rdi,4), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 4 ret i32 %mul } @@ -212,46 +69,11 @@ define i32 @test_mul_by_5(i32 %x) { ; X86-NEXT: imull $5, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_5: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_5: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_5: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $5, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_5: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_5: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; JAG-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_5: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_5: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SLM-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_5: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 5 ret i32 %mul } @@ -264,46 +86,12 @@ define i32 @test_mul_by_6(i32 %x) { ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_6: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25] -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_6: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_6: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $6, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_6: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $6, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_6: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_6: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: 
%EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50] -; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_6: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_6: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: addl %edi, %edi +; X64-NEXT: leal (%rdi,%rdi,2), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 6 ret i32 %mul } @@ -316,46 +104,12 @@ define i32 @test_mul_by_7(i32 %x) { ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_7: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_7: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_7: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $7, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_7: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $7, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_7: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_7: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00] -; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_7: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_7: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (,%rdi,8), %eax +; X64-NEXT: subl %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 7 ret i32 %mul } @@ -367,47 +121,11 @@ define i32 @test_mul_by_8(i32 %x) { ; X86-NEXT: shll $3, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_8: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_8: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_8: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: shll $3, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_8: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; HSW-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_8: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; JAG-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_8: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (,%rdi,8), %eax # sched: 
[1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_8: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SLM-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_8: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (,%rdi,8), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 8 ret i32 %mul } @@ -418,46 +136,11 @@ define i32 @test_mul_by_9(i32 %x) { ; X86-NEXT: imull $9, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_9: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_9: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_9: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $9, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_9: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; HSW-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_9: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; JAG-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_9: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_9: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SLM-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_9: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (%rdi,%rdi,8), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 9 ret i32 %mul } @@ -470,46 +153,12 @@ define i32 @test_mul_by_10(i32 %x) { ; X86-NEXT: leal (%eax,%eax,4), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_10: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25] -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_10: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_10: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $10, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_10: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $10, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_10: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_10: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50] -; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq 
# sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_10: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_10: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: addl %edi, %edi +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 10 ret i32 %mul } @@ -517,49 +166,13 @@ define i32 @test_mul_by_10(i32 %x) { define i32 @test_mul_by_11(i32 %x) { ; X86-LABEL: test_mul_by_11: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: imull $11, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_11: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_11: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_11: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $11, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_11: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $11, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_11: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $11, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_11: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $11, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_11: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $11, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_11: +; X64: # BB#0: +; X64-NEXT: imull $11, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 11 ret i32 %mul } @@ -572,46 +185,12 @@ define i32 @test_mul_by_12(i32 %x) { ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_12: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_12: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: shll $2, %edi # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_12: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $12, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_12: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $12, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_12: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_12: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: shll $2, %edi # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; 
SLM-NOOPT-LABEL: test_mul_by_12: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_12: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: shll $2, %edi +; X64-NEXT: leal (%rdi,%rdi,2), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 12 ret i32 %mul } @@ -619,49 +198,13 @@ define i32 @test_mul_by_12(i32 %x) { define i32 @test_mul_by_13(i32 %x) { ; X86-LABEL: test_mul_by_13: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: imull $13, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_13: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_13: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_13: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $13, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_13: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $13, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_13: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_13: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $13, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_13: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_13: +; X64: # BB#0: +; X64-NEXT: imull $13, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 13 ret i32 %mul } @@ -669,52 +212,13 @@ define i32 @test_mul_by_13(i32 %x) { define i32 @test_mul_by_14(i32 %x) { ; X86-LABEL: test_mul_by_14: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,2), %eax -; X86-NEXT: leal (%ecx,%eax,4), %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: imull $14, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_14: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_14: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_14: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $14, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_14: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $14, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_14: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $14, 
%edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_14: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $14, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_14: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $14, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_14: +; X64: # BB#0: +; X64-NEXT: imull $14, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 14 ret i32 %mul } @@ -727,46 +231,12 @@ define i32 @test_mul_by_15(i32 %x) { ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_15: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_15: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_15: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $15, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_15: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $15, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_15: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_15: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rax,%rax,2), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_15: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_15: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: leal (%rax,%rax,2), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 15 ret i32 %mul } @@ -778,47 +248,11 @@ define i32 @test_mul_by_16(i32 %x) { ; X86-NEXT: shll $4, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_16: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shll $4, %edi # sched: [1:0.50] -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_16: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shll $4, %edi # sched: [1:0.50] -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_16: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: shll $4, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_16: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: shll $4, %edi # sched: [1:0.50] -; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_16: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: shll $4, %edi # sched: [1:0.50] -; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.17] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_16: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shll $4, %edi # sched: [1:1.00] -; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] -; 
X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_16: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: shll $4, %edi # sched: [1:1.00] -; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_16: +; X64: # BB#0: +; X64-NEXT: shll $4, %edi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 16 ret i32 %mul } @@ -832,49 +266,13 @@ define i32 @test_mul_by_17(i32 %x) { ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_17: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: shll $4, %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_17: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: shll $4, %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_17: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $17, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_17: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $17, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_17: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_17: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] -; X64-SLM-NEXT: shll $4, %eax # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rax,%rdi), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_17: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_17: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll $4, %eax +; X64-NEXT: leal (%rax,%rdi), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 17 ret i32 %mul } @@ -887,46 +285,12 @@ define i32 @test_mul_by_18(i32 %x) { ; X86-NEXT: leal (%eax,%eax,8), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_18: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25] -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_18: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_18: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $18, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_18: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $18, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_18: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_18: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: addl 
%edi, %edi # sched: [1:0.50] -; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_18: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_18: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: addl %edi, %edi +; X64-NEXT: leal (%rdi,%rdi,8), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 18 ret i32 %mul } @@ -934,54 +298,13 @@ define i32 @test_mul_by_18(i32 %x) { define i32 @test_mul_by_19(i32 %x) { ; X86-LABEL: test_mul_by_19: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: shll $2, %ecx -; X86-NEXT: subl %ecx, %eax +; X86-NEXT: imull $19, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_19: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: shll $2, %eax # sched: [1:0.50] -; X64-HSW-NEXT: subl %eax, %edi # sched: [1:0.25] -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_19: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: shll $2, %eax # sched: [1:0.50] -; X64-JAG-NEXT: subl %eax, %edi # sched: [1:0.50] -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_19: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $19, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_19: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $19, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_19: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_19: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $19, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_19: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_19: +; X64: # BB#0: +; X64-NEXT: imull $19, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 19 ret i32 %mul } @@ -994,46 +317,12 @@ define i32 @test_mul_by_20(i32 %x) { ; X86-NEXT: leal (%eax,%eax,4), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_20: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_20: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: shll $2, %edi # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_20: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $20, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_20: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $20, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_20: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00] 
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_20: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: shll $2, %edi # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_20: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_20: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: shll $2, %edi +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 20 ret i32 %mul } @@ -1041,49 +330,13 @@ define i32 @test_mul_by_20(i32 %x) { define i32 @test_mul_by_21(i32 %x) { ; X86-LABEL: test_mul_by_21: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: imull $21, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_21: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_21: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_21: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $21, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_21: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $21, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_21: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_21: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $21, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_21: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_21: +; X64: # BB#0: +; X64-NEXT: imull $21, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 21 ret i32 %mul } @@ -1091,52 +344,13 @@ define i32 @test_mul_by_21(i32 %x) { define i32 @test_mul_by_22(i32 %x) { ; X86-LABEL: test_mul_by_22: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,4), %eax -; X86-NEXT: leal (%ecx,%eax,4), %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: imull $22, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_22: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_22: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_22: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: 
imull $22, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_22: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $22, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_22: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_22: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $22, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_22: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_22: +; X64: # BB#0: +; X64-NEXT: imull $22, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 22 ret i32 %mul } @@ -1144,54 +358,13 @@ define i32 @test_mul_by_22(i32 %x) { define i32 @test_mul_by_23(i32 %x) { ; X86-LABEL: test_mul_by_23: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: shll $3, %ecx -; X86-NEXT: subl %ecx, %eax +; X86-NEXT: imull $23, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_23: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: shll $3, %eax # sched: [1:0.50] -; X64-HSW-NEXT: subl %eax, %edi # sched: [1:0.25] -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_23: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: shll $3, %eax # sched: [1:0.50] -; X64-JAG-NEXT: subl %eax, %edi # sched: [1:0.50] -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_23: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $23, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_23: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $23, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_23: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_23: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $23, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_23: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_23: +; X64: # BB#0: +; X64-NEXT: imull $23, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 23 ret i32 %mul } @@ -1204,46 +377,12 @@ define i32 @test_mul_by_24(i32 %x) { ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_24: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: shll $3, %edi # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_24: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: shll $3, %edi # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_24: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $24, 
{{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_24: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $24, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_24: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_24: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: shll $3, %edi # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_24: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_24: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: shll $3, %edi +; X64-NEXT: leal (%rdi,%rdi,2), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 24 ret i32 %mul } @@ -1256,46 +395,12 @@ define i32 @test_mul_by_25(i32 %x) { ; X86-NEXT: leal (%eax,%eax,4), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_25: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rax,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_25: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_25: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $25, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_25: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $25, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_25: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $25, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_25: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rax,%rax,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_25: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $25, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_25: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: leal (%rax,%rax,4), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 25 ret i32 %mul } @@ -1303,54 +408,13 @@ define i32 @test_mul_by_25(i32 %x) { define i32 @test_mul_by_26(i32 %x) { ; X86-LABEL: test_mul_by_26: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,8), %ecx -; X86-NEXT: leal (%ecx,%ecx,2), %ecx -; X86-NEXT: subl %ecx, %eax +; X86-NEXT: imull $26, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_26: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: subl %eax, %edi # sched: [1:0.25] -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: 
test_mul_by_26: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: subl %eax, %edi # sched: [1:0.50] -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_26: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $26, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_26: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $26, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_26: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_26: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $26, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_26: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_26: +; X64: # BB#0: +; X64-NEXT: imull $26, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 26 ret i32 %mul } @@ -1363,46 +427,12 @@ define i32 @test_mul_by_27(i32 %x) { ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_27: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_27: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_27: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $27, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_27: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $27, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_27: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_27: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rax,%rax,2), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_27: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_27: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (%rdi,%rdi,8), %eax +; X64-NEXT: leal (%rax,%rax,2), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 27 ret i32 %mul } @@ -1410,52 +440,13 @@ define i32 @test_mul_by_27(i32 %x) { define i32 @test_mul_by_28(i32 %x) { ; X86-LABEL: test_mul_by_28: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: imull $28, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_28: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal 
(%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_28: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_28: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $28, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_28: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $28, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_28: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_28: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $28, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_28: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_28: +; X64: # BB#0: +; X64-NEXT: imull $28, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 28 ret i32 %mul } @@ -1463,55 +454,13 @@ define i32 @test_mul_by_28(i32 %x) { define i32 @test_mul_by_29(i32 %x) { ; X86-LABEL: test_mul_by_29: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %eax -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: imull $29, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_29: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_29: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_29: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $29, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_29: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $29, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_29: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_29: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $29, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_29: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_29: +; X64: # BB#0: +; X64-NEXT: imull $29, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 29 ret i32 %mul } @@ -1519,58 +468,13 @@ define i32 @test_mul_by_29(i32 %x) { define i32 
@test_mul_by_30(i32 %x) { ; X86-LABEL: test_mul_by_30: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll $5, %ecx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: subl %edx, %eax +; X86-NEXT: imull $30, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_30: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50] -; X64-HSW-NEXT: movl %edi, %ecx # sched: [1:0.25] -; X64-HSW-NEXT: subl %eax, %ecx # sched: [1:0.25] -; X64-HSW-NEXT: subl %ecx, %edi # sched: [1:0.25] -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_30: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: movl %edi, %ecx # sched: [1:0.17] -; X64-JAG-NEXT: shll $5, %eax # sched: [1:0.50] -; X64-JAG-NEXT: subl %eax, %ecx # sched: [1:0.50] -; X64-JAG-NEXT: subl %ecx, %edi # sched: [1:0.50] -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_30: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $30, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_30: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $30, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_30: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_30: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $30, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_30: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_30: +; X64: # BB#0: +; X64-NEXT: imull $30, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 30 ret i32 %mul } @@ -1584,46 +488,12 @@ define i32 @test_mul_by_31(i32 %x) { ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_31: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50] -; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_31: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: shll $5, %eax # sched: [1:0.50] -; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_31: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $31, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_31: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $31, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_31: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_31: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] -; X64-SLM-NEXT: shll $5, %eax # sched: [1:1.00] -; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_31: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_31: +; X64: # BB#0: 
+; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll $5, %eax +; X64-NEXT: subl %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 31 ret i32 %mul } @@ -1635,124 +505,11 @@ define i32 @test_mul_by_32(i32 %x) { ; X86-NEXT: shll $5, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_32: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shll $5, %edi # sched: [1:0.50] -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_32: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shll $5, %edi # sched: [1:0.50] -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_32: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: shll $5, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_32: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: shll $5, %edi # sched: [1:0.50] -; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_32: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: shll $5, %edi # sched: [1:0.50] -; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.17] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_32: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shll $5, %edi # sched: [1:1.00] -; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_32: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: shll $5, %edi # sched: [1:1.00] -; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_32: +; X64: # BB#0: +; X64-NEXT: shll $5, %edi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 32 ret i32 %mul } - -; (x*9+42)*(x*5+2) -define i32 @test_mul_spec(i32 %x) nounwind { -; X86-LABEL: test_mul_spec: -; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal 42(%eax,%eax,8), %ecx -; X86-NEXT: leal 2(%eax,%eax,4), %eax -; X86-NEXT: imull %ecx, %eax -; X86-NEXT: retl -; -; X64-HSW-LABEL: test_mul_spec: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %ecx # sched: [1:0.50] -; X64-HSW-NEXT: addl $42, %ecx # sched: [1:0.25] -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: addl $2, %eax # sched: [1:0.25] -; X64-HSW-NEXT: imull %ecx, %eax # sched: [4:1.00] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_spec: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:0.50] -; X64-JAG-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: imull %ecx, %eax # sched: [3:1.00] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_spec: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: leal 42(%eax,%eax,8), %ecx -; X86-NOOPT-NEXT: leal 2(%eax,%eax,4), %eax -; X86-NOOPT-NEXT: imull %ecx, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_spec: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; HSW-NOOPT-NEXT: leal (%rdi,%rdi,8), %ecx # sched: [1:0.50] -; HSW-NOOPT-NEXT: addl $42, %ecx # sched: [1:0.25] -; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: addl $2, %eax # sched: [1:0.25] -; HSW-NOOPT-NEXT: imull %ecx, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_spec: 
-; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; JAG-NOOPT-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:0.50] -; JAG-NOOPT-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: imull %ecx, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_spec: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:1.00] -; X64-SLM-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: imull %ecx, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_spec: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SLM-NOOPT-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:1.00] -; SLM-NOOPT-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: imull %ecx, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] - %mul = mul nsw i32 %x, 9 - %add = add nsw i32 %mul, 42 - %mul2 = mul nsw i32 %x, 5 - %add2 = add nsw i32 %mul2, 2 - %mul3 = mul nsw i32 %add, %add2 - ret i32 %mul3 -} diff --git a/test/CodeGen/X86/mul-constant-i64.ll b/test/CodeGen/X86/mul-constant-i64.ll index 22eb0bdc6c3f..8579179a8231 100644 --- a/test/CodeGen/X86/mul-constant-i64.ll +++ b/test/CodeGen/X86/mul-constant-i64.ll @@ -1,55 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW -; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=X64-JAG -; RUN: llc < %s -mtriple=i686-unknown -mul-constant-optimization=false | FileCheck %s --check-prefix=X86-NOOPT -; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=HSW-NOOPT -; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=JAG-NOOPT -; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=X64-SLM -; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=SLM-NOOPT +; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64 -define i64 @test_mul_by_1(i64 %x) nounwind { +define i64 @test_mul_by_1(i64 %x) { ; X86-LABEL: test_mul_by_1: ; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_1: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_1: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_1: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_1: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_1: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.17] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_1: -; X64-SLM: # BB#0: -; 
X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_1: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_1: +; X64: # BB#0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 1 ret i64 %mul } @@ -63,43 +26,10 @@ define i64 @test_mul_by_2(i64 %x) { ; X86-NEXT: addl %eax, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_2: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_2: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_2: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOOPT-NEXT: shldl $1, %eax, %edx -; X86-NOOPT-NEXT: addl %eax, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_2: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_2: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_2: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_2: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_2: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 2 ret i64 %mul } @@ -113,43 +43,10 @@ define i64 @test_mul_by_3(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_3: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_3: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_3: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $3, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $3, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_3: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_3: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_3: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_3: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_3: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi,2), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 3 ret i64 %mul } @@ -163,43 +60,10 @@ define i64 @test_mul_by_4(i64 %x) { ; X86-NEXT: shll $2, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_4: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_4: -; 
X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_4: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOOPT-NEXT: shldl $2, %eax, %edx -; X86-NOOPT-NEXT: shll $2, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_4: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_4: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_4: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_4: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_4: +; X64: # BB#0: +; X64-NEXT: leaq (,%rdi,4), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 4 ret i64 %mul } @@ -213,43 +77,10 @@ define i64 @test_mul_by_5(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_5: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_5: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_5: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $5, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $5, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_5: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_5: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_5: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_5: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_5: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi,4), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 5 ret i64 %mul } @@ -264,46 +95,11 @@ define i64 @test_mul_by_6(i64 %x) { ; X86-NEXT: leal (%edx,%ecx,2), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_6: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_6: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_6: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $6, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $6, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_6: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_6: -; 
JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_6: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50] -; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_6: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_6: +; X64: # BB#0: +; X64-NEXT: addq %rdi, %rdi +; X64-NEXT: leaq (%rdi,%rdi,2), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 6 ret i64 %mul } @@ -319,46 +115,11 @@ define i64 @test_mul_by_7(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_7: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_7: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_7: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $7, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $7, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_7: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_7: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_7: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (,%rdi,8), %rax # sched: [1:1.00] -; X64-SLM-NEXT: subq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_7: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_7: +; X64: # BB#0: +; X64-NEXT: leaq (,%rdi,8), %rax +; X64-NEXT: subq %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 7 ret i64 %mul } @@ -372,43 +133,10 @@ define i64 @test_mul_by_8(i64 %x) { ; X86-NEXT: shll $3, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_8: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_8: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_8: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOOPT-NEXT: shldl $3, %eax, %edx -; X86-NOOPT-NEXT: shll $3, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_8: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_8: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_8: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (,%rdi,8), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_8: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; 
X64-LABEL: test_mul_by_8: +; X64: # BB#0: +; X64-NEXT: leaq (,%rdi,8), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 8 ret i64 %mul } @@ -422,43 +150,10 @@ define i64 @test_mul_by_9(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_9: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_9: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_9: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $9, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $9, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_9: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_9: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_9: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_9: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_9: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi,8), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 9 ret i64 %mul } @@ -473,46 +168,11 @@ define i64 @test_mul_by_10(i64 %x) { ; X86-NEXT: leal (%edx,%ecx,2), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_10: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_10: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_10: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $10, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $10, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_10: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_10: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_10: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50] -; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_10: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_10: +; X64: # BB#0: +; X64-NEXT: addq %rdi, %rdi +; X64-NEXT: leaq (%rdi,%rdi,4), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 10 ret i64 %mul } @@ -520,53 +180,16 @@ define i64 @test_mul_by_10(i64 %x) { define i64 @test_mul_by_11(i64 %x) { ; X86-LABEL: test_mul_by_11: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,2), %ecx ; X86-NEXT: movl $11, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: 
imull $11, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_11: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_11: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rax,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_11: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $11, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $11, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_11: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_11: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_11: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_11: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_11: +; X64: # BB#0: +; X64-NEXT: imulq $11, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 11 ret i64 %mul } @@ -581,46 +204,11 @@ define i64 @test_mul_by_12(i64 %x) { ; X86-NEXT: leal (%edx,%ecx,4), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_12: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_12: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shlq $2, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_12: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $12, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $12, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_12: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_12: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_12: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shlq $2, %rdi # sched: [1:1.00] -; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_12: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_12: +; X64: # BB#0: +; X64-NEXT: shlq $2, %rdi +; X64-NEXT: leaq (%rdi,%rdi,2), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 12 ret i64 %mul } @@ -628,53 +216,16 @@ define i64 @test_mul_by_12(i64 %x) { define i64 @test_mul_by_13(i64 %x) { ; X86-LABEL: test_mul_by_13: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %ecx ; X86-NEXT: movl $13, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $13, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: 
addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_13: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_13: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_13: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $13, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $13, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_13: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_13: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_13: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_13: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_13: +; X64: # BB#0: +; X64-NEXT: imulq $13, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 13 ret i64 %mul } @@ -682,56 +233,16 @@ define i64 @test_mul_by_13(i64 %x) { define i64 @test_mul_by_14(i64 %x) { ; X86-LABEL: test_mul_by_14: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %ecx -; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl $14, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $14, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_14: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_14: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_14: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $14, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $14, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_14: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_14: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_14: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_14: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_14: +; X64: # BB#0: +; X64-NEXT: imulq $14, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 14 ret i64 %mul } @@ -747,46 +258,11 @@ define i64 @test_mul_by_15(i64 %x) { 
; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_15: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_15: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_15: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $15, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $15, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_15: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_15: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_15: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_15: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_15: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi,4), %rax +; X64-NEXT: leaq (%rax,%rax,2), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 15 ret i64 %mul } @@ -800,49 +276,11 @@ define i64 @test_mul_by_16(i64 %x) { ; X86-NEXT: shll $4, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_16: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shlq $4, %rdi # sched: [1:0.50] -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_16: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shlq $4, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_16: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOOPT-NEXT: shldl $4, %eax, %edx -; X86-NOOPT-NEXT: shll $4, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_16: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: shlq $4, %rdi # sched: [1:0.50] -; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_16: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: shlq $4, %rdi # sched: [1:0.50] -; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.17] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_16: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shlq $4, %rdi # sched: [1:1.00] -; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_16: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: shlq $4, %rdi # sched: [1:1.00] -; SLM-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_16: +; X64: # BB#0: +; X64-NEXT: shlq $4, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 16 ret i64 %mul } @@ -859,49 +297,12 @@ define i64 @test_mul_by_17(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_17: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; 
X64-HSW-NEXT: shlq $4, %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_17: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: shlq $4, %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_17: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $17, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $17, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_17: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_17: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_17: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: shlq $4, %rax # sched: [1:1.00] -; X64-SLM-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_17: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_17: +; X64: # BB#0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shlq $4, %rax +; X64-NEXT: leaq (%rax,%rdi), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 17 ret i64 %mul } @@ -916,46 +317,11 @@ define i64 @test_mul_by_18(i64 %x) { ; X86-NEXT: leal (%edx,%ecx,2), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_18: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_18: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_18: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $18, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $18, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_18: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_18: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_18: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50] -; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_18: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_18: +; X64: # BB#0: +; X64-NEXT: addq %rdi, %rdi +; X64-NEXT: leaq (%rdi,%rdi,8), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 18 ret i64 %mul } @@ -963,58 +329,16 @@ define i64 @test_mul_by_18(i64 %x) { define i64 @test_mul_by_19(i64 %x) { ; X86-LABEL: test_mul_by_19: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,4), %eax -; X86-NEXT: shll $2, %eax -; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl $19, %eax ; X86-NEXT: mull 
{{[0-9]+}}(%esp) +; X86-NEXT: imull $19, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_19: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: shlq $2, %rax # sched: [1:0.50] -; X64-HSW-NEXT: subq %rax, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_19: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: shlq $2, %rax # sched: [1:0.50] -; X64-JAG-NEXT: subq %rax, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_19: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $19, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $19, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_19: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_19: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_19: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_19: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_19: +; X64: # BB#0: +; X64-NEXT: imulq $19, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 19 ret i64 %mul } @@ -1029,46 +353,11 @@ define i64 @test_mul_by_20(i64 %x) { ; X86-NEXT: leal (%edx,%ecx,4), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_20: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_20: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shlq $2, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_20: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $20, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $20, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_20: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_20: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_20: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shlq $2, %rdi # sched: [1:1.00] -; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_20: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_20: +; X64: # BB#0: +; X64-NEXT: shlq $2, %rdi +; X64-NEXT: leaq (%rdi,%rdi,4), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 20 ret i64 %mul } @@ -1076,53 +365,16 @@ define i64 @test_mul_by_20(i64 %x) { define i64 @test_mul_by_21(i64 %x) { ; X86-LABEL: test_mul_by_21: ; X86: # BB#0: -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %ecx ; X86-NEXT: movl $21, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $21, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_21: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_21: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_21: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $21, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $21, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_21: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_21: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_21: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_21: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_21: +; X64: # BB#0: +; X64-NEXT: imulq $21, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 21 ret i64 %mul } @@ -1130,56 +382,16 @@ define i64 @test_mul_by_21(i64 %x) { define i64 @test_mul_by_22(i64 %x) { ; X86-LABEL: test_mul_by_22: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %ecx -; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl $22, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $22, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_22: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_22: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_22: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $22, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $22, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_22: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_22: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_22: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_22: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: 
retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_22: +; X64: # BB#0: +; X64-NEXT: imulq $22, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 22 ret i64 %mul } @@ -1187,58 +399,16 @@ define i64 @test_mul_by_22(i64 %x) { define i64 @test_mul_by_23(i64 %x) { ; X86-LABEL: test_mul_by_23: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,2), %eax -; X86-NEXT: shll $3, %eax -; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl $23, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $23, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_23: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: shlq $3, %rax # sched: [1:0.50] -; X64-HSW-NEXT: subq %rax, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_23: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: shlq $3, %rax # sched: [1:0.50] -; X64-JAG-NEXT: subq %rax, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_23: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $23, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $23, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_23: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_23: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_23: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_23: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_23: +; X64: # BB#0: +; X64-NEXT: imulq $23, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 23 ret i64 %mul } @@ -1253,46 +423,11 @@ define i64 @test_mul_by_24(i64 %x) { ; X86-NEXT: leal (%edx,%ecx,8), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_24: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shlq $3, %rdi # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_24: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shlq $3, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_24: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $24, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $24, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_24: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_24: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_24: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shlq $3, %rdi # sched: [1:1.00] -; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; 
SLM-NOOPT-LABEL: test_mul_by_24: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_24: +; X64: # BB#0: +; X64-NEXT: shlq $3, %rdi +; X64-NEXT: leaq (%rdi,%rdi,2), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 24 ret i64 %mul } @@ -1308,46 +443,11 @@ define i64 @test_mul_by_25(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_25: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_25: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_25: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $25, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $25, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_25: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_25: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_25: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_25: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_25: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi,4), %rax +; X64-NEXT: leaq (%rax,%rax,4), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 25 ret i64 %mul } @@ -1355,58 +455,16 @@ define i64 @test_mul_by_25(i64 %x) { define i64 @test_mul_by_26(i64 %x) { ; X86-LABEL: test_mul_by_26: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %eax -; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl $26, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $26, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_26: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: subq %rax, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_26: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: subq %rax, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_26: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $26, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $26, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_26: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_26: -; JAG-NOOPT: # BB#0: -; 
JAG-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_26: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_26: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_26: +; X64: # BB#0: +; X64-NEXT: imulq $26, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 26 ret i64 %mul } @@ -1422,46 +480,11 @@ define i64 @test_mul_by_27(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_27: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_27: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_27: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $27, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $27, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_27: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_27: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_27: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00] -; X64-SLM-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_27: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_27: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi,8), %rax +; X64-NEXT: leaq (%rax,%rax,2), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 27 ret i64 %mul } @@ -1469,56 +492,16 @@ define i64 @test_mul_by_27(i64 %x) { define i64 @test_mul_by_28(i64 %x) { ; X86-LABEL: test_mul_by_28: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,8), %ecx -; X86-NEXT: leal (%ecx,%ecx,2), %ecx -; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl $28, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $28, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_28: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_28: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_28: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $28, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $28, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_28: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $28, %rdi, %rax # 
sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_28: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_28: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_28: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_28: +; X64: # BB#0: +; X64-NEXT: imulq $28, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 28 ret i64 %mul } @@ -1526,59 +509,16 @@ define i64 @test_mul_by_28(i64 %x) { define i64 @test_mul_by_29(i64 %x) { ; X86-LABEL: test_mul_by_29: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,8), %ecx -; X86-NEXT: leal (%ecx,%ecx,2), %ecx -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl $29, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $29, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_29: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_29: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_29: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $29, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $29, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_29: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_29: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_29: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_29: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_29: +; X64: # BB#0: +; X64-NEXT: imulq $29, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 29 ret i64 %mul } @@ -1586,60 +526,16 @@ define i64 @test_mul_by_29(i64 %x) { define i64 @test_mul_by_30(i64 %x) { ; X86-LABEL: test_mul_by_30: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $5, %ecx ; X86-NEXT: movl $30, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $30, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_30: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: shlq $5, %rax # sched: [1:0.50] -; X64-HSW-NEXT: movq %rdi, %rcx # sched: [1:0.25] -; X64-HSW-NEXT: subq %rax, %rcx # sched: [1:0.25] -; X64-HSW-NEXT: subq %rcx, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq 
# sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_30: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: movq %rdi, %rcx # sched: [1:0.17] -; X64-JAG-NEXT: shlq $5, %rax # sched: [1:0.50] -; X64-JAG-NEXT: subq %rax, %rcx # sched: [1:0.50] -; X64-JAG-NEXT: subq %rcx, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_30: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $30, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $30, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_30: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_30: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_30: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_30: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_30: +; X64: # BB#0: +; X64-NEXT: imulq $30, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 30 ret i64 %mul } @@ -1656,49 +552,12 @@ define i64 @test_mul_by_31(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_31: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: shlq $5, %rax # sched: [1:0.50] -; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_31: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: shlq $5, %rax # sched: [1:0.50] -; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_31: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $31, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $31, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_31: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_31: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_31: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: shlq $5, %rax # sched: [1:1.00] -; X64-SLM-NEXT: subq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_31: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_31: +; X64: # BB#0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shlq $5, %rax +; X64-NEXT: subq %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 31 ret i64 %mul } @@ -1712,168 +571,11 @@ define i64 @test_mul_by_32(i64 %x) { ; X86-NEXT: shll $5, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_32: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shlq $5, %rdi # sched: [1:0.50] -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: 
test_mul_by_32: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shlq $5, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_32: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOOPT-NEXT: shldl $5, %eax, %edx -; X86-NOOPT-NEXT: shll $5, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_32: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: shlq $5, %rdi # sched: [1:0.50] -; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_32: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: shlq $5, %rdi # sched: [1:0.50] -; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.17] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_32: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shlq $5, %rdi # sched: [1:1.00] -; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_32: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: shlq $5, %rdi # sched: [1:1.00] -; SLM-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_32: +; X64: # BB#0: +; X64-NEXT: shlq $5, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 32 ret i64 %mul } - -; (x*9+42)*(x*5+2) -define i64 @test_mul_spec(i64 %x) nounwind { -; X86-LABEL: test_mul_spec: -; X86: # BB#0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl $9, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: leal (%edi,%edi,8), %ebx -; X86-NEXT: addl $42, %esi -; X86-NEXT: adcl %edx, %ebx -; X86-NEXT: movl $5, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: leal (%edi,%edi,4), %edi -; X86-NEXT: addl $2, %ecx -; X86-NEXT: adcl %edx, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: imull %esi, %edi -; X86-NEXT: addl %edi, %edx -; X86-NEXT: imull %ebx, %ecx -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: retl -; -; X64-HSW-LABEL: test_mul_spec: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rcx # sched: [1:0.50] -; X64-HSW-NEXT: addq $42, %rcx # sched: [1:0.25] -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: addq $2, %rax # sched: [1:0.25] -; X64-HSW-NEXT: imulq %rcx, %rax # sched: [3:1.00] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_spec: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:0.50] -; X64-JAG-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: imulq %rcx, %rax # sched: [3:1.00] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_spec: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: pushl %ebx -; X86-NOOPT-NEXT: pushl %edi -; X86-NOOPT-NEXT: pushl %esi -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOOPT-NEXT: movl $9, %edx -; X86-NOOPT-NEXT: movl %ecx, %eax -; X86-NOOPT-NEXT: mull %edx -; X86-NOOPT-NEXT: movl %eax, %esi -; X86-NOOPT-NEXT: leal (%edi,%edi,8), %ebx -; X86-NOOPT-NEXT: addl $42, %esi -; X86-NOOPT-NEXT: adcl %edx, %ebx -; X86-NOOPT-NEXT: movl $5, %edx -; X86-NOOPT-NEXT: movl %ecx, %eax -; X86-NOOPT-NEXT: mull 
%edx -; X86-NOOPT-NEXT: movl %eax, %ecx -; X86-NOOPT-NEXT: leal (%edi,%edi,4), %edi -; X86-NOOPT-NEXT: addl $2, %ecx -; X86-NOOPT-NEXT: adcl %edx, %edi -; X86-NOOPT-NEXT: movl %esi, %eax -; X86-NOOPT-NEXT: mull %ecx -; X86-NOOPT-NEXT: imull %esi, %edi -; X86-NOOPT-NEXT: addl %edi, %edx -; X86-NOOPT-NEXT: imull %ebx, %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: popl %esi -; X86-NOOPT-NEXT: popl %edi -; X86-NOOPT-NEXT: popl %ebx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_spec: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rcx # sched: [1:0.50] -; HSW-NOOPT-NEXT: addq $42, %rcx # sched: [1:0.25] -; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: addq $2, %rax # sched: [1:0.25] -; HSW-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_spec: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:0.50] -; JAG-NOOPT-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_spec: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:1.00] -; X64-SLM-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: imulq %rcx, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_spec: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:1.00] -; SLM-NOOPT-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] - %mul = mul nsw i64 %x, 9 - %add = add nsw i64 %mul, 42 - %mul2 = mul nsw i64 %x, 5 - %add2 = add nsw i64 %mul2, 2 - %mul3 = mul nsw i64 %add, %add2 - ret i64 %mul3 -} diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll index d26cf02dd942..0bda41a30c69 100644 --- a/test/CodeGen/X86/oddshuffles.ll +++ b/test/CodeGen/X86/oddshuffles.ll @@ -746,9 +746,9 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 ; SSE2-LABEL: interleave_24i8_in: ; SSE2: # BB#0: ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] @@ -791,17 +791,17 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 ; SSE42: # BB#0: ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE42-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] -; SSE42-NEXT: movdqa %xmm2, %xmm3 +; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8],zero,xmm2[1,9],zero,xmm2[2,10],zero,xmm2[3,11],zero,xmm2[4,12],zero,xmm2[5] +; SSE42-NEXT: movdqa %xmm1, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = 
zero,zero,xmm3[0],zero,zero,xmm3[1],zero,zero,xmm3[2],zero,zero,xmm3[3],zero,zero,xmm3[4],zero -; SSE42-NEXT: por %xmm1, %xmm3 +; SSE42-NEXT: por %xmm2, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] -; SSE42-NEXT: por %xmm0, %xmm2 -; SSE42-NEXT: movq %xmm2, 16(%rdi) +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] +; SSE42-NEXT: por %xmm0, %xmm1 +; SSE42-NEXT: movq %xmm1, 16(%rdi) ; SSE42-NEXT: movdqu %xmm3, (%rdi) ; SSE42-NEXT: retq ; @@ -809,16 +809,16 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 ; AVX: # BB#0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero -; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, 16(%rdi) -; AVX-NEXT: vmovdqu %xmm1, (%rdi) +; AVX-NEXT: vmovdqu %xmm2, (%rdi) ; AVX-NEXT: retq %s1 = load <8 x i8>, <8 x i8>* %q1, align 4 %s2 = load <8 x i8>, <8 x i8>* %q2, align 4 diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll index 88cb7a6d5825..50a661fcca11 100644 --- a/test/CodeGen/X86/pmul.ll +++ b/test/CodeGen/X86/pmul.ll @@ -1152,9 +1152,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) { ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-NEXT: pmuludq %xmm4, %xmm2 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pmuludq %xmm4, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -1166,9 +1166,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmuludq %xmm2, %xmm4 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: pmuludq %xmm3, %xmm0 -; SSE41-NEXT: pmuludq %xmm2, %xmm4 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] ; SSE41-NEXT: retq ; @@ -1312,17 +1312,17 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; 
SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm2, %xmm8 -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm2, %xmm7 ; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pmuludq %xmm7, %xmm5 +; SSE2-NEXT: pmuludq %xmm7, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; SSE2-NEXT: pmuludq %xmm0, %xmm2 -; SSE2-NEXT: pmuludq %xmm8, %xmm4 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3] +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE2-NEXT: pmuludq %xmm0, %xmm5 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE2-NEXT: pmuludq %xmm1, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3] ; SSE2-NEXT: movaps %xmm4, %xmm0 ; SSE2-NEXT: movaps %xmm5, %xmm1 @@ -1331,22 +1331,22 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; SSE41-LABEL: mul_v8i64_zero_upper: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmuludq %xmm4, %xmm1 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; SSE41-NEXT: pmuludq %xmm5, %xmm0 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pmuludq %xmm6, %xmm2 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero ; SSE41-NEXT: pmuludq %xmm7, %xmm1 -; SSE41-NEXT: pmuludq %xmm6, %xmm2 -; SSE41-NEXT: pmuludq %xmm5, %xmm0 -; SSE41-NEXT: pmuludq %xmm8, %xmm4 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] ; SSE41-NEXT: retq ; @@ -1356,11 +1356,11 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; @@ -1467,22 +1467,22 @@ define <8 x i64> 
@mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE41-LABEL: mul_v8i64_sext: ; SSE41: # BB#0: ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm8 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm6 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm7 -; SSE41-NEXT: pmovsxwq %xmm0, %xmm5 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm6 +; SSE41-NEXT: pmovsxwq %xmm0, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE41-NEXT: pmovsxdq %xmm0, %xmm3 +; SSE41-NEXT: pmuldq %xmm4, %xmm3 ; SSE41-NEXT: pmovsxdq %xmm2, %xmm2 +; SSE41-NEXT: pmuldq %xmm5, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovsxdq %xmm0, %xmm4 +; SSE41-NEXT: pmuldq %xmm6, %xmm4 ; SSE41-NEXT: pmovsxdq %xmm1, %xmm0 -; SSE41-NEXT: pmuldq %xmm5, %xmm0 -; SSE41-NEXT: pmuldq %xmm7, %xmm4 -; SSE41-NEXT: pmuldq %xmm6, %xmm2 -; SSE41-NEXT: pmuldq %xmm8, %xmm3 +; SSE41-NEXT: pmuldq %xmm7, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: retq ; @@ -1493,9 +1493,10 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 +; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 ; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vmovdqa %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: mul_v8i64_sext: diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll index 571dd6774906..c54909cf93c1 100644 --- a/test/CodeGen/X86/pr32284.ll +++ b/test/CodeGen/X86/pr32284.ll @@ -1,81 +1,17 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown -mcpu=skx | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=i686-unknown -mcpu=skx -O0 | FileCheck %s --check-prefix=X86-O0 -; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx | FileCheck %s --check-prefix=X64 -; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx -O0 | FileCheck %s --check-prefix=X64-O0 +; RUN: llc -O0 -mtriple=x86_64-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc -mtriple=x86_64-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc -O0 -mtriple=i686-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,686 +; RUN: llc -mtriple=i686-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,686 +; REQUIRES: asserts @c = external constant i8, align 1 define void @foo() { -; X86-LABEL: foo: -; X86: # BB#0: # %entry -; X86-NEXT: subl $8, %esp -; X86-NEXT: .Lcfi0: -; X86-NEXT: .cfi_def_cfa_offset 12 -; X86-NEXT: movzbl c, %eax -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %cl -; X86-NEXT: testb %al, %al -; X86-NEXT: setne {{[0-9]+}}(%esp) -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpl %eax, %ecx -; X86-NEXT: setle %dl -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: addl $8, %esp -; X86-NEXT: retl -; -; X86-O0-LABEL: foo: -; X86-O0: # BB#0: # %entry -; X86-O0-NEXT: subl $12, %esp -; X86-O0-NEXT: .Lcfi0: -; X86-O0-NEXT: .cfi_def_cfa_offset 16 -; X86-O0-NEXT: movb c, %al -; X86-O0-NEXT: testb %al, %al -; X86-O0-NEXT: setne {{[0-9]+}}(%esp) -; X86-O0-NEXT: movzbl c, %ecx -; X86-O0-NEXT: testl %ecx, %ecx -; X86-O0-NEXT: setne %al -; X86-O0-NEXT: movzbl %al, %edx -; X86-O0-NEXT: subl %ecx, %edx -; 
X86-O0-NEXT: setle %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movzbl %al, %ecx -; X86-O0-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-O0-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-O0-NEXT: addl $12, %esp -; X86-O0-NEXT: retl -; -; X64-LABEL: foo: -; X64: # BB#0: # %entry -; X64-NEXT: movzbl {{.*}}(%rip), %eax -; X64-NEXT: testb %al, %al -; X64-NEXT: setne -{{[0-9]+}}(%rsp) -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testl %eax, %eax -; X64-NEXT: setne %cl -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpl %eax, %ecx -; X64-NEXT: setle %dl -; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp) -; X64-NEXT: retq -; -; X64-O0-LABEL: foo: -; X64-O0: # BB#0: # %entry -; X64-O0-NEXT: movb {{.*}}(%rip), %al -; X64-O0-NEXT: testb %al, %al -; X64-O0-NEXT: setne -{{[0-9]+}}(%rsp) -; X64-O0-NEXT: movzbl {{.*}}(%rip), %ecx -; X64-O0-NEXT: testl %ecx, %ecx -; X64-O0-NEXT: setne %al -; X64-O0-NEXT: movzbl %al, %edx -; X64-O0-NEXT: subl %ecx, %edx -; X64-O0-NEXT: setle %al -; X64-O0-NEXT: andb $1, %al -; X64-O0-NEXT: movzbl %al, %ecx -; X64-O0-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; X64-O0-NEXT: movl %edx, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-O0-NEXT: retq +; CHECK-LABEL: foo: +; CHECK: # BB#0: # %entry +; CHECK-DAG: setne +; CHECK-DAG: setle +; CHECK: ret entry: %a = alloca i8, align 1 %b = alloca i32, align 4 @@ -100,3 +36,125 @@ entry: store i32 %conv8, i32* %b, align 4 ret void } + +@var_5 = external global i32, align 4 +@var_57 = external global i64, align 8 +@_ZN8struct_210member_2_0E = external global i64, align 8 + +define void @f1() { +; CHECK-LABEL: f1: +; CHECK: # BB#0: # %entry +; CHECK: sete +; X64: addq $7093, {{.*}} +; 686: addl $7093, {{.*}} +; CHECK: ret +entry: + %a = alloca i8, align 1 + %0 = load i32, i32* @var_5, align 4 + %conv = sext i32 %0 to i64 + %add = add nsw i64 %conv, 8381627093 + %tobool = icmp ne i64 %add, 0 + %frombool = zext i1 %tobool to i8 + store i8 %frombool, i8* %a, align 1 + %1 = load i32, i32* @var_5, align 4 + %neg = xor i32 %1, -1 + %tobool1 = icmp ne i32 %neg, 0 + %lnot = xor i1 %tobool1, true + %conv2 = zext i1 %lnot to i64 + %2 = load i32, i32* @var_5, align 4 + %conv3 = sext i32 %2 to i64 + %add4 = add nsw i64 %conv3, 7093 + %cmp = icmp sgt i64 %conv2, %add4 + %conv5 = zext i1 %cmp to i64 + store i64 %conv5, i64* @var_57, align 8 + %3 = load i32, i32* @var_5, align 4 + %neg6 = xor i32 %3, -1 + %tobool7 = icmp ne i32 %neg6, 0 + %lnot8 = xor i1 %tobool7, true + %conv9 = zext i1 %lnot8 to i64 + store i64 %conv9, i64* @_ZN8struct_210member_2_0E, align 8 + ret void +} + + +@var_7 = external global i8, align 1 + +define void @f2() { +; CHECK-LABEL: f2: +; CHECK: # BB#0: # %entry +; X64: movzbl {{.*}}(%rip), %[[R:[a-z]*]] +; 686: movzbl {{.*}}, %[[R:[a-z]*]] +; CHECK: test{{[qlwb]}} %[[R]], %[[R]] +; CHECK: sete {{.*}} +; CHECK: ret +entry: + %a = alloca i16, align 2 + %0 = load i8, i8* @var_7, align 1 + %conv = zext i8 %0 to i32 + %1 = load i8, i8* @var_7, align 1 + %tobool = icmp ne i8 %1, 0 + %lnot = xor i1 %tobool, true + %conv1 = zext i1 %lnot to i32 + %xor = xor i32 %conv, %conv1 + %conv2 = trunc i32 %xor to i16 + store i16 %conv2, i16* %a, align 2 + %2 = load i8, i8* @var_7, align 1 + %conv3 = zext i8 %2 to i16 + %tobool4 = icmp ne i16 %conv3, 0 + %lnot5 = xor i1 %tobool4, true + %conv6 = zext i1 %lnot5 to i32 + %3 = load i8, i8* @var_7, align 1 + %conv7 = zext i8 %3 to i32 + %cmp = icmp eq i32 %conv6, %conv7 + %conv8 = zext i1 %cmp to i32 + %conv9 = trunc i32 %conv8 to i16 + store i16 %conv9, i16* undef, align 2 + ret void +} + + +@var_13 = external global i32, align 4 
+@var_16 = external global i32, align 4 +@var_46 = external global i32, align 4 + +define void @f3() #0 { +; CHECK-LABEL: f3: +; X64-DAG: movl var_13(%rip), {{.*}} +; X64-DAG: movl var_16(%rip), {{.*}} +; X64-DAG: movl {{.*}},{{.*}}var_46{{.*}} +; X64: retq +; 686-DAG: movl var_13, {{.*}} +; 686-DAG: movl var_16, {{.*}} +; 686-DAG: movl {{.*}},{{.*}}var_46{{.*}} +; 686: retl +entry: + %a = alloca i64, align 8 + %0 = load i32, i32* @var_13, align 4 + %neg = xor i32 %0, -1 + %conv = zext i32 %neg to i64 + %1 = load i32, i32* @var_13, align 4 + %tobool = icmp ne i32 %1, 0 + %lnot = xor i1 %tobool, true + %conv1 = zext i1 %lnot to i64 + %2 = load i32, i32* @var_13, align 4 + %neg2 = xor i32 %2, -1 + %3 = load i32, i32* @var_16, align 4 + %xor = xor i32 %neg2, %3 + %conv3 = zext i32 %xor to i64 + %and = and i64 %conv1, %conv3 + %or = or i64 %conv, %and + store i64 %or, i64* %a, align 8 + %4 = load i32, i32* @var_13, align 4 + %neg4 = xor i32 %4, -1 + %conv5 = zext i32 %neg4 to i64 + %5 = load i32, i32* @var_13, align 4 + %tobool6 = icmp ne i32 %5, 0 + %lnot7 = xor i1 %tobool6, true + %conv8 = zext i1 %lnot7 to i64 + %and9 = and i64 %conv8, 0 + %or10 = or i64 %conv5, %and9 + %conv11 = trunc i64 %or10 to i32 + store i32 %conv11, i32* @var_46, align 4 + ret void +} + diff --git a/test/CodeGen/X86/pr32610.ll b/test/CodeGen/X86/pr32610.ll new file mode 100644 index 000000000000..1116cf6f1b29 --- /dev/null +++ b/test/CodeGen/X86/pr32610.ll @@ -0,0 +1,40 @@ +; RUN: llc -o - %s | FileCheck %s + +; CHECK-LABEL: @pr32610 +; CHECK: movl L_b$non_lazy_ptr, [[BASEREG:%[a-z]+]] +; CHECK: cmpl ([[BASEREG]]), {{%[a-z]+}} +; CHECK: cmpl ([[BASEREG]]), {{%[a-z]+}} + +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.13.0" + +@c = external local_unnamed_addr global i32, align 4 +@b = external local_unnamed_addr global [1 x i32], align 4 +@d = external local_unnamed_addr global i32, align 4 + +; Function Attrs: norecurse nounwind optsize ssp +define void @pr32610() local_unnamed_addr #0 { +entry: + %0 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i32 0, i32 undef), align 4, !tbaa !1 + %cmp = icmp eq i32 undef, %0 + %conv = zext i1 %cmp to i32 + %tobool1.i = icmp ne i32 undef, 0 + %or.cond.i = and i1 %cmp, %tobool1.i + %cond.i = select i1 %or.cond.i, i32 %conv, i32 undef + store i32 %cond.i, i32* @c, align 4, !tbaa !1 + %1 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @b, i32 0, i32 0), align 4 + %tobool = icmp ne i32 %1, 0 + %2 = select i1 %tobool, i32 %1, i32 undef + store i32 %2, i32* @d, align 4, !tbaa !1 + ret void +} + +attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 5.0.0 (trunk 301507) (llvm/trunk 301505)"} +!1 = !{!2, !2, i64 0} +!2 = !{!"int", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} diff --git a/test/CodeGen/X86/rotate.ll b/test/CodeGen/X86/rotate.ll index 5d5150ad62d6..4be3a4c2391b 100644 --- a/test/CodeGen/X86/rotate.ll +++ b/test/CodeGen/X86/rotate.ll 
@@ -33,8 +33,8 @@ define i64 @rotl64(i64 %A, i8 %Amt) nounwind { ; 32-NEXT: movl %ebx, %esi ; 32-NEXT: xorl %ebx, %ebx ; 32-NEXT: .LBB0_4: -; 32-NEXT: orl %esi, %eax ; 32-NEXT: orl %ebx, %edx +; 32-NEXT: orl %esi, %eax ; 32-NEXT: popl %esi ; 32-NEXT: popl %edi ; 32-NEXT: popl %ebx @@ -86,8 +86,8 @@ define i64 @rotr64(i64 %A, i8 %Amt) nounwind { ; 32-NEXT: movl %ebx, %esi ; 32-NEXT: xorl %ebx, %ebx ; 32-NEXT: .LBB1_4: -; 32-NEXT: orl %ebx, %eax ; 32-NEXT: orl %esi, %edx +; 32-NEXT: orl %ebx, %eax ; 32-NEXT: popl %esi ; 32-NEXT: popl %edi ; 32-NEXT: popl %ebx @@ -546,7 +546,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { ; 32-LABEL: rotr1_64_mem: ; 32: # BB#0: ; 32-NEXT: pushl %esi -; 32-NEXT: movl 8(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: movl (%eax), %ecx ; 32-NEXT: movl 4(%eax), %edx ; 32-NEXT: movl %edx, %esi @@ -555,11 +555,13 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { ; 32-NEXT: movl %ecx, 4(%eax) ; 32-NEXT: movl %esi, (%eax) ; 32-NEXT: popl %esi - +; 32-NEXT: retl +; ; 64-LABEL: rotr1_64_mem: ; 64: # BB#0: ; 64-NEXT: rorq (%rdi) ; 64-NEXT: retq + %A = load i64, i64 *%Aptr %B = shl i64 %A, 63 %C = lshr i64 %A, 1 @@ -571,7 +573,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { define void @rotr1_32_mem(i32* %Aptr) nounwind { ; 32-LABEL: rotr1_32_mem: ; 32: # BB#0: -; 32-NEXT: movl 4(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: rorl (%eax) ; 32-NEXT: retl ; @@ -590,7 +592,7 @@ define void @rotr1_32_mem(i32* %Aptr) nounwind { define void @rotr1_16_mem(i16* %Aptr) nounwind { ; 32-LABEL: rotr1_16_mem: ; 32: # BB#0: -; 32-NEXT: movl 4(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: rorw (%eax) ; 32-NEXT: retl ; @@ -609,7 +611,7 @@ define void @rotr1_16_mem(i16* %Aptr) nounwind { define void @rotr1_8_mem(i8* %Aptr) nounwind { ; 32-LABEL: rotr1_8_mem: ; 32: # BB#0: -; 32-NEXT: movl 4(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: rorb (%eax) ; 32-NEXT: retl ; diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll index b8a8b8afd14f..6a565a5c76f0 100644 --- a/test/CodeGen/X86/sad.ll +++ b/test/CodeGen/X86/sad.ll @@ -149,127 +149,131 @@ middle.block: define i32 @sad_32i8() nounwind { ; SSE2-LABEL: sad_32i8: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm12, %xmm12 -; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm14, %xmm14 +; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB1_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa a+1040(%rax), %xmm6 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa a+1040(%rax), %xmm8 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm8 -; 
SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15] -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15] +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] +; SSE2-NEXT: movdqa b+1024(%rax), %xmm11 +; SSE2-NEXT: movdqa %xmm11, %xmm10 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm10, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE2-NEXT: psubd %xmm2, %xmm7 ; SSE2-NEXT: movdqa b+1040(%rax), %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = 
xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] -; SSE2-NEXT: movdqa %xmm9, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; SSE2-NEXT: psubd %xmm9, %xmm6 -; SSE2-NEXT: movdqa b+1024(%rax), %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE2-NEXT: psubd %xmm10, %xmm7 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; SSE2-NEXT: psubd %xmm10, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; SSE2-NEXT: movdqa %xmm11, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] ; SSE2-NEXT: psubd %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; SSE2-NEXT: psubd %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE2-NEXT: psubd %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE2-NEXT: psubd %xmm2, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] -; SSE2-NEXT: psubd %xmm4, %xmm10 -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm10 -; SSE2-NEXT: pxor %xmm2, %xmm10 -; SSE2-NEXT: movdqa %xmm8, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm8 -; SSE2-NEXT: pxor %xmm2, %xmm8 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: 
movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm7 -; SSE2-NEXT: pxor %xmm2, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE2-NEXT: psubd %xmm11, %xmm3 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] ; SSE2-NEXT: movdqa %xmm6, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm2, %xmm6 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm6, %xmm14 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE2-NEXT: psubd %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm8, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; SSE2-NEXT: psubd %xmm6, %xmm0 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; SSE2-NEXT: psubd %xmm6, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; SSE2-NEXT: psubd %xmm9, %xmm8 +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm7 ; SSE2-NEXT: paddd %xmm7, %xmm13 -; SSE2-NEXT: paddd %xmm1, %xmm15 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm10, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm0, %xmm12 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm14 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm15 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm2 
+; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm8 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd %xmm8, %xmm0 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # BB#2: # %middle.block -; SSE2-NEXT: paddd %xmm15, %xmm3 -; SSE2-NEXT: paddd %xmm14, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm0 -; SSE2-NEXT: paddd %xmm13, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm15, %xmm6 +; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: paddd %xmm14, %xmm13 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm3, %xmm4 +; SSE2-NEXT: paddd %xmm13, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: paddd %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -398,288 +402,284 @@ middle.block: define i32 @sad_avx64i8() nounwind { ; SSE2-LABEL: sad_avx64i8: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: subq $184, %rsp -; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: subq $200, %rsp +; SSE2-NEXT: pxor %xmm14, %xmm14 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm14, %xmm14 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm11, %xmm11 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pxor %xmm13, %xmm13 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB2_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm13, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte 
Spill +; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm14, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm8, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa a+1040(%rax), %xmm6 -; SSE2-NEXT: movdqa a+1024(%rax), %xmm4 -; SSE2-NEXT: movdqa a+1056(%rax), %xmm11 -; SSE2-NEXT: movdqa a+1072(%rax), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm11, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3] -; SSE2-NEXT: movdqa %xmm4, %xmm12 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm12, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm6, %xmm14 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; SSE2-NEXT: movdqa b+1040(%rax), %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm13 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = 
xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm9, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: psubd %xmm9, %xmm6 -; SSE2-NEXT: movdqa b+1024(%rax), %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; SSE2-NEXT: psubd %xmm10, %xmm8 -; SSE2-NEXT: movdqa %xmm13, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; SSE2-NEXT: psubd %xmm13, %xmm14 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE2-NEXT: psubd %xmm9, %xmm7 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: psubd %xmm2, %xmm4 -; SSE2-NEXT: movdqa b+1056(%rax), %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm10, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; SSE2-NEXT: psubd %xmm10, %xmm12 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE2-NEXT: psubd %xmm2, %xmm11 -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; 
SSE2-NEXT: psubd %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm10 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: psubd %xmm2, %xmm13 -; SSE2-NEXT: movdqa b+1072(%rax), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE2-NEXT: psubd %xmm2, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: psubd %xmm9, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movaps a+1040(%rax), %xmm0 +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa a+1024(%rax), %xmm12 +; SSE2-NEXT: movdqa a+1056(%rax), %xmm15 +; SSE2-NEXT: movdqa a+1072(%rax), %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm15, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm11, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = 
xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm15, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE2-NEXT: psubd %xmm0, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: psubd %xmm2, %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: pxor %xmm0, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm12, %xmm10 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] ; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm10 -; SSE2-NEXT: pxor %xmm0, %xmm10 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm13, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm13 -; SSE2-NEXT: pxor %xmm0, %xmm13 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm11, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm11 -; SSE2-NEXT: pxor %xmm0, %xmm11 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] ; SSE2-NEXT: movdqa %xmm12, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm12 -; SSE2-NEXT: pxor %xmm0, %xmm12 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm0, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; SSE2-NEXT: movdqa b+1072(%rax), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = 
xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: psubd %xmm0, %xmm1 +; SSE2-NEXT: movdqa b+1056(%rax), %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: psubd %xmm7, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE2-NEXT: psubd %xmm7, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE2-NEXT: psubd %xmm7, %xmm8 +; SSE2-NEXT: movdqa b+1024(%rax), %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm11 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: psubd %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: psubd %xmm0, %xmm15 +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm9 +; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: psubd %xmm0, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: psubd %xmm0, %xmm13 +; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: 
movdqa %xmm9, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE2-NEXT: psubd %xmm7, %xmm12 +; SSE2-NEXT: movdqa b+1040(%rax), %xmm13 +; SSE2-NEXT: movdqa %xmm13, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: psubd %xmm7, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: psubd %xmm3, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm13, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; SSE2-NEXT: psubd %xmm13, %xmm2 ; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: paddd %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd 
%xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm11, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm11 +; SSE2-NEXT: pxor %xmm1, %xmm11 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm11, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm11 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm15, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm15 +; SSE2-NEXT: pxor %xmm1, %xmm15 +; SSE2-NEXT: paddd %xmm15, %xmm2 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm15 +; SSE2-NEXT: movdqa %xmm10, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm10 +; SSE2-NEXT: pxor %xmm1, %xmm10 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm12, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm12 +; SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm13 +; SSE2-NEXT: movdqa %xmm9, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: paddd %xmm0, %xmm9 +; SSE2-NEXT: pxor %xmm0, %xmm9 +; SSE2-NEXT: paddd %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm7 ; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: movdqa %xmm14, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm14 -; SSE2-NEXT: pxor %xmm0, %xmm14 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: movdqa %xmm6, %xmm0 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm0, %xmm6 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: paddd %xmm8, %xmm6 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload -; 
SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm14, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm12, %xmm8 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm0, %xmm12 -; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm7 +; SSE2-NEXT: pxor %xmm0, %xmm7 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: paddd %xmm13, %xmm7 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm10, %xmm1 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm3 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload -; SSE2-NEXT: paddd %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: paddd %xmm7, %xmm0 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # BB#2: # %middle.block -; SSE2-NEXT: paddd %xmm2, %xmm4 -; SSE2-NEXT: paddd %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm12, %xmm2 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm13, %xmm14 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm3 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm3, %xmm8 +; SSE2-NEXT: paddd %xmm2, %xmm15 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm13 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm5, %xmm0 +; SSE2-NEXT: paddd %xmm11, %xmm10 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: paddd %xmm13, %xmm1 +; SSE2-NEXT: paddd %xmm15, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm14, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm8, 
%xmm7 -; SSE2-NEXT: paddd %xmm6, %xmm7 -; SSE2-NEXT: paddd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,0,1] -; SSE2-NEXT: paddd %xmm7, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: addq $184, %rsp +; SSE2-NEXT: addq $200, %rsp ; SSE2-NEXT: retq ; ; AVX2-LABEL: sad_avx64i8: @@ -688,8 +688,8 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm6, %ymm6, %ymm6 ; AVX2-NEXT: vpxor %ymm5, %ymm5, %ymm5 ; AVX2-NEXT: vpxor %ymm7, %ymm7, %ymm7 @@ -697,7 +697,6 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: .LBB2_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero @@ -705,48 +704,49 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm8, %ymm15, %ymm8 +; AVX2-NEXT: vmovdqu %ymm15, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14 +; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm8 +; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13 +; AVX2-NEXT: vpsubd %ymm15, %ymm9, 
%ymm9 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12 +; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10 +; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9 -; AVX2-NEXT: vmovdqu %ymm9, -{{[0-9]+}}(%rsp) # 32-byte Spill +; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm9 # 32-byte Reload -; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm15 -; AVX2-NEXT: vpabsd %ymm8, %ymm8 +; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Reload +; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm15 +; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpabsd %ymm9, %ymm8 +; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpabsd %ymm10, %ymm8 +; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6 +; AVX2-NEXT: vpabsd %ymm11, %ymm8 ; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3 -; AVX2-NEXT: vpabsd %ymm14, %ymm8 -; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1 -; AVX2-NEXT: vpabsd %ymm13, %ymm8 -; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 ; AVX2-NEXT: vpabsd %ymm12, %ymm8 ; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpabsd %ymm11, %ymm8 -; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4 -; AVX2-NEXT: vpabsd %ymm10, %ymm8 -; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6 -; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpabsd %ymm13, %ymm8 +; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 +; AVX2-NEXT: vpabsd %ymm14, %ymm8 +; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1 ; AVX2-NEXT: vpabsd %ymm15, %ymm8 -; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4 ; AVX2-NEXT: addq $4, %rax ; AVX2-NEXT: jne .LBB2_1 ; AVX2-NEXT: # BB#2: # %middle.block ; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm7, %ymm4, %ymm4 +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; 
AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -773,21 +773,21 @@ define i32 @sad_avx64i8() nounwind { ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpsubd %zmm11, %zmm7, %zmm7 -; AVX512F-NEXT: vpsubd %zmm10, %zmm6, %zmm6 -; AVX512F-NEXT: vpsubd %zmm9, %zmm5, %zmm5 ; AVX512F-NEXT: vpsubd %zmm8, %zmm4, %zmm4 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd %zmm8, %zmm5, %zmm5 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd 
%zmm8, %zmm6, %zmm6 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd %zmm8, %zmm7, %zmm7 ; AVX512F-NEXT: vpabsd %zmm4, %zmm4 -; AVX512F-NEXT: vpabsd %zmm5, %zmm5 -; AVX512F-NEXT: vpabsd %zmm6, %zmm6 -; AVX512F-NEXT: vpabsd %zmm7, %zmm7 -; AVX512F-NEXT: vpaddd %zmm3, %zmm7, %zmm3 -; AVX512F-NEXT: vpaddd %zmm2, %zmm6, %zmm2 -; AVX512F-NEXT: vpaddd %zmm1, %zmm5, %zmm1 ; AVX512F-NEXT: vpaddd %zmm0, %zmm4, %zmm0 +; AVX512F-NEXT: vpabsd %zmm5, %zmm4 +; AVX512F-NEXT: vpaddd %zmm1, %zmm4, %zmm1 +; AVX512F-NEXT: vpabsd %zmm6, %zmm4 +; AVX512F-NEXT: vpaddd %zmm2, %zmm4, %zmm2 +; AVX512F-NEXT: vpabsd %zmm7, %zmm4 +; AVX512F-NEXT: vpaddd %zmm3, %zmm4, %zmm3 ; AVX512F-NEXT: addq $4, %rax ; AVX512F-NEXT: jne .LBB2_1 ; AVX512F-NEXT: # BB#2: # %middle.block @@ -1154,59 +1154,54 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; SSE2-LABEL: sad_nonloop_32i8: ; SSE2: # BB#0: ; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu 16(%rdi), %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm12 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm12, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm13 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3],xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm13, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqu (%rdx), %xmm5 -; SSE2-NEXT: movdqu 16(%rdx), %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = 
xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm7, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm5, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: psubd %xmm5, %xmm0 -; SSE2-NEXT: psubd %xmm7, %xmm3 -; SSE2-NEXT: psubd %xmm2, %xmm13 -; SSE2-NEXT: psubd %xmm1, %xmm12 -; SSE2-NEXT: psubd %xmm8, %xmm6 -; SSE2-NEXT: psubd %xmm15, %xmm11 -; SSE2-NEXT: psubd %xmm14, %xmm10 -; SSE2-NEXT: psubd -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: movdqu 16(%rdi), %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm12, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm12, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = 
xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqu (%rdx), %xmm7 +; SSE2-NEXT: movdqu 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm10 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm13 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE2-NEXT: psubd %xmm6, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psubd %xmm2, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: psubd %xmm3, %xmm12 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE2-NEXT: psubd %xmm7, %xmm0 ; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm10 @@ -1215,33 +1210,37 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm11 ; SSE2-NEXT: pxor %xmm1, %xmm11 -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm12, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm12 ; SSE2-NEXT: movdqa %xmm13, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm13 ; SSE2-NEXT: pxor %xmm1, %xmm13 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm13, %xmm4 +; SSE2-NEXT: paddd %xmm10, %xmm4 +; SSE2-NEXT: paddd %xmm11, %xmm4 +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: psrad 
$31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm12, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm12 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: paddd %xmm11, %xmm6 -; SSE2-NEXT: paddd %xmm9, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm6 ; SSE2-NEXT: paddd %xmm12, %xmm0 -; SSE2-NEXT: paddd %xmm6, %xmm0 -; SSE2-NEXT: paddd %xmm13, %xmm0 +; SSE2-NEXT: paddd %xmm8, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll index ce42d0d643e8..1afef86a5f11 100644 --- a/test/CodeGen/X86/select.ll +++ b/test/CodeGen/X86/select.ll @@ -299,20 +299,21 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2) ; GENERIC-NEXT: testb %dil, %dil ; GENERIC-NEXT: jne LBB7_4 ; GENERIC-NEXT: ## BB#5: +; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; GENERIC-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; GENERIC-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; GENERIC-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; GENERIC-NEXT: jmp LBB7_6 ; GENERIC-NEXT: LBB7_4: -; GENERIC-NEXT: movd %r9d, %xmm2 -; GENERIC-NEXT: movd %ecx, %xmm3 -; GENERIC-NEXT: movd %r8d, %xmm4 +; GENERIC-NEXT: movd %r9d, %xmm1 +; GENERIC-NEXT: movd %ecx, %xmm2 +; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; GENERIC-NEXT: movd %r8d, %xmm3 ; GENERIC-NEXT: movd %edx, %xmm1 ; GENERIC-NEXT: LBB7_6: -; GENERIC-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm1 ; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm0 ; GENERIC-NEXT: movq %xmm0, 16(%rsi) @@ -339,16 +340,19 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2) ; ATOM-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; ATOM-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; ATOM-NEXT: jmp LBB7_6 ; ATOM-NEXT: LBB7_4: -; ATOM-NEXT: movd %r9d, %xmm2 -; ATOM-NEXT: movd %ecx, %xmm3 -; ATOM-NEXT: movd %r8d, %xmm4 +; ATOM-NEXT: movd %r9d, %xmm1 +; ATOM-NEXT: movd %ecx, %xmm2 +; ATOM-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; ATOM-NEXT: movd %r8d, %xmm3 ; ATOM-NEXT: movd %edx, %xmm1 -; ATOM-NEXT: LBB7_6: -; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; ATOM-NEXT: LBB7_6: ; ATOM-NEXT: psubd {{.*}}(%rip), %xmm0 ; ATOM-NEXT: psubd 
{{.*}}(%rip), %xmm1 ; ATOM-NEXT: movq %xmm0, 16(%rsi) diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll index 1b8f8e7ae559..2628f824ea40 100644 --- a/test/CodeGen/X86/setcc-lowering.ll +++ b/test/CodeGen/X86/setcc-lowering.ll @@ -45,64 +45,21 @@ define void @pr26232(i64 %a, <16 x i1> %b) { ; AVX-LABEL: pr26232: ; AVX: # BB#0: # %for_loop599.preheader ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .LBB1_1: # %for_loop599 ; AVX-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: cmpq $65536, %rdi # imm = 0x10000 ; AVX-NEXT: setl %al -; AVX-NEXT: vmovd %eax, %xmm2 -; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpand %xmm0, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $15, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $14, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $13, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $12, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $11, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $10, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $9, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $8, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $7, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $6, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $5, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $4, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $3, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $2, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $1, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $0, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: cmpw $0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovd %eax, %xmm3 +; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX-NEXT: vpand %xmm0, %xmm3, %xmm3 +; AVX-NEXT: vpsllw $7, %xmm3, %xmm3 +; AVX-NEXT: vpand %xmm2, %xmm3, %xmm3 +; AVX-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm3 +; AVX-NEXT: vpmovmskb %xmm3, %eax +; AVX-NEXT: testw %ax, %ax ; AVX-NEXT: jne .LBB1_1 ; AVX-NEXT: # BB#2: # %for_exit600 ; AVX-NEXT: retq diff --git a/test/CodeGen/X86/setcc-wide-types.ll b/test/CodeGen/X86/setcc-wide-types.ll index 2996edaec3e0..332bf2887fb0 100644 --- a/test/CodeGen/X86/setcc-wide-types.ll +++ b/test/CodeGen/X86/setcc-wide-types.ll @@ -58,25 +58,25 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-LABEL: ne_i256: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r8 +; SSE2-NEXT: movq %xmm4, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r9 -; SSE2-NEXT: movq %xmm0, %r10 -; SSE2-NEXT: movq %xmm1, %rsi +; SSE2-NEXT: movq %xmm4, %rcx +; SSE2-NEXT: movq 
%xmm0, %rdx +; SSE2-NEXT: movq %xmm1, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE2-NEXT: movq %xmm0, %rdi +; SSE2-NEXT: xorq %rax, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: movq %xmm2, %rcx -; SSE2-NEXT: movq %xmm3, %rdx -; SSE2-NEXT: xorq %rsi, %rdx -; SSE2-NEXT: xorq %r10, %rcx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: xorq %r9, %rax -; SSE2-NEXT: xorq %r8, %rdi -; SSE2-NEXT: orq %rax, %rdi +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: xorq %rcx, %rsi +; SSE2-NEXT: orq %rdi, %rsi +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorq %rdx, %rax +; SSE2-NEXT: movq %xmm3, %rcx +; SSE2-NEXT: xorq %r8, %rcx +; SSE2-NEXT: orq %rax, %rcx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rcx, %rdi +; SSE2-NEXT: orq %rsi, %rcx ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -100,25 +100,25 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-LABEL: eq_i256: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r8 +; SSE2-NEXT: movq %xmm4, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r9 -; SSE2-NEXT: movq %xmm0, %r10 -; SSE2-NEXT: movq %xmm1, %rsi +; SSE2-NEXT: movq %xmm4, %rcx +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movq %xmm1, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE2-NEXT: movq %xmm0, %rdi +; SSE2-NEXT: xorq %rax, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: movq %xmm2, %rcx -; SSE2-NEXT: movq %xmm3, %rdx -; SSE2-NEXT: xorq %rsi, %rdx -; SSE2-NEXT: xorq %r10, %rcx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: xorq %r9, %rax -; SSE2-NEXT: xorq %r8, %rdi -; SSE2-NEXT: orq %rax, %rdi +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: xorq %rcx, %rsi +; SSE2-NEXT: orq %rdi, %rsi +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorq %rdx, %rax +; SSE2-NEXT: movq %xmm3, %rcx +; SSE2-NEXT: xorq %r8, %rcx +; SSE2-NEXT: orq %rax, %rcx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rcx, %rdi +; SSE2-NEXT: orq %rsi, %rcx ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; diff --git a/test/CodeGen/X86/shrink_vmul_sse.ll b/test/CodeGen/X86/shrink_vmul_sse.ll index c869dff9e642..6701c247e6fc 100644 --- a/test/CodeGen/X86/shrink_vmul_sse.ll +++ b/test/CodeGen/X86/shrink_vmul_sse.ll @@ -20,9 +20,9 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; CHECK-NEXT: movzbl 1(%edx,%ecx), %edi ; CHECK-NEXT: movzbl (%edx,%ecx), %edx ; CHECK-NEXT: movzbl 1(%eax,%ecx), %ebx +; CHECK-NEXT: imull %edi, %ebx ; CHECK-NEXT: movzbl (%eax,%ecx), %eax ; CHECK-NEXT: imull %edx, %eax -; CHECK-NEXT: imull %edi, %ebx ; CHECK-NEXT: movl %ebx, 4(%esi,%ecx,4) ; CHECK-NEXT: movl %eax, (%esi,%ecx,4) ; CHECK-NEXT: popl %esi diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll index 503b9416c8d3..4a0dc9c1eb17 100644 --- a/test/CodeGen/X86/sse41.ll +++ b/test/CodeGen/X86/sse41.ll @@ -273,8 +273,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind { ; X32: ## BB#0: ## %entry ; X32-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X32-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X32-NEXT: addss %xmm1, %xmm0 ; X32-NEXT: addss %xmm2, %xmm3 +; X32-NEXT: addss %xmm1, %xmm0 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] ; X32-NEXT: retl ; @@ -282,8 +282,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind { ; X64: ## BB#0: ## %entry ; X64-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] 
-; X64-NEXT: addss %xmm1, %xmm0 ; X64-NEXT: addss %xmm2, %xmm3 +; X64-NEXT: addss %xmm1, %xmm0 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] ; X64-NEXT: retq entry: @@ -896,9 +896,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X32-NEXT: addps %xmm1, %xmm0 ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; X32-NEXT: addps %xmm1, %xmm0 ; X32-NEXT: addps %xmm2, %xmm3 ; X32-NEXT: addps %xmm3, %xmm0 ; X32-NEXT: retl @@ -908,9 +908,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X64-NEXT: addps %xmm1, %xmm0 ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; X64-NEXT: addps %xmm1, %xmm0 ; X64-NEXT: addps %xmm2, %xmm3 ; X64-NEXT: addps %xmm3, %xmm0 ; X64-NEXT: retq diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll index 226c0adbaf3c..2fb821555dba 100644 --- a/test/CodeGen/X86/vector-bitreverse.ll +++ b/test/CodeGen/X86/vector-bitreverse.ll @@ -2372,10 +2372,10 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpsrlq $24, %zmm0, %zmm2 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 -; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm3 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3 ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpsllq $8, %zmm0, %zmm2 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 ; AVX512F-NEXT: vpsllq $24, %zmm0, %zmm3 diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll index a05a981daa1f..f0a5fe1dbfff 100644 --- a/test/CodeGen/X86/vector-blend.ll +++ b/test/CodeGen/X86/vector-blend.ll @@ -848,10 +848,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) { ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: blend_logic_v8i32: @@ -860,10 +860,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) { ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm3 ; SSSE3-NEXT: pandn %xmm5, %xmm1 +; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: pand %xmm0, %xmm2 ; SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: blend_logic_v8i32: diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll index f4d0503f4a79..4181a374c61c 100644 --- a/test/CodeGen/X86/x86-interleaved-access.ll +++ b/test/CodeGen/X86/x86-interleaved-access.ll @@ -11,13 +11,13 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) { ; AVX-NEXT: vmovupd 96(%rdi), %ymm3 ; AVX-NEXT: vinsertf128 $1, %xmm2, 
%ymm0, %ymm4 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 +; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm4 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX-NEXT: vaddpd %ymm2, %ymm4, %ymm2 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm1 -; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 -; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ; AVX-NEXT: retq %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> @@ -39,11 +39,11 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) { ; AVX-NEXT: vmovupd 96(%rdi), %ymm3 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmulpd %ymm0, %ymm4, %ymm0 ; AVX-NEXT: retq %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> @@ -120,9 +120,9 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) { ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm1 -; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq %wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16 diff --git a/test/CodeGen/X86/xchg-nofold.ll b/test/CodeGen/X86/xchg-nofold.ll index fddc7906e08f..939fa0404223 100644 --- a/test/CodeGen/X86/xchg-nofold.ll +++ b/test/CodeGen/X86/xchg-nofold.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-linux-gnu < %s | FileCheck %s %"struct.std::atomic" = type { %"struct.std::atomic_bool" } @@ -6,6 +7,28 @@ ; CHECK-LABEL: _Z3fooRSt6atomicIbEb define zeroext i1 @_Z3fooRSt6atomicIbEb(%"struct.std::atomic"* nocapture dereferenceable(1) %a, i1 returned zeroext %b) nounwind { +; CHECK-LABEL: _Z3fooRSt6atomicIbEb: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq $3, %rax +; CHECK-NEXT: movb 2147450880(%rax), %al +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: je .LBB0_3 +; CHECK-NEXT: # BB#1: +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andl $7, %ecx +; CHECK-NEXT: cmpb %al, %cl +; CHECK-NEXT: jge .LBB0_2 +; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: xchgb %al, (%rdi) +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq __asan_report_store1 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP entry: %frombool.i.i = zext i1 %b to i8 %_M_i.i.i = getelementptr inbounds %"struct.std::atomic", 
%"struct.std::atomic"* %a, i64 0, i32 0, i32 0, i32 0 @@ -30,7 +53,6 @@ entry: ; <label>:11: ; preds = %6, %entry store atomic i8 %frombool.i.i, i8* %_M_i.i.i seq_cst, align 1 -; CHECK: xchgb %{{.*}}, (%{{.*}}) ret i1 %b } diff --git a/test/DebugInfo/MIR/X86/empty-inline.mir b/test/DebugInfo/MIR/X86/empty-inline.mir index 1766a8f44616..71d10fe9de94 100644 --- a/test/DebugInfo/MIR/X86/empty-inline.mir +++ b/test/DebugInfo/MIR/X86/empty-inline.mir @@ -73,7 +73,6 @@ name: _ZN1C5m_fn3Ev alignment: 4 exposesReturnsTwice: false -noVRegs: true legalized: false regBankSelected: false selected: false diff --git a/test/DebugInfo/omit-empty.ll b/test/DebugInfo/omit-empty.ll index 92450050d208..8b277676f94c 100644 --- a/test/DebugInfo/omit-empty.ll +++ b/test/DebugInfo/omit-empty.ll @@ -1,4 +1,5 @@ ; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-objdump -h - | FileCheck %s +; REQUIRES: default_triple ; CHECK-NOT: .debug_ diff --git a/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll b/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll index 092c9dc6b95b..f7f63bd6be80 100644 --- a/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll +++ b/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll @@ -14,8 +14,8 @@ ; clang++ ../1.cc -O3 -g -S -emit-llvm -fno-strict-aliasing ; and add sanitize_address to @_ZN1A1fEv -; Test that __sanitizer_cov call has !dbg pointing to the opening { of A::f(). -; CHECK: call void @__sanitizer_cov(i32*{{.*}}), !dbg [[A:!.*]] +; Test that __sanitizer_cov_trace_pc_guard call has !dbg pointing to the opening { of A::f(). +; CHECK: call void @__sanitizer_cov_trace_pc_guard(i32*{{.*}}), !dbg [[A:!.*]] ; CHECK: [[A]] = !DILocation(line: 6, scope: !{{.*}}) diff --git a/test/Instrumentation/SanitizerCoverage/coverage.ll b/test/Instrumentation/SanitizerCoverage/coverage.ll index d675c9d9c370..7b6b5f00442f 100644 --- a/test/Instrumentation/SanitizerCoverage/coverage.ll +++ b/test/Instrumentation/SanitizerCoverage/coverage.ll @@ -1,16 +1,5 @@ -; RUN: opt < %s -sancov -sanitizer-coverage-level=0 -S | FileCheck %s --check-prefix=CHECK0 -; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -S | FileCheck %s --check-prefix=CHECK1 -; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -S | FileCheck %s --check-prefix=CHECK_WITH_CHECK -; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=10 -S | FileCheck %s --check-prefix=CHECK2 -; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=0 -S | FileCheck %s --check-prefix=CHECK_WITH_CHECK -; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=1 -S | FileCheck %s --check-prefix=CHECK_WITH_CHECK -; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-block-threshold=10 -S | FileCheck %s --check-prefix=CHECK3 ; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc -S | FileCheck %s --check-prefix=CHECK_TRACE_PC -; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=10 \ -; RUN: -S | FileCheck %s --check-prefix=CHECK2 -; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=1 \ -; RUN: -S | FileCheck %s --check-prefix=CHECK_WITH_CHECK ; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-prune-blocks=1 -S | FileCheck %s --check-prefix=CHECKPRUNE target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git 
a/test/Instrumentation/SanitizerCoverage/seh.ll b/test/Instrumentation/SanitizerCoverage/seh.ll index ce18334ed207..f432573af64a 100644 --- a/test/Instrumentation/SanitizerCoverage/seh.ll +++ b/test/Instrumentation/SanitizerCoverage/seh.ll @@ -1,7 +1,6 @@ ; RUN: opt < %s -sancov -sanitizer-coverage-level=0 -S | FileCheck %s ; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -S | FileCheck %s ; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -S | FileCheck %s -; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=0 -S | FileCheck %s target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" target triple = "i686-pc-windows-msvc18.0.0" diff --git a/test/MC/AMDGPU/ds-err.s b/test/MC/AMDGPU/ds-err.s index 3951efbb60f9..d9f22f5f3ed2 100644 --- a/test/MC/AMDGPU/ds-err.s +++ b/test/MC/AMDGPU/ds-err.s @@ -21,3 +21,93 @@ ds_write2_b32 v2, v4, v6 offset0:1000000000 // CHECK: invalid operand for instruction ds_write2_b32 v2, v4, v6 offset1:1000000000 +//===----------------------------------------------------------------------===// +// swizzle +//===----------------------------------------------------------------------===// + +// CHECK: error: expected a colon +ds_swizzle_b32 v8, v2 offset + +// CHECK: error: failed parsing operand +ds_swizzle_b32 v8, v2 offset: + +// CHECK: error: expected a colon +ds_swizzle_b32 v8, v2 offset- + +// CHECK: error: expected absolute expression +ds_swizzle_b32 v8, v2 offset:SWIZZLE(QUAD_PERM, 0, 1, 2, 3) + +// CHECK: error: expected a swizzle mode +ds_swizzle_b32 v8, v2 offset:swizzle(quad_perm, 0, 1, 2, 3) + +// CHECK: error: expected a swizzle mode +ds_swizzle_b32 v8, v2 offset:swizzle(XXX,1) + +// CHECK: error: expected a comma +ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM + +// CHECK: error: expected a comma +ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 0, 1, 2) + +// CHECK: error: expected a closing parentheses +ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 0, 1, 2, 3 + +// CHECK: error: expected a closing parentheses +ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 0, 1, 2, 3, 4) + +// CHECK: error: expected a 2-bit lane id +ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, -1, 1, 2, 3) + +// CHECK: error: expected a 2-bit lane id +ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 4, 1, 2, 3) + +// CHECK: error: group size must be in the interval [1,16] +ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,0) + +// CHECK: error: group size must be a power of two +ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,3) + +// CHECK: error: group size must be in the interval [1,16] +ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,17) + +// CHECK: error: group size must be in the interval [1,16] +ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,32) + +// CHECK: error: group size must be in the interval [2,32] +ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,1) + +// CHECK: error: group size must be a power of two +ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,3) + +// CHECK: error: group size must be in the interval [2,32] +ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,33) + +// CHECK: error: group size must be in the interval [2,32] +ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,1,0) + +// CHECK: error: group size must be a power of two +ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,3,1) + +// CHECK: error: group size must be in the interval [2,32] +ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,33,1) + +// CHECK: error: lane id must be in the interval [0,group size - 1] +ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,-1) + +// CHECK: error: lane id 
+// CHECK: error: lane id must be in the interval [0,group size - 1]
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,2)
+
+// CHECK: error: expected a string
+ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, pppii)
+
+// CHECK: error: expected a 5-character mask
+ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "")
+
+// CHECK: error: expected a 5-character mask
+ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "ppii")
+
+// CHECK: error: expected a 5-character mask
+ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "pppiii")
+
+// CHECK: invalid mask
+ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "pppi2")
diff --git a/test/MC/AMDGPU/ds.s b/test/MC/AMDGPU/ds.s
index 18e4957e32d7..ef36a98f746a 100644
--- a/test/MC/AMDGPU/ds.s
+++ b/test/MC/AMDGPU/ds.s
@@ -267,10 +267,6 @@ ds_max_rtn_f32 v8, v2, v4
// SICI: ds_max_rtn_f32 v8, v2, v4 ; encoding: [0x00,0x00,0xcc,0xd8,0x02,0x04,0x00,0x08]
// VI: ds_max_rtn_f32 v8, v2, v4 ; encoding: [0x00,0x00,0x66,0xd8,0x02,0x04,0x00,0x08]
-ds_swizzle_b32 v8, v2
-// SICI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
-// VI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
-
ds_read_b32 v8, v2
// SICI: ds_read_b32 v8, v2 ; encoding: [0x00,0x00,0xd8,0xd8,0x02,0x00,0x00,0x08]
// VI: ds_read_b32 v8, v2 ; encoding: [0x00,0x00,0x6c,0xd8,0x02,0x00,0x00,0x08]
@@ -506,3 +502,143 @@ ds_nop
// NOSI: error: instruction not supported on this GPU
// CI: ds_nop ; encoding: [0x00,0x00,0x50,0xd8,0x00,0x00,0x00,0x00]
// VI: ds_nop ; encoding: [0x00,0x00,0x28,0xd8,0x00,0x00,0x00,0x00]
+
+//===----------------------------------------------------------------------===//
+// swizzle
+//===----------------------------------------------------------------------===//
+
+ds_swizzle_b32 v8, v2
+// SICI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:0xFFFF
+// SICI: ds_swizzle_b32 v8, v2 offset:65535 ; encoding: [0xff,0xff,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:65535 ; encoding: [0xff,0xff,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 0, 1, 2, 3)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM,0,1,2,3) ; encoding: [0xe4,0x80,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM,0,1,2,3) ; encoding: [0xe4,0x80,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 2, 1, 3, 3)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM,2,1,3,3) ; encoding: [0xf6,0x80,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM,2,1,3,3) ; encoding: [0xf6,0x80,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,1)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,1) ; encoding: [0x1f,0x04,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,1) ; encoding: [0x1f,0x04,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,2)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,2) ; encoding: [0x1f,0x08,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,2) ; encoding: [0x1f,0x08,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,4)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,4) ; encoding: [0x1f,0x10,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,4) ; encoding: [0x1f,0x10,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,8)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,8) ; encoding: [0x1f,0x20,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,8) ; encoding: [0x1f,0x20,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,16)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,16) ; encoding: [0x1f,0x40,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,16) ; encoding: [0x1f,0x40,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,2)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,1) ; encoding: [0x1f,0x04,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,1) ; encoding: [0x1f,0x04,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,4)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,4) ; encoding: [0x1f,0x0c,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,4) ; encoding: [0x1f,0x0c,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,8)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,8) ; encoding: [0x1f,0x1c,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,8) ; encoding: [0x1f,0x1c,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,16)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,16) ; encoding: [0x1f,0x3c,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,16) ; encoding: [0x1f,0x3c,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,32)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,32) ; encoding: [0x1f,0x7c,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,32) ; encoding: [0x1f,0x7c,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,1)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,1) ; encoding: [0x3e,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,1) ; encoding: [0x3e,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,4,1)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,4,1) ; encoding: [0x3c,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,4,1) ; encoding: [0x3c,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,1)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,1) ; encoding: [0x38,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,1) ; encoding: [0x38,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,16,1)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,16,1) ; encoding: [0x30,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,16,1) ; encoding: [0x30,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,1)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,1) ; encoding: [0x20,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,1) ; encoding: [0x20,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,0)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,0) ; encoding: [0x1e,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,0) ; encoding: [0x1e,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,4,3)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,4,3) ; encoding: [0x7c,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,4,3) ; encoding: [0x7c,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,7)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,7) ; encoding: [0xf8,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,7) ; encoding: [0xf8,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,16,15)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,16,15) ; encoding: [0xf0,0x01,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,16,15) ; encoding: [0xf0,0x01,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,31)
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,31) ; encoding: [0xe0,0x03,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,31) ; encoding: [0xe0,0x03,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "pppii")
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,4) ; encoding: [0x1f,0x0c,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,4) ; encoding: [0x1f,0x0c,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "01pip")
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"01pip") ; encoding: [0x07,0x09,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"01pip") ; encoding: [0x07,0x09,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:0x000
+// SICI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:0x001
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"0000p") ; encoding: [0x01,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"0000p") ; encoding: [0x01,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:0x020
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,1) ; encoding: [0x20,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,1) ; encoding: [0x20,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:0x021
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00001") ; encoding: [0x21,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00001") ; encoding: [0x21,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:0x400
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00001") ; encoding: [0x00,0x04,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00001") ; encoding: [0x00,0x04,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:0x401
+// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"0000i") ; encoding: [0x01,0x04,0xd4,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"0000i") ; encoding: [0x01,0x04,0x7a,0xd8,0x02,0x00,0x00,0x08]
+
+ds_swizzle_b32 v8, v2 offset:0x420
offset:swizzle(BITMASK_PERM,"00000") ; encoding: [0x20,0x04,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00000") ; encoding: [0x20,0x04,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:0x421 +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00000") ; encoding: [0x21,0x04,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00000") ; encoding: [0x21,0x04,0x7a,0xd8,0x02,0x00,0x00,0x08] diff --git a/test/MC/ARM/big-endian-thumb-fixup.s b/test/MC/ARM/big-endian-thumb-fixup.s index 5023fca26be1..4e81469fe489 100644 --- a/test/MC/ARM/big-endian-thumb-fixup.s +++ b/test/MC/ARM/big-endian-thumb-fixup.s @@ -4,6 +4,7 @@ .text .align 2 .code 16 + .thumb_func @ARM::fixup_arm_thumb_bl .section s_thumb_bl,"ax",%progbits diff --git a/test/MC/ARM/mixed-arm-thumb-bl-fixup.ll b/test/MC/ARM/mixed-arm-thumb-bl-fixup.ll new file mode 100644 index 000000000000..155ce5a425b4 --- /dev/null +++ b/test/MC/ARM/mixed-arm-thumb-bl-fixup.ll @@ -0,0 +1,77 @@ +; RUN: llc -O0 < %s -mtriple armv7-linux-gnueabi -o - \ +; RUN: | llvm-mc -triple armv7-linux-gnueabi -filetype=obj -o - \ +; RUN: | llvm-readobj -r | FileCheck --check-prefix LINUX %s + +; RUN: llc -O0 < %s -mtriple armv7-linux-android -o - \ +; RUN: | llvm-mc -triple armv7-linux-android -filetype=obj -o - \ +; RUN: | llvm-readobj -r | FileCheck --check-prefix LINUX %s + + +; RUN: llc -O0 < %s -mtriple armv7-apple-ios -o - \ +; RUN: | llvm-mc -triple armv7-apple-ios -filetype=obj -o - \ +; RUN: | llvm-readobj -r | FileCheck --check-prefix IOS %s + + +define void @thumb_caller() #0 { + call void @internal_arm_fn() + call void @global_arm_fn() + call void @internal_thumb_fn() + call void @global_thumb_fn() + ret void +} + +define void @arm_caller() #1 { + call void @internal_arm_fn() + call void @global_arm_fn() + call void @internal_thumb_fn() + call void @global_thumb_fn() + ret void +} + +define internal void @internal_thumb_fn() #0 { + ret void +} + +define void @global_thumb_fn() #0 { +entry: + br label %end +end: + br label %end + ret void +} + +define internal void @internal_arm_fn() #1 { + ret void +} + +define void @global_arm_fn() #1 { +entry: + br label %end +end: + br label %end + ret void +} + +attributes #0 = { "target-features"="+thumb-mode" } +attributes #1 = { "target-features"="-thumb-mode" } + +; LINUX: Section (3) .rel.text { +; LINUX-NEXT: 0x2 R_ARM_THM_CALL internal_arm_fn 0x0 +; LINUX-NEXT: 0x6 R_ARM_THM_CALL global_arm_fn 0x0 +; LINUX-NEXT: 0xE R_ARM_THM_CALL global_thumb_fn 0x0 +; LINUX-NEXT: 0x1C R_ARM_CALL internal_arm_fn 0x0 +; LINUX-NEXT: 0x20 R_ARM_CALL global_arm_fn 0x0 +; LINUX-NEXT: 0x24 R_ARM_CALL internal_thumb_fn 0x0 +; LINUX-NEXT: 0x28 R_ARM_CALL global_thumb_fn 0x0 +; LINUX-NEXT: } + +; IOS: Section __text { +; IOS-NEXT: 0x2C 1 2 0 ARM_RELOC_BR24 0 __text +; IOS-NEXT: 0x28 1 2 0 ARM_RELOC_BR24 0 __text +; IOS-NEXT: 0x24 1 2 0 ARM_RELOC_BR24 0 __text +; IOS-NEXT: 0x20 1 2 0 ARM_RELOC_BR24 0 __text +; IOS-NEXT: 0x10 1 2 0 ARM_THUMB_RELOC_BR22 0 __text +; IOS-NEXT: 0xC 1 2 0 ARM_THUMB_RELOC_BR22 0 __text +; IOS-NEXT: 0x8 1 2 0 ARM_THUMB_RELOC_BR22 0 __text +; IOS-NEXT: 0x4 1 2 0 ARM_THUMB_RELOC_BR22 0 __text +; IOS-NEXT: } diff --git a/test/MC/Disassembler/AMDGPU/gfx8_dasm_all.txt b/test/MC/Disassembler/AMDGPU/gfx8_dasm_all.txt index 37725e960f92..5fe7a8cd0621 100644 --- a/test/MC/Disassembler/AMDGPU/gfx8_dasm_all.txt +++ b/test/MC/Disassembler/AMDGPU/gfx8_dasm_all.txt @@ -990,23 +990,23 @@ # CHECK: ds_read_u16 v5, v1 
# CHECK: ds_read_u16 v5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x79,0xd8,0x01,0x00,0x00,0x05]
0xff,0xff,0x79,0xd8,0x01,0x00,0x00,0x05
-# CHECK: ds_swizzle_b32 v5, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0x05]
-0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0x05
+# CHECK: ds_swizzle_b32 v5, v1 ; encoding: [0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05]
+0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05
-# CHECK: ds_swizzle_b32 v255, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0xff]
-0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0xff
+# CHECK: ds_swizzle_b32 v255, v1 ; encoding: [0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0xff]
+0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0xff
-# CHECK: ds_swizzle_b32 v5, v255 offset:65535 ; encoding: [0xff,0xff,0x7a,0xd8,0xff,0x00,0x00,0x05]
-0xff,0xff,0x7a,0xd8,0xff,0x00,0x00,0x05
+# CHECK: ds_swizzle_b32 v5, v255 ; encoding: [0x00,0x00,0x7a,0xd8,0xff,0x00,0x00,0x05]
+0x00,0x00,0x7a,0xd8,0xff,0x00,0x00,0x05
# CHECK: ds_swizzle_b32 v5, v1 ; encoding: [0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05]
0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05
-# CHECK: ds_swizzle_b32 v5, v1 offset:4 ; encoding: [0x04,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05]
-0x04,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05
+# CHECK: ds_swizzle_b32 v5, v1 ; encoding: [0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05]
+0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05
-# CHECK: ds_swizzle_b32 v5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x7b,0xd8,0x01,0x00,0x00,0x05]
-0xff,0xff,0x7b,0xd8,0x01,0x00,0x00,0x05
+# CHECK: ds_swizzle_b32 v5, v1 gds ; encoding: [0x00,0x00,0x7b,0xd8,0x01,0x00,0x00,0x05]
+0x00,0x00,0x7b,0xd8,0x01,0x00,0x00,0x05
# CHECK: ds_permute_b32 v5, v1, v2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xd8,0x01,0x02,0x00,0x05]
0xff,0xff,0x7c,0xd8,0x01,0x02,0x00,0x05
diff --git a/test/Other/new-pm-defaults.ll b/test/Other/new-pm-defaults.ll
index 0ec356392a2d..c5d10a0a67e3 100644
--- a/test/Other/new-pm-defaults.ll
+++ b/test/Other/new-pm-defaults.ll
@@ -30,6 +30,8 @@
; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}>
; CHECK-O-NEXT: Starting llvm::Module pass manager run.
; CHECK-O-NEXT: Running pass: ForceFunctionAttrsPass
+; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}>
+; CHECK-O-NEXT: Starting llvm::Module pass manager run.
; CHECK-O-NEXT: Running pass: InferFunctionAttrsPass
; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
; CHECK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
@@ -53,7 +55,6 @@
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Finished llvm::Function pass manager run.
-; CHECK-O-NEXT: Running pass: PGOIndirectCallPromotion
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA
; CHECK-O-NEXT: Running analysis: GlobalsAA
; CHECK-O-NEXT: Running analysis: CallGraphAnalysis
@@ -134,6 +135,10 @@
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Finished llvm::Function pass manager run.
; CHECK-O-NEXT: Finished CGSCC pass manager run.
+; CHECK-O-NEXT: Finished llvm::Module pass manager run.
+; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}>
+; CHECK-O-NEXT: Starting llvm::Module pass manager run.
+; CHECK-O-NEXT: Running pass: GlobalOptPass
; CHECK-O-NEXT: Running pass: EliminateAvailableExternallyPass
; CHECK-O-NEXT: Running pass: ReversePostOrderFunctionAttrsPass
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA
@@ -163,6 +168,7 @@
; CHECK-O-NEXT: Running pass: GlobalDCEPass
; CHECK-O-NEXT: Running pass: ConstantMergePass
; CHECK-O-NEXT: Finished llvm::Module pass manager run.
+; CHECK-O-NEXT: Finished llvm::Module pass manager run.
; CHECK-O-NEXT: Running pass: PrintModulePass
;
; Make sure we get the IR back out without changes when we print the module.
diff --git a/test/Other/new-pm-thinlto-defaults.ll b/test/Other/new-pm-thinlto-defaults.ll
new file mode 100644
index 000000000000..52f475b0397d
--- /dev/null
+++ b/test/Other/new-pm-thinlto-defaults.ll
@@ -0,0 +1,221 @@
+; The IR below was crafted so as:
+; 1) To have a loop, so we create a loop pass manager
+; 2) To be "immutable" in the sense that no pass in the standard
+; pipeline will modify it.
+; Since no transformations take place, we don't expect any analyses
+; to be invalidated.
+; Any invalidation that shows up here is a bug, unless we started modifying
+; the IR, in which case we need to make it immutable harder.
+;
+; Prelink pipelines:
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto-pre-link<O1>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,CHECK-PRELINK-O,CHECK-PRELINK-O1
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto-pre-link<O2>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-PRELINK-O,CHECK-PRELINK-O2
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto-pre-link<O3>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-PRELINK-O,CHECK-PRELINK-O3
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto-pre-link<Os>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-PRELINK-O,CHECK-PRELINK-Os
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto-pre-link<Oz>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Oz,CHECK-PRELINK-O,CHECK-PRELINK-Oz
+;
+; Postlink pipelines:
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto<O1>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,CHECK-POSTLINK-O,CHECK-POSTLINK-O1
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto<O2>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-POSTLINK-O,CHECK-POSTLINK-O2
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto<O3>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-POSTLINK-O,CHECK-POSTLINK-O3
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto<Os>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-POSTLINK-O,CHECK-POSTLINK-Os
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN: -passes='thinlto<Oz>' -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Oz,CHECK-POSTLINK-O,CHECK-POSTLINK-Oz
+;
+; CHECK-O: Starting llvm::Module pass manager run.
+; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}>
+; CHECK-O-NEXT: Starting llvm::Module pass manager run.
+; CHECK-O-NEXT: Running pass: ForceFunctionAttrsPass
+; CHECK-POSTLINK-O-NEXT: Running pass: PGOIndirectCallPromotion
+; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}>
+; CHECK-O-NEXT: Starting llvm::Module pass manager run.
+; CHECK-O-NEXT: Running pass: InferFunctionAttrsPass
+; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
+; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy
+; CHECK-O-NEXT: Starting llvm::Function pass manager run.
+; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Running analysis: TargetIRAnalysis
+; CHECK-O-NEXT: Running analysis: AssumptionAnalysis
+; CHECK-O-NEXT: Running pass: SROA
+; CHECK-O-NEXT: Running analysis: DominatorTreeAnalysis
+; CHECK-O-NEXT: Running pass: EarlyCSEPass
+; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass
+; CHECK-O-NEXT: Finished llvm::Function pass manager run.
+; CHECK-O-NEXT: Running pass: IPSCCPPass
+; CHECK-O-NEXT: Running pass: GlobalOptPass
+; CHECK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PromotePass>
+; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
+; CHECK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
+; CHECK-O-NEXT: Starting llvm::Function pass manager run.
+; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Finished llvm::Function pass manager run.
+; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA
+; CHECK-O-NEXT: Running analysis: GlobalsAA
+; CHECK-O-NEXT: Running analysis: CallGraphAnalysis
+; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ProfileSummaryAnalysis
+; CHECK-O-NEXT: Running analysis: ProfileSummaryAnalysis
+; CHECK-O-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}LazyCallGraph{{.*}}>
+; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy
+; CHECK-O-NEXT: Running analysis: LazyCallGraphAnalysis
+; CHECK-O-NEXT: Starting CGSCC pass manager run.
+; CHECK-O-NEXT: Running pass: InlinerPass
+; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy<{{.*}}LazyCallGraph{{.*}}>
+; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
+; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy
+; CHECK-O-NEXT: Running analysis: AAManager
+; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
+; CHECK-O-NEXT: Running pass: CGSCCToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
+; CHECK-O-NEXT: Starting llvm::Function pass manager run.
+; CHECK-O-NEXT: Running pass: SROA
+; CHECK-O-NEXT: Running pass: EarlyCSEPass
+; CHECK-O-NEXT: Running pass: SpeculativeExecutionPass
+; CHECK-O-NEXT: Running pass: JumpThreadingPass
+; CHECK-O-NEXT: Running analysis: LazyValueAnalysis
+; CHECK-O-NEXT: Running pass: CorrelatedValuePropagationPass
+; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-O1-NEXT: Running pass: LibCallsShrinkWrapPass
+; CHECK-O2-NEXT: Running pass: LibCallsShrinkWrapPass
+; CHECK-O3-NEXT: Running pass: LibCallsShrinkWrapPass
+; CHECK-O-NEXT: Running pass: TailCallElimPass
+; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Running pass: ReassociatePass
+; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
+; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
+; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopStandardAnalysisResults{{.*}}>
+; CHECK-O-NEXT: Running analysis: LoopAnalysis
+; CHECK-O-NEXT: Running analysis: ScalarEvolutionAnalysis
+; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy
+; CHECK-O-NEXT: Starting Loop pass manager run.
+; CHECK-O-NEXT: Running pass: LoopRotatePass
+; CHECK-O-NEXT: Running pass: LICM
+; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
+; CHECK-O-NEXT: Running pass: SimpleLoopUnswitchPass
+; CHECK-O-NEXT: Finished Loop pass manager run.
+; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopStandardAnalysisResults{{.*}}>
+; CHECK-O-NEXT: Starting Loop pass manager run.
+; CHECK-O-NEXT: Running pass: IndVarSimplifyPass
+; CHECK-O-NEXT: Running pass: LoopIdiomRecognizePass
+; CHECK-O-NEXT: Running pass: LoopDeletionPass
+; CHECK-O-NEXT: Running pass: LoopUnrollPass
+; CHECK-O-NEXT: Finished Loop pass manager run.
+; CHECK-Os-NEXT: Running pass: MergedLoadStoreMotionPass
+; CHECK-Os-NEXT: Running pass: GVN
+; CHECK-Os-NEXT: Running analysis: MemoryDependenceAnalysis
+; CHECK-Oz-NEXT: Running pass: MergedLoadStoreMotionPass
+; CHECK-Oz-NEXT: Running pass: GVN
+; CHECK-Oz-NEXT: Running analysis: MemoryDependenceAnalysis
+; CHECK-O2-NEXT: Running pass: MergedLoadStoreMotionPass
+; CHECK-O2-NEXT: Running pass: GVN
+; CHECK-O2-NEXT: Running analysis: MemoryDependenceAnalysis
+; CHECK-O3-NEXT: Running pass: MergedLoadStoreMotionPass
+; CHECK-O3-NEXT: Running pass: GVN
+; CHECK-O3-NEXT: Running analysis: MemoryDependenceAnalysis
+; CHECK-O-NEXT: Running pass: MemCpyOptPass
+; CHECK-O1-NEXT: Running analysis: MemoryDependenceAnalysis
+; CHECK-O-NEXT: Running pass: SCCPPass
+; CHECK-O-NEXT: Running pass: BDCEPass
+; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
+; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-O-NEXT: Running pass: JumpThreadingPass
+; CHECK-O-NEXT: Running pass: CorrelatedValuePropagationPass
+; CHECK-O-NEXT: Running pass: DSEPass
+; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass{{.*}}>
+; CHECK-O-NEXT: Running pass: ADCEPass
+; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis
+; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-O-NEXT: Finished llvm::Function pass manager run.
+; CHECK-O-NEXT: Finished CGSCC pass manager run.
+; CHECK-O-NEXT: Finished llvm::Module pass manager run.
+; CHECK-PRELINK-O-NEXT: Running pass: GlobalOptPass
+; CHECK-PRELINK-O-NEXT: Running pass: NameAnonGlobalPass
+; CHECK-POSTLINK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}>
+; CHECK-POSTLINK-O-NEXT: Starting llvm::Module pass manager run.
+; CHECK-POSTLINK-O-NEXT: Running pass: GlobalOptPass
+; CHECK-POSTLINK-O-NEXT: Running pass: EliminateAvailableExternallyPass
+; CHECK-POSTLINK-O-NEXT: Running pass: ReversePostOrderFunctionAttrsPass
+; CHECK-POSTLINK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA
+; CHECK-POSTLINK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
+; CHECK-POSTLINK-O-NEXT: Starting llvm::Function pass manager run.
+; CHECK-POSTLINK-O-NEXT: Running pass: Float2IntPass
+; CHECK-POSTLINK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopRotatePass
+; CHECK-POSTLINK-O-NEXT: Running pass: LoopDistributePass
+; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis
+; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis
+; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
+; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis
+; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
+; CHECK-POSTLINK-O-NEXT: Running pass: SLPVectorizerPass
+; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
+; CHECK-POSTLINK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopUnrollPass
+; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
+; CHECK-POSTLINK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
+; CHECK-POSTLINK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass
+; CHECK-POSTLINK-O-NEXT: Running pass: AlignmentFromAssumptionsPass
+; CHECK-POSTLINK-O-NEXT: Running pass: LoopSinkPass
+; CHECK-POSTLINK-O-NEXT: Running pass: InstSimplifierPass
+; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-POSTLINK-O-NEXT: Finished llvm::Function pass manager run.
+; CHECK-POSTLINK-O-NEXT: Running pass: GlobalDCEPass
+; CHECK-POSTLINK-O-NEXT: Running pass: ConstantMergePass
+; CHECK-POSTLINK-O-NEXT: Finished llvm::Module pass manager run.
+; CHECK-O-NEXT: Finished llvm::Module pass manager run.
+; CHECK-O-NEXT: Running pass: PrintModulePass
+
+; Make sure we get the IR back out without changes when we print the module.
+; CHECK-O-LABEL: define void @foo(i32 %n) local_unnamed_addr {
+; CHECK-O-NEXT: entry:
+; CHECK-O-NEXT: br label %loop
+; CHECK-O: loop:
+; CHECK-O-NEXT: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-O-NEXT: %iv.next = add i32 %iv, 1
+; CHECK-O-NEXT: tail call void @bar()
+; CHECK-O-NEXT: %cmp = icmp eq i32 %iv, %n
+; CHECK-O-NEXT: br i1 %cmp, label %exit, label %loop
+; CHECK-O: exit:
+; CHECK-O-NEXT: ret void
+; CHECK-O-NEXT: }
+;
+; CHECK-O-NEXT: Finished llvm::Module pass manager run.
+
+declare void @bar() local_unnamed_addr
+
+define void @foo(i32 %n) local_unnamed_addr {
+entry:
+ br label %loop
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %iv.next = add i32 %iv, 1
+ tail call void @bar()
+ %cmp = icmp eq i32 %iv, %n
+ br i1 %cmp, label %exit, label %loop
+exit:
+ ret void
+}
diff --git a/test/ThinLTO/X86/error-newpm.ll b/test/ThinLTO/X86/newpm-basic.ll
index 9c2fd2c70d6d..d357cbc85d00 100644
--- a/test/ThinLTO/X86/error-newpm.ll
+++ b/test/ThinLTO/X86/newpm-basic.ll
@@ -1,9 +1,7 @@
; RUN: opt -module-summary %s -o %t1.bc
-; RUN: not llvm-lto2 run %t1.bc -o %t.o \
+; RUN: llvm-lto2 run %t1.bc -o %t.o \
; RUN: -r=%t1.bc,_tinkywinky,pxl \
-; RUN: -lto-use-new-pm 2>&1 | FileCheck %s
-
-; CHECK: ThinLTO not supported with the new PM yet!
+; RUN: -lto-use-new-pm
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
diff --git a/test/Transforms/CodeExtractor/PartialInlineAlloca.ll b/test/Transforms/CodeExtractor/PartialInlineAlloca.ll
new file mode 100644
index 000000000000..48db0b61a31b
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineAlloca.ll
@@ -0,0 +1,68 @@
+
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+ ; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+
+%"class.base" = type { %"struct.base"* }
+%"struct.base" = type opaque
+
+@g = external local_unnamed_addr global i32, align 4
+
+; Function Attrs: nounwind uwtable
+define i32 @callee_sinkable_bitcast(i32 %arg) local_unnamed_addr #0 {
+; CHECK-LABEL:define{{.*}}@callee_sinkable_bitcast.{{[0-9]}}
+; CHECK: alloca
+; CHECK-NEXT: bitcast
+; CHECK: call void @llvm.lifetime
+bb:
+ %tmp = alloca %"class.base", align 4
+ %tmp1 = bitcast %"class.base"* %tmp to i8*
+ %tmp2 = load i32, i32* @g, align 4, !tbaa !2
+ %tmp3 = add nsw i32 %tmp2, 1
+ %tmp4 = icmp slt i32 %arg, 0
+ br i1 %tmp4, label %bb6, label %bb5
+
+bb5: ; preds = %bb
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp1) #2
+ %tmp11 = bitcast %"class.base"* %tmp to i32*
+ store i32 %tmp3, i32* %tmp11, align 4, !tbaa !2
+ store i32 %tmp3, i32* @g, align 4, !tbaa !2
+ call void @bar(i32* nonnull %tmp11) #2
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp1) #2
+ br label %bb6
+
+bb6: ; preds = %bb5, %bb
+ %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ]
+ ret i32 %tmp7
+}
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+declare void @bar(i32*) local_unnamed_addr #2
+declare void @bar2(i32*, i32*) local_unnamed_addr #1
+
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind uwtable
+define i32 @caller(i32 %arg) local_unnamed_addr #0 {
+bb:
+ %tmp = tail call i32 @callee_sinkable_bitcast(i32 %arg)
+ ret i32 %tmp
+}
+
+attributes #0 = { nounwind uwtable}
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 303574)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+
+
diff --git a/test/Transforms/CodeExtractor/PartialInlineAlloca2.ll b/test/Transforms/CodeExtractor/PartialInlineAlloca2.ll
new file mode 100644
index 000000000000..4ca418389e5e
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineAlloca2.ll
@@ -0,0 +1,65 @@
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+
+%"class.base" = type { %"struct.base"* }
+%"struct.base" = type opaque
+
+@g = external local_unnamed_addr global i32, align 4
+
+define i32 @callee_no_bitcast(i32 %arg) local_unnamed_addr #0 {
+; CHECK-LABEL:define{{.*}}@callee_no_bitcast.{{[0-9]}}
+; CHECK: alloca
+; CHECK: call void @llvm.lifetime
+bb:
+ %tmp = alloca i8, align 4
+ %tmp2 = load i32, i32* @g, align 4, !tbaa !2
+ %tmp3 = add nsw i32 %tmp2, 1
+ %tmp4 = icmp slt i32 %arg, 0
+ br i1 %tmp4, label %bb6, label %bb5
+
+bb5: ; preds = %bb
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp) #2
+ store i32 %tmp3, i32* @g, align 4, !tbaa !2
+ %tmp11 = bitcast i8 * %tmp to i32*
+ call void @bar(i32* nonnull %tmp11) #2
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp) #2
+ br label %bb6
+
+bb6: ; preds = %bb5, %bb
+ %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ]
+ ret i32 %tmp7
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+declare void @bar(i32*) local_unnamed_addr #2
+declare void @bar2(i32*, i32*) local_unnamed_addr #1
+
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind uwtable
+define i32 @caller(i32 %arg) local_unnamed_addr #0 {
+bb:
+ %tmp = tail call i32 @callee_no_bitcast(i32 %arg)
+ ret i32 %tmp
+}
+
+attributes #0 = { nounwind uwtable}
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 303574)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+
+
+
diff --git a/test/Transforms/CodeExtractor/PartialInlineAlloca4.ll b/test/Transforms/CodeExtractor/PartialInlineAlloca4.ll
new file mode 100644
index 000000000000..6bb38d44f466
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineAlloca4.ll
@@ -0,0 +1,67 @@
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+
+%"class.base" = type { %"struct.base"* }
+%"struct.base" = type opaque
+
+@g = external local_unnamed_addr global i32, align 4
+
+define i32 @callee_unknown_use1(i32 %arg) local_unnamed_addr #0 {
+; CHECK-LABEL:define{{.*}}@callee_unknown_use1.{{[0-9]}}
+; CHECK-NOT: alloca
+; CHECK: call void @llvm.lifetime
+bb:
+ %tmp = alloca i8, align 4
+ %tmp2 = load i32, i32* @g, align 4, !tbaa !2
+ %tmp3 = add nsw i32 %tmp2, 1
+ %tmp4 = icmp slt i32 %arg, 0
+ br i1 %tmp4, label %bb6, label %bb5
+
+bb5: ; preds = %bb
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp) #2
+ store i32 %tmp3, i32* @g, align 4, !tbaa !2
+ %tmp11 = bitcast i8* %tmp to i32*
+ call void @bar(i32* nonnull %tmp11) #2
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp) #2
+ br label %bb6
+
+bb6: ; preds = %bb5, %bb
+ %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ]
+ %tmp1 = bitcast i8* %tmp to i32*
+ ret i32 %tmp7
+}
+
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+declare void @bar(i32*) local_unnamed_addr #2
+declare void @bar2(i32*, i32*) local_unnamed_addr #1
+
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind uwtable
+define i32 @caller(i32 %arg) local_unnamed_addr #0 {
+bb:
+ %tmp = tail call i32 @callee_unknown_use1(i32 %arg)
+ ret i32 %tmp
+}
+
+attributes #0 = { nounwind uwtable}
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 303574)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+
+
+
diff --git a/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll b/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll
new file mode 100644
index 000000000000..9c53496e1cea
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll
@@ -0,0 +1,67 @@
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+
+%"class.base" = type { %"struct.base"* }
+%"struct.base" = type opaque
+
+@g = external local_unnamed_addr global i32, align 4
+
+define i32 @callee_unknown_use2(i32 %arg) local_unnamed_addr #0 {
+; CHECK-LABEL:define{{.*}}@callee_unknown_use2.{{[0-9]}}
+; CHECK-NOT: alloca
+; CHECK: call void @llvm.lifetime
+bb:
+ %tmp = alloca i32, align 4
+ %tmp1 = bitcast i32* %tmp to i8*
+ %tmp2 = load i32, i32* @g, align 4, !tbaa !2
+ %tmp3 = add nsw i32 %tmp2, 1
+ %tmp4 = icmp slt i32 %arg, 0
+ br i1 %tmp4, label %bb6, label %bb5
+
+bb5: ; preds = %bb
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp1) #2
+ store i32 %tmp3, i32* %tmp, align 4, !tbaa !2
+ store i32 %tmp3, i32* @g, align 4, !tbaa !2
+ call void @bar(i32* nonnull %tmp) #2
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp1) #2
+ br label %bb6
+
+bb6: ; preds = %bb5, %bb
+ %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ]
+ %tmp10 = bitcast i8* %tmp1 to i32*
+ ret i32 %tmp7
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+declare void @bar(i32*) local_unnamed_addr #2
+declare void @bar2(i32*, i32*) local_unnamed_addr #1
+
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind uwtable
+define i32 @caller(i32 %arg) local_unnamed_addr #0 {
+bb:
+ %tmp = tail call i32 @callee_unknown_use2(i32 %arg)
+ ret i32 %tmp
+}
+
+attributes #0 = { nounwind uwtable}
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 303574)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+
+
+
diff --git a/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll b/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll
new file mode 100644
index 000000000000..e8a4d1281a23
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll
@@ -0,0 +1,61 @@
+; RUN: opt -S -partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis < %s | FileCheck %s
+; RUN: opt -S -passes=partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis < %s | FileCheck %s
+define i32 @test(i32 %arg) local_unnamed_addr #0 {
+bb:
+ %tmp = tail call i32 (...) @bar() #1
+ %tmp1 = icmp slt i32 %arg, 0
+ br i1 %tmp1, label %bb6, label %bb2
+
+bb2: ; preds = %bb
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ %tmp3 = tail call i32 (...) @bar() #1
+ %tmp4 = icmp eq i32 %tmp3, 10
+ br i1 %tmp4, label %bb6, label %bb5
+
+bb5: ; preds = %bb2
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ br label %bb6
+
+bb6: ; preds = %bb5, %bb2, %bb
+ %tmp7 = phi i32 [ %tmp, %bb5 ], [ 0, %bb ], [ %tmp, %bb2 ]
+ ret i32 %tmp7
+}
+
+declare i32 @bar(...) local_unnamed_addr #1
+
+declare void @foo(...) local_unnamed_addr #1
+
+; Function Attrs: nounwind uwtable
+define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 {
+; CHECK-LABEL: @dummy_caller
+; CHECK: codeRepl.i:
+; CHECK: call void @test.1_bb2()
+; CHECK-NOT: load
+; CHECK br
+
+bb:
+ %tmp = tail call i32 @test(i32 %arg)
+ ret i32 %tmp
+}
+
+; CHECK-LABEL: define internal void @test.1_bb2()
+; CHECK: .exitStub:
+; CHECK-NOT: store i32 %tmp7, i32* %tmp7.out
+; CHECK: ret
+
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind uwtable }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 303574)"}
diff --git a/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll b/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll
new file mode 100644
index 000000000000..a48ff4b1b8f9
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll
@@ -0,0 +1,62 @@
+; RUN: opt -S -partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis < %s | FileCheck %s
+; RUN: opt -S -passes=partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis < %s | FileCheck %s
+
+define i32 @test(i32 %arg) local_unnamed_addr #0 {
+bb:
+ %tmp = tail call i32 (...) @bar() #1
+ %tmp1 = icmp slt i32 %arg, 0
+ br i1 %tmp1, label %bb6, label %bb2
+
+bb2: ; preds = %bb
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ %tmp3 = tail call i32 (...) @bar() #1
+ %tmp4 = icmp eq i32 %tmp3, 10
+ br i1 %tmp4, label %bb6, label %bb5
+
+bb5: ; preds = %bb2
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ tail call void (...) @foo() #1
+ br label %bb6
+
+bb6: ; preds = %bb5, %bb2, %bb
+ %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ], [ 1, %bb2 ]
+ ret i32 %tmp7
+}
+
+; Function Attrs: nounwind uwtable
+declare i32 @bar(...) local_unnamed_addr #0
+
+; Function Attrs: nounwind uwtable
+declare void @foo(...) local_unnamed_addr #0
+
+; Function Attrs: nounwind uwtable
+define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 {
+; CHECK-LABEL: @dummy_caller
+; CHECK: codeRepl.i:
+; CHECK: call void @test.1_bb2()
+; CHECK-NOT: load
+; CHECK br
+bb:
+ %tmp = tail call i32 @test(i32 %arg)
+ ret i32 %tmp
+}
+
+; CHECK-LABEL: define internal void @test.1_bb2()
+; CHECK: .exitStub:
+; CHECK-NOT: store i32 %tmp7, i32* %tmp7.out
+; CHECK: ret
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 303574)"}
diff --git a/test/Transforms/GVN/PRE/phi-translate-2.ll b/test/Transforms/GVN/PRE/phi-translate-2.ll
deleted file mode 100644
index b2993657c7f5..000000000000
--- a/test/Transforms/GVN/PRE/phi-translate-2.ll
+++ /dev/null
@@ -1,105 +0,0 @@
-; RUN: opt < %s -gvn -S | FileCheck %s
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-@a = common global [100 x i64] zeroinitializer, align 16
-@b = common global [100 x i64] zeroinitializer, align 16
-@g1 = common global i64 0, align 8
-@g2 = common global i64 0, align 8
-@g3 = common global i64 0, align 8
-declare i64 @goo(...) local_unnamed_addr #1
-
-define void @test1(i64 %a, i64 %b, i64 %c, i64 %d) {
-entry:
- %mul = mul nsw i64 %b, %a
- store i64 %mul, i64* @g1, align 8
- %t0 = load i64, i64* @g2, align 8
- %cmp = icmp sgt i64 %t0, 3
- br i1 %cmp, label %if.then, label %if.end
-
-if.then: ; preds = %entry
- %mul2 = mul nsw i64 %d, %c
- store i64 %mul2, i64* @g2, align 8
- br label %if.end
-
-; Check phi-translate works and mul is removed.
-; CHECK-LABEL: @test1(
-; CHECK: if.end:
-; CHECK: %[[MULPHI:.*]] = phi i64 [ {{.*}}, %if.then ], [ %mul, %entry ]
-; CHECK-NOT: = mul
-; CHECK: store i64 %[[MULPHI]], i64* @g3, align 8
-if.end: ; preds = %if.then, %entry
- %b.addr.0 = phi i64 [ %d, %if.then ], [ %b, %entry ]
- %a.addr.0 = phi i64 [ %c, %if.then ], [ %a, %entry ]
- %mul3 = mul nsw i64 %a.addr.0, %b.addr.0
- store i64 %mul3, i64* @g3, align 8
- ret void
-}
-
-define void @test2(i64 %i) {
-entry:
- %arrayidx = getelementptr inbounds [100 x i64], [100 x i64]* @a, i64 0, i64 %i
- %t0 = load i64, i64* %arrayidx, align 8
- %arrayidx1 = getelementptr inbounds [100 x i64], [100 x i64]* @b, i64 0, i64 %i
- %t1 = load i64, i64* %arrayidx1, align 8
- %mul = mul nsw i64 %t1, %t0
- store i64 %mul, i64* @g1, align 8
- %cmp = icmp sgt i64 %mul, 3
- br i1 %cmp, label %if.then, label %if.end
-
-; Check phi-translate works for the phi generated by loadpre. A new mul will be
-; inserted in if.then block.
-; CHECK-LABEL: @test2(
-; CHECK: if.then:
-; CHECK: %[[MUL_THEN:.*]] = mul
-; CHECK: br label %if.end
-if.then: ; preds = %entry
- %call = tail call i64 (...) @goo() #2
- store i64 %call, i64* @g2, align 8
- br label %if.end
-
-; CHECK: if.end:
-; CHECK: %[[MULPHI:.*]] = phi i64 [ %[[MUL_THEN]], %if.then ], [ %mul, %entry ]
-; CHECK-NOT: = mul
-; CHECK: store i64 %[[MULPHI]], i64* @g3, align 8
-if.end: ; preds = %if.then, %entry
- %i.addr.0 = phi i64 [ 3, %if.then ], [ %i, %entry ]
- %arrayidx3 = getelementptr inbounds [100 x i64], [100 x i64]* @a, i64 0, i64 %i.addr.0
- %t2 = load i64, i64* %arrayidx3, align 8
- %arrayidx4 = getelementptr inbounds [100 x i64], [100 x i64]* @b, i64 0, i64 %i.addr.0
- %t3 = load i64, i64* %arrayidx4, align 8
- %mul5 = mul nsw i64 %t3, %t2
- store i64 %mul5, i64* @g3, align 8
- ret void
-}
-
-; Check phi-translate doesn't go through backedge, which may lead to incorrect
-; pre transformation.
-; CHECK: for.end:
-; CHECK-NOT: %{{.*pre-phi}} = phi
-; CHECK: ret void
-define void @test3(i64 %N, i64* nocapture readonly %a) {
-entry:
- br label %for.cond
-
-for.cond: ; preds = %for.body, %entry
- %i.0 = phi i64 [ 0, %entry ], [ %add, %for.body ]
- %add = add nuw nsw i64 %i.0, 1
- %arrayidx = getelementptr inbounds i64, i64* %a, i64 %add
- %tmp0 = load i64, i64* %arrayidx, align 8
- %cmp = icmp slt i64 %i.0, %N
- br i1 %cmp, label %for.body, label %for.end
-
-for.body: ; preds = %for.cond
- %call = tail call i64 (...) @goo() #2
- %add1 = sub nsw i64 0, %call
- %tobool = icmp eq i64 %tmp0, %add1
- br i1 %tobool, label %for.cond, label %for.end
-
-for.end: ; preds = %for.body, %for.cond
- %i.0.lcssa = phi i64 [ %i.0, %for.body ], [ %i.0, %for.cond ]
- %arrayidx2 = getelementptr inbounds i64, i64* %a, i64 %i.0.lcssa
- %tmp1 = load i64, i64* %arrayidx2, align 8
- store i64 %tmp1, i64* @g1, align 8
- ret void
-}
-
diff --git a/test/Transforms/GVN/PRE/pre-gep-load.ll b/test/Transforms/GVN/PRE/pre-gep-load.ll
index 1b2b4d20d31d..9eec8bb6455b 100644
--- a/test/Transforms/GVN/PRE/pre-gep-load.ll
+++ b/test/Transforms/GVN/PRE/pre-gep-load.ll
@@ -37,7 +37,7 @@ sw.bb2: ; preds = %if.end, %entry
 %3 = load double, double* %arrayidx5, align 8
; CHECK: sw.bb2:
; CHECK-NOT: sext
-; CHECK: phi double [
+; CHECK-NEXT: phi double [
; CHECK-NOT: load
 %sub6 = fsub double 3.000000e+00, %3
 br label %return
diff --git a/test/Transforms/GVN/PRE/pre-load.ll b/test/Transforms/GVN/PRE/pre-load.ll
index ffff2b7f08e5..685df24f62b6 100644
--- a/test/Transforms/GVN/PRE/pre-load.ll
+++ b/test/Transforms/GVN/PRE/pre-load.ll
@@ -72,7 +72,7 @@ block4:
 %PRE = load i32, i32* %P3
 ret i32 %PRE
; CHECK: block4:
-; CHECK: phi i32 [
+; CHECK-NEXT: phi i32 [
; CHECK-NOT: load
; CHECK: ret i32
}
@@ -104,7 +104,7 @@ block4:
 %PRE = load i32, i32* %P3
 ret i32 %PRE
; CHECK: block4:
-; CHECK: phi i32 [
+; CHECK-NEXT: phi i32 [
; CHECK-NOT: load
; CHECK: ret i32
}
@@ -263,7 +263,7 @@ block4:
 %PRE = load i32, i32* %P3
 ret i32 %PRE
; CHECK: block4:
-; CHECK: phi i32 [
+; CHECK-NEXT: phi i32 [
; CHECK-NOT: load
; CHECK: ret i32
}
diff --git a/test/Transforms/Inline/AArch64/gep-cost.ll b/test/Transforms/Inline/AArch64/gep-cost.ll
index 204958f082dd..7d191d37f1fc 100644
--- a/test/Transforms/Inline/AArch64/gep-cost.ll
+++ b/test/Transforms/Inline/AArch64/gep-cost.ll
@@ -4,11 +4,21 @@
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"
-define void @outer([4 x i32]* %ptr, i32 %i) {
+define void @outer1([4 x i32]* %ptr, i32 %i) {
 call void @inner1([4 x i32]* %ptr, i32 %i)
+ ret void
+}
+
+define void @outer2([4 x i32]* %ptr, i32 %i) {
 call void @inner2([4 x i32]* %ptr, i32 %i)
 ret void
}
+
+define void @outer3([4 x i32]* %ptr, i32 %j) {
+ call void @inner3([4 x i32]* %ptr, i32 0, i32 %j)
+ ret void
+}
+
; The gep in inner1() is reg+reg, which is a legal addressing mode for AArch64.
; Thus, both the gep and ret can be simplified.
; CHECK: Analyzing call of inner1
@@ -19,7 +29,7 @@ define void @inner1([4 x i32]* %ptr, i32 %i) {
 ret void
}
-; The gep in inner2() is reg+imm+reg, which is not a legal addressing mode for 
+; The gep in inner2() is reg+imm+reg, which is not a legal addressing mode for
; AArch64. Thus, only the ret can be simplified and not the gep.
; CHECK: Analyzing call of inner2
; CHECK: NumInstructionsSimplified: 1
@@ -28,3 +38,14 @@ define void @inner2([4 x i32]* %ptr, i32 %i) {
 %G = getelementptr inbounds [4 x i32], [4 x i32]* %ptr, i32 1, i32 %i
 ret void
}
+
+; The gep in inner3() is reg+reg because %i is a known constant from the
+; callsite. This case is a legal addressing mode for AArch64. Thus, both the
+; gep and ret can be simplified.
+; CHECK: Analyzing call of inner3
+; CHECK: NumInstructionsSimplified: 2
+; CHECK: NumInstructions: 2
+define void @inner3([4 x i32]* %ptr, i32 %i, i32 %j) {
+ %G = getelementptr inbounds [4 x i32], [4 x i32]* %ptr, i32 %i, i32 %j
+ ret void
+}
diff --git a/test/Transforms/InstCombine/ctpop.ll b/test/Transforms/InstCombine/ctpop.ll
index 6bc6f9731979..d49a907ffce1 100644
--- a/test/Transforms/InstCombine/ctpop.ll
+++ b/test/Transforms/InstCombine/ctpop.ll
@@ -52,3 +52,19 @@ define i1 @test4(i8 %arg) {
 %res = icmp eq i8 %cnt, 2
 ret i1 %res
}
+
+; Test when the number of possible known bits isn't one less than a power of 2
+; and the compare value is greater but less than the next power of 2.
+; TODO: The icmp is unnecessary given the known bits of the input.
+define i1 @test5(i32 %arg) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[ARG:%.*]], 3
+; CHECK-NEXT: [[CNT:%.*]] = call i32 @llvm.ctpop.i32(i32 [[AND]])
+; CHECK-NEXT: [[RES:%.*]] = icmp eq i32 [[CNT]], 3
+; CHECK-NEXT: ret i1 [[RES]]
+;
+ %and = and i32 %arg, 3
+ %cnt = call i32 @llvm.ctpop.i32(i32 %and)
+ %res = icmp eq i32 %cnt, 3
+ ret i1 %res
+}
diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll
index 5654b265da58..78c98955353e 100644
--- a/test/Transforms/InstCombine/intrinsics.ll
+++ b/test/Transforms/InstCombine/intrinsics.ll
@@ -305,6 +305,20 @@ define i1 @cttz_knownbits2(i32 %arg) {
 ret i1 %res
}
+
+; TODO: The icmp is unnecessary given the known bits of the input.
+define i1 @cttz_knownbits3(i32 %arg) {
+; CHECK-LABEL: @cttz_knownbits3(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[ARG:%.*]], 4
+; CHECK-NEXT: [[CNT:%.*]] = call i32 @llvm.cttz.i32(i32 [[OR]], i1 true) #2
+; CHECK-NEXT: [[RES:%.*]] = icmp eq i32 [[CNT]], 3
+; CHECK-NEXT: ret i1 [[RES]]
+;
+ %or = or i32 %arg, 4
+ %cnt = call i32 @llvm.cttz.i32(i32 %or, i1 true) nounwind readnone
+ %res = icmp eq i32 %cnt, 3
+ ret i1 %res
+}
+
define i8 @ctlz(i8 %a) {
; CHECK-LABEL: @ctlz(
; CHECK-NEXT: ret i8 2
@@ -338,6 +352,20 @@ define i1 @ctlz_knownbits2(i8 %arg) {
 ret i1 %res
}
+
+; TODO: The icmp is unnecessary given the known bits of the input.
+define i1 @ctlz_knownbits3(i8 %arg) {
+; CHECK-LABEL: @ctlz_knownbits3(
+; CHECK-NEXT: [[OR:%.*]] = or i8 [[ARG:%.*]], 32
+; CHECK-NEXT: [[CNT:%.*]] = call i8 @llvm.ctlz.i8(i8 [[OR]], i1 true) #2
+; CHECK-NEXT: [[RES:%.*]] = icmp eq i8 [[CNT]], 3
+; CHECK-NEXT: ret i1 [[RES]]
+;
+ %or = or i8 %arg, 32
+ %cnt = call i8 @llvm.ctlz.i8(i8 %or, i1 true) nounwind readnone
+ %res = icmp eq i8 %cnt, 3
+ ret i1 %res
+}
+
define void @cmp.simplify(i32 %a, i32 %b, i1* %c) {
 %lz = tail call i32 @llvm.ctlz.i32(i32 %a, i1 false) nounwind readnone
 %lz.cmp = icmp eq i32 %lz, 32
diff --git a/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll b/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
new file mode 100644
index 000000000000..247ea35ff5d0
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
@@ -0,0 +1,49 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: all_scalar
+; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions
+;
+define void @all_scalar(i64* %a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr i64, i64* %a, i64 %i
+ store i64 0, i64* %tmp0, align 1
+ %i.next = add nuw nsw i64 %i, 2
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: PR33193
+; CHECK: LV: Found scalar instruction: %i.next = zext i32 %j.next to i64
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64
+; CHECK: LV: Not considering vector loop of width 8 because it will not generate any vector instructions
+%struct.a = type { i32, i8 }
+define void @PR33193(%struct.a* %a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %j = phi i32 [ 0, %entry ], [ %j.next, %for.body ]
+ %tmp0 = getelementptr inbounds %struct.a, %struct.a* %a, i64 %i, i32 1
+ store i8 0, i8* %tmp0, align 4
+ %j.next = add i32 %j, 1
+ %i.next = zext i32 %j.next to i64
+ %cond = icmp ugt i64 %n, %i.next
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
diff --git a/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll b/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll
new file mode 100644
index 000000000000..736ddc32856c
--- /dev/null
+++ b/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll
@@ -0,0 +1,104 @@
+; RUN: opt -lower-expect -S -o - < %s | FileCheck %s
+; RUN: opt -S -passes='function(lower-expect)' < %s | FileCheck %s
+
+define i32 @foo(i32 %arg) #0 {
+; CHECK-LABEL: @foo(i32{{.*}})
+bb:
+ %tmp = sext i32 %arg to i64
+ %tmp1 = call i64 @llvm.expect.i64(i64 %tmp, i64 4)
+ %tmp2 = icmp ne i64 %tmp1, 0
+ br i1 %tmp2, label %bb3, label %bb5
+; CHECK: br i1 %tmp2{{.*}}!prof [[LIKELY:![0-9]+]]
+
+bb3: ; preds = %bb
+ %tmp4 = call i32 (...) @bar()
+ br label %bb5
+
+bb5: ; preds = %bb3, %bb
+ ret i32 1
+}
+
+define i32 @foo2(i32 %arg) #0 {
+; CHECK-LABEL: @foo2
+bb:
+ %tmp = sext i32 %arg to i64
+ %tmp1 = call i64 @llvm.expect.i64(i64 %tmp, i64 4)
+ %tmp2 = icmp eq i64 %tmp1, 2
+ br i1 %tmp2, label %bb3, label %bb5
+; CHECK: br i1 %tmp2{{.*}}!prof [[UNLIKELY:![0-9]+]]
+
+bb3: ; preds = %bb
+ %tmp4 = call i32 (...) @bar()
+ br label %bb5
+
+bb5: ; preds = %bb3, %bb
+ ret i32 1
+}
+
+define i32 @foo3(i32 %arg) #0 {
+; CHECK-LABEL: @foo3
+bb:
+ %tmp = sext i32 %arg to i64
+ %tmp1 = call i64 @llvm.expect.i64(i64 %tmp, i64 4)
+ %tmp2 = icmp eq i64 %tmp1, 4
+ br i1 %tmp2, label %bb3, label %bb5
+; CHECK: br i1 %tmp2{{.*}}!prof [[LIKELY]]
+
+bb3: ; preds = %bb
+ %tmp4 = call i32 (...) @bar()
+ br label %bb5
+
+bb5: ; preds = %bb3, %bb
+ ret i32 1
+}
+
+define i32 @foo4(i32 %arg) #0 {
+; CHECK-LABEL: @foo4
+bb:
+ %tmp = sext i32 %arg to i64
+ %tmp1 = call i64 @llvm.expect.i64(i64 %tmp, i64 4)
+ %tmp2 = icmp ne i64 %tmp1, 2
+ br i1 %tmp2, label %bb3, label %bb5
+; CHECK: br i1 %tmp2{{.*}}!prof [[LIKELY]]
+
+bb3: ; preds = %bb
+ %tmp4 = call i32 (...) @bar()
+ br label %bb5
+
+bb5: ; preds = %bb3, %bb
+ ret i32 1
+}
+
+define i32 @foo5(i32 %arg, i32 %arg1) #0 {
+; CHECK-LABEL: @foo5
+bb:
+ %tmp = sext i32 %arg1 to i64
+ %tmp2 = call i64 @llvm.expect.i64(i64 %tmp, i64 4)
+ %tmp3 = sext i32 %arg to i64
+ %tmp4 = icmp ne i64 %tmp2, %tmp3
+ br i1 %tmp4, label %bb5, label %bb7
+; CHECK-NOT: !prof
+
+bb5: ; preds = %bb
+ %tmp6 = call i32 (...) @bar()
+ br label %bb7
+
+bb7: ; preds = %bb5, %bb
+ ret i32 1
+}
+
+declare i64 @llvm.expect.i64(i64, i64) #1
+
+declare i32 @bar(...) local_unnamed_addr #0
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 304373)"}
+; CHECK: [[LIKELY]] = !{!"branch_weights", i32 2000, i32 1}
+; CHECK: [[UNLIKELY]] = !{!"branch_weights", i32 1, i32 2000}
+
diff --git a/test/Transforms/NewGVN/completeness.ll b/test/Transforms/NewGVN/completeness.ll
index bafe5f966d22..2b28f12df9d1 100644
--- a/test/Transforms/NewGVN/completeness.ll
+++ b/test/Transforms/NewGVN/completeness.ll
@@ -389,6 +389,23 @@ bb6: ; preds = %bb6, %bb2
;; Ensure that we revisit predicateinfo operands at the right points in time.
define void @test10() { +; CHECK-LABEL: @test10( +; CHECK-NEXT: b: +; CHECK-NEXT: br label [[G:%.*]] +; CHECK: g: +; CHECK-NEXT: [[N:%.*]] = phi i32* [ [[H:%.*]], [[I:%.*]] ], [ null, [[B:%.*]] ] +; CHECK-NEXT: [[H]] = getelementptr i32, i32* [[N]], i64 1 +; CHECK-NEXT: [[J:%.*]] = icmp eq i32* [[H]], getelementptr (i32, i32* null, i64 8) +; CHECK-NEXT: br i1 [[J]], label [[C:%.*]], label [[I]] +; CHECK: i: +; CHECK-NEXT: br i1 undef, label [[K:%.*]], label [[G]] +; CHECK: k: +; CHECK-NEXT: br i1 false, label [[C]], label [[O:%.*]] +; CHECK: o: +; CHECK-NEXT: br label [[C]] +; CHECK: c: +; CHECK-NEXT: ret void +; b: %m = getelementptr i32, i32* null, i64 8 br label %g diff --git a/test/Transforms/NewGVN/pr33185.ll b/test/Transforms/NewGVN/pr33185.ll new file mode 100644 index 000000000000..c687d8fe51eb --- /dev/null +++ b/test/Transforms/NewGVN/pr33185.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -newgvn -S %s | FileCheck %s + +@a = local_unnamed_addr global i32 9, align 4 +@.str4 = private unnamed_addr constant [6 x i8] c"D:%d\0A\00", align 1 + +define i32 @main() local_unnamed_addr { +; CHECK-LABEL: @main( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* @a, align 4 +; CHECK-NEXT: [[CMP1_I:%.*]] = icmp ne i32 [[TMP]], 0 +; CHECK-NEXT: br label [[FOR_BODY_I:%.*]] +; CHECK: for.body.i: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[COND_END_I:%.*]] ] +; CHECK-NEXT: [[F_08_I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[COND_END_I]] ] +; CHECK-NEXT: [[MUL_I:%.*]] = select i1 [[CMP1_I]], i32 [[F_08_I]], i32 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[COND_END_I]], label [[COND_TRUE_I:%.*]] +; CHECK: cond.true.i: +; CHECK-NEXT: [[DIV_I:%.*]] = udiv i32 [[MUL_I]], [[F_08_I]] +; CHECK-NEXT: br label [[COND_END_I]] +; CHECK: cond.end.i: +; CHECK-NEXT: [[COND_I:%.*]] = phi i32 [ [[DIV_I]], [[COND_TRUE_I]] ], [ 0, [[FOR_BODY_I]] ] +; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[F_08_I]], 1 +; CHECK-NEXT: [[EXITCOND_I:%.*]] = icmp eq i32 [[INC_I]], 4 +; CHECK-NEXT: br i1 [[EXITCOND_I]], label [[FN1_EXIT:%.*]], label [[FOR_BODY_I]] +; CHECK: fn1.exit: +; CHECK-NEXT: [[CALL4:%.*]] = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str4, i64 0, i64 0), i32 [[COND_I]]) +; CHECK-NEXT: ret i32 0 +; +entry: + %tmp = load i32, i32* @a, align 4 + %cmp1.i = icmp ne i32 %tmp, 0 + br label %for.body.i + +for.body.i: + %tmp1 = phi i1 [ true, %entry ], [ false, %cond.end.i ] + %f.08.i = phi i32 [ 0, %entry ], [ %inc.i, %cond.end.i ] + %mul.i = select i1 %cmp1.i, i32 %f.08.i, i32 0 + br i1 %tmp1, label %cond.end.i, label %cond.true.i + +cond.true.i: + ;; Ensure we don't replace this divide with a phi of ops that merges the wrong loop iteration value + %div.i = udiv i32 %mul.i, %f.08.i + br label %cond.end.i + +cond.end.i: + %cond.i = phi i32 [ %div.i, %cond.true.i ], [ 0, %for.body.i ] + %inc.i = add nuw nsw i32 %f.08.i, 1 + %exitcond.i = icmp eq i32 %inc.i, 4 + br i1 %exitcond.i, label %fn1.exit, label %for.body.i + +fn1.exit: + %cond.i.lcssa = phi i32 [ %cond.i, %cond.end.i ] + %call4= tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str4, i64 0, i64 0), i32 %cond.i.lcssa) + ret i32 0 +} + +declare i32 @printf(i8* nocapture readonly, ...) 
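The guard comment inside pr33185.ll above is worth making concrete. NewGVN's phi-of-ops optimization can try to evaluate %div.i by building a phi of its translated operands; if it picks up %mul.i and %f.08.i from the wrong loop iteration, it can manufacture a division by zero that the original program never executes. A rough C++ analog of the test (hypothetical, names chosen to mirror the IR, for illustration only):

    // Sketch of the control flow in pr33185.ll: the udiv is reached only on
    // iterations where f != 0, so it must not be rebuilt from the previous
    // iteration's operands (where f may be 0).
    #include <cstdio>

    int a = 9; // mirrors @a

    int main() {
      int cond = 0;
      bool first = true;                // %tmp1 is true only on iteration 0
      for (int f = 0; f != 4; ++f) {    // %f.08.i counts 0..3
        int mul = (a != 0) ? f : 0;     // %mul.i
        if (first)
          cond = 0;                     // the %for.body.i incoming value
        else
          cond = mul / f;               // %div.i: f is guaranteed non-zero here
        first = false;
      }
      std::printf("D:%d\n", cond);      // prints "D:1" when a is non-zero
      return 0;
    }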
+ diff --git a/test/Transforms/PGOProfile/branch1.ll b/test/Transforms/PGOProfile/branch1.ll index 3db7566d5078..f675b1f1a011 100644 --- a/test/Transforms/PGOProfile/branch1.ll +++ b/test/Transforms/PGOProfile/branch1.ll @@ -15,6 +15,9 @@ ; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE ; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.l.profdata -S | FileCheck %s --check-prefix=USE-LARGE +; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -pass-remarks-analysis=pgo-use-annot -pgo-emit-branch-prob -S 2>&1| FileCheck %s --check-prefix=ANALYSIS +; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -pass-remarks-analysis=pgo-use-annot -pgo-emit-branch-prob -S 2>&1| FileCheck %s --check-prefix=ANALYSIS + target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" ; GEN-DARWIN-LINKONCE: target triple = "x86_64-apple-darwin" @@ -54,3 +57,5 @@ if.end: ; USE-DAG: {{![0-9]+}} = !{i32 1, !"ProfileSummary", {{![0-9]+}}} ; USE-DAG: {{![0-9]+}} = !{!"DetailedSummary", {{![0-9]+}}} ; USE-DAG: ![[FUNC_ENTRY_COUNT]] = !{!"function_entry_count", i64 3} + +; ANALYSIS:remark: <unknown>:0:0: sgt_i32_Zero {{.*}}66.67% (total count : 3) diff --git a/test/Transforms/ThinLTOBitcodeWriter/new-pm.ll b/test/Transforms/ThinLTOBitcodeWriter/new-pm.ll new file mode 100644 index 000000000000..03facd072b34 --- /dev/null +++ b/test/Transforms/ThinLTOBitcodeWriter/new-pm.ll @@ -0,0 +1,9 @@ +; RUN: opt -passes='no-op-module' -debug-pass-manager -thinlto-bc -thin-link-bitcode-file=%t2 -o %t %s 2>&1 | FileCheck %s --check-prefix=DEBUG_PM +; RUN: llvm-bcanalyzer -dump %t2 | FileCheck %s --check-prefix=BITCODE + +; DEBUG_PM: ThinLTOBitcodeWriterPass +; BITCODE: Foo + +define void @Foo() { + ret void +} diff --git a/test/Transforms/Util/PredicateInfo/condprop.ll b/test/Transforms/Util/PredicateInfo/condprop.ll index 79c76baa6f61..61f59f03e1bc 100644 --- a/test/Transforms/Util/PredicateInfo/condprop.ll +++ b/test/Transforms/Util/PredicateInfo/condprop.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -print-predicateinfo -analyze < %s 2>&1 | FileCheck %s +; RUN: opt -print-predicateinfo -analyze -reverse-iterate < %s 2>&1 | FileCheck %s @a = external global i32 ; <i32*> [#uses=7] @@ -98,10 +99,10 @@ define void @test3(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] -; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) ; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) -; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) ; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]]) +; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) +; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) ; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]]) ; CHECK-NEXT: br i1 [[Z]], label [[BOTH_ZERO:%.*]], label [[NOPE:%.*]] ; CHECK: both_zero: @@ -382,8 +383,8 @@ ret: define i32 @test10(i32 %j, i32 %i) { ; CHECK-LABEL: @test10( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]] -; CHECK: [[I_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[I]]) ; CHECK: [[J_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[J]]) +; CHECK: [[I_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[I]]) ; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]] ; CHECK: cond_true: ; 
CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]] diff --git a/test/Transforms/Util/PredicateInfo/testandor.ll b/test/Transforms/Util/PredicateInfo/testandor.ll index 5942ed155318..43c508670908 100644 --- a/test/Transforms/Util/PredicateInfo/testandor.ll +++ b/test/Transforms/Util/PredicateInfo/testandor.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -print-predicateinfo < %s 2>&1 | FileCheck %s +; RUN: opt -print-predicateinfo -reverse-iterate < %s 2>&1 | FileCheck %s declare void @foo(i1) declare void @bar(i32) @@ -10,10 +11,10 @@ define void @testor(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = or i1 [[XZ]], [[YZ]] -; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) ; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) -; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) ; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]]) +; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) +; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) ; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]]) ; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER:%.*]] ; CHECK: oneof: @@ -54,10 +55,10 @@ define void @testand(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] -; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) ; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) -; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) ; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]]) +; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) +; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) ; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]]) ; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]] ; CHECK: both: @@ -98,9 +99,9 @@ define void @testandsame(i32 %x, i32 %y) { ; CHECK-NEXT: [[XGT:%.*]] = icmp sgt i32 [[X:%.*]], 0 ; CHECK-NEXT: [[XLT:%.*]] = icmp slt i32 [[X]], 100 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XGT]], [[XLT]] -; CHECK: [[XGT_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XGT]]) ; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) ; CHECK: [[X_0_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X_0]]) +; CHECK: [[XGT_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XGT]]) ; CHECK: [[XLT_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XLT]]) ; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]]) ; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]] @@ -136,23 +137,23 @@ define void @testandassume(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] -; CHECK: [[TMP1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) -; CHECK: [[TMP2:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) -; CHECK: [[TMP3:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) -; CHECK: [[TMP4:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]]) +; CHECK: [[TMP1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) +; CHECK: [[TMP2:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]]) +; CHECK: [[TMP3:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) +; CHECK: [[TMP4:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) ; CHECK: [[TMP5:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]]) ; CHECK-NEXT: call void @llvm.assume(i1 [[TMP5]]) -; CHECK: [[DOT0:%.*]] = call i1 
@llvm.ssa.copy.i1(i1 [[TMP1]]) +; CHECK: [[DOT0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[TMP1]]) ; CHECK: [[DOT01:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[TMP2]]) ; CHECK: [[DOT02:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP3]]) -; CHECK: [[DOT03:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[TMP4]]) +; CHECK: [[DOT03:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP4]]) ; CHECK: [[DOT04:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP5]]) ; CHECK-NEXT: br i1 [[TMP5]], label [[BOTH:%.*]], label [[NOPE:%.*]] ; CHECK: both: -; CHECK-NEXT: call void @foo(i1 [[DOT0]]) ; CHECK-NEXT: call void @foo(i1 [[DOT02]]) +; CHECK-NEXT: call void @foo(i1 [[DOT03]]) +; CHECK-NEXT: call void @bar(i32 [[DOT0]]) ; CHECK-NEXT: call void @bar(i32 [[DOT01]]) -; CHECK-NEXT: call void @bar(i32 [[DOT03]]) ; CHECK-NEXT: ret void ; CHECK: nope: ; CHECK-NEXT: call void @foo(i1 [[DOT04]]) diff --git a/test/tools/llvm-config/cflags.test b/test/tools/llvm-config/cflags.test index ef3e486bd968..461de86b64c0 100644 --- a/test/tools/llvm-config/cflags.test +++ b/test/tools/llvm-config/cflags.test @@ -4,4 +4,4 @@ RUN: llvm-config --cxxflags 2>&1 | FileCheck %s CHECK: -I CHECK: {{[/\\]}}include CHECK-NOT: error: -CHECK-NOT: warning +CHECK-NOT: warning: diff --git a/test/tools/llvm-cvtres/Inputs/test_resource.rc b/test/tools/llvm-cvtres/Inputs/test_resource.rc index fd616520dbe1..5ca097baa0f7 100644 --- a/test/tools/llvm-cvtres/Inputs/test_resource.rc +++ b/test/tools/llvm-cvtres/Inputs/test_resource.rc @@ -42,3 +42,9 @@ LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_AUS MENUITEM "salad", 101
MENUITEM "duck", 102
}
+
+
+myresource stringarray {
+ "this is a user defined resource\0",
+ "it contains many strings\0",
+}
\ No newline at end of file diff --git a/test/tools/llvm-cvtres/Inputs/test_resource.res b/test/tools/llvm-cvtres/Inputs/test_resource.res Binary files differ index c577ecc3d633..d422bb4904da 100644 --- a/test/tools/llvm-cvtres/Inputs/test_resource.res +++ b/test/tools/llvm-cvtres/Inputs/test_resource.res diff --git a/test/tools/llvm-cvtres/resource.test b/test/tools/llvm-cvtres/resource.test index 16970343c60d..b9be74bf671b 100644 --- a/test/tools/llvm-cvtres/resource.test +++ b/test/tools/llvm-cvtres/resource.test @@ -4,4 +4,48 @@ RUN: llvm-cvtres %p/Inputs/test_resource.res | FileCheck %s -CHECK: Number of resources: 7 +CHECK: Number of resources: 8 +CHECK-NEXT: Resource Tree [ +CHECK-NEXT: STRINGARRAY [ +CHECK-NEXT: MYRESOURCE [ +CHECK-NEXT: 1033 [ +CHECK-NEXT: ] +CHECK-NEXT: ] +CHECK-NEXT: ] +CHECK-NEXT: 2 [ +CHECK-NEXT: CURSOR [ +CHECK-NEXT: 1033 [ +CHECK-NEXT: ] +CHECK-NEXT: ] +CHECK-NEXT: OKAY [ +CHECK-NEXT: 1033 [ +CHECK-NEXT: ] +CHECK-NEXT: ] +CHECK-NEXT: ] +CHECK-NEXT: 4 [ +CHECK-NEXT: "EAT" [ +CHECK-NEXT: 3081 [ +CHECK-NEXT: ] +CHECK-NEXT: ] +CHECK-NEXT: 14432 [ +CHECK-NEXT: 2052 [ +CHECK-NEXT: ] +CHECK-NEXT: ] +CHECK-NEXT: ] +CHECK-NEXT: 5 [ +CHECK-NEXT: TESTDIALOG [ +CHECK-NEXT: 1033 [ +CHECK-NEXT: ] +CHECK-NEXT: ] +CHECK-NEXT: ] +CHECK-NEXT: 9 [ +CHECK-NEXT: MYACCELERATORS [ +CHECK-NEXT: 1033 [ +CHECK-NEXT: ] +CHECK-NEXT: ] +CHECK-NEXT: 12 [ +CHECK-NEXT: 1033 [ +CHECK-NEXT: ] +CHECK-NEXT: ] +CHECK-NEXT: ] +CHECK-NEXT: ] diff --git a/tools/bugpoint/OptimizerDriver.cpp b/tools/bugpoint/OptimizerDriver.cpp index 246580c8bdbe..ae3a31adaea3 100644 --- a/tools/bugpoint/OptimizerDriver.cpp +++ b/tools/bugpoint/OptimizerDriver.cpp @@ -47,14 +47,13 @@ static cl::opt<bool> PreserveBitcodeUseListOrder( cl::desc("Preserve use-list order when writing LLVM bitcode."), cl::init(true), cl::Hidden); -namespace { // ChildOutput - This option captures the name of the child output file that // is set up by the parent bugpoint process -cl::opt<std::string> ChildOutput("child-output", cl::ReallyHidden); -cl::opt<std::string> OptCmd("opt-command", cl::init(""), - cl::desc("Path to opt. (default: search path " - "for 'opt'.)")); -} +static cl::opt<std::string> ChildOutput("child-output", cl::ReallyHidden); +static cl::opt<std::string> + OptCmd("opt-command", cl::init(""), + cl::desc("Path to opt. (default: search path " + "for 'opt'.)")); /// writeProgramToFile - This writes the current "Program" to the named bitcode /// file. If an error occurs, true is returned.
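The OptimizerDriver.cpp hunk above swaps an anonymous namespace around two file-local cl::opt globals for the static keyword, matching the LLVM convention that anonymous namespaces are kept small and reserved for type declarations, while file-local variables and functions get internal linkage via static. A minimal sketch of the preferred pattern (the option name here is made up for illustration):

    #include "llvm/Support/CommandLine.h"
    using namespace llvm;

    // File-local option with internal linkage: 'static' rather than wrapping
    // the definition in an anonymous namespace.
    static cl::opt<std::string>
        ExampleCmd("example-command", cl::init(""),
                   cl::desc("Path to an example tool (hypothetical option)."));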
diff --git a/tools/llvm-config/BuildVariables.inc.in b/tools/llvm-config/BuildVariables.inc.in index 0740c3f9d9f5..f201e1f7bff0 100644 --- a/tools/llvm-config/BuildVariables.inc.in +++ b/tools/llvm-config/BuildVariables.inc.in @@ -34,3 +34,4 @@ #define LLVM_DYLIB_COMPONENTS "@LLVM_DYLIB_COMPONENTS@" #define LLVM_DYLIB_VERSION "@LLVM_DYLIB_VERSION@" #define LLVM_HAS_GLOBAL_ISEL @LLVM_HAS_GLOBAL_ISEL@ +#define LLVM_TOOLS_INSTALL_DIR "@LLVM_TOOLS_INSTALL_DIR@" diff --git a/tools/llvm-config/llvm-config.cpp b/tools/llvm-config/llvm-config.cpp index 25344e4cd011..888da7143c9f 100644 --- a/tools/llvm-config/llvm-config.cpp +++ b/tools/llvm-config/llvm-config.cpp @@ -333,7 +333,9 @@ int main(int argc, char **argv) { } else { ActivePrefix = CurrentExecPrefix; ActiveIncludeDir = ActivePrefix + "/include"; - ActiveBinDir = ActivePrefix + "/bin"; + SmallString<PATH_MAX> path(StringRef(LLVM_TOOLS_INSTALL_DIR)); + sys::fs::make_absolute(ActivePrefix, path); + ActiveBinDir = path.str(); ActiveLibDir = ActivePrefix + "/lib" + LLVM_LIBDIR_SUFFIX; ActiveCMakeDir = ActiveLibDir + "/cmake/llvm"; ActiveIncludeOption = "-I" + ActiveIncludeDir; diff --git a/tools/llvm-cvtres/llvm-cvtres.cpp b/tools/llvm-cvtres/llvm-cvtres.cpp index 96f7437ab5f6..95a6623b44eb 100644 --- a/tools/llvm-cvtres/llvm-cvtres.cpp +++ b/tools/llvm-cvtres/llvm-cvtres.cpp @@ -131,7 +131,7 @@ int main(int argc_, const char *argv_[]) { std::vector<std::string> InputFiles = InputArgs.getAllArgValues(OPT_INPUT); if (InputFiles.size() == 0) { - reportError("No input file specified"); + reportError("No input file specified.\n"); } SmallString<128> OutputFile; @@ -143,6 +143,20 @@ int main(int argc_, const char *argv_[]) { llvm::sys::path::replace_extension(OutputFile, ".obj"); } + outs() << "Machine: "; + switch (Machine) { + case machine::ARM: + outs() << "ARM\n"; + break; + case machine::X86: + outs() << "X86\n"; + break; + default: + outs() << "X64\n"; + } + + WindowsResourceParser Parser; + for (const auto &File : InputFiles) { Expected<object::OwningBinary<object::Binary>> BinaryOrErr = object::createBinary(File); @@ -166,17 +180,11 @@ int main(int argc_, const char *argv_[]) { EntryNumber++; } outs() << "Number of resources: " << EntryNumber << "\n"; + + error(Parser.parse(RF)); } - outs() << "Machine: "; - switch (Machine) { - case machine::ARM: - outs() << "ARM\n"; - break; - case machine::X86: - outs() << "X86\n"; - break; - default: - outs() << "X64\n"; - } + + Parser.printTree(); + return 0; } diff --git a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp index 8ecf18480994..2b5babe79824 100644 --- a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp +++ b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp @@ -95,8 +95,12 @@ static void DumpObjectFile(ObjectFile &Obj, Twine Filename) { outs() << Filename.str() << ":\tfile format " << Obj.getFileFormatName() << "\n\n"; + // Dump the complete DWARF structure. 
- DICtx->dump(outs(), DumpType, false, SummarizeTypes); + DIDumpOptions DumpOpts; + DumpOpts.DumpType = DumpType; + DumpOpts.SummarizeTypes = SummarizeTypes; + DICtx->dump(outs(), DumpOpts); } static void DumpInput(StringRef Filename) { diff --git a/tools/llvm-objdump/MachODump.cpp b/tools/llvm-objdump/MachODump.cpp index 9e02951a4a93..a260d6ff42c5 100644 --- a/tools/llvm-objdump/MachODump.cpp +++ b/tools/llvm-objdump/MachODump.cpp @@ -1271,7 +1271,10 @@ static void ProcessMachO(StringRef Name, MachOObjectFile *MachOOF, if (DwarfDumpType != DIDT_Null) { std::unique_ptr<DIContext> DICtx(new DWARFContextInMemory(*MachOOF)); // Dump the complete DWARF structure. - DICtx->dump(outs(), DwarfDumpType, true /* DumpEH */); + DIDumpOptions DumpOpts; + DumpOpts.DumpType = DwarfDumpType; + DumpOpts.DumpEH = true; + DICtx->dump(outs(), DumpOpts); } } diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp index 18fa0e074084..6a9151570908 100644 --- a/tools/llvm-objdump/llvm-objdump.cpp +++ b/tools/llvm-objdump/llvm-objdump.cpp @@ -2064,7 +2064,10 @@ static void DumpObject(ObjectFile *o, const Archive *a = nullptr) { if (DwarfDumpType != DIDT_Null) { std::unique_ptr<DIContext> DICtx(new DWARFContextInMemory(*o)); // Dump the complete DWARF structure. - DICtx->dump(outs(), DwarfDumpType, true /* DumpEH */); + DIDumpOptions DumpOpts; + DumpOpts.DumpType = DwarfDumpType; + DumpOpts.DumpEH = true; + DICtx->dump(outs(), DumpOpts); } } diff --git a/tools/llvm-pdbdump/Analyze.cpp b/tools/llvm-pdbdump/Analyze.cpp index 3a026e5d2451..b503cdcbf1ea 100644 --- a/tools/llvm-pdbdump/Analyze.cpp +++ b/tools/llvm-pdbdump/Analyze.cpp @@ -35,7 +35,7 @@ static StringRef getLeafTypeName(TypeLeafKind LT) { #define TYPE_RECORD(ename, value, name) \ case ename: \ return #name; -#include "llvm/DebugInfo/CodeView/TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" default: break; } diff --git a/tools/llvm-pdbdump/CMakeLists.txt b/tools/llvm-pdbdump/CMakeLists.txt index 325e38c15ca7..a1f54a3bff6a 100644 --- a/tools/llvm-pdbdump/CMakeLists.txt +++ b/tools/llvm-pdbdump/CMakeLists.txt @@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS DebugInfoMSF DebugInfoPDB Object + ObjectYAML Support ) @@ -12,8 +13,6 @@ add_llvm_tool(llvm-pdbdump CompactTypeDumpVisitor.cpp Diff.cpp llvm-pdbdump.cpp - YamlSymbolDumper.cpp - YamlTypeDumper.cpp LinePrinter.cpp LLVMOutputStyle.cpp PdbYaml.cpp diff --git a/tools/llvm-pdbdump/CompactTypeDumpVisitor.cpp b/tools/llvm-pdbdump/CompactTypeDumpVisitor.cpp index 3b609ae50c1c..6dd54e0dbec1 100644 --- a/tools/llvm-pdbdump/CompactTypeDumpVisitor.cpp +++ b/tools/llvm-pdbdump/CompactTypeDumpVisitor.cpp @@ -18,7 +18,7 @@ using namespace llvm::pdb; static const EnumEntry<TypeLeafKind> LeafTypeNames[] = { #define CV_TYPE(enum, val) {#enum, enum}, -#include "llvm/DebugInfo/CodeView/TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" }; static StringRef getLeafName(TypeLeafKind K) { diff --git a/tools/llvm-pdbdump/PdbYaml.cpp b/tools/llvm-pdbdump/PdbYaml.cpp index dd32eca14c1d..e288063e2afa 100644 --- a/tools/llvm-pdbdump/PdbYaml.cpp +++ b/tools/llvm-pdbdump/PdbYaml.cpp @@ -9,22 +9,19 @@ #include "PdbYaml.h" -#include "YamlSerializationContext.h" -#include "YamlSymbolDumper.h" -#include "YamlTypeDumper.h" - #include "llvm/ADT/StringExtras.h" #include "llvm/DebugInfo/CodeView/CVSymbolVisitor.h" #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" -#include "llvm/DebugInfo/CodeView/SymbolSerializer.h" 
#include "llvm/DebugInfo/CodeView/SymbolVisitorCallbackPipeline.h" #include "llvm/DebugInfo/CodeView/TypeSerializer.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" +#include "llvm/DebugInfo/PDB/Native/RawTypes.h" #include "llvm/DebugInfo/PDB/Native/TpiHashing.h" #include "llvm/DebugInfo/PDB/PDBExtras.h" #include "llvm/DebugInfo/PDB/PDBTypes.h" +#include "llvm/ObjectYAML/CodeViewYAMLTypes.h" using namespace llvm; using namespace llvm::pdb; @@ -35,15 +32,6 @@ LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::StringRef) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::pdb::yaml::NamedStreamMapping) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::pdb::yaml::PdbDbiModuleInfo) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::pdb::yaml::PdbSourceFileChecksumEntry) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::pdb::yaml::PdbSourceLineEntry) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::pdb::yaml::PdbSourceColumnEntry) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::pdb::yaml::PdbSourceLineBlock) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::pdb::yaml::PdbSourceLineInfo) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::pdb::yaml::PdbInlineeSite) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::pdb::yaml::PdbInlineeInfo) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::pdb::yaml::PdbSymbolRecord) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::pdb::yaml::PdbTpiRecord) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::pdb::yaml::StreamBlockList) LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::pdb::PdbRaw_FeatureSig) @@ -152,57 +140,18 @@ template <> struct ScalarEnumerationTraits<llvm::pdb::PdbRaw_FeatureSig> { io.enumCase(Features, "VC140", PdbRaw_FeatureSig::VC140); } }; - -template <> struct ScalarEnumerationTraits<llvm::codeview::FileChecksumKind> { - static void enumeration(IO &io, llvm::codeview::FileChecksumKind &Kind) { - io.enumCase(Kind, "None", llvm::codeview::FileChecksumKind::None); - io.enumCase(Kind, "MD5", llvm::codeview::FileChecksumKind::MD5); - io.enumCase(Kind, "SHA1", llvm::codeview::FileChecksumKind::SHA1); - io.enumCase(Kind, "SHA256", llvm::codeview::FileChecksumKind::SHA256); - } -}; - -template <> struct ScalarBitSetTraits<llvm::codeview::LineFlags> { - static void bitset(IO &io, llvm::codeview::LineFlags &Flags) { - io.bitSetCase(Flags, "HasColumnInfo", llvm::codeview::LF_HaveColumns); - io.enumFallback<Hex16>(Flags); - } -}; -} -} - -void ScalarTraits<HexFormattedString>::output(const HexFormattedString &Value, - void *ctx, raw_ostream &Out) { - StringRef Bytes(reinterpret_cast<const char *>(Value.Bytes.data()), - Value.Bytes.size()); - Out << toHex(Bytes); } - -StringRef ScalarTraits<HexFormattedString>::input(StringRef Scalar, void *ctxt, - HexFormattedString &Value) { - std::string H = fromHex(Scalar); - Value.Bytes.assign(H.begin(), H.end()); - return StringRef(); } void MappingTraits<PdbObject>::mapping(IO &IO, PdbObject &Obj) { - // Create a single serialization context that will be passed through the - // entire process of serializing / deserializing a Tpi Stream. This is - // especially important when we are going from Pdb -> Yaml because we need - // to maintain state in a TypeTableBuilder across mappings, and at the end of - // the entire process, we need to have one TypeTableBuilder that has every - // record. 
- pdb::yaml::SerializationContext Context(IO, Obj.Allocator); - - IO.mapOptional("MSF", Obj.Headers); IO.mapOptional("StreamSizes", Obj.StreamSizes); IO.mapOptional("StreamMap", Obj.StreamMap); IO.mapOptional("StringTable", Obj.StringTable); IO.mapOptional("PdbStream", Obj.PdbStream); - IO.mapOptionalWithContext("DbiStream", Obj.DbiStream, Context); - IO.mapOptionalWithContext("TpiStream", Obj.TpiStream, Context); - IO.mapOptionalWithContext("IpiStream", Obj.IpiStream, Context); + IO.mapOptional("DbiStream", Obj.DbiStream); + IO.mapOptional("TpiStream", Obj.TpiStream); + IO.mapOptional("IpiStream", Obj.IpiStream); } void MappingTraits<MSFHeaders>::mapping(IO &IO, MSFHeaders &Obj) { @@ -239,7 +188,7 @@ void MappingTraits<PdbInfoStream>::mapping(IO &IO, PdbInfoStream &Obj) { IO.mapOptional("Version", Obj.Version, PdbImplVC70); } -void MappingContextTraits<PdbDbiStream, pdb::yaml::SerializationContext>::mapping(IO &IO, PdbDbiStream &Obj, pdb::yaml::SerializationContext &Context) { +void MappingTraits<PdbDbiStream>::mapping(IO &IO, PdbDbiStream &Obj) { IO.mapOptional("VerHeader", Obj.VerHeader, PdbDbiV70); IO.mapOptional("Age", Obj.Age, 1U); IO.mapOptional("BuildNumber", Obj.BuildNumber, uint16_t(0U)); @@ -247,13 +196,13 @@ void MappingContextTraits<PdbDbiStream, pdb::yaml::SerializationContext>::mappin IO.mapOptional("PdbDllRbld", Obj.PdbDllRbld, uint16_t(0U)); IO.mapOptional("Flags", Obj.Flags, uint16_t(1U)); IO.mapOptional("MachineType", Obj.MachineType, PDB_Machine::x86); - IO.mapOptionalWithContext("Modules", Obj.ModInfos, Context); + IO.mapOptional("Modules", Obj.ModInfos); } -void MappingContextTraits<PdbTpiStream, pdb::yaml::SerializationContext>::mapping( - IO &IO, pdb::yaml::PdbTpiStream &Obj, pdb::yaml::SerializationContext &Context) { +void MappingTraits<PdbTpiStream>::mapping(IO &IO, + pdb::yaml::PdbTpiStream &Obj) { IO.mapOptional("Version", Obj.Version, PdbTpiV80); - IO.mapRequired("Records", Obj.Records, Context); + IO.mapRequired("Records", Obj.Records); } void MappingTraits<NamedStreamMapping>::mapping(IO &IO, @@ -262,134 +211,15 @@ void MappingTraits<NamedStreamMapping>::mapping(IO &IO, IO.mapRequired("StreamNum", Obj.StreamNumber); } -void MappingContextTraits<PdbSymbolRecord, pdb::yaml::SerializationContext>::mapping(IO &IO, PdbSymbolRecord &Obj, pdb::yaml::SerializationContext &Context) { - codeview::SymbolVisitorCallbackPipeline Pipeline; - codeview::SymbolSerializer Serializer(Context.Allocator); - codeview::SymbolDeserializer Deserializer(nullptr); - codeview::yaml::YamlSymbolDumper Dumper(IO); - - if (IO.outputting()) { - // For PDB to Yaml, deserialize into a high level record type, then dump it. - Pipeline.addCallbackToPipeline(Deserializer); - Pipeline.addCallbackToPipeline(Dumper); - } else { - // For the other way around, dump it into a concrete structure, and then - // serialize it into the CVRecord. 
- Pipeline.addCallbackToPipeline(Dumper); - Pipeline.addCallbackToPipeline(Serializer); - } - - codeview::CVSymbolVisitor Visitor(Pipeline); - consumeError(Visitor.visitSymbolRecord(Obj.Record)); -} - -void MappingContextTraits<PdbModiStream, pdb::yaml::SerializationContext>::mapping(IO &IO, PdbModiStream &Obj, pdb::yaml::SerializationContext &Context) { +void MappingTraits<PdbModiStream>::mapping(IO &IO, PdbModiStream &Obj) { IO.mapOptional("Signature", Obj.Signature, 4U); - IO.mapRequired("Records", Obj.Symbols, Context); + IO.mapRequired("Records", Obj.Symbols); } -void MappingContextTraits<PdbDbiModuleInfo, pdb::yaml::SerializationContext>::mapping(IO &IO, PdbDbiModuleInfo &Obj, pdb::yaml::SerializationContext &Context) { +void MappingTraits<PdbDbiModuleInfo>::mapping(IO &IO, PdbDbiModuleInfo &Obj) { IO.mapRequired("Module", Obj.Mod); IO.mapOptional("ObjFile", Obj.Obj, Obj.Mod); IO.mapOptional("SourceFiles", Obj.SourceFiles); - IO.mapOptionalWithContext("LineInfo", Obj.FileLineInfo, Context); - IO.mapOptionalWithContext("Modi", Obj.Modi, Context); -} - -void MappingContextTraits<pdb::yaml::PdbSourceLineEntry, - pdb::yaml::SerializationContext>:: - mapping(IO &IO, PdbSourceLineEntry &Obj, - pdb::yaml::SerializationContext &Context) { - IO.mapRequired("Offset", Obj.Offset); - IO.mapRequired("LineStart", Obj.LineStart); - IO.mapRequired("IsStatement", Obj.IsStatement); - IO.mapRequired("EndDelta", Obj.EndDelta); -} - -void MappingContextTraits<pdb::yaml::PdbSourceColumnEntry, - pdb::yaml::SerializationContext>:: - mapping(IO &IO, PdbSourceColumnEntry &Obj, - pdb::yaml::SerializationContext &Context) { - IO.mapRequired("StartColumn", Obj.StartColumn); - IO.mapRequired("EndColumn", Obj.EndColumn); -} - -void MappingContextTraits<pdb::yaml::PdbSourceLineBlock, - pdb::yaml::SerializationContext>:: - mapping(IO &IO, PdbSourceLineBlock &Obj, - pdb::yaml::SerializationContext &Context) { - IO.mapRequired("FileName", Obj.FileName); - IO.mapRequired("Lines", Obj.Lines, Context); - IO.mapRequired("Columns", Obj.Columns, Context); -} - -void MappingContextTraits<pdb::yaml::PdbSourceFileChecksumEntry, - pdb::yaml::SerializationContext>:: - mapping(IO &IO, PdbSourceFileChecksumEntry &Obj, - pdb::yaml::SerializationContext &Context) { - IO.mapRequired("FileName", Obj.FileName); - IO.mapRequired("Kind", Obj.Kind); - IO.mapRequired("Checksum", Obj.ChecksumBytes); -} - -void MappingContextTraits<pdb::yaml::PdbSourceLineInfo, - pdb::yaml::SerializationContext>:: - mapping(IO &IO, PdbSourceLineInfo &Obj, - pdb::yaml::SerializationContext &Context) { - IO.mapRequired("CodeSize", Obj.CodeSize); - - IO.mapRequired("Flags", Obj.Flags); - IO.mapRequired("RelocOffset", Obj.RelocOffset); - IO.mapRequired("RelocSegment", Obj.RelocSegment); - IO.mapRequired("Blocks", Obj.Blocks, Context); -} - -void MappingContextTraits<pdb::yaml::PdbSourceFileInfo, - pdb::yaml::SerializationContext>:: - mapping(IO &IO, PdbSourceFileInfo &Obj, - pdb::yaml::SerializationContext &Context) { - IO.mapOptionalWithContext("Checksums", Obj.FileChecksums, Context); - IO.mapOptionalWithContext("Lines", Obj.LineFragments, Context); - IO.mapOptionalWithContext("InlineeLines", Obj.Inlinees, Context); -} - -void MappingContextTraits<PdbInlineeSite, SerializationContext>::mapping( - IO &IO, PdbInlineeSite &Obj, SerializationContext &Context) { - IO.mapRequired("FileName", Obj.FileName); - IO.mapRequired("LineNum", Obj.SourceLineNum); - IO.mapRequired("Inlinee", Obj.Inlinee); - IO.mapOptional("ExtraFiles", Obj.ExtraFiles); -} - -void 
MappingContextTraits<PdbInlineeInfo, SerializationContext>::mapping( - IO &IO, PdbInlineeInfo &Obj, SerializationContext &Context) { - IO.mapRequired("HasExtraFiles", Obj.HasExtraFiles); - IO.mapRequired("Sites", Obj.Sites, Context); -} - -void MappingContextTraits<PdbTpiRecord, pdb::yaml::SerializationContext>:: - mapping(IO &IO, pdb::yaml::PdbTpiRecord &Obj, - pdb::yaml::SerializationContext &Context) { - if (IO.outputting()) { - // For PDB to Yaml, deserialize into a high level record type, then dump it. - consumeError(codeview::visitTypeRecord(Obj.Record, Context.Dumper)); - } else { - codeview::TypeVisitorCallbackPipeline Pipeline; - codeview::TypeSerializer Serializer(Context.Allocator); - pdb::TpiHashUpdater Hasher; - // For Yaml to PDB, extract from the high level record type, then write it - // to bytes. - - // This might be interpreted as a hack, but serializing FieldList - // sub-records requires having access to the same serializer being used by - // the FieldList itself. - Context.ActiveSerializer = &Serializer; - Pipeline.addCallbackToPipeline(Context.Dumper); - Pipeline.addCallbackToPipeline(Serializer); - Pipeline.addCallbackToPipeline(Hasher); - consumeError(codeview::visitTypeRecord(Obj.Record, Pipeline, - codeview::VDS_BytesExternal)); - } - - Context.ActiveSerializer = nullptr; + IO.mapOptional("LineInfo", Obj.FileLineInfo); + IO.mapOptional("Modi", Obj.Modi); } diff --git a/tools/llvm-pdbdump/PdbYaml.h b/tools/llvm-pdbdump/PdbYaml.h index 423845caeb31..deb500ec2074 100644 --- a/tools/llvm-pdbdump/PdbYaml.h +++ b/tools/llvm-pdbdump/PdbYaml.h @@ -19,6 +19,9 @@ #include "llvm/DebugInfo/PDB/Native/PDBFile.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" #include "llvm/DebugInfo/PDB/PDBTypes.h" +#include "llvm/ObjectYAML/CodeViewYAMLDebugSections.h" +#include "llvm/ObjectYAML/CodeViewYAMLSymbols.h" +#include "llvm/ObjectYAML/CodeViewYAMLTypes.h" #include "llvm/Support/Endian.h" #include "llvm/Support/YAMLTraits.h" @@ -56,75 +59,16 @@ struct PdbInfoStream { std::vector<NamedStreamMapping> NamedStreams; }; -struct PdbSymbolRecord { - codeview::CVSymbol Record; -}; - struct PdbModiStream { uint32_t Signature; - std::vector<PdbSymbolRecord> Symbols; -}; - -struct PdbSourceLineEntry { - uint32_t Offset; - uint32_t LineStart; - uint32_t EndDelta; - bool IsStatement; -}; - -struct PdbSourceColumnEntry { - uint16_t StartColumn; - uint16_t EndColumn; -}; - -struct PdbSourceLineBlock { - StringRef FileName; - std::vector<PdbSourceLineEntry> Lines; - std::vector<PdbSourceColumnEntry> Columns; -}; - -struct HexFormattedString { - std::vector<uint8_t> Bytes; -}; - -struct PdbSourceFileChecksumEntry { - StringRef FileName; - codeview::FileChecksumKind Kind; - HexFormattedString ChecksumBytes; -}; - -struct PdbSourceLineInfo { - uint32_t RelocOffset; - uint32_t RelocSegment; - codeview::LineFlags Flags; - uint32_t CodeSize; - - std::vector<PdbSourceLineBlock> Blocks; -}; - -struct PdbInlineeSite { - codeview::TypeIndex Inlinee; - StringRef FileName; - uint32_t SourceLineNum; - std::vector<StringRef> ExtraFiles; -}; - -struct PdbInlineeInfo { - bool HasExtraFiles; - std::vector<PdbInlineeSite> Sites; -}; - -struct PdbSourceFileInfo { - std::vector<PdbSourceFileChecksumEntry> FileChecksums; - std::vector<PdbSourceLineInfo> LineFragments; - std::vector<PdbInlineeInfo> Inlinees; + std::vector<CodeViewYAML::SymbolRecord> Symbols; }; struct PdbDbiModuleInfo { StringRef Obj; StringRef Mod; std::vector<StringRef> SourceFiles; - Optional<PdbSourceFileInfo> FileLineInfo; + 
Optional<CodeViewYAML::SourceFileInfo> FileLineInfo; Optional<PdbModiStream> Modi; }; @@ -140,17 +84,9 @@ struct PdbDbiStream { std::vector<PdbDbiModuleInfo> ModInfos; }; -struct PdbTpiRecord { - codeview::CVType Record; -}; - -struct PdbTpiFieldListRecord { - codeview::CVMemberRecord Record; -}; - struct PdbTpiStream { PdbRaw_TpiVer Version = PdbTpiV80; - std::vector<PdbTpiRecord> Records; + std::vector<CodeViewYAML::LeafRecord> Records; }; struct PdbObject { @@ -172,126 +108,15 @@ struct PdbObject { } } -namespace llvm { -namespace yaml { - -template <> struct MappingTraits<pdb::yaml::PdbObject> { - static void mapping(IO &IO, pdb::yaml::PdbObject &Obj); -}; - -template <> struct MappingTraits<pdb::yaml::MSFHeaders> { - static void mapping(IO &IO, pdb::yaml::MSFHeaders &Obj); -}; - -template <> struct MappingTraits<msf::SuperBlock> { - static void mapping(IO &IO, msf::SuperBlock &SB); -}; - -template <> struct MappingTraits<pdb::yaml::StreamBlockList> { - static void mapping(IO &IO, pdb::yaml::StreamBlockList &SB); -}; - -template <> struct MappingTraits<pdb::yaml::PdbInfoStream> { - static void mapping(IO &IO, pdb::yaml::PdbInfoStream &Obj); -}; - -template <> struct MappingContextTraits<pdb::yaml::PdbDbiStream, pdb::yaml::SerializationContext> { - static void mapping(IO &IO, pdb::yaml::PdbDbiStream &Obj, pdb::yaml::SerializationContext &Context); -}; - -template <> -struct MappingContextTraits<pdb::yaml::PdbTpiStream, pdb::yaml::SerializationContext> { - static void mapping(IO &IO, pdb::yaml::PdbTpiStream &Obj, - pdb::yaml::SerializationContext &Context); -}; - -template <> struct MappingTraits<pdb::yaml::NamedStreamMapping> { - static void mapping(IO &IO, pdb::yaml::NamedStreamMapping &Obj); -}; - -template <> struct MappingContextTraits<pdb::yaml::PdbSymbolRecord, pdb::yaml::SerializationContext> { - static void mapping(IO &IO, pdb::yaml::PdbSymbolRecord &Obj, pdb::yaml::SerializationContext &Context); -}; - -template <> struct MappingContextTraits<pdb::yaml::PdbModiStream, pdb::yaml::SerializationContext> { - static void mapping(IO &IO, pdb::yaml::PdbModiStream &Obj, pdb::yaml::SerializationContext &Context); -}; - -template <> struct MappingContextTraits<pdb::yaml::PdbDbiModuleInfo, pdb::yaml::SerializationContext> { - static void mapping(IO &IO, pdb::yaml::PdbDbiModuleInfo &Obj, pdb::yaml::SerializationContext &Context); -}; - -template <> -struct MappingContextTraits<pdb::yaml::PdbSourceLineEntry, - pdb::yaml::SerializationContext> { - static void mapping(IO &IO, pdb::yaml::PdbSourceLineEntry &Obj, - pdb::yaml::SerializationContext &Context); -}; - -template <> -struct MappingContextTraits<pdb::yaml::PdbSourceColumnEntry, - pdb::yaml::SerializationContext> { - static void mapping(IO &IO, pdb::yaml::PdbSourceColumnEntry &Obj, - pdb::yaml::SerializationContext &Context); -}; - -template <> -struct MappingContextTraits<pdb::yaml::PdbSourceLineBlock, - pdb::yaml::SerializationContext> { - static void mapping(IO &IO, pdb::yaml::PdbSourceLineBlock &Obj, - pdb::yaml::SerializationContext &Context); -}; - -template <> -struct MappingContextTraits<pdb::yaml::PdbSourceFileChecksumEntry, - pdb::yaml::SerializationContext> { - static void mapping(IO &IO, pdb::yaml::PdbSourceFileChecksumEntry &Obj, - pdb::yaml::SerializationContext &Context); -}; - -template <> struct ScalarTraits<pdb::yaml::HexFormattedString> { - static void output(const pdb::yaml::HexFormattedString &Value, void *ctx, - llvm::raw_ostream &Out); - static StringRef input(StringRef Scalar, void *ctxt, - 
pdb::yaml::HexFormattedString &Value); - static bool mustQuote(StringRef) { return false; } -}; - -template <> -struct MappingContextTraits<pdb::yaml::PdbSourceLineInfo, - pdb::yaml::SerializationContext> { - static void mapping(IO &IO, pdb::yaml::PdbSourceLineInfo &Obj, - pdb::yaml::SerializationContext &Context); -}; - -template <> -struct MappingContextTraits<pdb::yaml::PdbSourceFileInfo, - pdb::yaml::SerializationContext> { - static void mapping(IO &IO, pdb::yaml::PdbSourceFileInfo &Obj, - pdb::yaml::SerializationContext &Context); -}; - -template <> -struct MappingContextTraits<pdb::yaml::PdbInlineeInfo, - pdb::yaml::SerializationContext> { - static void mapping(IO &IO, pdb::yaml::PdbInlineeInfo &Obj, - pdb::yaml::SerializationContext &Context); -}; - -template <> -struct MappingContextTraits<pdb::yaml::PdbInlineeSite, - pdb::yaml::SerializationContext> { - static void mapping(IO &IO, pdb::yaml::PdbInlineeSite &Obj, - pdb::yaml::SerializationContext &Context); -}; - -template <> -struct MappingContextTraits<pdb::yaml::PdbTpiRecord, - pdb::yaml::SerializationContext> { - static void mapping(IO &IO, pdb::yaml::PdbTpiRecord &Obj, - pdb::yaml::SerializationContext &Context); -}; -} -} +LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbObject) +LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::MSFHeaders) +LLVM_YAML_DECLARE_MAPPING_TRAITS(msf::SuperBlock) +LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::StreamBlockList) +LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbInfoStream) +LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbDbiStream) +LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbTpiStream) +LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::NamedStreamMapping) +LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbModiStream) +LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbDbiModuleInfo) #endif // LLVM_TOOLS_LLVMPDBDUMP_PDBYAML_H diff --git a/tools/llvm-pdbdump/YAMLOutputStyle.cpp b/tools/llvm-pdbdump/YAMLOutputStyle.cpp index 7aa68dee7d47..18839a7679d3 100644 --- a/tools/llvm-pdbdump/YAMLOutputStyle.cpp +++ b/tools/llvm-pdbdump/YAMLOutputStyle.cpp @@ -104,12 +104,12 @@ Error YAMLOutputStyle::dump() { namespace { class C13YamlVisitor : public C13DebugFragmentVisitor { public: - C13YamlVisitor(llvm::pdb::yaml::PdbSourceFileInfo &Info, PDBFile &F) + C13YamlVisitor(CodeViewYAML::SourceFileInfo &Info, PDBFile &F) : C13DebugFragmentVisitor(F), Info(Info) {} Error handleFileChecksums() override { for (const auto &C : *Checksums) { - llvm::pdb::yaml::PdbSourceFileChecksumEntry Entry; + CodeViewYAML::SourceFileChecksumEntry Entry; if (auto Result = getNameFromStringTable(C.FileNameOffset)) Entry.FileName = *Result; else @@ -143,7 +143,7 @@ public: return Result.takeError(); for (const auto &N : L.LineNumbers) { - llvm::pdb::yaml::PdbSourceLineEntry Line; + CodeViewYAML::SourceLineEntry Line; Line.Offset = N.Offset; codeview::LineInfo LI(N.Flags); Line.LineStart = LI.getStartLine(); @@ -154,7 +154,7 @@ public: if (LF.hasColumnInfo()) { for (const auto &C : L.Columns) { - llvm::pdb::yaml::PdbSourceColumnEntry Column; + CodeViewYAML::SourceColumnEntry Column; Column.StartColumn = C.StartColumn; Column.EndColumn = C.EndColumn; Block.Columns.push_back(Column); @@ -179,7 +179,7 @@ public: else return Result.takeError(); - Site.Inlinee = IL.Header->Inlinee; + Site.Inlinee = IL.Header->Inlinee.getIndex(); Site.SourceLineNum = IL.Header->SourceLineNum; if (ILF.hasExtraFiles()) { for (const auto &EF : IL.ExtraFiles) { @@ -195,17 +195,16 @@ public: } private: - - llvm::pdb::yaml::PdbSourceFileInfo &Info; + 
CodeViewYAML::SourceFileInfo &Info; }; } -Expected<Optional<llvm::pdb::yaml::PdbSourceFileInfo>> +Expected<Optional<CodeViewYAML::SourceFileInfo>> YAMLOutputStyle::getFileLineInfo(const pdb::ModuleDebugStreamRef &ModS) { if (!ModS.hasLineInfo()) return None; - yaml::PdbSourceFileInfo Info; + CodeViewYAML::SourceFileInfo Info; C13YamlVisitor Visitor(Info, File); if (auto EC = codeview::visitDebugSubsections(ModS.linesAndChecksums(), Visitor)) @@ -357,8 +356,11 @@ Error YAMLOutputStyle::dumpDbiStream() { DMI.Modi->Signature = ModS.signature(); bool HadError = false; for (auto &Sym : ModS.symbols(&HadError)) { - pdb::yaml::PdbSymbolRecord Record{Sym}; - DMI.Modi->Symbols.push_back(Record); + auto ES = CodeViewYAML::SymbolRecord::fromCodeViewSymbol(Sym); + if (!ES) + return ES.takeError(); + + DMI.Modi->Symbols.push_back(*ES); } } } @@ -378,13 +380,10 @@ Error YAMLOutputStyle::dumpTpiStream() { Obj.TpiStream.emplace(); Obj.TpiStream->Version = TS.getTpiVersion(); for (auto &Record : TS.types(nullptr)) { - yaml::PdbTpiRecord R; - // It's not necessary to set R.RecordData here. That only exists as a - // way to have the `PdbTpiRecord` structure own the memory that `R.Record` - // references. In the case of reading an existing PDB though, that memory - // is owned by the backing stream. - R.Record = Record; - Obj.TpiStream->Records.push_back(R); + auto ExpectedRecord = CodeViewYAML::LeafRecord::fromCodeViewRecord(Record); + if (!ExpectedRecord) + return ExpectedRecord.takeError(); + Obj.TpiStream->Records.push_back(*ExpectedRecord); } return Error::success(); @@ -402,9 +401,11 @@ Error YAMLOutputStyle::dumpIpiStream() { Obj.IpiStream.emplace(); Obj.IpiStream->Version = IS.getTpiVersion(); for (auto &Record : IS.types(nullptr)) { - yaml::PdbTpiRecord R; - R.Record = Record; - Obj.IpiStream->Records.push_back(R); + auto ExpectedRecord = CodeViewYAML::LeafRecord::fromCodeViewRecord(Record); + if (!ExpectedRecord) + return ExpectedRecord.takeError(); + + Obj.IpiStream->Records.push_back(*ExpectedRecord); } return Error::success(); diff --git a/tools/llvm-pdbdump/YAMLOutputStyle.h b/tools/llvm-pdbdump/YAMLOutputStyle.h index 068312aec450..6e4067c48f88 100644 --- a/tools/llvm-pdbdump/YAMLOutputStyle.h +++ b/tools/llvm-pdbdump/YAMLOutputStyle.h @@ -27,7 +27,7 @@ public: Error dump() override; private: - Expected<Optional<llvm::pdb::yaml::PdbSourceFileInfo>> + Expected<Optional<CodeViewYAML::SourceFileInfo>> getFileLineInfo(const pdb::ModuleDebugStreamRef &ModS); Error dumpStringTable(); diff --git a/tools/llvm-pdbdump/YamlSerializationContext.h b/tools/llvm-pdbdump/YamlSerializationContext.h deleted file mode 100644 index dcf29d249d60..000000000000 --- a/tools/llvm-pdbdump/YamlSerializationContext.h +++ /dev/null @@ -1,39 +0,0 @@ -//===- YamlSerializationContext.h ----------------------------- *- C++ --*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVMPDBDUMP_YAMLSERIALIZATIONCONTEXT_H -#define LLVM_TOOLS_LLVMPDBDUMP_YAMLSERIALIZATIONCONTEXT_H - -#include "PdbYaml.h" -#include "YamlTypeDumper.h" -#include "llvm/Support/Allocator.h" - -namespace llvm { -namespace codeview { -class TypeSerializer; -} -namespace yaml { -class IO; -} - -namespace pdb { -namespace yaml { -struct SerializationContext { - explicit SerializationContext(llvm::yaml::IO &IO, BumpPtrAllocator &Allocator) - : Dumper(IO, *this), Allocator(Allocator) {} - - codeview::yaml::YamlTypeDumperCallbacks Dumper; - BumpPtrAllocator &Allocator; - codeview::TypeSerializer *ActiveSerializer = nullptr; -}; -} -} -} - -#endif
\ No newline at end of file diff --git a/tools/llvm-pdbdump/YamlSymbolDumper.cpp b/tools/llvm-pdbdump/YamlSymbolDumper.cpp deleted file mode 100644 index 431bf404fb04..000000000000 --- a/tools/llvm-pdbdump/YamlSymbolDumper.cpp +++ /dev/null @@ -1,413 +0,0 @@ -//===- YamlSymbolDumper.cpp ----------------------------------- *- C++ --*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "YamlSymbolDumper.h" -#include "PdbYaml.h" -#include "YamlTypeDumper.h" - -#include "llvm/DebugInfo/CodeView/CVSymbolVisitor.h" -#include "llvm/DebugInfo/CodeView/EnumTables.h" -#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" -#include "llvm/DebugInfo/CodeView/SymbolRecord.h" -#include "llvm/DebugInfo/CodeView/SymbolVisitorCallbackPipeline.h" - -using namespace llvm; -using namespace llvm::codeview; -using namespace llvm::codeview::yaml; - -LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(TypeIndex) -LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint64_t) -LLVM_YAML_IS_SEQUENCE_VECTOR(OneMethodRecord) -LLVM_YAML_IS_SEQUENCE_VECTOR(VFTableSlotKind) -LLVM_YAML_IS_SEQUENCE_VECTOR(StringRef) -LLVM_YAML_IS_SEQUENCE_VECTOR(CVType) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::pdb::yaml::PdbTpiFieldListRecord) - -namespace llvm { -namespace yaml { -void ScalarEnumerationTraits<SymbolKind>::enumeration(IO &io, - SymbolKind &Value) { - auto SymbolNames = getSymbolTypeNames(); - for (const auto &E : SymbolNames) - io.enumCase(Value, E.Name.str().c_str(), E.Value); -} - -template <> struct ScalarBitSetTraits<CompileSym2Flags> { - static void bitset(IO &io, CompileSym2Flags &Flags) { - auto FlagNames = getCompileSym2FlagNames(); - for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<CompileSym2Flags>(E.Value)); - } - } -}; - -template <> struct ScalarBitSetTraits<CompileSym3Flags> { - static void bitset(IO &io, CompileSym3Flags &Flags) { - auto FlagNames = getCompileSym3FlagNames(); - for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<CompileSym3Flags>(E.Value)); - } - } -}; - -template <> struct ScalarBitSetTraits<ExportFlags> { - static void bitset(IO &io, ExportFlags &Flags) { - auto FlagNames = getExportSymFlagNames(); - for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<ExportFlags>(E.Value)); - } - } -}; - -template <> struct ScalarBitSetTraits<LocalSymFlags> { - static void bitset(IO &io, LocalSymFlags &Flags) { - auto FlagNames = getLocalFlagNames(); - for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<LocalSymFlags>(E.Value)); - } - } -}; - -template <> struct ScalarBitSetTraits<ProcSymFlags> { - static void bitset(IO &io, ProcSymFlags &Flags) { - auto FlagNames = getProcSymFlagNames(); - for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<ProcSymFlags>(E.Value)); - } - } -}; - -template <> struct ScalarBitSetTraits<FrameProcedureOptions> { - static void bitset(IO &io, FrameProcedureOptions &Flags) { - auto FlagNames = getFrameProcSymFlagNames(); - for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<FrameProcedureOptions>(E.Value)); - } - } -}; - -template <> struct ScalarEnumerationTraits<CPUType> { - static void enumeration(IO &io, CPUType &Cpu) { - auto CpuNames = getCPUTypeNames(); - 
for (const auto &E : CpuNames) { - io.enumCase(Cpu, E.Name.str().c_str(), static_cast<CPUType>(E.Value)); - } - } -}; - -template <> struct ScalarEnumerationTraits<RegisterId> { - static void enumeration(IO &io, RegisterId &Reg) { - auto RegNames = getRegisterNames(); - for (const auto &E : RegNames) { - io.enumCase(Reg, E.Name.str().c_str(), static_cast<RegisterId>(E.Value)); - } - io.enumFallback<Hex16>(Reg); - } -}; - -template <> struct ScalarEnumerationTraits<TrampolineType> { - static void enumeration(IO &io, TrampolineType &Tramp) { - auto TrampNames = getTrampolineNames(); - for (const auto &E : TrampNames) { - io.enumCase(Tramp, E.Name.str().c_str(), - static_cast<TrampolineType>(E.Value)); - } - } -}; - -template <> struct ScalarEnumerationTraits<ThunkOrdinal> { - static void enumeration(IO &io, ThunkOrdinal &Ord) { - auto ThunkNames = getThunkOrdinalNames(); - for (const auto &E : ThunkNames) { - io.enumCase(Ord, E.Name.str().c_str(), - static_cast<ThunkOrdinal>(E.Value)); - } - } -}; - -void MappingTraits<ScopeEndSym>::mapping(IO &IO, ScopeEndSym &Obj) {} - -void MappingTraits<Thunk32Sym>::mapping(IO &IO, Thunk32Sym &Thunk) { - IO.mapRequired("Parent", Thunk.Parent); - IO.mapRequired("End", Thunk.End); - IO.mapRequired("Next", Thunk.Next); - IO.mapRequired("Off", Thunk.Offset); - IO.mapRequired("Seg", Thunk.Segment); - IO.mapRequired("Len", Thunk.Length); - IO.mapRequired("Ordinal", Thunk.Thunk); -} - -void MappingTraits<TrampolineSym>::mapping(IO &IO, TrampolineSym &Tramp) { - IO.mapRequired("Type", Tramp.Type); - IO.mapRequired("Size", Tramp.Size); - IO.mapRequired("ThunkOff", Tramp.ThunkOffset); - IO.mapRequired("TargetOff", Tramp.TargetOffset); - IO.mapRequired("ThunkSection", Tramp.ThunkSection); - IO.mapRequired("TargetSection", Tramp.TargetSection); -} - -void MappingTraits<SectionSym>::mapping(IO &IO, SectionSym &Section) { - IO.mapRequired("SectionNumber", Section.SectionNumber); - IO.mapRequired("Alignment", Section.Alignment); - IO.mapRequired("Rva", Section.Rva); - IO.mapRequired("Length", Section.Length); - IO.mapRequired("Characteristics", Section.Characteristics); - IO.mapRequired("Name", Section.Name); -} - -void MappingTraits<CoffGroupSym>::mapping(IO &IO, CoffGroupSym &CoffGroup) { - IO.mapRequired("Size", CoffGroup.Size); - IO.mapRequired("Characteristics", CoffGroup.Characteristics); - IO.mapRequired("Offset", CoffGroup.Offset); - IO.mapRequired("Segment", CoffGroup.Segment); - IO.mapRequired("Name", CoffGroup.Name); -} - -void MappingTraits<ExportSym>::mapping(IO &IO, ExportSym &Export) { - IO.mapRequired("Ordinal", Export.Ordinal); - IO.mapRequired("Flags", Export.Flags); - IO.mapRequired("Name", Export.Name); -} - -void MappingTraits<ProcSym>::mapping(IO &IO, ProcSym &Proc) { - // TODO: Print the linkage name - - IO.mapRequired("PtrParent", Proc.Parent); - IO.mapRequired("PtrEnd", Proc.End); - IO.mapRequired("PtrNext", Proc.Next); - IO.mapRequired("CodeSize", Proc.CodeSize); - IO.mapRequired("DbgStart", Proc.DbgStart); - IO.mapRequired("DbgEnd", Proc.DbgEnd); - IO.mapRequired("FunctionType", Proc.FunctionType); - IO.mapRequired("Segment", Proc.Segment); - IO.mapRequired("Flags", Proc.Flags); - IO.mapRequired("DisplayName", Proc.Name); -} - -void MappingTraits<RegisterSym>::mapping(IO &IO, RegisterSym &Register) { - IO.mapRequired("Type", Register.Index); - IO.mapRequired("Seg", Register.Register); - IO.mapRequired("Name", Register.Name); -} - -void MappingTraits<PublicSym32>::mapping(IO &IO, PublicSym32 &Public) { - IO.mapRequired("Type", Public.Index); - 
IO.mapRequired("Seg", Public.Segment); - IO.mapRequired("Off", Public.Offset); - IO.mapRequired("Name", Public.Name); -} - -void MappingTraits<ProcRefSym>::mapping(IO &IO, ProcRefSym &ProcRef) { - IO.mapRequired("SumName", ProcRef.SumName); - IO.mapRequired("SymOffset", ProcRef.SymOffset); - IO.mapRequired("Mod", ProcRef.Module); - IO.mapRequired("Name", ProcRef.Name); -} - -void MappingTraits<EnvBlockSym>::mapping(IO &IO, EnvBlockSym &EnvBlock) { - IO.mapRequired("Entries", EnvBlock.Fields); -} - -void MappingTraits<InlineSiteSym>::mapping(IO &IO, InlineSiteSym &InlineSite) { - IO.mapRequired("PtrParent", InlineSite.Parent); - IO.mapRequired("PtrEnd", InlineSite.End); - IO.mapRequired("Inlinee", InlineSite.Inlinee); - // TODO: The binary annotations -} - -void MappingTraits<LocalSym>::mapping(IO &IO, LocalSym &Local) { - IO.mapRequired("Type", Local.Type); - IO.mapRequired("Flags", Local.Flags); - IO.mapRequired("VarName", Local.Name); -} - -void MappingTraits<DefRangeSym>::mapping(IO &IO, DefRangeSym &Obj) { - // TODO: Print the subfields -} - -void MappingTraits<DefRangeSubfieldSym>::mapping(IO &IO, - DefRangeSubfieldSym &Obj) { - // TODO: Print the subfields -} - -void MappingTraits<DefRangeRegisterSym>::mapping(IO &IO, - DefRangeRegisterSym &Obj) { - // TODO: Print the subfields -} - -void MappingTraits<DefRangeFramePointerRelSym>::mapping( - IO &IO, DefRangeFramePointerRelSym &Obj) { - // TODO: Print the subfields -} - -void MappingTraits<DefRangeSubfieldRegisterSym>::mapping( - IO &IO, DefRangeSubfieldRegisterSym &Obj) { - // TODO: Print the subfields -} - -void MappingTraits<DefRangeFramePointerRelFullScopeSym>::mapping( - IO &IO, DefRangeFramePointerRelFullScopeSym &Obj) { - // TODO: Print the subfields -} - -void MappingTraits<DefRangeRegisterRelSym>::mapping( - IO &IO, DefRangeRegisterRelSym &Obj) { - // TODO: Print the subfields -} - -void MappingTraits<BlockSym>::mapping(IO &IO, BlockSym &Block) { - // TODO: Print the linkage name - IO.mapRequired("PtrParent", Block.Parent); - IO.mapRequired("PtrEnd", Block.End); - IO.mapRequired("CodeSize", Block.CodeSize); - IO.mapRequired("Segment", Block.Segment); - IO.mapRequired("BlockName", Block.Name); -} - -void MappingTraits<LabelSym>::mapping(IO &IO, LabelSym &Label) { - // TODO: Print the linkage name - IO.mapRequired("Segment", Label.Segment); - IO.mapRequired("Flags", Label.Flags); - IO.mapRequired("Flags", Label.Flags); - IO.mapRequired("DisplayName", Label.Name); -} - -void MappingTraits<ObjNameSym>::mapping(IO &IO, ObjNameSym &ObjName) { - IO.mapRequired("Signature", ObjName.Signature); - IO.mapRequired("ObjectName", ObjName.Name); -} - -void MappingTraits<Compile2Sym>::mapping(IO &IO, Compile2Sym &Compile2) { - IO.mapRequired("Flags", Compile2.Flags); - IO.mapRequired("Machine", Compile2.Machine); - IO.mapRequired("FrontendMajor", Compile2.VersionFrontendMajor); - IO.mapRequired("FrontendMinor", Compile2.VersionFrontendMinor); - IO.mapRequired("FrontendBuild", Compile2.VersionFrontendBuild); - IO.mapRequired("BackendMajor", Compile2.VersionBackendMajor); - IO.mapRequired("BackendMinor", Compile2.VersionBackendMinor); - IO.mapRequired("BackendBuild", Compile2.VersionBackendBuild); - IO.mapRequired("Version", Compile2.Version); -} - -void MappingTraits<Compile3Sym>::mapping(IO &IO, Compile3Sym &Compile3) { - IO.mapRequired("Flags", Compile3.Flags); - IO.mapRequired("Machine", Compile3.Machine); - IO.mapRequired("FrontendMajor", Compile3.VersionFrontendMajor); - IO.mapRequired("FrontendMinor", Compile3.VersionFrontendMinor); - 
IO.mapRequired("FrontendBuild", Compile3.VersionFrontendBuild); - IO.mapRequired("FrontendQFE", Compile3.VersionFrontendQFE); - IO.mapRequired("BackendMajor", Compile3.VersionBackendMajor); - IO.mapRequired("BackendMinor", Compile3.VersionBackendMinor); - IO.mapRequired("BackendBuild", Compile3.VersionBackendBuild); - IO.mapRequired("BackendQFE", Compile3.VersionBackendQFE); - IO.mapRequired("Version", Compile3.Version); -} - -void MappingTraits<FrameProcSym>::mapping(IO &IO, FrameProcSym &FrameProc) { - IO.mapRequired("TotalFrameBytes", FrameProc.TotalFrameBytes); - IO.mapRequired("PaddingFrameBytes", FrameProc.PaddingFrameBytes); - IO.mapRequired("OffsetToPadding", FrameProc.OffsetToPadding); - IO.mapRequired("BytesOfCalleeSavedRegisters", - FrameProc.BytesOfCalleeSavedRegisters); - IO.mapRequired("OffsetOfExceptionHandler", - FrameProc.OffsetOfExceptionHandler); - IO.mapRequired("SectionIdOfExceptionHandler", - FrameProc.SectionIdOfExceptionHandler); - IO.mapRequired("Flags", FrameProc.Flags); -} - -void MappingTraits<CallSiteInfoSym>::mapping(IO &IO, - CallSiteInfoSym &CallSiteInfo) { - // TODO: Map Linkage Name - IO.mapRequired("Segment", CallSiteInfo.Segment); - IO.mapRequired("Type", CallSiteInfo.Type); -} - -void MappingTraits<FileStaticSym>::mapping(IO &IO, FileStaticSym &FileStatic) { - IO.mapRequired("Index", FileStatic.Index); - IO.mapRequired("ModFilenameOffset", FileStatic.ModFilenameOffset); - IO.mapRequired("Flags", FileStatic.Flags); - IO.mapRequired("Name", FileStatic.Name); -} - -void MappingTraits<HeapAllocationSiteSym>::mapping( - IO &IO, HeapAllocationSiteSym &HeapAllocSite) { - // TODO: Map Linkage Name - IO.mapRequired("Segment", HeapAllocSite.Segment); - IO.mapRequired("CallInstructionSize", HeapAllocSite.CallInstructionSize); - IO.mapRequired("Type", HeapAllocSite.Type); -} - -void MappingTraits<FrameCookieSym>::mapping(IO &IO, - FrameCookieSym &FrameCookie) { - // TODO: Map Linkage Name - IO.mapRequired("Register", FrameCookie.Register); - IO.mapRequired("CookieKind", FrameCookie.CookieKind); - IO.mapRequired("Flags", FrameCookie.Flags); -} - -void MappingTraits<CallerSym>::mapping(IO &IO, CallerSym &Caller) { - // TODO: Correctly handle the ArrayRef in here. 
- std::vector<TypeIndex> Indices(Caller.Indices); - IO.mapRequired("FuncID", Indices); -} - -void MappingTraits<UDTSym>::mapping(IO &IO, UDTSym &UDT) { - IO.mapRequired("Type", UDT.Type); - IO.mapRequired("UDTName", UDT.Name); -} - -void MappingTraits<BuildInfoSym>::mapping(IO &IO, BuildInfoSym &BuildInfo) { - IO.mapRequired("BuildId", BuildInfo.BuildId); -} - -void MappingTraits<BPRelativeSym>::mapping(IO &IO, BPRelativeSym &BPRel) { - IO.mapRequired("Offset", BPRel.Offset); - IO.mapRequired("Type", BPRel.Type); - IO.mapRequired("VarName", BPRel.Name); -} - -void MappingTraits<RegRelativeSym>::mapping(IO &IO, RegRelativeSym &RegRel) { - IO.mapRequired("Offset", RegRel.Offset); - IO.mapRequired("Type", RegRel.Type); - IO.mapRequired("Register", RegRel.Register); - IO.mapRequired("VarName", RegRel.Name); -} - -void MappingTraits<ConstantSym>::mapping(IO &IO, ConstantSym &Constant) { - IO.mapRequired("Type", Constant.Type); - IO.mapRequired("Value", Constant.Value); - IO.mapRequired("Name", Constant.Name); -} - -void MappingTraits<DataSym>::mapping(IO &IO, DataSym &Data) { - // TODO: Map linkage name - IO.mapRequired("Type", Data.Type); - IO.mapRequired("DisplayName", Data.Name); -} - -void MappingTraits<ThreadLocalDataSym>::mapping(IO &IO, - ThreadLocalDataSym &Data) { - // TODO: Map linkage name - IO.mapRequired("Type", Data.Type); - IO.mapRequired("DisplayName", Data.Name); -} -} -} - -Error llvm::codeview::yaml::YamlSymbolDumper::visitSymbolBegin(CVSymbol &CVR) { - YamlIO.mapRequired("Kind", CVR.Type); - return Error::success(); -} diff --git a/tools/llvm-pdbdump/YamlSymbolDumper.h b/tools/llvm-pdbdump/YamlSymbolDumper.h deleted file mode 100644 index 61e63f96719a..000000000000 --- a/tools/llvm-pdbdump/YamlSymbolDumper.h +++ /dev/null @@ -1,66 +0,0 @@ -//===- YamlSymbolDumper.h ------------------------------------- *- C++ --*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
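All of the symbol-record mappings deleted above follow LLVM's YAML I/O pattern: a MappingTraits<T> specialization names each field once, and the same table drives both serialization and parsing. A minimal, self-contained sketch of that pattern, using a hypothetical Point struct rather than any CodeView type:

    #include "llvm/Support/YAMLTraits.h"
    #include "llvm/Support/raw_ostream.h"

    struct Point {
      int X;
      int Y;
    };

    namespace llvm {
    namespace yaml {
    template <> struct MappingTraits<Point> {
      static void mapping(IO &io, Point &P) {
        // Each key is named once; the same table serves output and input.
        io.mapRequired("x", P.X);
        io.mapRequired("y", P.Y);
      }
    };
    } // namespace yaml
    } // namespace llvm

    int main() {
      Point P = {1, 2};
      llvm::yaml::Output Out(llvm::outs());
      Out << P; // emits "---\nx: 1\ny: 2\n...\n"
    }

llvm::yaml::Input applies the same traits in reverse, which is why these dumpers need no hand-written parser.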
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVMPDBDUMP_YAMLSYMBOLDUMPER_H -#define LLVM_TOOLS_LLVMPDBDUMP_YAMLSYMBOLDUMPER_H - -#include "llvm/DebugInfo/CodeView/CodeView.h" -#include "llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h" -#include "llvm/Support/YAMLTraits.h" - -namespace llvm { -namespace pdb { -namespace yaml { -struct SerializationContext; -} -} -namespace codeview { -namespace yaml { -class YamlSymbolDumper : public SymbolVisitorCallbacks { -public: - YamlSymbolDumper(llvm::yaml::IO &IO) : YamlIO(IO) {} - - virtual Error visitSymbolBegin(CVSymbol &Record) override; - -#define SYMBOL_RECORD(EnumName, EnumVal, Name) \ - Error visitKnownRecord(CVSymbol &CVR, Name &Record) override { \ - visitKnownRecordImpl(#Name, CVR, Record); \ - return Error::success(); \ - } -#define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/CVSymbolTypes.def" - -private: - template <typename T> - void visitKnownRecordImpl(const char *Name, CVSymbol &Type, T &Record) { - YamlIO.mapRequired(Name, Record); - } - - llvm::yaml::IO &YamlIO; -}; -} -} -} - -namespace llvm { -namespace yaml { -template <> struct ScalarEnumerationTraits<codeview::SymbolKind> { - static void enumeration(IO &io, codeview::SymbolKind &Value); -}; - -#define SYMBOL_RECORD(EnumName, EnumVal, Name) \ - template <> struct MappingTraits<codeview::Name> { \ - static void mapping(IO &IO, codeview::Name &Obj); \ - }; -#define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/CVSymbolTypes.def" -} -} - -#endif diff --git a/tools/llvm-pdbdump/YamlTypeDumper.cpp b/tools/llvm-pdbdump/YamlTypeDumper.cpp deleted file mode 100644 index beb700720954..000000000000 --- a/tools/llvm-pdbdump/YamlTypeDumper.cpp +++ /dev/null @@ -1,589 +0,0 @@ -//===- YamlTypeDumper.cpp ------------------------------------- *- C++ --*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
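The YamlSymbolDumper.h just removed stamps out one visitKnownRecord override per entry of CVSymbolTypes.def through the SYMBOL_RECORD macro, a standard .def/X-macro technique. A self-contained illustration of the mechanism, with a hypothetical color list standing in for the real .def file:

    #include <cstdio>

    // The list lives in one place; each expansion site redefines the macro.
    #define COLOR_LIST \
      COLOR(Red, 0)    \
      COLOR(Green, 1)  \
      COLOR(Blue, 2)

    enum Color {
    #define COLOR(Name, Val) Name = Val,
      COLOR_LIST
    #undef COLOR
    };

    const char *colorName(Color C) {
      switch (C) {
    #define COLOR(Name, Val) case Name: return #Name;
        COLOR_LIST
    #undef COLOR
      }
      return "unknown";
    }

    int main() { std::printf("%s\n", colorName(Green)); } // prints "Green"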
-// -//===----------------------------------------------------------------------===// - -#include "YamlTypeDumper.h" -#include "PdbYaml.h" -#include "YamlSerializationContext.h" - -#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" -#include "llvm/DebugInfo/CodeView/EnumTables.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" -#include "llvm/DebugInfo/CodeView/TypeSerializer.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h" -#include "llvm/DebugInfo/PDB/Native/TpiHashing.h" - -using namespace llvm; -using namespace llvm::codeview; -using namespace llvm::codeview::yaml; - -LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(TypeIndex) -LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint64_t) -LLVM_YAML_IS_SEQUENCE_VECTOR(OneMethodRecord) -LLVM_YAML_IS_SEQUENCE_VECTOR(VFTableSlotKind) -LLVM_YAML_IS_SEQUENCE_VECTOR(StringRef) -LLVM_YAML_IS_SEQUENCE_VECTOR(CVType) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::pdb::yaml::PdbTpiFieldListRecord) - -namespace { -struct FieldListRecordSplitter : public TypeVisitorCallbacks { -public: - explicit FieldListRecordSplitter( - std::vector<llvm::pdb::yaml::PdbTpiFieldListRecord> &Records) - : Records(Records) {} - -#define TYPE_RECORD(EnumName, EnumVal, Name) -#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#define MEMBER_RECORD(EnumName, EnumVal, Name) \ - Error visitKnownMember(CVMemberRecord &CVT, Name##Record &Record) override { \ - visitKnownMemberImpl(CVT); \ - return Error::success(); \ - } -#include "llvm/DebugInfo/CodeView/TypeRecords.def" - -private: - void visitKnownMemberImpl(CVMemberRecord &CVT) { - llvm::pdb::yaml::PdbTpiFieldListRecord R; - R.Record = CVT; - Records.push_back(std::move(R)); - } - - std::vector<llvm::pdb::yaml::PdbTpiFieldListRecord> &Records; -}; -} - -namespace llvm { -namespace yaml { -template <> struct ScalarEnumerationTraits<PointerToMemberRepresentation> { - static void enumeration(IO &IO, PointerToMemberRepresentation &Value) { - IO.enumCase(Value, "Unknown", PointerToMemberRepresentation::Unknown); - IO.enumCase(Value, "SingleInheritanceData", - PointerToMemberRepresentation::SingleInheritanceData); - IO.enumCase(Value, "MultipleInheritanceData", - PointerToMemberRepresentation::MultipleInheritanceData); - IO.enumCase(Value, "VirtualInheritanceData", - PointerToMemberRepresentation::VirtualInheritanceData); - IO.enumCase(Value, "GeneralData", - PointerToMemberRepresentation::GeneralData); - IO.enumCase(Value, "SingleInheritanceFunction", - PointerToMemberRepresentation::SingleInheritanceFunction); - IO.enumCase(Value, "MultipleInheritanceFunction", - PointerToMemberRepresentation::MultipleInheritanceFunction); - IO.enumCase(Value, "VirtualInheritanceFunction", - PointerToMemberRepresentation::VirtualInheritanceFunction); - IO.enumCase(Value, "GeneralFunction", - PointerToMemberRepresentation::GeneralFunction); - } -}; - -template <> struct ScalarEnumerationTraits<VFTableSlotKind> { - static void enumeration(IO &IO, VFTableSlotKind &Kind) { - IO.enumCase(Kind, "Near16", VFTableSlotKind::Near16); - IO.enumCase(Kind, "Far16", VFTableSlotKind::Far16); - IO.enumCase(Kind, "This", VFTableSlotKind::This); - IO.enumCase(Kind, "Outer", VFTableSlotKind::Outer); - IO.enumCase(Kind, "Meta", VFTableSlotKind::Meta); - IO.enumCase(Kind, "Near", VFTableSlotKind::Near); - IO.enumCase(Kind, "Far", VFTableSlotKind::Far); - } -}; - -template <> struct ScalarEnumerationTraits<CallingConvention> { - static void enumeration(IO &IO, CallingConvention &Value) { - IO.enumCase(Value, 
"NearC", CallingConvention::NearC); - IO.enumCase(Value, "FarC", CallingConvention::FarC); - IO.enumCase(Value, "NearPascal", CallingConvention::NearPascal); - IO.enumCase(Value, "FarPascal", CallingConvention::FarPascal); - IO.enumCase(Value, "NearFast", CallingConvention::NearFast); - IO.enumCase(Value, "FarFast", CallingConvention::FarFast); - IO.enumCase(Value, "NearStdCall", CallingConvention::NearStdCall); - IO.enumCase(Value, "FarStdCall", CallingConvention::FarStdCall); - IO.enumCase(Value, "NearSysCall", CallingConvention::NearSysCall); - IO.enumCase(Value, "FarSysCall", CallingConvention::FarSysCall); - IO.enumCase(Value, "ThisCall", CallingConvention::ThisCall); - IO.enumCase(Value, "MipsCall", CallingConvention::MipsCall); - IO.enumCase(Value, "Generic", CallingConvention::Generic); - IO.enumCase(Value, "AlphaCall", CallingConvention::AlphaCall); - IO.enumCase(Value, "PpcCall", CallingConvention::PpcCall); - IO.enumCase(Value, "SHCall", CallingConvention::SHCall); - IO.enumCase(Value, "ArmCall", CallingConvention::ArmCall); - IO.enumCase(Value, "AM33Call", CallingConvention::AM33Call); - IO.enumCase(Value, "TriCall", CallingConvention::TriCall); - IO.enumCase(Value, "SH5Call", CallingConvention::SH5Call); - IO.enumCase(Value, "M32RCall", CallingConvention::M32RCall); - IO.enumCase(Value, "ClrCall", CallingConvention::ClrCall); - IO.enumCase(Value, "Inline", CallingConvention::Inline); - IO.enumCase(Value, "NearVector", CallingConvention::NearVector); - } -}; - -template <> struct ScalarEnumerationTraits<PointerKind> { - static void enumeration(IO &IO, PointerKind &Kind) { - IO.enumCase(Kind, "Near16", PointerKind::Near16); - IO.enumCase(Kind, "Far16", PointerKind::Far16); - IO.enumCase(Kind, "Huge16", PointerKind::Huge16); - IO.enumCase(Kind, "BasedOnSegment", PointerKind::BasedOnSegment); - IO.enumCase(Kind, "BasedOnValue", PointerKind::BasedOnValue); - IO.enumCase(Kind, "BasedOnSegmentValue", PointerKind::BasedOnSegmentValue); - IO.enumCase(Kind, "BasedOnAddress", PointerKind::BasedOnAddress); - IO.enumCase(Kind, "BasedOnSegmentAddress", - PointerKind::BasedOnSegmentAddress); - IO.enumCase(Kind, "BasedOnType", PointerKind::BasedOnType); - IO.enumCase(Kind, "BasedOnSelf", PointerKind::BasedOnSelf); - IO.enumCase(Kind, "Near32", PointerKind::Near32); - IO.enumCase(Kind, "Far32", PointerKind::Far32); - IO.enumCase(Kind, "Near64", PointerKind::Near64); - } -}; - -template <> struct ScalarEnumerationTraits<PointerMode> { - static void enumeration(IO &IO, PointerMode &Mode) { - IO.enumCase(Mode, "Pointer", PointerMode::Pointer); - IO.enumCase(Mode, "LValueReference", PointerMode::LValueReference); - IO.enumCase(Mode, "PointerToDataMember", PointerMode::PointerToDataMember); - IO.enumCase(Mode, "PointerToMemberFunction", - PointerMode::PointerToMemberFunction); - IO.enumCase(Mode, "RValueReference", PointerMode::RValueReference); - } -}; - -template <> struct ScalarEnumerationTraits<HfaKind> { - static void enumeration(IO &IO, HfaKind &Value) { - IO.enumCase(Value, "None", HfaKind::None); - IO.enumCase(Value, "Float", HfaKind::Float); - IO.enumCase(Value, "Double", HfaKind::Double); - IO.enumCase(Value, "Other", HfaKind::Other); - } -}; - -template <> struct ScalarEnumerationTraits<MemberAccess> { - static void enumeration(IO &IO, MemberAccess &Access) { - IO.enumCase(Access, "None", MemberAccess::None); - IO.enumCase(Access, "Private", MemberAccess::Private); - IO.enumCase(Access, "Protected", MemberAccess::Protected); - IO.enumCase(Access, "Public", MemberAccess::Public); - } -}; 
- -template <> struct ScalarEnumerationTraits<MethodKind> { - static void enumeration(IO &IO, MethodKind &Kind) { - IO.enumCase(Kind, "Vanilla", MethodKind::Vanilla); - IO.enumCase(Kind, "Virtual", MethodKind::Virtual); - IO.enumCase(Kind, "Static", MethodKind::Static); - IO.enumCase(Kind, "Friend", MethodKind::Friend); - IO.enumCase(Kind, "IntroducingVirtual", MethodKind::IntroducingVirtual); - IO.enumCase(Kind, "PureVirtual", MethodKind::PureVirtual); - IO.enumCase(Kind, "PureIntroducingVirtual", - MethodKind::PureIntroducingVirtual); - } -}; - -template <> struct ScalarEnumerationTraits<WindowsRTClassKind> { - static void enumeration(IO &IO, WindowsRTClassKind &Value) { - IO.enumCase(Value, "None", WindowsRTClassKind::None); - IO.enumCase(Value, "Ref", WindowsRTClassKind::RefClass); - IO.enumCase(Value, "Value", WindowsRTClassKind::ValueClass); - IO.enumCase(Value, "Interface", WindowsRTClassKind::Interface); - } -}; - -template <> struct ScalarEnumerationTraits<LabelType> { - static void enumeration(IO &IO, LabelType &Value) { - IO.enumCase(Value, "Near", LabelType::Near); - IO.enumCase(Value, "Far", LabelType::Far); - } -}; - -template <> struct ScalarBitSetTraits<PointerOptions> { - static void bitset(IO &IO, PointerOptions &Options) { - IO.bitSetCase(Options, "None", PointerOptions::None); - IO.bitSetCase(Options, "Flat32", PointerOptions::Flat32); - IO.bitSetCase(Options, "Volatile", PointerOptions::Volatile); - IO.bitSetCase(Options, "Const", PointerOptions::Const); - IO.bitSetCase(Options, "Unaligned", PointerOptions::Unaligned); - IO.bitSetCase(Options, "Restrict", PointerOptions::Restrict); - IO.bitSetCase(Options, "WinRTSmartPointer", - PointerOptions::WinRTSmartPointer); - } -}; - -template <> struct ScalarBitSetTraits<ModifierOptions> { - static void bitset(IO &IO, ModifierOptions &Options) { - IO.bitSetCase(Options, "None", ModifierOptions::None); - IO.bitSetCase(Options, "Const", ModifierOptions::Const); - IO.bitSetCase(Options, "Volatile", ModifierOptions::Volatile); - IO.bitSetCase(Options, "Unaligned", ModifierOptions::Unaligned); - } -}; - -template <> struct ScalarBitSetTraits<FunctionOptions> { - static void bitset(IO &IO, FunctionOptions &Options) { - IO.bitSetCase(Options, "None", FunctionOptions::None); - IO.bitSetCase(Options, "CxxReturnUdt", FunctionOptions::CxxReturnUdt); - IO.bitSetCase(Options, "Constructor", FunctionOptions::Constructor); - IO.bitSetCase(Options, "ConstructorWithVirtualBases", - FunctionOptions::ConstructorWithVirtualBases); - } -}; - -template <> struct ScalarBitSetTraits<ClassOptions> { - static void bitset(IO &IO, ClassOptions &Options) { - IO.bitSetCase(Options, "None", ClassOptions::None); - IO.bitSetCase(Options, "HasConstructorOrDestructor", - ClassOptions::HasConstructorOrDestructor); - IO.bitSetCase(Options, "HasOverloadedOperator", - ClassOptions::HasOverloadedOperator); - IO.bitSetCase(Options, "Nested", ClassOptions::Nested); - IO.bitSetCase(Options, "ContainsNestedClass", - ClassOptions::ContainsNestedClass); - IO.bitSetCase(Options, "HasOverloadedAssignmentOperator", - ClassOptions::HasOverloadedAssignmentOperator); - IO.bitSetCase(Options, "HasConversionOperator", - ClassOptions::HasConversionOperator); - IO.bitSetCase(Options, "ForwardReference", ClassOptions::ForwardReference); - IO.bitSetCase(Options, "Scoped", ClassOptions::Scoped); - IO.bitSetCase(Options, "HasUniqueName", ClassOptions::HasUniqueName); - IO.bitSetCase(Options, "Sealed", ClassOptions::Sealed); - IO.bitSetCase(Options, "Intrinsic", ClassOptions::Intrinsic); 
- } -}; - -template <> struct ScalarBitSetTraits<MethodOptions> { - static void bitset(IO &IO, MethodOptions &Options) { - IO.bitSetCase(Options, "None", MethodOptions::None); - IO.bitSetCase(Options, "Pseudo", MethodOptions::Pseudo); - IO.bitSetCase(Options, "NoInherit", MethodOptions::NoInherit); - IO.bitSetCase(Options, "NoConstruct", MethodOptions::NoConstruct); - IO.bitSetCase(Options, "CompilerGenerated", - MethodOptions::CompilerGenerated); - IO.bitSetCase(Options, "Sealed", MethodOptions::Sealed); - } -}; - -void ScalarTraits<APSInt>::output(const APSInt &S, void *, - llvm::raw_ostream &OS) { - S.print(OS, true); -} -StringRef ScalarTraits<APSInt>::input(StringRef Scalar, void *Ctx, APSInt &S) { - S = APSInt(Scalar); - return ""; -} - -bool ScalarTraits<APSInt>::mustQuote(StringRef Scalar) { return false; } - -void MappingContextTraits<CVType, pdb::yaml::SerializationContext>::mapping( - IO &IO, CVType &Record, pdb::yaml::SerializationContext &Context) { - if (IO.outputting()) - consumeError(codeview::visitTypeRecord(Record, Context.Dumper)); -} - -void MappingTraits<StringIdRecord>::mapping(IO &IO, StringIdRecord &String) { - IO.mapRequired("Id", String.Id); - IO.mapRequired("String", String.String); -} - -void MappingTraits<ArgListRecord>::mapping(IO &IO, ArgListRecord &Args) { - IO.mapRequired("ArgIndices", Args.ArgIndices); -} - -void MappingTraits<StringListRecord>::mapping(IO &IO, StringListRecord &Strings) { - IO.mapRequired("StringIndices", Strings.StringIndices); -} - -void MappingTraits<ClassRecord>::mapping(IO &IO, ClassRecord &Class) { - IO.mapRequired("MemberCount", Class.MemberCount); - IO.mapRequired("Options", Class.Options); - IO.mapRequired("FieldList", Class.FieldList); - IO.mapRequired("Name", Class.Name); - IO.mapRequired("UniqueName", Class.UniqueName); - IO.mapRequired("DerivationList", Class.DerivationList); - IO.mapRequired("VTableShape", Class.VTableShape); - IO.mapRequired("Size", Class.Size); -} - -void MappingTraits<UnionRecord>::mapping(IO &IO, UnionRecord &Union) { - IO.mapRequired("MemberCount", Union.MemberCount); - IO.mapRequired("Options", Union.Options); - IO.mapRequired("FieldList", Union.FieldList); - IO.mapRequired("Name", Union.Name); - IO.mapRequired("UniqueName", Union.UniqueName); - IO.mapRequired("Size", Union.Size); -} - -void MappingTraits<EnumRecord>::mapping(IO &IO, EnumRecord &Enum) { - IO.mapRequired("NumEnumerators", Enum.MemberCount); - IO.mapRequired("Options", Enum.Options); - IO.mapRequired("FieldList", Enum.FieldList); - IO.mapRequired("Name", Enum.Name); - IO.mapRequired("UniqueName", Enum.UniqueName); - IO.mapRequired("UnderlyingType", Enum.UnderlyingType); -} - -void MappingTraits<ArrayRecord>::mapping(IO &IO, ArrayRecord &AT) { - IO.mapRequired("ElementType", AT.ElementType); - IO.mapRequired("IndexType", AT.IndexType); - IO.mapRequired("Size", AT.Size); - IO.mapRequired("Name", AT.Name); -} - -void MappingTraits<VFTableRecord>::mapping(IO &IO, VFTableRecord &VFT) { - IO.mapRequired("CompleteClass", VFT.CompleteClass); - IO.mapRequired("OverriddenVFTable", VFT.OverriddenVFTable); - IO.mapRequired("VFPtrOffset", VFT.VFPtrOffset); - IO.mapRequired("MethodNames", VFT.MethodNames); -} - -void MappingTraits<MemberFuncIdRecord>::mapping(IO &IO, - MemberFuncIdRecord &Id) { - IO.mapRequired("ClassType", Id.ClassType); - IO.mapRequired("FunctionType", Id.FunctionType); - IO.mapRequired("Name", Id.Name); -} - -void MappingTraits<ProcedureRecord>::mapping(IO &IO, ProcedureRecord &Proc) { - IO.mapRequired("ReturnType", 
Proc.ReturnType); - IO.mapRequired("CallConv", Proc.CallConv); - IO.mapRequired("Options", Proc.Options); - IO.mapRequired("ParameterCount", Proc.ParameterCount); - IO.mapRequired("ArgumentList", Proc.ArgumentList); -} - -void MappingTraits<MemberFunctionRecord>::mapping(IO &IO, - MemberFunctionRecord &MF) { - IO.mapRequired("ReturnType", MF.ReturnType); - IO.mapRequired("ClassType", MF.ClassType); - IO.mapRequired("ThisType", MF.ThisType); - IO.mapRequired("CallConv", MF.CallConv); - IO.mapRequired("Options", MF.Options); - IO.mapRequired("ParameterCount", MF.ParameterCount); - IO.mapRequired("ArgumentList", MF.ArgumentList); - IO.mapRequired("ThisPointerAdjustment", MF.ThisPointerAdjustment); -} - -void MappingTraits<MethodOverloadListRecord>::mapping( - IO &IO, MethodOverloadListRecord &MethodList) { - IO.mapRequired("Methods", MethodList.Methods); -} - -void MappingTraits<FuncIdRecord>::mapping(IO &IO, FuncIdRecord &Func) { - IO.mapRequired("ParentScope", Func.ParentScope); - IO.mapRequired("FunctionType", Func.FunctionType); - IO.mapRequired("Name", Func.Name); -} - -void MappingTraits<TypeServer2Record>::mapping(IO &IO, TypeServer2Record &TS) { - IO.mapRequired("Guid", TS.Guid); - IO.mapRequired("Age", TS.Age); - IO.mapRequired("Name", TS.Name); -} - -void MappingTraits<PointerRecord>::mapping(IO &IO, PointerRecord &Ptr) { - IO.mapRequired("ReferentType", Ptr.ReferentType); - IO.mapRequired("Attrs", Ptr.Attrs); - IO.mapOptional("MemberInfo", Ptr.MemberInfo); -} - -void MappingTraits<MemberPointerInfo>::mapping(IO &IO, MemberPointerInfo &MPI) { - IO.mapRequired("ContainingType", MPI.ContainingType); - IO.mapRequired("Representation", MPI.Representation); -} - -void MappingTraits<ModifierRecord>::mapping(IO &IO, ModifierRecord &Mod) { - IO.mapRequired("ModifiedType", Mod.ModifiedType); - IO.mapRequired("Modifiers", Mod.Modifiers); -} - -void MappingTraits<BitFieldRecord>::mapping(IO &IO, BitFieldRecord &BitField) { - IO.mapRequired("Type", BitField.Type); - IO.mapRequired("BitSize", BitField.BitSize); - IO.mapRequired("BitOffset", BitField.BitOffset); -} - -void MappingTraits<VFTableShapeRecord>::mapping(IO &IO, - VFTableShapeRecord &Shape) { - IO.mapRequired("Slots", Shape.Slots); -} - -void MappingTraits<UdtSourceLineRecord>::mapping(IO &IO, - UdtSourceLineRecord &Line) { - IO.mapRequired("UDT", Line.UDT); - IO.mapRequired("SourceFile", Line.SourceFile); - IO.mapRequired("LineNumber", Line.LineNumber); -} - -void MappingTraits<UdtModSourceLineRecord>::mapping( - IO &IO, UdtModSourceLineRecord &Line) { - IO.mapRequired("UDT", Line.UDT); - IO.mapRequired("SourceFile", Line.SourceFile); - IO.mapRequired("LineNumber", Line.LineNumber); - IO.mapRequired("Module", Line.Module); -} - -void MappingTraits<BuildInfoRecord>::mapping(IO &IO, BuildInfoRecord &Args) { - IO.mapRequired("ArgIndices", Args.ArgIndices); -} - -void MappingTraits<LabelRecord>::mapping(IO &IO, LabelRecord &R) { - IO.mapRequired("Mode", R.Mode); -} - -void MappingTraits<NestedTypeRecord>::mapping(IO &IO, - NestedTypeRecord &Nested) { - IO.mapRequired("Type", Nested.Type); - IO.mapRequired("Name", Nested.Name); -} - -void MappingTraits<OneMethodRecord>::mapping(IO &IO, OneMethodRecord &Method) { - IO.mapRequired("Type", Method.Type); - IO.mapRequired("Attrs", Method.Attrs.Attrs); - IO.mapRequired("VFTableOffset", Method.VFTableOffset); - IO.mapRequired("Name", Method.Name); -} - -void MappingTraits<OverloadedMethodRecord>::mapping( - IO &IO, OverloadedMethodRecord &Method) { - IO.mapRequired("NumOverloads", 
Method.NumOverloads); - IO.mapRequired("MethodList", Method.MethodList); - IO.mapRequired("Name", Method.Name); -} - -void MappingTraits<DataMemberRecord>::mapping(IO &IO, DataMemberRecord &Field) { - IO.mapRequired("Attrs", Field.Attrs.Attrs); - IO.mapRequired("Type", Field.Type); - IO.mapRequired("FieldOffset", Field.FieldOffset); - IO.mapRequired("Name", Field.Name); -} - -void MappingTraits<StaticDataMemberRecord>::mapping( - IO &IO, StaticDataMemberRecord &Field) { - IO.mapRequired("Attrs", Field.Attrs.Attrs); - IO.mapRequired("Type", Field.Type); - IO.mapRequired("Name", Field.Name); -} - -void MappingTraits<VFPtrRecord>::mapping(IO &IO, VFPtrRecord &VFTable) { - IO.mapRequired("Type", VFTable.Type); -} - -void MappingTraits<EnumeratorRecord>::mapping(IO &IO, EnumeratorRecord &Enum) { - IO.mapRequired("Attrs", Enum.Attrs.Attrs); - IO.mapRequired("Value", Enum.Value); - IO.mapRequired("Name", Enum.Name); -} - -void MappingTraits<BaseClassRecord>::mapping(IO &IO, BaseClassRecord &Base) { - IO.mapRequired("Attrs", Base.Attrs.Attrs); - IO.mapRequired("Type", Base.Type); - IO.mapRequired("Offset", Base.Offset); -} - -void MappingTraits<VirtualBaseClassRecord>::mapping( - IO &IO, VirtualBaseClassRecord &Base) { - IO.mapRequired("Attrs", Base.Attrs.Attrs); - IO.mapRequired("BaseType", Base.BaseType); - IO.mapRequired("VBPtrType", Base.VBPtrType); - IO.mapRequired("VBPtrOffset", Base.VBPtrOffset); - IO.mapRequired("VTableIndex", Base.VTableIndex); -} - -void MappingTraits<ListContinuationRecord>::mapping( - IO &IO, ListContinuationRecord &Cont) { - IO.mapRequired("ContinuationIndex", Cont.ContinuationIndex); -} - -void ScalarTraits<codeview::TypeIndex>::output(const codeview::TypeIndex &S, - void *, llvm::raw_ostream &OS) { - OS << S.getIndex(); -} -StringRef ScalarTraits<codeview::TypeIndex>::input(StringRef Scalar, void *Ctx, - codeview::TypeIndex &S) { - uint32_t I; - StringRef Result = ScalarTraits<uint32_t>::input(Scalar, Ctx, I); - if (!Result.empty()) - return Result; - S = TypeIndex(I); - return ""; -} -bool ScalarTraits<codeview::TypeIndex>::mustQuote(StringRef Scalar) { - return false; -} - -void ScalarEnumerationTraits<TypeLeafKind>::enumeration(IO &io, - TypeLeafKind &Value) { - auto TypeLeafNames = getTypeLeafNames(); - for (const auto &E : TypeLeafNames) - io.enumCase(Value, E.Name.str().c_str(), E.Value); -} -} -} - -Error llvm::codeview::yaml::YamlTypeDumperCallbacks::visitTypeBegin( - CVType &CVR) { - YamlIO.mapRequired("Kind", CVR.Type); - return Error::success(); -} - -Error llvm::codeview::yaml::YamlTypeDumperCallbacks::visitMemberBegin( - CVMemberRecord &Record) { - YamlIO.mapRequired("Kind", Record.Kind); - return Error::success(); -} - -void llvm::codeview::yaml::YamlTypeDumperCallbacks::visitKnownRecordImpl( - const char *Name, CVType &CVR, FieldListRecord &FieldList) { - std::vector<llvm::pdb::yaml::PdbTpiFieldListRecord> FieldListRecords; - if (YamlIO.outputting()) { - // If we are outputting, then `FieldList.Data` contains a huge chunk of data - // representing the serialized list of members. We need to split it up into - // individual CVType records where each record represents an individual - // member. This way, we can simply map the entire thing as a Yaml sequence, - // which will recurse back to the standard handler for top-level fields - // (top-level and member fields all have the exact same Yaml syntax so use - // the same parser). 
- FieldListRecordSplitter Splitter(FieldListRecords); - consumeError(codeview::visitMemberRecordStream(FieldList.Data, Splitter)); - } - // Note that if we're not outputting (i.e. Yaml -> PDB) the result of this - // mapping gets lost, as the records are simply stored in this locally scoped - // vector. What's important though is they are all sharing a single - // Serializer - // instance (in `Context.ActiveSerializer`), and that is building up a list of - // all the types. The fact that we need a throwaway vector here is just to - // appease the YAML API to treat this as a sequence and do this mapping once - // for each YAML Sequence element in the input Yaml stream. - YamlIO.mapRequired("FieldList", FieldListRecords, Context); -} - -namespace llvm { -namespace yaml { -template <> -struct MappingContextTraits<pdb::yaml::PdbTpiFieldListRecord, - pdb::yaml::SerializationContext> { - static void mapping(IO &IO, pdb::yaml::PdbTpiFieldListRecord &Obj, - pdb::yaml::SerializationContext &Context) { - if (IO.outputting()) - consumeError(codeview::visitMemberRecord(Obj.Record, Context.Dumper)); - else { - // If we are not outputting, then the array contains no data starting out, - // and is instead populated from the sequence represented by the yaml -- - // again, using the same logic that we use for top-level records. - assert(Context.ActiveSerializer && "There is no active serializer!"); - codeview::TypeVisitorCallbackPipeline Pipeline; - pdb::TpiHashUpdater Hasher; - - Pipeline.addCallbackToPipeline(Context.Dumper); - Pipeline.addCallbackToPipeline(*Context.ActiveSerializer); - Pipeline.addCallbackToPipeline(Hasher); - consumeError( - codeview::visitMemberRecord(Obj.Record, Pipeline, VDS_BytesExternal)); - } - } -}; -} -} diff --git a/tools/llvm-pdbdump/YamlTypeDumper.h b/tools/llvm-pdbdump/YamlTypeDumper.h deleted file mode 100644 index 3f15ba0bf85d..000000000000 --- a/tools/llvm-pdbdump/YamlTypeDumper.h +++ /dev/null @@ -1,116 +0,0 @@ -//===- YamlTypeDumper.h --------------------------------------- *- C++ --*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
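The non-outputting branch above chains the dumper, the active serializer, and a TPI hash updater into one TypeVisitorCallbackPipeline, so each member record is traversed once and observed by all three stages. A generic sketch of that fan-out idea, with stand-in Record and stage types rather than the CodeView classes:

    #include <functional>
    #include <string>
    #include <vector>

    struct Record {
      std::string Bytes; // stand-in for a serialized CodeView record
    };

    class VisitorPipeline {
      std::vector<std::function<void(Record &)>> Stages;

    public:
      void addStage(std::function<void(Record &)> S) {
        Stages.push_back(std::move(S));
      }
      void visit(Record &R) {
        // Every stage sees the record, in the order the stages were added.
        for (auto &S : Stages)
          S(R);
      }
    };

    // Usage, with hypothetical stage functions:
    //   VisitorPipeline P;
    //   P.addStage(dumpRecord);      // 1) dump to YAML
    //   P.addStage(serializeRecord); // 2) rebuild the binary record
    //   P.addStage(hashRecord);      // 3) update the hash table
    //   P.visit(R);

Stage order matters: each callback sees the record in insertion order, mirroring the addCallbackToPipeline calls above.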
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVMPDBDUMP_YAMLTYPEDUMPER_H -#define LLVM_TOOLS_LLVMPDBDUMP_YAMLTYPEDUMPER_H - -#include "llvm/DebugInfo/CodeView/CodeView.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" -#include "llvm/Support/YAMLTraits.h" - -namespace llvm { -namespace pdb { -namespace yaml { -struct SerializationContext; -} -} -namespace codeview { -namespace yaml { -class YamlTypeDumperCallbacks : public TypeVisitorCallbacks { -public: - YamlTypeDumperCallbacks(llvm::yaml::IO &IO, - llvm::pdb::yaml::SerializationContext &Context) - : YamlIO(IO), Context(Context) {} - - virtual Error visitTypeBegin(CVType &Record) override; - virtual Error visitMemberBegin(CVMemberRecord &Record) override; - -#define TYPE_RECORD(EnumName, EnumVal, Name) \ - Error visitKnownRecord(CVRecord<TypeLeafKind> &CVR, Name##Record &Record) \ - override { \ - visitKnownRecordImpl(#Name, CVR, Record); \ - return Error::success(); \ - } -#define MEMBER_RECORD(EnumName, EnumVal, Name) \ - Error visitKnownMember(CVMemberRecord &CVR, Name##Record &Record) override { \ - visitKnownMemberImpl(#Name, Record); \ - return Error::success(); \ - } -#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/TypeRecords.def" - -private: - template <typename T> void visitKnownMemberImpl(const char *Name, T &Record) { - YamlIO.mapRequired(Name, Record); - } - - template <typename T> - void visitKnownRecordImpl(const char *Name, CVType &Type, T &Record) { - YamlIO.mapRequired(Name, Record); - } - - void visitKnownRecordImpl(const char *Name, CVType &CVR, - FieldListRecord &FieldList); - - llvm::yaml::IO &YamlIO; - llvm::pdb::yaml::SerializationContext &Context; -}; -} -} -namespace pdb { -namespace yaml { -struct SerializationContext; -} -} -} - -namespace llvm { -namespace yaml { - -template <> struct ScalarTraits<APSInt> { - static void output(const APSInt &S, void *, llvm::raw_ostream &OS); - static StringRef input(StringRef Scalar, void *Ctx, APSInt &S); - static bool mustQuote(StringRef Scalar); -}; - -template <> struct ScalarTraits<codeview::TypeIndex> { - static void output(const codeview::TypeIndex &S, void *, - llvm::raw_ostream &OS); - static StringRef input(StringRef Scalar, void *Ctx, codeview::TypeIndex &S); - static bool mustQuote(StringRef Scalar); -}; - -template <> struct MappingTraits<codeview::MemberPointerInfo> { - static void mapping(IO &IO, codeview::MemberPointerInfo &Obj); -}; - -template <> -struct MappingContextTraits<codeview::CVType, pdb::yaml::SerializationContext> { - static void mapping(IO &IO, codeview::CVType &Obj, - pdb::yaml::SerializationContext &Context); -}; - -template <> struct ScalarEnumerationTraits<codeview::TypeLeafKind> { - static void enumeration(IO &io, codeview::TypeLeafKind &Value); -}; - -#define TYPE_RECORD(EnumName, EnumVal, Name) \ - template <> struct MappingTraits<codeview::Name##Record> { \ - static void mapping(IO &IO, codeview::Name##Record &Obj); \ - }; -#define MEMBER_RECORD(EnumName, EnumVal, Name) \ - TYPE_RECORD(EnumName, EnumVal, Name) -#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/TypeRecords.def" -} -} - -#endif diff --git a/tools/llvm-pdbdump/llvm-pdbdump.cpp b/tools/llvm-pdbdump/llvm-pdbdump.cpp index baba862ae663..0b2b766a3c52 100644 --- 
a/tools/llvm-pdbdump/llvm-pdbdump.cpp +++ b/tools/llvm-pdbdump/llvm-pdbdump.cpp @@ -476,6 +476,7 @@ static void yamlToPdb(StringRef Path) { std::unique_ptr<MemoryBuffer> &Buffer = ErrorOrBuffer.get(); llvm::yaml::Input In(Buffer->getBuffer()); + In.setContext(&Allocator); pdb::yaml::PdbObject YamlObj(Allocator); In >> YamlObj; @@ -535,7 +536,7 @@ static void yamlToPdb(StringRef Path) { if (MI.Modi.hasValue()) { const auto &ModiStream = *MI.Modi; for (auto Symbol : ModiStream.Symbols) - ModiBuilder.addSymbol(Symbol.Record); + ModiBuilder.addSymbol(Symbol.toCodeViewSymbol(Allocator)); } if (MI.FileLineInfo.hasValue()) { const auto &FLI = *MI.FileLineInfo; @@ -584,7 +585,7 @@ static void yamlToPdb(StringRef Path) { auto Inlinees = llvm::make_unique<DebugInlineeLinesSubsection>( ChecksumRef, Inlinee.HasExtraFiles); for (const auto &Site : Inlinee.Sites) { - Inlinees->addInlineSite(Site.Inlinee, Site.FileName, + Inlinees->addInlineSite(TypeIndex(Site.Inlinee), Site.FileName, Site.SourceLineNum); if (!Inlinee.HasExtraFiles) continue; @@ -601,14 +602,18 @@ static void yamlToPdb(StringRef Path) { auto &TpiBuilder = Builder.getTpiBuilder(); const auto &Tpi = YamlObj.TpiStream.getValueOr(DefaultTpiStream); TpiBuilder.setVersionHeader(Tpi.Version); - for (const auto &R : Tpi.Records) - TpiBuilder.addTypeRecord(R.Record.data(), R.Record.Hash); + for (const auto &R : Tpi.Records) { + CVType Type = R.toCodeViewRecord(Allocator); + TpiBuilder.addTypeRecord(Type.RecordData, None); + } const auto &Ipi = YamlObj.IpiStream.getValueOr(DefaultIpiStream); auto &IpiBuilder = Builder.getIpiBuilder(); IpiBuilder.setVersionHeader(Ipi.Version); - for (const auto &R : Ipi.Records) - IpiBuilder.addTypeRecord(R.Record.data(), R.Record.Hash); + for (const auto &R : Ipi.Records) { + CVType Type = R.toCodeViewRecord(Allocator); + IpiBuilder.addTypeRecord(Type.RecordData, None); + } ExitOnErr(Builder.commit(opts::yaml2pdb::YamlPdbOutputFile)); } diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp index 2e9e01d9642b..427920569042 100644 --- a/tools/llvm-readobj/ELFDumper.cpp +++ b/tools/llvm-readobj/ELFDumper.cpp @@ -2742,6 +2742,7 @@ std::string GNUStyle<ELFT>::getSymbolSectionNdx(const ELFO *Obj, case ELF::SHN_XINDEX: SectionIndex = unwrapOrError(object::getExtendedSymbolTableIndex<ELFT>( Symbol, FirstSym, this->dumper()->getShndxTable())); + LLVM_FALLTHROUGH; default: // Find if: // Processor specific diff --git a/tools/obj2yaml/macho2yaml.cpp b/tools/obj2yaml/macho2yaml.cpp index 9ad2a6d979f5..f7b6c4748d5e 100644 --- a/tools/obj2yaml/macho2yaml.cpp +++ b/tools/obj2yaml/macho2yaml.cpp @@ -261,6 +261,7 @@ void MachODumper::dumpRebaseOpcodes(std::unique_ptr<MachOYAML::Object> &Y) { ULEB = decodeULEB128(OpCode + 1, &Count); RebaseOp.ExtraData.push_back(ULEB); OpCode += Count; + LLVM_FALLTHROUGH; // Intentionally no break here -- This opcode has two ULEB values case MachO::REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB: case MachO::REBASE_OPCODE_ADD_ADDR_ULEB: @@ -308,6 +309,7 @@ void MachODumper::dumpBindOpcodes( ULEB = decodeULEB128(OpCode + 1, &Count); BindOp.ULEBExtraData.push_back(ULEB); OpCode += Count; + LLVM_FALLTHROUGH; // Intentionally no break here -- this opcode has two ULEB values case MachO::BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB: diff --git a/tools/opt/NewPMDriver.cpp b/tools/opt/NewPMDriver.cpp index df467da690e7..58e9caeff0fb 100644 --- a/tools/opt/NewPMDriver.cpp +++ b/tools/opt/NewPMDriver.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/ErrorHandling.h" #include 
"llvm/Support/ToolOutputFile.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/IPO/ThinLTOBitcodeWriter.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" using namespace llvm; @@ -47,8 +48,9 @@ static cl::opt<std::string> "pipeline for handling managed aliasing queries"), cl::Hidden); -bool llvm::runPassPipeline(StringRef Arg0, Module &M, - TargetMachine *TM, tool_output_file *Out, +bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM, + tool_output_file *Out, + tool_output_file *ThinLTOLinkOut, StringRef PassPipeline, OutputKind OK, VerifierKind VK, bool ShouldPreserveAssemblyUseListOrder, @@ -104,6 +106,10 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, MPM.addPass(BitcodeWriterPass(Out->os(), ShouldPreserveBitcodeUseListOrder, EmitSummaryIndex, EmitModuleHash)); break; + case OK_OutputThinLTOBitcode: + MPM.addPass(ThinLTOBitcodeWriterPass( + Out->os(), ThinLTOLinkOut ? &ThinLTOLinkOut->os() : nullptr)); + break; } // Before executing passes, print the final values of the LLVM options. @@ -113,7 +119,10 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, MPM.run(M, MAM); // Declare success. - if (OK != OK_NoOutput) + if (OK != OK_NoOutput) { Out->keep(); + if (OK == OK_OutputThinLTOBitcode && ThinLTOLinkOut) + ThinLTOLinkOut->keep(); + } return true; } diff --git a/tools/opt/NewPMDriver.h b/tools/opt/NewPMDriver.h index 04022e7ec562..8012e0a025c9 100644 --- a/tools/opt/NewPMDriver.h +++ b/tools/opt/NewPMDriver.h @@ -32,7 +32,8 @@ namespace opt_tool { enum OutputKind { OK_NoOutput, OK_OutputAssembly, - OK_OutputBitcode + OK_OutputBitcode, + OK_OutputThinLTOBitcode, }; enum VerifierKind { VK_NoVerifier, @@ -47,8 +48,11 @@ enum VerifierKind { /// inclusion of the new pass manager headers and the old headers into the same /// file. It's interface is consequentially somewhat ad-hoc, but will go away /// when the transition finishes. -bool runPassPipeline(StringRef Arg0, Module &M, - TargetMachine *TM, tool_output_file *Out, +/// +/// ThinLTOLinkOut is only used when OK is OK_OutputThinLTOBitcode, and can be +/// nullptr. +bool runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM, + tool_output_file *Out, tool_output_file *ThinLinkOut, StringRef PassPipeline, opt_tool::OutputKind OK, opt_tool::VerifierKind VK, bool ShouldPreserveAssemblyUseListOrder, diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp index bef197d603ca..9d489ab5a2d2 100644 --- a/tools/opt/opt.cpp +++ b/tools/opt/opt.cpp @@ -518,7 +518,9 @@ int main(int argc, char **argv) { if (PassPipeline.getNumOccurrences() > 0) { OutputKind OK = OK_NoOutput; if (!NoOutput) - OK = OutputAssembly ? OK_OutputAssembly : OK_OutputBitcode; + OK = OutputAssembly + ? OK_OutputAssembly + : (OutputThinLTOBC ? OK_OutputThinLTOBitcode : OK_OutputBitcode); VerifierKind VK = VK_VerifyInAndOut; if (NoVerify) @@ -529,7 +531,7 @@ int main(int argc, char **argv) { // The user has asked to use the new pass manager and provided a pipeline // string. Hand off the rest of the functionality to the new code for that // layer. 
- return runPassPipeline(argv[0], *M, TM.get(), Out.get(), + return runPassPipeline(argv[0], *M, TM.get(), Out.get(), ThinLinkOut.get(), PassPipeline, OK, VK, PreserveAssemblyUseListOrder, PreserveBitcodeUseListOrder, EmitSummaryIndex, EmitModuleHash) diff --git a/unittests/ADT/ArrayRefTest.cpp b/unittests/ADT/ArrayRefTest.cpp index 65b4cbcd6689..4694ff112cb5 100644 --- a/unittests/ADT/ArrayRefTest.cpp +++ b/unittests/ADT/ArrayRefTest.cpp @@ -11,6 +11,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/raw_ostream.h" #include "gtest/gtest.h" +#include <limits> #include <vector> using namespace llvm; @@ -80,15 +81,25 @@ TEST(ArrayRefTest, AllocatorCopy) { EXPECT_NE(makeArrayRef(Array3Src).data(), Array3Copy.data()); } +TEST(ArrayRefTest, SizeTSizedOperations) { + ArrayRef<char> AR(nullptr, std::numeric_limits<ptrdiff_t>::max()); + + // Check that drop_back accepts size_t-sized numbers. + EXPECT_EQ(1U, AR.drop_back(AR.size() - 1).size()); + + // Check that drop_front accepts size_t-sized numbers. + EXPECT_EQ(1U, AR.drop_front(AR.size() - 1).size()); + + // Check that slice accepts size_t-sized numbers. + EXPECT_EQ(1U, AR.slice(AR.size() - 1).size()); + EXPECT_EQ(AR.size() - 1, AR.slice(1, AR.size() - 1).size()); +} + TEST(ArrayRefTest, DropBack) { static const int TheNumbers[] = {4, 8, 15, 16, 23, 42}; ArrayRef<int> AR1(TheNumbers); ArrayRef<int> AR2(TheNumbers, AR1.size() - 1); EXPECT_TRUE(AR1.drop_back().equals(AR2)); - - // Check that drop_back accepts size_t-sized numbers. - ArrayRef<char> AR3((const char *)0x10000, SIZE_MAX - 0x10000); - EXPECT_EQ(1U, AR3.drop_back(AR3.size() - 1).size()); } TEST(ArrayRefTest, DropFront) { @@ -96,10 +107,6 @@ TEST(ArrayRefTest, DropFront) { ArrayRef<int> AR1(TheNumbers); ArrayRef<int> AR2(&TheNumbers[2], AR1.size() - 2); EXPECT_TRUE(AR1.drop_front(2).equals(AR2)); - - // Check that drop_front accepts size_t-sized numbers. - ArrayRef<char> AR3((const char *)0x10000, SIZE_MAX - 0x10000); - EXPECT_EQ(1U, AR3.drop_front(AR3.size() - 1).size()); } TEST(ArrayRefTest, DropWhile) { @@ -187,13 +194,6 @@ TEST(ArrayRefTest, EmptyEquals) { EXPECT_TRUE(ArrayRef<unsigned>() == ArrayRef<unsigned>()); } -TEST(ArrayRefTest, Slice) { - // Check that slice accepts size_t-sized numbers. 
- ArrayRef<char> AR((const char *)0x10000, SIZE_MAX - 0x10000); - EXPECT_EQ(1U, AR.slice(AR.size() - 1).size()); - EXPECT_EQ(AR.size() - 1, AR.slice(1, AR.size() - 1).size()); -} - TEST(ArrayRefTest, ConstConvert) { int buf[4]; for (int i = 0; i < 4; ++i) diff --git a/unittests/IR/AttributesTest.cpp b/unittests/IR/AttributesTest.cpp index 7af4aebd540a..ab018d845382 100644 --- a/unittests/IR/AttributesTest.cpp +++ b/unittests/IR/AttributesTest.cpp @@ -82,4 +82,11 @@ TEST(Attributes, AddMatchingAlignAttr) { EXPECT_TRUE(AL.hasParamAttribute(0, Attribute::NonNull)); } +TEST(Attributes, EmptyGet) { + LLVMContext C; + AttributeList EmptyLists[] = {AttributeList(), AttributeList()}; + AttributeList AL = AttributeList::get(C, EmptyLists); + EXPECT_TRUE(AL.isEmpty()); +} + } // end anonymous namespace diff --git a/unittests/Support/TargetParserTest.cpp b/unittests/Support/TargetParserTest.cpp index 6be6f7bfb5d5..76d1917d537a 100644 --- a/unittests/Support/TargetParserTest.cpp +++ b/unittests/Support/TargetParserTest.cpp @@ -319,7 +319,7 @@ TEST(TargetParserTest, testARMArch) { testARMArch("armv6-m", "cortex-m0", "v6m", ARMBuildAttrs::CPUArch::v6_M)); EXPECT_TRUE( - testARMArch("armv7-a", "cortex-a8", "v7", + testARMArch("armv7-a", "generic", "v7", ARMBuildAttrs::CPUArch::v7)); EXPECT_TRUE( testARMArch("armv7ve", "generic", "v7ve", @@ -334,7 +334,7 @@ TEST(TargetParserTest, testARMArch) { testARMArch("armv7e-m", "cortex-m4", "v7em", ARMBuildAttrs::CPUArch::v7E_M)); EXPECT_TRUE( - testARMArch("armv8-a", "cortex-a53", "v8", + testARMArch("armv8-a", "generic", "v8", ARMBuildAttrs::CPUArch::v8_A)); EXPECT_TRUE( testARMArch("armv8.1-a", "generic", "v8.1a", diff --git a/utils/TableGen/AsmMatcherEmitter.cpp b/utils/TableGen/AsmMatcherEmitter.cpp index 264175ae9677..0980e08f67f7 100644 --- a/utils/TableGen/AsmMatcherEmitter.cpp +++ b/utils/TableGen/AsmMatcherEmitter.cpp @@ -354,11 +354,11 @@ public: class AsmVariantInfo { public: - std::string RegisterPrefix; - std::string TokenizingCharacters; - std::string SeparatorCharacters; - std::string BreakCharacters; - std::string Name; + StringRef RegisterPrefix; + StringRef TokenizingCharacters; + StringRef SeparatorCharacters; + StringRef BreakCharacters; + StringRef Name; int AsmVariantNo; }; @@ -1438,8 +1438,8 @@ void AsmMatcherInfo::buildInfo() { unsigned VariantCount = Target.getAsmParserVariantCount(); for (unsigned VC = 0; VC != VariantCount; ++VC) { Record *AsmVariant = Target.getAsmParserVariant(VC); - std::string CommentDelimiter = - AsmVariant->getValueAsString("CommentDelimiter"); + StringRef CommentDelimiter = + AsmVariant->getValueAsString("CommentDelimiter"); AsmVariantInfo Variant; Variant.RegisterPrefix = AsmVariant->getValueAsString("RegisterPrefix"); Variant.TokenizingCharacters = @@ -1463,7 +1463,7 @@ void AsmMatcherInfo::buildInfo() { continue; // Ignore instructions for different instructions - const std::string V = CGI->TheDef->getValueAsString("AsmVariantName"); + StringRef V = CGI->TheDef->getValueAsString("AsmVariantName"); if (!V.empty() && V != Variant.Name) continue; @@ -1495,7 +1495,7 @@ void AsmMatcherInfo::buildInfo() { .startswith( MatchPrefix)) continue; - const std::string V = Alias->TheDef->getValueAsString("AsmVariantName"); + StringRef V = Alias->TheDef->getValueAsString("AsmVariantName"); if (!V.empty() && V != Variant.Name) continue; @@ -1564,8 +1564,8 @@ void AsmMatcherInfo::buildInfo() { // If the instruction has a two-operand alias, build up the // matchable here. 
We'll add them in bulk at the end to avoid // confusing this loop. - std::string Constraint = - II->TheDef->getValueAsString("TwoOperandAliasConstraint"); + StringRef Constraint = + II->TheDef->getValueAsString("TwoOperandAliasConstraint"); if (Constraint != "") { // Start by making a copy of the original matchable. auto AliasII = llvm::make_unique<MatchableInfo>(*II); @@ -1898,10 +1898,10 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, for (auto &II : Infos) { // Check if we have a custom match function. - std::string AsmMatchConverter = - II->getResultInst()->TheDef->getValueAsString("AsmMatchConverter"); + StringRef AsmMatchConverter = + II->getResultInst()->TheDef->getValueAsString("AsmMatchConverter"); if (!AsmMatchConverter.empty() && II->UseInstAsmMatchConverter) { - std::string Signature = "ConvertCustom_" + AsmMatchConverter; + std::string Signature = ("ConvertCustom_" + AsmMatchConverter).str(); II->ConversionFnKind = Signature; // Check if we have already generated this signature. @@ -2443,7 +2443,7 @@ static void emitMnemonicAliasVariant(raw_ostream &OS,const AsmMatcherInfo &Info, for (Record *R : Aliases) { // FIXME: Allow AssemblerVariantName to be a comma separated list. - std::string AsmVariantName = R->getValueAsString("AsmVariantName"); + StringRef AsmVariantName = R->getValueAsString("AsmVariantName"); if (AsmVariantName != AsmParserVariantName) continue; AliasesFromMnemonic[R->getValueAsString("FromMnemonic")].push_back(R); @@ -2486,14 +2486,18 @@ static void emitMnemonicAliasVariant(raw_ostream &OS,const AsmMatcherInfo &Info, if (!MatchCode.empty()) MatchCode += "else "; MatchCode += "if ((Features & " + FeatureMask + ") == "+FeatureMask+")\n"; - MatchCode += " Mnemonic = \"" +R->getValueAsString("ToMnemonic")+"\";\n"; + MatchCode += " Mnemonic = \""; + MatchCode += R->getValueAsString("ToMnemonic"); + MatchCode += "\";\n"; } if (AliasWithNoPredicate != -1) { Record *R = ToVec[AliasWithNoPredicate]; if (!MatchCode.empty()) MatchCode += "else\n "; - MatchCode += "Mnemonic = \"" + R->getValueAsString("ToMnemonic")+"\";\n"; + MatchCode += "Mnemonic = \""; + MatchCode += R->getValueAsString("ToMnemonic"); + MatchCode += "\";\n"; } MatchCode += "return;"; @@ -2522,7 +2526,7 @@ static bool emitMnemonicAliases(raw_ostream &OS, const AsmMatcherInfo &Info, for (unsigned VC = 0; VC != VariantCount; ++VC) { Record *AsmVariant = Target.getAsmParserVariant(VC); int AsmParserVariantNo = AsmVariant->getValueAsInt("Variant"); - std::string AsmParserVariantName = AsmVariant->getValueAsString("Name"); + StringRef AsmParserVariantName = AsmVariant->getValueAsString("Name"); OS << " case " << AsmParserVariantNo << ":\n"; emitMnemonicAliasVariant(OS, Info, Aliases, /*Indent=*/2, AsmParserVariantName); @@ -2710,7 +2714,7 @@ static void emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target, void AsmMatcherEmitter::run(raw_ostream &OS) { CodeGenTarget Target(Records); Record *AsmParser = Target.getAsmParser(); - std::string ClassName = AsmParser->getValueAsString("AsmParserClassName"); + StringRef ClassName = AsmParser->getValueAsString("AsmParserClassName"); // Compute the information on the instructions to match. AsmMatcherInfo Info(AsmParser, Target, Records); @@ -3173,8 +3177,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { << " }\n\n"; // Call the post-processing function, if used. 
- std::string InsnCleanupFn = - AsmParser->getValueAsString("AsmParserInstCleanup"); + StringRef InsnCleanupFn = AsmParser->getValueAsString("AsmParserInstCleanup"); if (!InsnCleanupFn.empty()) OS << " " << InsnCleanupFn << "(Inst);\n"; diff --git a/utils/TableGen/AsmWriterEmitter.cpp b/utils/TableGen/AsmWriterEmitter.cpp index 40b7857ab994..30d21984c4d3 100644 --- a/utils/TableGen/AsmWriterEmitter.cpp +++ b/utils/TableGen/AsmWriterEmitter.cpp @@ -272,7 +272,7 @@ static void UnescapeString(std::string &Str) { /// clearing the Instructions vector. void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) { Record *AsmWriter = Target.getAsmWriter(); - std::string ClassName = AsmWriter->getValueAsString("AsmWriterClassName"); + StringRef ClassName = AsmWriter->getValueAsString("AsmWriterClassName"); bool PassSubtarget = AsmWriter->getValueAsInt("PassSubtarget"); O << @@ -523,7 +523,7 @@ emitRegisterNameString(raw_ostream &O, StringRef AltName, // If the register has an alternate name for this index, use it. // Otherwise, leave it empty as an error flag. if (Idx < e) { - std::vector<std::string> AltNames = + std::vector<StringRef> AltNames = Reg.TheDef->getValueAsListOfStrings("AltNames"); if (AltNames.size() <= Idx) PrintFatalError(Reg.TheDef->getLoc(), @@ -553,12 +553,11 @@ emitRegisterNameString(raw_ostream &O, StringRef AltName, void AsmWriterEmitter::EmitGetRegisterName(raw_ostream &O) { Record *AsmWriter = Target.getAsmWriter(); - std::string ClassName = AsmWriter->getValueAsString("AsmWriterClassName"); + StringRef ClassName = AsmWriter->getValueAsString("AsmWriterClassName"); const auto &Registers = Target.getRegBank().getRegisters(); const std::vector<Record*> &AltNameIndices = Target.getRegAltNameIndices(); bool hasAltNames = AltNameIndices.size() > 1; - std::string Namespace = - Registers.front().TheDef->getValueAsString("Namespace"); + StringRef Namespace = Registers.front().TheDef->getValueAsString("Namespace"); O << "\n\n/// getRegisterName - This method is automatically generated by tblgen\n" @@ -583,14 +582,16 @@ void AsmWriterEmitter::EmitGetRegisterName(raw_ostream &O) { O << " switch(AltIdx) {\n" << " default: llvm_unreachable(\"Invalid register alt name index!\");\n"; for (const Record *R : AltNameIndices) { - const std::string &AltName = R->getName(); - std::string Prefix = !Namespace.empty() ? Namespace + "::" : ""; - O << " case " << Prefix << AltName << ":\n" - << " assert(*(AsmStrs" << AltName << "+RegAsmOffset" - << AltName << "[RegNo-1]) &&\n" + StringRef AltName = R->getName(); + O << " case "; + if (!Namespace.empty()) + O << Namespace << "::"; + O << AltName << ":\n" + << " assert(*(AsmStrs" << AltName << "+RegAsmOffset" << AltName + << "[RegNo-1]) &&\n" << " \"Invalid alt name index for register!\");\n" - << " return AsmStrs" << AltName << "+RegAsmOffset" - << AltName << "[RegNo-1];\n"; + << " return AsmStrs" << AltName << "+RegAsmOffset" << AltName + << "[RegNo-1];\n"; } O << " }\n"; } else { @@ -762,7 +763,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { ////////////////////////////// // Emit the method that prints the alias instruction. 
- std::string ClassName = AsmWriter->getValueAsString("AsmWriterClassName"); + StringRef ClassName = AsmWriter->getValueAsString("AsmWriterClassName"); unsigned Variant = AsmWriter->getValueAsInt("Variant"); bool PassSubtarget = AsmWriter->getValueAsInt("PassSubtarget"); @@ -807,7 +808,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { IAPrinter IAP(CGA.Result->getAsString(), CGA.AsmString); - std::string Namespace = Target.getName(); + StringRef Namespace = Target.getName(); std::vector<Record *> ReqFeatures; if (PassSubtarget) { // We only consider ReqFeatures predicates if PassSubtarget @@ -845,7 +846,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { // code to use. if (Rec->isSubClassOf("RegisterOperand") || Rec->isSubClassOf("Operand")) { - std::string PrintMethod = Rec->getValueAsString("PrintMethod"); + StringRef PrintMethod = Rec->getValueAsString("PrintMethod"); if (PrintMethod != "" && PrintMethod != "printOperand") { PrintMethodIdx = llvm::find(PrintMethods, PrintMethod) - PrintMethods.begin(); @@ -886,8 +887,9 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { } else break; // No conditions on this operand at all } - Cond = Target.getName().str() + ClassName + "ValidateMCOperand(" + - Op + ", STI, " + utostr(Entry) + ")"; + Cond = (Target.getName() + ClassName + "ValidateMCOperand(" + Op + + ", STI, " + utostr(Entry) + ")") + .str(); } // for all subcases of ResultOperand::K_Record: IAP.addCond(Cond); @@ -923,7 +925,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { for (auto I = ReqFeatures.cbegin(); I != ReqFeatures.cend(); I++) { Record *R = *I; - std::string AsmCondString = R->getValueAsString("AssemblerCondString"); + StringRef AsmCondString = R->getValueAsString("AssemblerCondString"); // AsmCondString has syntax [!]F(,[!]F)* SmallVector<StringRef, 4> Ops; @@ -933,10 +935,12 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { for (auto &Op : Ops) { assert(!Op.empty() && "Empty operator"); if (Op[0] == '!') - Cond = "!STI.getFeatureBits()[" + Namespace + "::" + - Op.substr(1).str() + "]"; + Cond = ("!STI.getFeatureBits()[" + Namespace + "::" + Op.substr(1) + + "]") + .str(); else - Cond = "STI.getFeatureBits()[" + Namespace + "::" + Op.str() + "]"; + Cond = + ("STI.getFeatureBits()[" + Namespace + "::" + Op + "]").str(); IAP.addCond(Cond); } } diff --git a/utils/TableGen/Attributes.cpp b/utils/TableGen/Attributes.cpp index 927f6e0e5b44..d64d30e18c3e 100644 --- a/utils/TableGen/Attributes.cpp +++ b/utils/TableGen/Attributes.cpp @@ -115,7 +115,7 @@ void Attributes::emitFnAttrCompatCheck(raw_ostream &OS, bool IsStringAttr) { Records.getAllDerivedDefinitions("CompatRule"); for (auto *Rule : CompatRules) { - std::string FuncName = Rule->getValueAsString("CompatFunc"); + StringRef FuncName = Rule->getValueAsString("CompatFunc"); OS << " Ret &= " << FuncName << "(Caller, Callee);\n"; } @@ -129,7 +129,7 @@ void Attributes::emitFnAttrCompatCheck(raw_ostream &OS, bool IsStringAttr) { << " const Function &Callee) {\n"; for (auto *Rule : MergeRules) { - std::string FuncName = Rule->getValueAsString("MergeFunc"); + StringRef FuncName = Rule->getValueAsString("MergeFunc"); OS << " " << FuncName << "(Caller, Callee);\n"; } diff --git a/utils/TableGen/CodeEmitterGen.cpp b/utils/TableGen/CodeEmitterGen.cpp index 565235d82143..b80dd5daefe0 100644 --- a/utils/TableGen/CodeEmitterGen.cpp +++ b/utils/TableGen/CodeEmitterGen.cpp @@ -218,10 +218,12 @@ std::string 
CodeEmitterGen::getInstructionCase(Record *R, AddCodeToMergeInOperand(R, BI, Vals[i].getName(), NumberedOp, NamedOpIndices, Case, Target); } - - std::string PostEmitter = R->getValueAsString("PostEncoderMethod"); + + StringRef PostEmitter = R->getValueAsString("PostEncoderMethod"); if (!PostEmitter.empty()) { - Case += " Value = " + PostEmitter + "(MI, Value"; + Case += " Value = "; + Case += PostEmitter; + Case += "(MI, Value"; Case += ", STI"; Case += ");\n"; } @@ -278,11 +280,11 @@ void CodeEmitterGen::run(raw_ostream &o) { if (R->getValueAsString("Namespace") == "TargetOpcode" || R->getValueAsBit("isPseudo")) continue; - const std::string &InstName = R->getValueAsString("Namespace") + "::" - + R->getName().str(); + std::string InstName = + (R->getValueAsString("Namespace") + "::" + R->getName()).str(); std::string Case = getInstructionCase(R, Target); - CaseMap[Case].push_back(InstName); + CaseMap[Case].push_back(std::move(InstName)); } // Emit initial function code diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp index ef2cb4208eae..231a6ad5706b 100644 --- a/utils/TableGen/CodeGenDAGPatterns.cpp +++ b/utils/TableGen/CodeGenDAGPatterns.cpp @@ -893,7 +893,9 @@ std::string PatternToMatch::getPredicateCheck() const { for (Record *Pred : PredicateRecs) { if (!PredicateCheck.empty()) PredicateCheck += " && "; - PredicateCheck += "(" + Pred->getValueAsString("CondString") + ")"; + PredicateCheck += "("; + PredicateCheck += Pred->getValueAsString("CondString"); + PredicateCheck += ")"; } return PredicateCheck.str(); @@ -2450,7 +2452,7 @@ void CodeGenDAGPatterns::ParseNodeTransforms() { while (!Xforms.empty()) { Record *XFormNode = Xforms.back(); Record *SDNode = XFormNode->getValueAsDef("Opcode"); - std::string Code = XFormNode->getValueAsString("XFormFunction"); + StringRef Code = XFormNode->getValueAsString("XFormFunction"); SDNodeXForms.insert(std::make_pair(XFormNode, NodeXForm(SDNode, Code))); Xforms.pop_back(); diff --git a/utils/TableGen/CodeGenDAGPatterns.h b/utils/TableGen/CodeGenDAGPatterns.h index 189d6e382ee7..5c56fb644e7f 100644 --- a/utils/TableGen/CodeGenDAGPatterns.h +++ b/utils/TableGen/CodeGenDAGPatterns.h @@ -223,8 +223,8 @@ struct SDTypeConstraint { /// processing. class SDNodeInfo { Record *Def; - std::string EnumName; - std::string SDClassName; + StringRef EnumName; + StringRef SDClassName; unsigned Properties; unsigned NumResults; int NumOperands; @@ -238,8 +238,8 @@ public: /// variadic. int getNumOperands() const { return NumOperands; } Record *getRecord() const { return Def; } - const std::string &getEnumName() const { return EnumName; } - const std::string &getSDClassName() const { return SDClassName; } + StringRef getEnumName() const { return EnumName; } + StringRef getSDClassName() const { return SDClassName; } const std::vector<SDTypeConstraint> &getTypeConstraints() const { return TypeConstraints; diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp index 627614d991d5..3907336221a4 100644 --- a/utils/TableGen/CodeGenRegisters.cpp +++ b/utils/TableGen/CodeGenRegisters.cpp @@ -679,11 +679,6 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R) Name(R->getName()), TopoSigs(RegBank.getNumTopoSigs()), EnumValue(-1) { - // Rename anonymous register classes. 
- if (R->getName().size() > 9 && R->getName()[9] == '.') { - static unsigned AnonCounter = 0; - R->setName("AnonRegClass_" + utostr(AnonCounter++)); - } std::vector<Record*> TypeList = R->getValueAsListOfDefs("RegTypes"); for (unsigned i = 0, e = TypeList.size(); i != e; ++i) { @@ -867,7 +862,7 @@ std::string CodeGenRegisterClass::getQualifiedName() const { if (Namespace.empty()) return getName(); else - return Namespace + "::" + getName(); + return (Namespace + "::" + getName()).str(); } // Compute sub-classes of all register classes. diff --git a/utils/TableGen/CodeGenRegisters.h b/utils/TableGen/CodeGenRegisters.h index 9366838c77cd..1fcba8a135d1 100644 --- a/utils/TableGen/CodeGenRegisters.h +++ b/utils/TableGen/CodeGenRegisters.h @@ -308,13 +308,13 @@ namespace llvm { public: unsigned EnumValue; - std::string Namespace; + StringRef Namespace; SmallVector<MVT::SimpleValueType, 4> VTs; unsigned SpillSize; unsigned SpillAlignment; int CopyCost; bool Allocatable; - std::string AltOrderSelect; + StringRef AltOrderSelect; uint8_t AllocationPriority; /// Contains the combination of the lane masks of all subregisters. LaneBitmask LaneMask; diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp index cae1cf4b861e..20f6047052ff 100644 --- a/utils/TableGen/CodeGenSchedule.cpp +++ b/utils/TableGen/CodeGenSchedule.cpp @@ -542,7 +542,7 @@ void CodeGenSchedModels::collectSchedClasses() { return; for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) { - std::string InstName = Inst->TheDef->getName(); + StringRef InstName = Inst->TheDef->getName(); unsigned SCIdx = InstrClassMap.lookup(Inst->TheDef); if (!SCIdx) { if (!Inst->hasNoSchedulingInfo) diff --git a/utils/TableGen/FixedLenDecoderEmitter.cpp b/utils/TableGen/FixedLenDecoderEmitter.cpp index e1aaeccb08d0..75fd73082b9a 100644 --- a/utils/TableGen/FixedLenDecoderEmitter.cpp +++ b/utils/TableGen/FixedLenDecoderEmitter.cpp @@ -1145,16 +1145,15 @@ bool FilterChooser::emitPredicateMatch(raw_ostream &o, unsigned &Indentation, if (!Pred->getValue("AssemblerMatcherPredicate")) continue; - std::string P = Pred->getValueAsString("AssemblerCondString"); + StringRef P = Pred->getValueAsString("AssemblerCondString"); - if (!P.length()) + if (P.empty()) continue; if (!IsFirstEmission) o << " && "; - StringRef SR(P); - std::pair<StringRef, StringRef> pairs = SR.split(','); + std::pair<StringRef, StringRef> pairs = P.split(','); while (!pairs.second.empty()) { emitSinglePredicateMatch(o, pairs.first, Emitter->PredicateNamespace); o << " && "; @@ -1174,9 +1173,9 @@ bool FilterChooser::doesOpcodeNeedPredicate(unsigned Opc) const { if (!Pred->getValue("AssemblerMatcherPredicate")) continue; - std::string P = Pred->getValueAsString("AssemblerCondString"); + StringRef P = Pred->getValueAsString("AssemblerCondString"); - if (!P.length()) + if (P.empty()) continue; return true; @@ -1744,7 +1743,7 @@ static bool populateInstruction(CodeGenTarget &Target, // If the instruction has specified a custom decoding hook, use that instead // of trying to auto-generate the decoder. 
diff --git a/utils/TableGen/FixedLenDecoderEmitter.cpp b/utils/TableGen/FixedLenDecoderEmitter.cpp
index e1aaeccb08d0..75fd73082b9a 100644
--- a/utils/TableGen/FixedLenDecoderEmitter.cpp
+++ b/utils/TableGen/FixedLenDecoderEmitter.cpp
@@ -1145,16 +1145,15 @@ bool FilterChooser::emitPredicateMatch(raw_ostream &o, unsigned &Indentation,
     if (!Pred->getValue("AssemblerMatcherPredicate"))
       continue;

-    std::string P = Pred->getValueAsString("AssemblerCondString");
+    StringRef P = Pred->getValueAsString("AssemblerCondString");

-    if (!P.length())
+    if (P.empty())
       continue;

     if (!IsFirstEmission)
       o << " && ";

-    StringRef SR(P);
-    std::pair<StringRef, StringRef> pairs = SR.split(',');
+    std::pair<StringRef, StringRef> pairs = P.split(',');
     while (!pairs.second.empty()) {
       emitSinglePredicateMatch(o, pairs.first, Emitter->PredicateNamespace);
       o << " && ";
@@ -1174,9 +1173,9 @@ bool FilterChooser::doesOpcodeNeedPredicate(unsigned Opc) const {
     if (!Pred->getValue("AssemblerMatcherPredicate"))
       continue;

-    std::string P = Pred->getValueAsString("AssemblerCondString");
+    StringRef P = Pred->getValueAsString("AssemblerCondString");

-    if (!P.length())
+    if (P.empty())
       continue;

     return true;
@@ -1744,7 +1743,7 @@ static bool populateInstruction(CodeGenTarget &Target,

   // If the instruction has specified a custom decoding hook, use that instead
   // of trying to auto-generate the decoder.
-  std::string InstDecoder = Def.getValueAsString("DecoderMethod");
+  StringRef InstDecoder = Def.getValueAsString("DecoderMethod");
   if (InstDecoder != "") {
     bool HasCompleteInstDecoder = Def.getValueAsBit("hasCompleteDecoder");
     InsnOperands.push_back(OperandInfo(InstDecoder, HasCompleteInstDecoder));
@@ -2261,7 +2260,7 @@ void FixedLenDecoderEmitter::run(raw_ostream &o) {
         Def->getValueAsBit("isCodeGenOnly"))
       continue;

-    std::string DecoderNamespace = Def->getValueAsString("DecoderNamespace");
+    StringRef DecoderNamespace = Def->getValueAsString("DecoderNamespace");

     if (Size) {
       if (populateInstruction(Target, *Inst, i, Operands)) {
diff --git a/utils/TableGen/GlobalISelEmitter.cpp b/utils/TableGen/GlobalISelEmitter.cpp
index e0303b7b1ab4..88ded1f25ffb 100644
--- a/utils/TableGen/GlobalISelEmitter.cpp
+++ b/utils/TableGen/GlobalISelEmitter.cpp
@@ -118,7 +118,7 @@ static std::string explainPredicates(const TreePatternNode *N) {

 std::string explainOperator(Record *Operator) {
   if (Operator->isSubClassOf("SDNode"))
-    return " (" + Operator->getValueAsString("Opcode") + ")";
+    return (" (" + Operator->getValueAsString("Opcode") + ")").str();

   if (Operator->isSubClassOf("Intrinsic"))
     return (" (Operator is an Intrinsic, " + Operator->getName() + ")").str();
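With P already a StringRef, emitPredicateMatch() above can drop the intermediate "StringRef SR(P)" wrapper and call split() directly, and the old "!P.length()" test becomes the clearer P.empty(). A sketch of the comma-splitting loop in isolation; the function name and output are illustrative:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/raw_ostream.h"
    #include <utility>

    void emitConds(llvm::raw_ostream &OS, llvm::StringRef CondString) {
      if (CondString.empty()) // replaces the old !P.length() test
        return;
      // split(',') returns the text before the first comma plus the rest.
      std::pair<llvm::StringRef, llvm::StringRef> Parts = CondString.split(',');
      while (!Parts.second.empty()) {
        OS << Parts.first << " && ";
        Parts = Parts.second.split(',');
      }
      OS << Parts.first; // the last (or only) condition
    }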
diff --git a/utils/TableGen/OptParserEmitter.cpp b/utils/TableGen/OptParserEmitter.cpp
index c1b5e6510325..04e6537f3d15 100644
--- a/utils/TableGen/OptParserEmitter.cpp
+++ b/utils/TableGen/OptParserEmitter.cpp
@@ -21,6 +21,8 @@ using namespace llvm;

 // Ordering on Info. The logic should match with the consumer-side function in
 // llvm/Option/OptTable.h.
+// FIXME: Make this take StringRefs instead of null terminated strings to
+// simplify callers.
 static int StrCmpOptionName(const char *A, const char *B) {
   const char *X = A, *Y = B;
   char a = tolower(*A), b = tolower(*B);
@@ -53,22 +55,22 @@ static int CompareOptionRecords(Record *const *Av, Record *const *Bv) {

   // Compare options by name, unless they are sentinels.
   if (!ASent)
-    if (int Cmp = StrCmpOptionName(A->getValueAsString("Name").c_str(),
-                                   B->getValueAsString("Name").c_str()))
+    if (int Cmp = StrCmpOptionName(A->getValueAsString("Name").str().c_str(),
+                                   B->getValueAsString("Name").str().c_str()))
       return Cmp;

   if (!ASent) {
-    std::vector<std::string> APrefixes = A->getValueAsListOfStrings("Prefixes");
-    std::vector<std::string> BPrefixes = B->getValueAsListOfStrings("Prefixes");
-
-    for (std::vector<std::string>::const_iterator APre = APrefixes.begin(),
-                                                  AEPre = APrefixes.end(),
-                                                  BPre = BPrefixes.begin(),
-                                                  BEPre = BPrefixes.end();
-         APre != AEPre &&
-         BPre != BEPre;
-         ++APre, ++BPre) {
-      if (int Cmp = StrCmpOptionName(APre->c_str(), BPre->c_str()))
+    std::vector<StringRef> APrefixes = A->getValueAsListOfStrings("Prefixes");
+    std::vector<StringRef> BPrefixes = B->getValueAsListOfStrings("Prefixes");
+
+    for (std::vector<StringRef>::const_iterator APre = APrefixes.begin(),
+                                                AEPre = APrefixes.end(),
+                                                BPre = BPrefixes.begin(),
+                                                BEPre = BPrefixes.end();
+         APre != AEPre &&
+         BPre != BEPre;
+         ++APre, ++BPre) {
+      if (int Cmp = StrCmpOptionName(APre->str().c_str(), BPre->str().c_str()))
         return Cmp;
     }
   }
@@ -122,7 +124,7 @@ void EmitOptParser(RecordKeeper &Records, raw_ostream &OS) {
   unsigned CurPrefix = 0;
   for (unsigned i = 0, e = Opts.size(); i != e; ++i) {
     const Record &R = *Opts[i];
-    std::vector<std::string> prf = R.getValueAsListOfStrings("Prefixes");
+    std::vector<StringRef> prf = R.getValueAsListOfStrings("Prefixes");
     PrefixKeyT prfkey(prf.begin(), prf.end());
     unsigned NewPrefix = CurPrefix + 1;
     if (Prefixes.insert(std::make_pair(prfkey, (Twine("prefix_") +
@@ -207,7 +209,7 @@ void EmitOptParser(RecordKeeper &Records, raw_ostream &OS) {
     OS << "OPTION(";

     // The option prefix;
-    std::vector<std::string> prf = R.getValueAsListOfStrings("Prefixes");
+    std::vector<StringRef> prf = R.getValueAsListOfStrings("Prefixes");
     OS << Prefixes[PrefixKeyT(prf.begin(), prf.end())] << ", ";

     // The option string.
@@ -240,7 +242,7 @@ void EmitOptParser(RecordKeeper &Records, raw_ostream &OS) {
     // would become "foo\0bar\0". Note that the compiler adds an implicit
     // terminating \0 at the end.
     OS << ", ";
-    std::vector<std::string> AliasArgs = R.getValueAsListOfStrings("AliasArgs");
+    std::vector<StringRef> AliasArgs = R.getValueAsListOfStrings("AliasArgs");
     if (AliasArgs.size() == 0) {
       OS << "nullptr";
     } else {
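StrCmpOptionName() still takes const char*, and a StringRef is not guaranteed to be NUL-terminated, hence the .str().c_str() bridge above: each call materializes a temporary std::string, which lives until the end of the full expression and so outlasts the comparison. The new FIXME records the cleaner long-term fix. A sketch of the bridge, with the comparator body simplified to plain strcmp:

    #include "llvm/ADT/StringRef.h"
    #include <cstring>

    static int strCmpCStr(const char *A, const char *B) {
      return std::strcmp(A, B); // stands in for the real option ordering
    }

    static int compareNames(llvm::StringRef A, llvm::StringRef B) {
      // .str() copies into a temporary std::string (alive for the whole
      // expression), providing the NUL-terminated buffer c_str() needs.
      return strCmpCStr(A.str().c_str(), B.str().c_str());
    }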
diff --git a/utils/TableGen/RegisterBankEmitter.cpp b/utils/TableGen/RegisterBankEmitter.cpp
index bf066412b286..3f11eff1d371 100644
--- a/utils/TableGen/RegisterBankEmitter.cpp
+++ b/utils/TableGen/RegisterBankEmitter.cpp
@@ -44,7 +44,7 @@ public:
       : TheDef(TheDef), RCs(), RCWithLargestRegsSize(nullptr) {}

   /// Get the human-readable name for the bank.
-  std::string getName() const { return TheDef.getValueAsString("Name"); }
+  StringRef getName() const { return TheDef.getValueAsString("Name"); }

   /// Get the name of the enumerator in the ID enumeration.
   std::string getEnumeratorName() const { return (TheDef.getName() + "ID").str(); }
diff --git a/utils/TableGen/RegisterInfoEmitter.cpp b/utils/TableGen/RegisterInfoEmitter.cpp
index 5b56578a64b3..12cfb93a0c4f 100644
--- a/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/utils/TableGen/RegisterInfoEmitter.cpp
@@ -93,8 +93,7 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS,

   // Register enums are stored as uint16_t in the tables. Make sure we'll fit.
   assert(Registers.size() <= 0xffff && "Too many regs to fit in tables");

-  std::string Namespace =
-      Registers.front().TheDef->getValueAsString("Namespace");
+  StringRef Namespace = Registers.front().TheDef->getValueAsString("Namespace");

   emitSourceFileHeader("Target Register Enum Values", OS);
@@ -354,7 +353,7 @@ void RegisterInfoEmitter::EmitRegMappingTables(
     for (unsigned i = I->second.size(), e = maxLength; i != e; ++i)
       I->second.push_back(-1);

-  std::string Namespace = Regs.front().TheDef->getValueAsString("Namespace");
+  StringRef Namespace = Regs.front().TheDef->getValueAsString("Namespace");

   OS << "// " << Namespace << " Dwarf<->LLVM register mappings.\n";
@@ -464,7 +463,7 @@ void RegisterInfoEmitter::EmitRegMapping(
   if (!maxLength)
     return;

-  std::string Namespace = Regs.front().TheDef->getValueAsString("Namespace");
+  StringRef Namespace = Regs.front().TheDef->getValueAsString("Namespace");

   // Emit reverse information about the dwarf register numbers.
   for (unsigned j = 0; j < 2; ++j) {
diff --git a/utils/TableGen/SearchableTableEmitter.cpp b/utils/TableGen/SearchableTableEmitter.cpp
index 80f0b0d4aaf4..efd4e83eca90 100644
--- a/utils/TableGen/SearchableTableEmitter.cpp
+++ b/utils/TableGen/SearchableTableEmitter.cpp
@@ -112,8 +112,8 @@ private:

 void SearchableTableEmitter::emitMappingEnum(std::vector<Record *> &Items,
                                              Record *InstanceClass,
                                              raw_ostream &OS) {
-  std::string EnumNameField = InstanceClass->getValueAsString("EnumNameField");
-  std::string EnumValueField;
+  StringRef EnumNameField = InstanceClass->getValueAsString("EnumNameField");
+  StringRef EnumValueField;

   if (!InstanceClass->isValueUnset("EnumValueField"))
     EnumValueField = InstanceClass->getValueAsString("EnumValueField");
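The Namespace and EnumNameField hunks above change the type of locals rather than any call site. Binding "const std::string &" to the new StringRef return value would still compile (StringRef converted implicitly to std::string at the time), but only by allocating a lifetime-extended temporary; a StringRef local keeps the lookup copy-free. A sketch of the difference, with a stand-in getter:

    #include "llvm/ADT/StringRef.h"
    #include <string>

    llvm::StringRef getNamespaceField() { return "X86"; } // stand-in

    void useNamespace() {
      // Old style: compiles via StringRef's (then-implicit) conversion to
      // std::string, but silently allocates and copies a temporary.
      const std::string &Copy = getNamespaceField();

      // New style: a StringRef local is just a pointer/length pair.
      llvm::StringRef View = getNamespaceField();

      (void)Copy;
      (void)View;
    }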
R->getValueAsString("Attribute"); if (Value=="true" || Value=="false") OS << " if (Bits[" << Target << "::" diff --git a/utils/TableGen/X86FoldTablesEmitter.cpp b/utils/TableGen/X86FoldTablesEmitter.cpp index 99429c5f96a2..b89cee2ce4bb 100644 --- a/utils/TableGen/X86FoldTablesEmitter.cpp +++ b/utils/TableGen/X86FoldTablesEmitter.cpp @@ -292,7 +292,7 @@ getMemOperandSize(const Record *MemRec, const bool IntrinsicSensitive = false) { (MemRec->getName() == "sdmem" || MemRec->getName() == "ssmem")) return 128; - std::string Name = + StringRef Name = MemRec->getValueAsDef("ParserMatchClass")->getValueAsString("Name"); if (Name == "Mem8") return 8; @@ -368,7 +368,7 @@ static inline const CodeGenInstruction * getAltRegInst(const CodeGenInstruction *I, const RecordKeeper &Records, const CodeGenTarget &Target) { - std::string AltRegInstStr = I->TheDef->getValueAsString("FoldGenRegForm"); + StringRef AltRegInstStr = I->TheDef->getValueAsString("FoldGenRegForm"); Record *AltRegInstRec = Records.getDef(AltRegInstStr); assert(AltRegInstRec && "Alternative register form instruction def not found"); |