author     Dimitry Andric <dim@FreeBSD.org>   2010-10-11 17:22:16 +0000
committer  Dimitry Andric <dim@FreeBSD.org>   2010-10-11 17:22:16 +0000
commit     361680a51927577ccb20b14704dbaac762181843 (patch)
tree       de75a464c5dac7eceb2dbbad8b4d4e1479d79e08 /contrib/llvm/lib/Target/X86
parent     72578a23bf85ae6f14444ee38e72f9b16899bb52 (diff)
Remove more unneeded files and directories from contrib/llvm. This
still allows us to build tblgen and clang, and further reduces the
footprint in the tree.
Approved by: rpaulo (mentor)
Notes:
svn path=/head/; revision=213695
Diffstat (limited to 'contrib/llvm/lib/Target/X86')
17 files changed, 0 insertions, 3555 deletions
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/CMakeLists.txt b/contrib/llvm/lib/Target/X86/AsmParser/CMakeLists.txt deleted file mode 100644 index 40dbdd72faa1..000000000000 --- a/contrib/llvm/lib/Target/X86/AsmParser/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMX86AsmParser - X86AsmLexer.cpp - X86AsmParser.cpp - ) -add_dependencies(LLVMX86AsmParser X86CodeGenTable_gen) diff --git a/contrib/llvm/lib/Target/X86/AsmParser/Makefile b/contrib/llvm/lib/Target/X86/AsmParser/Makefile deleted file mode 100644 index fb9760796622..000000000000 --- a/contrib/llvm/lib/Target/X86/AsmParser/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/X86/AsmParser/Makefile -------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMX86AsmParser - -# Hack: we need to include 'main' x86 target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/X86/AsmPrinter/CMakeLists.txt b/contrib/llvm/lib/Target/X86/AsmPrinter/CMakeLists.txt deleted file mode 100644 index 033973eeeff9..000000000000 --- a/contrib/llvm/lib/Target/X86/AsmPrinter/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMX86AsmPrinter - X86ATTInstPrinter.cpp - X86IntelInstPrinter.cpp - X86InstComments.cpp - ) -add_dependencies(LLVMX86AsmPrinter X86CodeGenTable_gen) diff --git a/contrib/llvm/lib/Target/X86/AsmPrinter/Makefile b/contrib/llvm/lib/Target/X86/AsmPrinter/Makefile deleted file mode 100644 index c82aa330a20c..000000000000 --- a/contrib/llvm/lib/Target/X86/AsmPrinter/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/X86/AsmPrinter/Makefile ------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMX86AsmPrinter - -# Hack: we need to include 'main' x86 target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/X86/CMakeLists.txt b/contrib/llvm/lib/Target/X86/CMakeLists.txt deleted file mode 100644 index e9399f5c8322..000000000000 --- a/contrib/llvm/lib/Target/X86/CMakeLists.txt +++ /dev/null @@ -1,52 +0,0 @@ -set(LLVM_TARGET_DEFINITIONS X86.td) - -tablegen(X86GenRegisterInfo.h.inc -gen-register-desc-header) -tablegen(X86GenRegisterNames.inc -gen-register-enums) -tablegen(X86GenRegisterInfo.inc -gen-register-desc) -tablegen(X86GenDisassemblerTables.inc -gen-disassembler) -tablegen(X86GenInstrNames.inc -gen-instr-enums) -tablegen(X86GenInstrInfo.inc -gen-instr-desc) -tablegen(X86GenAsmWriter.inc -gen-asm-writer) -tablegen(X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) -tablegen(X86GenAsmMatcher.inc -gen-asm-matcher) -tablegen(X86GenDAGISel.inc -gen-dag-isel) -tablegen(X86GenFastISel.inc -gen-fast-isel) -tablegen(X86GenCallingConv.inc -gen-callingconv) -tablegen(X86GenSubtarget.inc -gen-subtarget) -tablegen(X86GenEDInfo.inc -gen-enhanced-disassembly-info) - -set(sources - SSEDomainFix.cpp - X86AsmBackend.cpp - X86AsmPrinter.cpp - X86COFFMachineModuleInfo.cpp - X86CodeEmitter.cpp - X86ELFWriterInfo.cpp - X86FastISel.cpp - X86FloatingPoint.cpp - X86ISelDAGToDAG.cpp - X86ISelLowering.cpp - X86InstrInfo.cpp - X86JITInfo.cpp - X86MCAsmInfo.cpp - X86MCCodeEmitter.cpp - X86MCInstLower.cpp - X86RegisterInfo.cpp - X86SelectionDAGInfo.cpp - X86Subtarget.cpp - X86TargetMachine.cpp - X86TargetObjectFile.cpp - ) - -if( CMAKE_CL_64 ) - enable_language(ASM_MASM) - ADD_CUSTOM_COMMAND( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj - COMMAND ${CMAKE_ASM_MASM_COMPILER} /Fo ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj /c ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm - ) - set(sources ${sources} ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj) -endif() - -add_llvm_target(X86CodeGen ${sources}) - diff --git a/contrib/llvm/lib/Target/X86/Disassembler/CMakeLists.txt b/contrib/llvm/lib/Target/X86/Disassembler/CMakeLists.txt deleted file mode 100644 index 97589c00515b..000000000000 --- a/contrib/llvm/lib/Target/X86/Disassembler/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMX86Disassembler - X86Disassembler.cpp - X86DisassemblerDecoder.c - ) -# workaround for hanging compilation on MSVC9 and 10 -if( MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 ) -set_property( - SOURCE X86Disassembler.cpp - PROPERTY COMPILE_FLAGS "/Od" - ) -endif() -add_dependencies(LLVMX86Disassembler X86CodeGenTable_gen) diff --git a/contrib/llvm/lib/Target/X86/Disassembler/Makefile b/contrib/llvm/lib/Target/X86/Disassembler/Makefile deleted file mode 100644 index 8669fd8fd930..000000000000 --- a/contrib/llvm/lib/Target/X86/Disassembler/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/X86/Disassembler/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMX86Disassembler - -# Hack: we need to include 'main' x86 target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/X86/Makefile b/contrib/llvm/lib/Target/X86/Makefile deleted file mode 100644 index f4ff894a2af7..000000000000 --- a/contrib/llvm/lib/Target/X86/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -##===- lib/Target/X86/Makefile -----------------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../.. -LIBRARYNAME = LLVMX86CodeGen -TARGET = X86 - -# Make sure that tblgen is run, first thing. -BUILT_SOURCES = X86GenRegisterInfo.h.inc X86GenRegisterNames.inc \ - X86GenRegisterInfo.inc X86GenInstrNames.inc \ - X86GenInstrInfo.inc X86GenAsmWriter.inc X86GenAsmMatcher.inc \ - X86GenAsmWriter1.inc X86GenDAGISel.inc \ - X86GenDisassemblerTables.inc X86GenFastISel.inc \ - X86GenCallingConv.inc X86GenSubtarget.inc \ - X86GenEDInfo.inc - -DIRS = AsmPrinter AsmParser Disassembler TargetInfo - -include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/X86/README-FPStack.txt b/contrib/llvm/lib/Target/X86/README-FPStack.txt deleted file mode 100644 index 39efd2dbcf1a..000000000000 --- a/contrib/llvm/lib/Target/X86/README-FPStack.txt +++ /dev/null @@ -1,85 +0,0 @@ -//===---------------------------------------------------------------------===// -// Random ideas for the X86 backend: FP stack related stuff -//===---------------------------------------------------------------------===// - -//===---------------------------------------------------------------------===// - -Some targets (e.g. athlons) prefer freep to fstp ST(0): -http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html - -//===---------------------------------------------------------------------===// - -This should use fiadd on chips where it is profitable: -double foo(double P, int *I) { return P+*I; } - -We have fiadd patterns now but the followings have the same cost and -complexity. We need a way to specify the later is more profitable. - -def FpADD32m : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP:$dst, (fadd RFP:$src1, - (extloadf64f32 addr:$src2)))]>; - // ST(0) = ST(0) + [mem32] - -def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW, - [(set RFP:$dst, (fadd RFP:$src1, - (X86fild addr:$src2, i32)))]>; - // ST(0) = ST(0) + [mem32int] - -//===---------------------------------------------------------------------===// - -The FP stackifier should handle simple permutates to reduce number of shuffle -instructions, e.g. turning: - -fld P -> fld Q -fld Q fld P -fxch - -or: - -fxch -> fucomi -fucomi jl X -jg X - -Ideas: -http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html - - -//===---------------------------------------------------------------------===// - -Add a target specific hook to DAG combiner to handle SINT_TO_FP and -FP_TO_SINT when the source operand is already in memory. - -//===---------------------------------------------------------------------===// - -Open code rint,floor,ceil,trunc: -http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html -http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html - -Opencode the sincos[f] libcall. - -//===---------------------------------------------------------------------===// - -None of the FPStack instructions are handled in -X86RegisterInfo::foldMemoryOperand, which prevents the spiller from -folding spill code into the instructions. 
- -//===---------------------------------------------------------------------===// - -Currently the x86 codegen isn't very good at mixing SSE and FPStack -code: - -unsigned int foo(double x) { return x; } - -foo: - subl $20, %esp - movsd 24(%esp), %xmm0 - movsd %xmm0, 8(%esp) - fldl 8(%esp) - fisttpll (%esp) - movl (%esp), %eax - addl $20, %esp - ret - -This just requires being smarter when custom expanding fptoui. - -//===---------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/README-MMX.txt b/contrib/llvm/lib/Target/X86/README-MMX.txt deleted file mode 100644 index a6c8616b6d2c..000000000000 --- a/contrib/llvm/lib/Target/X86/README-MMX.txt +++ /dev/null @@ -1,71 +0,0 @@ -//===---------------------------------------------------------------------===// -// Random ideas for the X86 backend: MMX-specific stuff. -//===---------------------------------------------------------------------===// - -//===---------------------------------------------------------------------===// - -This: - -#include <mmintrin.h> - -__v2si qux(int A) { - return (__v2si){ 0, A }; -} - -is compiled into: - -_qux: - subl $28, %esp - movl 32(%esp), %eax - movd %eax, %mm0 - movq %mm0, (%esp) - movl (%esp), %eax - movl %eax, 20(%esp) - movq %mm0, 8(%esp) - movl 12(%esp), %eax - movl %eax, 16(%esp) - movq 16(%esp), %mm0 - addl $28, %esp - ret - -Yuck! - -GCC gives us: - -_qux: - subl $12, %esp - movl 16(%esp), %eax - movl 20(%esp), %edx - movl $0, (%eax) - movl %edx, 4(%eax) - addl $12, %esp - ret $4 - -//===---------------------------------------------------------------------===// - -We generate crappy code for this: - -__m64 t() { - return _mm_cvtsi32_si64(1); -} - -_t: - subl $12, %esp - movl $1, %eax - movd %eax, %mm0 - movq %mm0, (%esp) - movl (%esp), %eax - movl 4(%esp), %edx - addl $12, %esp - ret - -The extra stack traffic is covered in the previous entry. But the other reason -is we are not smart about materializing constants in MMX registers. With -m64 - - movl $1, %eax - movd %eax, %mm0 - movd %mm0, %rax - ret - -We should be using a constantpool load instead: - movq LC0(%rip), %rax diff --git a/contrib/llvm/lib/Target/X86/README-SSE.txt b/contrib/llvm/lib/Target/X86/README-SSE.txt deleted file mode 100644 index f96b22f1e204..000000000000 --- a/contrib/llvm/lib/Target/X86/README-SSE.txt +++ /dev/null @@ -1,907 +0,0 @@ -//===---------------------------------------------------------------------===// -// Random ideas for the X86 backend: SSE-specific stuff. -//===---------------------------------------------------------------------===// - -//===---------------------------------------------------------------------===// - -SSE Variable shift can be custom lowered to something like this, which uses a -small table + unaligned load + shuffle instead of going through memory. - -__m128i_shift_right: - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - -... -__m128i shift_right(__m128i value, unsigned long offset) { - return _mm_shuffle_epi8(value, - _mm_loadu_si128((__m128 *) (___m128i_shift_right + offset))); -} - -//===---------------------------------------------------------------------===// - -SSE has instructions for doing operations on complex numbers, we should pattern -match them. 
Compiling this: - -_Complex float f32(_Complex float A, _Complex float B) { - return A+B; -} - -into: - -_f32: - movdqa %xmm0, %xmm2 - addss %xmm1, %xmm2 - pshufd $16, %xmm2, %xmm2 - pshufd $1, %xmm1, %xmm1 - pshufd $1, %xmm0, %xmm0 - addss %xmm1, %xmm0 - pshufd $16, %xmm0, %xmm1 - movdqa %xmm2, %xmm0 - unpcklps %xmm1, %xmm0 - ret - -seems silly. - - -//===---------------------------------------------------------------------===// - -Expand libm rounding functions inline: Significant speedups possible. -http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html - -//===---------------------------------------------------------------------===// - -When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and -other fast SSE modes. - -//===---------------------------------------------------------------------===// - -Think about doing i64 math in SSE regs on x86-32. - -//===---------------------------------------------------------------------===// - -This testcase should have no SSE instructions in it, and only one load from -a constant pool: - -double %test3(bool %B) { - %C = select bool %B, double 123.412, double 523.01123123 - ret double %C -} - -Currently, the select is being lowered, which prevents the dag combiner from -turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)' - -The pattern isel got this one right. - -//===---------------------------------------------------------------------===// - -SSE should implement 'select_cc' using 'emulated conditional moves' that use -pcmp/pand/pandn/por to do a selection instead of a conditional branch: - -double %X(double %Y, double %Z, double %A, double %B) { - %C = setlt double %A, %B - %z = fadd double %Z, 0.0 ;; select operand is not a load - %D = select bool %C, double %Y, double %z - ret double %D -} - -We currently emit: - -_X: - subl $12, %esp - xorpd %xmm0, %xmm0 - addsd 24(%esp), %xmm0 - movsd 32(%esp), %xmm1 - movsd 16(%esp), %xmm2 - ucomisd 40(%esp), %xmm1 - jb LBB_X_2 -LBB_X_1: - movsd %xmm0, %xmm2 -LBB_X_2: - movsd %xmm2, (%esp) - fldl (%esp) - addl $12, %esp - ret - -//===---------------------------------------------------------------------===// - -Lower memcpy / memset to a series of SSE 128 bit move instructions when it's -feasible. - -//===---------------------------------------------------------------------===// - -Codegen: - if (copysign(1.0, x) == copysign(1.0, y)) -into: - if (x^y & mask) -when using SSE. - -//===---------------------------------------------------------------------===// - -Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half -of a v4sf value. - -//===---------------------------------------------------------------------===// - -Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}. -Perhaps use pxor / xorp* to clear a XMM register first? - -//===---------------------------------------------------------------------===// - -External test Nurbs exposed some problems. Look for -__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. 
This is what icc -emits: - - movaps (%edx), %xmm2 #59.21 - movaps (%edx), %xmm5 #60.21 - movaps (%edx), %xmm4 #61.21 - movaps (%edx), %xmm3 #62.21 - movl 40(%ecx), %ebp #69.49 - shufps $0, %xmm2, %xmm5 #60.21 - movl 100(%esp), %ebx #69.20 - movl (%ebx), %edi #69.20 - imull %ebp, %edi #69.49 - addl (%eax), %edi #70.33 - shufps $85, %xmm2, %xmm4 #61.21 - shufps $170, %xmm2, %xmm3 #62.21 - shufps $255, %xmm2, %xmm2 #63.21 - lea (%ebp,%ebp,2), %ebx #69.49 - negl %ebx #69.49 - lea -3(%edi,%ebx), %ebx #70.33 - shll $4, %ebx #68.37 - addl 32(%ecx), %ebx #68.37 - testb $15, %bl #91.13 - jne L_B1.24 # Prob 5% #91.13 - -This is the llvm code after instruction scheduling: - -cond_next140 (0xa910740, LLVM BB @0xa90beb0): - %reg1078 = MOV32ri -3 - %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0 - %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40 - %reg1080 = IMUL32rr %reg1079, %reg1037 - %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0 - %reg1038 = LEA32r %reg1081, 1, %reg1080, -3 - %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32 - %reg1082 = SHL32ri %reg1038, 4 - %reg1039 = ADD32rr %reg1036, %reg1082 - %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0 - %reg1034 = SHUFPSrr %reg1083, %reg1083, 170 - %reg1032 = SHUFPSrr %reg1083, %reg1083, 0 - %reg1035 = SHUFPSrr %reg1083, %reg1083, 255 - %reg1033 = SHUFPSrr %reg1083, %reg1083, 85 - %reg1040 = MOV32rr %reg1039 - %reg1084 = AND32ri8 %reg1039, 15 - CMP32ri8 %reg1084, 0 - JE mbb<cond_next204,0xa914d30> - -Still ok. After register allocation: - -cond_next140 (0xa910740, LLVM BB @0xa90beb0): - %EAX = MOV32ri -3 - %EDX = MOV32rm <fi#3>, 1, %NOREG, 0 - ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0 - %EDX = MOV32rm <fi#7>, 1, %NOREG, 0 - %EDX = MOV32rm %EDX, 1, %NOREG, 40 - IMUL32rr %EAX<def&use>, %EDX - %ESI = MOV32rm <fi#5>, 1, %NOREG, 0 - %ESI = MOV32rm %ESI, 1, %NOREG, 0 - MOV32mr <fi#4>, 1, %NOREG, 0, %ESI - %EAX = LEA32r %ESI, 1, %EAX, -3 - %ESI = MOV32rm <fi#7>, 1, %NOREG, 0 - %ESI = MOV32rm %ESI, 1, %NOREG, 32 - %EDI = MOV32rr %EAX - SHL32ri %EDI<def&use>, 4 - ADD32rr %EDI<def&use>, %ESI - %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0 - %XMM1 = MOVAPSrr %XMM0 - SHUFPSrr %XMM1<def&use>, %XMM1, 170 - %XMM2 = MOVAPSrr %XMM0 - SHUFPSrr %XMM2<def&use>, %XMM2, 0 - %XMM3 = MOVAPSrr %XMM0 - SHUFPSrr %XMM3<def&use>, %XMM3, 255 - SHUFPSrr %XMM0<def&use>, %XMM0, 85 - %EBX = MOV32rr %EDI - AND32ri8 %EBX<def&use>, 15 - CMP32ri8 %EBX, 0 - JE mbb<cond_next204,0xa914d30> - -This looks really bad. The problem is shufps is a destructive opcode. Since it -appears as operand two in more than one shufps ops. It resulted in a number of -copies. Note icc also suffers from the same problem. Either the instruction -selector should select pshufd or The register allocator can made the two-address -to three-address transformation. - -It also exposes some other problems. See MOV32ri -3 and the spills. - -//===---------------------------------------------------------------------===// - -Consider: - -__m128 test(float a) { - return _mm_set_ps(0.0, 0.0, 0.0, a*a); -} - -This compiles into: - -movss 4(%esp), %xmm1 -mulss %xmm1, %xmm1 -xorps %xmm0, %xmm0 -movss %xmm1, %xmm0 -ret - -Because mulss doesn't modify the top 3 elements, the top elements of -xmm1 are already zero'd. We could compile this to: - -movss 4(%esp), %xmm0 -mulss %xmm0, %xmm0 -ret - -//===---------------------------------------------------------------------===// - -Here's a sick and twisted idea. Consider code like this: - -__m128 test(__m128 a) { - float b = *(float*)&A; - ... 
- return _mm_set_ps(0.0, 0.0, 0.0, b); -} - -This might compile to this code: - -movaps c(%esp), %xmm1 -xorps %xmm0, %xmm0 -movss %xmm1, %xmm0 -ret - -Now consider if the ... code caused xmm1 to get spilled. This might produce -this code: - -movaps c(%esp), %xmm1 -movaps %xmm1, c2(%esp) -... - -xorps %xmm0, %xmm0 -movaps c2(%esp), %xmm1 -movss %xmm1, %xmm0 -ret - -However, since the reload is only used by these instructions, we could -"fold" it into the uses, producing something like this: - -movaps c(%esp), %xmm1 -movaps %xmm1, c2(%esp) -... - -movss c2(%esp), %xmm0 -ret - -... saving two instructions. - -The basic idea is that a reload from a spill slot, can, if only one 4-byte -chunk is used, bring in 3 zeros the one element instead of 4 elements. -This can be used to simplify a variety of shuffle operations, where the -elements are fixed zeros. - -//===---------------------------------------------------------------------===// - -This code generates ugly code, probably due to costs being off or something: - -define void @test(float* %P, <4 x float>* %P2 ) { - %xFloat0.688 = load float* %P - %tmp = load <4 x float>* %P2 - %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3 - store <4 x float> %inFloat3.713, <4 x float>* %P2 - ret void -} - -Generates: - -_test: - movl 8(%esp), %eax - movaps (%eax), %xmm0 - pxor %xmm1, %xmm1 - movaps %xmm0, %xmm2 - shufps $50, %xmm1, %xmm2 - shufps $132, %xmm2, %xmm0 - movaps %xmm0, (%eax) - ret - -Would it be better to generate: - -_test: - movl 8(%esp), %ecx - movaps (%ecx), %xmm0 - xor %eax, %eax - pinsrw $6, %eax, %xmm0 - pinsrw $7, %eax, %xmm0 - movaps %xmm0, (%ecx) - ret - -? - -//===---------------------------------------------------------------------===// - -Some useful information in the Apple Altivec / SSE Migration Guide: - -http://developer.apple.com/documentation/Performance/Conceptual/ -Accelerate_sse_migration/index.html - -e.g. SSE select using and, andnot, or. Various SSE compare translations. - -//===---------------------------------------------------------------------===// - -Add hooks to commute some CMPP operations. - -//===---------------------------------------------------------------------===// - -Apply the same transformation that merged four float into a single 128-bit load -to loads from constant pool. - -//===---------------------------------------------------------------------===// - -Floating point max / min are commutable when -enable-unsafe-fp-path is -specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other -nodes which are selected to max / min instructions that are marked commutable. - -//===---------------------------------------------------------------------===// - -We should materialize vector constants like "all ones" and "signbit" with -code like: - - cmpeqps xmm1, xmm1 ; xmm1 = all-ones - -and: - cmpeqps xmm1, xmm1 ; xmm1 = all-ones - psrlq xmm1, 31 ; xmm1 = all 100000000000... - -instead of using a load from the constant pool. The later is important for -ABS/NEG/copysign etc. 
- -//===---------------------------------------------------------------------===// - -These functions: - -#include <xmmintrin.h> -__m128i a; -void x(unsigned short n) { - a = _mm_slli_epi32 (a, n); -} -void y(unsigned n) { - a = _mm_slli_epi32 (a, n); -} - -compile to ( -O3 -static -fomit-frame-pointer): -_x: - movzwl 4(%esp), %eax - movd %eax, %xmm0 - movaps _a, %xmm1 - pslld %xmm0, %xmm1 - movaps %xmm1, _a - ret -_y: - movd 4(%esp), %xmm0 - movaps _a, %xmm1 - pslld %xmm0, %xmm1 - movaps %xmm1, _a - ret - -"y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems -like movd would be sufficient in both cases as the value is already zero -extended in the 32-bit stack slot IIRC. For signed short, it should also be -save, as a really-signed value would be undefined for pslld. - - -//===---------------------------------------------------------------------===// - -#include <math.h> -int t1(double d) { return signbit(d); } - -This currently compiles to: - subl $12, %esp - movsd 16(%esp), %xmm0 - movsd %xmm0, (%esp) - movl 4(%esp), %eax - shrl $31, %eax - addl $12, %esp - ret - -We should use movmskp{s|d} instead. - -//===---------------------------------------------------------------------===// - -CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single -(aligned) vector load. This functionality has a couple of problems. - -1. The code to infer alignment from loads of globals is in the X86 backend, - not the dag combiner. This is because dagcombine2 needs to be able to see - through the X86ISD::Wrapper node, which DAGCombine can't really do. -2. The code for turning 4 x load into a single vector load is target - independent and should be moved to the dag combiner. -3. The code for turning 4 x load into a vector load can only handle a direct - load from a global or a direct load from the stack. It should be generalized - to handle any load from P, P+4, P+8, P+12, where P can be anything. -4. The alignment inference code cannot handle loads from globals in non-static - mode because it doesn't look through the extra dyld stub load. If you try - vec_align.ll without -relocation-model=static, you'll see what I mean. - -//===---------------------------------------------------------------------===// - -We should lower store(fneg(load p), q) into an integer load+xor+store, which -eliminates a constant pool load. For example, consider: - -define i64 @ccosf(float %z.0, float %z.1) nounwind readonly { -entry: - %tmp6 = fsub float -0.000000e+00, %z.1 ; <float> [#uses=1] - %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly - ret i64 %tmp20 -} -declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly - -This currently compiles to: - -LCPI1_0: # <4 x float> - .long 2147483648 # float -0 - .long 2147483648 # float -0 - .long 2147483648 # float -0 - .long 2147483648 # float -0 -_ccosf: - subl $12, %esp - movss 16(%esp), %xmm0 - movss %xmm0, 4(%esp) - movss 20(%esp), %xmm0 - xorps LCPI1_0, %xmm0 - movss %xmm0, (%esp) - call L_ccoshf$stub - addl $12, %esp - ret - -Note the load into xmm0, then xor (to negate), then store. In PIC mode, -this code computes the pic base and does two loads to do the constant pool -load, so the improvement is much bigger. - -The tricky part about this xform is that the argument load/store isn't exposed -until post-legalize, and at that point, the fneg has been custom expanded into -an X86 fxor. This means that we need to handle this case in the x86 backend -instead of in target independent code. 
- -//===---------------------------------------------------------------------===// - -Non-SSE4 insert into 16 x i8 is atrociously bad. - -//===---------------------------------------------------------------------===// - -<2 x i64> extract is substantially worse than <2 x f64>, even if the destination -is memory. - -//===---------------------------------------------------------------------===// - -SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext -sitting between the truncate and the extract. - -//===---------------------------------------------------------------------===// - -INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert -any number of 0.0 simultaneously. Currently we only use it for simple -insertions. - -See comments in LowerINSERT_VECTOR_ELT_SSE4. - -//===---------------------------------------------------------------------===// - -On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not -Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are -legal, it'll just take a few extra patterns written in the .td file. - -Note: this is not a code quality issue; the custom lowered code happens to be -right, but we shouldn't have to custom lower anything. This is probably related -to <2 x i64> ops being so bad. - -//===---------------------------------------------------------------------===// - -'select' on vectors and scalars could be a whole lot better. We currently -lower them to conditional branches. On x86-64 for example, we compile this: - -double test(double a, double b, double c, double d) { return a<b ? c : d; } - -to: - -_test: - ucomisd %xmm0, %xmm1 - ja LBB1_2 # entry -LBB1_1: # entry - movapd %xmm3, %xmm2 -LBB1_2: # entry - movapd %xmm2, %xmm0 - ret - -instead of: - -_test: - cmpltsd %xmm1, %xmm0 - andpd %xmm0, %xmm2 - andnpd %xmm3, %xmm0 - orpd %xmm2, %xmm0 - ret - -For unpredictable branches, the later is much more efficient. This should -just be a matter of having scalar sse map to SELECT_CC and custom expanding -or iseling it. - -//===---------------------------------------------------------------------===// - -LLVM currently generates stack realignment code, when it is not necessary -needed. The problem is that we need to know about stack alignment too early, -before RA runs. - -At that point we don't know, whether there will be vector spill, or not. -Stack realignment logic is overly conservative here, but otherwise we can -produce unaligned loads/stores. - -Fixing this will require some huge RA changes. - -Testcase: -#include <emmintrin.h> - -typedef short vSInt16 __attribute__ ((__vector_size__ (16))); - -static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873, -- 22725, - 12873};; - -vSInt16 madd(vSInt16 b) -{ - return _mm_madd_epi16(a, b); -} - -Generated code (x86-32, linux): -madd: - pushl %ebp - movl %esp, %ebp - andl $-16, %esp - movaps .LCPI1_0, %xmm1 - pmaddwd %xmm1, %xmm0 - movl %ebp, %esp - popl %ebp - ret - -//===---------------------------------------------------------------------===// - -Consider: -#include <emmintrin.h> -__m128 foo2 (float x) { - return _mm_set_ps (0, 0, x, 0); -} - -In x86-32 mode, we generate this spiffy code: - -_foo2: - movss 4(%esp), %xmm0 - pshufd $81, %xmm0, %xmm0 - ret - -in x86-64 mode, we generate this code, which could be better: - -_foo2: - xorps %xmm1, %xmm1 - movss %xmm0, %xmm1 - pshufd $81, %xmm1, %xmm0 - ret - -In sse4 mode, we could use insertps to make both better. 
- -Here's another testcase that could use insertps [mem]: - -#include <xmmintrin.h> -extern float x2, x3; -__m128 foo1 (float x1, float x4) { - return _mm_set_ps (x2, x1, x3, x4); -} - -gcc mainline compiles it to: - -foo1: - insertps $0x10, x2(%rip), %xmm0 - insertps $0x10, x3(%rip), %xmm1 - movaps %xmm1, %xmm2 - movlhps %xmm0, %xmm2 - movaps %xmm2, %xmm0 - ret - -//===---------------------------------------------------------------------===// - -We compile vector multiply-by-constant into poor code: - -define <4 x i32> @f(<4 x i32> %i) nounwind { - %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 > - ret <4 x i32> %A -} - -On targets without SSE4.1, this compiles into: - -LCPI1_0: ## <4 x i32> - .long 10 - .long 10 - .long 10 - .long 10 - .text - .align 4,0x90 - .globl _f -_f: - pshufd $3, %xmm0, %xmm1 - movd %xmm1, %eax - imull LCPI1_0+12, %eax - movd %eax, %xmm1 - pshufd $1, %xmm0, %xmm2 - movd %xmm2, %eax - imull LCPI1_0+4, %eax - movd %eax, %xmm2 - punpckldq %xmm1, %xmm2 - movd %xmm0, %eax - imull LCPI1_0, %eax - movd %eax, %xmm1 - movhlps %xmm0, %xmm0 - movd %xmm0, %eax - imull LCPI1_0+8, %eax - movd %eax, %xmm0 - punpckldq %xmm0, %xmm1 - movaps %xmm1, %xmm0 - punpckldq %xmm2, %xmm0 - ret - -It would be better to synthesize integer vector multiplication by constants -using shifts and adds, pslld and paddd here. And even on targets with SSE4.1, -simple cases such as multiplication by powers of two would be better as -vector shifts than as multiplications. - -//===---------------------------------------------------------------------===// - -We compile this: - -__m128i -foo2 (char x) -{ - return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0); -} - -into: - movl $1, %eax - xorps %xmm0, %xmm0 - pinsrw $2, %eax, %xmm0 - movzbl 4(%esp), %eax - pinsrw $3, %eax, %xmm0 - movl $256, %eax - pinsrw $7, %eax, %xmm0 - ret - - -gcc-4.2: - subl $12, %esp - movzbl 16(%esp), %eax - movdqa LC0, %xmm0 - pinsrw $3, %eax, %xmm0 - addl $12, %esp - ret - .const - .align 4 -LC0: - .word 0 - .word 0 - .word 1 - .word 0 - .word 0 - .word 0 - .word 0 - .word 256 - -With SSE4, it should be - movdqa .LC0(%rip), %xmm0 - pinsrb $6, %edi, %xmm0 - -//===---------------------------------------------------------------------===// - -We should transform a shuffle of two vectors of constants into a single vector -of constants. Also, insertelement of a constant into a vector of constants -should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll. - -We compiled it to something horrible: - - .align 4 -LCPI1_1: ## float - .long 1065353216 ## float 1 - .const - - .align 4 -LCPI1_0: ## <4 x float> - .space 4 - .long 1065353216 ## float 1 - .space 4 - .long 1065353216 ## float 1 - .text - .align 4,0x90 - .globl _t -_t: - xorps %xmm0, %xmm0 - movhps LCPI1_0, %xmm0 - movss LCPI1_1, %xmm1 - movaps %xmm0, %xmm2 - shufps $2, %xmm1, %xmm2 - shufps $132, %xmm2, %xmm0 - movaps %xmm0, 0 - -//===---------------------------------------------------------------------===// -rdar://5907648 - -This function: - -float foo(unsigned char x) { - return x; -} - -compiles to (x86-32): - -define float @foo(i8 zeroext %x) nounwind { - %tmp12 = uitofp i8 %x to float ; <float> [#uses=1] - ret float %tmp12 -} - -compiles to: - -_foo: - subl $4, %esp - movzbl 8(%esp), %eax - cvtsi2ss %eax, %xmm0 - movss %xmm0, (%esp) - flds (%esp) - addl $4, %esp - ret - -We should be able to use: - cvtsi2ss 8($esp), %xmm0 -since we know the stack slot is already zext'd. 
- -//===---------------------------------------------------------------------===// - -Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64)) -when code size is critical. movlps is slower than movsd on core2 but it's one -byte shorter. - -//===---------------------------------------------------------------------===// - -We should use a dynamic programming based approach to tell when using FPStack -operations is cheaper than SSE. SciMark montecarlo contains code like this -for example: - -double MonteCarlo_num_flops(int Num_samples) { - return ((double) Num_samples)* 4.0; -} - -In fpstack mode, this compiles into: - -LCPI1_0: - .long 1082130432 ## float 4.000000e+00 -_MonteCarlo_num_flops: - subl $4, %esp - movl 8(%esp), %eax - movl %eax, (%esp) - fildl (%esp) - fmuls LCPI1_0 - addl $4, %esp - ret - -in SSE mode, it compiles into significantly slower code: - -_MonteCarlo_num_flops: - subl $12, %esp - cvtsi2sd 16(%esp), %xmm0 - mulsd LCPI1_0, %xmm0 - movsd %xmm0, (%esp) - fldl (%esp) - addl $12, %esp - ret - -There are also other cases in scimark where using fpstack is better, it is -cheaper to do fld1 than load from a constant pool for example, so -"load, add 1.0, store" is better done in the fp stack, etc. - -//===---------------------------------------------------------------------===// - -The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to -"cmpsd". For example, this code: - -double d1(double x) { return x == x ? x : x + x; } - -Compiles into: - -_d1: - ucomisd %xmm0, %xmm0 - jnp LBB1_2 - addsd %xmm0, %xmm0 - ret -LBB1_2: - ret - -Also, the 'ret's should be shared. This is PR6032. - -//===---------------------------------------------------------------------===// - -These should compile into the same code (PR6214): Perhaps instcombine should -canonicalize the former into the later? - -define float @foo(float %x) nounwind { - %t = bitcast float %x to i32 - %s = and i32 %t, 2147483647 - %d = bitcast i32 %s to float - ret float %d -} - -declare float @fabsf(float %n) -define float @bar(float %x) nounwind { - %d = call float @fabsf(float %x) - ret float %d -} - -//===---------------------------------------------------------------------===// - -This IR (from PR6194): - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-apple-darwin10.0.0" - -%0 = type { double, double } -%struct.float3 = type { float, float, float } - -define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp { -entry: - %tmp18 = extractvalue %0 %0, 0 ; <double> [#uses=1] - %tmp19 = bitcast double %tmp18 to i64 ; <i64> [#uses=1] - %tmp20 = zext i64 %tmp19 to i128 ; <i128> [#uses=1] - %tmp10 = lshr i128 %tmp20, 32 ; <i128> [#uses=1] - %tmp11 = trunc i128 %tmp10 to i32 ; <i32> [#uses=1] - %tmp12 = bitcast i32 %tmp11 to float ; <float> [#uses=1] - %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1] - store float %tmp12, float* %tmp5 - ret void -} - -Compiles to: - -_test: ## @test - movd %xmm0, %rax - shrq $32, %rax - movl %eax, 4(%rdi) - ret - -This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and -doing a shuffle from v[1] to v[0] then a float store. 
- -//===---------------------------------------------------------------------===// - -On SSE4 machines, we compile this code: - -define <2 x float> @test2(<2 x float> %Q, <2 x float> %R, - <2 x float> *%P) nounwind { - %Z = fadd <2 x float> %Q, %R - - store <2 x float> %Z, <2 x float> *%P - ret <2 x float> %Z -} - -into: - -_test2: ## @test2 -## BB#0: - insertps $0, %xmm2, %xmm2 - insertps $16, %xmm3, %xmm2 - insertps $0, %xmm0, %xmm3 - insertps $16, %xmm1, %xmm3 - addps %xmm2, %xmm3 - movq %xmm3, (%rdi) - movaps %xmm3, %xmm0 - pshufd $1, %xmm3, %xmm1 - ## kill: XMM1<def> XMM1<kill> - ret - -The insertps's of $0 are pointless complex copies. - -//===---------------------------------------------------------------------===// - - diff --git a/contrib/llvm/lib/Target/X86/README-UNIMPLEMENTED.txt b/contrib/llvm/lib/Target/X86/README-UNIMPLEMENTED.txt deleted file mode 100644 index c26c75ab951c..000000000000 --- a/contrib/llvm/lib/Target/X86/README-UNIMPLEMENTED.txt +++ /dev/null @@ -1,14 +0,0 @@ -//===---------------------------------------------------------------------===// -// Testcases that crash the X86 backend because they aren't implemented -//===---------------------------------------------------------------------===// - -These are cases we know the X86 backend doesn't handle. Patches are welcome -and appreciated, because no one has signed up to implemented these yet. -Implementing these would allow elimination of the corresponding intrinsics, -which would be great. - -1) vector shifts -2) vector comparisons -3) vector fp<->int conversions: PR2683, PR2684, PR2685, PR2686, PR2688 -4) bitcasts from vectors to scalars: PR2804 -5) llvm.atomic.cmp.swap.i128.p0i128: PR3462 diff --git a/contrib/llvm/lib/Target/X86/README-X86-64.txt b/contrib/llvm/lib/Target/X86/README-X86-64.txt deleted file mode 100644 index 78c4dc00ee72..000000000000 --- a/contrib/llvm/lib/Target/X86/README-X86-64.txt +++ /dev/null @@ -1,273 +0,0 @@ -//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===// - -AMD64 Optimization Manual 8.2 has some nice information about optimizing integer -multiplication by a constant. How much of it applies to Intel's X86-64 -implementation? There are definite trade-offs to consider: latency vs. register -pressure vs. code size. - -//===---------------------------------------------------------------------===// - -Are we better off using branches instead of cmove to implement FP to -unsigned i64? - -_conv: - ucomiss LC0(%rip), %xmm0 - cvttss2siq %xmm0, %rdx - jb L3 - subss LC0(%rip), %xmm0 - movabsq $-9223372036854775808, %rax - cvttss2siq %xmm0, %rdx - xorq %rax, %rdx -L3: - movq %rdx, %rax - ret - -instead of - -_conv: - movss LCPI1_0(%rip), %xmm1 - cvttss2siq %xmm0, %rcx - movaps %xmm0, %xmm2 - subss %xmm1, %xmm2 - cvttss2siq %xmm2, %rax - movabsq $-9223372036854775808, %rdx - xorq %rdx, %rax - ucomiss %xmm1, %xmm0 - cmovb %rcx, %rax - ret - -Seems like the jb branch has high likelyhood of being taken. It would have -saved a few instructions. 
- -//===---------------------------------------------------------------------===// - -Poor codegen: - -int X[2]; -int b; -void test(void) { - memset(X, b, 2*sizeof(X[0])); -} - -llc: - movq _b@GOTPCREL(%rip), %rax - movzbq (%rax), %rax - movq %rax, %rcx - shlq $8, %rcx - orq %rax, %rcx - movq %rcx, %rax - shlq $16, %rax - orq %rcx, %rax - movq %rax, %rcx - shlq $32, %rcx - movq _X@GOTPCREL(%rip), %rdx - orq %rax, %rcx - movq %rcx, (%rdx) - ret - -gcc: - movq _b@GOTPCREL(%rip), %rax - movabsq $72340172838076673, %rdx - movzbq (%rax), %rax - imulq %rdx, %rax - movq _X@GOTPCREL(%rip), %rdx - movq %rax, (%rdx) - ret - -And the codegen is even worse for the following -(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33103): - void fill1(char *s, int a) - { - __builtin_memset(s, a, 15); - } - -For this version, we duplicate the computation of the constant to store. - -//===---------------------------------------------------------------------===// - -It's not possible to reference AH, BH, CH, and DH registers in an instruction -requiring REX prefix. However, divb and mulb both produce results in AH. If isel -emits a CopyFromReg which gets turned into a movb and that can be allocated a -r8b - r15b. - -To get around this, isel emits a CopyFromReg from AX and then right shift it -down by 8 and truncate it. It's not pretty but it works. We need some register -allocation magic to make the hack go away (e.g. putting additional constraints -on the result of the movb). - -//===---------------------------------------------------------------------===// - -The x86-64 ABI for hidden-argument struct returns requires that the -incoming value of %rdi be copied into %rax by the callee upon return. - -The idea is that it saves callers from having to remember this value, -which would often require a callee-saved register. Callees usually -need to keep this value live for most of their body anyway, so it -doesn't add a significant burden on them. - -We currently implement this in codegen, however this is suboptimal -because it means that it would be quite awkward to implement the -optimization for callers. - -A better implementation would be to relax the LLVM IR rules for sret -arguments to allow a function with an sret argument to have a non-void -return type, and to have the front-end to set up the sret argument value -as the return value of the function. The front-end could more easily -emit uses of the returned struct value to be in terms of the function's -lowered return value, and it would free non-C frontends from a -complication only required by a C-based ABI. 
- -//===---------------------------------------------------------------------===// - -We get a redundant zero extension for code like this: - -int mask[1000]; -int foo(unsigned x) { - if (x < 10) - x = x * 45; - else - x = x * 78; - return mask[x]; -} - -_foo: -LBB1_0: ## entry - cmpl $9, %edi - jbe LBB1_3 ## bb -LBB1_1: ## bb1 - imull $78, %edi, %eax -LBB1_2: ## bb2 - movl %eax, %eax <---- - movq _mask@GOTPCREL(%rip), %rcx - movl (%rcx,%rax,4), %eax - ret -LBB1_3: ## bb - imull $45, %edi, %eax - jmp LBB1_2 ## bb2 - -Before regalloc, we have: - - %reg1025<def> = IMUL32rri8 %reg1024, 45, %EFLAGS<imp-def> - JMP mbb<bb2,0x203afb0> - Successors according to CFG: 0x203afb0 (#3) - -bb1: 0x203af60, LLVM BB @0x1e02310, ID#2: - Predecessors according to CFG: 0x203aec0 (#0) - %reg1026<def> = IMUL32rri8 %reg1024, 78, %EFLAGS<imp-def> - Successors according to CFG: 0x203afb0 (#3) - -bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3: - Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2) - %reg1027<def> = PHI %reg1025, mbb<bb,0x203af10>, - %reg1026, mbb<bb1,0x203af60> - %reg1029<def> = MOVZX64rr32 %reg1027 - -so we'd have to know that IMUL32rri8 leaves the high word zero extended and to -be able to recognize the zero extend. This could also presumably be implemented -if we have whole-function selectiondags. - -//===---------------------------------------------------------------------===// - -Take the following C code -(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43640): - -struct u1 -{ - float x; - float y; -}; - -float foo(struct u1 u) -{ - return u.x + u.y; -} - -Optimizes to the following IR: -define float @foo(double %u.0) nounwind readnone { -entry: - %tmp8 = bitcast double %u.0 to i64 ; <i64> [#uses=2] - %tmp6 = trunc i64 %tmp8 to i32 ; <i32> [#uses=1] - %tmp7 = bitcast i32 %tmp6 to float ; <float> [#uses=1] - %tmp2 = lshr i64 %tmp8, 32 ; <i64> [#uses=1] - %tmp3 = trunc i64 %tmp2 to i32 ; <i32> [#uses=1] - %tmp4 = bitcast i32 %tmp3 to float ; <float> [#uses=1] - %0 = fadd float %tmp7, %tmp4 ; <float> [#uses=1] - ret float %0 -} - -And current llvm-gcc/clang output: - movd %xmm0, %rax - movd %eax, %xmm1 - shrq $32, %rax - movd %eax, %xmm0 - addss %xmm1, %xmm0 - ret - -We really shouldn't move the floats to RAX, only to immediately move them -straight back to the XMM registers. - -There really isn't any good way to handle this purely in IR optimizers; it -could possibly be handled by changing the output of the fronted, though. It -would also be feasible to add a x86-specific DAGCombine to optimize the -bitcast+trunc+(lshr+)bitcast combination. - -//===---------------------------------------------------------------------===// - -Take the following code -(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653): -extern unsigned long table[]; -unsigned long foo(unsigned char *p) { - unsigned long tag = *p; - return table[tag >> 4] + table[tag & 0xf]; -} - -Current code generated: - movzbl (%rdi), %eax - movq %rax, %rcx - andq $240, %rcx - shrq %rcx - andq $15, %rax - movq table(,%rax,8), %rax - addq table(%rcx), %rax - ret - -Issues: -1. First movq should be movl; saves a byte. -2. Both andq's should be andl; saves another two bytes. I think this was - implemented at one point, but subsequently regressed. -3. shrq should be shrl; saves another byte. -4. The first andq can be completely eliminated by using a slightly more - expensive addressing mode. 
- -//===---------------------------------------------------------------------===// - -Consider the following (contrived testcase, but contains common factors): - -#include <stdarg.h> -int test(int x, ...) { - int sum, i; - va_list l; - va_start(l, x); - for (i = 0; i < x; i++) - sum += va_arg(l, int); - va_end(l); - return sum; -} - -Testcase given in C because fixing it will likely involve changing the IR -generated for it. The primary issue with the result is that it doesn't do any -of the optimizations which are possible if we know the address of a va_list -in the current function is never taken: -1. We shouldn't spill the XMM registers because we only call va_arg with "int". -2. It would be nice if we could scalarrepl the va_list. -3. Probably overkill, but it'd be cool if we could peel off the first five -iterations of the loop. - -Other optimizations involving functions which use va_arg on floats which don't -have the address of a va_list taken: -1. Conversely to the above, we shouldn't spill general registers if we only - call va_arg on "double". -2. If we know nothing more than 64 bits wide is read from the XMM registers, - we can change the spilling code to reduce the amount of stack used by half. - -//===---------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/README.txt b/contrib/llvm/lib/Target/X86/README.txt deleted file mode 100644 index a305ae6ec550..000000000000 --- a/contrib/llvm/lib/Target/X86/README.txt +++ /dev/null @@ -1,1962 +0,0 @@ -//===---------------------------------------------------------------------===// -// Random ideas for the X86 backend. -//===---------------------------------------------------------------------===// - -We should add support for the "movbe" instruction, which does a byte-swapping -copy (3-addr bswap + memory support?) This is available on Atom processors. - -//===---------------------------------------------------------------------===// - -CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86 -backend knows how to three-addressify this shift, but it appears the register -allocator isn't even asking it to do so in this case. We should investigate -why this isn't happening, it could have significant impact on other important -cases for X86 as well. - -//===---------------------------------------------------------------------===// - -This should be one DIV/IDIV instruction, not a libcall: - -unsigned test(unsigned long long X, unsigned Y) { - return X/Y; -} - -This can be done trivially with a custom legalizer. What about overflow -though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224 - -//===---------------------------------------------------------------------===// - -Improvements to the multiply -> shift/add algorithm: -http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html - -//===---------------------------------------------------------------------===// - -Improve code like this (occurs fairly frequently, e.g. in LLVM): -long long foo(int x) { return 1LL << x; } - -http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html -http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html -http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html - -Another useful one would be ~0ULL >> X and ~0ULL << X. - -One better solution for 1LL << x is: - xorl %eax, %eax - xorl %edx, %edx - testb $32, %cl - sete %al - setne %dl - sall %cl, %eax - sall %cl, %edx - -But that requires good 8-bit subreg support. - -Also, this might be better. 
It's an extra shift, but it's one instruction -shorter, and doesn't stress 8-bit subreg support. -(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html, -but without the unnecessary and.) - movl %ecx, %eax - shrl $5, %eax - movl %eax, %edx - xorl $1, %edx - sall %cl, %eax - sall %cl. %edx - -64-bit shifts (in general) expand to really bad code. Instead of using -cmovs, we should expand to a conditional branch like GCC produces. - -//===---------------------------------------------------------------------===// - -Compile this: -_Bool f(_Bool a) { return a!=1; } - -into: - movzbl %dil, %eax - xorl $1, %eax - ret - -(Although note that this isn't a legal way to express the code that llvm-gcc -currently generates for that function.) - -//===---------------------------------------------------------------------===// - -Some isel ideas: - -1. Dynamic programming based approach when compile time if not an - issue. -2. Code duplication (addressing mode) during isel. -3. Other ideas from "Register-Sensitive Selection, Duplication, and - Sequencing of Instructions". -4. Scheduling for reduced register pressure. E.g. "Minimum Register - Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs" - and other related papers. - http://citeseer.ist.psu.edu/govindarajan01minimum.html - -//===---------------------------------------------------------------------===// - -Should we promote i16 to i32 to avoid partial register update stalls? - -//===---------------------------------------------------------------------===// - -Leave any_extend as pseudo instruction and hint to register -allocator. Delay codegen until post register allocation. -Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach -the coalescer how to deal with it though. - -//===---------------------------------------------------------------------===// - -It appears icc use push for parameter passing. Need to investigate. - -//===---------------------------------------------------------------------===// - -Only use inc/neg/not instructions on processors where they are faster than -add/sub/xor. They are slower on the P4 due to only updating some processor -flags. - -//===---------------------------------------------------------------------===// - -The instruction selector sometimes misses folding a load into a compare. The -pattern is written as (cmp reg, (load p)). Because the compare isn't -commutative, it is not matched with the load on both sides. The dag combiner -should be made smart enough to cannonicalize the load into the RHS of a compare -when it can invert the result of the compare for free. - -//===---------------------------------------------------------------------===// - -In many cases, LLVM generates code like this: - -_test: - movl 8(%esp), %eax - cmpl %eax, 4(%esp) - setl %al - movzbl %al, %eax - ret - -on some processors (which ones?), it is more efficient to do this: - -_test: - movl 8(%esp), %ebx - xor %eax, %eax - cmpl %ebx, 4(%esp) - setl %al - ret - -Doing this correctly is tricky though, as the xor clobbers the flags. - -//===---------------------------------------------------------------------===// - -We should generate bts/btr/etc instructions on targets where they are cheap or -when codesize is important. 
e.g., for: - -void setbit(int *target, int bit) { - *target |= (1 << bit); -} -void clearbit(int *target, int bit) { - *target &= ~(1 << bit); -} - -//===---------------------------------------------------------------------===// - -Instead of the following for memset char*, 1, 10: - - movl $16843009, 4(%edx) - movl $16843009, (%edx) - movw $257, 8(%edx) - -It might be better to generate - - movl $16843009, %eax - movl %eax, 4(%edx) - movl %eax, (%edx) - movw al, 8(%edx) - -when we can spare a register. It reduces code size. - -//===---------------------------------------------------------------------===// - -Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently -get this: - -define i32 @test1(i32 %X) { - %Y = sdiv i32 %X, 8 - ret i32 %Y -} - -_test1: - movl 4(%esp), %eax - movl %eax, %ecx - sarl $31, %ecx - shrl $29, %ecx - addl %ecx, %eax - sarl $3, %eax - ret - -GCC knows several different ways to codegen it, one of which is this: - -_test1: - movl 4(%esp), %eax - cmpl $-1, %eax - leal 7(%eax), %ecx - cmovle %ecx, %eax - sarl $3, %eax - ret - -which is probably slower, but it's interesting at least :) - -//===---------------------------------------------------------------------===// - -We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl -We should leave these as libcalls for everything over a much lower threshold, -since libc is hand tuned for medium and large mem ops (avoiding RFO for large -stores, TLB preheating, etc) - -//===---------------------------------------------------------------------===// - -Optimize this into something reasonable: - x * copysign(1.0, y) * copysign(1.0, z) - -//===---------------------------------------------------------------------===// - -Optimize copysign(x, *y) to use an integer load from y. - -//===---------------------------------------------------------------------===// - -The following tests perform worse with LSR: - -lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor. - -//===---------------------------------------------------------------------===// - -Adding to the list of cmp / test poor codegen issues: - -int test(__m128 *A, __m128 *B) { - if (_mm_comige_ss(*A, *B)) - return 3; - else - return 4; -} - -_test: - movl 8(%esp), %eax - movaps (%eax), %xmm0 - movl 4(%esp), %eax - movaps (%eax), %xmm1 - comiss %xmm0, %xmm1 - setae %al - movzbl %al, %ecx - movl $3, %eax - movl $4, %edx - cmpl $0, %ecx - cmove %edx, %eax - ret - -Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There -are a number of issues. 1) We are introducing a setcc between the result of the -intrisic call and select. 2) The intrinsic is expected to produce a i32 value -so a any extend (which becomes a zero extend) is added. - -We probably need some kind of target DAG combine hook to fix this. - -//===---------------------------------------------------------------------===// - -We generate significantly worse code for this than GCC: -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150 -http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701 - -There is also one case we do worse on PPC. - -//===---------------------------------------------------------------------===// - -For this: - -int test(int a) -{ - return a * 3; -} - -We currently emits - imull $3, 4(%esp), %eax - -Perhaps this is what we really should generate is? Is imull three or four -cycles? 
Note: ICC generates this: - movl 4(%esp), %eax - leal (%eax,%eax,2), %eax - -The current instruction priority is based on pattern complexity. The former is -more "complex" because it folds a load so the latter will not be emitted. - -Perhaps we should use AddedComplexity to give LEA32r a higher priority? We -should always try to match LEA first since the LEA matching code does some -estimate to determine whether the match is profitable. - -However, if we care more about code size, then imull is better. It's two bytes -shorter than movl + leal. - -On a Pentium M, both variants have the same characteristics with regard -to throughput; however, the multiplication has a latency of four cycles, as -opposed to two cycles for the movl+lea variant. - -//===---------------------------------------------------------------------===// - -__builtin_ffs codegen is messy. - -int ffs_(unsigned X) { return __builtin_ffs(X); } - -llvm produces: -ffs_: - movl 4(%esp), %ecx - bsfl %ecx, %eax - movl $32, %edx - cmove %edx, %eax - incl %eax - xorl %edx, %edx - testl %ecx, %ecx - cmove %edx, %eax - ret - -vs gcc: - -_ffs_: - movl $-1, %edx - bsfl 4(%esp), %eax - cmove %edx, %eax - addl $1, %eax - ret - -Another example of __builtin_ffs (use predsimplify to eliminate a select): - -int foo (unsigned long j) { - if (j) - return __builtin_ffs (j) - 1; - else - return 0; -} - -//===---------------------------------------------------------------------===// - -It appears gcc place string data with linkonce linkage in -.section __TEXT,__const_coal,coalesced instead of -.section __DATA,__const_coal,coalesced. -Take a look at darwin.h, there are other Darwin assembler directives that we -do not make use of. - -//===---------------------------------------------------------------------===// - -define i32 @foo(i32* %a, i32 %t) { -entry: - br label %cond_true - -cond_true: ; preds = %cond_true, %entry - %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; <i32> [#uses=3] - %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; <i32> [#uses=1] - %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; <i32*> [#uses=1] - %tmp3 = load i32* %tmp2 ; <i32> [#uses=1] - %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; <i32> [#uses=1] - %tmp7 = add i32 %tmp5, %tmp3 ; <i32> [#uses=2] - %tmp9 = add i32 %x.0.0, 1 ; <i32> [#uses=2] - %tmp = icmp sgt i32 %tmp9, 39 ; <i1> [#uses=1] - br i1 %tmp, label %bb12, label %cond_true - -bb12: ; preds = %cond_true - ret i32 %tmp7 -} -is pessimized by -loop-reduce and -indvars - -//===---------------------------------------------------------------------===// - -u32 to float conversion improvement: - -float uint32_2_float( unsigned u ) { - float fl = (int) (u & 0xffff); - float fh = (int) (u >> 16); - fh *= 0x1.0p16f; - return fh + fl; -} - -00000000 subl $0x04,%esp -00000003 movl 0x08(%esp,1),%eax -00000007 movl %eax,%ecx -00000009 shrl $0x10,%ecx -0000000c cvtsi2ss %ecx,%xmm0 -00000010 andl $0x0000ffff,%eax -00000015 cvtsi2ss %eax,%xmm1 -00000019 mulss 0x00000078,%xmm0 -00000021 addss %xmm1,%xmm0 -00000025 movss %xmm0,(%esp,1) -0000002a flds (%esp,1) -0000002d addl $0x04,%esp -00000030 ret - -//===---------------------------------------------------------------------===// - -When using fastcc abi, align stack slot of argument of type double on 8 byte -boundary to improve performance. 
- -//===---------------------------------------------------------------------===// - -Codegen: - -int f(int a, int b) { - if (a == 4 || a == 6) - b++; - return b; -} - - -as: - -or eax, 2 -cmp eax, 6 -jz label - -//===---------------------------------------------------------------------===// - -GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting -simplifications for integer "x cmp y ? a : b". For example, instead of: - -int G; -void f(int X, int Y) { - G = X < 0 ? 14 : 13; -} - -compiling to: - -_f: - movl $14, %eax - movl $13, %ecx - movl 4(%esp), %edx - testl %edx, %edx - cmovl %eax, %ecx - movl %ecx, _G - ret - -it could be: -_f: - movl 4(%esp), %eax - sarl $31, %eax - notl %eax - addl $14, %eax - movl %eax, _G - ret - -etc. - -Another is: -int usesbb(unsigned int a, unsigned int b) { - return (a < b ? -1 : 0); -} -to: -_usesbb: - movl 8(%esp), %eax - cmpl %eax, 4(%esp) - sbbl %eax, %eax - ret - -instead of: -_usesbb: - xorl %eax, %eax - movl 8(%esp), %ecx - cmpl %ecx, 4(%esp) - movl $4294967295, %ecx - cmovb %ecx, %eax - ret - -//===---------------------------------------------------------------------===// - -Consider the expansion of: - -define i32 @test3(i32 %X) { - %tmp1 = urem i32 %X, 255 - ret i32 %tmp1 -} - -Currently it compiles to: - -... - movl $2155905153, %ecx - movl 8(%esp), %esi - movl %esi, %eax - mull %ecx -... - -This could be "reassociated" into: - - movl $2155905153, %eax - movl 8(%esp), %ecx - mull %ecx - -to avoid the copy. In fact, the existing two-address stuff would do this -except that mul isn't a commutative 2-addr instruction. I guess this has -to be done at isel time based on the #uses to mul? - -//===---------------------------------------------------------------------===// - -Make sure the instruction which starts a loop does not cross a cacheline -boundary. This requires knowning the exact length of each machine instruction. -That is somewhat complicated, but doable. Example 256.bzip2: - -In the new trace, the hot loop has an instruction which crosses a cacheline -boundary. In addition to potential cache misses, this can't help decoding as I -imagine there has to be some kind of complicated decoder reset and realignment -to grab the bytes from the next cacheline. - -532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines -942 942 0x3d03 movl %dh, (1809(%esp, %esi) -937 937 0x3d0a incl %esi -3 3 0x3d0b cmpb %bl, %dl -27 27 0x3d0d jnz 0x000062db <main+11707> - -//===---------------------------------------------------------------------===// - -In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE. - -//===---------------------------------------------------------------------===// - -This could be a single 16-bit load. - -int f(char *p) { - if ((p[0] == 1) & (p[1] == 2)) return 1; - return 0; -} - -//===---------------------------------------------------------------------===// - -We should inline lrintf and probably other libc functions. - -//===---------------------------------------------------------------------===// - -Use the FLAGS values from arithmetic instructions more. For example, compile: - -int add_zf(int *x, int y, int a, int b) { - if ((*x += y) == 0) - return a; - else - return b; -} - -to: - addl %esi, (%rdi) - movl %edx, %eax - cmovne %ecx, %eax - ret -instead of: - -_add_zf: - addl (%rdi), %esi - movl %esi, (%rdi) - testl %esi, %esi - cmove %edx, %ecx - movl %ecx, %eax - ret - -As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll -without a test instruction. 
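A sketch of a closely related case (hypothetical function, same idea with a
subtraction):

  int sub_zf(int *x, int y, int a, int b) {
    if ((*x -= y) == 0)
      return a;
    else
      return b;
  }

The subl that performs the read-modify-write already sets ZF, so the ideal
code is a memory-operand subl followed directly by a cmov, with no separate
testl/cmpl.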
- -//===---------------------------------------------------------------------===// - -These two functions have identical effects: - -unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;} -unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;} - -We currently compile them to: - -_f: - movl 4(%esp), %eax - movl %eax, %ecx - incl %ecx - movl 8(%esp), %edx - cmpl %edx, %ecx - jne LBB1_2 #UnifiedReturnBlock -LBB1_1: #cond_true - addl $2, %eax - ret -LBB1_2: #UnifiedReturnBlock - movl %ecx, %eax - ret -_f2: - movl 4(%esp), %eax - movl %eax, %ecx - incl %ecx - cmpl 8(%esp), %ecx - sete %cl - movzbl %cl, %ecx - leal 1(%ecx,%eax), %eax - ret - -both of which are inferior to GCC's: - -_f: - movl 4(%esp), %edx - leal 1(%edx), %eax - addl $2, %edx - cmpl 8(%esp), %eax - cmove %edx, %eax - ret -_f2: - movl 4(%esp), %eax - addl $1, %eax - xorl %edx, %edx - cmpl 8(%esp), %eax - sete %dl - addl %edx, %eax - ret - -//===---------------------------------------------------------------------===// - -This code: - -void test(int X) { - if (X) abort(); -} - -is currently compiled to: - -_test: - subl $12, %esp - cmpl $0, 16(%esp) - jne LBB1_1 - addl $12, %esp - ret -LBB1_1: - call L_abort$stub - -It would be better to produce: - -_test: - subl $12, %esp - cmpl $0, 16(%esp) - jne L_abort$stub - addl $12, %esp - ret - -This can be applied to any no-return function call that takes no arguments etc. -Alternatively, the stack save/restore logic could be shrink-wrapped, producing -something like this: - -_test: - cmpl $0, 4(%esp) - jne LBB1_1 - ret -LBB1_1: - subl $12, %esp - call L_abort$stub - -Both are useful in different situations. Finally, it could be shrink-wrapped -and tail called, like this: - -_test: - cmpl $0, 4(%esp) - jne LBB1_1 - ret -LBB1_1: - pop %eax # realign stack. - call L_abort$stub - -Though this probably isn't worth it. - -//===---------------------------------------------------------------------===// - -Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with -a neg instead of a sub instruction. Consider: - -int test(char X) { return 7-X; } - -we currently produce: -_test: - movl $7, %eax - movsbl 4(%esp), %ecx - subl %ecx, %eax - ret - -We would use one fewer register if codegen'd as: - - movsbl 4(%esp), %eax - neg %eax - add $7, %eax - ret - -Note that this isn't beneficial if the load can be folded into the sub. In -this case, we want a sub: - -int test(int X) { return 7-X; } -_test: - movl $7, %eax - subl 4(%esp), %eax - ret - -//===---------------------------------------------------------------------===// - -Leaf functions that require one 4-byte spill slot have a prolog like this: - -_foo: - pushl %esi - subl $4, %esp -... -and an epilog like this: - addl $4, %esp - popl %esi - ret - -It would be smaller, and potentially faster, to push eax on entry and to -pop into a dummy register instead of using addl/subl of esp. Just don't pop -into any return registers :) - -//===---------------------------------------------------------------------===// - -The X86 backend should fold (branch (or (setcc, setcc))) into multiple -branches. We generate really poor code for: - -double testf(double a) { - return a == 0.0 ? 0.0 : (a > 0.0 ? 
1.0 : -1.0); -} - -For example, the entry BB is: - -_testf: - subl $20, %esp - pxor %xmm0, %xmm0 - movsd 24(%esp), %xmm1 - ucomisd %xmm0, %xmm1 - setnp %al - sete %cl - testb %cl, %al - jne LBB1_5 # UnifiedReturnBlock -LBB1_1: # cond_true - - -it would be better to replace the last four instructions with: - - jp LBB1_1 - je LBB1_5 -LBB1_1: - -We also codegen the inner ?: into a diamond: - - cvtss2sd LCPI1_0(%rip), %xmm2 - cvtss2sd LCPI1_1(%rip), %xmm3 - ucomisd %xmm1, %xmm0 - ja LBB1_3 # cond_true -LBB1_2: # cond_true - movapd %xmm3, %xmm2 -LBB1_3: # cond_true - movapd %xmm2, %xmm0 - ret - -We should sink the load into xmm3 into the LBB1_2 block. This should -be pretty easy, and will nuke all the copies. - -//===---------------------------------------------------------------------===// - -This: - #include <algorithm> - inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b) - { return std::make_pair(a + b, a + b < a); } - bool no_overflow(unsigned a, unsigned b) - { return !full_add(a, b).second; } - -Should compile to: - - - _Z11no_overflowjj: - addl %edi, %esi - setae %al - ret - -FIXME: That code looks wrong; bool return is normally defined as zext. - -on x86-64, not: - -__Z11no_overflowjj: - addl %edi, %esi - cmpl %edi, %esi - setae %al - movzbl %al, %eax - ret - - -//===---------------------------------------------------------------------===// - -The following code: - -bb114.preheader: ; preds = %cond_next94 - %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1] - %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1] - %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1] - %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1] - %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1] - %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2] - %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1] - %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1] - %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1] - %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1] - %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1] - br label %bb114 - -produces: - -LBB3_5: # bb114.preheader - movswl -68(%ebp), %eax - movl $32, %ecx - movl %ecx, -80(%ebp) - subl %eax, -80(%ebp) - movswl -52(%ebp), %eax - movl %ecx, -84(%ebp) - subl %eax, -84(%ebp) - movswl -70(%ebp), %eax - movl %ecx, -88(%ebp) - subl %eax, -88(%ebp) - movswl -50(%ebp), %eax - subl %eax, %ecx - movl %ecx, -76(%ebp) - movswl -42(%ebp), %eax - movl %eax, -92(%ebp) - movswl -66(%ebp), %eax - movl %eax, -96(%ebp) - movw $0, -98(%ebp) - -This appears to be bad because the RA is not folding the store to the stack -slot into the movl. The above instructions could be: - movl $32, -80(%ebp) -... - movl $32, -84(%ebp) -... -This seems like a cross between remat and spill folding. - -This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't -change, so we could simply subtract %eax from %ecx first and then use %ecx (or -vice-versa). - -//===---------------------------------------------------------------------===// - -This code: - - %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1] - br i1 %tmp659, label %cond_true662, label %cond_next715 - -produces this: - - testw %cx, %cx - movswl %cx, %esi - jns LBB4_109 # cond_next715 - -Shark tells us that using %cx in the testw instruction is sub-optimal. It -suggests using the 32-bit register (which is what ICC uses). 
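A reduced C form of the pattern (hypothetical; the real code comes out of
larger signed-short arithmetic):

  int is_negative(short *p) {
    short v = *p;   /* %cx in the listing above */
    return v < 0;   /* the sign test can use the sign-extended 32-bit copy
                       that the movswl already produced */
  }

Since the 32-bit copy exists anyway, a testl on it avoids the 16-bit
operand-size prefix that testw needs, which is presumably what Shark is
complaining about.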
- -//===---------------------------------------------------------------------===// - -We compile this: - -void compare (long long foo) { - if (foo < 4294967297LL) - abort(); -} - -to: - -compare: - subl $4, %esp - cmpl $0, 8(%esp) - setne %al - movzbw %al, %ax - cmpl $1, 12(%esp) - setg %cl - movzbw %cl, %cx - cmove %ax, %cx - testb $1, %cl - jne .LBB1_2 # UnifiedReturnBlock -.LBB1_1: # ifthen - call abort -.LBB1_2: # UnifiedReturnBlock - addl $4, %esp - ret - -(also really horrible code on ppc). This is due to the expand code for 64-bit -compares. GCC produces multiple branches, which is much nicer: - -compare: - subl $12, %esp - movl 20(%esp), %edx - movl 16(%esp), %eax - decl %edx - jle .L7 -.L5: - addl $12, %esp - ret - .p2align 4,,7 -.L7: - jl .L4 - cmpl $0, %eax - .p2align 4,,8 - ja .L5 -.L4: - .p2align 4,,9 - call abort - -//===---------------------------------------------------------------------===// - -Tail call optimization improvements: Tail call optimization currently -pushes all arguments on the top of the stack (their normal place for -non-tail call optimized calls) that source from the callers arguments -or that source from a virtual register (also possibly sourcing from -callers arguments). -This is done to prevent overwriting of parameters (see example -below) that might be used later. - -example: - -int callee(int32, int64); -int caller(int32 arg1, int32 arg2) { - int64 local = arg2 * 2; - return callee(arg2, (int64)local); -} - -[arg1] [!arg2 no longer valid since we moved local onto it] -[arg2] -> [(int64) -[RETADDR] local ] - -Moving arg1 onto the stack slot of callee function would overwrite -arg2 of the caller. - -Possible optimizations: - - - - Analyse the actual parameters of the callee to see which would - overwrite a caller parameter which is used by the callee and only - push them onto the top of the stack. - - int callee (int32 arg1, int32 arg2); - int caller (int32 arg1, int32 arg2) { - return callee(arg1,arg2); - } - - Here we don't need to write any variables to the top of the stack - since they don't overwrite each other. - - int callee (int32 arg1, int32 arg2); - int caller (int32 arg1, int32 arg2) { - return callee(arg2,arg1); - } - - Here we need to push the arguments because they overwrite each - other. - -//===---------------------------------------------------------------------===// - -main () -{ - int i = 0; - unsigned long int z = 0; - - do { - z -= 0x00004000; - i++; - if (i > 0x00040000) - abort (); - } while (z > 0); - exit (0); -} - -gcc compiles this to: - -_main: - subl $28, %esp - xorl %eax, %eax - jmp L2 -L3: - cmpl $262144, %eax - je L10 -L2: - addl $1, %eax - cmpl $262145, %eax - jne L3 - call L_abort$stub -L10: - movl $0, (%esp) - call L_exit$stub - -llvm: - -_main: - subl $12, %esp - movl $1, %eax - movl $16384, %ecx -LBB1_1: # bb - cmpl $262145, %eax - jge LBB1_4 # cond_true -LBB1_2: # cond_next - incl %eax - addl $4294950912, %ecx - cmpl $16384, %ecx - jne LBB1_1 # bb -LBB1_3: # bb11 - xorl %eax, %eax - addl $12, %esp - ret -LBB1_4: # cond_true - call L_abort$stub - -1. LSR should rewrite the first cmp with induction variable %ecx. -2. 
DAG combiner should fold - leal 1(%eax), %edx - cmpl $262145, %edx - => - cmpl $262144, %eax - -//===---------------------------------------------------------------------===// - -define i64 @test(double %X) { - %Y = fptosi double %X to i64 - ret i64 %Y -} - -compiles to: - -_test: - subl $20, %esp - movsd 24(%esp), %xmm0 - movsd %xmm0, 8(%esp) - fldl 8(%esp) - fisttpll (%esp) - movl 4(%esp), %edx - movl (%esp), %eax - addl $20, %esp - #FP_REG_KILL - ret - -This should just fldl directly from the input stack slot. - -//===---------------------------------------------------------------------===// - -This code: -int foo (int x) { return (x & 65535) | 255; } - -Should compile into: - -_foo: - movzwl 4(%esp), %eax - orl $255, %eax - ret - -instead of: -_foo: - movl $255, %eax - orl 4(%esp), %eax - andl $65535, %eax - ret - -//===---------------------------------------------------------------------===// - -We're codegen'ing multiply of long longs inefficiently: - -unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) { - return arg1 * arg2; -} - -We compile to (fomit-frame-pointer): - -_LLM: - pushl %esi - movl 8(%esp), %ecx - movl 16(%esp), %esi - movl %esi, %eax - mull %ecx - imull 12(%esp), %esi - addl %edx, %esi - imull 20(%esp), %ecx - movl %esi, %edx - addl %ecx, %edx - popl %esi - ret - -This looks like a scheduling deficiency and lack of remat of the load from -the argument area. ICC apparently produces: - - movl 8(%esp), %ecx - imull 12(%esp), %ecx - movl 16(%esp), %eax - imull 4(%esp), %eax - addl %eax, %ecx - movl 4(%esp), %eax - mull 12(%esp) - addl %ecx, %edx - ret - -Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR: -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236 - -//===---------------------------------------------------------------------===// - -We can fold a store into "zeroing a reg". Instead of: - -xorl %eax, %eax -movl %eax, 124(%esp) - -we should get: - -movl $0, 124(%esp) - -if the flags of the xor are dead. - -Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should -be folded into: shl [mem], 1 - -//===---------------------------------------------------------------------===// - -In SSE mode, we turn abs and neg into a load from the constant pool plus a xor -or and instruction, for example: - - xorpd LCPI1_0, %xmm2 - -However, if xmm2 gets spilled, we end up with really ugly code like this: - - movsd (%esp), %xmm0 - xorpd LCPI1_0, %xmm0 - movsd %xmm0, (%esp) - -Since we 'know' that this is a 'neg', we can actually "fold" the spill into -the neg/abs instruction, turning it into an *integer* operation, like this: - - xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31) - -you could also use xorb, but xorl is less likely to lead to a partial register -stall. Here is a contrived testcase: - -double a, b, c; -void test(double *P) { - double X = *P; - a = X; - bar(); - X = -X; - b = X; - bar(); - c = X; -} - -//===---------------------------------------------------------------------===// - -The generated code on x86 for checking for signed overflow on a multiply the -obvious way is much longer than it needs to be. - -int x(int a, int b) { - long long prod = (long long)a*b; - return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1); -} - -See PR2053 for more details. - -//===---------------------------------------------------------------------===// - -We should investigate using cdq/ctld (effect: edx = sar eax, 31) -more aggressively; it should cost the same as a move+shift on any modern -processor, but it's a lot shorter. 
Downside is that it puts more -pressure on register allocation because it has fixed operands. - -Example: -int abs(int x) {return x < 0 ? -x : x;} - -gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.: -abs: - movl 4(%esp), %eax - cltd - xorl %edx, %eax - subl %edx, %eax - ret - -//===---------------------------------------------------------------------===// - -Consider: -int test(unsigned long a, unsigned long b) { return -(a < b); } - -We currently compile this to: - -define i32 @test(i32 %a, i32 %b) nounwind { - %tmp3 = icmp ult i32 %a, %b ; <i1> [#uses=1] - %tmp34 = zext i1 %tmp3 to i32 ; <i32> [#uses=1] - %tmp5 = sub i32 0, %tmp34 ; <i32> [#uses=1] - ret i32 %tmp5 -} - -and - -_test: - movl 8(%esp), %eax - cmpl %eax, 4(%esp) - setb %al - movzbl %al, %eax - negl %eax - ret - -Several deficiencies here. First, we should instcombine zext+neg into sext: - -define i32 @test2(i32 %a, i32 %b) nounwind { - %tmp3 = icmp ult i32 %a, %b ; <i1> [#uses=1] - %tmp34 = sext i1 %tmp3 to i32 ; <i32> [#uses=1] - ret i32 %tmp34 -} - -However, before we can do that, we have to fix the bad codegen that we get for -sext from bool: - -_test2: - movl 8(%esp), %eax - cmpl %eax, 4(%esp) - setb %al - movzbl %al, %eax - shll $31, %eax - sarl $31, %eax - ret - -This code should be at least as good as the code above. Once this is fixed, we -can optimize this specific case even more to: - - movl 8(%esp), %eax - xorl %ecx, %ecx - cmpl %eax, 4(%esp) - sbbl %ecx, %ecx - -//===---------------------------------------------------------------------===// - -Take the following code (from -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541): - -extern unsigned char first_one[65536]; -int FirstOnet(unsigned long long arg1) -{ - if (arg1 >> 48) - return (first_one[arg1 >> 48]); - return 0; -} - - -The following code is currently generated: -FirstOnet: - movl 8(%esp), %eax - cmpl $65536, %eax - movl 4(%esp), %ecx - jb .LBB1_2 # UnifiedReturnBlock -.LBB1_1: # ifthen - shrl $16, %eax - movzbl first_one(%eax), %eax - ret -.LBB1_2: # UnifiedReturnBlock - xorl %eax, %eax - ret - -We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this -lets us change the cmpl into a testl, which is shorter, and eliminate the shift. - -//===---------------------------------------------------------------------===// - -We compile this function: - -define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind { -entry: - %tmp2 = icmp eq i8 %d, 0 ; <i1> [#uses=1] - br i1 %tmp2, label %bb7, label %bb - -bb: ; preds = %entry - %tmp6 = add i32 %b, %a ; <i32> [#uses=1] - ret i32 %tmp6 - -bb7: ; preds = %entry - %tmp10 = sub i32 %a, %c ; <i32> [#uses=1] - ret i32 %tmp10 -} - -to: - -foo: # @foo -# BB#0: # %entry - movl 4(%esp), %ecx - cmpb $0, 16(%esp) - je .LBB0_2 -# BB#1: # %bb - movl 8(%esp), %eax - addl %ecx, %eax - ret -.LBB0_2: # %bb7 - movl 12(%esp), %edx - movl %ecx, %eax - subl %edx, %eax - ret - -There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a -couple more movls by putting 4(%esp) into %eax instead of %ecx. - -//===---------------------------------------------------------------------===// - -See rdar://4653682. 
- -From flops: - -LBB1_15: # bb310 - cvtss2sd LCPI1_0, %xmm1 - addsd %xmm1, %xmm0 - movsd 176(%esp), %xmm2 - mulsd %xmm0, %xmm2 - movapd %xmm2, %xmm3 - mulsd %xmm3, %xmm3 - movapd %xmm3, %xmm4 - mulsd LCPI1_23, %xmm4 - addsd LCPI1_24, %xmm4 - mulsd %xmm3, %xmm4 - addsd LCPI1_25, %xmm4 - mulsd %xmm3, %xmm4 - addsd LCPI1_26, %xmm4 - mulsd %xmm3, %xmm4 - addsd LCPI1_27, %xmm4 - mulsd %xmm3, %xmm4 - addsd LCPI1_28, %xmm4 - mulsd %xmm3, %xmm4 - addsd %xmm1, %xmm4 - mulsd %xmm2, %xmm4 - movsd 152(%esp), %xmm1 - addsd %xmm4, %xmm1 - movsd %xmm1, 152(%esp) - incl %eax - cmpl %eax, %esi - jge LBB1_15 # bb310 -LBB1_16: # bb358.loopexit - movsd 152(%esp), %xmm0 - addsd %xmm0, %xmm0 - addsd LCPI1_22, %xmm0 - movsd %xmm0, 152(%esp) - -Rather than spilling the result of the last addsd in the loop, we should have -insert a copy to split the interval (one for the duration of the loop, one -extending to the fall through). The register pressure in the loop isn't high -enough to warrant the spill. - -Also check why xmm7 is not used at all in the function. - -//===---------------------------------------------------------------------===// - -Take the following: - -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" -target triple = "i386-apple-darwin8" -@in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2] -define fastcc void @abort_gzip() noreturn nounwind { -entry: - %tmp.b.i = load i1* @in_exit.4870.b ; <i1> [#uses=1] - br i1 %tmp.b.i, label %bb.i, label %bb4.i -bb.i: ; preds = %entry - tail call void @exit( i32 1 ) noreturn nounwind - unreachable -bb4.i: ; preds = %entry - store i1 true, i1* @in_exit.4870.b - tail call void @exit( i32 1 ) noreturn nounwind - unreachable -} -declare void @exit(i32) noreturn nounwind - -This compiles into: -_abort_gzip: ## @abort_gzip -## BB#0: ## %entry - subl $12, %esp - movb _in_exit.4870.b, %al - cmpb $1, %al - jne LBB0_2 - -We somehow miss folding the movb into the cmpb. - -//===---------------------------------------------------------------------===// - -We compile: - -int test(int x, int y) { - return x-y-1; -} - -into (-m64): - -_test: - decl %edi - movl %edi, %eax - subl %esi, %eax - ret - -it would be better to codegen as: x+~y (notl+addl) - -//===---------------------------------------------------------------------===// - -This code: - -int foo(const char *str,...) 
-{ - __builtin_va_list a; int x; - __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a); - return x; -} - -gets compiled into this on x86-64: - subq $200, %rsp - movaps %xmm7, 160(%rsp) - movaps %xmm6, 144(%rsp) - movaps %xmm5, 128(%rsp) - movaps %xmm4, 112(%rsp) - movaps %xmm3, 96(%rsp) - movaps %xmm2, 80(%rsp) - movaps %xmm1, 64(%rsp) - movaps %xmm0, 48(%rsp) - movq %r9, 40(%rsp) - movq %r8, 32(%rsp) - movq %rcx, 24(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 8(%rsp) - leaq (%rsp), %rax - movq %rax, 192(%rsp) - leaq 208(%rsp), %rax - movq %rax, 184(%rsp) - movl $48, 180(%rsp) - movl $8, 176(%rsp) - movl 176(%rsp), %eax - cmpl $47, %eax - jbe .LBB1_3 # bb -.LBB1_1: # bb3 - movq 184(%rsp), %rcx - leaq 8(%rcx), %rax - movq %rax, 184(%rsp) -.LBB1_2: # bb4 - movl (%rcx), %eax - addq $200, %rsp - ret -.LBB1_3: # bb - movl %eax, %ecx - addl $8, %eax - addq 192(%rsp), %rcx - movl %eax, 176(%rsp) - jmp .LBB1_2 # bb4 - -gcc 4.3 generates: - subq $96, %rsp -.LCFI0: - leaq 104(%rsp), %rax - movq %rsi, -80(%rsp) - movl $8, -120(%rsp) - movq %rax, -112(%rsp) - leaq -88(%rsp), %rax - movq %rax, -104(%rsp) - movl $8, %eax - cmpl $48, %eax - jb .L6 - movq -112(%rsp), %rdx - movl (%rdx), %eax - addq $96, %rsp - ret - .p2align 4,,10 - .p2align 3 -.L6: - mov %eax, %edx - addq -104(%rsp), %rdx - addl $8, %eax - movl %eax, -120(%rsp) - movl (%rdx), %eax - addq $96, %rsp - ret - -and it gets compiled into this on x86: - pushl %ebp - movl %esp, %ebp - subl $4, %esp - leal 12(%ebp), %eax - movl %eax, -4(%ebp) - leal 16(%ebp), %eax - movl %eax, -4(%ebp) - movl 12(%ebp), %eax - addl $4, %esp - popl %ebp - ret - -gcc 4.3 generates: - pushl %ebp - movl %esp, %ebp - movl 12(%ebp), %eax - popl %ebp - ret - -//===---------------------------------------------------------------------===// - -Teach tblgen not to check bitconvert source type in some cases. This allows us -to consolidate the following patterns in X86InstrMMX.td: - -def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))))), - (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>; -def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))))), - (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>; -def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))))), - (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>; - -There are other cases in various td files. - -//===---------------------------------------------------------------------===// - -Take something like the following on x86-32: -unsigned a(unsigned long long x, unsigned y) {return x % y;} - -We currently generate a libcall, but we really shouldn't: the expansion is -shorter and likely faster than the libcall. The expected code is something -like the following: - - movl 12(%ebp), %eax - movl 16(%ebp), %ecx - xorl %edx, %edx - divl %ecx - movl 8(%ebp), %eax - divl %ecx - movl %edx, %eax - ret - -A similar code sequence works for division. - -//===---------------------------------------------------------------------===// - -These should compile to the same code, but the later codegen's to useless -instructions on X86. 
This may be a trivial dag combine (GCC PR7061): - -struct s1 { unsigned char a, b; }; -unsigned long f1(struct s1 x) { - return x.a + x.b; -} -struct s2 { unsigned a: 8, b: 8; }; -unsigned long f2(struct s2 x) { - return x.a + x.b; -} - -//===---------------------------------------------------------------------===// - -We currently compile this: - -define i32 @func1(i32 %v1, i32 %v2) nounwind { -entry: - %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) - %sum = extractvalue {i32, i1} %t, 0 - %obit = extractvalue {i32, i1} %t, 1 - br i1 %obit, label %overflow, label %normal -normal: - ret i32 %sum -overflow: - call void @llvm.trap() - unreachable -} -declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) -declare void @llvm.trap() - -to: - -_func1: - movl 4(%esp), %eax - addl 8(%esp), %eax - jo LBB1_2 ## overflow -LBB1_1: ## normal - ret -LBB1_2: ## overflow - ud2 - -it would be nice to produce "into" someday. - -//===---------------------------------------------------------------------===// - -This code: - -void vec_mpys1(int y[], const int x[], int scaler) { -int i; -for (i = 0; i < 150; i++) - y[i] += (((long long)scaler * (long long)x[i]) >> 31); -} - -Compiles to this loop with GCC 3.x: - -.L5: - movl %ebx, %eax - imull (%edi,%ecx,4) - shrdl $31, %edx, %eax - addl %eax, (%esi,%ecx,4) - incl %ecx - cmpl $149, %ecx - jle .L5 - -llvm-gcc compiles it to the much uglier: - -LBB1_1: ## bb1 - movl 24(%esp), %eax - movl (%eax,%edi,4), %ebx - movl %ebx, %ebp - imull %esi, %ebp - movl %ebx, %eax - mull %ecx - addl %ebp, %edx - sarl $31, %ebx - imull %ecx, %ebx - addl %edx, %ebx - shldl $1, %eax, %ebx - movl 20(%esp), %eax - addl %ebx, (%eax,%edi,4) - incl %edi - cmpl $150, %edi - jne LBB1_1 ## bb1 - -The issue is that we hoist the cast of "scaler" to long long outside of the -loop, the value comes into the loop as two values, and -RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the -constructed BUILD_PAIR which represents the cast value. - -//===---------------------------------------------------------------------===// - -Test instructions can be eliminated by using EFLAGS values from arithmetic -instructions. This is currently not done for mul, and, or, xor, neg, shl, -sra, srl, shld, shrd, atomic ops, and others. It is also currently not done -for read-modify-write instructions. It is also current not done if the -OF or CF flags are needed. - -The shift operators have the complication that when the shift count is -zero, EFLAGS is not set, so they can only subsume a test instruction if -the shift count is known to be non-zero. Also, using the EFLAGS value -from a shift is apparently very slow on some x86 implementations. - -In read-modify-write instructions, the root node in the isel match is -the store, and isel has no way for the use of the EFLAGS result of the -arithmetic to be remapped to the new node. - -Add and subtract instructions set OF on signed overflow and CF on unsiged -overflow, while test instructions always clear OF and CF. In order to -replace a test with an add or subtract in a situation where OF or CF is -needed, codegen must be able to prove that the operation cannot see -signed or unsigned overflow, respectively. - -//===---------------------------------------------------------------------===// - -memcpy/memmove do not lower to SSE copies when possible. 
A silly example is: -define <16 x float> @foo(<16 x float> %A) nounwind { - %tmp = alloca <16 x float>, align 16 - %tmp2 = alloca <16 x float>, align 16 - store <16 x float> %A, <16 x float>* %tmp - %s = bitcast <16 x float>* %tmp to i8* - %s2 = bitcast <16 x float>* %tmp2 to i8* - call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16) - %R = load <16 x float>* %tmp2 - ret <16 x float> %R -} - -declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind - -which compiles to: - -_foo: - subl $140, %esp - movaps %xmm3, 112(%esp) - movaps %xmm2, 96(%esp) - movaps %xmm1, 80(%esp) - movaps %xmm0, 64(%esp) - movl 60(%esp), %eax - movl %eax, 124(%esp) - movl 56(%esp), %eax - movl %eax, 120(%esp) - movl 52(%esp), %eax - <many many more 32-bit copies> - movaps (%esp), %xmm0 - movaps 16(%esp), %xmm1 - movaps 32(%esp), %xmm2 - movaps 48(%esp), %xmm3 - addl $140, %esp - ret - -On Nehalem, it may even be cheaper to just use movups when unaligned than to -fall back to lower-granularity chunks. - -//===---------------------------------------------------------------------===// - -Implement processor-specific optimizations for parity with GCC on these -processors. GCC does two optimizations: - -1. ix86_pad_returns inserts a noop before ret instructions if immediately - preceeded by a conditional branch or is the target of a jump. -2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of - code contains more than 3 branches. - -The first one is done for all AMDs, Core2, and "Generic" -The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona, - Core 2, and "Generic" - -//===---------------------------------------------------------------------===// - -Testcase: -int a(int x) { return (x & 127) > 31; } - -Current output: - movl 4(%esp), %eax - andl $127, %eax - cmpl $31, %eax - seta %al - movzbl %al, %eax - ret - -Ideal output: - xorl %eax, %eax - testl $96, 4(%esp) - setne %al - ret - -This should definitely be done in instcombine, canonicalizing the range -condition into a != condition. We get this IR: - -define i32 @a(i32 %x) nounwind readnone { -entry: - %0 = and i32 %x, 127 ; <i32> [#uses=1] - %1 = icmp ugt i32 %0, 31 ; <i1> [#uses=1] - %2 = zext i1 %1 to i32 ; <i32> [#uses=1] - ret i32 %2 -} - -Instcombine prefers to strength reduce relational comparisons to equality -comparisons when possible, this should be another case of that. This could -be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it -looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already -be redesigned to use ComputeMaskedBits and friends. - - -//===---------------------------------------------------------------------===// -Testcase: -int x(int a) { return (a&0xf0)>>4; } - -Current output: - movl 4(%esp), %eax - shrl $4, %eax - andl $15, %eax - ret - -Ideal output: - movzbl 4(%esp), %eax - shrl $4, %eax - ret - -//===---------------------------------------------------------------------===// - -Testcase: -int x(int a) { return (a & 0x80) ? 0x100 : 0; } -int y(int a) { return (a & 0x80) *2; } - -Current: - testl $128, 4(%esp) - setne %al - movzbl %al, %eax - shll $8, %eax - ret - -Better: - movl 4(%esp), %eax - addl %eax, %eax - andl $256, %eax - ret - -This is another general instcombine transformation that is profitable on all -targets. 
In LLVM IR, these functions look like this: - -define i32 @x(i32 %a) nounwind readnone { -entry: - %0 = and i32 %a, 128 - %1 = icmp eq i32 %0, 0 - %iftmp.0.0 = select i1 %1, i32 0, i32 256 - ret i32 %iftmp.0.0 -} - -define i32 @y(i32 %a) nounwind readnone { -entry: - %0 = shl i32 %a, 1 - %1 = and i32 %0, 256 - ret i32 %1 -} - -Replacing an icmp+select with a shift should always be considered profitable in -instcombine. - -//===---------------------------------------------------------------------===// - -Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch -properly. - -When the return value is not used (i.e. only care about the value in the -memory), x86 does not have to use add to implement these. Instead, it can use -add, sub, inc, dec instructions with the "lock" prefix. - -This is currently implemented using a bit of instruction selection trick. The -issue is the target independent pattern produces one output and a chain and we -want to map it into one that just output a chain. The current trick is to select -it into a MERGE_VALUES with the first definition being an implicit_def. The -proper solution is to add new ISD opcodes for the no-output variant. DAG -combiner can then transform the node before it gets to target node selection. - -Problem #2 is we are adding a whole bunch of x86 atomic instructions when in -fact these instructions are identical to the non-lock versions. We need a way to -add target specific information to target nodes and have this information -carried over to machine instructions. Asm printer (or JIT) can use this -information to add the "lock" prefix. - -//===---------------------------------------------------------------------===// - -_Bool bar(int *x) { return *x & 1; } - -define zeroext i1 @bar(i32* nocapture %x) nounwind readonly { -entry: - %tmp1 = load i32* %x ; <i32> [#uses=1] - %and = and i32 %tmp1, 1 ; <i32> [#uses=1] - %tobool = icmp ne i32 %and, 0 ; <i1> [#uses=1] - ret i1 %tobool -} - -bar: # @bar -# BB#0: # %entry - movl 4(%esp), %eax - movb (%eax), %al - andb $1, %al - movzbl %al, %eax - ret - -Missed optimization: should be movl+andl. - -//===---------------------------------------------------------------------===// - -Consider the following two functions compiled with clang: -_Bool foo(int *x) { return !(*x & 4); } -unsigned bar(int *x) { return !(*x & 4); } - -foo: - movl 4(%esp), %eax - testb $4, (%eax) - sete %al - movzbl %al, %eax - ret - -bar: - movl 4(%esp), %eax - movl (%eax), %eax - shrl $2, %eax - andl $1, %eax - xorl $1, %eax - ret - -The second function generates more code even though the two functions are -are functionally identical. - -//===---------------------------------------------------------------------===// - -Take the following C code: -int x(int y) { return (y & 63) << 14; } - -Code produced by gcc: - andl $63, %edi - sall $14, %edi - movl %edi, %eax - ret - -Code produced by clang: - shll $14, %edi - movl %edi, %eax - andl $1032192, %eax - ret - -The code produced by gcc is 3 bytes shorter. This sort of construct often -shows up with bitfields. 
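A bitfield example of the same shape (the layout is made up, and assumes the
usual x86 convention that the first declared field occupies the low-order
bits of the 32-bit storage unit):

  struct bits { unsigned lo : 14, f : 6, hi : 12; };

  unsigned reinsert(struct bits b) {
    /* loading b.f is (word >> 14) & 63, where word is the storage unit */
    return (unsigned)b.f << 14;
  }

The whole expression is just word & 0x0FC000, so the backend faces the same
choice between masking with a small immediate before the shift and masking
with a large immediate after it.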
- -//===---------------------------------------------------------------------===// - -Take the following C code: -int f(int a, int b) { return (unsigned char)a == (unsigned char)b; } - -We generate the following IR with clang: -define i32 @f(i32 %a, i32 %b) nounwind readnone { -entry: - %tmp = xor i32 %b, %a ; <i32> [#uses=1] - %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1] - %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1] - %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1] - ret i32 %conv5 -} - -And the following x86 code: - xorl %esi, %edi - testb $-1, %dil - sete %al - movzbl %al, %eax - ret - -A cmpb instead of the xorl+testb would be one instruction shorter. - -//===---------------------------------------------------------------------===// - -Given the following C code: -int f(int a, int b) { return (signed char)a == (signed char)b; } - -We generate the following IR with clang: -define i32 @f(i32 %a, i32 %b) nounwind readnone { -entry: - %sext = shl i32 %a, 24 ; <i32> [#uses=1] - %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1] - %sext6 = shl i32 %b, 24 ; <i32> [#uses=1] - %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1] - %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1] - %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1] - ret i32 %conv5 -} - -And the following x86 code: - movsbl %sil, %eax - movsbl %dil, %ecx - cmpl %eax, %ecx - sete %al - movzbl %al, %eax - ret - - -It should be possible to eliminate the sign extensions. - -//===---------------------------------------------------------------------===// - -LLVM misses a load+store narrowing opportunity in this code: - -%struct.bf = type { i64, i16, i16, i32 } - -@bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2] - -define void @t1() nounwind ssp { -entry: - %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1] - %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1] - %2 = bitcast i16* %1 to i32* ; <i32*> [#uses=2] - %3 = load i32* %2, align 1 ; <i32> [#uses=1] - %4 = and i32 %3, -65537 ; <i32> [#uses=1] - store i32 %4, i32* %2, align 1 - %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1] - %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1] - %7 = bitcast i16* %6 to i32* ; <i32*> [#uses=2] - %8 = load i32* %7, align 1 ; <i32> [#uses=1] - %9 = and i32 %8, -131073 ; <i32> [#uses=1] - store i32 %9, i32* %7, align 1 - ret void -} - -LLVM currently emits this: - - movq bfi(%rip), %rax - andl $-65537, 8(%rax) - movq bfi(%rip), %rax - andl $-131073, 8(%rax) - ret - -It could narrow the loads and stores to emit this: - - movq bfi(%rip), %rax - andb $-2, 10(%rax) - movq bfi(%rip), %rax - andb $-3, 10(%rax) - ret - -The trouble is that there is a TokenFactor between the store and the -load, making it non-trivial to determine if there's anything between -the load and the store which would prohibit narrowing. - -//===---------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/CMakeLists.txt b/contrib/llvm/lib/Target/X86/TargetInfo/CMakeLists.txt deleted file mode 100644 index 90be9f58cc73..000000000000 --- a/contrib/llvm/lib/Target/X86/TargetInfo/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. 
) - -add_llvm_library(LLVMX86Info - X86TargetInfo.cpp - ) - -add_dependencies(LLVMX86Info X86CodeGenTable_gen) diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/Makefile b/contrib/llvm/lib/Target/X86/TargetInfo/Makefile deleted file mode 100644 index ee91982df0c8..000000000000 --- a/contrib/llvm/lib/Target/X86/TargetInfo/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/X86/TargetInfo/Makefile ------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMX86Info - -# Hack: we need to include 'main' target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/X86/X86CompilationCallback_Win64.asm b/contrib/llvm/lib/Target/X86/X86CompilationCallback_Win64.asm deleted file mode 100644 index f321778db24b..000000000000 --- a/contrib/llvm/lib/Target/X86/X86CompilationCallback_Win64.asm +++ /dev/null @@ -1,68 +0,0 @@ -;;===-- X86CompilationCallback_Win64.asm - Implement Win64 JIT callback ---=== -;; -;; The LLVM Compiler Infrastructure -;; -;; This file is distributed under the University of Illinois Open Source -;; License. See LICENSE.TXT for details. -;; -;;===----------------------------------------------------------------------=== -;; -;; This file implements the JIT interfaces for the X86 target. -;; -;;===----------------------------------------------------------------------=== - -extrn X86CompilationCallback2: PROC - -.code -X86CompilationCallback proc - push rbp - - ; Save RSP. - mov rbp, rsp - - ; Save all int arg registers - ; WARNING: We cannot use register spill area - we're generating stubs by hands! - push rcx - push rdx - push r8 - push r9 - - ; Align stack on 16-byte boundary. - and rsp, -16 - - ; Save all XMM arg registers. Also allocate reg spill area. - sub rsp, 96 - movaps [rsp +32], xmm0 - movaps [rsp+16+32], xmm1 - movaps [rsp+32+32], xmm2 - movaps [rsp+48+32], xmm3 - - ; JIT callee - - ; Pass prev frame and return address. - mov rcx, rbp - mov rdx, qword ptr [rbp+8] - call X86CompilationCallback2 - - ; Restore all XMM arg registers. - movaps xmm3, [rsp+48+32] - movaps xmm2, [rsp+32+32] - movaps xmm1, [rsp+16+32] - movaps xmm0, [rsp +32] - - ; Restore RSP. - mov rsp, rbp - - ; Restore all int arg registers - sub rsp, 32 - pop r9 - pop r8 - pop rdx - pop rcx - - ; Restore RBP. - pop rbp - ret -X86CompilationCallback endp - -End |