author     Dimitry Andric <dim@FreeBSD.org>   2010-10-11 17:22:16 +0000
committer  Dimitry Andric <dim@FreeBSD.org>   2010-10-11 17:22:16 +0000
commit     361680a51927577ccb20b14704dbaac762181843 (patch)
tree       de75a464c5dac7eceb2dbbad8b4d4e1479d79e08 /contrib/llvm/lib/Target/X86
parent     72578a23bf85ae6f14444ee38e72f9b16899bb52 (diff)
Remove more unneeded files and directories from contrib/llvm. This
still allows us to build tblgen and clang, and further reduces the
footprint in the tree.
Approved by: rpaulo (mentor)
Notes:
svn path=/head/; revision=213695
Diffstat (limited to 'contrib/llvm/lib/Target/X86')
17 files changed, 0 insertions, 3555 deletions
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/CMakeLists.txt b/contrib/llvm/lib/Target/X86/AsmParser/CMakeLists.txt deleted file mode 100644 index 40dbdd72faa1..000000000000 --- a/contrib/llvm/lib/Target/X86/AsmParser/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMX86AsmParser - X86AsmLexer.cpp - X86AsmParser.cpp - ) -add_dependencies(LLVMX86AsmParser X86CodeGenTable_gen) diff --git a/contrib/llvm/lib/Target/X86/AsmParser/Makefile b/contrib/llvm/lib/Target/X86/AsmParser/Makefile deleted file mode 100644 index fb9760796622..000000000000 --- a/contrib/llvm/lib/Target/X86/AsmParser/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/X86/AsmParser/Makefile -------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMX86AsmParser - -# Hack: we need to include 'main' x86 target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/X86/AsmPrinter/CMakeLists.txt b/contrib/llvm/lib/Target/X86/AsmPrinter/CMakeLists.txt deleted file mode 100644 index 033973eeeff9..000000000000 --- a/contrib/llvm/lib/Target/X86/AsmPrinter/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMX86AsmPrinter - X86ATTInstPrinter.cpp - X86IntelInstPrinter.cpp - X86InstComments.cpp - ) -add_dependencies(LLVMX86AsmPrinter X86CodeGenTable_gen) diff --git a/contrib/llvm/lib/Target/X86/AsmPrinter/Makefile b/contrib/llvm/lib/Target/X86/AsmPrinter/Makefile deleted file mode 100644 index c82aa330a20c..000000000000 --- a/contrib/llvm/lib/Target/X86/AsmPrinter/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/X86/AsmPrinter/Makefile ------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMX86AsmPrinter - -# Hack: we need to include 'main' x86 target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/X86/CMakeLists.txt b/contrib/llvm/lib/Target/X86/CMakeLists.txt deleted file mode 100644 index e9399f5c8322..000000000000 --- a/contrib/llvm/lib/Target/X86/CMakeLists.txt +++ /dev/null @@ -1,52 +0,0 @@ -set(LLVM_TARGET_DEFINITIONS X86.td) - -tablegen(X86GenRegisterInfo.h.inc -gen-register-desc-header) -tablegen(X86GenRegisterNames.inc -gen-register-enums) -tablegen(X86GenRegisterInfo.inc -gen-register-desc) -tablegen(X86GenDisassemblerTables.inc -gen-disassembler) -tablegen(X86GenInstrNames.inc -gen-instr-enums) -tablegen(X86GenInstrInfo.inc -gen-instr-desc) -tablegen(X86GenAsmWriter.inc -gen-asm-writer) -tablegen(X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) -tablegen(X86GenAsmMatcher.inc -gen-asm-matcher) -tablegen(X86GenDAGISel.inc -gen-dag-isel) -tablegen(X86GenFastISel.inc -gen-fast-isel) -tablegen(X86GenCallingConv.inc -gen-callingconv) -tablegen(X86GenSubtarget.inc -gen-subtarget) -tablegen(X86GenEDInfo.inc -gen-enhanced-disassembly-info) - -set(sources - SSEDomainFix.cpp - X86AsmBackend.cpp - X86AsmPrinter.cpp - X86COFFMachineModuleInfo.cpp - X86CodeEmitter.cpp - X86ELFWriterInfo.cpp - X86FastISel.cpp - X86FloatingPoint.cpp - X86ISelDAGToDAG.cpp - X86ISelLowering.cpp - X86InstrInfo.cpp - X86JITInfo.cpp - X86MCAsmInfo.cpp - X86MCCodeEmitter.cpp - X86MCInstLower.cpp - X86RegisterInfo.cpp - X86SelectionDAGInfo.cpp - X86Subtarget.cpp - X86TargetMachine.cpp - X86TargetObjectFile.cpp - ) - -if( CMAKE_CL_64 ) - enable_language(ASM_MASM) - ADD_CUSTOM_COMMAND( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj - COMMAND ${CMAKE_ASM_MASM_COMPILER} /Fo ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj /c ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm - ) - set(sources ${sources} ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj) -endif() - -add_llvm_target(X86CodeGen ${sources}) - diff --git a/contrib/llvm/lib/Target/X86/Disassembler/CMakeLists.txt b/contrib/llvm/lib/Target/X86/Disassembler/CMakeLists.txt deleted file mode 100644 index 97589c00515b..000000000000 --- a/contrib/llvm/lib/Target/X86/Disassembler/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMX86Disassembler - X86Disassembler.cpp - X86DisassemblerDecoder.c - ) -# workaround for hanging compilation on MSVC9 and 10 -if( MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 ) -set_property( - SOURCE X86Disassembler.cpp - PROPERTY COMPILE_FLAGS "/Od" - ) -endif() -add_dependencies(LLVMX86Disassembler X86CodeGenTable_gen) diff --git a/contrib/llvm/lib/Target/X86/Disassembler/Makefile b/contrib/llvm/lib/Target/X86/Disassembler/Makefile deleted file mode 100644 index 8669fd8fd930..000000000000 --- a/contrib/llvm/lib/Target/X86/Disassembler/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/X86/Disassembler/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMX86Disassembler - -# Hack: we need to include 'main' x86 target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/X86/Makefile b/contrib/llvm/lib/Target/X86/Makefile deleted file mode 100644 index f4ff894a2af7..000000000000 --- a/contrib/llvm/lib/Target/X86/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -##===- lib/Target/X86/Makefile -----------------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../.. -LIBRARYNAME = LLVMX86CodeGen -TARGET = X86 - -# Make sure that tblgen is run, first thing. -BUILT_SOURCES = X86GenRegisterInfo.h.inc X86GenRegisterNames.inc \ - X86GenRegisterInfo.inc X86GenInstrNames.inc \ - X86GenInstrInfo.inc X86GenAsmWriter.inc X86GenAsmMatcher.inc \ - X86GenAsmWriter1.inc X86GenDAGISel.inc \ - X86GenDisassemblerTables.inc X86GenFastISel.inc \ - X86GenCallingConv.inc X86GenSubtarget.inc \ - X86GenEDInfo.inc - -DIRS = AsmPrinter AsmParser Disassembler TargetInfo - -include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/X86/README-FPStack.txt b/contrib/llvm/lib/Target/X86/README-FPStack.txt deleted file mode 100644 index 39efd2dbcf1a..000000000000 --- a/contrib/llvm/lib/Target/X86/README-FPStack.txt +++ /dev/null @@ -1,85 +0,0 @@ -//===---------------------------------------------------------------------===// -// Random ideas for the X86 backend: FP stack related stuff -//===---------------------------------------------------------------------===// - -//===---------------------------------------------------------------------===// - -Some targets (e.g. athlons) prefer freep to fstp ST(0): -http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html - -//===---------------------------------------------------------------------===// - -This should use fiadd on chips where it is profitable: -double foo(double P, int *I) { return P+*I; } - -We have fiadd patterns now but the followings have the same cost and -complexity. We need a way to specify the later is more profitable. - -def FpADD32m : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP:$dst, (fadd RFP:$src1, - (extloadf64f32 addr:$src2)))]>; - // ST(0) = ST(0) + [mem32] - -def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW, - [(set RFP:$dst, (fadd RFP:$src1, - (X86fild addr:$src2, i32)))]>; - // ST(0) = ST(0) + [mem32int] - -//===---------------------------------------------------------------------===// - -The FP stackifier should handle simple permutates to reduce number of shuffle -instructions, e.g. turning: - -fld P -> fld Q -fld Q fld P -fxch - -or: - -fxch -> fucomi -fucomi jl X -jg X - -Ideas: -http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html - - -//===---------------------------------------------------------------------===// - -Add a target specific hook to DAG combiner to handle SINT_TO_FP and -FP_TO_SINT when the source operand is already in memory. - -//===---------------------------------------------------------------------===// - -Open code rint,floor,ceil,trunc: -http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html -http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html - -Opencode the sincos[f] libcall. - -//===---------------------------------------------------------------------===// - -None of the FPStack instructions are handled in -X86RegisterInfo::foldMemoryOperand, which prevents the spiller from -folding spill code into the instructions. 
- -//===---------------------------------------------------------------------===// - -Currently the x86 codegen isn't very good at mixing SSE and FPStack -code: - -unsigned int foo(double x) { return x; } - -foo: - subl $20, %esp - movsd 24(%esp), %xmm0 - movsd %xmm0, 8(%esp) - fldl 8(%esp) - fisttpll (%esp) - movl (%esp), %eax - addl $20, %esp - ret - -This just requires being smarter when custom expanding fptoui. - -//===---------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/README-MMX.txt b/contrib/llvm/lib/Target/X86/README-MMX.txt deleted file mode 100644 index a6c8616b6d2c..000000000000 --- a/contrib/llvm/lib/Target/X86/README-MMX.txt +++ /dev/null @@ -1,71 +0,0 @@ -//===---------------------------------------------------------------------===// -// Random ideas for the X86 backend: MMX-specific stuff. -//===---------------------------------------------------------------------===// - -//===---------------------------------------------------------------------===// - -This: - -#include <mmintrin.h> - -__v2si qux(int A) { - return (__v2si){ 0, A }; -} - -is compiled into: - -_qux: - subl $28, %esp - movl 32(%esp), %eax - movd %eax, %mm0 - movq %mm0, (%esp) - movl (%esp), %eax - movl %eax, 20(%esp) - movq %mm0, 8(%esp) - movl 12(%esp), %eax - movl %eax, 16(%esp) - movq 16(%esp), %mm0 - addl $28, %esp - ret - -Yuck! - -GCC gives us: - -_qux: - subl $12, %esp - movl 16(%esp), %eax - movl 20(%esp), %edx - movl $0, (%eax) - movl %edx, 4(%eax) - addl $12, %esp - ret $4 - -//===---------------------------------------------------------------------===// - -We generate crappy code for this: - -__m64 t() { - return _mm_cvtsi32_si64(1); -} - -_t: - subl $12, %esp - movl $1, %eax - movd %eax, %mm0 - movq %mm0, (%esp) - movl (%esp), %eax - movl 4(%esp), %edx - addl $12, %esp - ret - -The extra stack traffic is covered in the previous entry. But the other reason -is we are not smart about materializing constants in MMX registers. With -m64 - - movl $1, %eax - movd %eax, %mm0 - movd %mm0, %rax - ret - -We should be using a constantpool load instead: - movq LC0(%rip), %rax diff --git a/contrib/llvm/lib/Target/X86/README-SSE.txt b/contrib/llvm/lib/Target/X86/README-SSE.txt deleted file mode 100644 index f96b22f1e204..000000000000 --- a/contrib/llvm/lib/Target/X86/README-SSE.txt +++ /dev/null @@ -1,907 +0,0 @@ -//===---------------------------------------------------------------------===// -// Random ideas for the X86 backend: SSE-specific stuff. -//===---------------------------------------------------------------------===// - -//===---------------------------------------------------------------------===// - -SSE Variable shift can be custom lowered to something like this, which uses a -small table + unaligned load + shuffle instead of going through memory. - -__m128i_shift_right: - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - -... -__m128i shift_right(__m128i value, unsigned long offset) { - return _mm_shuffle_epi8(value, - _mm_loadu_si128((__m128 *) (___m128i_shift_right + offset))); -} - -//===---------------------------------------------------------------------===// - -SSE has instructions for doing operations on complex numbers, we should pattern -match them. 
Compiling this: - -_Complex float f32(_Complex float A, _Complex float B) { - return A+B; -} - -into: - -_f32: - movdqa %xmm0, %xmm2 - addss %xmm1, %xmm2 - pshufd $16, %xmm2, %xmm2 - pshufd $1, %xmm1, %xmm1 - pshufd $1, %xmm0, %xmm0 - addss %xmm1, %xmm0 - pshufd $16, %xmm0, %xmm1 - movdqa %xmm2, %xmm0 - unpcklps %xmm1, %xmm0 - ret - -seems silly. - - -//===---------------------------------------------------------------------===// - -Expand libm rounding functions inline: Significant speedups possible. -http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html - -//===---------------------------------------------------------------------===// - -When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and -other fast SSE modes. - -//===---------------------------------------------------------------------===// - -Think about doing i64 math in SSE regs on x86-32. - -//===---------------------------------------------------------------------===// - -This testcase should have no SSE instructions in it, and only one load from -a constant pool: - -double %test3(bool %B) { - %C = select bool %B, double 123.412, double 523.01123123 - ret double %C -} - -Currently, the select is being lowered, which prevents the dag combiner from -turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)' - -The pattern isel got this one right. - -//===---------------------------------------------------------------------===// - -SSE should implement 'select_cc' using 'emulated conditional moves' that use -pcmp/pand/pandn/por to do a selection instead of a conditional branch: - -double %X(double %Y, double %Z, double %A, double %B) { - %C = setlt double %A, %B - %z = fadd double %Z, 0.0 ;; select operand is not a load - %D = select bool %C, double %Y, double %z - ret double %D -} - -We currently emit: - -_X: - subl $12, %esp - xorpd %xmm0, %xmm0 - addsd 24(%esp), %xmm0 - movsd 32(%esp), %xmm1 - movsd 16(%esp), %xmm2 - ucomisd 40(%esp), %xmm1 - jb LBB_X_2 -LBB_X_1: - movsd %xmm0, %xmm2 -LBB_X_2: - movsd %xmm2, (%esp) - fldl (%esp) - addl $12, %esp - ret - -//===---------------------------------------------------------------------===// - -Lower memcpy / memset to a series of SSE 128 bit move instructions when it's -feasible. - -//===---------------------------------------------------------------------===// - -Codegen: - if (copysign(1.0, x) == copysign(1.0, y)) -into: - if (x^y & mask) -when using SSE. - -//===---------------------------------------------------------------------===// - -Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half -of a v4sf value. - -//===---------------------------------------------------------------------===// - -Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}. -Perhaps use pxor / xorp* to clear a XMM register first? - -//===---------------------------------------------------------------------===// - -External test Nurbs exposed some problems. Look for -__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. 
This is what icc -emits: - - movaps (%edx), %xmm2 #59.21 - movaps (%edx), %xmm5 #60.21 - movaps (%edx), %xmm4 #61.21 - movaps (%edx), %xmm3 #62.21 - movl 40(%ecx), %ebp #69.49 - shufps $0, %xmm2, %xmm5 #60.21 - movl 100(%esp), %ebx #69.20 - movl (%ebx), %edi #69.20 - imull %ebp, %edi #69.49 - addl (%eax), %edi #70.33 - shufps $85, %xmm2, %xmm4 #61.21 - shufps $170, %xmm2, %xmm3 #62.21 - shufps $255, %xmm2, %xmm2 #63.21 - lea (%ebp,%ebp,2), %ebx #69.49 - negl %ebx #69.49 - lea -3(%edi,%ebx), %ebx #70.33 - shll $4, %ebx #68.37 - addl 32(%ecx), %ebx #68.37 - testb $15, %bl #91.13 - jne L_B1.24 # Prob 5% #91.13 - -This is the llvm code after instruction scheduling: - -cond_next140 (0xa910740, LLVM BB @0xa90beb0): - %reg1078 = MOV32ri -3 - %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0 - %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40 - %reg1080 = IMUL32rr %reg1079, %reg1037 - %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0 - %reg1038 = LEA32r %reg1081, 1, %reg1080, -3 - %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32 - %reg1082 = SHL32ri %reg1038, 4 - %reg1039 = ADD32rr %reg1036, %reg1082 - %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0 - %reg1034 = SHUFPSrr %reg1083, %reg1083, 170 - %reg1032 = SHUFPSrr %reg1083, %reg1083, 0 - %reg1035 = SHUFPSrr %reg1083, %reg1083, 255 - %reg1033 = SHUFPSrr %reg1083, %reg1083, 85 - %reg1040 = MOV32rr %reg1039 - %reg1084 = AND32ri8 %reg1039, 15 - CMP32ri8 %reg1084, 0 - JE mbb<cond_next204,0xa914d30> - -Still ok. After register allocation: - -cond_next140 (0xa910740, LLVM BB @0xa90beb0): - %EAX = MOV32ri -3 - %EDX = MOV32rm <fi#3>, 1, %NOREG, 0 - ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0 - %EDX = MOV32rm <fi#7>, 1, %NOREG, 0 - %EDX = MOV32rm %EDX, 1, %NOREG, 40 - IMUL32rr %EAX<def&use>, %EDX - %ESI = MOV32rm <fi#5>, 1, %NOREG, 0 - %ESI = MOV32rm %ESI, 1, %NOREG, 0 - MOV32mr <fi#4>, 1, %NOREG, 0, %ESI - %EAX = LEA32r %ESI, 1, %EAX, -3 - %ESI = MOV32rm <fi#7>, 1, %NOREG, 0 - %ESI = MOV32rm %ESI, 1, %NOREG, 32 - %EDI = MOV32rr %EAX - SHL32ri %EDI<def&use>, 4 - ADD32rr %EDI<def&use>, %ESI - %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0 - %XMM1 = MOVAPSrr %XMM0 - SHUFPSrr %XMM1<def&use>, %XMM1, 170 - %XMM2 = MOVAPSrr %XMM0 - SHUFPSrr %XMM2<def&use>, %XMM2, 0 - %XMM3 = MOVAPSrr %XMM0 - SHUFPSrr %XMM3<def&use>, %XMM3, 255 - SHUFPSrr %XMM0<def&use>, %XMM0, 85 - %EBX = MOV32rr %EDI - AND32ri8 %EBX<def&use>, 15 - CMP32ri8 %EBX, 0 - JE mbb<cond_next204,0xa914d30> - -This looks really bad. The problem is shufps is a destructive opcode. Since it -appears as operand two in more than one shufps ops. It resulted in a number of -copies. Note icc also suffers from the same problem. Either the instruction -selector should select pshufd or The register allocator can made the two-address -to three-address transformation. - -It also exposes some other problems. See MOV32ri -3 and the spills. - -//===---------------------------------------------------------------------===// - -Consider: - -__m128 test(float a) { - return _mm_set_ps(0.0, 0.0, 0.0, a*a); -} - -This compiles into: - -movss 4(%esp), %xmm1 -mulss %xmm1, %xmm1 -xorps %xmm0, %xmm0 -movss %xmm1, %xmm0 -ret - -Because mulss doesn't modify the top 3 elements, the top elements of -xmm1 are already zero'd. We could compile this to: - -movss 4(%esp), %xmm0 -mulss %xmm0, %xmm0 -ret - -//===---------------------------------------------------------------------===// - -Here's a sick and twisted idea. Consider code like this: - -__m128 test(__m128 a) { - float b = *(float*)&A; - ... 
- return _mm_set_ps(0.0, 0.0, 0.0, b); -} - -This might compile to this code: - -movaps c(%esp), %xmm1 -xorps %xmm0, %xmm0 -movss %xmm1, %xmm0 -ret - -Now consider if the ... code caused xmm1 to get spilled. This might produce -this code: - -movaps c(%esp), %xmm1 -movaps %xmm1, c2(%esp) -... - -xorps %xmm0, %xmm0 -movaps c2(%esp), %xmm1 -movss %xmm1, %xmm0 -ret - -However, since the reload is only used by these instructions, we could -"fold" it into the uses, producing something like this: - -movaps c(%esp), %xmm1 -movaps %xmm1, c2(%esp) -... - -movss c2(%esp), %xmm0 -ret - -... saving two instructions. - -The basic idea is that a reload from a spill slot, can, if only one 4-byte -chunk is used, bring in 3 zeros the one element instead of 4 elements. -This can be used to simplify a variety of shuffle operations, where the -elements are fixed zeros. - -//===---------------------------------------------------------------------===// - -This code generates ugly code, probably due to costs being off or something: - -define void @test(float* %P, <4 x float>* %P2 ) { - %xFloat0.688 = load float* %P - %tmp = load <4 x float>* %P2 - %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3 - store <4 x float> %inFloat3.713, <4 x float>* %P2 - ret void -} - -Generates: - -_test: - movl 8(%esp), %eax - movaps (%eax), %xmm0 - pxor %xmm1, %xmm1 - movaps %xmm0, %xmm2 - shufps $50, %xmm1, %xmm2 - shufps $132, %xmm2, %xmm0 - movaps %xmm0, (%eax) - ret - -Would it be better to generate: - -_test: - movl 8(%esp), %ecx - movaps (%ecx), %xmm0 - xor %eax, %eax - pinsrw $6, %eax, %xmm0 - pinsrw $7, %eax, %xmm0 - movaps %xmm0, (%ecx) - ret - -? - -//===---------------------------------------------------------------------===// - -Some useful information in the Apple Altivec / SSE Migration Guide: - -http://developer.apple.com/documentation/Performance/Conceptual/ -Accelerate_sse_migration/index.html - -e.g. SSE select using and, andnot, or. Various SSE compare translations. - -//===---------------------------------------------------------------------===// - -Add hooks to commute some CMPP operations. - -//===---------------------------------------------------------------------===// - -Apply the same transformation that merged four float into a single 128-bit load -to loads from constant pool. - -//===---------------------------------------------------------------------===// - -Floating point max / min are commutable when -enable-unsafe-fp-path is -specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other -nodes which are selected to max / min instructions that are marked commutable. - -//===---------------------------------------------------------------------===// - -We should materialize vector constants like "all ones" and "signbit" with -code like: - - cmpeqps xmm1, xmm1 ; xmm1 = all-ones - -and: - cmpeqps xmm1, xmm1 ; xmm1 = all-ones - psrlq xmm1, 31 ; xmm1 = all 100000000000... - -instead of using a load from the constant pool. The later is important for -ABS/NEG/copysign etc. 
- -//===---------------------------------------------------------------------===// - -These functions: - -#include <xmmintrin.h> -__m128i a; -void x(unsigned short n) { - a = _mm_slli_epi32 (a, n); -} -void y(unsigned n) { - a = _mm_slli_epi32 (a, n); -} - -compile to ( -O3 -static -fomit-frame-pointer): -_x: - movzwl 4(%esp), %eax - movd %eax, %xmm0 - movaps _a, %xmm1 - pslld %xmm0, %xmm1 - movaps %xmm1, _a - ret -_y: - movd 4(%esp), %xmm0 - movaps _a, %xmm1 - pslld %xmm0, %xmm1 - movaps %xmm1, _a - ret - -"y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems -like movd would be sufficient in both cases as the value is already zero -extended in the 32-bit stack slot IIRC. For signed short, it should also be -save, as a really-signed value would be undefined for pslld. - - -//===---------------------------------------------------------------------===// - -#include <math.h> -int t1(double d) { return signbit(d); } - -This currently compiles to: - subl $12, %esp - movsd 16(%esp), %xmm0 - movsd %xmm0, (%esp) - movl 4(%esp), %eax - shrl $31, %eax - addl $12, %esp - ret - -We should use movmskp{s|d} instead. - -//===---------------------------------------------------------------------===// - -CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single -(aligned) vector load. This functionality has a couple of problems. - -1. The code to infer alignment from loads of globals is in the X86 backend, - not the dag combiner. This is because dagcombine2 needs to be able to see - through the X86ISD::Wrapper node, which DAGCombine can't really do. -2. The code for turning 4 x load into a single vector load is target - independent and should be moved to the dag combiner. -3. The code for turning 4 x load into a vector load can only handle a direct - load from a global or a direct load from the stack. It should be generalized - to handle any load from P, P+4, P+8, P+12, where P can be anything. -4. The alignment inference code cannot handle loads from globals in non-static - mode because it doesn't look through the extra dyld stub load. If you try - vec_align.ll without -relocation-model=static, you'll see what I mean. - -//===---------------------------------------------------------------------===// - -We should lower store(fneg(load p), q) into an integer load+xor+store, which -eliminates a constant pool load. For example, consider: - -define i64 @ccosf(float %z.0, float %z.1) nounwind readonly { -entry: - %tmp6 = fsub float -0.000000e+00, %z.1 ; <float> [#uses=1] - %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly - ret i64 %tmp20 -} -declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly - -This currently compiles to: - -LCPI1_0: # <4 x float> - .long 2147483648 # float -0 - .long 2147483648 # float -0 - .long 2147483648 # float -0 - .long 2147483648 # float -0 -_ccosf: - subl $12, %esp - movss 16(%esp), %xmm0 - movss %xmm0, 4(%esp) - movss 20(%esp), %xmm0 - xorps LCPI1_0, %xmm0 - movss %xmm0, (%esp) - call L_ccoshf$stub - addl $12, %esp - ret - -Note the load into xmm0, then xor (to negate), then store. In PIC mode, -this code computes the pic base and does two loads to do the constant pool -load, so the improvement is much bigger. - -The tricky part about this xform is that the argument load/store isn't exposed -until post-legalize, and at that point, the fneg has been custom expanded into -an X86 fxor. This means that we need to handle this case in the x86 backend -instead of in target independent code. 
- -//===---------------------------------------------------------------------===// - -Non-SSE4 insert into 16 x i8 is atrociously bad. - -//===---------------------------------------------------------------------===// - -<2 x i64> extract is substantially worse than <2 x f64>, even if the destination -is memory. - -//===---------------------------------------------------------------------===// - -SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext -sitting between the truncate and the extract. - -//===---------------------------------------------------------------------===// - -INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert -any number of 0.0 simultaneously. Currently we only use it for simple -insertions. - -See comments in LowerINSERT_VECTOR_ELT_SSE4. - -//===---------------------------------------------------------------------===// - -On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not -Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are -legal, it'll just take a few extra patterns written in the .td file. - -Note: this is not a code quality issue; the custom lowered code happens to be -right, but we shouldn't have to custom lower anything. This is probably related -to <2 x i64> ops being so bad. - -//===---------------------------------------------------------------------===// - -'select' on vectors and scalars could be a whole lot better. We currently -lower them to conditional branches. On x86-64 for example, we compile this: - -double test(double a, double b, double c, double d) { return a<b ? c : d; } - -to: - -_test: - ucomisd %xmm0, %xmm1 - ja LBB1_2 # entry -LBB1_1: # entry - movapd %xmm3, %xmm2 -LBB1_2: # entry - movapd %xmm2, %xmm0 - ret - -instead of: - -_test: - cmpltsd %xmm1, %xmm0 - andpd %xmm0, %xmm2 - andnpd %xmm3, %xmm0 - orpd %xmm2, %xmm0 - ret - -For unpredictable branches, the later is much more efficient. This should -just be a matter of having scalar sse map to SELECT_CC and custom expanding -or iseling it. - -//===---------------------------------------------------------------------===// - -LLVM currently generates stack realignment code, when it is not necessary -needed. The problem is that we need to know about stack alignment too early, -before RA runs. - -At that point we don't know, whether there will be vector spill, or not. -Stack realignment logic is overly conservative here, but otherwise we can -produce unaligned loads/stores. - -Fixing this will require some huge RA changes. - -Testcase: -#include <emmintrin.h> - -typedef short vSInt16 __attribute__ ((__vector_size__ (16))); - -static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873, -- 22725, - 12873};; - -vSInt16 madd(vSInt16 b) -{ - return _mm_madd_epi16(a, b); -} - -Generated code (x86-32, linux): -madd: - pushl %ebp - movl %esp, %ebp - andl $-16, %esp - movaps .LCPI1_0, %xmm1 - pmaddwd %xmm1, %xmm0 - movl %ebp, %esp - popl %ebp - ret - -//===---------------------------------------------------------------------===// - -Consider: -#include <emmintrin.h> -__m128 foo2 (float x) { - return _mm_set_ps (0, 0, x, 0); -} - -In x86-32 mode, we generate this spiffy code: - -_foo2: - movss 4(%esp), %xmm0 - pshufd $81, %xmm0, %xmm0 - ret - -in x86-64 mode, we generate this code, which could be better: - -_foo2: - xorps %xmm1, %xmm1 - movss %xmm0, %xmm1 - pshufd $81, %xmm1, %xmm0 - ret - -In sse4 mode, we could use insertps to make both better. 
- -Here's another testcase that could use insertps [mem]: - -#include <xmmintrin.h> -extern float x2, x3; -__m128 foo1 (float x1, float x4) { - return _mm_set_ps (x2, x1, x3, x4); -} - -gcc mainline compiles it to: - -foo1: - insertps $0x10, x2(%rip), %xmm0 - insertps $0x10, x3(%rip), %xmm1 - movaps %xmm1, %xmm2 - movlhps %xmm0, %xmm2 - movaps %xmm2, %xmm0 - ret - -//===---------------------------------------------------------------------===// - -We compile vector multiply-by-constant into poor code: - -define <4 x i32> @f(<4 x i32> %i) nounwind { - %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 > - ret <4 x i32> %A -} - -On targets without SSE4.1, this compiles into: - -LCPI1_0: ## <4 x i32> - .long 10 - .long 10 - .long 10 - .long 10 - .text - .align 4,0x90 - .globl _f -_f: - pshufd $3, %xmm0, %xmm1 - movd %xmm1, %eax - imull LCPI1_0+12, %eax - movd %eax, %xmm1 - pshufd $1, %xmm0, %xmm2 - movd %xmm2, %eax - imull LCPI1_0+4, %eax - movd %eax, %xmm2 - punpckldq %xmm1, %xmm2 - movd %xmm0, %eax - imull LCPI1_0, %eax - movd %eax, %xmm1 - movhlps %xmm0, %xmm0 - movd %xmm0, %eax - imull LCPI1_0+8, %eax - movd %eax, %xmm0 - punpckldq %xmm0, %xmm1 - movaps %xmm1, %xmm0 - punpckldq %xmm2, %xmm0 - ret - -It would be better to synthesize integer vector multiplication by constants -using shifts and adds, pslld and paddd here. And even on targets with SSE4.1, -simple cases such as multiplication by powers of two would be better as -vector shifts than as multiplications. - -//===---------------------------------------------------------------------===// - -We compile this: - -__m128i -foo2 (char x) -{ - return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0); -} - -into: - movl $1, %eax - xorps %xmm0, %xmm0 - pinsrw $2, %eax, %xmm0 - movzbl 4(%esp), %eax - pinsrw $3, %eax, %xmm0 - movl $256, %eax - pinsrw $7, %eax, %xmm0 - ret - - -gcc-4.2: - subl $12, %esp - movzbl 16(%esp), %eax - movdqa LC0, %xmm0 - pinsrw $3, %eax, %xmm0 - addl $12, %esp - ret - .const - .align 4 -LC0: - .word 0 - .word 0 - .word 1 - .word 0 - .word 0 - .word 0 - .word 0 - .word 256 - -With SSE4, it should be - movdqa .LC0(%rip), %xmm0 - pinsrb $6, %edi, %xmm0 - -//===---------------------------------------------------------------------===// - -We should transform a shuffle of two vectors of constants into a single vector -of constants. Also, insertelement of a constant into a vector of constants -should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll. - -We compiled it to something horrible: - - .align 4 -LCPI1_1: ## float - .long 1065353216 ## float 1 - .const - - .align 4 -LCPI1_0: ## <4 x float> - .space 4 - .long 1065353216 ## float 1 - .space 4 - .long 1065353216 ## float 1 - .text - .align 4,0x90 - .globl _t -_t: - xorps %xmm0, %xmm0 - movhps LCPI1_0, %xmm0 - movss LCPI1_1, %xmm1 - movaps %xmm0, %xmm2 - shufps $2, %xmm1, %xmm2 - shufps $132, %xmm2, %xmm0 - movaps %xmm0, 0 - -//===---------------------------------------------------------------------===// -rdar://5907648 - -This function: - -float foo(unsigned char x) { - return x; -} - -compiles to (x86-32): - -define float @foo(i8 zeroext %x) nounwind { - %tmp12 = uitofp i8 %x to float ; <float> [#uses=1] - ret float %tmp12 -} - -compiles to: - -_foo: - subl $4, %esp - movzbl 8(%esp), %eax - cvtsi2ss %eax, %xmm0 - movss %xmm0, (%esp) - flds (%esp) - addl $4, %esp - ret - -We should be able to use: - cvtsi2ss 8($esp), %xmm0 -since we know the stack slot is already zext'd. 
- -//===---------------------------------------------------------------------===// - -Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64)) -when code size is critical. movlps is slower than movsd on core2 but it's one -byte shorter. - -//===---------------------------------------------------------------------===// - -We should use a dynamic programming based approach to tell when using FPStack -operations is cheaper than SSE. SciMark montecarlo contains code like this -for example: - -double MonteCarlo_num_flops(int Num_samples) { - return ((double) Num_samples)* 4.0; -} - -In fpstack mode, this compiles into: - -LCPI1_0: - .long 1082130432 ## float 4.000000e+00 -_MonteCarlo_num_flops: - subl $4, %esp - movl 8(%esp), %eax - movl %eax, (%esp) - fildl (%esp) - fmuls LCPI1_0 - addl $4, %esp - ret - -in SSE mode, it compiles into significantly slower code: - -_MonteCarlo_num_flops: - subl $12, %esp - cvtsi2sd 16(%esp), %xmm0 - mulsd LCPI1_0, %xmm0 - movsd %xmm0, (%esp) - fldl (%esp) - addl $12, %esp - ret - -There are also other cases in scimark where using fpstack is better, it is -cheaper to do fld1 than load from a constant pool for example, so -"load, add 1.0, store" is better done in the fp stack, etc. - -//===---------------------------------------------------------------------===// - -The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to -"cmpsd". For example, this code: - -double d1(double x) { return x == x ? x : x + x; } - -Compiles into: - -_d1: - ucomisd %xmm0, %xmm0 - jnp LBB1_2 - addsd %xmm0, %xmm0 - ret -LBB1_2: - ret - -Also, the 'ret's should be shared. This is PR6032. - -//===---------------------------------------------------------------------===// - -These should compile into the same code (PR6214): Perhaps instcombine should -canonicalize the former into the later? - -define float @foo(float %x) nounwind { - %t = bitcast float %x to i32 - %s = and i32 %t, 2147483647 - %d = bitcast i32 %s to float - ret float %d -} - -declare float @fabsf(float %n) -define float @bar(float %x) nounwind { - %d = call float @fabsf(float %x) - ret float %d -} - -//===---------------------------------------------------------------------===// - -This IR (from PR6194): - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-apple-darwin10.0.0" - -%0 = type { double, double } -%struct.float3 = type { float, float, float } - -define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp { -entry: - %tmp18 = extractvalue %0 %0, 0 ; <double> [#uses=1] - %tmp19 = bitcast double %tmp18 to i64 ; <i64> [#uses=1] - %tmp20 = zext i64 %tmp19 to i128 ; <i128> [#uses=1] - %tmp10 = lshr i128 %tmp20, 32 ; <i128> [#uses=1] - %tmp11 = trunc i128 %tmp10 to i32 ; <i32> [#uses=1] - %tmp12 = bitcast i32 %tmp11 to float ; <float> [#uses=1] - %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1] - store float %tmp12, float* %tmp5 - ret void -} - -Compiles to: - -_test: ## @test - movd %xmm0, %rax - shrq $32, %rax - movl %eax, 4(%rdi) - ret - -This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and -doing a shuffle from v[1] to v[0] then a float store. 
- -//===---------------------------------------------------------------------===// - -On SSE4 machines, we compile this code: - -define <2 x float> @test2(<2 x float> %Q, <2 x float> %R, - <2 x float> *%P) nounwind { - %Z = fadd <2 x float> %Q, %R - - store <2 x float> %Z, <2 x float> *%P - ret <2 x float> %Z -} - -into: - -_test2: ## @test2 -## BB#0: - insertps $0, %xmm2, %xmm2 - insertps $16, %xmm3, %xmm2 - insertps $0, %xmm0, %xmm3 - insertps $16, %xmm1, %xmm3 - addps %xmm2, %xmm3 - movq %xmm3, (%rdi) - movaps %xmm3, %xmm0 - pshufd $1, %xmm3, %xmm1 - ## kill: XMM1<def> XMM1<kill> - ret - -The insertps's of $0 are pointless complex copies. - -//===---------------------------------------------------------------------===// - - diff --git a/contrib/llvm/lib/Target/X86/README-UNIMPLEMENTED.txt b/contrib/llvm/lib/Target/X86/README-UNIMPLEMENTED.txt deleted file mode 100644 index c26c75ab951c..000000000000 --- a/contrib/llvm/lib/Target/X86/README-UNIMPLEMENTED.txt +++ /dev/null @@ -1,14 +0,0 @@ -//===---------------------------------------------------------------------===// -// Testcases that crash the X86 backend because they aren't implemented -//===---------------------------------------------------------------------===// - -These are cases we know the X86 backend doesn't handle. Patches are welcome -and appreciated, because no one has signed up to implemented these yet. -Implementing these would allow elimination of the corresponding intrinsics, -which would be great. - -1) vector shifts -2) vector comparisons -3) vector fp<->int conversions: PR2683, PR2684, PR2685, PR2686, PR2688 -4) bitcasts from vectors to scalars: PR2804 -5) llvm.atomic.cmp.swap.i128.p0i128: PR3462 diff --git a/contrib/llvm/lib/Target/X86/README-X86-64.txt b/contrib/llvm/lib/Target/X86/README-X86-64.txt deleted file mode 100644 index 78c4dc00ee72..000000000000 --- a/contrib/llvm/lib/Target/X86/README-X86-64.txt +++ /dev/null @@ -1,273 +0,0 @@ -//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===// - -AMD64 Optimization Manual 8.2 has some nice information about optimizing integer -multiplication by a constant. How much of it applies to Intel's X86-64 -implementation? There are definite trade-offs to consider: latency vs. register -pressure vs. code size. - -//===---------------------------------------------------------------------===// - -Are we better off using branches instead of cmove to implement FP to -unsigned i64? - -_conv: - ucomiss LC0(%rip), %xmm0 - cvttss2siq %xmm0, %rdx - jb L3 - subss LC0(%rip), %xmm0 - movabsq $-9223372036854775808, %rax - cvttss2siq %xmm0, %rdx - xorq %rax, %rdx -L3: - movq %rdx, %rax - ret - -instead of - -_conv: - movss LCPI1_0(%rip), %xmm1 - cvttss2siq %xmm0, %rcx - movaps %xmm0, %xmm2 - subss %xmm1, %xmm2 - cvttss2siq %xmm2, %rax - movabsq $-9223372036854775808, %rdx - xorq %rdx, %rax - ucomiss %xmm1, %xmm0 - cmovb %rcx, %rax - ret - -Seems like the jb branch has high likelyhood of being taken. It would have -saved a few instructions. 
- -//===---------------------------------------------------------------------===// - -Poor codegen: - -int X[2]; -int b; -void test(void) { - memset(X, b, 2*sizeof(X[0])); -} - -llc: - movq _b@GOTPCREL(%rip), %rax - movzbq (%rax), %rax - movq %rax, %rcx - shlq $8, %rcx - orq %rax, %rcx - movq %rcx, %rax - shlq $16, %rax - orq %rcx, %rax - movq %rax, %rcx - shlq $32, %rcx - movq _X@GOTPCREL(%rip), %rdx - orq %rax, %rcx - movq %rcx, (%rdx) - ret - -gcc: - movq _b@GOTPCREL(%rip), %rax - movabsq $72340172838076673, %rdx - movzbq (%rax), %rax - imulq %rdx, %rax - movq _X@GOTPCREL(%rip), %rdx - movq %rax, (%rdx) - ret - -And the codegen is even worse for the following -(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33103): - void fill1(char *s, int a) - { - __builtin_memset(s, a, 15); - } - -For this version, we duplicate the computation of the constant to store. - -//===---------------------------------------------------------------------===// - -It's not possible to reference AH, BH, CH, and DH registers in an instruction -requiring REX prefix. However, divb and mulb both produce results in AH. If isel -emits a CopyFromReg which gets turned into a movb and that can be allocated a -r8b - r15b. - -To get around this, isel emits a CopyFromReg from AX and then right shift it -down by 8 and truncate it. It's not pretty but it works. We need some register -allocation magic to make the hack go away (e.g. putting additional constraints -on the result of the movb). - -//===---------------------------------------------------------------------===// - -The x86-64 ABI for hidden-argument struct returns requires that the -incoming value of %rdi be copied into %rax by the callee upon return. - -The idea is that it saves callers from having to remember this value, -which would often require a callee-saved register. Callees usually -need to keep this value live for most of their body anyway, so it -doesn't add a significant burden on them. - -We currently implement this in codegen, however this is suboptimal -because it means that it would be quite awkward to implement the -optimization for callers. - -A better implementation would be to relax the LLVM IR rules for sret -arguments to allow a function with an sret argument to have a non-void -return type, and to have the front-end to set up the sret argument value -as the return value of the function. The front-end could more easily -emit uses of the returned struct value to be in terms of the function's -lowered return value, and it would free non-C frontends from a -complication only required by a C-based ABI. 
- -//===---------------------------------------------------------------------===// - -We get a redundant zero extension for code like this: - -int mask[1000]; -int foo(unsigned x) { - if (x < 10) - x = x * 45; - else - x = x * 78; - return mask[x]; -} - -_foo: -LBB1_0: ## entry - cmpl $9, %edi - jbe LBB1_3 ## bb -LBB1_1: ## bb1 - imull $78, %edi, %eax -LBB1_2: ## bb2 - movl %eax, %eax <---- - movq _mask@GOTPCREL(%rip), %rcx - movl (%rcx,%rax,4), %eax - ret -LBB1_3: ## bb - imull $45, %edi, %eax - jmp LBB1_2 ## bb2 - -Before regalloc, we have: - - %reg1025<def> = IMUL32rri8 %reg1024, 45, %EFLAGS<imp-def> - JMP mbb<bb2,0x203afb0> - Successors according to CFG: 0x203afb0 (#3) - -bb1: 0x203af60, LLVM BB @0x1e02310, ID#2: - Predecessors according to CFG: 0x203aec0 (#0) - %reg1026<def> = IMUL32rri8 %reg1024, 78, %EFLAGS<imp-def> - Successors according to CFG: 0x203afb0 (#3) - -bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3: - Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2) - %reg1027<def> = PHI %reg1025, mbb<bb,0x203af10>, - %reg1026, mbb<bb1,0x203af60> - %reg1029<def> = MOVZX64rr32 %reg1027 - -so we'd have to know that IMUL32rri8 leaves the high word zero extended and to -be able to recognize the zero extend. This could also presumably be implemented -if we have whole-function selectiondags. - -//===---------------------------------------------------------------------===// - -Take the following C code -(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43640): - -struct u1 -{ - float x; - float y; -}; - -float foo(struct u1 u) -{ - return u.x + u.y; -} - -Optimizes to the following IR: -define float @foo(double %u.0) nounwind readnone { -entry: - %tmp8 = bitcast double %u.0 to i64 ; <i64> [#uses=2] - %tmp6 = trunc i64 %tmp8 to i32 ; <i32> [#uses=1] - %tmp7 = bitcast i32 %tmp6 to float ; <float> [#uses=1] - %tmp2 = lshr i64 %tmp8, 32 ; <i64> [#uses=1] - %tmp3 = trunc i64 %tmp2 to i32 ; <i32> [#uses=1] - %tmp4 = bitcast i32 %tmp3 to float ; <float> [#uses=1] - %0 = fadd float %tmp7, %tmp4 ; <float> [#uses=1] - ret float %0 -} - -And current llvm-gcc/clang output: - movd %xmm0, %rax - movd %eax, %xmm1 - shrq $32, %rax - movd %eax, %xmm0 - addss %xmm1, %xmm0 - ret - -We really shouldn't move the floats to RAX, only to immediately move them -straight back to the XMM registers. - -There really isn't any good way to handle this purely in IR optimizers; it -could possibly be handled by changing the output of the fronted, though. It -would also be feasible to add a x86-specific DAGCombine to optimize the -bitcast+trunc+(lshr+)bitcast combination. - -//===---------------------------------------------------------------------===// - -Take the following code -(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653): -extern unsigned long table[]; -unsigned long foo(unsigned char *p) { - unsigned long tag = *p; - return table[tag >> 4] + table[tag & 0xf]; -} - -Current code generated: - movzbl (%rdi), %eax - movq %rax, %rcx - andq $240, %rcx - shrq %rcx - andq $15, %rax - movq table(,%rax,8), %rax - addq table(%rcx), %rax - ret - -Issues: -1. First movq should be movl; saves a byte. -2. Both andq's should be andl; saves another two bytes. I think this was - implemented at one point, but subsequently regressed. -3. shrq should be shrl; saves another byte. -4. The first andq can be completely eliminated by using a slightly more - expensive addressing mode. 
- -//===---------------------------------------------------------------------===// - -Consider the following (contrived testcase, but contains common factors): - -#include <stdarg.h> -int test(int x, ...) { - int sum, i; - va_list l; - va_start(l, x); - for (i = 0; i < x; i++) - sum += va_arg(l, int); - va_end(l); - return sum; -} - -Testcase given in C because fixing it will likely involve changing the IR -generated for it. The primary issue with the result is that it doesn't do any -of the optimizations which are possible if we know the address of a va_list -in the current function is never taken: -1. We shouldn't spill the XMM registers because we only call va_arg with "int". -2. It would be nice if we could scalarrepl the va_list. -3. Probably overkill, but it'd be cool if we could peel off the first five -iterations of the loop. - -Other optimizations involving functions which use va_arg on floats which don't -have the address of a va_list taken: -1. Conversely to the above, we shouldn't spill general registers if we only - call va_arg on "double". -2. If we know nothing more than 64 bits wide is read from the XMM registers, - we can change the spilling code to reduce the amount of stack used by half. - -//===---------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/README.txt b/contrib/llvm/lib/Target/X86/README.txt deleted file mode 100644 index a305ae6ec550..000000000000 --- a/contrib/llvm/lib/Target/X86/README.txt +++ /dev/null @@ -1,1962 +0,0 @@ -//===---------------------------------------------------------------------===// -// Random ideas for the X86 backend. -//===---------------------------------------------------------------------===// - -We should add support for the "movbe" instruction, which does a byte-swapping -copy (3-addr bswap + memory support?) This is available on Atom processors. - -//===---------------------------------------------------------------------===// - -CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86 -backend knows how to three-addressify this shift, but it appears the register -allocator isn't even asking it to do so in this case. We should investigate -why this isn't happening, it could have significant impact on other important -cases for X86 as well. - -//===---------------------------------------------------------------------===// - -This should be one DIV/IDIV instruction, not a libcall: - -unsigned test(unsigned long long X, unsigned Y) { - return X/Y; -} - -This can be done trivially with a custom legalizer. What about overflow -though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224 - -//===---------------------------------------------------------------------===// - -Improvements to the multiply -> shift/add algorithm: -http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html - -//===---------------------------------------------------------------------===// - -Improve code like this (occurs fairly frequently, e.g. in LLVM): -long long foo(int x) { return 1LL << x; } - -http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html -http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html -http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html - -Another useful one would be ~0ULL >> X and ~0ULL << X. - -One better solution for 1LL << x is: - xorl %eax, %eax - xorl %edx, %edx - testb $32, %cl - sete %al - setne %dl - sall %cl, %eax - sall %cl, %edx - -But that requires good 8-bit subreg support. - -Also, this might be better. 
It's an extra shift, but it's one instruction -shorter, and doesn't stress 8-bit subreg support. -(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html, -but without the unnecessary and.) - movl %ecx, %eax - shrl $5, %eax - movl %eax, %edx - xorl $1, %edx - sall %cl, %eax - sall %cl. %edx - -64-bit shifts (in general) expand to really bad code. Instead of using -cmovs, we should expand to a conditional branch like GCC produces. - -//===---------------------------------------------------------------------===// - -Compile this: -_Bool f(_Bool a) { return a!=1; } - -into: - movzbl %dil, %eax - xorl $1, %eax - ret - -(Although note that this isn't a legal way to express the code that llvm-gcc -currently generates for that function.) - -//===---------------------------------------------------------------------===// - -Some isel ideas: - -1. Dynamic programming based approach when compile time if not an - issue. -2. Code duplication (addressing mode) during isel. -3. Other ideas from "Register-Sensitive Selection, Duplication, and - Sequencing of Instructions". -4. Scheduling for reduced register pressure. E.g. "Minimum Register - Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs" - and other related papers. - http://citeseer.ist.psu.edu/govindarajan01minimum.html - -//===---------------------------------------------------------------------===// - -Should we promote i16 to i32 to avoid partial register update stalls? - -//===---------------------------------------------------------------------===// - -Leave any_extend as pseudo instruction and hint to register -allocator. Delay codegen until post register allocation. -Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach -the coalescer how to deal with it though. - -//===---------------------------------------------------------------------===// - -It appears icc use push for parameter passing. Need to investigate. - -//===---------------------------------------------------------------------===// - -Only use inc/neg/not instructions on processors where they are faster than -add/sub/xor. They are slower on the P4 due to only updating some processor -flags. - -//===---------------------------------------------------------------------===// - -The instruction selector sometimes misses folding a load into a compare. The -pattern is written as (cmp reg, (load p)). Because the compare isn't -commutative, it is not matched with the load on both sides. The dag combiner -should be made smart enough to cannonicalize the load into the RHS of a compare -when it can invert the result of the compare for free. - -//===---------------------------------------------------------------------===// - -In many cases, LLVM generates code like this: - -_test: - movl 8(%esp), %eax - cmpl %eax, 4(%esp) - setl %al - movzbl %al, %eax - ret - -on some processors (which ones?), it is more efficient to do this: - -_test: - movl 8(%esp), %ebx - xor %eax, %eax - cmpl %ebx, 4(%esp) - setl %al - ret - -Doing this correctly is tricky though, as the xor clobbers the flags. - -//===---------------------------------------------------------------------===// - -We should generate bts/btr/etc instructions on targets where they are cheap or -when codesize is important. 
e.g., for: - -void setbit(int *target, int bit) { - *target |= (1 << bit); -} -void clearbit(int *target, int bit) { - *target &= ~(1 << bit); -} - -//===---------------------------------------------------------------------===// - -Instead of the following for memset char*, 1, 10: - - movl $16843009, 4(%edx) - movl $16843009, (%edx) - movw $257, 8(%edx) - -It might be better to generate - - movl $16843009, %eax - movl %eax, 4(%edx) - movl %eax, (%edx) - movw al, 8(%edx) - -when we can spare a register. It reduces code size. - -//===---------------------------------------------------------------------===// - -Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently -get this: - -define i32 @test1(i32 %X) { - %Y = sdiv i32 %X, 8 - ret i32 %Y -} - -_test1: - movl 4(%esp), %eax - movl %eax, %ecx - sarl $31, %ecx - shrl $29, %ecx - addl %ecx, %eax - sarl $3, %eax - ret - -GCC knows several different ways to codegen it, one of which is this: - -_test1: - movl 4(%esp), %eax - cmpl $-1, %eax - leal 7(%eax), %ecx - cmovle %ecx, %eax - sarl $3, %eax - ret - -which is probably slower, but it's interesting at least :) - -//===---------------------------------------------------------------------===// - -We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl -We should leave these as libcalls for everything over a much lower threshold, -since libc is hand tuned for medium and large mem ops (avoiding RFO for large -stores, TLB preheating, etc) - -//===---------------------------------------------------------------------===// - -Optimize this into something reasonable: - x * copysign(1.0, y) * copysign(1.0, z) - -//===---------------------------------------------------------------------===// - -Optimize copysign(x, *y) to use an integer load from y. - -//===---------------------------------------------------------------------===// - -The following tests perform worse with LSR: - -lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor. - -//===---------------------------------------------------------------------===// - -Adding to the list of cmp / test poor codegen issues: - -int test(__m128 *A, __m128 *B) { - if (_mm_comige_ss(*A, *B)) - return 3; - else - return 4; -} - -_test: - movl 8(%esp), %eax - movaps (%eax), %xmm0 - movl 4(%esp), %eax - movaps (%eax), %xmm1 - comiss %xmm0, %xmm1 - setae %al - movzbl %al, %ecx - movl $3, %eax - movl $4, %edx - cmpl $0, %ecx - cmove %edx, %eax - ret - -Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There -are a number of issues. 1) We are introducing a setcc between the result of the -intrisic call and select. 2) The intrinsic is expected to produce a i32 value -so a any extend (which becomes a zero extend) is added. - -We probably need some kind of target DAG combine hook to fix this. - -//===---------------------------------------------------------------------===// - -We generate significantly worse code for this than GCC: -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150 -http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701 - -There is also one case we do worse on PPC. - -//===---------------------------------------------------------------------===// - -For this: - -int test(int a) -{ - return a * 3; -} - -We currently emits - imull $3, 4(%esp), %eax - -Perhaps this is what we really should generate is? Is imull three or four -cycles? 
Note: ICC generates this: - movl 4(%esp), %eax - leal (%eax,%eax,2), %eax - -The current instruction priority is based on pattern complexity. The former is -more "complex" because it folds a load so the latter will not be emitted. - -Perhaps we should use AddedComplexity to give LEA32r a higher priority? We -should always try to match LEA first since the LEA matching code does some -estimate to determine whether the match is profitable. - -However, if we care more about code size, then imull is better. It's two bytes -shorter than movl + leal. - -On a Pentium M, both variants have the same characteristics with regard -to throughput; however, the multiplication has a latency of four cycles, as -opposed to two cycles for the movl+lea variant. - -//===---------------------------------------------------------------------===// - -__builtin_ffs codegen is messy. - -int ffs_(unsigned X) { return __builtin_ffs(X); } - -llvm produces: -ffs_: - movl 4(%esp), %ecx - bsfl %ecx, %eax - movl $32, %edx - cmove %edx, %eax - incl %eax - xorl %edx, %edx - testl %ecx, %ecx - cmove %edx, %eax - ret - -vs gcc: - -_ffs_: - movl $-1, %edx - bsfl 4(%esp), %eax - cmove %edx, %eax - addl $1, %eax - ret - -Another example of __builtin_ffs (use predsimplify to eliminate a select): - -int foo (unsigned long j) { - if (j) - return __builtin_ffs (j) - 1; - else - return 0; -} - -//===---------------------------------------------------------------------===// - -It appears gcc place string data with linkonce linkage in -.section __TEXT,__const_coal,coalesced instead of -.section __DATA,__const_coal,coalesced. -Take a look at darwin.h, there are other Darwin assembler directives that we -do not make use of. - -//===---------------------------------------------------------------------===// - -define i32 @foo(i32* %a, i32 %t) { -entry: - br label %cond_true - -cond_true: ; preds = %cond_true, %entry - %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; <i32> [#uses=3] - %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; <i32> [#uses=1] - %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; <i32*> [#uses=1] - %tmp3 = load i32* %tmp2 ; <i32> [#uses=1] - %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; <i32> [#uses=1] - %tmp7 = add i32 %tmp5, %tmp3 ; <i32> [#uses=2] - %tmp9 = add i32 %x.0.0, 1 ; <i32> [#uses=2] - %tmp = icmp sgt i32 %tmp9, 39 ; <i1> [#uses=1] - br i1 %tmp, label %bb12, label %cond_true - -bb12: ; preds = %cond_true - ret i32 %tmp7 -} -is pessimized by -loop-reduce and -indvars - -//===---------------------------------------------------------------------===// - -u32 to float conversion improvement: - -float uint32_2_float( unsigned u ) { - float fl = (int) (u & 0xffff); - float fh = (int) (u >> 16); - fh *= 0x1.0p16f; - return fh + fl; -} - -00000000 subl $0x04,%esp -00000003 movl 0x08(%esp,1),%eax -00000007 movl %eax,%ecx -00000009 shrl $0x10,%ecx -0000000c cvtsi2ss %ecx,%xmm0 -00000010 andl $0x0000ffff,%eax -00000015 cvtsi2ss %eax,%xmm1 -00000019 mulss 0x00000078,%xmm0 -00000021 addss %xmm1,%xmm0 -00000025 movss %xmm0,(%esp,1) -0000002a flds (%esp,1) -0000002d addl $0x04,%esp -00000030 ret - -//===---------------------------------------------------------------------===// - -When using fastcc abi, align stack slot of argument of type double on 8 byte -boundary to improve performance. 
- -//===---------------------------------------------------------------------===// - -Codegen: - -int f(int a, int b) { - if (a == 4 || a == 6) - b++; - return b; -} - - -as: - -or eax, 2 -cmp eax, 6 -jz label - -//===---------------------------------------------------------------------===// - -GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting -simplifications for integer "x cmp y ? a : b". For example, instead of: - -int G; -void f(int X, int Y) { - G = X < 0 ? 14 : 13; -} - -compiling to: - -_f: - movl $14, %eax - movl $13, %ecx - movl 4(%esp), %edx - testl %edx, %edx - cmovl %eax, %ecx - movl %ecx, _G - ret - -it could be: -_f: - movl 4(%esp), %eax - sarl $31, %eax - notl %eax - addl $14, %eax - movl %eax, _G - ret - -etc. - -Another is: -int usesbb(unsigned int a, unsigned int b) { - return (a < b ? -1 : 0); -} -to: -_usesbb: - movl 8(%esp), %eax - cmpl %eax, 4(%esp) - sbbl %eax, %eax - ret - -instead of: -_usesbb: - xorl %eax, %eax - movl 8(%esp), %ecx - cmpl %ecx, 4(%esp) - movl $4294967295, %ecx - cmovb %ecx, %eax - ret - -//===---------------------------------------------------------------------===// - -Consider the expansion of: - -define i32 @test3(i32 %X) { - %tmp1 = urem i32 %X, 255 - ret i32 %tmp1 -} - -Currently it compiles to: - -... - movl $2155905153, %ecx - movl 8(%esp), %esi - movl %esi, %eax - mull %ecx -... - -This could be "reassociated" into: - - movl $2155905153, %eax - movl 8(%esp), %ecx - mull %ecx - -to avoid the copy. In fact, the existing two-address stuff would do this -except that mul isn't a commutative 2-addr instruction. I guess this has -to be done at isel time based on the #uses to mul? - -//===---------------------------------------------------------------------===// - -Make sure the instruction which starts a loop does not cross a cacheline -boundary. This requires knowning the exact length of each machine instruction. -That is somewhat complicated, but doable. Example 256.bzip2: - -In the new trace, the hot loop has an instruction which crosses a cacheline -boundary. In addition to potential cache misses, this can't help decoding as I -imagine there has to be some kind of complicated decoder reset and realignment -to grab the bytes from the next cacheline. - -532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines -942 942 0x3d03 movl %dh, (1809(%esp, %esi) -937 937 0x3d0a incl %esi -3 3 0x3d0b cmpb %bl, %dl -27 27 0x3d0d jnz 0x000062db <main+11707> - -//===---------------------------------------------------------------------===// - -In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE. - -//===---------------------------------------------------------------------===// - -This could be a single 16-bit load. - -int f(char *p) { - if ((p[0] == 1) & (p[1] == 2)) return 1; - return 0; -} - -//===---------------------------------------------------------------------===// - -We should inline lrintf and probably other libc functions. - -//===---------------------------------------------------------------------===// - -Use the FLAGS values from arithmetic instructions more. For example, compile: - -int add_zf(int *x, int y, int a, int b) { - if ((*x += y) == 0) - return a; - else - return b; -} - -to: - addl %esi, (%rdi) - movl %edx, %eax - cmovne %ecx, %eax - ret -instead of: - -_add_zf: - addl (%rdi), %esi - movl %esi, (%rdi) - testl %esi, %esi - cmove %edx, %ecx - movl %ecx, %eax - ret - -As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll -without a test instruction. 
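A sketch of a closely related case (hypothetical function, same idea with a
subtraction):

  int sub_zf(int *x, int y, int a, int b) {
    if ((*x -= y) == 0)
      return a;
    else
      return b;
  }

The subl that performs the read-modify-write already sets ZF, so the ideal
code is a memory-operand subl followed directly by a cmov, with no separate
testl/cmpl.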
- -//===---------------------------------------------------------------------===// - -These two functions have identical effects: - -unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;} -unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;} - -We currently compile them to: - -_f: - movl 4(%esp), %eax - movl %eax, %ecx - incl %ecx - movl 8(%esp), %edx - cmpl %edx, %ecx - jne LBB1_2 #UnifiedReturnBlock -LBB1_1: #cond_true - addl $2, %eax - ret -LBB1_2: #UnifiedReturnBlock - movl %ecx, %eax - ret -_f2: - movl 4(%esp), %eax - movl %eax, %ecx - incl %ecx - cmpl 8(%esp), %ecx - sete %cl - movzbl %cl, %ecx - leal 1(%ecx,%eax), %eax - ret - -both of which are inferior to GCC's: - -_f: - movl 4(%esp), %edx - leal 1(%edx), %eax - addl $2, %edx - cmpl 8(%esp), %eax - cmove %edx, %eax - ret -_f2: - movl 4(%esp), %eax - addl $1, %eax - xorl %edx, %edx - cmpl 8(%esp), %eax - sete %dl - addl %edx, %eax - ret - -//===---------------------------------------------------------------------===// - -This code: - -void test(int X) { - if (X) abort(); -} - -is currently compiled to: - -_test: - subl $12, %esp - cmpl $0, 16(%esp) - jne LBB1_1 - addl $12, %esp - ret -LBB1_1: - call L_abort$stub - -It would be better to produce: - -_test: - subl $12, %esp - cmpl $0, 16(%esp) - jne L_abort$stub - addl $12, %esp - ret - -This can be applied to any no-return function call that takes no arguments etc. -Alternatively, the stack save/restore logic could be shrink-wrapped, producing -something like this: - -_test: - cmpl $0, 4(%esp) - jne LBB1_1 - ret -LBB1_1: - subl $12, %esp - call L_abort$stub - -Both are useful in different situations. Finally, it could be shrink-wrapped -and tail called, like this: - -_test: - cmpl $0, 4(%esp) - jne LBB1_1 - ret -LBB1_1: - pop %eax # realign stack. - call L_abort$stub - -Though this probably isn't worth it. - -//===---------------------------------------------------------------------===// - -Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with -a neg instead of a sub instruction. Consider: - -int test(char X) { return 7-X; } - -we currently produce: -_test: - movl $7, %eax - movsbl 4(%esp), %ecx - subl %ecx, %eax - ret - -We would use one fewer register if codegen'd as: - - movsbl 4(%esp), %eax - neg %eax - add $7, %eax - ret - -Note that this isn't beneficial if the load can be folded into the sub. In -this case, we want a sub: - -int test(int X) { return 7-X; } -_test: - movl $7, %eax - subl 4(%esp), %eax - ret - -//===---------------------------------------------------------------------===// - -Leaf functions that require one 4-byte spill slot have a prolog like this: - -_foo: - pushl %esi - subl $4, %esp -... -and an epilog like this: - addl $4, %esp - popl %esi - ret - -It would be smaller, and potentially faster, to push eax on entry and to -pop into a dummy register instead of using addl/subl of esp. Just don't pop -into any return registers :) - -//===---------------------------------------------------------------------===// - -The X86 backend should fold (branch (or (setcc, setcc))) into multiple -branches. We generate really poor code for: - -double testf(double a) { - return a == 0.0 ? 0.0 : (a > 0.0 ? 
1.0 : -1.0); -} - -For example, the entry BB is: - -_testf: - subl $20, %esp - pxor %xmm0, %xmm0 - movsd 24(%esp), %xmm1 - ucomisd %xmm0, %xmm1 - setnp %al - sete %cl - testb %cl, %al - jne LBB1_5 # UnifiedReturnBlock -LBB1_1: # cond_true - - -it would be better to replace the last four instructions with: - - jp LBB1_1 - je LBB1_5 -LBB1_1: - -We also codegen the inner ?: into a diamond: - - cvtss2sd LCPI1_0(%rip), %xmm2 - cvtss2sd LCPI1_1(%rip), %xmm3 - ucomisd %xmm1, %xmm0 - ja LBB1_3 # cond_true -LBB1_2: # cond_true - movapd %xmm3, %xmm2 -LBB1_3: # cond_true - movapd %xmm2, %xmm0 - ret - -We should sink the load into xmm3 into the LBB1_2 block. This should -be pretty easy, and will nuke all the copies. - -//===---------------------------------------------------------------------===// - -This: - #include <algorithm> - inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b) - { return std::make_pair(a + b, a + b < a); } - bool no_overflow(unsigned a, unsigned b) - { return !full_add(a, b).second; } - -Should compile to: - - - _Z11no_overflowjj: - addl %edi, %esi - setae %al - ret - -FIXME: That code looks wrong; bool return is normally defined as zext. - -on x86-64, not: - -__Z11no_overflowjj: - addl %edi, %esi - cmpl %edi, %esi - setae %al - movzbl %al, %eax - ret - - -//===---------------------------------------------------------------------===// - -The following code: - -bb114.preheader: ; preds = %cond_next94 - %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1] - %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1] - %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1] - %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1] - %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1] - %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2] - %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1] - %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1] - %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1] - %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1] - %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1] - br label %bb114 - -produces: - -LBB3_5: # bb114.preheader - movswl -68(%ebp), %eax - movl $32, %ecx - movl %ecx, -80(%ebp) - subl %eax, -80(%ebp) - movswl -52(%ebp), %eax - movl %ecx, -84(%ebp) - subl %eax, -84(%ebp) - movswl -70(%ebp), %eax - movl %ecx, -88(%ebp) - subl %eax, -88(%ebp) - movswl -50(%ebp), %eax - subl %eax, %ecx - movl %ecx, -76(%ebp) - movswl -42(%ebp), %eax - movl %eax, -92(%ebp) - movswl -66(%ebp), %eax - movl %eax, -96(%ebp) - movw $0, -98(%ebp) - -This appears to be bad because the RA is not folding the store to the stack -slot into the movl. The above instructions could be: - movl $32, -80(%ebp) -... - movl $32, -84(%ebp) -... -This seems like a cross between remat and spill folding. - -This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't -change, so we could simply subtract %eax from %ecx first and then use %ecx (or -vice-versa). - -//===---------------------------------------------------------------------===// - -This code: - - %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1] - br i1 %tmp659, label %cond_true662, label %cond_next715 - -produces this: - - testw %cx, %cx - movswl %cx, %esi - jns LBB4_109 # cond_next715 - -Shark tells us that using %cx in the testw instruction is sub-optimal. It -suggests using the 32-bit register (which is what ICC uses). 
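A reduced C form of the pattern (hypothetical; the real code comes out of
larger signed-short arithmetic):

  int is_negative(short *p) {
    short v = *p;   /* %cx in the listing above */
    return v < 0;   /* the sign test can use the sign-extended 32-bit copy
                       that the movswl already produced */
  }

Since the 32-bit copy exists anyway, a testl on it avoids the 16-bit
operand-size prefix that testw needs, which is presumably what Shark is
complaining about.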
- -//===---------------------------------------------------------------------===// - -We compile this: - -void compare (long long foo) { - if (foo < 4294967297LL) - abort(); -} - -to: - -compare: - subl $4, %esp - cmpl $0, 8(%esp) - setne %al - movzbw %al, %ax - cmpl $1, 12(%esp) - setg %cl - movzbw %cl, %cx - cmove %ax, %cx - testb $1, %cl - jne .LBB1_2 # UnifiedReturnBlock -.LBB1_1: # ifthen - call abort -.LBB1_2: # UnifiedReturnBlock - addl $4, %esp - ret - -(also really horrible code on ppc). This is due to the expand code for 64-bit -compares. GCC produces multiple branches, which is much nicer: - -compare: - subl $12, %esp - movl 20(%esp), %edx - movl 16(%esp), %eax - decl %edx - jle .L7 -.L5: - addl $12, %esp - ret - .p2align 4,,7 -.L7: - jl .L4 - cmpl $0, %eax - .p2align 4,,8 - ja .L5 -.L4: - .p2align 4,,9 - call abort - -//===---------------------------------------------------------------------===// - -Tail call optimization improvements: Tail call optimization currently -pushes all arguments on the top of the stack (their normal place for -non-tail call optimized calls) that source from the callers arguments -or that source from a virtual register (also possibly sourcing from -callers arguments). -This is done to prevent overwriting of parameters (see example -below) that might be used later. - -example: - -int callee(int32, int64); -int caller(int32 arg1, int32 arg2) { - int64 local = arg2 * 2; - return callee(arg2, (int64)local); -} - -[arg1] [!arg2 no longer valid since we moved local onto it] -[arg2] -> [(int64) -[RETADDR] local ] - -Moving arg1 onto the stack slot of callee function would overwrite -arg2 of the caller. - -Possible optimizations: - - - - Analyse the actual parameters of the callee to see which would - overwrite a caller parameter which is used by the callee and only - push them onto the top of the stack. - - int callee (int32 arg1, int32 arg2); - int caller (int32 arg1, int32 arg2) { - return callee(arg1,arg2); - } - - Here we don't need to write any variables to the top of the stack - since they don't overwrite each other. - - int callee (int32 arg1, int32 arg2); - int caller (int32 arg1, int32 arg2) { - return callee(arg2,arg1); - } - - Here we need to push the arguments because they overwrite each - other. - -//===---------------------------------------------------------------------===// - -main () -{ - int i = 0; - unsigned long int z = 0; - - do { - z -= 0x00004000; - i++; - if (i > 0x00040000) - abort (); - } while (z > 0); - exit (0); -} - -gcc compiles this to: - -_main: - subl $28, %esp - xorl %eax, %eax - jmp L2 -L3: - cmpl $262144, %eax - je L10 -L2: - addl $1, %eax - cmpl $262145, %eax - jne L3 - call L_abort$stub -L10: - movl $0, (%esp) - call L_exit$stub - -llvm: - -_main: - subl $12, %esp - movl $1, %eax - movl $16384, %ecx -LBB1_1: # bb - cmpl $262145, %eax - jge LBB1_4 # cond_true -LBB1_2: # cond_next - incl %eax - addl $4294950912, %ecx - cmpl $16384, %ecx - jne LBB1_1 # bb -LBB1_3: # bb11 - xorl %eax, %eax - addl $12, %esp - ret -LBB1_4: # cond_true - call L_abort$stub - -1. LSR should rewrite the first cmp with induction variable %ecx. -2. 
DAG combiner should fold - leal 1(%eax), %edx - cmpl $262145, %edx - => - cmpl $262144, %eax - -//===---------------------------------------------------------------------===// - -define i64 @test(double %X) { - %Y = fptosi double %X to i64 - ret i64 %Y -} - -compiles to: - -_test: - subl $20, %esp - movsd 24(%esp), %xmm0 - movsd %xmm0, 8(%esp) - fldl 8(%esp) - fisttpll (%esp) - movl 4(%esp), %edx - movl (%esp), %eax - addl $20, %esp - #FP_REG_KILL - ret - -This should just fldl directly from the input stack slot. - -//===---------------------------------------------------------------------===// - -This code: -int foo (int x) { return (x & 65535) | 255; } - -Should compile into: - -_foo: - movzwl 4(%esp), %eax - orl $255, %eax - ret - -instead of: -_foo: - movl $255, %eax - orl 4(%esp), %eax - andl $65535, %eax - ret - -//===---------------------------------------------------------------------===// - -We're codegen'ing multiply of long longs inefficiently: - -unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) { - return arg1 * arg2; -} - -We compile to (fomit-frame-pointer): - -_LLM: - pushl %esi - movl 8(%esp), %ecx - movl 16(%esp), %esi - movl %esi, %eax - mull %ecx - imull 12(%esp), %esi - addl %edx, %esi - imull 20(%esp), %ecx - movl %esi, %edx - addl %ecx, %edx - popl %esi - ret - -This looks like a scheduling deficiency and lack of remat of the load from -the argument area. ICC apparently produces: - - movl 8(%esp), %ecx - imull 12(%esp), %ecx - movl 16(%esp), %eax - imull 4(%esp), %eax - addl %eax, %ecx - movl 4(%esp), %eax - mull 12(%esp) - addl %ecx, %edx - ret - -Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR: -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236 - -//===---------------------------------------------------------------------===// - -We can fold a store into "zeroing a reg". Instead of: - -xorl %eax, %eax -movl %eax, 124(%esp) - -we should get: - -movl $0, 124(%esp) - -if the flags of the xor are dead. - -Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should -be folded into: shl [mem], 1 - -//===---------------------------------------------------------------------===// - -In SSE mode, we turn abs and neg into a load from the constant pool plus a xor -or and instruction, for example: - - xorpd LCPI1_0, %xmm2 - -However, if xmm2 gets spilled, we end up with really ugly code like this: - - movsd (%esp), %xmm0 - xorpd LCPI1_0, %xmm0 - movsd %xmm0, (%esp) - -Since we 'know' that this is a 'neg', we can actually "fold" the spill into -the neg/abs instruction, turning it into an *integer* operation, like this: - - xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31) - -you could also use xorb, but xorl is less likely to lead to a partial register -stall. Here is a contrived testcase: - -double a, b, c; -void test(double *P) { - double X = *P; - a = X; - bar(); - X = -X; - b = X; - bar(); - c = X; -} - -//===---------------------------------------------------------------------===// - -The generated code on x86 for checking for signed overflow on a multiply the -obvious way is much longer than it needs to be. - -int x(int a, int b) { - long long prod = (long long)a*b; - return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1); -} - -See PR2053 for more details. - -//===---------------------------------------------------------------------===// - -We should investigate using cdq/ctld (effect: edx = sar eax, 31) -more aggressively; it should cost the same as a move+shift on any modern -processor, but it's a lot shorter. 
Downside is that it puts more -pressure on register allocation because it has fixed operands. - -Example: -int abs(int x) {return x < 0 ? -x : x;} - -gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.: -abs: - movl 4(%esp), %eax - cltd - xorl %edx, %eax - subl %edx, %eax - ret - -//===---------------------------------------------------------------------===// - -Consider: -int test(unsigned long a, unsigned long b) { return -(a < b); } - -We currently compile this to: - -define i32 @test(i32 %a, i32 %b) nounwind { - %tmp3 = icmp ult i32 %a, %b ; <i1> [#uses=1] - %tmp34 = zext i1 %tmp3 to i32 ; <i32> [#uses=1] - %tmp5 = sub i32 0, %tmp34 ; <i32> [#uses=1] - ret i32 %tmp5 -} - -and - -_test: - movl 8(%esp), %eax - cmpl %eax, 4(%esp) - setb %al - movzbl %al, %eax - negl %eax - ret - -Several deficiencies here. First, we should instcombine zext+neg into sext: - -define i32 @test2(i32 %a, i32 %b) nounwind { - %tmp3 = icmp ult i32 %a, %b ; <i1> [#uses=1] - %tmp34 = sext i1 %tmp3 to i32 ; <i32> [#uses=1] - ret i32 %tmp34 -} - -However, before we can do that, we have to fix the bad codegen that we get for -sext from bool: - -_test2: - movl 8(%esp), %eax - cmpl %eax, 4(%esp) - setb %al - movzbl %al, %eax - shll $31, %eax - sarl $31, %eax - ret - -This code should be at least as good as the code above. Once this is fixed, we -can optimize this specific case even more to: - - movl 8(%esp), %eax - xorl %ecx, %ecx - cmpl %eax, 4(%esp) - sbbl %ecx, %ecx - -//===---------------------------------------------------------------------===// - -Take the following code (from -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541): - -extern unsigned char first_one[65536]; -int FirstOnet(unsigned long long arg1) -{ - if (arg1 >> 48) - return (first_one[arg1 >> 48]); - return 0; -} - - -The following code is currently generated: -FirstOnet: - movl 8(%esp), %eax - cmpl $65536, %eax - movl 4(%esp), %ecx - jb .LBB1_2 # UnifiedReturnBlock -.LBB1_1: # ifthen - shrl $16, %eax - movzbl first_one(%eax), %eax - ret -.LBB1_2: # UnifiedReturnBlock - xorl %eax, %eax - ret - -We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this -lets us change the cmpl into a testl, which is shorter, and eliminate the shift. - -//===---------------------------------------------------------------------===// - -We compile this function: - -define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind { -entry: - %tmp2 = icmp eq i8 %d, 0 ; <i1> [#uses=1] - br i1 %tmp2, label %bb7, label %bb - -bb: ; preds = %entry - %tmp6 = add i32 %b, %a ; <i32> [#uses=1] - ret i32 %tmp6 - -bb7: ; preds = %entry - %tmp10 = sub i32 %a, %c ; <i32> [#uses=1] - ret i32 %tmp10 -} - -to: - -foo: # @foo -# BB#0: # %entry - movl 4(%esp), %ecx - cmpb $0, 16(%esp) - je .LBB0_2 -# BB#1: # %bb - movl 8(%esp), %eax - addl %ecx, %eax - ret -.LBB0_2: # %bb7 - movl 12(%esp), %edx - movl %ecx, %eax - subl %edx, %eax - ret - -There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a -couple more movls by putting 4(%esp) into %eax instead of %ecx. - -//===---------------------------------------------------------------------===// - -See rdar://4653682. 
- -From flops: - -LBB1_15: # bb310 - cvtss2sd LCPI1_0, %xmm1 - addsd %xmm1, %xmm0 - movsd 176(%esp), %xmm2 - mulsd %xmm0, %xmm2 - movapd %xmm2, %xmm3 - mulsd %xmm3, %xmm3 - movapd %xmm3, %xmm4 - mulsd LCPI1_23, %xmm4 - addsd LCPI1_24, %xmm4 - mulsd %xmm3, %xmm4 - addsd LCPI1_25, %xmm4 - mulsd %xmm3, %xmm4 - addsd LCPI1_26, %xmm4 - mulsd %xmm3, %xmm4 - addsd LCPI1_27, %xmm4 - mulsd %xmm3, %xmm4 - addsd LCPI1_28, %xmm4 - mulsd %xmm3, %xmm4 - addsd %xmm1, %xmm4 - mulsd %xmm2, %xmm4 - movsd 152(%esp), %xmm1 - addsd %xmm4, %xmm1 - movsd %xmm1, 152(%esp) - incl %eax - cmpl %eax, %esi - jge LBB1_15 # bb310 -LBB1_16: # bb358.loopexit - movsd 152(%esp), %xmm0 - addsd %xmm0, %xmm0 - addsd LCPI1_22, %xmm0 - movsd %xmm0, 152(%esp) - -Rather than spilling the result of the last addsd in the loop, we should have -insert a copy to split the interval (one for the duration of the loop, one -extending to the fall through). The register pressure in the loop isn't high -enough to warrant the spill. - -Also check why xmm7 is not used at all in the function. - -//===---------------------------------------------------------------------===// - -Take the following: - -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" -target triple = "i386-apple-darwin8" -@in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2] -define fastcc void @abort_gzip() noreturn nounwind { -entry: - %tmp.b.i = load i1* @in_exit.4870.b ; <i1> [#uses=1] - br i1 %tmp.b.i, label %bb.i, label %bb4.i -bb.i: ; preds = %entry - tail call void @exit( i32 1 ) noreturn nounwind - unreachable -bb4.i: ; preds = %entry - store i1 true, i1* @in_exit.4870.b - tail call void @exit( i32 1 ) noreturn nounwind - unreachable -} -declare void @exit(i32) noreturn nounwind - -This compiles into: -_abort_gzip: ## @abort_gzip -## BB#0: ## %entry - subl $12, %esp - movb _in_exit.4870.b, %al - cmpb $1, %al - jne LBB0_2 - -We somehow miss folding the movb into the cmpb. - -//===---------------------------------------------------------------------===// - -We compile: - -int test(int x, int y) { - return x-y-1; -} - -into (-m64): - -_test: - decl %edi - movl %edi, %eax - subl %esi, %eax - ret - -it would be better to codegen as: x+~y (notl+addl) - -//===---------------------------------------------------------------------===// - -This code: - -int foo(const char *str,...) 
-{ - __builtin_va_list a; int x; - __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a); - return x; -} - -gets compiled into this on x86-64: - subq $200, %rsp - movaps %xmm7, 160(%rsp) - movaps %xmm6, 144(%rsp) - movaps %xmm5, 128(%rsp) - movaps %xmm4, 112(%rsp) - movaps %xmm3, 96(%rsp) - movaps %xmm2, 80(%rsp) - movaps %xmm1, 64(%rsp) - movaps %xmm0, 48(%rsp) - movq %r9, 40(%rsp) - movq %r8, 32(%rsp) - movq %rcx, 24(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 8(%rsp) - leaq (%rsp), %rax - movq %rax, 192(%rsp) - leaq 208(%rsp), %rax - movq %rax, 184(%rsp) - movl $48, 180(%rsp) - movl $8, 176(%rsp) - movl 176(%rsp), %eax - cmpl $47, %eax - jbe .LBB1_3 # bb -.LBB1_1: # bb3 - movq 184(%rsp), %rcx - leaq 8(%rcx), %rax - movq %rax, 184(%rsp) -.LBB1_2: # bb4 - movl (%rcx), %eax - addq $200, %rsp - ret -.LBB1_3: # bb - movl %eax, %ecx - addl $8, %eax - addq 192(%rsp), %rcx - movl %eax, 176(%rsp) - jmp .LBB1_2 # bb4 - -gcc 4.3 generates: - subq $96, %rsp -.LCFI0: - leaq 104(%rsp), %rax - movq %rsi, -80(%rsp) - movl $8, -120(%rsp) - movq %rax, -112(%rsp) - leaq -88(%rsp), %rax - movq %rax, -104(%rsp) - movl $8, %eax - cmpl $48, %eax - jb .L6 - movq -112(%rsp), %rdx - movl (%rdx), %eax - addq $96, %rsp - ret - .p2align 4,,10 - .p2align 3 -.L6: - mov %eax, %edx - addq -104(%rsp), %rdx - addl $8, %eax - movl %eax, -120(%rsp) - movl (%rdx), %eax - addq $96, %rsp - ret - -and it gets compiled into this on x86: - pushl %ebp - movl %esp, %ebp - subl $4, %esp - leal 12(%ebp), %eax - movl %eax, -4(%ebp) - leal 16(%ebp), %eax - movl %eax, -4(%ebp) - movl 12(%ebp), %eax - addl $4, %esp - popl %ebp - ret - -gcc 4.3 generates: - pushl %ebp - movl %esp, %ebp - movl 12(%ebp), %eax - popl %ebp - ret - -//===---------------------------------------------------------------------===// - -Teach tblgen not to check bitconvert source type in some cases. This allows us -to consolidate the following patterns in X86InstrMMX.td: - -def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))))), - (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>; -def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))))), - (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>; -def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))))), - (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>; - -There are other cases in various td files. - -//===---------------------------------------------------------------------===// - -Take something like the following on x86-32: -unsigned a(unsigned long long x, unsigned y) {return x % y;} - -We currently generate a libcall, but we really shouldn't: the expansion is -shorter and likely faster than the libcall. The expected code is something -like the following: - - movl 12(%ebp), %eax - movl 16(%ebp), %ecx - xorl %edx, %edx - divl %ecx - movl 8(%ebp), %eax - divl %ecx - movl %edx, %eax - ret - -A similar code sequence works for division. - -//===---------------------------------------------------------------------===// - -These should compile to the same code, but the later codegen's to useless -instructions on X86. 
This may be a trivial dag combine (GCC PR7061): - -struct s1 { unsigned char a, b; }; -unsigned long f1(struct s1 x) { - return x.a + x.b; -} -struct s2 { unsigned a: 8, b: 8; }; -unsigned long f2(struct s2 x) { - return x.a + x.b; -} - -//===---------------------------------------------------------------------===// - -We currently compile this: - -define i32 @func1(i32 %v1, i32 %v2) nounwind { -entry: - %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) - %sum = extractvalue {i32, i1} %t, 0 - %obit = extractvalue {i32, i1} %t, 1 - br i1 %obit, label %overflow, label %normal -normal: - ret i32 %sum -overflow: - call void @llvm.trap() - unreachable -} -declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) -declare void @llvm.trap() - -to: - -_func1: - movl 4(%esp), %eax - addl 8(%esp), %eax - jo LBB1_2 ## overflow -LBB1_1: ## normal - ret -LBB1_2: ## overflow - ud2 - -it would be nice to produce "into" someday. - -//===---------------------------------------------------------------------===// - -This code: - -void vec_mpys1(int y[], const int x[], int scaler) { -int i; -for (i = 0; i < 150; i++) - y[i] += (((long long)scaler * (long long)x[i]) >> 31); -} - -Compiles to this loop with GCC 3.x: - -.L5: - movl %ebx, %eax - imull (%edi,%ecx,4) - shrdl $31, %edx, %eax - addl %eax, (%esi,%ecx,4) - incl %ecx - cmpl $149, %ecx - jle .L5 - -llvm-gcc compiles it to the much uglier: - -LBB1_1: ## bb1 - movl 24(%esp), %eax - movl (%eax,%edi,4), %ebx - movl %ebx, %ebp - imull %esi, %ebp - movl %ebx, %eax - mull %ecx - addl %ebp, %edx - sarl $31, %ebx - imull %ecx, %ebx - addl %edx, %ebx - shldl $1, %eax, %ebx - movl 20(%esp), %eax - addl %ebx, (%eax,%edi,4) - incl %edi - cmpl $150, %edi - jne LBB1_1 ## bb1 - -The issue is that we hoist the cast of "scaler" to long long outside of the -loop, the value comes into the loop as two values, and -RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the -constructed BUILD_PAIR which represents the cast value. - -//===---------------------------------------------------------------------===// - -Test instructions can be eliminated by using EFLAGS values from arithmetic -instructions. This is currently not done for mul, and, or, xor, neg, shl, -sra, srl, shld, shrd, atomic ops, and others. It is also currently not done -for read-modify-write instructions. It is also current not done if the -OF or CF flags are needed. - -The shift operators have the complication that when the shift count is -zero, EFLAGS is not set, so they can only subsume a test instruction if -the shift count is known to be non-zero. Also, using the EFLAGS value -from a shift is apparently very slow on some x86 implementations. - -In read-modify-write instructions, the root node in the isel match is -the store, and isel has no way for the use of the EFLAGS result of the -arithmetic to be remapped to the new node. - -Add and subtract instructions set OF on signed overflow and CF on unsiged -overflow, while test instructions always clear OF and CF. In order to -replace a test with an add or subtract in a situation where OF or CF is -needed, codegen must be able to prove that the operation cannot see -signed or unsigned overflow, respectively. - -//===---------------------------------------------------------------------===// - -memcpy/memmove do not lower to SSE copies when possible. 
A silly example is: -define <16 x float> @foo(<16 x float> %A) nounwind { - %tmp = alloca <16 x float>, align 16 - %tmp2 = alloca <16 x float>, align 16 - store <16 x float> %A, <16 x float>* %tmp - %s = bitcast <16 x float>* %tmp to i8* - %s2 = bitcast <16 x float>* %tmp2 to i8* - call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16) - %R = load <16 x float>* %tmp2 - ret <16 x float> %R -} - -declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind - -which compiles to: - -_foo: - subl $140, %esp - movaps %xmm3, 112(%esp) - movaps %xmm2, 96(%esp) - movaps %xmm1, 80(%esp) - movaps %xmm0, 64(%esp) - movl 60(%esp), %eax - movl %eax, 124(%esp) - movl 56(%esp), %eax - movl %eax, 120(%esp) - movl 52(%esp), %eax - <many many more 32-bit copies> - movaps (%esp), %xmm0 - movaps 16(%esp), %xmm1 - movaps 32(%esp), %xmm2 - movaps 48(%esp), %xmm3 - addl $140, %esp - ret - -On Nehalem, it may even be cheaper to just use movups when unaligned than to -fall back to lower-granularity chunks. - -//===---------------------------------------------------------------------===// - -Implement processor-specific optimizations for parity with GCC on these -processors. GCC does two optimizations: - -1. ix86_pad_returns inserts a noop before ret instructions if immediately - preceeded by a conditional branch or is the target of a jump. -2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of - code contains more than 3 branches. - -The first one is done for all AMDs, Core2, and "Generic" -The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona, - Core 2, and "Generic" - -//===---------------------------------------------------------------------===// - -Testcase: -int a(int x) { return (x & 127) > 31; } - -Current output: - movl 4(%esp), %eax - andl $127, %eax - cmpl $31, %eax - seta %al - movzbl %al, %eax - ret - -Ideal output: - xorl %eax, %eax - testl $96, 4(%esp) - setne %al - ret - -This should definitely be done in instcombine, canonicalizing the range -condition into a != condition. We get this IR: - -define i32 @a(i32 %x) nounwind readnone { -entry: - %0 = and i32 %x, 127 ; <i32> [#uses=1] - %1 = icmp ugt i32 %0, 31 ; <i1> [#uses=1] - %2 = zext i1 %1 to i32 ; <i32> [#uses=1] - ret i32 %2 -} - -Instcombine prefers to strength reduce relational comparisons to equality -comparisons when possible, this should be another case of that. This could -be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it -looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already -be redesigned to use ComputeMaskedBits and friends. - - -//===---------------------------------------------------------------------===// -Testcase: -int x(int a) { return (a&0xf0)>>4; } - -Current output: - movl 4(%esp), %eax - shrl $4, %eax - andl $15, %eax - ret - -Ideal output: - movzbl 4(%esp), %eax - shrl $4, %eax - ret - -//===---------------------------------------------------------------------===// - -Testcase: -int x(int a) { return (a & 0x80) ? 0x100 : 0; } -int y(int a) { return (a & 0x80) *2; } - -Current: - testl $128, 4(%esp) - setne %al - movzbl %al, %eax - shll $8, %eax - ret - -Better: - movl 4(%esp), %eax - addl %eax, %eax - andl $256, %eax - ret - -This is another general instcombine transformation that is profitable on all -targets. 
In LLVM IR, these functions look like this: - -define i32 @x(i32 %a) nounwind readnone { -entry: - %0 = and i32 %a, 128 - %1 = icmp eq i32 %0, 0 - %iftmp.0.0 = select i1 %1, i32 0, i32 256 - ret i32 %iftmp.0.0 -} - -define i32 @y(i32 %a) nounwind readnone { -entry: - %0 = shl i32 %a, 1 - %1 = and i32 %0, 256 - ret i32 %1 -} - -Replacing an icmp+select with a shift should always be considered profitable in -instcombine. - -//===---------------------------------------------------------------------===// - -Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch -properly. - -When the return value is not used (i.e. only care about the value in the -memory), x86 does not have to use add to implement these. Instead, it can use -add, sub, inc, dec instructions with the "lock" prefix. - -This is currently implemented using a bit of instruction selection trick. The -issue is the target independent pattern produces one output and a chain and we -want to map it into one that just output a chain. The current trick is to select -it into a MERGE_VALUES with the first definition being an implicit_def. The -proper solution is to add new ISD opcodes for the no-output variant. DAG -combiner can then transform the node before it gets to target node selection. - -Problem #2 is we are adding a whole bunch of x86 atomic instructions when in -fact these instructions are identical to the non-lock versions. We need a way to -add target specific information to target nodes and have this information -carried over to machine instructions. Asm printer (or JIT) can use this -information to add the "lock" prefix. - -//===---------------------------------------------------------------------===// - -_Bool bar(int *x) { return *x & 1; } - -define zeroext i1 @bar(i32* nocapture %x) nounwind readonly { -entry: - %tmp1 = load i32* %x ; <i32> [#uses=1] - %and = and i32 %tmp1, 1 ; <i32> [#uses=1] - %tobool = icmp ne i32 %and, 0 ; <i1> [#uses=1] - ret i1 %tobool -} - -bar: # @bar -# BB#0: # %entry - movl 4(%esp), %eax - movb (%eax), %al - andb $1, %al - movzbl %al, %eax - ret - -Missed optimization: should be movl+andl. - -//===---------------------------------------------------------------------===// - -Consider the following two functions compiled with clang: -_Bool foo(int *x) { return !(*x & 4); } -unsigned bar(int *x) { return !(*x & 4); } - -foo: - movl 4(%esp), %eax - testb $4, (%eax) - sete %al - movzbl %al, %eax - ret - -bar: - movl 4(%esp), %eax - movl (%eax), %eax - shrl $2, %eax - andl $1, %eax - xorl $1, %eax - ret - -The second function generates more code even though the two functions are -are functionally identical. - -//===---------------------------------------------------------------------===// - -Take the following C code: -int x(int y) { return (y & 63) << 14; } - -Code produced by gcc: - andl $63, %edi - sall $14, %edi - movl %edi, %eax - ret - -Code produced by clang: - shll $14, %edi - movl %edi, %eax - andl $1032192, %eax - ret - -The code produced by gcc is 3 bytes shorter. This sort of construct often -shows up with bitfields. 
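A bitfield example of the same shape (the layout is made up, and assumes the
usual x86 convention that the first declared field occupies the low-order
bits of the 32-bit storage unit):

  struct bits { unsigned lo : 14, f : 6, hi : 12; };

  unsigned reinsert(struct bits b) {
    /* loading b.f is (word >> 14) & 63, where word is the storage unit */
    return (unsigned)b.f << 14;
  }

The whole expression is just word & 0x0FC000, so the backend faces the same
choice between masking with a small immediate before the shift and masking
with a large immediate after it.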
- -//===---------------------------------------------------------------------===// - -Take the following C code: -int f(int a, int b) { return (unsigned char)a == (unsigned char)b; } - -We generate the following IR with clang: -define i32 @f(i32 %a, i32 %b) nounwind readnone { -entry: - %tmp = xor i32 %b, %a ; <i32> [#uses=1] - %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1] - %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1] - %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1] - ret i32 %conv5 -} - -And the following x86 code: - xorl %esi, %edi - testb $-1, %dil - sete %al - movzbl %al, %eax - ret - -A cmpb instead of the xorl+testb would be one instruction shorter. - -//===---------------------------------------------------------------------===// - -Given the following C code: -int f(int a, int b) { return (signed char)a == (signed char)b; } - -We generate the following IR with clang: -define i32 @f(i32 %a, i32 %b) nounwind readnone { -entry: - %sext = shl i32 %a, 24 ; <i32> [#uses=1] - %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1] - %sext6 = shl i32 %b, 24 ; <i32> [#uses=1] - %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1] - %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1] - %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1] - ret i32 %conv5 -} - -And the following x86 code: - movsbl %sil, %eax - movsbl %dil, %ecx - cmpl %eax, %ecx - sete %al - movzbl %al, %eax - ret - - -It should be possible to eliminate the sign extensions. - -//===---------------------------------------------------------------------===// - -LLVM misses a load+store narrowing opportunity in this code: - -%struct.bf = type { i64, i16, i16, i32 } - -@bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2] - -define void @t1() nounwind ssp { -entry: - %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1] - %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1] - %2 = bitcast i16* %1 to i32* ; <i32*> [#uses=2] - %3 = load i32* %2, align 1 ; <i32> [#uses=1] - %4 = and i32 %3, -65537 ; <i32> [#uses=1] - store i32 %4, i32* %2, align 1 - %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1] - %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1] - %7 = bitcast i16* %6 to i32* ; <i32*> [#uses=2] - %8 = load i32* %7, align 1 ; <i32> [#uses=1] - %9 = and i32 %8, -131073 ; <i32> [#uses=1] - store i32 %9, i32* %7, align 1 - ret void -} - -LLVM currently emits this: - - movq bfi(%rip), %rax - andl $-65537, 8(%rax) - movq bfi(%rip), %rax - andl $-131073, 8(%rax) - ret - -It could narrow the loads and stores to emit this: - - movq bfi(%rip), %rax - andb $-2, 10(%rax) - movq bfi(%rip), %rax - andb $-3, 10(%rax) - ret - -The trouble is that there is a TokenFactor between the store and the -load, making it non-trivial to determine if there's anything between -the load and the store which would prohibit narrowing. - -//===---------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/CMakeLists.txt b/contrib/llvm/lib/Target/X86/TargetInfo/CMakeLists.txt deleted file mode 100644 index 90be9f58cc73..000000000000 --- a/contrib/llvm/lib/Target/X86/TargetInfo/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. 
) - -add_llvm_library(LLVMX86Info - X86TargetInfo.cpp - ) - -add_dependencies(LLVMX86Info X86CodeGenTable_gen) diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/Makefile b/contrib/llvm/lib/Target/X86/TargetInfo/Makefile deleted file mode 100644 index ee91982df0c8..000000000000 --- a/contrib/llvm/lib/Target/X86/TargetInfo/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/X86/TargetInfo/Makefile ------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMX86Info - -# Hack: we need to include 'main' target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/X86/X86CompilationCallback_Win64.asm b/contrib/llvm/lib/Target/X86/X86CompilationCallback_Win64.asm deleted file mode 100644 index f321778db24b..000000000000 --- a/contrib/llvm/lib/Target/X86/X86CompilationCallback_Win64.asm +++ /dev/null @@ -1,68 +0,0 @@ -;;===-- X86CompilationCallback_Win64.asm - Implement Win64 JIT callback ---=== -;; -;; The LLVM Compiler Infrastructure -;; -;; This file is distributed under the University of Illinois Open Source -;; License. See LICENSE.TXT for details. -;; -;;===----------------------------------------------------------------------=== -;; -;; This file implements the JIT interfaces for the X86 target. -;; -;;===----------------------------------------------------------------------=== - -extrn X86CompilationCallback2: PROC - -.code -X86CompilationCallback proc - push rbp - - ; Save RSP. - mov rbp, rsp - - ; Save all int arg registers - ; WARNING: We cannot use register spill area - we're generating stubs by hands! - push rcx - push rdx - push r8 - push r9 - - ; Align stack on 16-byte boundary. - and rsp, -16 - - ; Save all XMM arg registers. Also allocate reg spill area. - sub rsp, 96 - movaps [rsp +32], xmm0 - movaps [rsp+16+32], xmm1 - movaps [rsp+32+32], xmm2 - movaps [rsp+48+32], xmm3 - - ; JIT callee - - ; Pass prev frame and return address. - mov rcx, rbp - mov rdx, qword ptr [rbp+8] - call X86CompilationCallback2 - - ; Restore all XMM arg registers. - movaps xmm3, [rsp+48+32] - movaps xmm2, [rsp+32+32] - movaps xmm1, [rsp+16+32] - movaps xmm0, [rsp +32] - - ; Restore RSP. - mov rsp, rbp - - ; Restore all int arg registers - sub rsp, 32 - pop r9 - pop r8 - pop rdx - pop rcx - - ; Restore RBP. - pop rbp - ret -X86CompilationCallback endp - -End |