diff options
Diffstat (limited to 'contrib/gcc/config/arm/arm1020e.md')
-rw-r--r-- | contrib/gcc/config/arm/arm1020e.md | 388 |
1 files changed, 0 insertions, 388 deletions
diff --git a/contrib/gcc/config/arm/arm1020e.md b/contrib/gcc/config/arm/arm1020e.md deleted file mode 100644 index 32a5d95e9659..000000000000 --- a/contrib/gcc/config/arm/arm1020e.md +++ /dev/null @@ -1,388 +0,0 @@ -;; ARM 1020E & ARM 1022E Pipeline Description -;; Copyright (C) 2005 Free Software Foundation, Inc. -;; Contributed by Richard Earnshaw (richard.earnshaw@arm.com) -;; -;; This file is part of GCC. -;; -;; GCC is free software; you can redistribute it and/or modify it -;; under the terms of the GNU General Public License as published by -;; the Free Software Foundation; either version 2, or (at your option) -;; any later version. -;; -;; GCC is distributed in the hope that it will be useful, but -;; WITHOUT ANY WARRANTY; without even the implied warranty of -;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;; General Public License for more details. -;; -;; You should have received a copy of the GNU General Public License -;; along with GCC; see the file COPYING. If not, write to the Free -;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -;; 02110-1301, USA. */ - -;; These descriptions are based on the information contained in the -;; ARM1020E Technical Reference Manual, Copyright (c) 2003 ARM -;; Limited. -;; - -;; This automaton provides a pipeline description for the ARM -;; 1020E core. -;; -;; The model given here assumes that the condition for all conditional -;; instructions is "true", i.e., that all of the instructions are -;; actually executed. - -(define_automaton "arm1020e") - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Pipelines -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; There are two pipelines: -;; -;; - An Arithmetic Logic Unit (ALU) pipeline. -;; -;; The ALU pipeline has fetch, issue, decode, execute, memory, and -;; write stages. We only need to model the execute, memory and write -;; stages. -;; -;; - A Load-Store Unit (LSU) pipeline. -;; -;; The LSU pipeline has decode, execute, memory, and write stages. -;; We only model the execute, memory and write stages. - -(define_cpu_unit "1020a_e,1020a_m,1020a_w" "arm1020e") -(define_cpu_unit "1020l_e,1020l_m,1020l_w" "arm1020e") - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; ALU Instructions -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; ALU instructions require three cycles to execute, and use the ALU -;; pipeline in each of the three stages. The results are available -;; after the execute stage stage has finished. -;; -;; If the destination register is the PC, the pipelines are stalled -;; for several cycles. That case is not modeled here. - -;; ALU operations with no shifted operand -(define_insn_reservation "1020alu_op" 1 - (and (eq_attr "tune" "arm1020e,arm1022e") - (eq_attr "type" "alu")) - "1020a_e,1020a_m,1020a_w") - -;; ALU operations with a shift-by-constant operand -(define_insn_reservation "1020alu_shift_op" 1 - (and (eq_attr "tune" "arm1020e,arm1022e") - (eq_attr "type" "alu_shift")) - "1020a_e,1020a_m,1020a_w") - -;; ALU operations with a shift-by-register operand -;; These really stall in the decoder, in order to read -;; the shift value in a second cycle. Pretend we take two cycles in -;; the execute stage. -(define_insn_reservation "1020alu_shift_reg_op" 2 - (and (eq_attr "tune" "arm1020e,arm1022e") - (eq_attr "type" "alu_shift_reg")) - "1020a_e*2,1020a_m,1020a_w") - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Multiplication Instructions -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; Multiplication instructions loop in the execute stage until the -;; instruction has been passed through the multiplier array enough -;; times. - -;; The result of the "smul" and "smulw" instructions is not available -;; until after the memory stage. -(define_insn_reservation "1020mult1" 2 - (and (eq_attr "tune" "arm1020e,arm1022e") - (eq_attr "insn" "smulxy,smulwy")) - "1020a_e,1020a_m,1020a_w") - -;; The "smlaxy" and "smlawx" instructions require two iterations through -;; the execute stage; the result is available immediately following -;; the execute stage. -(define_insn_reservation "1020mult2" 2 - (and (eq_attr "tune" "arm1020e,arm1022e") - (eq_attr "insn" "smlaxy,smlalxy,smlawx")) - "1020a_e*2,1020a_m,1020a_w") - -;; The "smlalxy", "mul", and "mla" instructions require two iterations -;; through the execute stage; the result is not available until after -;; the memory stage. -(define_insn_reservation "1020mult3" 3 - (and (eq_attr "tune" "arm1020e,arm1022e") - (eq_attr "insn" "smlalxy,mul,mla")) - "1020a_e*2,1020a_m,1020a_w") - -;; The "muls" and "mlas" instructions loop in the execute stage for -;; four iterations in order to set the flags. The value result is -;; available after three iterations. -(define_insn_reservation "1020mult4" 3 - (and (eq_attr "tune" "arm1020e,arm1022e") - (eq_attr "insn" "muls,mlas")) - "1020a_e*4,1020a_m,1020a_w") - -;; Long multiply instructions that produce two registers of -;; output (such as umull) make their results available in two cycles; -;; the least significant word is available before the most significant -;; word. That fact is not modeled; instead, the instructions are -;; described.as if the entire result was available at the end of the -;; cycle in which both words are available. - -;; The "umull", "umlal", "smull", and "smlal" instructions all take -;; three iterations through the execute cycle, and make their results -;; available after the memory cycle. -(define_insn_reservation "1020mult5" 4 - (and (eq_attr "tune" "arm1020e,arm1022e") - (eq_attr "insn" "umull,umlal,smull,smlal")) - "1020a_e*3,1020a_m,1020a_w") - -;; The "umulls", "umlals", "smulls", and "smlals" instructions loop in -;; the execute stage for five iterations in order to set the flags. -;; The value result is available after four iterations. -(define_insn_reservation "1020mult6" 4 - (and (eq_attr "tune" "arm1020e,arm1022e") - (eq_attr "insn" "umulls,umlals,smulls,smlals")) - "1020a_e*5,1020a_m,1020a_w") - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Load/Store Instructions -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; The models for load/store instructions do not accurately describe -;; the difference between operations with a base register writeback -;; (such as "ldm!"). These models assume that all memory references -;; hit in dcache. - -;; LSU instructions require six cycles to execute. They use the ALU -;; pipeline in all but the 5th cycle, and the LSU pipeline in cycles -;; three through six. -;; Loads and stores which use a scaled register offset or scaled -;; register pre-indexed addressing mode take three cycles EXCEPT for -;; those that are base + offset with LSL of 0 or 2, or base - offset -;; with LSL of zero. The remainder take 1 cycle to execute. -;; For 4byte loads there is a bypass from the load stage - -(define_insn_reservation "1020load1_op" 2 - (and (eq_attr "tune" "arm1020e,arm1022e") - (eq_attr "type" "load_byte,load1")) - "1020a_e+1020l_e,1020l_m,1020l_w") - -(define_insn_reservation "1020store1_op" 0 - (and (eq_attr "tune" "arm1020e,arm1022e") - (eq_attr "type" "store1")) - "1020a_e+1020l_e,1020l_m,1020l_w") - -;; A load's result can be stored by an immediately following store -(define_bypass 1 "1020load1_op" "1020store1_op" "arm_no_early_store_addr_dep") - -;; On a LDM/STM operation, the LSU pipeline iterates until all of the -;; registers have been processed. -;; -;; The time it takes to load the data depends on whether or not the -;; base address is 64-bit aligned; if it is not, an additional cycle -;; is required. This model assumes that the address is always 64-bit -;; aligned. Because the processor can load two registers per cycle, -;; that assumption means that we use the same instruction reservations -;; for loading 2k and 2k - 1 registers. -;; -;; The ALU pipeline is decoupled after the first cycle unless there is -;; a register dependency; the dependency is cleared as soon as the LDM/STM -;; has dealt with the corresponding register. So for example, -;; stmia sp, {r0-r3} -;; add r0, r0, #4 -;; will have one fewer stalls than -;; stmia sp, {r0-r3} -;; add r3, r3, #4 -;; -;; As with ALU operations, if one of the destination registers is the -;; PC, there are additional stalls; that is not modeled. - -(define_insn_reservation "1020load2_op" 2 - (and (eq_attr "tune" "arm1020e,arm1022e") - (eq_attr "type" "load2")) - "1020a_e+1020l_e,1020l_m,1020l_w") - -(define_insn_reservation "1020store2_op" 0 - (and (eq_attr "tune" "arm1020e,arm1022e") - (eq_attr "type" "store2")) - "1020a_e+1020l_e,1020l_m,1020l_w") - -(define_insn_reservation "1020load34_op" 3 - (and (eq_attr "tune" "arm1020e,arm1022e") - (eq_attr "type" "load3,load4")) - "1020a_e+1020l_e,1020l_e+1020l_m,1020l_m,1020l_w") - -(define_insn_reservation "1020store34_op" 0 - (and (eq_attr "tune" "arm1020e,arm1022e") - (eq_attr "type" "store3,store4")) - "1020a_e+1020l_e,1020l_e+1020l_m,1020l_m,1020l_w") - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Branch and Call Instructions -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; Branch instructions are difficult to model accurately. The ARM -;; core can predict most branches. If the branch is predicted -;; correctly, and predicted early enough, the branch can be completely -;; eliminated from the instruction stream. Some branches can -;; therefore appear to require zero cycles to execute. We assume that -;; all branches are predicted correctly, and that the latency is -;; therefore the minimum value. - -(define_insn_reservation "1020branch_op" 0 - (and (eq_attr "tune" "arm1020e,arm1022e") - (eq_attr "type" "branch")) - "1020a_e") - -;; The latency for a call is not predictable. Therefore, we use 32 as -;; roughly equivalent to positive infinity. - -(define_insn_reservation "1020call_op" 32 - (and (eq_attr "tune" "arm1020e,arm1022e") - (eq_attr "type" "call")) - "1020a_e*32") - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; VFP -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(define_cpu_unit "v10_fmac" "arm1020e") - -(define_cpu_unit "v10_ds" "arm1020e") - -(define_cpu_unit "v10_fmstat" "arm1020e") - -(define_cpu_unit "v10_ls1,v10_ls2,v10_ls3" "arm1020e") - -;; fmstat is a serializing instruction. It will stall the core until -;; the mac and ds units have completed. -(exclusion_set "v10_fmac,v10_ds" "v10_fmstat") - -(define_attr "vfp10" "yes,no" - (const (if_then_else (and (eq_attr "tune" "arm1020e,arm1022e") - (eq_attr "fpu" "vfp")) - (const_string "yes") (const_string "no")))) - -;; The VFP "type" attributes differ from those used in the FPA model. -;; ffarith Fast floating point insns, e.g. abs, neg, cpy, cmp. -;; farith Most arithmetic insns. -;; fmul Double precision multiply. -;; fdivs Single precision sqrt or division. -;; fdivd Double precision sqrt or division. -;; f_flag fmstat operation -;; f_load Floating point load from memory. -;; f_store Floating point store to memory. -;; f_2_r Transfer vfp to arm reg. -;; r_2_f Transfer arm to vfp reg. - -;; Note, no instruction can issue to the VFP if the core is stalled in the -;; first execute state. We model this by using 1020a_e in the first cycle. -(define_insn_reservation "v10_ffarith" 5 - (and (eq_attr "vfp10" "yes") - (eq_attr "type" "ffarith")) - "1020a_e+v10_fmac") - -(define_insn_reservation "v10_farith" 5 - (and (eq_attr "vfp10" "yes") - (eq_attr "type" "farith")) - "1020a_e+v10_fmac") - -(define_insn_reservation "v10_cvt" 5 - (and (eq_attr "vfp10" "yes") - (eq_attr "type" "f_cvt")) - "1020a_e+v10_fmac") - -(define_insn_reservation "v10_fmul" 6 - (and (eq_attr "vfp10" "yes") - (eq_attr "type" "fmul")) - "1020a_e+v10_fmac*2") - -(define_insn_reservation "v10_fdivs" 18 - (and (eq_attr "vfp10" "yes") - (eq_attr "type" "fdivs")) - "1020a_e+v10_ds*14") - -(define_insn_reservation "v10_fdivd" 32 - (and (eq_attr "vfp10" "yes") - (eq_attr "type" "fdivd")) - "1020a_e+v10_fmac+v10_ds*28") - -(define_insn_reservation "v10_floads" 4 - (and (eq_attr "vfp10" "yes") - (eq_attr "type" "f_loads")) - "1020a_e+1020l_e+v10_ls1,v10_ls2") - -;; We model a load of a double as needing all the vfp ls* stage in cycle 1. -;; This gives the correct mix between single-and double loads where a flds -;; followed by and fldd will stall for one cycle, but two back-to-back fldd -;; insns stall for two cycles. -(define_insn_reservation "v10_floadd" 5 - (and (eq_attr "vfp10" "yes") - (eq_attr "type" "f_loadd")) - "1020a_e+1020l_e+v10_ls1+v10_ls2+v10_ls3,v10_ls2+v10_ls3,v10_ls3") - -;; Moves to/from arm regs also use the load/store pipeline. - -(define_insn_reservation "v10_c2v" 4 - (and (eq_attr "vfp10" "yes") - (eq_attr "type" "r_2_f")) - "1020a_e+1020l_e+v10_ls1,v10_ls2") - -(define_insn_reservation "v10_fstores" 1 - (and (eq_attr "vfp10" "yes") - (eq_attr "type" "f_stores")) - "1020a_e+1020l_e+v10_ls1,v10_ls2") - -(define_insn_reservation "v10_fstored" 1 - (and (eq_attr "vfp10" "yes") - (eq_attr "type" "f_stored")) - "1020a_e+1020l_e+v10_ls1+v10_ls2+v10_ls3,v10_ls2+v10_ls3,v10_ls3") - -(define_insn_reservation "v10_v2c" 1 - (and (eq_attr "vfp10" "yes") - (eq_attr "type" "f_2_r")) - "1020a_e+1020l_e,1020l_m,1020l_w") - -(define_insn_reservation "v10_to_cpsr" 2 - (and (eq_attr "vfp10" "yes") - (eq_attr "type" "f_flag")) - "1020a_e+v10_fmstat,1020a_e+1020l_e,1020l_m,1020l_w") - -;; VFP bypasses - -;; There are bypasses for most operations other than store - -(define_bypass 3 - "v10_c2v,v10_floads" - "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd,v10_cvt") - -(define_bypass 4 - "v10_floadd" - "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd") - -;; Arithmetic to other arithmetic saves a cycle due to forwarding -(define_bypass 4 - "v10_ffarith,v10_farith" - "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd") - -(define_bypass 5 - "v10_fmul" - "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd") - -(define_bypass 17 - "v10_fdivs" - "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd") - -(define_bypass 31 - "v10_fdivd" - "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd") - -;; VFP anti-dependencies. - -;; There is one anti-dependence in the following case (not yet modelled): -;; - After a store: one extra cycle for both fsts and fstd -;; Note, back-to-back fstd instructions will overload the load/store datapath -;; causing a two-cycle stall. |