Mercurial > hg > CbC > CbC_gcc
comparison gcc/config/arm/cortex-a5.md @ 68:561a7518be6b
update gcc-4.6
author | Nobuyasu Oshiro <dimolto@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Sun, 21 Aug 2011 07:07:55 +0900 |
parents | |
children | 04ced10e8804 |
comparison
equal
deleted
inserted
replaced
67:f6334be47118 | 68:561a7518be6b |
---|---|
1 ;; ARM Cortex-A5 pipeline description | |
2 ;; Copyright (C) 2010 Free Software Foundation, Inc. | |
3 ;; Contributed by CodeSourcery. | |
4 ;; | |
5 ;; This file is part of GCC. | |
6 ;; | |
7 ;; GCC is free software; you can redistribute it and/or modify it | |
8 ;; under the terms of the GNU General Public License as published by | |
9 ;; the Free Software Foundation; either version 3, or (at your option) | |
10 ;; any later version. | |
11 ;; | |
12 ;; GCC is distributed in the hope that it will be useful, but | |
13 ;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 ;; General Public License for more details. | |
16 ;; | |
17 ;; You should have received a copy of the GNU General Public License | |
18 ;; along with GCC; see the file COPYING3. If not see | |
19 ;; <http://www.gnu.org/licenses/>. | |
20 | |
21 (define_automaton "cortex_a5") | |
22 | |
23 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
24 ;; Functional units. | |
25 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
26 | |
27 ;; The integer (ALU) pipeline. There are five DPU pipeline | |
28 ;; stages. However, the decode/issue stages operate the same for all | |
29 ;; instructions, so we do not model them. We only need to model the | |
30 ;; first execute stage because instructions always advance one stage | |
31 ;; per cycle in order. Only branch instructions may dual-issue, so a | |
32 ;; single unit covers all of the LS, ALU, MAC and FPU pipelines. | |
33 | |
34 (define_cpu_unit "cortex_a5_ex1" "cortex_a5") | |
35 | |
36 ;; The branch pipeline. Branches can dual-issue with other instructions | |
37 ;; (except when those instructions take multiple cycles to issue). | |
38 | |
39 (define_cpu_unit "cortex_a5_branch" "cortex_a5") | |
40 | |
41 ;; Pseudo-unit for blocking the multiply pipeline when a double-precision | |
42 ;; multiply is in progress. | |
43 | |
44 (define_cpu_unit "cortex_a5_fpmul_pipe" "cortex_a5") | |
45 | |
46 ;; The floating-point add pipeline (ex1/f1 stage), used to model the usage | |
47 ;; of the add pipeline by fmac instructions, etc. | |
48 | |
49 (define_cpu_unit "cortex_a5_fpadd_pipe" "cortex_a5") | |
50 | |
51 ;; Floating-point div/sqrt (long latency, out-of-order completion). | |
52 | |
53 (define_cpu_unit "cortex_a5_fp_div_sqrt" "cortex_a5") | |
54 | |
55 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
56 ;; ALU instructions. | |
57 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
58 | |
59 (define_insn_reservation "cortex_a5_alu" 2 | |
60 (and (eq_attr "tune" "cortexa5") | |
61 (eq_attr "type" "alu")) | |
62 "cortex_a5_ex1") | |
63 | |
64 (define_insn_reservation "cortex_a5_alu_shift" 2 | |
65 (and (eq_attr "tune" "cortexa5") | |
66 (eq_attr "type" "alu_shift,alu_shift_reg")) | |
67 "cortex_a5_ex1") | |
68 | |
69 ;; Forwarding path for unshifted operands. | |
70 | |
71 (define_bypass 1 "cortex_a5_alu,cortex_a5_alu_shift" | |
72 "cortex_a5_alu") | |
73 | |
74 (define_bypass 1 "cortex_a5_alu,cortex_a5_alu_shift" | |
75 "cortex_a5_alu_shift" | |
76 "arm_no_early_alu_shift_dep") | |
77 | |
78 ;; The multiplier pipeline can forward results from the wr stage only, so | |
79 ;; there's no need to specify bypasses. | |
80 | |
81 (define_insn_reservation "cortex_a5_mul" 2 | |
82 (and (eq_attr "tune" "cortexa5") | |
83 (eq_attr "type" "mult")) | |
84 "cortex_a5_ex1") | |
85 | |
86 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
87 ;; Load/store instructions. | |
88 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
89 | |
90 ;; Address-generation happens in the issue stage, which is one stage behind | |
91 ;; the ex1 stage (the first stage we care about for scheduling purposes). The | |
92 ;; dc1 stage is parallel with ex1, dc2 with ex2 and rot with wr. | |
93 | |
94 (define_insn_reservation "cortex_a5_load1" 2 | |
95 (and (eq_attr "tune" "cortexa5") | |
96 (eq_attr "type" "load_byte,load1")) | |
97 "cortex_a5_ex1") | |
98 | |
99 (define_insn_reservation "cortex_a5_store1" 0 | |
100 (and (eq_attr "tune" "cortexa5") | |
101 (eq_attr "type" "store1")) | |
102 "cortex_a5_ex1") | |
103 | |
104 (define_insn_reservation "cortex_a5_load2" 3 | |
105 (and (eq_attr "tune" "cortexa5") | |
106 (eq_attr "type" "load2")) | |
107 "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1") | |
108 | |
109 (define_insn_reservation "cortex_a5_store2" 0 | |
110 (and (eq_attr "tune" "cortexa5") | |
111 (eq_attr "type" "store2")) | |
112 "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1") | |
113 | |
114 (define_insn_reservation "cortex_a5_load3" 4 | |
115 (and (eq_attr "tune" "cortexa5") | |
116 (eq_attr "type" "load3")) | |
117 "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\ | |
118 cortex_a5_ex1") | |
119 | |
120 (define_insn_reservation "cortex_a5_store3" 0 | |
121 (and (eq_attr "tune" "cortexa5") | |
122 (eq_attr "type" "store3")) | |
123 "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\ | |
124 cortex_a5_ex1") | |
125 | |
126 (define_insn_reservation "cortex_a5_load4" 5 ; default latency 5, one more than cortex_a5_load3 | |
127 (and (eq_attr "tune" "cortexa5") | |
128 (eq_attr "type" "load4")) ; load4 insns; "load3" is already matched by cortex_a5_load3 | |
129 "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\ | |
130 cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1") | |
131 | |
132 (define_insn_reservation "cortex_a5_store4" 0 ; default latency 0, as for the other store reservations | |
133 (and (eq_attr "tune" "cortexa5") | |
134 (eq_attr "type" "store4")) ; store4 insns; "store3" is already matched by cortex_a5_store3 | |
135 "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\ | |
136 cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1") | |
137 | |
138 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
139 ;; Branches. | |
140 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
141 | |
142 ;; Direct branches are the only instructions we can dual-issue (also IT and | |
143 ;; nop, but those aren't very interesting for scheduling). (The latency here | |
144 ;; is meant to represent when the branch actually takes place, but may not be | |
145 ;; entirely correct.) | |
146 | |
147 (define_insn_reservation "cortex_a5_branch" 3 | |
148 (and (eq_attr "tune" "cortexa5") | |
149 (eq_attr "type" "branch,call")) | |
150 "cortex_a5_branch") | |
151 | |
152 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
153 ;; Floating-point arithmetic. | |
154 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
155 | |
156 (define_insn_reservation "cortex_a5_fpalu" 4 | |
157 (and (eq_attr "tune" "cortexa5") | |
158 (eq_attr "type" "ffariths, fadds, ffarithd, faddd, fcpys, fmuls, f_cvt,\ | |
159 fcmps, fcmpd")) | |
160 "cortex_a5_ex1+cortex_a5_fpadd_pipe") | |
161 | |
162 ;; For fconsts and fconstd, 8-bit immediate data is passed directly from | |
163 ;; f1 to f3 (which I think reduces the latency by one cycle). | |
164 | |
165 (define_insn_reservation "cortex_a5_fconst" 3 | |
166 (and (eq_attr "tune" "cortexa5") | |
167 (eq_attr "type" "fconsts,fconstd")) | |
168 "cortex_a5_ex1+cortex_a5_fpadd_pipe") | |
169 | |
170 ;; We should try not to attempt to issue a single-precision multiplication in | |
171 ;; the middle of a double-precision multiplication operation (the usage of | |
172 ;; cortex_a5_fpmul_pipe). ??? "fmuls" is also listed in cortex_a5_fpalu above; check which reservation actually wins. | |
173 | |
174 (define_insn_reservation "cortex_a5_fpmuls" 4 | |
175 (and (eq_attr "tune" "cortexa5") | |
176 (eq_attr "type" "fmuls")) | |
177 "cortex_a5_ex1+cortex_a5_fpmul_pipe") | |
178 | |
179 ;; For single-precision multiply-accumulate, the add (accumulate) is issued | |
180 ;; whilst the multiply is in F4. The multiply result can then be forwarded | |
181 ;; from F5 to F1. The issue unit is only used once (when we first start | |
182 ;; processing the instruction), but the usage of the FP add pipeline could | |
183 ;; block other instructions attempting to use it simultaneously. We try to | |
184 ;; avoid that using cortex_a5_fpadd_pipe. | |
185 | |
186 (define_insn_reservation "cortex_a5_fpmacs" 8 | |
187 (and (eq_attr "tune" "cortexa5") | |
188 (eq_attr "type" "fmacs")) | |
189 "cortex_a5_ex1+cortex_a5_fpmul_pipe, nothing*3, cortex_a5_fpadd_pipe") | |
190 | |
191 ;; Non-multiply instructions can issue in the middle two cycles of a | |
192 ;; double-precision multiply. Note that it isn't entirely clear when a branch | |
193 ;; can dual-issue when a multi-cycle multiplication is in progress; we ignore | |
194 ;; that for now though. | |
195 | |
196 (define_insn_reservation "cortex_a5_fpmuld" 7 | |
197 (and (eq_attr "tune" "cortexa5") | |
198 (eq_attr "type" "fmuld")) | |
199 "cortex_a5_ex1+cortex_a5_fpmul_pipe, cortex_a5_fpmul_pipe*2,\ | |
200 cortex_a5_ex1+cortex_a5_fpmul_pipe") | |
201 | |
202 (define_insn_reservation "cortex_a5_fpmacd" 11 | |
203 (and (eq_attr "tune" "cortexa5") | |
204 (eq_attr "type" "fmacd")) | |
205 "cortex_a5_ex1+cortex_a5_fpmul_pipe, cortex_a5_fpmul_pipe*2,\ | |
206 cortex_a5_ex1+cortex_a5_fpmul_pipe, nothing*3, cortex_a5_fpadd_pipe") | |
207 | |
208 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
209 ;; Floating-point divide/square root instructions. | |
210 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
211 | |
212 ;; ??? Not sure if the 14 cycles taken for single-precision divide to complete | |
213 ;; includes the time taken for the special instruction used to collect the | |
214 ;; result to travel down the multiply pipeline, or not. Assuming so. (If | |
215 ;; that's wrong, the latency should be increased by a few cycles.) | |
216 | |
217 ;; fsqrt takes one cycle less, but that is not modelled, nor is the use of the | |
218 ;; multiply pipeline to collect the divide/square-root result. | |
219 | |
220 (define_insn_reservation "cortex_a5_fdivs" 14 | |
221 (and (eq_attr "tune" "cortexa5") | |
222 (eq_attr "type" "fdivs")) | |
223 "cortex_a5_ex1, cortex_a5_fp_div_sqrt * 13") | |
224 | |
225 ;; ??? Similarly for fdivd. | |
226 | |
227 (define_insn_reservation "cortex_a5_fdivd" 29 | |
228 (and (eq_attr "tune" "cortexa5") | |
229 (eq_attr "type" "fdivd")) | |
230 "cortex_a5_ex1, cortex_a5_fp_div_sqrt * 28") | |
231 | |
232 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
233 ;; VFP to/from core transfers. | |
234 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
235 | |
236 ;; FP loads take data from wr/rot/f3. | |
237 | |
238 ;; Core-to-VFP transfers use the multiply pipeline. | |
239 | |
240 (define_insn_reservation "cortex_a5_r2f" 4 | |
241 (and (eq_attr "tune" "cortexa5") | |
242 (eq_attr "type" "r_2_f")) | |
243 "cortex_a5_ex1") | |
244 | |
245 (define_insn_reservation "cortex_a5_f2r" 2 | |
246 (and (eq_attr "tune" "cortexa5") | |
247 (eq_attr "type" "f_2_r")) | |
248 "cortex_a5_ex1") | |
249 | |
250 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
251 ;; VFP flag transfer. | |
252 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
253 | |
254 ;; ??? The flag forwarding from fmstat to the ex2 stage of the second | |
255 ;; instruction is not modeled at present. | |
256 | |
257 (define_insn_reservation "cortex_a5_f_flags" 4 | |
258 (and (eq_attr "tune" "cortexa5") | |
259 (eq_attr "type" "f_flag")) | |
260 "cortex_a5_ex1") | |
261 | |
262 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
263 ;; VFP load/store. | |
264 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
265 | |
266 (define_insn_reservation "cortex_a5_f_loads" 4 | |
267 (and (eq_attr "tune" "cortexa5") | |
268 (eq_attr "type" "f_loads")) | |
269 "cortex_a5_ex1") | |
270 | |
271 (define_insn_reservation "cortex_a5_f_loadd" 5 | |
272 (and (eq_attr "tune" "cortexa5") | |
273 (eq_attr "type" "f_loadd")) | |
274 "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1") | |
275 | |
276 (define_insn_reservation "cortex_a5_f_stores" 0 | |
277 (and (eq_attr "tune" "cortexa5") | |
278 (eq_attr "type" "f_stores")) | |
279 "cortex_a5_ex1") | |
280 | |
281 (define_insn_reservation "cortex_a5_f_stored" 0 | |
282 (and (eq_attr "tune" "cortexa5") | |
283 (eq_attr "type" "f_stored")) | |
284 "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1") | |
285 | |
286 ;; Load-to-use for floating-point values has a penalty of one cycle, | |
287 ;; i.e. a latency of two (three for double-precision loads, see below). | |
288 | |
289 (define_bypass 2 "cortex_a5_f_loads" | |
290 "cortex_a5_fpalu, cortex_a5_fpmacs, cortex_a5_fpmuld,\ | |
291 cortex_a5_fpmacd, cortex_a5_fdivs, cortex_a5_fdivd,\ | |
292 cortex_a5_f2r") | |
293 | |
294 (define_bypass 3 "cortex_a5_f_loadd" | |
295 "cortex_a5_fpalu, cortex_a5_fpmacs, cortex_a5_fpmuld,\ | |
296 cortex_a5_fpmacd, cortex_a5_fdivs, cortex_a5_fdivd,\ | |
297 cortex_a5_f2r") |