0
|
1 (* Emission of the core of the Cortex-A8 NEON scheduling description.
|
|
2 Copyright (C) 2007 Free Software Foundation, Inc.
|
|
3 Contributed by CodeSourcery.
|
|
4
|
|
5 This file is part of GCC.
|
|
6
|
|
7 GCC is free software; you can redistribute it and/or modify it under
|
|
8 the terms of the GNU General Public License as published by the Free
|
|
9 Software Foundation; either version 3, or (at your option) any later
|
|
10 version.
|
|
11
|
|
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
15 for more details.
|
|
16
|
|
17 You should have received a copy of the GNU General Public License
|
|
18 along with GCC; see the file COPYING3. If not see
|
|
19 <http://www.gnu.org/licenses/>.
|
|
20 *)
|
|
21
|
|
22 (* This scheduling description generator works as follows.
|
|
23 - Each group of instructions has source and destination requirements
|
|
24 specified. The source requirements may be specified using
|
|
25 Source (the stage at which all source operands not otherwise
|
|
26 described are read), Source_m (the stage at which Rm operands are
|
|
27 read), Source_n (likewise for Rn) and Source_d (likewise for Rd).
|
|
28 - For each group of instructions the earliest stage where a source
|
|
29 operand may be required is calculated.
|
|
30 - Each group of instructions is selected in turn as a producer.
|
|
31 The latencies between this group and every other group are then
|
|
32 calculated, yielding up to four values for each combination:
|
|
33 1. Producer -> consumer Rn latency
|
|
34 2. Producer -> consumer Rm latency
|
|
35 3. Producer -> consumer Rd (as a source) latency
|
|
36 4. Producer -> consumer worst-case latency.
|
|
37 Value 4 is calculated from the destination availability requirements
|
|
38 of the consumer and the earliest source availability requirements
|
|
39 of the producer.
|
|
40 - The largest Value 4 calculated for the current producer is the
|
|
41 worse-case latency, L, for that instruction group. This value is written
|
|
42 out in a define_insn_reservation for the producer group.
|
|
43 - For each producer and consumer pair, the latencies calculated above
|
|
44 are collated. The average (of up to four values) is calculated and
|
|
45 if this average is different from the worst-case latency, an
|
|
46 unguarded define_bypass construction is issued for that pair.
|
|
47 (For each pair only one define_bypass construction will be emitted,
|
|
48 and at present we do not emit specific guards.)
|
|
49 *)
|
|
50
|
|
51 open Utils
|
|
52
|
|
53 let n1 = 1 and n2 = 2 and n3 = 3 and n4 = 4 and n5 = 5 and n6 = 6
|
|
54 and n7 = 7 and n8 = 8 and n9 = 9
|
|
55
|
|
56 type availability = Source of int
|
|
57 | Source_n of int
|
|
58 | Source_m of int
|
|
59 | Source_d of int
|
|
60 | Dest of int
|
|
61 | Dest_n_after of int * int
|
|
62
|
|
63 type guard = Guard_none | Guard_only_m | Guard_only_n | Guard_only_d
|
|
64
|
|
65 (* Reservation behaviors. All but the last row here correspond to one
|
|
66 pipeline each. Each constructor will correspond to one
|
|
67 define_reservation. *)
|
|
68 type reservation =
|
|
69 Mul | Mul_2cycle | Mul_4cycle
|
|
70 | Shift | Shift_2cycle
|
|
71 | ALU | ALU_2cycle
|
|
72 | Fmul | Fmul_2cycle
|
|
73 | Fadd | Fadd_2cycle
|
|
74 (* | VFP *)
|
|
75 | Permute of int
|
|
76 | Ls of int
|
|
77 | Fmul_then_fadd | Fmul_then_fadd_2
|
|
78
|
|
79 (* This table must be kept as short as possible by conflating
|
|
80 entries with the same availability behavior.
|
|
81
|
|
82 First components: instruction group names
|
|
83 Second components: availability requirements, in the order in which
|
|
84 they should appear in the comments in the .md file.
|
|
85 Third components: reservation info
|
|
86 *)
|
|
87 let availability_table = [
|
|
88 (* NEON integer ALU instructions. *)
|
|
89 (* vbit vbif vbsl vorr vbic vnot vcls vclz vcnt vadd vand vorr
|
|
90 veor vbic vorn ddd qqq *)
|
|
91 "neon_int_1", [Source n2; Dest n3], ALU;
|
|
92 (* vadd vsub qqd vsub ddd qqq *)
|
|
93 "neon_int_2", [Source_m n1; Source_n n2; Dest n3], ALU;
|
|
94 (* vsum vneg dd qq vadd vsub qdd *)
|
|
95 "neon_int_3", [Source n1; Dest n3], ALU;
|
|
96 (* vabs vceqz vcgez vcbtz vclez vcltz vadh vradh vsbh vrsbh dqq *)
|
|
97 (* vhadd vrhadd vqadd vtst ddd qqq *)
|
|
98 "neon_int_4", [Source n2; Dest n4], ALU;
|
|
99 (* vabd qdd vhsub vqsub vabd vceq vcge vcgt vmax vmin vfmx vfmn ddd ddd *)
|
|
100 "neon_int_5", [Source_m n1; Source_n n2; Dest n4], ALU;
|
|
101 (* vqneg vqabs dd qq *)
|
|
102 "neon_vqneg_vqabs", [Source n1; Dest n4], ALU;
|
|
103 (* vmov vmvn *)
|
|
104 "neon_vmov", [Dest n3], ALU;
|
|
105 (* vaba *)
|
|
106 "neon_vaba", [Source_n n2; Source_m n1; Source_d n3; Dest n6], ALU;
|
|
107 "neon_vaba_qqq",
|
|
108 [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)], ALU_2cycle;
|
|
109 (* vsma *)
|
|
110 "neon_vsma", [Source_m n1; Source_d n3; Dest n6], ALU;
|
|
111
|
|
112 (* NEON integer multiply instructions. *)
|
|
113 (* vmul, vqdmlh, vqrdmlh *)
|
|
114 (* vmul, vqdmul, qdd 16/8 long 32/16 long *)
|
|
115 "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long", [Source n2; Dest n6], Mul;
|
|
116 "neon_mul_qqq_8_16_32_ddd_32", [Source n2; Dest_n_after (1, n6)], Mul_2cycle;
|
|
117 (* vmul, vqdmul again *)
|
|
118 "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar",
|
|
119 [Source_n n2; Source_m n1; Dest_n_after (1, n6)], Mul_2cycle;
|
|
120 (* vmla, vmls *)
|
|
121 "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long",
|
|
122 [Source_n n2; Source_m n2; Source_d n3; Dest n6], Mul;
|
|
123 "neon_mla_qqq_8_16",
|
|
124 [Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n6)], Mul_2cycle;
|
|
125 "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long",
|
|
126 [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)], Mul_2cycle;
|
|
127 "neon_mla_qqq_32_qqd_32_scalar",
|
|
128 [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (3, n6)], Mul_4cycle;
|
|
129 (* vmul, vqdmulh, vqrdmulh *)
|
|
130 (* vmul, vqdmul *)
|
|
131 "neon_mul_ddd_16_scalar_32_16_long_scalar",
|
|
132 [Source_n n2; Source_m n1; Dest n6], Mul;
|
|
133 "neon_mul_qqd_32_scalar",
|
|
134 [Source_n n2; Source_m n1; Dest_n_after (3, n6)], Mul_4cycle;
|
|
135 (* vmla, vmls *)
|
|
136 (* vmla, vmla, vqdmla, vqdmls *)
|
|
137 "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar",
|
|
138 [Source_n n2; Source_m n1; Source_d n3; Dest n6], Mul;
|
|
139
|
|
140 (* NEON integer shift instructions. *)
|
|
141 (* vshr/vshl immediate, vshr_narrow, vshl_vmvh, vsli_vsri_ddd *)
|
|
142 "neon_shift_1", [Source n1; Dest n3], Shift;
|
|
143 (* vqshl, vrshr immediate; vqshr, vqmov, vrshr, vqrshr narrow;
|
|
144 vqshl_vrshl_vqrshl_ddd *)
|
|
145 "neon_shift_2", [Source n1; Dest n4], Shift;
|
|
146 (* vsli, vsri and vshl for qqq *)
|
|
147 "neon_shift_3", [Source n1; Dest_n_after (1, n3)], Shift_2cycle;
|
|
148 "neon_vshl_ddd", [Source n1; Dest n1], Shift;
|
|
149 "neon_vqshl_vrshl_vqrshl_qqq", [Source n1; Dest_n_after (1, n4)],
|
|
150 Shift_2cycle;
|
|
151 "neon_vsra_vrsra", [Source_m n1; Source_d n3; Dest n6], Shift;
|
|
152
|
|
153 (* NEON floating-point instructions. *)
|
|
154 (* vadd, vsub, vabd, vmul, vceq, vcge, vcgt, vcage, vcagt, vmax, vmin *)
|
|
155 (* vabs, vneg, vceqz, vcgez, vcgtz, vclez, vcltz, vrecpe, vrsqrte, vcvt *)
|
|
156 "neon_fp_vadd_ddd_vabs_dd", [Source n2; Dest n5], Fadd;
|
|
157 "neon_fp_vadd_qqq_vabs_qq", [Source n2; Dest_n_after (1, n5)],
|
|
158 Fadd_2cycle;
|
|
159 (* vsum, fvmx, vfmn *)
|
|
160 "neon_fp_vsum", [Source n1; Dest n5], Fadd;
|
|
161 "neon_fp_vmul_ddd", [Source_n n2; Source_m n1; Dest n5], Fmul;
|
|
162 "neon_fp_vmul_qqd", [Source_n n2; Source_m n1; Dest_n_after (1, n5)],
|
|
163 Fmul_2cycle;
|
|
164 (* vmla, vmls *)
|
|
165 "neon_fp_vmla_ddd",
|
|
166 [Source_n n2; Source_m n2; Source_d n3; Dest n9], Fmul_then_fadd;
|
|
167 "neon_fp_vmla_qqq",
|
|
168 [Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n9)],
|
|
169 Fmul_then_fadd_2;
|
|
170 "neon_fp_vmla_ddd_scalar",
|
|
171 [Source_n n2; Source_m n1; Source_d n3; Dest n9], Fmul_then_fadd;
|
|
172 "neon_fp_vmla_qqq_scalar",
|
|
173 [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n9)],
|
|
174 Fmul_then_fadd_2;
|
|
175 "neon_fp_vrecps_vrsqrts_ddd", [Source n2; Dest n9], Fmul_then_fadd;
|
|
176 "neon_fp_vrecps_vrsqrts_qqq", [Source n2; Dest_n_after (1, n9)],
|
|
177 Fmul_then_fadd_2;
|
|
178
|
|
179 (* NEON byte permute instructions. *)
|
|
180 (* vmov; vtrn and vswp for dd; vzip for dd; vuzp for dd; vrev; vext for dd *)
|
|
181 "neon_bp_simple", [Source n1; Dest n2], Permute 1;
|
|
182 (* vswp for qq; vext for qqq; vtbl with {Dn} or {Dn, Dn1};
|
|
183 similarly for vtbx *)
|
|
184 "neon_bp_2cycle", [Source n1; Dest_n_after (1, n2)], Permute 2;
|
|
185 (* all the rest *)
|
|
186 "neon_bp_3cycle", [Source n1; Dest_n_after (2, n2)], Permute 3;
|
|
187
|
|
188 (* NEON load/store instructions. *)
|
|
189 "neon_ldr", [Dest n1], Ls 1;
|
|
190 "neon_str", [Source n1], Ls 1;
|
|
191 "neon_vld1_1_2_regs", [Dest_n_after (1, n1)], Ls 2;
|
|
192 "neon_vld1_3_4_regs", [Dest_n_after (2, n1)], Ls 3;
|
|
193 "neon_vld2_2_regs_vld1_vld2_all_lanes", [Dest_n_after (1, n2)], Ls 2;
|
|
194 "neon_vld2_4_regs", [Dest_n_after (2, n2)], Ls 3;
|
|
195 "neon_vld3_vld4", [Dest_n_after (3, n2)], Ls 4;
|
|
196 "neon_vst1_1_2_regs_vst2_2_regs", [Source n1], Ls 2;
|
|
197 "neon_vst1_3_4_regs", [Source n1], Ls 3;
|
|
198 "neon_vst2_4_regs_vst3_vst4", [Source n1], Ls 4;
|
|
199 "neon_vst3_vst4", [Source n1], Ls 4;
|
|
200 "neon_vld1_vld2_lane", [Source n1; Dest_n_after (2, n2)], Ls 3;
|
|
201 "neon_vld3_vld4_lane", [Source n1; Dest_n_after (4, n2)], Ls 5;
|
|
202 "neon_vst1_vst2_lane", [Source n1], Ls 2;
|
|
203 "neon_vst3_vst4_lane", [Source n1], Ls 3;
|
|
204 "neon_vld3_vld4_all_lanes", [Dest_n_after (1, n2)], Ls 3;
|
|
205
|
|
206 (* NEON register transfer instructions. *)
|
|
207 "neon_mcr", [Dest n2], Permute 1;
|
|
208 "neon_mcr_2_mcrr", [Dest n2], Permute 2;
|
|
209 (* MRC instructions are in the .tpl file. *)
|
|
210 ]
|
|
211
|
|
212 (* Augment the tuples in the availability table with an extra component
|
|
213 that describes the earliest stage where a source operand may be
|
|
214 required. (It is also possible that an entry in the table has no
|
|
215 source requirements.) *)
|
|
216 let calculate_sources =
|
|
217 List.map (fun (name, avail, res) ->
|
|
218 let earliest_stage =
|
|
219 List.fold_left
|
|
220 (fun cur -> fun info ->
|
|
221 match info with
|
|
222 Source stage
|
|
223 | Source_n stage
|
|
224 | Source_m stage
|
|
225 | Source_d stage ->
|
|
226 (match cur with
|
|
227 None -> Some stage
|
|
228 | Some stage' when stage < stage' -> Some stage
|
|
229 | _ -> cur)
|
|
230 | _ -> cur) None avail
|
|
231 in
|
|
232 (name, avail, res, earliest_stage))
|
|
233
|
|
234 (* Find the stage, if any, at the end of which a group produces a result. *)
|
|
235 let find_dest (attr, avail, _, _) =
|
|
236 try
|
|
237 find_with_result
|
|
238 (fun av -> match av with
|
|
239 Dest st -> Some (Some st)
|
|
240 | Dest_n_after (after, st) -> Some (Some (after + st))
|
|
241 | _ -> None) avail
|
|
242 with Not_found -> None
|
|
243
|
|
244 (* Find the worst-case latency between a producer and a consumer. *)
|
|
245 let worst_case_latency producer (_, _, _, earliest_required) =
|
|
246 let dest = find_dest producer in
|
|
247 match earliest_required, dest with
|
|
248 None, _ ->
|
|
249 (* The consumer doesn't have any source requirements. *)
|
|
250 None
|
|
251 | _, None ->
|
|
252 (* The producer doesn't produce any results (e.g. a store insn). *)
|
|
253 None
|
|
254 | Some consumed, Some produced -> Some (produced - consumed + 1)
|
|
255
|
|
256 (* Helper function for below. *)
|
|
257 let latency_calc f producer (_, avail, _, _) =
|
|
258 try
|
|
259 let source_avail = find_with_result f avail in
|
|
260 match find_dest producer with
|
|
261 None ->
|
|
262 (* The producer does not produce a result. *)
|
|
263 Some 0
|
|
264 | Some produced ->
|
|
265 let latency = produced - source_avail + 1 in
|
|
266 (* Latencies below zero are raised to zero since we don't have
|
|
267 delay slots. *)
|
|
268 if latency < 0 then Some 0 else Some latency
|
|
269 with Not_found -> None
|
|
270
|
|
271 (* Find any Rm latency between a producer and a consumer. If no
|
|
272 Rm source requirement is explicitly specified for the consumer,
|
|
273 return "positive infinity". Also return "positive infinity" if
|
|
274 the latency matches the supplied worst-case latency for this
|
|
275 producer. *)
|
|
276 let get_m_latency producer consumer =
|
|
277 match latency_calc (fun av -> match av with Source_m stage -> Some stage
|
|
278 | _ -> None) producer consumer
|
|
279 with None -> [] | Some latency -> [(Guard_only_m, latency)]
|
|
280
|
|
281 (* Likewise for Rn. *)
|
|
282 let get_n_latency producer consumer =
|
|
283 match latency_calc (fun av -> match av with Source_n stage -> Some stage
|
|
284 | _ -> None) producer consumer
|
|
285 with None -> [] | Some latency -> [(Guard_only_n, latency)]
|
|
286
|
|
287 (* Likewise for Rd. *)
|
|
288 let get_d_latency producer consumer =
|
|
289 match
|
|
290 latency_calc (fun av -> match av with Source_d stage -> Some stage
|
|
291 | _ -> None) producer consumer
|
|
292 with None -> [] | Some latency -> [(Guard_only_d, latency)]
|
|
293
|
|
294 (* Given a producer and a consumer, work out the latency of the producer
|
|
295 to the consumer in each of the four cases (availability information
|
|
296 permitting) identified at the top of this file. Return the
|
|
297 consumer, the worst-case unguarded latency and any guarded latencies. *)
|
|
298 let calculate_latencies producer consumer =
|
|
299 let worst = worst_case_latency producer consumer in
|
|
300 let m_latency = get_m_latency producer consumer in
|
|
301 let n_latency = get_n_latency producer consumer in
|
|
302 let d_latency = get_d_latency producer consumer in
|
|
303 (consumer, worst, m_latency @ n_latency @ d_latency)
|
|
304
|
|
305 (* Helper function for below. *)
|
|
306 let pick_latency largest worst guards =
|
|
307 let guards =
|
|
308 match worst with
|
|
309 None -> guards
|
|
310 | Some worst -> (Guard_none, worst) :: guards
|
|
311 in
|
|
312 if List.length guards = 0 then None else
|
|
313 let total_latency =
|
|
314 List.fold_left (fun acc -> fun (_, latency) -> acc + latency) 0 guards
|
|
315 in
|
|
316 let average_latency = (float_of_int total_latency) /.
|
|
317 (float_of_int (List.length guards)) in
|
|
318 let rounded_latency = int_of_float (ceil average_latency) in
|
|
319 if rounded_latency = largest then None
|
|
320 else Some (Guard_none, rounded_latency)
|
|
321
|
|
322 (* Collate all bypasses for a particular producer as required in
|
|
323 worst_case_latencies_and_bypasses. (By this stage there is a maximum
|
|
324 of one bypass from this producer to any particular consumer listed
|
|
325 in LATENCIES.) Use a hash table to collate bypasses with the
|
|
326 same latency and guard. *)
|
|
327 let collate_bypasses (producer_name, _, _, _) largest latencies =
|
|
328 let ht = Hashtbl.create 42 in
|
|
329 let keys = ref [] in
|
|
330 List.iter (
|
|
331 fun ((consumer, _, _, _), worst, guards) ->
|
|
332 (* Find out which latency to use. Ignoring latencies that match
|
|
333 the *overall* worst-case latency for this producer (which will
|
|
334 be in define_insn_reservation), we have to examine:
|
|
335 1. the latency with no guard between this producer and this
|
|
336 consumer; and
|
|
337 2. any guarded latency. *)
|
|
338 let guard_latency_opt = pick_latency largest worst guards in
|
|
339 match guard_latency_opt with
|
|
340 None -> ()
|
|
341 | Some (guard, latency) ->
|
|
342 begin
|
|
343 (if (try ignore (Hashtbl.find ht (guard, latency)); false
|
|
344 with Not_found -> true) then
|
|
345 keys := (guard, latency) :: !keys);
|
|
346 Hashtbl.add ht (guard, latency) consumer
|
|
347 end
|
|
348 ) latencies;
|
|
349 (* The hash table now has bypasses collated so that ones with the
|
|
350 same latency and guard have the same keys. Walk through all the
|
|
351 keys, extract the associated bypasses, and concatenate the names
|
|
352 of the consumers for each bypass. *)
|
|
353 List.map (
|
|
354 fun ((guard, latency) as key) ->
|
|
355 let consumers = Hashtbl.find_all ht key in
|
|
356 (producer_name,
|
|
357 String.concat ",\\\n " consumers,
|
|
358 latency,
|
|
359 guard)
|
|
360 ) !keys
|
|
361
|
|
362 (* For every producer, find the worst-case latency between it and
|
|
363 *any* consumer. Also determine (if such a thing exists) the
|
|
364 lowest-latency bypass from each producer to each consumer. Group
|
|
365 the output in such a way that all bypasses with the same producer
|
|
366 and latency are together, and so that bypasses with the worst-case
|
|
367 latency are ignored. *)
|
|
368 let worst_case_latencies_and_bypasses =
|
|
369 let rec f (worst_acc, bypasses_acc) prev xs =
|
|
370 match xs with
|
|
371 [] -> (worst_acc, bypasses_acc)
|
|
372 | ((producer_name, producer_avail, res_string, _) as producer)::next ->
|
|
373 (* For this particular producer, work out the latencies between
|
|
374 it and every consumer. *)
|
|
375 let latencies =
|
|
376 List.fold_left (fun acc -> fun consumer ->
|
|
377 (calculate_latencies producer consumer) :: acc)
|
|
378 [] (prev @ xs)
|
|
379 in
|
|
380 (* Now work out what the overall worst case latency was for this
|
|
381 particular producer. *)
|
|
382 match latencies with
|
|
383 [] -> assert false
|
|
384 | _ ->
|
|
385 let comp_fn (_, l1, _) (_, l2, _) =
|
|
386 if l1 > l2 then -1 else if l1 = l2 then 0 else 1
|
|
387 in
|
|
388 let largest =
|
|
389 match List.hd (List.sort comp_fn latencies) with
|
|
390 (_, None, _) -> 0 (* Producer has no consumers. *)
|
|
391 | (_, Some worst, _) -> worst
|
|
392 in
|
|
393 (* Having got the largest latency, collect all bypasses for
|
|
394 this producer and filter out those with that larger
|
|
395 latency. Record the others for later emission. *)
|
|
396 let bypasses = collate_bypasses producer largest latencies in
|
|
397 (* Go on to process remaining producers, having noted
|
|
398 the result for this one. *)
|
|
399 f ((producer_name, producer_avail, largest,
|
|
400 res_string) :: worst_acc,
|
|
401 bypasses @ bypasses_acc)
|
|
402 (prev @ [producer]) next
|
|
403 in
|
|
404 f ([], []) []
|
|
405
|
|
406 (* Emit a helpful comment for a define_insn_reservation. *)
|
|
407 let write_comment producer avail =
|
|
408 let seen_source = ref false in
|
|
409 let describe info =
|
|
410 let read = if !seen_source then "" else "read " in
|
|
411 match info with
|
|
412 Source stage ->
|
|
413 seen_source := true;
|
|
414 Printf.printf "%stheir source operands at N%d" read stage
|
|
415 | Source_n stage ->
|
|
416 seen_source := true;
|
|
417 Printf.printf "%stheir (D|Q)n operands at N%d" read stage
|
|
418 | Source_m stage ->
|
|
419 seen_source := true;
|
|
420 Printf.printf "%stheir (D|Q)m operands at N%d" read stage
|
|
421 | Source_d stage ->
|
|
422 Printf.printf "%stheir (D|Q)d operands at N%d" read stage
|
|
423 | Dest stage ->
|
|
424 Printf.printf "produce a result at N%d" stage
|
|
425 | Dest_n_after (after, stage) ->
|
|
426 Printf.printf "produce a result at N%d on cycle %d" stage (after + 1)
|
|
427 in
|
|
428 Printf.printf ";; Instructions using this reservation ";
|
|
429 let rec f infos x =
|
|
430 let sep = if x mod 2 = 1 then "" else "\n;;" in
|
|
431 match infos with
|
|
432 [] -> assert false
|
|
433 | [info] -> describe info; Printf.printf ".\n"
|
|
434 | info::(_::[] as infos) ->
|
|
435 describe info; Printf.printf ", and%s " sep; f infos (x+1)
|
|
436 | info::infos -> describe info; Printf.printf ",%s " sep; f infos (x+1)
|
|
437 in
|
|
438 f avail 0
|
|
439
|
|
440 (* Emit a define_insn_reservation for each producer. The latency
|
|
441 written in will be its worst-case latency. *)
|
|
442 let emit_insn_reservations =
|
|
443 List.iter (
|
|
444 fun (producer, avail, latency, reservation) ->
|
|
445 write_comment producer avail;
|
|
446 Printf.printf "(define_insn_reservation \"%s\" %d\n" producer latency;
|
|
447 Printf.printf " (and (eq_attr \"tune\" \"cortexa8\")\n";
|
|
448 Printf.printf " (eq_attr \"neon_type\" \"%s\"))\n" producer;
|
|
449 let str =
|
|
450 match reservation with
|
|
451 Mul -> "dp" | Mul_2cycle -> "dp_2" | Mul_4cycle -> "dp_4"
|
|
452 | Shift -> "dp" | Shift_2cycle -> "dp_2"
|
|
453 | ALU -> "dp" | ALU_2cycle -> "dp_2"
|
|
454 | Fmul -> "dp" | Fmul_2cycle -> "dp_2"
|
|
455 | Fadd -> "fadd" | Fadd_2cycle -> "fadd_2"
|
|
456 | Ls 1 -> "ls"
|
|
457 | Ls n -> "ls_" ^ (string_of_int n)
|
|
458 | Permute 1 -> "perm"
|
|
459 | Permute n -> "perm_" ^ (string_of_int n)
|
|
460 | Fmul_then_fadd -> "fmul_then_fadd"
|
|
461 | Fmul_then_fadd_2 -> "fmul_then_fadd_2"
|
|
462 in
|
|
463 Printf.printf " \"cortex_a8_neon_%s\")\n\n" str
|
|
464 )
|
|
465
|
|
466 (* Given a guard description, return the name of the C function to
|
|
467 be used as the guard for define_bypass. *)
|
|
468 let guard_fn g =
|
|
469 match g with
|
|
470 Guard_only_m -> "arm_neon_only_m_dependency"
|
|
471 | Guard_only_n -> "arm_neon_only_n_dependency"
|
|
472 | Guard_only_d -> "arm_neon_only_d_dependency"
|
|
473 | Guard_none -> assert false
|
|
474
|
|
475 (* Emit a define_bypass for each bypass. *)
|
|
476 let emit_bypasses =
|
|
477 List.iter (
|
|
478 fun (producer, consumers, latency, guard) ->
|
|
479 Printf.printf "(define_bypass %d \"%s\"\n" latency producer;
|
|
480 if guard = Guard_none then
|
|
481 Printf.printf " \"%s\")\n\n" consumers
|
|
482 else
|
|
483 begin
|
|
484 Printf.printf " \"%s\"\n" consumers;
|
|
485 Printf.printf " \"%s\")\n\n" (guard_fn guard)
|
|
486 end
|
|
487 )
|
|
488
|
|
489 (* Program entry point. *)
|
|
490 let main =
|
|
491 let table = calculate_sources availability_table in
|
|
492 let worst_cases, bypasses = worst_case_latencies_and_bypasses table in
|
|
493 emit_insn_reservations (List.rev worst_cases);
|
|
494 Printf.printf ";; Exceptions to the default latencies.\n\n";
|
|
495 emit_bypasses bypasses
|
|
496
|