0
|
1 /* Linear Loop transforms
|
|
2 Copyright (C) 2003, 2004, 2005, 2007, 2008, 2009
|
|
3 Free Software Foundation, Inc.
|
|
4 Contributed by Daniel Berlin <dberlin@dberlin.org>.
|
|
5
|
|
6 This file is part of GCC.
|
|
7
|
|
8 GCC is free software; you can redistribute it and/or modify it under
|
|
9 the terms of the GNU General Public License as published by the Free
|
|
10 Software Foundation; either version 3, or (at your option) any later
|
|
11 version.
|
|
12
|
|
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
16 for more details.
|
|
17
|
|
18 You should have received a copy of the GNU General Public License
|
|
19 along with GCC; see the file COPYING3. If not see
|
|
20 <http://www.gnu.org/licenses/>. */
|
|
21
|
|
22
|
|
23 #include "config.h"
|
|
24 #include "system.h"
|
|
25 #include "coretypes.h"
|
|
26 #include "tm.h"
|
|
27 #include "ggc.h"
|
|
28 #include "tree.h"
|
|
29 #include "target.h"
|
|
30
|
|
31 #include "rtl.h"
|
|
32 #include "basic-block.h"
|
|
33 #include "diagnostic.h"
|
|
34 #include "obstack.h"
|
|
35 #include "tree-flow.h"
|
|
36 #include "tree-dump.h"
|
|
37 #include "timevar.h"
|
|
38 #include "cfgloop.h"
|
|
39 #include "expr.h"
|
|
40 #include "optabs.h"
|
|
41 #include "tree-chrec.h"
|
|
42 #include "tree-data-ref.h"
|
|
43 #include "tree-scalar-evolution.h"
|
|
44 #include "tree-pass.h"
|
|
45 #include "lambda.h"
|
|
46
|
|
47 /* Linear loop transforms include any composition of interchange,
|
|
48 scaling, skewing, and reversal. They are used to change the
|
|
49 iteration order of loop nests in order to optimize data locality of
|
|
50 traversals, or remove dependences that prevent
|
|
51 parallelization/vectorization/etc.
|
|
52
|
|
53 TODO: Determine reuse vectors/matrix and use it to determine optimal
|
|
54 transform matrix for locality purposes.
|
|
55 TODO: Completion of partial transforms. */
|
|
56
|
|
57 /* Gather statistics for loop interchange. LOOP is the loop being
|
|
58 considered. The first loop in the considered loop nest is
|
|
59 FIRST_LOOP, and consequently, the index of the considered loop is
|
|
60 obtained by LOOP->DEPTH - FIRST_LOOP->DEPTH
|
|
61
|
|
62 Initializes:
|
|
63 - DEPENDENCE_STEPS the sum of all the data dependence distances
|
|
64 carried by loop LOOP,
|
|
65
|
|
66 - NB_DEPS_NOT_CARRIED_BY_LOOP the number of dependence relations
|
|
67 for which the loop LOOP is not carrying any dependence,
|
|
68
|
|
69 - ACCESS_STRIDES the sum of all the strides in LOOP.
|
|
70
|
|
71 Example: for the following loop,
|
|
72
|
|
73 | loop_1 runs 1335 times
|
|
74 | loop_2 runs 1335 times
|
|
75 | A[{{0, +, 1}_1, +, 1335}_2]
|
|
76 | B[{{0, +, 1}_1, +, 1335}_2]
|
|
77 | endloop_2
|
|
78 | A[{0, +, 1336}_1]
|
|
79 | endloop_1
|
|
80
|
|
81 gather_interchange_stats (in loop_1) will return
|
|
82 DEPENDENCE_STEPS = 3002
|
|
83 NB_DEPS_NOT_CARRIED_BY_LOOP = 5
|
|
84 ACCESS_STRIDES = 10694
|
|
85
|
|
86 gather_interchange_stats (in loop_2) will return
|
|
87 DEPENDENCE_STEPS = 3000
|
|
88 NB_DEPS_NOT_CARRIED_BY_LOOP = 7
|
|
89 ACCESS_STRIDES = 8010
|
|
90 */
|
|
91
|
|
92 static void
|
|
93 gather_interchange_stats (VEC (ddr_p, heap) *dependence_relations ATTRIBUTE_UNUSED,
|
|
94 VEC (data_reference_p, heap) *datarefs ATTRIBUTE_UNUSED,
|
|
95 struct loop *loop ATTRIBUTE_UNUSED,
|
|
96 struct loop *first_loop ATTRIBUTE_UNUSED,
|
|
97 unsigned int *dependence_steps ATTRIBUTE_UNUSED,
|
|
98 unsigned int *nb_deps_not_carried_by_loop ATTRIBUTE_UNUSED,
|
|
99 double_int *access_strides ATTRIBUTE_UNUSED)
|
|
100 {
|
|
101 unsigned int i, j;
|
|
102 struct data_dependence_relation *ddr;
|
|
103 struct data_reference *dr;
|
|
104
|
|
105 *dependence_steps = 0;
|
|
106 *nb_deps_not_carried_by_loop = 0;
|
|
107 *access_strides = double_int_zero;
|
|
108
|
|
109 for (i = 0; VEC_iterate (ddr_p, dependence_relations, i, ddr); i++)
|
|
110 {
|
|
111 /* If we don't know anything about this dependence, or the distance
|
|
112 vector is NULL, or there is no dependence, then there is no reuse of
|
|
113 data. */
|
|
114 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
|
|
115 || DDR_ARE_DEPENDENT (ddr) == chrec_known
|
|
116 || DDR_NUM_DIST_VECTS (ddr) == 0)
|
|
117 continue;
|
|
118
|
|
119 for (j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++)
|
|
120 {
|
|
121 int dist = DDR_DIST_VECT (ddr, j)[loop_depth (loop) - loop_depth (first_loop)];
|
|
122
|
|
123 if (dist == 0)
|
|
124 (*nb_deps_not_carried_by_loop) += 1;
|
|
125
|
|
126 else if (dist < 0)
|
|
127 (*dependence_steps) += -dist;
|
|
128
|
|
129 else
|
|
130 (*dependence_steps) += dist;
|
|
131 }
|
|
132 }
|
|
133
|
|
134 /* Compute the access strides. */
|
|
135 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
|
|
136 {
|
|
137 unsigned int it;
|
|
138 tree ref = DR_REF (dr);
|
|
139 gimple stmt = DR_STMT (dr);
|
|
140 struct loop *stmt_loop = loop_containing_stmt (stmt);
|
|
141 struct loop *inner_loop = first_loop->inner;
|
|
142
|
|
143 if (inner_loop != stmt_loop
|
|
144 && !flow_loop_nested_p (inner_loop, stmt_loop))
|
|
145 continue;
|
|
146
|
|
147 for (it = 0; it < DR_NUM_DIMENSIONS (dr);
|
|
148 it++, ref = TREE_OPERAND (ref, 0))
|
|
149 {
|
|
150 int num = am_vector_index_for_loop (DR_ACCESS_MATRIX (dr), loop->num);
|
|
151 int istride = AM_GET_ACCESS_MATRIX_ELEMENT (DR_ACCESS_MATRIX (dr), it, num);
|
|
152 tree array_size = TYPE_SIZE (TREE_TYPE (ref));
|
|
153 double_int dstride;
|
|
154
|
|
155 if (array_size == NULL_TREE
|
|
156 || TREE_CODE (array_size) != INTEGER_CST)
|
|
157 continue;
|
|
158
|
|
159 dstride = double_int_mul (tree_to_double_int (array_size),
|
|
160 shwi_to_double_int (istride));
|
|
161 (*access_strides) = double_int_add (*access_strides, dstride);
|
|
162 }
|
|
163 }
|
|
164 }
|
|
165
|
|
166 /* Attempt to apply interchange transformations to TRANS to maximize the
|
|
167 spatial and temporal locality of the loop.
|
|
168 Returns the new transform matrix. The smaller the reuse vector
|
|
169 distances in the inner loops, the fewer the cache misses.
|
|
170 FIRST_LOOP is the loop->num of the first loop in the analyzed loop
|
|
171 nest. */
|
|
172
|
|
173
|
|
174 static lambda_trans_matrix
|
|
175 try_interchange_loops (lambda_trans_matrix trans,
|
|
176 unsigned int depth,
|
|
177 VEC (ddr_p, heap) *dependence_relations,
|
|
178 VEC (data_reference_p, heap) *datarefs,
|
|
179 struct loop *first_loop)
|
|
180 {
|
|
181 bool res;
|
|
182 struct loop *loop_i;
|
|
183 struct loop *loop_j;
|
|
184 unsigned int dependence_steps_i, dependence_steps_j;
|
|
185 double_int access_strides_i, access_strides_j;
|
|
186 double_int small, large, nb_iter;
|
|
187 double_int l1_cache_size, l2_cache_size;
|
|
188 int cmp;
|
|
189 unsigned int nb_deps_not_carried_by_i, nb_deps_not_carried_by_j;
|
|
190 struct data_dependence_relation *ddr;
|
|
191
|
|
192 if (VEC_length (ddr_p, dependence_relations) == 0)
|
|
193 return trans;
|
|
194
|
|
195 /* When there is an unknown relation in the dependence_relations, we
|
|
196 know that it is no worth looking at this loop nest: give up. */
|
|
197 ddr = VEC_index (ddr_p, dependence_relations, 0);
|
|
198 if (ddr == NULL || DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
|
|
199 return trans;
|
|
200
|
|
201 l1_cache_size = uhwi_to_double_int (L1_CACHE_SIZE * 1024);
|
|
202 l2_cache_size = uhwi_to_double_int (L2_CACHE_SIZE * 1024);
|
|
203
|
|
204 /* LOOP_I is always the outer loop. */
|
|
205 for (loop_j = first_loop->inner;
|
|
206 loop_j;
|
|
207 loop_j = loop_j->inner)
|
|
208 for (loop_i = first_loop;
|
|
209 loop_depth (loop_i) < loop_depth (loop_j);
|
|
210 loop_i = loop_i->inner)
|
|
211 {
|
|
212 gather_interchange_stats (dependence_relations, datarefs,
|
|
213 loop_i, first_loop,
|
|
214 &dependence_steps_i,
|
|
215 &nb_deps_not_carried_by_i,
|
|
216 &access_strides_i);
|
|
217 gather_interchange_stats (dependence_relations, datarefs,
|
|
218 loop_j, first_loop,
|
|
219 &dependence_steps_j,
|
|
220 &nb_deps_not_carried_by_j,
|
|
221 &access_strides_j);
|
|
222
|
|
223 /* Heuristics for loop interchange profitability:
|
|
224
|
|
225 0. Don't transform if the smallest stride is larger than
|
|
226 the L2 cache, or if the largest stride multiplied by the
|
|
227 number of iterations is smaller than the L1 cache.
|
|
228
|
|
229 1. (spatial locality) Inner loops should have smallest
|
|
230 dependence steps.
|
|
231
|
|
232 2. (spatial locality) Inner loops should contain more
|
|
233 dependence relations not carried by the loop.
|
|
234
|
|
235 3. (temporal locality) Inner loops should have smallest
|
|
236 array access strides.
|
|
237 */
|
|
238
|
|
239 cmp = double_int_ucmp (access_strides_i, access_strides_j);
|
|
240 small = cmp < 0 ? access_strides_i : access_strides_j;
|
|
241 large = cmp < 0 ? access_strides_j : access_strides_i;
|
|
242
|
|
243 if (double_int_ucmp (small, l2_cache_size) > 0)
|
|
244 continue;
|
|
245
|
|
246 res = cmp < 0 ?
|
|
247 estimated_loop_iterations (loop_j, false, &nb_iter):
|
|
248 estimated_loop_iterations (loop_i, false, &nb_iter);
|
|
249 large = double_int_mul (large, nb_iter);
|
|
250
|
|
251 if (res && double_int_ucmp (large, l1_cache_size) < 0)
|
|
252 continue;
|
|
253
|
|
254 if (dependence_steps_i < dependence_steps_j
|
|
255 || nb_deps_not_carried_by_i > nb_deps_not_carried_by_j
|
|
256 || cmp < 0)
|
|
257 {
|
|
258 lambda_matrix_row_exchange (LTM_MATRIX (trans),
|
|
259 loop_depth (loop_i) - loop_depth (first_loop),
|
|
260 loop_depth (loop_j) - loop_depth (first_loop));
|
|
261 /* Validate the resulting matrix. When the transformation
|
|
262 is not valid, reverse to the previous transformation. */
|
|
263 if (!lambda_transform_legal_p (trans, depth, dependence_relations))
|
|
264 lambda_matrix_row_exchange (LTM_MATRIX (trans),
|
|
265 loop_depth (loop_i) - loop_depth (first_loop),
|
|
266 loop_depth (loop_j) - loop_depth (first_loop));
|
|
267 }
|
|
268 }
|
|
269
|
|
270 return trans;
|
|
271 }
|
|
272
|
|
273 /* Return the number of nested loops in LOOP_NEST, or 0 if the loops
|
|
274 are not perfectly nested. */
|
|
275
|
|
276 unsigned int
|
|
277 perfect_loop_nest_depth (struct loop *loop_nest)
|
|
278 {
|
|
279 struct loop *temp;
|
|
280 unsigned int depth = 1;
|
|
281
|
|
282 /* If it's not a loop nest, we don't want it. We also don't handle
|
|
283 sibling loops properly, which are loops of the following form:
|
|
284
|
|
285 | for (i = 0; i < 50; i++)
|
|
286 | {
|
|
287 | for (j = 0; j < 50; j++)
|
|
288 | {
|
|
289 | ...
|
|
290 | }
|
|
291 | for (j = 0; j < 50; j++)
|
|
292 | {
|
|
293 | ...
|
|
294 | }
|
|
295 | }
|
|
296 */
|
|
297
|
|
298 if (!loop_nest->inner || !single_exit (loop_nest))
|
|
299 return 0;
|
|
300
|
|
301 for (temp = loop_nest->inner; temp; temp = temp->inner)
|
|
302 {
|
|
303 /* If we have a sibling loop or multiple exit edges, jump ship. */
|
|
304 if (temp->next || !single_exit (temp))
|
|
305 return 0;
|
|
306
|
|
307 depth++;
|
|
308 }
|
|
309
|
|
310 return depth;
|
|
311 }
|
|
312
|
|
313 /* Perform a set of linear transforms on loops. */
|
|
314
|
|
315 void
|
|
316 linear_transform_loops (void)
|
|
317 {
|
|
318 bool modified = false;
|
|
319 loop_iterator li;
|
|
320 VEC(tree,heap) *oldivs = NULL;
|
|
321 VEC(tree,heap) *invariants = NULL;
|
|
322 VEC(tree,heap) *lambda_parameters = NULL;
|
|
323 VEC(gimple,heap) *remove_ivs = VEC_alloc (gimple, heap, 3);
|
|
324 struct loop *loop_nest;
|
|
325 gimple oldiv_stmt;
|
|
326 unsigned i;
|
|
327
|
|
328 FOR_EACH_LOOP (li, loop_nest, 0)
|
|
329 {
|
|
330 unsigned int depth = 0;
|
|
331 VEC (ddr_p, heap) *dependence_relations;
|
|
332 VEC (data_reference_p, heap) *datarefs;
|
|
333
|
|
334 lambda_loopnest before, after;
|
|
335 lambda_trans_matrix trans;
|
|
336 struct obstack lambda_obstack;
|
|
337 struct loop *loop;
|
|
338 VEC(loop_p,heap) *nest;
|
|
339
|
|
340 depth = perfect_loop_nest_depth (loop_nest);
|
|
341 if (depth == 0)
|
|
342 continue;
|
|
343
|
|
344 nest = VEC_alloc (loop_p, heap, 3);
|
|
345 for (loop = loop_nest; loop; loop = loop->inner)
|
|
346 VEC_safe_push (loop_p, heap, nest, loop);
|
|
347
|
|
348 gcc_obstack_init (&lambda_obstack);
|
|
349 VEC_truncate (tree, oldivs, 0);
|
|
350 VEC_truncate (tree, invariants, 0);
|
|
351 VEC_truncate (tree, lambda_parameters, 0);
|
|
352
|
|
353 datarefs = VEC_alloc (data_reference_p, heap, 10);
|
|
354 dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10);
|
|
355 if (!compute_data_dependences_for_loop (loop_nest, true, &datarefs,
|
|
356 &dependence_relations))
|
|
357 goto free_and_continue;
|
|
358
|
|
359 lambda_collect_parameters (datarefs, &lambda_parameters);
|
|
360 if (!lambda_compute_access_matrices (datarefs, lambda_parameters, nest))
|
|
361 goto free_and_continue;
|
|
362
|
|
363 if (dump_file && (dump_flags & TDF_DETAILS))
|
|
364 dump_ddrs (dump_file, dependence_relations);
|
|
365
|
|
366 /* Build the transformation matrix. */
|
|
367 trans = lambda_trans_matrix_new (depth, depth);
|
|
368 lambda_matrix_id (LTM_MATRIX (trans), depth);
|
|
369 trans = try_interchange_loops (trans, depth, dependence_relations,
|
|
370 datarefs, loop_nest);
|
|
371
|
|
372 if (lambda_trans_matrix_id_p (trans))
|
|
373 {
|
|
374 if (dump_file)
|
|
375 fprintf (dump_file, "Won't transform loop. Optimal transform is the identity transform\n");
|
|
376 goto free_and_continue;
|
|
377 }
|
|
378
|
|
379 /* Check whether the transformation is legal. */
|
|
380 if (!lambda_transform_legal_p (trans, depth, dependence_relations))
|
|
381 {
|
|
382 if (dump_file)
|
|
383 fprintf (dump_file, "Can't transform loop, transform is illegal:\n");
|
|
384 goto free_and_continue;
|
|
385 }
|
|
386
|
|
387 before = gcc_loopnest_to_lambda_loopnest (loop_nest, &oldivs,
|
|
388 &invariants, &lambda_obstack);
|
|
389
|
|
390 if (!before)
|
|
391 goto free_and_continue;
|
|
392
|
|
393 if (dump_file)
|
|
394 {
|
|
395 fprintf (dump_file, "Before:\n");
|
|
396 print_lambda_loopnest (dump_file, before, 'i');
|
|
397 }
|
|
398
|
|
399 after = lambda_loopnest_transform (before, trans, &lambda_obstack);
|
|
400
|
|
401 if (dump_file)
|
|
402 {
|
|
403 fprintf (dump_file, "After:\n");
|
|
404 print_lambda_loopnest (dump_file, after, 'u');
|
|
405 }
|
|
406
|
|
407 lambda_loopnest_to_gcc_loopnest (loop_nest, oldivs, invariants,
|
|
408 &remove_ivs,
|
|
409 after, trans, &lambda_obstack);
|
|
410 modified = true;
|
|
411
|
|
412 if (dump_file)
|
|
413 fprintf (dump_file, "Successfully transformed loop.\n");
|
|
414
|
|
415 free_and_continue:
|
|
416 obstack_free (&lambda_obstack, NULL);
|
|
417 free_dependence_relations (dependence_relations);
|
|
418 free_data_refs (datarefs);
|
|
419 VEC_free (loop_p, heap, nest);
|
|
420 }
|
|
421
|
|
422 for (i = 0; VEC_iterate (gimple, remove_ivs, i, oldiv_stmt); i++)
|
|
423 remove_iv (oldiv_stmt);
|
|
424
|
|
425 VEC_free (tree, heap, oldivs);
|
|
426 VEC_free (tree, heap, invariants);
|
|
427 VEC_free (gimple, heap, remove_ivs);
|
|
428 scev_reset ();
|
|
429
|
|
430 if (modified)
|
|
431 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa_full_phi);
|
|
432 }
|