/* Copyright (C) 2005-2020 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>


/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than zero if this thread had
   already received the last iteration.  */

int
gomp_iter_static_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

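  /* static_trip tracks this thread's progress through the work share: zero
     when the work share starts, a positive trip count once a chunk has been
     handed out, and -1 once this thread has received the chunk containing
     the final iteration.  */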
  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next;
      *pend = ws->end;
      thr->ts.static_trip = -1;
      return ws->next == ws->end;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size == 0)
    {
      unsigned long n, q, i, t;
      unsigned long s0, e0;
      long s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
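      /* For example, with ws->next == 0, ws->end == 10 and ws->incr == 3,
         s == 2 and n == (10 - 0 + 2) / 3 == 4: the iterations 0, 3, 6, 9.  */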
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;
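      /* E.g. n == 10 iterations over nthreads == 3 threads: q == 3, t == 1,
         so thread 0 gets [0, 4) and threads 1 and 2 get [4, 7) and
         [7, 10).  */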

      /* Notice when no iterations are allocated to this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      unsigned long n, s0, e0, i, c;
      long s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;
      c = ws->chunk_size;

      /* Initial guess is a C sized chunk positioned static_trip rounds of
         nthreads chunks in, offset by our thread number.  */
      s0 = (thr->ts.static_trip * nthreads + i) * c;
      e0 = s0 + c;
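      /* E.g. with c == 4, nthreads == 2 and team_id == 1, this thread takes
         zero-based iterations [4, 8) on its first trip (static_trip == 0),
         [12, 16) on its second, and so on.  */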

      /* Detect when this thread's chunks have run off the end of the
         iteration space, and trim the final chunk if it extends past it.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}
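
/* Sketch of how a caller might drive the static schedule (the production
   callers are the GFS_STATIC paths in loop.c); INCR and body () here are
   placeholders for the loop increment and loop body:

     long s, e;
     while (gomp_iter_static_next (&s, &e) == 0)
       for (long x = s; x != e; x += INCR)
         body (x);

   Iteration stops once a nonzero value is returned; a negative return means
   this thread had already received the last iteration.  */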

/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_static_next.  This function must be called with ws->lock
   held.  */

bool
gomp_iter_dynamic_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, chunk, left;

  start = ws->next;
  if (start == ws->end)
    return false;

  chunk = ws->chunk_size;
  left = ws->end - start;
  if (ws->incr < 0)
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
  end = start + chunk;
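  /* The clamp above keeps the last chunk inside the iteration space: e.g.
     start == 7, ws->end == 10, chunk == 4 and a positive incr give
     left == 3, so chunk shrinks to 3 and end == 10.  With a negative incr
     both chunk and left are negative, hence the reversed comparison.  */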

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}


#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_dynamic_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, nend, chunk, incr;

  end = ws->end;
  incr = ws->incr;
  chunk = ws->chunk_size;

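  /* Nonzero ws->mode means the loop was set up (in gomp_loop_init) so that
     ws->next can safely run past ws->end without wrapping, so a plain
     fetch-and-add is enough; otherwise fall back to the compare-and-swap
     loop below.  */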
  if (__builtin_expect (ws->mode, 1))
    {
      long tmp = __sync_fetch_and_add (&ws->next, chunk);
      if (incr > 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

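  /* Fallback: a compare-and-swap loop that never advances ws->next past
     ws->end, at the cost of a retry whenever another thread wins the
     race.  */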
  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  while (1)
    {
      long left = end - start;
      long tmp;

      if (start == end)
        return false;

      if (incr < 0)
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

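      /* Another thread advanced ws->next first; tmp holds the value it
         installed, so retry from there.  */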
      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_guided_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  unsigned long n, q;
  long start, end;

  if (ws->next == ws->end)
    return false;

  start = ws->next;
  n = (ws->end - start) / ws->incr;
  q = (n + nthreads - 1) / nthreads;
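
  /* Each grab takes roughly 1/nthreads of what remains, so chunks shrink
     geometrically: e.g. with n == 100 and nthreads == 4, q == 25; on the
     next call n == 75 and q == 19; and so on, with ws->chunk_size below
     as the floor.  */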
  if (q < ws->chunk_size)
    q = ws->chunk_size;
  if (q <= n)
    end = start + q * ws->incr;
  else
    end = ws->end;

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}

#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_guided_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  long start, end, nend, incr;
  unsigned long chunk_size;

  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  end = ws->end;
  incr = ws->incr;
  chunk_size = ws->chunk_size;

  while (1)
    {
      unsigned long n, q;
      long tmp;

      if (start == end)
        return false;

      n = (end - start) / incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */