Mercurial > hg > CbC > CbC_llvm
comparison lib/Support/regengine.inc @ 0:95c75e76d11b LLVM3.4
LLVM 3.4
author | Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp> |
---|---|
date | Thu, 12 Dec 2013 13:56:28 +0900 |
parents | |
children | 54457678186b |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:95c75e76d11b |
---|---|
1 /*- | |
2 * This code is derived from OpenBSD's libc/regex, original license follows: | |
3 * | |
4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. | |
5 * Copyright (c) 1992, 1993, 1994 | |
6 * The Regents of the University of California. All rights reserved. | |
7 * | |
8 * This code is derived from software contributed to Berkeley by | |
9 * Henry Spencer. | |
10 * | |
11 * Redistribution and use in source and binary forms, with or without | |
12 * modification, are permitted provided that the following conditions | |
13 * are met: | |
14 * 1. Redistributions of source code must retain the above copyright | |
15 * notice, this list of conditions and the following disclaimer. | |
16 * 2. Redistributions in binary form must reproduce the above copyright | |
17 * notice, this list of conditions and the following disclaimer in the | |
18 * documentation and/or other materials provided with the distribution. | |
19 * 3. Neither the name of the University nor the names of its contributors | |
20 * may be used to endorse or promote products derived from this software | |
21 * without specific prior written permission. | |
22 * | |
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
33 * SUCH DAMAGE. | |
34 * | |
35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 | |
36 */ | |
37 | |
38 /* | |
39 * The matching engine and friends. This file is #included by regexec.c | |
40 * after suitable #defines of a variety of macros used herein, so that | |
41 * different state representations can be used without duplicating masses | |
42 * of code. | |
43 */ | |
44 | |
45 #ifdef SNAMES | |
46 #define matcher smatcher | |
47 #define fast sfast | |
48 #define slow sslow | |
49 #define dissect sdissect | |
50 #define backref sbackref | |
51 #define step sstep | |
52 #define print sprint | |
53 #define at sat | |
54 #define match smat | |
55 #define nope snope | |
56 #endif | |
57 #ifdef LNAMES | |
58 #define matcher lmatcher | |
59 #define fast lfast | |
60 #define slow lslow | |
61 #define dissect ldissect | |
62 #define backref lbackref | |
63 #define step lstep | |
64 #define print lprint | |
65 #define at lat | |
66 #define match lmat | |
67 #define nope lnope | |
68 #endif | |
69 | |
70 /* another structure passed up and down to avoid zillions of parameters */ | |
71 struct match { | |
72 struct re_guts *g; | |
73 int eflags; | |
74 llvm_regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ | |
75 const char *offp; /* offsets work from here */ | |
76 const char *beginp; /* start of string -- virtual NUL precedes */ | |
77 const char *endp; /* end of string -- virtual NUL here */ | |
78 const char *coldp; /* can be no match starting before here */ | |
79 const char **lastpos; /* [nplus+1] */ | |
80 STATEVARS; | |
81 states st; /* current states */ | |
82 states fresh; /* states for a fresh start */ | |
83 states tmp; /* temporary */ | |
84 states empty; /* empty set of states */ | |
85 }; | |
86 | |
87 static int matcher(struct re_guts *, const char *, size_t, | |
88 llvm_regmatch_t[], int); | |
89 static const char *dissect(struct match *, const char *, const char *, sopno, | |
90 sopno); | |
91 static const char *backref(struct match *, const char *, const char *, sopno, | |
92 sopno, sopno, int); | |
93 static const char *fast(struct match *, const char *, const char *, sopno, sopno); | |
94 static const char *slow(struct match *, const char *, const char *, sopno, sopno); | |
95 static states step(struct re_guts *, sopno, sopno, states, int, states); | |
96 #define MAX_RECURSION 100 | |
97 #define BOL (OUT+1) | |
98 #define EOL (BOL+1) | |
99 #define BOLEOL (BOL+2) | |
100 #define NOTHING (BOL+3) | |
101 #define BOW (BOL+4) | |
102 #define EOW (BOL+5) | |
103 #define CODEMAX (BOL+5) /* highest code used */ | |
104 #define NONCHAR(c) ((c) > CHAR_MAX) | |
105 #define NNONCHAR (CODEMAX-CHAR_MAX) | |
106 #ifdef REDEBUG | |
107 static void print(struct match *, char *, states, int, FILE *); | |
108 #endif | |
109 #ifdef REDEBUG | |
110 static void at(struct match *, char *, char *, char *, sopno, sopno); | |
111 #endif | |
112 #ifdef REDEBUG | |
113 static char *pchar(int); | |
114 #endif | |
115 | |
116 #ifdef REDEBUG | |
117 #define SP(t, s, c) print(m, t, s, c, stdout) | |
118 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) | |
119 #define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); } | |
120 static int nope = 0; | |
121 #else | |
122 #define SP(t, s, c) /* nothing */ | |
123 #define AT(t, p1, p2, s1, s2) /* nothing */ | |
124 #define NOTE(s) /* nothing */ | |
125 #endif | |
126 | |
127 /* | |
128 - matcher - the actual matching engine | |
129 */ | |
130 static int /* 0 success, REG_NOMATCH failure */ | |
131 matcher(struct re_guts *g, const char *string, size_t nmatch, | |
132 llvm_regmatch_t pmatch[], | |
133 int eflags) | |
134 { | |
135 const char *endp; | |
136 size_t i; | |
137 struct match mv; | |
138 struct match *m = &mv; | |
139 const char *dp; | |
140 const sopno gf = g->firststate+1; /* +1 for OEND */ | |
141 const sopno gl = g->laststate; | |
142 const char *start; | |
143 const char *stop; | |
144 | |
145 /* simplify the situation where possible */ | |
146 if (g->cflags®_NOSUB) | |
147 nmatch = 0; | |
148 if (eflags®_STARTEND) { | |
149 start = string + pmatch[0].rm_so; | |
150 stop = string + pmatch[0].rm_eo; | |
151 } else { | |
152 start = string; | |
153 stop = start + strlen(start); | |
154 } | |
155 if (stop < start) | |
156 return(REG_INVARG); | |
157 | |
158 /* prescreening; this does wonders for this rather slow code */ | |
159 if (g->must != NULL) { | |
160 for (dp = start; dp < stop; dp++) | |
161 if (*dp == g->must[0] && stop - dp >= g->mlen && | |
162 memcmp(dp, g->must, (size_t)g->mlen) == 0) | |
163 break; | |
164 if (dp == stop) /* we didn't find g->must */ | |
165 return(REG_NOMATCH); | |
166 } | |
167 | |
168 /* match struct setup */ | |
169 m->g = g; | |
170 m->eflags = eflags; | |
171 m->pmatch = NULL; | |
172 m->lastpos = NULL; | |
173 m->offp = string; | |
174 m->beginp = start; | |
175 m->endp = stop; | |
176 STATESETUP(m, 4); | |
177 SETUP(m->st); | |
178 SETUP(m->fresh); | |
179 SETUP(m->tmp); | |
180 SETUP(m->empty); | |
181 CLEAR(m->empty); | |
182 | |
183 /* this loop does only one repetition except for backrefs */ | |
184 for (;;) { | |
185 endp = fast(m, start, stop, gf, gl); | |
186 if (endp == NULL) { /* a miss */ | |
187 free(m->pmatch); | |
188 free((void*)m->lastpos); | |
189 STATETEARDOWN(m); | |
190 return(REG_NOMATCH); | |
191 } | |
192 if (nmatch == 0 && !g->backrefs) | |
193 break; /* no further info needed */ | |
194 | |
195 /* where? */ | |
196 assert(m->coldp != NULL); | |
197 for (;;) { | |
198 NOTE("finding start"); | |
199 endp = slow(m, m->coldp, stop, gf, gl); | |
200 if (endp != NULL) | |
201 break; | |
202 assert(m->coldp < m->endp); | |
203 m->coldp++; | |
204 } | |
205 if (nmatch == 1 && !g->backrefs) | |
206 break; /* no further info needed */ | |
207 | |
208 /* oh my, he wants the subexpressions... */ | |
209 if (m->pmatch == NULL) | |
210 m->pmatch = (llvm_regmatch_t *)malloc((m->g->nsub + 1) * | |
211 sizeof(llvm_regmatch_t)); | |
212 if (m->pmatch == NULL) { | |
213 STATETEARDOWN(m); | |
214 return(REG_ESPACE); | |
215 } | |
216 for (i = 1; i <= m->g->nsub; i++) | |
217 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; | |
218 if (!g->backrefs && !(m->eflags®_BACKR)) { | |
219 NOTE("dissecting"); | |
220 dp = dissect(m, m->coldp, endp, gf, gl); | |
221 } else { | |
222 if (g->nplus > 0 && m->lastpos == NULL) | |
223 m->lastpos = (const char **)malloc((g->nplus+1) * | |
224 sizeof(char *)); | |
225 if (g->nplus > 0 && m->lastpos == NULL) { | |
226 free(m->pmatch); | |
227 STATETEARDOWN(m); | |
228 return(REG_ESPACE); | |
229 } | |
230 NOTE("backref dissect"); | |
231 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); | |
232 } | |
233 if (dp != NULL) | |
234 break; | |
235 | |
236 /* uh-oh... we couldn't find a subexpression-level match */ | |
237 assert(g->backrefs); /* must be back references doing it */ | |
238 assert(g->nplus == 0 || m->lastpos != NULL); | |
239 for (;;) { | |
240 if (dp != NULL || endp <= m->coldp) | |
241 break; /* defeat */ | |
242 NOTE("backoff"); | |
243 endp = slow(m, m->coldp, endp-1, gf, gl); | |
244 if (endp == NULL) | |
245 break; /* defeat */ | |
246 /* try it on a shorter possibility */ | |
247 #ifndef NDEBUG | |
248 for (i = 1; i <= m->g->nsub; i++) { | |
249 assert(m->pmatch[i].rm_so == -1); | |
250 assert(m->pmatch[i].rm_eo == -1); | |
251 } | |
252 #endif | |
253 NOTE("backoff dissect"); | |
254 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); | |
255 } | |
256 assert(dp == NULL || dp == endp); | |
257 if (dp != NULL) /* found a shorter one */ | |
258 break; | |
259 | |
260 /* despite initial appearances, there is no match here */ | |
261 NOTE("false alarm"); | |
262 if (m->coldp == stop) | |
263 break; | |
264 start = m->coldp + 1; /* recycle starting later */ | |
265 } | |
266 | |
267 /* fill in the details if requested */ | |
268 if (nmatch > 0) { | |
269 pmatch[0].rm_so = m->coldp - m->offp; | |
270 pmatch[0].rm_eo = endp - m->offp; | |
271 } | |
272 if (nmatch > 1) { | |
273 assert(m->pmatch != NULL); | |
274 for (i = 1; i < nmatch; i++) | |
275 if (i <= m->g->nsub) | |
276 pmatch[i] = m->pmatch[i]; | |
277 else { | |
278 pmatch[i].rm_so = -1; | |
279 pmatch[i].rm_eo = -1; | |
280 } | |
281 } | |
282 | |
283 if (m->pmatch != NULL) | |
284 free((char *)m->pmatch); | |
285 if (m->lastpos != NULL) | |
286 free((char *)m->lastpos); | |
287 STATETEARDOWN(m); | |
288 return(0); | |
289 } | |
290 | |
291 /* | |
292 - dissect - figure out what matched what, no back references | |
293 */ | |
294 static const char * /* == stop (success) always */ | |
295 dissect(struct match *m, const char *start, const char *stop, sopno startst, | |
296 sopno stopst) | |
297 { | |
298 int i; | |
299 sopno ss; /* start sop of current subRE */ | |
300 sopno es; /* end sop of current subRE */ | |
301 const char *sp; /* start of string matched by it */ | |
302 const char *stp; /* string matched by it cannot pass here */ | |
303 const char *rest; /* start of rest of string */ | |
304 const char *tail; /* string unmatched by rest of RE */ | |
305 sopno ssub; /* start sop of subsubRE */ | |
306 sopno esub; /* end sop of subsubRE */ | |
307 const char *ssp; /* start of string matched by subsubRE */ | |
308 const char *sep; /* end of string matched by subsubRE */ | |
309 const char *oldssp; /* previous ssp */ | |
310 | |
311 AT("diss", start, stop, startst, stopst); | |
312 sp = start; | |
313 for (ss = startst; ss < stopst; ss = es) { | |
314 /* identify end of subRE */ | |
315 es = ss; | |
316 switch (OP(m->g->strip[es])) { | |
317 case OPLUS_: | |
318 case OQUEST_: | |
319 es += OPND(m->g->strip[es]); | |
320 break; | |
321 case OCH_: | |
322 while (OP(m->g->strip[es]) != O_CH) | |
323 es += OPND(m->g->strip[es]); | |
324 break; | |
325 } | |
326 es++; | |
327 | |
328 /* figure out what it matched */ | |
329 switch (OP(m->g->strip[ss])) { | |
330 case OEND: | |
331 assert(nope); | |
332 break; | |
333 case OCHAR: | |
334 sp++; | |
335 break; | |
336 case OBOL: | |
337 case OEOL: | |
338 case OBOW: | |
339 case OEOW: | |
340 break; | |
341 case OANY: | |
342 case OANYOF: | |
343 sp++; | |
344 break; | |
345 case OBACK_: | |
346 case O_BACK: | |
347 assert(nope); | |
348 break; | |
349 /* cases where length of match is hard to find */ | |
350 case OQUEST_: | |
351 stp = stop; | |
352 for (;;) { | |
353 /* how long could this one be? */ | |
354 rest = slow(m, sp, stp, ss, es); | |
355 assert(rest != NULL); /* it did match */ | |
356 /* could the rest match the rest? */ | |
357 tail = slow(m, rest, stop, es, stopst); | |
358 if (tail == stop) | |
359 break; /* yes! */ | |
360 /* no -- try a shorter match for this one */ | |
361 stp = rest - 1; | |
362 assert(stp >= sp); /* it did work */ | |
363 } | |
364 ssub = ss + 1; | |
365 esub = es - 1; | |
366 /* did innards match? */ | |
367 if (slow(m, sp, rest, ssub, esub) != NULL) { | |
368 const char *dp = dissect(m, sp, rest, ssub, esub); | |
369 (void)dp; /* avoid warning if assertions off */ | |
370 assert(dp == rest); | |
371 } else /* no */ | |
372 assert(sp == rest); | |
373 sp = rest; | |
374 break; | |
375 case OPLUS_: | |
376 stp = stop; | |
377 for (;;) { | |
378 /* how long could this one be? */ | |
379 rest = slow(m, sp, stp, ss, es); | |
380 assert(rest != NULL); /* it did match */ | |
381 /* could the rest match the rest? */ | |
382 tail = slow(m, rest, stop, es, stopst); | |
383 if (tail == stop) | |
384 break; /* yes! */ | |
385 /* no -- try a shorter match for this one */ | |
386 stp = rest - 1; | |
387 assert(stp >= sp); /* it did work */ | |
388 } | |
389 ssub = ss + 1; | |
390 esub = es - 1; | |
391 ssp = sp; | |
392 oldssp = ssp; | |
393 for (;;) { /* find last match of innards */ | |
394 sep = slow(m, ssp, rest, ssub, esub); | |
395 if (sep == NULL || sep == ssp) | |
396 break; /* failed or matched null */ | |
397 oldssp = ssp; /* on to next try */ | |
398 ssp = sep; | |
399 } | |
400 if (sep == NULL) { | |
401 /* last successful match */ | |
402 sep = ssp; | |
403 ssp = oldssp; | |
404 } | |
405 assert(sep == rest); /* must exhaust substring */ | |
406 assert(slow(m, ssp, sep, ssub, esub) == rest); | |
407 { | |
408 const char *dp = dissect(m, ssp, sep, ssub, esub); | |
409 (void)dp; /* avoid warning if assertions off */ | |
410 assert(dp == sep); | |
411 } | |
412 sp = rest; | |
413 break; | |
414 case OCH_: | |
415 stp = stop; | |
416 for (;;) { | |
417 /* how long could this one be? */ | |
418 rest = slow(m, sp, stp, ss, es); | |
419 assert(rest != NULL); /* it did match */ | |
420 /* could the rest match the rest? */ | |
421 tail = slow(m, rest, stop, es, stopst); | |
422 if (tail == stop) | |
423 break; /* yes! */ | |
424 /* no -- try a shorter match for this one */ | |
425 stp = rest - 1; | |
426 assert(stp >= sp); /* it did work */ | |
427 } | |
428 ssub = ss + 1; | |
429 esub = ss + OPND(m->g->strip[ss]) - 1; | |
430 assert(OP(m->g->strip[esub]) == OOR1); | |
431 for (;;) { /* find first matching branch */ | |
432 if (slow(m, sp, rest, ssub, esub) == rest) | |
433 break; /* it matched all of it */ | |
434 /* that one missed, try next one */ | |
435 assert(OP(m->g->strip[esub]) == OOR1); | |
436 esub++; | |
437 assert(OP(m->g->strip[esub]) == OOR2); | |
438 ssub = esub + 1; | |
439 esub += OPND(m->g->strip[esub]); | |
440 if (OP(m->g->strip[esub]) == OOR2) | |
441 esub--; | |
442 else | |
443 assert(OP(m->g->strip[esub]) == O_CH); | |
444 } | |
445 { | |
446 const char *dp = dissect(m, sp, rest, ssub, esub); | |
447 (void)dp; /* avoid warning if assertions off */ | |
448 assert(dp == rest); | |
449 } | |
450 sp = rest; | |
451 break; | |
452 case O_PLUS: | |
453 case O_QUEST: | |
454 case OOR1: | |
455 case OOR2: | |
456 case O_CH: | |
457 assert(nope); | |
458 break; | |
459 case OLPAREN: | |
460 i = OPND(m->g->strip[ss]); | |
461 assert(0 < i && i <= m->g->nsub); | |
462 m->pmatch[i].rm_so = sp - m->offp; | |
463 break; | |
464 case ORPAREN: | |
465 i = OPND(m->g->strip[ss]); | |
466 assert(0 < i && i <= m->g->nsub); | |
467 m->pmatch[i].rm_eo = sp - m->offp; | |
468 break; | |
469 default: /* uh oh */ | |
470 assert(nope); | |
471 break; | |
472 } | |
473 } | |
474 | |
475 assert(sp == stop); | |
476 return(sp); | |
477 } | |
478 | |
479 /* | |
480 - backref - figure out what matched what, figuring in back references | |
481 */ | |
482 static const char * /* == stop (success) or NULL (failure) */ | |
483 backref(struct match *m, const char *start, const char *stop, sopno startst, | |
484 sopno stopst, sopno lev, int rec) /* PLUS nesting level */ | |
485 { | |
486 int i; | |
487 sopno ss; /* start sop of current subRE */ | |
488 const char *sp; /* start of string matched by it */ | |
489 sopno ssub; /* start sop of subsubRE */ | |
490 sopno esub; /* end sop of subsubRE */ | |
491 const char *ssp; /* start of string matched by subsubRE */ | |
492 const char *dp; | |
493 size_t len; | |
494 int hard; | |
495 sop s; | |
496 llvm_regoff_t offsave; | |
497 cset *cs; | |
498 | |
499 AT("back", start, stop, startst, stopst); | |
500 sp = start; | |
501 | |
502 /* get as far as we can with easy stuff */ | |
503 hard = 0; | |
504 for (ss = startst; !hard && ss < stopst; ss++) | |
505 switch (OP(s = m->g->strip[ss])) { | |
506 case OCHAR: | |
507 if (sp == stop || *sp++ != (char)OPND(s)) | |
508 return(NULL); | |
509 break; | |
510 case OANY: | |
511 if (sp == stop) | |
512 return(NULL); | |
513 sp++; | |
514 break; | |
515 case OANYOF: | |
516 cs = &m->g->sets[OPND(s)]; | |
517 if (sp == stop || !CHIN(cs, *sp++)) | |
518 return(NULL); | |
519 break; | |
520 case OBOL: | |
521 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || | |
522 (sp < m->endp && *(sp-1) == '\n' && | |
523 (m->g->cflags®_NEWLINE)) ) | |
524 { /* yes */ } | |
525 else | |
526 return(NULL); | |
527 break; | |
528 case OEOL: | |
529 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || | |
530 (sp < m->endp && *sp == '\n' && | |
531 (m->g->cflags®_NEWLINE)) ) | |
532 { /* yes */ } | |
533 else | |
534 return(NULL); | |
535 break; | |
536 case OBOW: | |
537 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || | |
538 (sp < m->endp && *(sp-1) == '\n' && | |
539 (m->g->cflags®_NEWLINE)) || | |
540 (sp > m->beginp && | |
541 !ISWORD(*(sp-1))) ) && | |
542 (sp < m->endp && ISWORD(*sp)) ) | |
543 { /* yes */ } | |
544 else | |
545 return(NULL); | |
546 break; | |
547 case OEOW: | |
548 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || | |
549 (sp < m->endp && *sp == '\n' && | |
550 (m->g->cflags®_NEWLINE)) || | |
551 (sp < m->endp && !ISWORD(*sp)) ) && | |
552 (sp > m->beginp && ISWORD(*(sp-1))) ) | |
553 { /* yes */ } | |
554 else | |
555 return(NULL); | |
556 break; | |
557 case O_QUEST: | |
558 break; | |
559 case OOR1: /* matches null but needs to skip */ | |
560 ss++; | |
561 s = m->g->strip[ss]; | |
562 do { | |
563 assert(OP(s) == OOR2); | |
564 ss += OPND(s); | |
565 } while (OP(s = m->g->strip[ss]) != O_CH); | |
566 /* note that the ss++ gets us past the O_CH */ | |
567 break; | |
568 default: /* have to make a choice */ | |
569 hard = 1; | |
570 break; | |
571 } | |
572 if (!hard) { /* that was it! */ | |
573 if (sp != stop) | |
574 return(NULL); | |
575 return(sp); | |
576 } | |
577 ss--; /* adjust for the for's final increment */ | |
578 | |
579 /* the hard stuff */ | |
580 AT("hard", sp, stop, ss, stopst); | |
581 s = m->g->strip[ss]; | |
582 switch (OP(s)) { | |
583 case OBACK_: /* the vilest depths */ | |
584 i = OPND(s); | |
585 assert(0 < i && i <= m->g->nsub); | |
586 if (m->pmatch[i].rm_eo == -1) | |
587 return(NULL); | |
588 assert(m->pmatch[i].rm_so != -1); | |
589 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; | |
590 if (len == 0 && rec++ > MAX_RECURSION) | |
591 return(NULL); | |
592 assert(stop - m->beginp >= len); | |
593 if (sp > stop - len) | |
594 return(NULL); /* not enough left to match */ | |
595 ssp = m->offp + m->pmatch[i].rm_so; | |
596 if (memcmp(sp, ssp, len) != 0) | |
597 return(NULL); | |
598 while (m->g->strip[ss] != SOP(O_BACK, i)) | |
599 ss++; | |
600 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); | |
601 break; | |
602 case OQUEST_: /* to null or not */ | |
603 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); | |
604 if (dp != NULL) | |
605 return(dp); /* not */ | |
606 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); | |
607 break; | |
608 case OPLUS_: | |
609 assert(m->lastpos != NULL); | |
610 assert(lev+1 <= m->g->nplus); | |
611 m->lastpos[lev+1] = sp; | |
612 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); | |
613 break; | |
614 case O_PLUS: | |
615 if (sp == m->lastpos[lev]) /* last pass matched null */ | |
616 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); | |
617 /* try another pass */ | |
618 m->lastpos[lev] = sp; | |
619 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); | |
620 if (dp == NULL) | |
621 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); | |
622 else | |
623 return(dp); | |
624 break; | |
625 case OCH_: /* find the right one, if any */ | |
626 ssub = ss + 1; | |
627 esub = ss + OPND(s) - 1; | |
628 assert(OP(m->g->strip[esub]) == OOR1); | |
629 for (;;) { /* find first matching branch */ | |
630 dp = backref(m, sp, stop, ssub, esub, lev, rec); | |
631 if (dp != NULL) | |
632 return(dp); | |
633 /* that one missed, try next one */ | |
634 if (OP(m->g->strip[esub]) == O_CH) | |
635 return(NULL); /* there is none */ | |
636 esub++; | |
637 assert(OP(m->g->strip[esub]) == OOR2); | |
638 ssub = esub + 1; | |
639 esub += OPND(m->g->strip[esub]); | |
640 if (OP(m->g->strip[esub]) == OOR2) | |
641 esub--; | |
642 else | |
643 assert(OP(m->g->strip[esub]) == O_CH); | |
644 } | |
645 break; | |
646 case OLPAREN: /* must undo assignment if rest fails */ | |
647 i = OPND(s); | |
648 assert(0 < i && i <= m->g->nsub); | |
649 offsave = m->pmatch[i].rm_so; | |
650 m->pmatch[i].rm_so = sp - m->offp; | |
651 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); | |
652 if (dp != NULL) | |
653 return(dp); | |
654 m->pmatch[i].rm_so = offsave; | |
655 return(NULL); | |
656 break; | |
657 case ORPAREN: /* must undo assignment if rest fails */ | |
658 i = OPND(s); | |
659 assert(0 < i && i <= m->g->nsub); | |
660 offsave = m->pmatch[i].rm_eo; | |
661 m->pmatch[i].rm_eo = sp - m->offp; | |
662 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); | |
663 if (dp != NULL) | |
664 return(dp); | |
665 m->pmatch[i].rm_eo = offsave; | |
666 return(NULL); | |
667 break; | |
668 default: /* uh oh */ | |
669 assert(nope); | |
670 break; | |
671 } | |
672 | |
673 /* "can't happen" */ | |
674 assert(nope); | |
675 /* NOTREACHED */ | |
676 return NULL; | |
677 } | |
678 | |
679 /* | |
680 - fast - step through the string at top speed | |
681 */ | |
682 static const char * /* where tentative match ended, or NULL */ | |
683 fast(struct match *m, const char *start, const char *stop, sopno startst, | |
684 sopno stopst) | |
685 { | |
686 states st = m->st; | |
687 states fresh = m->fresh; | |
688 states tmp = m->tmp; | |
689 const char *p = start; | |
690 int c = (start == m->beginp) ? OUT : *(start-1); | |
691 int lastc; /* previous c */ | |
692 int flagch; | |
693 int i; | |
694 const char *coldp; /* last p after which no match was underway */ | |
695 | |
696 CLEAR(st); | |
697 SET1(st, startst); | |
698 st = step(m->g, startst, stopst, st, NOTHING, st); | |
699 ASSIGN(fresh, st); | |
700 SP("start", st, *p); | |
701 coldp = NULL; | |
702 for (;;) { | |
703 /* next character */ | |
704 lastc = c; | |
705 c = (p == m->endp) ? OUT : *p; | |
706 if (EQ(st, fresh)) | |
707 coldp = p; | |
708 | |
709 /* is there an EOL and/or BOL between lastc and c? */ | |
710 flagch = '\0'; | |
711 i = 0; | |
712 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || | |
713 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { | |
714 flagch = BOL; | |
715 i = m->g->nbol; | |
716 } | |
717 if ( (c == '\n' && m->g->cflags®_NEWLINE) || | |
718 (c == OUT && !(m->eflags®_NOTEOL)) ) { | |
719 flagch = (flagch == BOL) ? BOLEOL : EOL; | |
720 i += m->g->neol; | |
721 } | |
722 if (i != 0) { | |
723 for (; i > 0; i--) | |
724 st = step(m->g, startst, stopst, st, flagch, st); | |
725 SP("boleol", st, c); | |
726 } | |
727 | |
728 /* how about a word boundary? */ | |
729 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && | |
730 (c != OUT && ISWORD(c)) ) { | |
731 flagch = BOW; | |
732 } | |
733 if ( (lastc != OUT && ISWORD(lastc)) && | |
734 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { | |
735 flagch = EOW; | |
736 } | |
737 if (flagch == BOW || flagch == EOW) { | |
738 st = step(m->g, startst, stopst, st, flagch, st); | |
739 SP("boweow", st, c); | |
740 } | |
741 | |
742 /* are we done? */ | |
743 if (ISSET(st, stopst) || p == stop) | |
744 break; /* NOTE BREAK OUT */ | |
745 | |
746 /* no, we must deal with this character */ | |
747 ASSIGN(tmp, st); | |
748 ASSIGN(st, fresh); | |
749 assert(c != OUT); | |
750 st = step(m->g, startst, stopst, tmp, c, st); | |
751 SP("aft", st, c); | |
752 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); | |
753 p++; | |
754 } | |
755 | |
756 assert(coldp != NULL); | |
757 m->coldp = coldp; | |
758 if (ISSET(st, stopst)) | |
759 return(p+1); | |
760 else | |
761 return(NULL); | |
762 } | |
763 | |
764 /* | |
765 - slow - step through the string more deliberately | |
766 */ | |
767 static const char * /* where it ended */ | |
768 slow(struct match *m, const char *start, const char *stop, sopno startst, | |
769 sopno stopst) | |
770 { | |
771 states st = m->st; | |
772 states empty = m->empty; | |
773 states tmp = m->tmp; | |
774 const char *p = start; | |
775 int c = (start == m->beginp) ? OUT : *(start-1); | |
776 int lastc; /* previous c */ | |
777 int flagch; | |
778 int i; | |
779 const char *matchp; /* last p at which a match ended */ | |
780 | |
781 AT("slow", start, stop, startst, stopst); | |
782 CLEAR(st); | |
783 SET1(st, startst); | |
784 SP("sstart", st, *p); | |
785 st = step(m->g, startst, stopst, st, NOTHING, st); | |
786 matchp = NULL; | |
787 for (;;) { | |
788 /* next character */ | |
789 lastc = c; | |
790 c = (p == m->endp) ? OUT : *p; | |
791 | |
792 /* is there an EOL and/or BOL between lastc and c? */ | |
793 flagch = '\0'; | |
794 i = 0; | |
795 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || | |
796 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { | |
797 flagch = BOL; | |
798 i = m->g->nbol; | |
799 } | |
800 if ( (c == '\n' && m->g->cflags®_NEWLINE) || | |
801 (c == OUT && !(m->eflags®_NOTEOL)) ) { | |
802 flagch = (flagch == BOL) ? BOLEOL : EOL; | |
803 i += m->g->neol; | |
804 } | |
805 if (i != 0) { | |
806 for (; i > 0; i--) | |
807 st = step(m->g, startst, stopst, st, flagch, st); | |
808 SP("sboleol", st, c); | |
809 } | |
810 | |
811 /* how about a word boundary? */ | |
812 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && | |
813 (c != OUT && ISWORD(c)) ) { | |
814 flagch = BOW; | |
815 } | |
816 if ( (lastc != OUT && ISWORD(lastc)) && | |
817 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { | |
818 flagch = EOW; | |
819 } | |
820 if (flagch == BOW || flagch == EOW) { | |
821 st = step(m->g, startst, stopst, st, flagch, st); | |
822 SP("sboweow", st, c); | |
823 } | |
824 | |
825 /* are we done? */ | |
826 if (ISSET(st, stopst)) | |
827 matchp = p; | |
828 if (EQ(st, empty) || p == stop) | |
829 break; /* NOTE BREAK OUT */ | |
830 | |
831 /* no, we must deal with this character */ | |
832 ASSIGN(tmp, st); | |
833 ASSIGN(st, empty); | |
834 assert(c != OUT); | |
835 st = step(m->g, startst, stopst, tmp, c, st); | |
836 SP("saft", st, c); | |
837 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); | |
838 p++; | |
839 } | |
840 | |
841 return(matchp); | |
842 } | |
843 | |
844 | |
845 /* | |
846 - step - map set of states reachable before char to set reachable after | |
847 */ | |
848 static states | |
849 step(struct re_guts *g, | |
850 sopno start, /* start state within strip */ | |
851 sopno stop, /* state after stop state within strip */ | |
852 states bef, /* states reachable before */ | |
853 int ch, /* character or NONCHAR code */ | |
854 states aft) /* states already known reachable after */ | |
855 { | |
856 cset *cs; | |
857 sop s; | |
858 sopno pc; | |
859 onestate here; /* note, macros know this name */ | |
860 sopno look; | |
861 int i; | |
862 | |
863 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { | |
864 s = g->strip[pc]; | |
865 switch (OP(s)) { | |
866 case OEND: | |
867 assert(pc == stop-1); | |
868 break; | |
869 case OCHAR: | |
870 /* only characters can match */ | |
871 assert(!NONCHAR(ch) || ch != (char)OPND(s)); | |
872 if (ch == (char)OPND(s)) | |
873 FWD(aft, bef, 1); | |
874 break; | |
875 case OBOL: | |
876 if (ch == BOL || ch == BOLEOL) | |
877 FWD(aft, bef, 1); | |
878 break; | |
879 case OEOL: | |
880 if (ch == EOL || ch == BOLEOL) | |
881 FWD(aft, bef, 1); | |
882 break; | |
883 case OBOW: | |
884 if (ch == BOW) | |
885 FWD(aft, bef, 1); | |
886 break; | |
887 case OEOW: | |
888 if (ch == EOW) | |
889 FWD(aft, bef, 1); | |
890 break; | |
891 case OANY: | |
892 if (!NONCHAR(ch)) | |
893 FWD(aft, bef, 1); | |
894 break; | |
895 case OANYOF: | |
896 cs = &g->sets[OPND(s)]; | |
897 if (!NONCHAR(ch) && CHIN(cs, ch)) | |
898 FWD(aft, bef, 1); | |
899 break; | |
900 case OBACK_: /* ignored here */ | |
901 case O_BACK: | |
902 FWD(aft, aft, 1); | |
903 break; | |
904 case OPLUS_: /* forward, this is just an empty */ | |
905 FWD(aft, aft, 1); | |
906 break; | |
907 case O_PLUS: /* both forward and back */ | |
908 FWD(aft, aft, 1); | |
909 i = ISSETBACK(aft, OPND(s)); | |
910 BACK(aft, aft, OPND(s)); | |
911 if (!i && ISSETBACK(aft, OPND(s))) { | |
912 /* oho, must reconsider loop body */ | |
913 pc -= OPND(s) + 1; | |
914 INIT(here, pc); | |
915 } | |
916 break; | |
917 case OQUEST_: /* two branches, both forward */ | |
918 FWD(aft, aft, 1); | |
919 FWD(aft, aft, OPND(s)); | |
920 break; | |
921 case O_QUEST: /* just an empty */ | |
922 FWD(aft, aft, 1); | |
923 break; | |
924 case OLPAREN: /* not significant here */ | |
925 case ORPAREN: | |
926 FWD(aft, aft, 1); | |
927 break; | |
928 case OCH_: /* mark the first two branches */ | |
929 FWD(aft, aft, 1); | |
930 assert(OP(g->strip[pc+OPND(s)]) == OOR2); | |
931 FWD(aft, aft, OPND(s)); | |
932 break; | |
933 case OOR1: /* done a branch, find the O_CH */ | |
934 if (ISSTATEIN(aft, here)) { | |
935 for (look = 1; | |
936 OP(s = g->strip[pc+look]) != O_CH; | |
937 look += OPND(s)) | |
938 assert(OP(s) == OOR2); | |
939 FWD(aft, aft, look); | |
940 } | |
941 break; | |
942 case OOR2: /* propagate OCH_'s marking */ | |
943 FWD(aft, aft, 1); | |
944 if (OP(g->strip[pc+OPND(s)]) != O_CH) { | |
945 assert(OP(g->strip[pc+OPND(s)]) == OOR2); | |
946 FWD(aft, aft, OPND(s)); | |
947 } | |
948 break; | |
949 case O_CH: /* just empty */ | |
950 FWD(aft, aft, 1); | |
951 break; | |
952 default: /* ooooops... */ | |
953 assert(nope); | |
954 break; | |
955 } | |
956 } | |
957 | |
958 return(aft); | |
959 } | |
960 | |
961 #ifdef REDEBUG | |
962 /* | |
963 - print - print a set of states | |
964 */ | |
965 static void | |
966 print(struct match *m, char *caption, states st, int ch, FILE *d) | |
967 { | |
968 struct re_guts *g = m->g; | |
969 int i; | |
970 int first = 1; | |
971 | |
972 if (!(m->eflags®_TRACE)) | |
973 return; | |
974 | |
975 (void)fprintf(d, "%s", caption); | |
976 if (ch != '\0') | |
977 (void)fprintf(d, " %s", pchar(ch)); | |
978 for (i = 0; i < g->nstates; i++) | |
979 if (ISSET(st, i)) { | |
980 (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i); | |
981 first = 0; | |
982 } | |
983 (void)fprintf(d, "\n"); | |
984 } | |
985 | |
986 /* | |
987 - at - print current situation | |
988 */ | |
989 static void | |
990 at(struct match *m, char *title, char *start, char *stop, sopno startst, | |
991 sopno stopst) | |
992 { | |
993 if (!(m->eflags®_TRACE)) | |
994 return; | |
995 | |
996 (void)printf("%s %s-", title, pchar(*start)); | |
997 (void)printf("%s ", pchar(*stop)); | |
998 (void)printf("%ld-%ld\n", (long)startst, (long)stopst); | |
999 } | |
1000 | |
1001 #ifndef PCHARDONE | |
1002 #define PCHARDONE /* never again */ | |
1003 /* | |
1004 - pchar - make a character printable | |
1005 * | |
1006 * Is this identical to regchar() over in debug.c? Well, yes. But a | |
1007 * duplicate here avoids having a debugging-capable regexec.o tied to | |
1008 * a matching debug.o, and this is convenient. It all disappears in | |
1009 * the non-debug compilation anyway, so it doesn't matter much. | |
1010 */ | |
1011 static char * /* -> representation */ | |
1012 pchar(int ch) | |
1013 { | |
1014 static char pbuf[10]; | |
1015 | |
1016 if (isprint(ch) || ch == ' ') | |
1017 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); | |
1018 else | |
1019 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); | |
1020 return(pbuf); | |
1021 } | |
1022 #endif | |
1023 #endif | |
1024 | |
1025 #undef matcher | |
1026 #undef fast | |
1027 #undef slow | |
1028 #undef dissect | |
1029 #undef backref | |
1030 #undef step | |
1031 #undef print | |
1032 #undef at | |
1033 #undef match | |
1034 #undef nope |