Mercurial > hg > Members > nobuyasu > test
comparison regen/src/parse.c @ 11:1e0cd7fade8b
add regen
author | nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Sun, 19 Jun 2011 16:36:05 +0900 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
10:41634f26cd6f | 11:1e0cd7fade8b |
---|---|
1 #include "re.h" | |
2 #include "util.h" | |
3 #include "parse.h" | |
4 #include "codegen.h" | |
5 | |
6 /* | |
7 Grammar: | |
8 | |
9 EOP: eall: { e0 } EOP | |
10 Union: e0: e1 { '|' e1 }* | |
11 Concatination: e1: e2 { e2 }* | |
12 Repetition: e2: e3 { '+' | '*' | '?' } | |
13 Literal: e3: Literal | '.' | '$' | '[]' | '^' | '$' | Backref | '(' e0 ')' | |
14 */ | |
15 | |
16 static EXPR_TYPE Toktype; | |
17 static UCHARP Toklit, Ptr, End; | |
18 static REGEX *R; | |
19 EXPR *test; | |
20 | |
21 void | |
22 lex() | |
23 { | |
24 if (Ptr == End) { | |
25 Toktype = EOP; | |
26 Toklit = 0; | |
27 } else switch (Toklit = Ptr++, *Toklit) { | |
28 case '.': Toktype = Dot; break; | |
29 case '[': Toktype = Charclass; break; | |
30 case '|': Toktype = Union; break; | |
31 case '?': Toktype = Qmark; break; | |
32 case '+': Toktype = Plus; break; | |
33 case '*': Toktype = Star; break; | |
34 case '(': Toktype = Lpar; break; | |
35 case ')': Toktype = Rpar; break; | |
36 case '^': Toktype = BegLine; break; | |
37 case '$': Toktype = EndLine; break; | |
38 case '\\': Toktype = Literal; | |
39 if (Ptr == End) exitmsg("bad \\"); | |
40 Toklit = Ptr++; | |
41 break; | |
42 default: | |
43 Toktype = Literal; | |
44 } | |
45 } | |
46 | |
47 void | |
48 parse(REGEX *r, CODEGEN *g) | |
49 { | |
50 EXPR *e, *dotstar; | |
51 R = r; | |
52 Ptr = (UCHARP)r->regex; | |
53 End = (UCHARP)r->regex_end; | |
54 lex(); | |
55 | |
56 e = e0(r); | |
57 if (Toktype != EOP) exitmsg("expected end of pattern."); | |
58 | |
59 r->min_length = e->min_length; | |
60 r->max_length = e->max_length; | |
61 | |
62 fill_words(e); | |
63 | |
64 r->must = e->must; | |
65 | |
66 int i; | |
67 for (i = 0; i < r->must.in->size; i++) { | |
68 if (strlen(r->must.in->words[i]) > r->must_max_length) { | |
69 r->must_max_length = strlen(r->must.in->words[i]); | |
70 r->must_max_word = r->must.in->words[i]; | |
71 } | |
72 } | |
73 for (i = 0; i < NCHAR; i++) { | |
74 if (r->involve[i]) r->ninvolve++; | |
75 } | |
76 | |
77 if (r->anchored == FALSE) { | |
78 dotstar = newexpr(r, Dot, '*', (EXPR *)0, (EXPR *)0); | |
79 dotstar = newexpr(r, Star, '\0', dotstar, (EXPR *)0); | |
80 e = newexpr(r, Concat, '\0', dotstar, e); | |
81 } | |
82 | |
83 e = newexpr(r, EOP, '#', e, (EXPR *)0); | |
84 | |
85 fill_follow(e); | |
86 | |
87 r->root = e; | |
88 | |
89 if (r->debug) { | |
90 printf("regex: %s\n\n", r->regex); | |
91 printf("min-length: %d\n", r->min_length); | |
92 printf("max-length: %d\n", r->max_length); | |
93 printf("ninvolve: %d\n\n", r->ninvolve); | |
94 for (i = 0; i < NCHAR; i++) { | |
95 if (r->involve[i]) { | |
96 printf("\"%s\", ", ASCII[i]); | |
97 } | |
98 } | |
99 puts("\n"); | |
100 if (r->must.is != NULL) { | |
101 printf("is: %s\n", r->must.is); | |
102 } | |
103 if (r->must.in != NULL) { | |
104 for (i = 0; i < r->must.in->size; i++) { | |
105 printf("in: %s\n", r->must.in->words[i]); | |
106 } | |
107 } | |
108 if (r->must.left != NULL) { | |
109 printf("left: %s\n", r->must.left); | |
110 } | |
111 if (r->must.right != NULL) { | |
112 printf("right: %s\n", r->must.right); | |
113 } | |
114 for (i = 0; i < r->maxid; i++) { | |
115 print_pos(r->lit_expr[i]); | |
116 } | |
117 exit(0); | |
118 } | |
119 } | |
120 | |
121 EXPR * | |
122 e0(REGEX *r) | |
123 { | |
124 EXPR *e, *f; | |
125 | |
126 e = e1(r); | |
127 while (Toktype == Union) { | |
128 lex(); | |
129 f = e1(r); | |
130 e = newexpr(r, Union, '\0', e, f); | |
131 } | |
132 | |
133 return e; | |
134 } | |
135 | |
136 EXPR * | |
137 e1(REGEX *r) | |
138 { | |
139 EXPR *e, *f; | |
140 | |
141 e = e2(r); | |
142 | |
143 while (Toktype == Literal || Toktype == Charclass | |
144 || Toktype == Dot || Toktype == Lpar | |
145 || Toktype == EndLine || Toktype == BegLine) { | |
146 f = e2(r); | |
147 e = newexpr(r, Concat, '\0', e, f); | |
148 } | |
149 return(e); | |
150 } | |
151 | |
152 EXPR * | |
153 e2(REGEX *r) | |
154 { | |
155 EXPR *e; | |
156 e = e3(r); | |
157 for ( ; PlusStarQmark(Toktype); lex()) { | |
158 e = newexpr(r, Toktype, '\0', e, (EXPR *)0); | |
159 } | |
160 return(e); | |
161 } | |
162 | |
163 EXPR * | |
164 e3(REGEX *r) | |
165 { | |
166 EXPR *e; | |
167 UCHAR *tbl; | |
168 int count; | |
169 | |
170 switch(Toktype) { | |
171 case Literal: | |
172 e = newexpr(r, Literal, *Toklit, (EXPR *)0, (EXPR *)0); | |
173 break; | |
174 case BegLine: | |
175 e = newexpr(r, BegLine, '^', (EXPR *)0, (EXPR *)0); | |
176 break; | |
177 case EndLine: | |
178 e = newexpr(r, EndLine, '$', (EXPR *)0, (EXPR *)0); | |
179 break; | |
180 case Dot: | |
181 e = newexpr(r, Dot, '.', (EXPR *)0, (EXPR *)0); | |
182 break; | |
183 case Charclass: | |
184 tbl = (UCHARP)xmalloc(NCHAR, "charclass tbl"); | |
185 count = ccl(tbl); | |
186 if (count == 1) { | |
187 e = newexpr(r, Literal, *tbl, (EXPR *)0, (EXPR *)0); | |
188 xfree(tbl); | |
189 } else { | |
190 e = newexpr(r, Charclass, '[', (EXPR *)count, (EXPR *)tbl); | |
191 } | |
192 break; | |
193 case Lpar: | |
194 lex(r); | |
195 e = e0(r); | |
196 if (Toktype != Rpar) exitmsg("expected a ')'"); | |
197 break; | |
198 default: | |
199 exitmsg("expected a lit or '('"); | |
200 break; | |
201 } | |
202 lex(); | |
203 return e; | |
204 } | |
205 | |
206 int | |
207 ccl(UCHAR *tbl) { | |
208 int i; | |
209 UCHAR lastc = 0;; | |
210 int range = 0; | |
211 int count = 0; | |
212 UCHAR comp; | |
213 | |
214 tbl[NCHAR] = 0; // count of characters in chaclass. | |
215 memset((char *)tbl, 0, NCHAR); | |
216 | |
217 if (*Ptr == '^') { | |
218 Ptr++; | |
219 for (i = 32; i < 126; i ++) { | |
220 tbl[i] = 1; | |
221 } | |
222 comp = 0; | |
223 } else { | |
224 comp = 1; | |
225 } | |
226 | |
227 if (*Ptr == ']' || *Ptr == '-') { | |
228 tbl[*Ptr] = comp; | |
229 lastc = *Ptr++; | |
230 } | |
231 | |
232 for (; (Ptr < End) && (*Ptr != ']'); Ptr++) { | |
233 if (!range && *Ptr == '-') { | |
234 range = 1; | |
235 continue; | |
236 } | |
237 if (*Ptr == '\\') { | |
238 if (++Ptr >= End) { | |
239 exitmsg(" [ ] imbalance"); | |
240 } | |
241 } | |
242 | |
243 tbl[*Ptr] = comp; | |
244 | |
245 if (range) { | |
246 for (i = (*Ptr) - 1; i > lastc; i--) { | |
247 tbl[i] = comp; | |
248 } | |
249 range = 0; | |
250 } | |
251 lastc = *Ptr; | |
252 } | |
253 if (Ptr == End) exitmsg(" [ ] imbalance"); | |
254 | |
255 if (range) { | |
256 tbl['-'] = comp; | |
257 range = 0; | |
258 } | |
259 | |
260 Ptr++; | |
261 | |
262 for (i = 0; i < NCHAR; i++) { | |
263 if (tbl[i]) { | |
264 lastc = i; | |
265 count++; | |
266 } | |
267 } | |
268 if (count == 1) *tbl = (char)lastc; | |
269 | |
270 /* Return the character count. */ | |
271 return count; | |
272 } | |
273 | |
274 EXPR * | |
275 newexpr(REGEX *r, EXPR_TYPE t, UCHAR lit, EXPR* left, EXPR* right) | |
276 { | |
277 int i; | |
278 EXPR *e = (EXPR *)xmalloc(sizeof(EXPR), "newexpr"); | |
279 e->type = t; | |
280 e->lit = lit; | |
281 e->l = left; | |
282 e->r = right; | |
283 e->first_pos = newpos(); | |
284 e->last_pos = newpos(); | |
285 e->follow_pos = newpos(); | |
286 e->before_pos = newpos(); | |
287 e->parent = 0; | |
288 | |
289 switch (t) { | |
290 case EOP: | |
291 e->id = 0; | |
292 r->lit_expr[e->id] = e; | |
293 break; | |
294 case Dot: case Charclass: case Literal: case BegLine: case EndLine: | |
295 e->id = r->maxid++; | |
296 r->lit_expr[e->id] = e; | |
297 break; | |
298 default: | |
299 e->id = -1; | |
300 break; | |
301 } | |
302 | |
303 switch (t) { | |
304 case Dot: | |
305 if (e->lit != '*') { | |
306 for (i = 0; i < NCHAR; i++) { | |
307 r->involve[i] = 1; | |
308 } | |
309 } | |
310 break; | |
311 case Charclass: | |
312 for (i = 0; i < NCHAR; i++) { | |
313 if (e->tbl[i]) { | |
314 r->involve[i] = 1; | |
315 } | |
316 } | |
317 break; | |
318 case BegLine: case EndLine: | |
319 r->involve[NL] = 1; | |
320 break; | |
321 case Literal: | |
322 r->involve[e->lit] = 1; | |
323 break; | |
324 default: | |
325 break; | |
326 } | |
327 | |
328 switch (t) { | |
329 case Union: case Concat: | |
330 left->parent = e; | |
331 right->parent = e; | |
332 break; | |
333 case EOP: case Plus: case Star: case Qmark: | |
334 left->parent = e; | |
335 break; | |
336 default: | |
337 break; | |
338 } | |
339 | |
340 rept_compress(e, e->l); /* optimize */ | |
341 fill_expr(e); | |
342 return e; | |
343 } | |
344 | |
345 void | |
346 fill_expr(e) | |
347 EXPR *e; | |
348 { | |
349 short l, r; | |
350 | |
351 switch(e->type) { | |
352 case EOP: | |
353 e->max_length = e->l->max_length; | |
354 e->min_length = e->l->min_length; | |
355 break; | |
356 case Plus: | |
357 e->max_length = -1; | |
358 e->min_length = e->l->min_length; | |
359 break; | |
360 case Star: | |
361 e->max_length = -1; | |
362 e->min_length = 0; | |
363 break; | |
364 case Qmark: | |
365 e->max_length = e->l->max_length; | |
366 e->min_length = 0; | |
367 break; | |
368 case Union: | |
369 l = e->l->max_length; r = e->r->max_length; | |
370 e->max_length = (INFOR(l, r) ? -1 : MAX(l, r)); | |
371 l = e->l->min_length; r = e->r->min_length; | |
372 e->min_length = MIN(e->l->min_length, e->r->min_length); | |
373 break; | |
374 case Concat: | |
375 l = e->l->max_length; r = e->r->max_length; | |
376 e->max_length = (INFOR(l, r) ? -1 : l + r); | |
377 l = e->l->min_length; r = e->r->min_length; | |
378 e->min_length = l + r; | |
379 break; | |
380 default: | |
381 e->min_length = e->max_length = 1; | |
382 } | |
383 | |
384 switch (e->type) { | |
385 case Star: case Qmark: case Plus: | |
386 copypos(e->l->first_pos, e->first_pos); | |
387 copypos(e->l->last_pos, e->last_pos); | |
388 break; | |
389 case Union: | |
390 copypos(e->r->first_pos, e->first_pos); | |
391 copypos(e->l->first_pos, e->first_pos); | |
392 copypos(e->r->last_pos, e->last_pos); | |
393 copypos(e->l->last_pos, e->last_pos); | |
394 break; | |
395 case Concat: | |
396 if (e->l->min_length == 0) { | |
397 copypos(e->l->first_pos, e->first_pos); | |
398 copypos(e->r->first_pos, e->first_pos); | |
399 } else { | |
400 copypos(e->l->first_pos, e->first_pos); | |
401 } | |
402 if (e->r->min_length == 0) { | |
403 copypos(e->l->last_pos, e->last_pos); | |
404 copypos(e->r->last_pos, e->last_pos); | |
405 } else { | |
406 copypos(e->r->last_pos, e->last_pos); | |
407 } | |
408 break; | |
409 case BegLine: case EndLine: case Literal: case Dot: case Charclass: case EOP: | |
410 addpos(e->first_pos, e->id); | |
411 addpos(e->last_pos, e->id); | |
412 default: | |
413 break; | |
414 } | |
415 } | |
416 | |
417 /* singleton */ | |
418 static const WORD_SET NOWORDS = {NULL, 0}; | |
419 static const MUST NOMUST = { | |
420 (WORD_SET *)&NOWORDS, NULL, NULL, NULL | |
421 }; | |
422 | |
423 void fill_words(EXPR *e) { | |
424 int l1, l2, i, j; | |
425 char *str; | |
426 | |
427 switch (e->type) { | |
428 case EOP: | |
429 fill_words(e->l); | |
430 e->must = e->l->must; | |
431 break; | |
432 case Plus: | |
433 fill_words(e->l); | |
434 e->must = e->l->must; | |
435 e->must.is = NULL; | |
436 break; | |
437 case Union: | |
438 fill_words(e->l); | |
439 fill_words(e->r); | |
440 /* fill must.is */ | |
441 if (e->l->must.is && e->r->must.is | |
442 && !strcmp(e->l->must.is, e->r->must.is)) { | |
443 e->must.is = e->must.left = e->must.right = e->l->must.is; | |
444 e->must.in = word_set_new(1); | |
445 e->must.in->words[0] = e->must.is; | |
446 return; | |
447 } else { | |
448 e->must.is = NULL; | |
449 } | |
450 | |
451 /* fill must.left */ | |
452 if (e->l->must.left && e->r->must.left) { | |
453 l1 = strlen(e->l->must.left); | |
454 l2 = strlen(e->r->must.left); | |
455 for (i = 0; i < l1 && i < l2; i++) { | |
456 if (e->l->must.left[i] != e->r->must.left[i]) { | |
457 break; | |
458 } | |
459 } | |
460 if (i) { | |
461 str = (char *)xmalloc(sizeof(char) * (i+1), "must: union-left"); | |
462 for (j = 0; j < i; j++) str[j] = e->l->must.left[j]; | |
463 str[j] = '\0'; | |
464 e->must.left = str; | |
465 } else { | |
466 /* no common words */ | |
467 e->must.left = NULL; | |
468 } | |
469 } else { | |
470 e->must.left = NULL; | |
471 } | |
472 | |
473 /* fill must.right */ | |
474 if (e->l->must.right && e->r->must.right) { | |
475 l1 = strlen(e->l->must.right); | |
476 l2 = strlen(e->r->must.right); | |
477 for (i = 0; i < l1 && i < l2; i++) { | |
478 if (e->l->must.right[l1-i-1] != e->r->must.right[l2-i-1]) { | |
479 break; | |
480 } | |
481 } | |
482 if (i) { | |
483 str = (char *)xmalloc(sizeof(char) * (i+1), "must: union-right"); | |
484 for (j = 0; j < i; j++) str[i-j-1] = e->l->must.right[l1-j-1]; | |
485 str[j] = '\0'; | |
486 e->must.right = str; | |
487 } else { | |
488 /* no common words */ | |
489 e->must.right = NULL; | |
490 } | |
491 } else { | |
492 e->must.right = NULL; | |
493 } | |
494 | |
495 /* fill must.in */ | |
496 if (e->must.left && e->must.right) { | |
497 l1 = strlen(e->must.left); | |
498 l2 = strlen(e->must.right); | |
499 e->must.in = word_set_new(1); | |
500 e->must.in->words[0] = l1 > l2 ? e->must.left : e->must.right; | |
501 } else if (e->must.left || e->must.right) { | |
502 e->must.in = word_set_new(1); | |
503 e->must.in->words[0] = e->must.left ? e->must.left : e->must.right; | |
504 } else { | |
505 e->must.in = word_set_new(0); | |
506 } | |
507 | |
508 break; | |
509 case Concat: | |
510 /* fill must.is */ | |
511 fill_words(e->l); | |
512 fill_words(e->r); | |
513 if (e->l->must.is && e->r->must.is) { | |
514 l1 = strlen(e->l->must.is); | |
515 l2 = strlen(e->r->must.is); | |
516 str = (char *)xmalloc(sizeof(char) * (l1+l2+1), "must: concat-is"); | |
517 strcpy(str, e->l->must.is); | |
518 strcat(str, e->r->must.is); | |
519 e->must.is = e->must.left = e->must.right = str; | |
520 e->must.in = word_set_new(1); | |
521 e->must.in->words[0] = str; | |
522 break; | |
523 } else { | |
524 e->must.is = NULL; | |
525 } | |
526 | |
527 /* fill must.left */ | |
528 if (e->l->must.is && e->r->must.left) { | |
529 l1 = strlen(e->l->must.is); | |
530 l2 = strlen(e->r->must.left); | |
531 str = (char *)xmalloc(sizeof(char) * (l1+l2+1), "must: concat-is"); | |
532 strcpy(str, e->l->must.is); | |
533 strcat(str, e->r->must.left); | |
534 e->must.left = str; | |
535 } else { | |
536 e->must.left = e->l->must.left; | |
537 } | |
538 | |
539 /* fill must.right */ | |
540 if (e->r->must.is && e->l->must.right) { | |
541 l1 = strlen(e->r->must.is); | |
542 l2 = strlen(e->l->must.right); | |
543 str = (char *)xmalloc(sizeof(char) * (l1+l2+1), "must: concat-is"); | |
544 strcpy(str, e->l->must.right); | |
545 strcat(str, e->r->must.is); | |
546 e->must.right = str; | |
547 } else { | |
548 e->must.right = e->r->must.right; | |
549 } | |
550 | |
551 /* fill must.in */ | |
552 if (e->must.is) { | |
553 e->must.in = word_set_new(1); | |
554 e->must.in->words[0] = e->must.is; | |
555 } else if (e->l->must.is && e->r->must.left) { | |
556 l1 = e->r->must.in->size; | |
557 e->must.in = word_set_new(l1); | |
558 e->must.in->words[0] = e->must.left; | |
559 for (i = 0, j = 0; i < l1; i++) { | |
560 if (!strcmp(e->r->must.left, e->r->must.in->words[i]) && j == 0) { | |
561 j = 1; | |
562 continue; | |
563 } | |
564 e->must.in->words[i+1-j] = e->r->must.in->words[i]; | |
565 } | |
566 } else if (e->l->must.right && e->r->must.is) { | |
567 l1 = e->l->must.in->size; | |
568 e->must.in = word_set_new(l1); | |
569 e->must.in->words[0] = e->must.right; | |
570 for (i = 0, j = 0; i < l1; i++) { | |
571 if (!strcmp(e->l->must.right, e->l->must.in->words[i]) && j == 0) { | |
572 j = 1; | |
573 continue; | |
574 } | |
575 e->must.in->words[i+1-j] = e->l->must.in->words[i]; | |
576 } | |
577 } else if (e->l->must.right && e->r->must.left) { | |
578 l1 = strlen(e->l->must.right); | |
579 l2 = strlen(e->r->must.left); | |
580 str = (char *)xmalloc(sizeof(char) * (l1+l2+1), "must: concat-in"); | |
581 strcpy(str, e->l->must.right); | |
582 strcat(str, e->r->must.left); | |
583 l1 = e->l->must.in->size; | |
584 l2 = e->r->must.in->size; | |
585 e->must.in = word_set_new(l1+l2-1); | |
586 e->must.in->words[0] = str; | |
587 for (i = 0, j = 0; i < l1; i++) { | |
588 if (!strcmp(e->l->must.right, e->l->must.in->words[i]) && j == 0) { | |
589 j = 1; | |
590 continue; | |
591 } | |
592 e->must.in->words[i+1-j] = e->l->must.in->words[i]; | |
593 } | |
594 for (i = 0, j = 0; i < l2; i++) { | |
595 if (!strcmp(e->l->must.right, e->l->must.in->words[i]) && j == 0) { | |
596 j = 1; | |
597 continue; | |
598 } | |
599 e->must.in->words[i+l1-j] = e->r->must.in->words[i]; | |
600 } | |
601 } else { | |
602 l1 = e->l->must.in->size; | |
603 l2 = e->r->must.in->size; | |
604 e->must.in = word_set_new(l1+l2); | |
605 for (i = 0; i < l1; i++) { | |
606 e->must.in->words[i] = e->l->must.in->words[i]; | |
607 } | |
608 for (i = 0; i < l2; i++) { | |
609 e->must.in->words[i+l1] = e->r->must.in->words[i]; | |
610 } | |
611 } | |
612 break; | |
613 case Literal: | |
614 str = (char *)xcalloc(sizeof(char), 2, "must: literal"); | |
615 str[0] = e->lit; | |
616 e->must.is = e->must.left = e->must.right = str; | |
617 e->must.in = word_set_new(1); | |
618 e->must.in->words[0] = str; | |
619 break; | |
620 default: | |
621 e->must = NOMUST; | |
622 } | |
623 } | |
624 | |
625 WORD_SET * | |
626 word_set_new (int size) | |
627 { | |
628 if (size == 0) return (WORD_SET *)&NOWORDS; | |
629 | |
630 WORD_SET *word_set = (WORD_SET *)xmalloc(sizeof(WORD_SET), "word_set"); | |
631 word_set->size = size; | |
632 if (size > 0) { | |
633 word_set->words = (char **)xcalloc(sizeof(char *), size, "words"); | |
634 } | |
635 return word_set; | |
636 } | |
637 | |
638 void fill_follow(EXPR *e) | |
639 { | |
640 switch (e->type) { | |
641 case EOP: | |
642 connect(e->l->last_pos, e->first_pos); | |
643 fill_follow(e->l); | |
644 break; | |
645 case Concat: | |
646 connect(e->l->last_pos, e->r->first_pos); | |
647 fill_follow(e->r); | |
648 fill_follow(e->l); | |
649 break; | |
650 case Union: | |
651 fill_follow(e->r); | |
652 fill_follow(e->l); | |
653 break; | |
654 case Star: case Plus: | |
655 connect(e->l->last_pos, e->l->first_pos); | |
656 fill_follow(e->l); | |
657 break; | |
658 case Qmark: | |
659 fill_follow(e->l); | |
660 break; | |
661 default: | |
662 break; | |
663 } | |
664 return; | |
665 } | |
666 | |
667 POSITION_SET * | |
668 newpos() | |
669 { | |
670 static const int init_size = 10; | |
671 POSITION_SET *pos = (POSITION_SET *)xmalloc(sizeof(POSITION_SET) ,"position set"); | |
672 pos->npos = 0; | |
673 pos->pos = (int *)xmalloc(sizeof(int)*init_size, "array in position set"); | |
674 pos->size = init_size; | |
675 return pos; | |
676 } | |
677 | |
678 void | |
679 connect(POSITION_SET *set1, POSITION_SET *set2) | |
680 { | |
681 int i, j; | |
682 EXPR **tbl = R->lit_expr; | |
683 for (i = 0; i < set1->npos; i++) { | |
684 for (j = 0; j < set2->npos; j++) { | |
685 addpos(tbl[set1->pos[i]]->follow_pos, | |
686 set2->pos[j]); | |
687 addpos(tbl[set2->pos[j]]->before_pos, | |
688 set1->pos[i]); | |
689 } | |
690 } | |
691 } | |
692 | |
693 void | |
694 copypos(POSITION_SET *src, POSITION_SET *dst) | |
695 { | |
696 int i; | |
697 for (i = 0; i < src->npos; i++) { | |
698 addpos(dst, src->pos[i]); | |
699 } | |
700 } | |
701 | |
702 void | |
703 addpos(POSITION_SET *set, int id) | |
704 { | |
705 int i; | |
706 | |
707 if (id < 0 || R->maxid < id) { | |
708 exitmsg("invalid id for addpos: %d\n", id); | |
709 } | |
710 | |
711 /* check duplication. */ | |
712 for (i = 0; i < set->npos; i++) { | |
713 if (set->pos[i] == id) { | |
714 return; | |
715 } | |
716 } | |
717 | |
718 /* check for realloc. before adding. */ | |
719 if (set->npos >= set->size) { | |
720 set->size *= 2; | |
721 set->pos = (int *)xrealloc(set->pos, (set->size)*sizeof(int), "realloc addpos"); | |
722 } | |
723 | |
724 set->pos[set->npos] = id; | |
725 set->npos++; | |
726 } | |
727 | |
728 void | |
729 delete_rept(EXPR *e) | |
730 { | |
731 #ifdef DEBUG | |
732 print_expr(R->root); | |
733 #endif | |
734 e->l->parent = e->parent; | |
735 if (e->parent->l == e) e->parent->l = e->l; | |
736 else e->parent->r = e->l; | |
737 | |
738 percolate(e->parent); | |
739 rept_compress (e->parent, e->l); | |
740 | |
741 xfree(e); | |
742 #ifdef DEBUG | |
743 print_expr(R->root); | |
744 #endif | |
745 } | |
746 | |
747 void | |
748 rept_compress(EXPR *parent, EXPR* child) | |
749 { | |
750 switch(parent->type) { | |
751 case Plus: case Star: case Qmark: break; | |
752 default: return; | |
753 } | |
754 switch(child->type) { | |
755 case Plus: case Star: case Qmark: break; | |
756 default: return; | |
757 } | |
758 | |
759 if (parent->type != child->type && parent->type != Star) { | |
760 change_rept(parent, Star); | |
761 } | |
762 | |
763 delete_rept(child); | |
764 } | |
765 | |
766 void | |
767 percolate(EXPR *e) | |
768 { | |
769 fill_expr(e); | |
770 if (e->parent) percolate(e->parent); | |
771 } | |
772 | |
773 void | |
774 change_rept(EXPR *e, EXPR_TYPE type) | |
775 { | |
776 #ifdef DEBUG | |
777 print_expr(R->root); | |
778 #endif | |
779 e->type = type; | |
780 percolate(e); | |
781 #ifdef DEBUG | |
782 print_expr(R->root); | |
783 #endif | |
784 } | |
785 | |
786 void | |
787 delete_expr(EXPR *e) | |
788 { | |
789 EXPR *parent = e->parent; | |
790 EXPR *grandparent, *sibling; | |
791 | |
792 if (!parent) { | |
793 exitmsg("INTERNAL ERROR: delete_expr abort %d\n", e->type); | |
794 } | |
795 | |
796 switch(parent->type) { | |
797 case Plus: case Star: case Qmark: case Union: | |
798 delete_expr(parent); | |
799 return; | |
800 case Concat: | |
801 break; | |
802 default: | |
803 exitmsg("INTERNAL ERROR: delete_expr abort %d\n", parent->type); | |
804 } | |
805 #ifdef DEBUG | |
806 print_expr(R->root); | |
807 #endif | |
808 grandparent = parent->parent; | |
809 | |
810 if (e == parent->r) { | |
811 sibling = parent->l; | |
812 } else { | |
813 sibling = parent->r; | |
814 } | |
815 | |
816 sibling->parent = grandparent; | |
817 | |
818 if (parent == grandparent->r) { | |
819 grandparent->r = sibling; | |
820 } else { | |
821 grandparent->l = sibling; | |
822 } | |
823 | |
824 percolate(grandparent); | |
825 | |
826 parent->r = parent->l = 0; | |
827 free_expr(e); | |
828 | |
829 rept_compress(grandparent, sibling); | |
830 #ifdef DEBUG | |
831 print_expr(R->root); | |
832 #endif | |
833 } | |
834 | |
835 BOOL | |
836 match_every_line(REGEX *r) | |
837 { | |
838 if (r->root->min_length == 0) { | |
839 free_expr(r->root); | |
840 r->maxid = 1; | |
841 r->root = newexpr(r, Literal, NL, (EXPR *)0, (EXPR *)0); | |
842 r->root = newexpr(r, EOP, '#', r->root, (EXPR *)0); | |
843 return TRUE; | |
844 } | |
845 return FALSE; | |
846 } | |
847 | |
848 void | |
849 free_expr(EXPR *e) | |
850 { | |
851 if (e == 0) return; | |
852 switch (e->type) { | |
853 case Charclass: | |
854 /* Don't free the table, because it may be shared, | |
855 * but zero the pointer to it. TODO | |
856 */ | |
857 e->r = 0; | |
858 default: | |
859 xfree(e->first_pos); | |
860 xfree(e->last_pos); | |
861 xfree(e->follow_pos); | |
862 } | |
863 | |
864 if (e->r) free_expr(e->r); | |
865 if (e->l) free_expr(e->l); | |
866 | |
867 xfree(e); | |
868 } | |
869 | |
870 void | |
871 print_expr(EXPR *e) | |
872 { | |
873 switch(e->type) { | |
874 case EOP: | |
875 print_expr(e->l); printf("#\n"); return; | |
876 case Plus: | |
877 print_expr(e->l); printf("+"); return; | |
878 case Star: | |
879 print_expr(e->l); printf("*"); return; | |
880 case Qmark: | |
881 print_expr(e->l); printf("?"); return; | |
882 case Concat: | |
883 if (PlusStarQmark(e->parent->type)) printf("("); | |
884 print_expr(e->l); print_expr(e->r); | |
885 if (PlusStarQmark(e->parent->type)) printf(")"); | |
886 return; | |
887 case Union: | |
888 if (PlusStarQmark(e->parent->type)) printf("("); | |
889 if (e->parent->type == Concat) printf("("); | |
890 print_expr(e->l); printf("|"); print_expr(e->r); | |
891 if (PlusStarQmark(e->parent->type)) printf(")"); | |
892 if (e->parent->type == Concat) printf(")"); | |
893 return; | |
894 case Dot: | |
895 printf("."); return; | |
896 case Charclass:; | |
897 int i; | |
898 printf("["); | |
899 for (i = 0; i < NCHAR; i++) { | |
900 if (((UCHARP)e->r)[i]) { | |
901 printf("%c", (UCHAR)i); | |
902 } | |
903 } | |
904 printf("]"); return; | |
905 case BegLine: | |
906 printf("^"); | |
907 return; | |
908 case EndLine: | |
909 printf("$"); | |
910 return; | |
911 case Literal: | |
912 switch (e->lit) { | |
913 case '\n': | |
914 printf("\\n"); return; | |
915 case '\v': | |
916 printf("\\v"); return; | |
917 case '\f': | |
918 printf("\\f"); return; | |
919 case '\t': | |
920 printf("\\t"); return; | |
921 case '\r': | |
922 printf("\\r"); return; | |
923 default: | |
924 printf("%c", e->lit); return; | |
925 } | |
926 default: | |
927 printf("%c", e->lit); return; | |
928 } | |
929 } | |
930 | |
931 void print_pos(EXPR *e) | |
932 { | |
933 int i; | |
934 printf("expr %d (lit='%c')\n", e->id, e->lit); | |
935 printf("first_pos:\n"); | |
936 for (i = 0; i < e->first_pos->npos; i++) { | |
937 printf("\texpr %d;\n", e->first_pos->pos[i]); | |
938 } | |
939 printf("last_pos:\n"); | |
940 for (i = 0; i < e->last_pos->npos; i++) { | |
941 printf("\texpr %d;\n", e->last_pos->pos[i]); | |
942 } | |
943 printf("follow_pos:\n"); | |
944 for (i = 0; i < e->follow_pos->npos; i++) { | |
945 printf("\texpr %d;\n", e->follow_pos->pos[i]); | |
946 } | |
947 printf("before_pos:\n"); | |
948 for (i = 0; i < e->before_pos->npos; i++) { | |
949 printf("\texpr %d;\n", e->before_pos->pos[i]); | |
950 } | |
951 | |
952 printf("\n"); | |
953 } |