Mercurial > hg > Members > nobuyasu > test
view regen/src/parse.c @ 15:b48915bffa79 draft default tip
commit search.py
author | Nobuyasu Oshiro <dimolto@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Sun, 01 Jul 2012 00:31:22 +0900 |
parents | 1e0cd7fade8b |
children |
line wrap: on
line source
#include "re.h" #include "util.h" #include "parse.h" #include "codegen.h" /* Grammar: EOP: eall: { e0 } EOP Union: e0: e1 { '|' e1 }* Concatination: e1: e2 { e2 }* Repetition: e2: e3 { '+' | '*' | '?' } Literal: e3: Literal | '.' | '$' | '[]' | '^' | '$' | Backref | '(' e0 ')' */ static EXPR_TYPE Toktype; static UCHARP Toklit, Ptr, End; static REGEX *R; EXPR *test; void lex() { if (Ptr == End) { Toktype = EOP; Toklit = 0; } else switch (Toklit = Ptr++, *Toklit) { case '.': Toktype = Dot; break; case '[': Toktype = Charclass; break; case '|': Toktype = Union; break; case '?': Toktype = Qmark; break; case '+': Toktype = Plus; break; case '*': Toktype = Star; break; case '(': Toktype = Lpar; break; case ')': Toktype = Rpar; break; case '^': Toktype = BegLine; break; case '$': Toktype = EndLine; break; case '\\': Toktype = Literal; if (Ptr == End) exitmsg("bad \\"); Toklit = Ptr++; break; default: Toktype = Literal; } } void parse(REGEX *r, CODEGEN *g) { EXPR *e, *dotstar; R = r; Ptr = (UCHARP)r->regex; End = (UCHARP)r->regex_end; lex(); e = e0(r); if (Toktype != EOP) exitmsg("expected end of pattern."); r->min_length = e->min_length; r->max_length = e->max_length; fill_words(e); r->must = e->must; int i; for (i = 0; i < r->must.in->size; i++) { if (strlen(r->must.in->words[i]) > r->must_max_length) { r->must_max_length = strlen(r->must.in->words[i]); r->must_max_word = r->must.in->words[i]; } } for (i = 0; i < NCHAR; i++) { if (r->involve[i]) r->ninvolve++; } if (r->anchored == FALSE) { dotstar = newexpr(r, Dot, '*', (EXPR *)0, (EXPR *)0); dotstar = newexpr(r, Star, '\0', dotstar, (EXPR *)0); e = newexpr(r, Concat, '\0', dotstar, e); } e = newexpr(r, EOP, '#', e, (EXPR *)0); fill_follow(e); r->root = e; if (r->debug) { printf("regex: %s\n\n", r->regex); printf("min-length: %d\n", r->min_length); printf("max-length: %d\n", r->max_length); printf("ninvolve: %d\n\n", r->ninvolve); for (i = 0; i < NCHAR; i++) { if (r->involve[i]) { printf("\"%s\", ", ASCII[i]); } } puts("\n"); if (r->must.is != NULL) { printf("is: %s\n", r->must.is); } if (r->must.in != NULL) { for (i = 0; i < r->must.in->size; i++) { printf("in: %s\n", r->must.in->words[i]); } } if (r->must.left != NULL) { printf("left: %s\n", r->must.left); } if (r->must.right != NULL) { printf("right: %s\n", r->must.right); } for (i = 0; i < r->maxid; i++) { print_pos(r->lit_expr[i]); } exit(0); } } EXPR * e0(REGEX *r) { EXPR *e, *f; e = e1(r); while (Toktype == Union) { lex(); f = e1(r); e = newexpr(r, Union, '\0', e, f); } return e; } EXPR * e1(REGEX *r) { EXPR *e, *f; e = e2(r); while (Toktype == Literal || Toktype == Charclass || Toktype == Dot || Toktype == Lpar || Toktype == EndLine || Toktype == BegLine) { f = e2(r); e = newexpr(r, Concat, '\0', e, f); } return(e); } EXPR * e2(REGEX *r) { EXPR *e; e = e3(r); for ( ; PlusStarQmark(Toktype); lex()) { e = newexpr(r, Toktype, '\0', e, (EXPR *)0); } return(e); } EXPR * e3(REGEX *r) { EXPR *e; UCHAR *tbl; int count; switch(Toktype) { case Literal: e = newexpr(r, Literal, *Toklit, (EXPR *)0, (EXPR *)0); break; case BegLine: e = newexpr(r, BegLine, '^', (EXPR *)0, (EXPR *)0); break; case EndLine: e = newexpr(r, EndLine, '$', (EXPR *)0, (EXPR *)0); break; case Dot: e = newexpr(r, Dot, '.', (EXPR *)0, (EXPR *)0); break; case Charclass: tbl = (UCHARP)xmalloc(NCHAR, "charclass tbl"); count = ccl(tbl); if (count == 1) { e = newexpr(r, Literal, *tbl, (EXPR *)0, (EXPR *)0); xfree(tbl); } else { e = newexpr(r, Charclass, '[', (EXPR *)count, (EXPR *)tbl); } break; case Lpar: lex(r); e = e0(r); if (Toktype != Rpar) exitmsg("expected a ')'"); break; default: exitmsg("expected a lit or '('"); break; } lex(); return e; } int ccl(UCHAR *tbl) { int i; UCHAR lastc = 0;; int range = 0; int count = 0; UCHAR comp; tbl[NCHAR] = 0; // count of characters in chaclass. memset((char *)tbl, 0, NCHAR); if (*Ptr == '^') { Ptr++; for (i = 32; i < 126; i ++) { tbl[i] = 1; } comp = 0; } else { comp = 1; } if (*Ptr == ']' || *Ptr == '-') { tbl[*Ptr] = comp; lastc = *Ptr++; } for (; (Ptr < End) && (*Ptr != ']'); Ptr++) { if (!range && *Ptr == '-') { range = 1; continue; } if (*Ptr == '\\') { if (++Ptr >= End) { exitmsg(" [ ] imbalance"); } } tbl[*Ptr] = comp; if (range) { for (i = (*Ptr) - 1; i > lastc; i--) { tbl[i] = comp; } range = 0; } lastc = *Ptr; } if (Ptr == End) exitmsg(" [ ] imbalance"); if (range) { tbl['-'] = comp; range = 0; } Ptr++; for (i = 0; i < NCHAR; i++) { if (tbl[i]) { lastc = i; count++; } } if (count == 1) *tbl = (char)lastc; /* Return the character count. */ return count; } EXPR * newexpr(REGEX *r, EXPR_TYPE t, UCHAR lit, EXPR* left, EXPR* right) { int i; EXPR *e = (EXPR *)xmalloc(sizeof(EXPR), "newexpr"); e->type = t; e->lit = lit; e->l = left; e->r = right; e->first_pos = newpos(); e->last_pos = newpos(); e->follow_pos = newpos(); e->before_pos = newpos(); e->parent = 0; switch (t) { case EOP: e->id = 0; r->lit_expr[e->id] = e; break; case Dot: case Charclass: case Literal: case BegLine: case EndLine: e->id = r->maxid++; r->lit_expr[e->id] = e; break; default: e->id = -1; break; } switch (t) { case Dot: if (e->lit != '*') { for (i = 0; i < NCHAR; i++) { r->involve[i] = 1; } } break; case Charclass: for (i = 0; i < NCHAR; i++) { if (e->tbl[i]) { r->involve[i] = 1; } } break; case BegLine: case EndLine: r->involve[NL] = 1; break; case Literal: r->involve[e->lit] = 1; break; default: break; } switch (t) { case Union: case Concat: left->parent = e; right->parent = e; break; case EOP: case Plus: case Star: case Qmark: left->parent = e; break; default: break; } rept_compress(e, e->l); /* optimize */ fill_expr(e); return e; } void fill_expr(e) EXPR *e; { short l, r; switch(e->type) { case EOP: e->max_length = e->l->max_length; e->min_length = e->l->min_length; break; case Plus: e->max_length = -1; e->min_length = e->l->min_length; break; case Star: e->max_length = -1; e->min_length = 0; break; case Qmark: e->max_length = e->l->max_length; e->min_length = 0; break; case Union: l = e->l->max_length; r = e->r->max_length; e->max_length = (INFOR(l, r) ? -1 : MAX(l, r)); l = e->l->min_length; r = e->r->min_length; e->min_length = MIN(e->l->min_length, e->r->min_length); break; case Concat: l = e->l->max_length; r = e->r->max_length; e->max_length = (INFOR(l, r) ? -1 : l + r); l = e->l->min_length; r = e->r->min_length; e->min_length = l + r; break; default: e->min_length = e->max_length = 1; } switch (e->type) { case Star: case Qmark: case Plus: copypos(e->l->first_pos, e->first_pos); copypos(e->l->last_pos, e->last_pos); break; case Union: copypos(e->r->first_pos, e->first_pos); copypos(e->l->first_pos, e->first_pos); copypos(e->r->last_pos, e->last_pos); copypos(e->l->last_pos, e->last_pos); break; case Concat: if (e->l->min_length == 0) { copypos(e->l->first_pos, e->first_pos); copypos(e->r->first_pos, e->first_pos); } else { copypos(e->l->first_pos, e->first_pos); } if (e->r->min_length == 0) { copypos(e->l->last_pos, e->last_pos); copypos(e->r->last_pos, e->last_pos); } else { copypos(e->r->last_pos, e->last_pos); } break; case BegLine: case EndLine: case Literal: case Dot: case Charclass: case EOP: addpos(e->first_pos, e->id); addpos(e->last_pos, e->id); default: break; } } /* singleton */ static const WORD_SET NOWORDS = {NULL, 0}; static const MUST NOMUST = { (WORD_SET *)&NOWORDS, NULL, NULL, NULL }; void fill_words(EXPR *e) { int l1, l2, i, j; char *str; switch (e->type) { case EOP: fill_words(e->l); e->must = e->l->must; break; case Plus: fill_words(e->l); e->must = e->l->must; e->must.is = NULL; break; case Union: fill_words(e->l); fill_words(e->r); /* fill must.is */ if (e->l->must.is && e->r->must.is && !strcmp(e->l->must.is, e->r->must.is)) { e->must.is = e->must.left = e->must.right = e->l->must.is; e->must.in = word_set_new(1); e->must.in->words[0] = e->must.is; return; } else { e->must.is = NULL; } /* fill must.left */ if (e->l->must.left && e->r->must.left) { l1 = strlen(e->l->must.left); l2 = strlen(e->r->must.left); for (i = 0; i < l1 && i < l2; i++) { if (e->l->must.left[i] != e->r->must.left[i]) { break; } } if (i) { str = (char *)xmalloc(sizeof(char) * (i+1), "must: union-left"); for (j = 0; j < i; j++) str[j] = e->l->must.left[j]; str[j] = '\0'; e->must.left = str; } else { /* no common words */ e->must.left = NULL; } } else { e->must.left = NULL; } /* fill must.right */ if (e->l->must.right && e->r->must.right) { l1 = strlen(e->l->must.right); l2 = strlen(e->r->must.right); for (i = 0; i < l1 && i < l2; i++) { if (e->l->must.right[l1-i-1] != e->r->must.right[l2-i-1]) { break; } } if (i) { str = (char *)xmalloc(sizeof(char) * (i+1), "must: union-right"); for (j = 0; j < i; j++) str[i-j-1] = e->l->must.right[l1-j-1]; str[j] = '\0'; e->must.right = str; } else { /* no common words */ e->must.right = NULL; } } else { e->must.right = NULL; } /* fill must.in */ if (e->must.left && e->must.right) { l1 = strlen(e->must.left); l2 = strlen(e->must.right); e->must.in = word_set_new(1); e->must.in->words[0] = l1 > l2 ? e->must.left : e->must.right; } else if (e->must.left || e->must.right) { e->must.in = word_set_new(1); e->must.in->words[0] = e->must.left ? e->must.left : e->must.right; } else { e->must.in = word_set_new(0); } break; case Concat: /* fill must.is */ fill_words(e->l); fill_words(e->r); if (e->l->must.is && e->r->must.is) { l1 = strlen(e->l->must.is); l2 = strlen(e->r->must.is); str = (char *)xmalloc(sizeof(char) * (l1+l2+1), "must: concat-is"); strcpy(str, e->l->must.is); strcat(str, e->r->must.is); e->must.is = e->must.left = e->must.right = str; e->must.in = word_set_new(1); e->must.in->words[0] = str; break; } else { e->must.is = NULL; } /* fill must.left */ if (e->l->must.is && e->r->must.left) { l1 = strlen(e->l->must.is); l2 = strlen(e->r->must.left); str = (char *)xmalloc(sizeof(char) * (l1+l2+1), "must: concat-is"); strcpy(str, e->l->must.is); strcat(str, e->r->must.left); e->must.left = str; } else { e->must.left = e->l->must.left; } /* fill must.right */ if (e->r->must.is && e->l->must.right) { l1 = strlen(e->r->must.is); l2 = strlen(e->l->must.right); str = (char *)xmalloc(sizeof(char) * (l1+l2+1), "must: concat-is"); strcpy(str, e->l->must.right); strcat(str, e->r->must.is); e->must.right = str; } else { e->must.right = e->r->must.right; } /* fill must.in */ if (e->must.is) { e->must.in = word_set_new(1); e->must.in->words[0] = e->must.is; } else if (e->l->must.is && e->r->must.left) { l1 = e->r->must.in->size; e->must.in = word_set_new(l1); e->must.in->words[0] = e->must.left; for (i = 0, j = 0; i < l1; i++) { if (!strcmp(e->r->must.left, e->r->must.in->words[i]) && j == 0) { j = 1; continue; } e->must.in->words[i+1-j] = e->r->must.in->words[i]; } } else if (e->l->must.right && e->r->must.is) { l1 = e->l->must.in->size; e->must.in = word_set_new(l1); e->must.in->words[0] = e->must.right; for (i = 0, j = 0; i < l1; i++) { if (!strcmp(e->l->must.right, e->l->must.in->words[i]) && j == 0) { j = 1; continue; } e->must.in->words[i+1-j] = e->l->must.in->words[i]; } } else if (e->l->must.right && e->r->must.left) { l1 = strlen(e->l->must.right); l2 = strlen(e->r->must.left); str = (char *)xmalloc(sizeof(char) * (l1+l2+1), "must: concat-in"); strcpy(str, e->l->must.right); strcat(str, e->r->must.left); l1 = e->l->must.in->size; l2 = e->r->must.in->size; e->must.in = word_set_new(l1+l2-1); e->must.in->words[0] = str; for (i = 0, j = 0; i < l1; i++) { if (!strcmp(e->l->must.right, e->l->must.in->words[i]) && j == 0) { j = 1; continue; } e->must.in->words[i+1-j] = e->l->must.in->words[i]; } for (i = 0, j = 0; i < l2; i++) { if (!strcmp(e->l->must.right, e->l->must.in->words[i]) && j == 0) { j = 1; continue; } e->must.in->words[i+l1-j] = e->r->must.in->words[i]; } } else { l1 = e->l->must.in->size; l2 = e->r->must.in->size; e->must.in = word_set_new(l1+l2); for (i = 0; i < l1; i++) { e->must.in->words[i] = e->l->must.in->words[i]; } for (i = 0; i < l2; i++) { e->must.in->words[i+l1] = e->r->must.in->words[i]; } } break; case Literal: str = (char *)xcalloc(sizeof(char), 2, "must: literal"); str[0] = e->lit; e->must.is = e->must.left = e->must.right = str; e->must.in = word_set_new(1); e->must.in->words[0] = str; break; default: e->must = NOMUST; } } WORD_SET * word_set_new (int size) { if (size == 0) return (WORD_SET *)&NOWORDS; WORD_SET *word_set = (WORD_SET *)xmalloc(sizeof(WORD_SET), "word_set"); word_set->size = size; if (size > 0) { word_set->words = (char **)xcalloc(sizeof(char *), size, "words"); } return word_set; } void fill_follow(EXPR *e) { switch (e->type) { case EOP: connect(e->l->last_pos, e->first_pos); fill_follow(e->l); break; case Concat: connect(e->l->last_pos, e->r->first_pos); fill_follow(e->r); fill_follow(e->l); break; case Union: fill_follow(e->r); fill_follow(e->l); break; case Star: case Plus: connect(e->l->last_pos, e->l->first_pos); fill_follow(e->l); break; case Qmark: fill_follow(e->l); break; default: break; } return; } POSITION_SET * newpos() { static const int init_size = 10; POSITION_SET *pos = (POSITION_SET *)xmalloc(sizeof(POSITION_SET) ,"position set"); pos->npos = 0; pos->pos = (int *)xmalloc(sizeof(int)*init_size, "array in position set"); pos->size = init_size; return pos; } void connect(POSITION_SET *set1, POSITION_SET *set2) { int i, j; EXPR **tbl = R->lit_expr; for (i = 0; i < set1->npos; i++) { for (j = 0; j < set2->npos; j++) { addpos(tbl[set1->pos[i]]->follow_pos, set2->pos[j]); addpos(tbl[set2->pos[j]]->before_pos, set1->pos[i]); } } } void copypos(POSITION_SET *src, POSITION_SET *dst) { int i; for (i = 0; i < src->npos; i++) { addpos(dst, src->pos[i]); } } void addpos(POSITION_SET *set, int id) { int i; if (id < 0 || R->maxid < id) { exitmsg("invalid id for addpos: %d\n", id); } /* check duplication. */ for (i = 0; i < set->npos; i++) { if (set->pos[i] == id) { return; } } /* check for realloc. before adding. */ if (set->npos >= set->size) { set->size *= 2; set->pos = (int *)xrealloc(set->pos, (set->size)*sizeof(int), "realloc addpos"); } set->pos[set->npos] = id; set->npos++; } void delete_rept(EXPR *e) { #ifdef DEBUG print_expr(R->root); #endif e->l->parent = e->parent; if (e->parent->l == e) e->parent->l = e->l; else e->parent->r = e->l; percolate(e->parent); rept_compress (e->parent, e->l); xfree(e); #ifdef DEBUG print_expr(R->root); #endif } void rept_compress(EXPR *parent, EXPR* child) { switch(parent->type) { case Plus: case Star: case Qmark: break; default: return; } switch(child->type) { case Plus: case Star: case Qmark: break; default: return; } if (parent->type != child->type && parent->type != Star) { change_rept(parent, Star); } delete_rept(child); } void percolate(EXPR *e) { fill_expr(e); if (e->parent) percolate(e->parent); } void change_rept(EXPR *e, EXPR_TYPE type) { #ifdef DEBUG print_expr(R->root); #endif e->type = type; percolate(e); #ifdef DEBUG print_expr(R->root); #endif } void delete_expr(EXPR *e) { EXPR *parent = e->parent; EXPR *grandparent, *sibling; if (!parent) { exitmsg("INTERNAL ERROR: delete_expr abort %d\n", e->type); } switch(parent->type) { case Plus: case Star: case Qmark: case Union: delete_expr(parent); return; case Concat: break; default: exitmsg("INTERNAL ERROR: delete_expr abort %d\n", parent->type); } #ifdef DEBUG print_expr(R->root); #endif grandparent = parent->parent; if (e == parent->r) { sibling = parent->l; } else { sibling = parent->r; } sibling->parent = grandparent; if (parent == grandparent->r) { grandparent->r = sibling; } else { grandparent->l = sibling; } percolate(grandparent); parent->r = parent->l = 0; free_expr(e); rept_compress(grandparent, sibling); #ifdef DEBUG print_expr(R->root); #endif } BOOL match_every_line(REGEX *r) { if (r->root->min_length == 0) { free_expr(r->root); r->maxid = 1; r->root = newexpr(r, Literal, NL, (EXPR *)0, (EXPR *)0); r->root = newexpr(r, EOP, '#', r->root, (EXPR *)0); return TRUE; } return FALSE; } void free_expr(EXPR *e) { if (e == 0) return; switch (e->type) { case Charclass: /* Don't free the table, because it may be shared, * but zero the pointer to it. TODO */ e->r = 0; default: xfree(e->first_pos); xfree(e->last_pos); xfree(e->follow_pos); } if (e->r) free_expr(e->r); if (e->l) free_expr(e->l); xfree(e); } void print_expr(EXPR *e) { switch(e->type) { case EOP: print_expr(e->l); printf("#\n"); return; case Plus: print_expr(e->l); printf("+"); return; case Star: print_expr(e->l); printf("*"); return; case Qmark: print_expr(e->l); printf("?"); return; case Concat: if (PlusStarQmark(e->parent->type)) printf("("); print_expr(e->l); print_expr(e->r); if (PlusStarQmark(e->parent->type)) printf(")"); return; case Union: if (PlusStarQmark(e->parent->type)) printf("("); if (e->parent->type == Concat) printf("("); print_expr(e->l); printf("|"); print_expr(e->r); if (PlusStarQmark(e->parent->type)) printf(")"); if (e->parent->type == Concat) printf(")"); return; case Dot: printf("."); return; case Charclass:; int i; printf("["); for (i = 0; i < NCHAR; i++) { if (((UCHARP)e->r)[i]) { printf("%c", (UCHAR)i); } } printf("]"); return; case BegLine: printf("^"); return; case EndLine: printf("$"); return; case Literal: switch (e->lit) { case '\n': printf("\\n"); return; case '\v': printf("\\v"); return; case '\f': printf("\\f"); return; case '\t': printf("\\t"); return; case '\r': printf("\\r"); return; default: printf("%c", e->lit); return; } default: printf("%c", e->lit); return; } } void print_pos(EXPR *e) { int i; printf("expr %d (lit='%c')\n", e->id, e->lit); printf("first_pos:\n"); for (i = 0; i < e->first_pos->npos; i++) { printf("\texpr %d;\n", e->first_pos->pos[i]); } printf("last_pos:\n"); for (i = 0; i < e->last_pos->npos; i++) { printf("\texpr %d;\n", e->last_pos->pos[i]); } printf("follow_pos:\n"); for (i = 0; i < e->follow_pos->npos; i++) { printf("\texpr %d;\n", e->follow_pos->pos[i]); } printf("before_pos:\n"); for (i = 0; i < e->before_pos->npos; i++) { printf("\texpr %d;\n", e->before_pos->pos[i]); } printf("\n"); }