Mercurial > hg > Members > nobuyasu > test
diff regen/src/parse.c @ 11:1e0cd7fade8b
add regen
author | nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Sun, 19 Jun 2011 16:36:05 +0900 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/regen/src/parse.c Sun Jun 19 16:36:05 2011 +0900 @@ -0,0 +1,953 @@ +#include "re.h" +#include "util.h" +#include "parse.h" +#include "codegen.h" + +/* +Grammar: + +EOP: eall: { e0 } EOP +Union: e0: e1 { '|' e1 }* +Concatination: e1: e2 { e2 }* +Repetition: e2: e3 { '+' | '*' | '?' } +Literal: e3: Literal | '.' | '$' | '[]' | '^' | '$' | Backref | '(' e0 ')' +*/ + +static EXPR_TYPE Toktype; +static UCHARP Toklit, Ptr, End; +static REGEX *R; +EXPR *test; + +void +lex() +{ + if (Ptr == End) { + Toktype = EOP; + Toklit = 0; + } else switch (Toklit = Ptr++, *Toklit) { + case '.': Toktype = Dot; break; + case '[': Toktype = Charclass; break; + case '|': Toktype = Union; break; + case '?': Toktype = Qmark; break; + case '+': Toktype = Plus; break; + case '*': Toktype = Star; break; + case '(': Toktype = Lpar; break; + case ')': Toktype = Rpar; break; + case '^': Toktype = BegLine; break; + case '$': Toktype = EndLine; break; + case '\\': Toktype = Literal; + if (Ptr == End) exitmsg("bad \\"); + Toklit = Ptr++; + break; + default: + Toktype = Literal; + } +} + +void +parse(REGEX *r, CODEGEN *g) +{ + EXPR *e, *dotstar; + R = r; + Ptr = (UCHARP)r->regex; + End = (UCHARP)r->regex_end; + lex(); + + e = e0(r); + if (Toktype != EOP) exitmsg("expected end of pattern."); + + r->min_length = e->min_length; + r->max_length = e->max_length; + + fill_words(e); + + r->must = e->must; + + int i; + for (i = 0; i < r->must.in->size; i++) { + if (strlen(r->must.in->words[i]) > r->must_max_length) { + r->must_max_length = strlen(r->must.in->words[i]); + r->must_max_word = r->must.in->words[i]; + } + } + for (i = 0; i < NCHAR; i++) { + if (r->involve[i]) r->ninvolve++; + } + + if (r->anchored == FALSE) { + dotstar = newexpr(r, Dot, '*', (EXPR *)0, (EXPR *)0); + dotstar = newexpr(r, Star, '\0', dotstar, (EXPR *)0); + e = newexpr(r, Concat, '\0', dotstar, e); + } + + e = newexpr(r, EOP, '#', e, (EXPR *)0); + + fill_follow(e); + + r->root = e; + + if (r->debug) { + printf("regex: %s\n\n", r->regex); + printf("min-length: %d\n", r->min_length); + printf("max-length: %d\n", r->max_length); + printf("ninvolve: %d\n\n", r->ninvolve); + for (i = 0; i < NCHAR; i++) { + if (r->involve[i]) { + printf("\"%s\", ", ASCII[i]); + } + } + puts("\n"); + if (r->must.is != NULL) { + printf("is: %s\n", r->must.is); + } + if (r->must.in != NULL) { + for (i = 0; i < r->must.in->size; i++) { + printf("in: %s\n", r->must.in->words[i]); + } + } + if (r->must.left != NULL) { + printf("left: %s\n", r->must.left); + } + if (r->must.right != NULL) { + printf("right: %s\n", r->must.right); + } + for (i = 0; i < r->maxid; i++) { + print_pos(r->lit_expr[i]); + } + exit(0); + } +} + +EXPR * +e0(REGEX *r) +{ + EXPR *e, *f; + + e = e1(r); + while (Toktype == Union) { + lex(); + f = e1(r); + e = newexpr(r, Union, '\0', e, f); + } + + return e; +} + +EXPR * +e1(REGEX *r) +{ + EXPR *e, *f; + + e = e2(r); + + while (Toktype == Literal || Toktype == Charclass + || Toktype == Dot || Toktype == Lpar + || Toktype == EndLine || Toktype == BegLine) { + f = e2(r); + e = newexpr(r, Concat, '\0', e, f); + } + return(e); +} + +EXPR * +e2(REGEX *r) +{ + EXPR *e; + e = e3(r); + for ( ; PlusStarQmark(Toktype); lex()) { + e = newexpr(r, Toktype, '\0', e, (EXPR *)0); + } + return(e); +} + +EXPR * +e3(REGEX *r) +{ + EXPR *e; + UCHAR *tbl; + int count; + + switch(Toktype) { + case Literal: + e = newexpr(r, Literal, *Toklit, (EXPR *)0, (EXPR *)0); + break; + case BegLine: + e = newexpr(r, BegLine, '^', (EXPR *)0, (EXPR *)0); + break; + case EndLine: + e = newexpr(r, EndLine, '$', (EXPR *)0, (EXPR *)0); + break; + case Dot: + e = newexpr(r, Dot, '.', (EXPR *)0, (EXPR *)0); + break; + case Charclass: + tbl = (UCHARP)xmalloc(NCHAR, "charclass tbl"); + count = ccl(tbl); + if (count == 1) { + e = newexpr(r, Literal, *tbl, (EXPR *)0, (EXPR *)0); + xfree(tbl); + } else { + e = newexpr(r, Charclass, '[', (EXPR *)count, (EXPR *)tbl); + } + break; + case Lpar: + lex(r); + e = e0(r); + if (Toktype != Rpar) exitmsg("expected a ')'"); + break; + default: + exitmsg("expected a lit or '('"); + break; + } + lex(); + return e; +} + +int +ccl(UCHAR *tbl) { + int i; + UCHAR lastc = 0;; + int range = 0; + int count = 0; + UCHAR comp; + + tbl[NCHAR] = 0; // count of characters in chaclass. + memset((char *)tbl, 0, NCHAR); + + if (*Ptr == '^') { + Ptr++; + for (i = 32; i < 126; i ++) { + tbl[i] = 1; + } + comp = 0; + } else { + comp = 1; + } + + if (*Ptr == ']' || *Ptr == '-') { + tbl[*Ptr] = comp; + lastc = *Ptr++; + } + + for (; (Ptr < End) && (*Ptr != ']'); Ptr++) { + if (!range && *Ptr == '-') { + range = 1; + continue; + } + if (*Ptr == '\\') { + if (++Ptr >= End) { + exitmsg(" [ ] imbalance"); + } + } + + tbl[*Ptr] = comp; + + if (range) { + for (i = (*Ptr) - 1; i > lastc; i--) { + tbl[i] = comp; + } + range = 0; + } + lastc = *Ptr; + } + if (Ptr == End) exitmsg(" [ ] imbalance"); + + if (range) { + tbl['-'] = comp; + range = 0; + } + + Ptr++; + + for (i = 0; i < NCHAR; i++) { + if (tbl[i]) { + lastc = i; + count++; + } + } + if (count == 1) *tbl = (char)lastc; + + /* Return the character count. */ + return count; +} + +EXPR * +newexpr(REGEX *r, EXPR_TYPE t, UCHAR lit, EXPR* left, EXPR* right) +{ + int i; + EXPR *e = (EXPR *)xmalloc(sizeof(EXPR), "newexpr"); + e->type = t; + e->lit = lit; + e->l = left; + e->r = right; + e->first_pos = newpos(); + e->last_pos = newpos(); + e->follow_pos = newpos(); + e->before_pos = newpos(); + e->parent = 0; + + switch (t) { + case EOP: + e->id = 0; + r->lit_expr[e->id] = e; + break; + case Dot: case Charclass: case Literal: case BegLine: case EndLine: + e->id = r->maxid++; + r->lit_expr[e->id] = e; + break; + default: + e->id = -1; + break; + } + + switch (t) { + case Dot: + if (e->lit != '*') { + for (i = 0; i < NCHAR; i++) { + r->involve[i] = 1; + } + } + break; + case Charclass: + for (i = 0; i < NCHAR; i++) { + if (e->tbl[i]) { + r->involve[i] = 1; + } + } + break; + case BegLine: case EndLine: + r->involve[NL] = 1; + break; + case Literal: + r->involve[e->lit] = 1; + break; + default: + break; + } + + switch (t) { + case Union: case Concat: + left->parent = e; + right->parent = e; + break; + case EOP: case Plus: case Star: case Qmark: + left->parent = e; + break; + default: + break; + } + + rept_compress(e, e->l); /* optimize */ + fill_expr(e); + return e; +} + +void +fill_expr(e) +EXPR *e; +{ + short l, r; + + switch(e->type) { + case EOP: + e->max_length = e->l->max_length; + e->min_length = e->l->min_length; + break; + case Plus: + e->max_length = -1; + e->min_length = e->l->min_length; + break; + case Star: + e->max_length = -1; + e->min_length = 0; + break; + case Qmark: + e->max_length = e->l->max_length; + e->min_length = 0; + break; + case Union: + l = e->l->max_length; r = e->r->max_length; + e->max_length = (INFOR(l, r) ? -1 : MAX(l, r)); + l = e->l->min_length; r = e->r->min_length; + e->min_length = MIN(e->l->min_length, e->r->min_length); + break; + case Concat: + l = e->l->max_length; r = e->r->max_length; + e->max_length = (INFOR(l, r) ? -1 : l + r); + l = e->l->min_length; r = e->r->min_length; + e->min_length = l + r; + break; + default: + e->min_length = e->max_length = 1; + } + + switch (e->type) { + case Star: case Qmark: case Plus: + copypos(e->l->first_pos, e->first_pos); + copypos(e->l->last_pos, e->last_pos); + break; + case Union: + copypos(e->r->first_pos, e->first_pos); + copypos(e->l->first_pos, e->first_pos); + copypos(e->r->last_pos, e->last_pos); + copypos(e->l->last_pos, e->last_pos); + break; + case Concat: + if (e->l->min_length == 0) { + copypos(e->l->first_pos, e->first_pos); + copypos(e->r->first_pos, e->first_pos); + } else { + copypos(e->l->first_pos, e->first_pos); + } + if (e->r->min_length == 0) { + copypos(e->l->last_pos, e->last_pos); + copypos(e->r->last_pos, e->last_pos); + } else { + copypos(e->r->last_pos, e->last_pos); + } + break; + case BegLine: case EndLine: case Literal: case Dot: case Charclass: case EOP: + addpos(e->first_pos, e->id); + addpos(e->last_pos, e->id); + default: + break; + } +} + +/* singleton */ +static const WORD_SET NOWORDS = {NULL, 0}; +static const MUST NOMUST = { + (WORD_SET *)&NOWORDS, NULL, NULL, NULL +}; + +void fill_words(EXPR *e) { + int l1, l2, i, j; + char *str; + + switch (e->type) { + case EOP: + fill_words(e->l); + e->must = e->l->must; + break; + case Plus: + fill_words(e->l); + e->must = e->l->must; + e->must.is = NULL; + break; + case Union: + fill_words(e->l); + fill_words(e->r); + /* fill must.is */ + if (e->l->must.is && e->r->must.is + && !strcmp(e->l->must.is, e->r->must.is)) { + e->must.is = e->must.left = e->must.right = e->l->must.is; + e->must.in = word_set_new(1); + e->must.in->words[0] = e->must.is; + return; + } else { + e->must.is = NULL; + } + + /* fill must.left */ + if (e->l->must.left && e->r->must.left) { + l1 = strlen(e->l->must.left); + l2 = strlen(e->r->must.left); + for (i = 0; i < l1 && i < l2; i++) { + if (e->l->must.left[i] != e->r->must.left[i]) { + break; + } + } + if (i) { + str = (char *)xmalloc(sizeof(char) * (i+1), "must: union-left"); + for (j = 0; j < i; j++) str[j] = e->l->must.left[j]; + str[j] = '\0'; + e->must.left = str; + } else { + /* no common words */ + e->must.left = NULL; + } + } else { + e->must.left = NULL; + } + + /* fill must.right */ + if (e->l->must.right && e->r->must.right) { + l1 = strlen(e->l->must.right); + l2 = strlen(e->r->must.right); + for (i = 0; i < l1 && i < l2; i++) { + if (e->l->must.right[l1-i-1] != e->r->must.right[l2-i-1]) { + break; + } + } + if (i) { + str = (char *)xmalloc(sizeof(char) * (i+1), "must: union-right"); + for (j = 0; j < i; j++) str[i-j-1] = e->l->must.right[l1-j-1]; + str[j] = '\0'; + e->must.right = str; + } else { + /* no common words */ + e->must.right = NULL; + } + } else { + e->must.right = NULL; + } + + /* fill must.in */ + if (e->must.left && e->must.right) { + l1 = strlen(e->must.left); + l2 = strlen(e->must.right); + e->must.in = word_set_new(1); + e->must.in->words[0] = l1 > l2 ? e->must.left : e->must.right; + } else if (e->must.left || e->must.right) { + e->must.in = word_set_new(1); + e->must.in->words[0] = e->must.left ? e->must.left : e->must.right; + } else { + e->must.in = word_set_new(0); + } + + break; + case Concat: + /* fill must.is */ + fill_words(e->l); + fill_words(e->r); + if (e->l->must.is && e->r->must.is) { + l1 = strlen(e->l->must.is); + l2 = strlen(e->r->must.is); + str = (char *)xmalloc(sizeof(char) * (l1+l2+1), "must: concat-is"); + strcpy(str, e->l->must.is); + strcat(str, e->r->must.is); + e->must.is = e->must.left = e->must.right = str; + e->must.in = word_set_new(1); + e->must.in->words[0] = str; + break; + } else { + e->must.is = NULL; + } + + /* fill must.left */ + if (e->l->must.is && e->r->must.left) { + l1 = strlen(e->l->must.is); + l2 = strlen(e->r->must.left); + str = (char *)xmalloc(sizeof(char) * (l1+l2+1), "must: concat-is"); + strcpy(str, e->l->must.is); + strcat(str, e->r->must.left); + e->must.left = str; + } else { + e->must.left = e->l->must.left; + } + + /* fill must.right */ + if (e->r->must.is && e->l->must.right) { + l1 = strlen(e->r->must.is); + l2 = strlen(e->l->must.right); + str = (char *)xmalloc(sizeof(char) * (l1+l2+1), "must: concat-is"); + strcpy(str, e->l->must.right); + strcat(str, e->r->must.is); + e->must.right = str; + } else { + e->must.right = e->r->must.right; + } + + /* fill must.in */ + if (e->must.is) { + e->must.in = word_set_new(1); + e->must.in->words[0] = e->must.is; + } else if (e->l->must.is && e->r->must.left) { + l1 = e->r->must.in->size; + e->must.in = word_set_new(l1); + e->must.in->words[0] = e->must.left; + for (i = 0, j = 0; i < l1; i++) { + if (!strcmp(e->r->must.left, e->r->must.in->words[i]) && j == 0) { + j = 1; + continue; + } + e->must.in->words[i+1-j] = e->r->must.in->words[i]; + } + } else if (e->l->must.right && e->r->must.is) { + l1 = e->l->must.in->size; + e->must.in = word_set_new(l1); + e->must.in->words[0] = e->must.right; + for (i = 0, j = 0; i < l1; i++) { + if (!strcmp(e->l->must.right, e->l->must.in->words[i]) && j == 0) { + j = 1; + continue; + } + e->must.in->words[i+1-j] = e->l->must.in->words[i]; + } + } else if (e->l->must.right && e->r->must.left) { + l1 = strlen(e->l->must.right); + l2 = strlen(e->r->must.left); + str = (char *)xmalloc(sizeof(char) * (l1+l2+1), "must: concat-in"); + strcpy(str, e->l->must.right); + strcat(str, e->r->must.left); + l1 = e->l->must.in->size; + l2 = e->r->must.in->size; + e->must.in = word_set_new(l1+l2-1); + e->must.in->words[0] = str; + for (i = 0, j = 0; i < l1; i++) { + if (!strcmp(e->l->must.right, e->l->must.in->words[i]) && j == 0) { + j = 1; + continue; + } + e->must.in->words[i+1-j] = e->l->must.in->words[i]; + } + for (i = 0, j = 0; i < l2; i++) { + if (!strcmp(e->l->must.right, e->l->must.in->words[i]) && j == 0) { + j = 1; + continue; + } + e->must.in->words[i+l1-j] = e->r->must.in->words[i]; + } + } else { + l1 = e->l->must.in->size; + l2 = e->r->must.in->size; + e->must.in = word_set_new(l1+l2); + for (i = 0; i < l1; i++) { + e->must.in->words[i] = e->l->must.in->words[i]; + } + for (i = 0; i < l2; i++) { + e->must.in->words[i+l1] = e->r->must.in->words[i]; + } + } + break; + case Literal: + str = (char *)xcalloc(sizeof(char), 2, "must: literal"); + str[0] = e->lit; + e->must.is = e->must.left = e->must.right = str; + e->must.in = word_set_new(1); + e->must.in->words[0] = str; + break; + default: + e->must = NOMUST; + } +} + +WORD_SET * +word_set_new (int size) +{ + if (size == 0) return (WORD_SET *)&NOWORDS; + + WORD_SET *word_set = (WORD_SET *)xmalloc(sizeof(WORD_SET), "word_set"); + word_set->size = size; + if (size > 0) { + word_set->words = (char **)xcalloc(sizeof(char *), size, "words"); + } + return word_set; +} + +void fill_follow(EXPR *e) +{ + switch (e->type) { + case EOP: + connect(e->l->last_pos, e->first_pos); + fill_follow(e->l); + break; + case Concat: + connect(e->l->last_pos, e->r->first_pos); + fill_follow(e->r); + fill_follow(e->l); + break; + case Union: + fill_follow(e->r); + fill_follow(e->l); + break; + case Star: case Plus: + connect(e->l->last_pos, e->l->first_pos); + fill_follow(e->l); + break; + case Qmark: + fill_follow(e->l); + break; + default: + break; + } + return; +} + +POSITION_SET * +newpos() +{ + static const int init_size = 10; + POSITION_SET *pos = (POSITION_SET *)xmalloc(sizeof(POSITION_SET) ,"position set"); + pos->npos = 0; + pos->pos = (int *)xmalloc(sizeof(int)*init_size, "array in position set"); + pos->size = init_size; + return pos; +} + +void +connect(POSITION_SET *set1, POSITION_SET *set2) +{ + int i, j; + EXPR **tbl = R->lit_expr; + for (i = 0; i < set1->npos; i++) { + for (j = 0; j < set2->npos; j++) { + addpos(tbl[set1->pos[i]]->follow_pos, + set2->pos[j]); + addpos(tbl[set2->pos[j]]->before_pos, + set1->pos[i]); + } + } +} + +void +copypos(POSITION_SET *src, POSITION_SET *dst) +{ + int i; + for (i = 0; i < src->npos; i++) { + addpos(dst, src->pos[i]); + } +} + +void +addpos(POSITION_SET *set, int id) +{ + int i; + + if (id < 0 || R->maxid < id) { + exitmsg("invalid id for addpos: %d\n", id); + } + + /* check duplication. */ + for (i = 0; i < set->npos; i++) { + if (set->pos[i] == id) { + return; + } + } + + /* check for realloc. before adding. */ + if (set->npos >= set->size) { + set->size *= 2; + set->pos = (int *)xrealloc(set->pos, (set->size)*sizeof(int), "realloc addpos"); + } + + set->pos[set->npos] = id; + set->npos++; +} + +void +delete_rept(EXPR *e) +{ +#ifdef DEBUG + print_expr(R->root); +#endif + e->l->parent = e->parent; + if (e->parent->l == e) e->parent->l = e->l; + else e->parent->r = e->l; + + percolate(e->parent); + rept_compress (e->parent, e->l); + + xfree(e); +#ifdef DEBUG + print_expr(R->root); +#endif +} + +void +rept_compress(EXPR *parent, EXPR* child) +{ + switch(parent->type) { + case Plus: case Star: case Qmark: break; + default: return; + } + switch(child->type) { + case Plus: case Star: case Qmark: break; + default: return; + } + + if (parent->type != child->type && parent->type != Star) { + change_rept(parent, Star); + } + + delete_rept(child); +} + +void +percolate(EXPR *e) +{ + fill_expr(e); + if (e->parent) percolate(e->parent); +} + +void +change_rept(EXPR *e, EXPR_TYPE type) +{ +#ifdef DEBUG + print_expr(R->root); +#endif + e->type = type; + percolate(e); +#ifdef DEBUG + print_expr(R->root); +#endif +} + +void +delete_expr(EXPR *e) +{ + EXPR *parent = e->parent; + EXPR *grandparent, *sibling; + + if (!parent) { + exitmsg("INTERNAL ERROR: delete_expr abort %d\n", e->type); + } + + switch(parent->type) { + case Plus: case Star: case Qmark: case Union: + delete_expr(parent); + return; + case Concat: + break; + default: + exitmsg("INTERNAL ERROR: delete_expr abort %d\n", parent->type); + } +#ifdef DEBUG + print_expr(R->root); +#endif + grandparent = parent->parent; + + if (e == parent->r) { + sibling = parent->l; + } else { + sibling = parent->r; + } + + sibling->parent = grandparent; + + if (parent == grandparent->r) { + grandparent->r = sibling; + } else { + grandparent->l = sibling; + } + + percolate(grandparent); + + parent->r = parent->l = 0; + free_expr(e); + + rept_compress(grandparent, sibling); +#ifdef DEBUG + print_expr(R->root); +#endif +} + +BOOL +match_every_line(REGEX *r) +{ + if (r->root->min_length == 0) { + free_expr(r->root); + r->maxid = 1; + r->root = newexpr(r, Literal, NL, (EXPR *)0, (EXPR *)0); + r->root = newexpr(r, EOP, '#', r->root, (EXPR *)0); + return TRUE; + } + return FALSE; +} + +void +free_expr(EXPR *e) +{ + if (e == 0) return; + switch (e->type) { + case Charclass: + /* Don't free the table, because it may be shared, + * but zero the pointer to it. TODO + */ + e->r = 0; + default: + xfree(e->first_pos); + xfree(e->last_pos); + xfree(e->follow_pos); + } + + if (e->r) free_expr(e->r); + if (e->l) free_expr(e->l); + + xfree(e); +} + +void +print_expr(EXPR *e) +{ + switch(e->type) { + case EOP: + print_expr(e->l); printf("#\n"); return; + case Plus: + print_expr(e->l); printf("+"); return; + case Star: + print_expr(e->l); printf("*"); return; + case Qmark: + print_expr(e->l); printf("?"); return; + case Concat: + if (PlusStarQmark(e->parent->type)) printf("("); + print_expr(e->l); print_expr(e->r); + if (PlusStarQmark(e->parent->type)) printf(")"); + return; + case Union: + if (PlusStarQmark(e->parent->type)) printf("("); + if (e->parent->type == Concat) printf("("); + print_expr(e->l); printf("|"); print_expr(e->r); + if (PlusStarQmark(e->parent->type)) printf(")"); + if (e->parent->type == Concat) printf(")"); + return; + case Dot: + printf("."); return; + case Charclass:; + int i; + printf("["); + for (i = 0; i < NCHAR; i++) { + if (((UCHARP)e->r)[i]) { + printf("%c", (UCHAR)i); + } + } + printf("]"); return; + case BegLine: + printf("^"); + return; + case EndLine: + printf("$"); + return; + case Literal: + switch (e->lit) { + case '\n': + printf("\\n"); return; + case '\v': + printf("\\v"); return; + case '\f': + printf("\\f"); return; + case '\t': + printf("\\t"); return; + case '\r': + printf("\\r"); return; + default: + printf("%c", e->lit); return; + } + default: + printf("%c", e->lit); return; + } +} + +void print_pos(EXPR *e) +{ + int i; + printf("expr %d (lit='%c')\n", e->id, e->lit); + printf("first_pos:\n"); + for (i = 0; i < e->first_pos->npos; i++) { + printf("\texpr %d;\n", e->first_pos->pos[i]); + } + printf("last_pos:\n"); + for (i = 0; i < e->last_pos->npos; i++) { + printf("\texpr %d;\n", e->last_pos->pos[i]); + } + printf("follow_pos:\n"); + for (i = 0; i < e->follow_pos->npos; i++) { + printf("\texpr %d;\n", e->follow_pos->pos[i]); + } + printf("before_pos:\n"); + for (i = 0; i < e->before_pos->npos; i++) { + printf("\texpr %d;\n", e->before_pos->pos[i]); + } + + printf("\n"); +}