Mercurial > hg > Members > shinya > pyrect
changeset 97:5db856953793
implement range-expression. and add repeat-mn syntax(ex. A{1,10}).
author | Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Sun, 12 Dec 2010 19:02:37 +0900 |
parents | 7649b26eeb4d |
children | 4d498b002de5 |
files | pyrect/regexp/ast.py pyrect/regexp/callgraph.py pyrect/regexp/fa.py pyrect/regexp/lexer.py pyrect/regexp/nfa_translator.py pyrect/regexp/parser.py pyrect/translator/grep_translator.py |
diffstat | 7 files changed, 100 insertions(+), 79 deletions(-) [+] |
line wrap: on
line diff
--- a/pyrect/regexp/ast.py Mon Dec 06 12:06:27 2010 +0900 +++ b/pyrect/regexp/ast.py Sun Dec 12 19:02:37 2010 +0900 @@ -134,6 +134,9 @@ else: return -1 +class SpecialInputNode(InputNode): + __metaclass__ = Singleton + class Character(InputNode): import curses.ascii as ascii ASCII = ascii.controlnames + \ @@ -167,7 +170,7 @@ BegLine, EndLine, """ -class Anchor(InputNode): +class Anchor(SpecialInputNode): pass class BegLine(Anchor): @@ -213,3 +216,20 @@ def __str__(self): return "%s-%s" % (self.lower, self.upper) + +class RepMN(SpecialInputNode): + def __init__(self, min, max, op): + self.op = op + self.min = min + self.max = max + + def __str__(self): + if self.max == self.min: + return "%s{%d}" % (self.op, self.min) + elif self.max == None: + return "%s{%d,}" % (self.op, self.min) + else: + return "%s{%d, %d}" % (self.op, self.min, self.max) + + def __hash__(self): + return self.op.__hash__()+self.min.__hash__()+self.max.__hash__()
--- a/pyrect/regexp/callgraph.py Mon Dec 06 12:06:27 2010 +0900 +++ b/pyrect/regexp/callgraph.py Sun Dec 12 19:02:37 2010 +0900 @@ -30,49 +30,6 @@ self.inputs = set() self.map = self.create_call_graph(self.fa) - def combine(self): - """ combine series input. """ - com = dict() - - for state, value in self.map.iteritems(): - if len(value) == 1: - input_, next_ = value.items()[0] - if state != next_ and not state in self.accepts\ - and state != self.start and isinstance(input_, Character): - com[state] = [input_, next_, set()] - - for state in com.iterkeys(): - cut = True - for s, trans in self.map.iteritems(): - if state in trans.values() \ - and state != s: - com[state][2].add(s) - - def combine_(): - recursion = False - for state, value in com.iteritems(): - if value[1] in com: - com_next = com[value[1]] - com[state] = [value[0]+com_next[0], com_next[1], value[2]] - if state in com_next[2]: - com_next[2].remove(state) - - recursion = True - break - if recursion: combine_() - - combine_() - - # merge & simplify - for key, value in com.iteritems(): - for refer in value[2]: - for input_ , next_ in self.map[refer].iteritems(): - if key == next_: - self.map[refer][input_ + value[0]] = value[1] - self.map[refer].pop(input_) - self.map.pop(key) - self.states.remove(key) - def create_call_graph(self, fa): transitionCases = dict() # transitionCases is hash table that binds a state and inputs, @@ -95,6 +52,7 @@ for (state, input) in transitionCases.iteritems(): caseMap = dict() + for case in input: self.inputs.add(case) if self.type == "DFA":
--- a/pyrect/regexp/fa.py Mon Dec 06 12:06:27 2010 +0900 +++ b/pyrect/regexp/fa.py Sun Dec 12 19:02:37 2010 +0900 @@ -43,7 +43,11 @@ string += " %s x %s -> %s," % (s, i, ns) return string - def iterallitems(self): + def iterstates(self): + for s, t self.iteritems(): + yield s, t + + def itertrans(self): for s, v in self.iteritems(): for i, n in v.iteritems(): yield (s, i), n
--- a/pyrect/regexp/lexer.py Mon Dec 06 12:06:27 2010 +0900 +++ b/pyrect/regexp/lexer.py Sun Dec 12 19:02:37 2010 +0900 @@ -18,7 +18,8 @@ 'ANYCHAR', 'ESCAPECHAR', 'PLUS', - 'QMARK' + 'QMARK', + 'REPMN' ) def t_ESCAPECHAR(t): @@ -42,6 +43,10 @@ ur'\+' return t +def t_REPMN(t): + ur'{[1-9][0-9]*(,([1-9][0-9]*)?)?}' + return t + def t_LPAREN(t): ur'\(' return t
--- a/pyrect/regexp/nfa_translator.py Mon Dec 06 12:06:27 2010 +0900 +++ b/pyrect/regexp/nfa_translator.py Sun Dec 12 19:02:37 2010 +0900 @@ -13,9 +13,8 @@ >>> nfa = nfat.translate(prs.parse('(A|B)*C')) >>> nfa.map {(3, ''): set([2, 4]), (2, ''): set([0, 1]), (1, (Character:'B')): set([3]), (4, (Character:'C')): set([5]), (0, (Character:'A')): set([3])} - >>> anfa = ANFA(nfa) - >>> anfa.map - {(1, (Character:'B')): set([3]), (0, (Character:'A')): set([3]), (3, ''): set([2, 4]), (6, ''): set([0, 1, 2, 3, 4, 5]), (2, ''): set([0, 1]), (4, (Character:'C')): set([5])} + >>> nfa = nfat.translate(prs.parse('A{2}B{3,}C{4,5}')) + >>> nfa.map """ def __init__(self): @@ -121,6 +120,16 @@ union = reduce(Union, elems) return union.accept(self) + def visit_RepMN(self, repmn): + frag = repmn.op.accept(self) + s = self.state_id + frag.set_accept(s) + frag.connect(s, "REPEAT", frag.start) + s_ = self.state_id + frag.set_accept(s_) + frag.accept_to(s, repmn) + + return frag def test(): import doctest
--- a/pyrect/regexp/parser.py Mon Dec 06 12:06:27 2010 +0900 +++ b/pyrect/regexp/parser.py Sun Dec 12 19:02:37 2010 +0900 @@ -18,15 +18,8 @@ >>> ast = parser.parse('^\^(A|B)?C+$') >>> print ast ((((^.'^').(('A'|'B'))?).('C')+).$) - - multi byte も OK!! - >>> parser.parse('Aあ*い+う?B') - Concat(Concat(Concat(Concat((Character:'A').(Star:('あ')*)).(Plus:('い')+)).(Qmark:('う')?)).(Character:'B')) - >>> parser.parse('あい*う') - Concat(Concat((MBCharacter:'あ').(Star:('い')*)).(MBCharacter:'う')) - >>> parser.parse('[a-f123]') - CharClass[(Range:'a'-'f'),(Character:'1'),(Character:'2'),(Character:'3')] - >>> parser.parse('/\* *TODO') + >>> parser.parse('A+B?C*D{1}E{2,}F{3,4}') + Concat(Concat(Concat(Concat(Concat((Plus:('A')+).(Qmark:('B')?)).(Star:('C')*)).(RepMN:'D'{1})).(RepMN:'E'{2,})).(RepMN:'F'{3, 4})) """ BASE_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -43,7 +36,7 @@ ---------------------------------------- regexp -> regexp UNION branch | branch branch -> branch closure | closure -closure -> closure STAR | closure QMARK | closure PLUS | atom +closure -> closure STAR | closure QMARK | closure PLUS | closure REPMN | atom atom -> LPAREN regexp RPAREN | LBRACKET charclass RBRACKET | ANYCHAR | NORMALCHAR | CARET | DOLLAR | MBCHAR | ESCAPECHAR charclass -> charclass cclass | cclass @@ -61,7 +54,7 @@ hairy... -/*from gnu-grep, The grammar understod by the parser is as follow. (dfa.c:1221) +/*from gnu-grep, The grammar understod by the parser is as follow. regexp: regexp OR branch branch @@ -128,6 +121,33 @@ p[0] = Plus(p[1]) def p_closure4(p): + 'closure : closure REPMN' + mn = p[2][1:-1].split(',') + m = int(mn[0]) + if len(mn) == 1: + #repeat exact m + n = m + elif mn[1] == '': + #repeat over m + n = None + else: + #repeat between m to n + n = int(mn[1]) + + if type(p[1]) == Star: + p[0] = p[1] + elif type(p[1]) == Plus: + p[0] = Repmn(m, None, p[1].op) + elif type(p[1]) == Qmark: + p[0] = RepMN(0, n, p[1].op) + elif type(p[1]) == RepMN: + p[1].min *= m + p[1].max *= n + p[0] = p[1] + else: + p[0] = RepMN(m, n, p[1]) + +def p_closure5(p): 'closure : atom' p[0] = p[1]
--- a/pyrect/translator/grep_translator.py Mon Dec 06 12:06:27 2010 +0900 +++ b/pyrect/translator/grep_translator.py Sun Dec 12 19:02:37 2010 +0900 @@ -3,7 +3,7 @@ import os from c_translator import CTranslator from pyrect.regexp import Regexp -from pyrect.regexp.ast import ASTWalker, AnyChar, Character +from pyrect.regexp.ast import ASTWalker, AnyChar, Character, SpecialInputNode class GREPTranslateExeption(Exception): pass @@ -177,12 +177,23 @@ self.emit( "if (tmp2 == key+%d) goto next;" % (l-1)) self.demit( "}") self.demit( "}") - self.emiti( "switch(buf[%d]) {" % l) - for k, v in skip.iteritems(): - self.emiti( "case %d: /* %s */" % (ord(k), Character.ascii(k))) - self.emit( "buf += %d; break;" % v), self.dedent() - self.emiti("default: buf += %d;" % (l+1)), self.dedent() - self.demit( "}") + if self.table_lookup: + self.emiti("static const void * tbl[256] = {[0 ... 255] &&any, %s};" + % ", ".join("[%d] &&add%s" % (ord(c), s) for c, s in skip.iteritems())) + self.emit("goto *tbl[buf[%d]];" % l) + defun = [] + for s in skip.itervalues(): + if s in defun: continue + defun.append(s) + self.emit("add%s: buf += %s; goto ends;" % (s, s)) + self.emit("any: buf += %d; ends:;" % (l+1)) + else: + self.emiti( "switch(buf[%d]) {" % l) + for k, v in skip.iteritems(): + self.emiti( "case %d: /* %s */" % (ord(k), Character.ascii(k))) + self.emit( "buf += %d; break;" % v), self.dedent() + self.emiti("default: buf += %d;" % (l+1)), self.dedent() + self.demit( "}") self.demit("}") self.emit( "return;") self.emit( "next:") @@ -194,17 +205,11 @@ self.emit( "UCHARP end_ = end - %d;" % (min_len-1)) self.emit( "if (buf > end_) return;") self.emiti( "do {") - if self.table_lookup and False: - self.emit("static const void *tbl[256] = {[0 ... 255] &&ends, %s};" - % ", ".join("[%d] &&ret" % ord(x) for x in chars)) - self.emit("goto *tbl[buf[%d]];" % (min_len-1)) - self.emit("ends:;") - else: - self.emiti( "switch (buf[%d]) {" % (min_len-1)) - for c in chars: - self.emit( "case %d: /* %s */" % (ord(c), Character.ascii(c))) - self.emit( "goto ret;") - self.demit( "}") + self.emiti( "switch (buf[%d]) {" % (min_len-1)) + for c in chars: + self.emit( "case %d: /* %s */" % (ord(c), Character.ascii(c))) + self.emit( "goto ret;") + self.demit( "}") self.demit( "} while((buf += %d) <= end_);" % min_len) self.emit( "ret: return %s(%s);" % (self.state_name(self.cg.start) , self.args)) self.demit("}", 2) @@ -289,7 +294,7 @@ transition[eol] = "reject" for input_ in transition.keys(): - if type(input_) in self.special_rule: + if isinstance(input_, SpecialInputNode): self.trans_stmt.emit(input_, self.state_name(transition.pop(input_))) self.emit_switch(transition, default)