changeset 97:5db856953793

implement range-expression. and add repeat-mn syntax(ex. A{1,10}).
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Sun, 12 Dec 2010 19:02:37 +0900
parents 7649b26eeb4d
children 4d498b002de5
files pyrect/regexp/ast.py pyrect/regexp/callgraph.py pyrect/regexp/fa.py pyrect/regexp/lexer.py pyrect/regexp/nfa_translator.py pyrect/regexp/parser.py pyrect/translator/grep_translator.py
diffstat 7 files changed, 100 insertions(+), 79 deletions(-) [+]
line wrap: on
line diff
--- a/pyrect/regexp/ast.py	Mon Dec 06 12:06:27 2010 +0900
+++ b/pyrect/regexp/ast.py	Sun Dec 12 19:02:37 2010 +0900
@@ -134,6 +134,9 @@
         else:
             return -1
 
+class SpecialInputNode(InputNode):
+    __metaclass__ = Singleton
+
 class Character(InputNode):
     import curses.ascii as ascii
     ASCII = ascii.controlnames + \
@@ -167,7 +170,7 @@
 BegLine, EndLine,
 """
 
-class Anchor(InputNode):
+class Anchor(SpecialInputNode):
     pass
 
 class BegLine(Anchor):
@@ -213,3 +216,20 @@
 
     def __str__(self):
         return "%s-%s" % (self.lower, self.upper)
+
+class RepMN(SpecialInputNode):
+    def __init__(self, min, max, op):
+        self.op = op
+        self.min = min
+        self.max = max
+
+    def __str__(self):
+        if self.max == self.min:
+            return "%s{%d}" % (self.op, self.min)
+        elif self.max == None:
+            return "%s{%d,}" % (self.op, self.min)
+        else:
+            return "%s{%d, %d}" % (self.op, self.min, self.max)
+
+    def __hash__(self):
+        return self.op.__hash__()+self.min.__hash__()+self.max.__hash__()
--- a/pyrect/regexp/callgraph.py	Mon Dec 06 12:06:27 2010 +0900
+++ b/pyrect/regexp/callgraph.py	Sun Dec 12 19:02:37 2010 +0900
@@ -30,49 +30,6 @@
         self.inputs = set()
         self.map = self.create_call_graph(self.fa)
 
-    def combine(self):
-        """ combine series input. """
-        com = dict()
-
-        for state, value in self.map.iteritems():
-            if len(value) == 1:
-                input_, next_ = value.items()[0]
-                if state != next_ and not state in self.accepts\
-                       and state != self.start and isinstance(input_, Character):
-                    com[state] = [input_, next_, set()]
-
-        for state in com.iterkeys():
-            cut = True
-            for s, trans in self.map.iteritems():
-                if state in trans.values() \
-                    and state != s:
-                    com[state][2].add(s)
-
-        def combine_():
-            recursion = False
-            for state, value in com.iteritems():
-                if value[1] in com:
-                    com_next = com[value[1]]
-                    com[state] = [value[0]+com_next[0], com_next[1], value[2]]
-                    if state in com_next[2]:
-                        com_next[2].remove(state)
-
-                    recursion = True
-                    break
-            if recursion: combine_()
-
-        combine_()
-
-        # merge & simplify
-        for key, value in com.iteritems():
-            for refer in value[2]:
-                for input_ , next_ in self.map[refer].iteritems():
-                    if key == next_:
-                        self.map[refer][input_ + value[0]] = value[1]
-                        self.map[refer].pop(input_)
-            self.map.pop(key)
-            self.states.remove(key)
-
     def create_call_graph(self, fa):
         transitionCases = dict()
         # transitionCases is hash table that binds a state and inputs,
@@ -95,6 +52,7 @@
 
         for (state, input) in transitionCases.iteritems():
             caseMap = dict()
+
             for case in input:
                 self.inputs.add(case)
                 if self.type == "DFA":
--- a/pyrect/regexp/fa.py	Mon Dec 06 12:06:27 2010 +0900
+++ b/pyrect/regexp/fa.py	Sun Dec 12 19:02:37 2010 +0900
@@ -43,7 +43,11 @@
             string += " %s x %s -> %s," % (s, i, ns)
         return string
 
-    def iterallitems(self):
+    def iterstates(self):
+        for s, t self.iteritems():
+            yield s, t
+
+    def itertrans(self):
         for s, v in self.iteritems():
             for i, n in v.iteritems():
                 yield (s, i), n
--- a/pyrect/regexp/lexer.py	Mon Dec 06 12:06:27 2010 +0900
+++ b/pyrect/regexp/lexer.py	Sun Dec 12 19:02:37 2010 +0900
@@ -18,7 +18,8 @@
     'ANYCHAR',
     'ESCAPECHAR',
     'PLUS',
-    'QMARK'
+    'QMARK',
+    'REPMN'
     )
 
 def t_ESCAPECHAR(t):
@@ -42,6 +43,10 @@
     ur'\+'
     return t
 
+def t_REPMN(t):
+    ur'{[1-9][0-9]*(,([1-9][0-9]*)?)?}'
+    return t
+
 def t_LPAREN(t):
     ur'\('
     return t
--- a/pyrect/regexp/nfa_translator.py	Mon Dec 06 12:06:27 2010 +0900
+++ b/pyrect/regexp/nfa_translator.py	Sun Dec 12 19:02:37 2010 +0900
@@ -13,9 +13,8 @@
     >>> nfa = nfat.translate(prs.parse('(A|B)*C'))
     >>> nfa.map
     {(3, ''): set([2, 4]), (2, ''): set([0, 1]), (1, (Character:'B')): set([3]), (4, (Character:'C')): set([5]), (0, (Character:'A')): set([3])}
-    >>> anfa = ANFA(nfa)
-    >>> anfa.map
-    {(1, (Character:'B')): set([3]), (0, (Character:'A')): set([3]), (3, ''): set([2, 4]), (6, ''): set([0, 1, 2, 3, 4, 5]), (2, ''): set([0, 1]), (4, (Character:'C')): set([5])}
+    >>> nfa = nfat.translate(prs.parse('A{2}B{3,}C{4,5}'))
+    >>> nfa.map
     """
 
     def __init__(self):
@@ -121,6 +120,16 @@
         union = reduce(Union, elems)
         return union.accept(self)
 
+    def visit_RepMN(self, repmn):
+        frag = repmn.op.accept(self)
+        s = self.state_id
+        frag.set_accept(s)
+        frag.connect(s, "REPEAT", frag.start)
+        s_ = self.state_id
+        frag.set_accept(s_)
+        frag.accept_to(s, repmn)
+
+        return frag
 
 def test():
     import doctest
--- a/pyrect/regexp/parser.py	Mon Dec 06 12:06:27 2010 +0900
+++ b/pyrect/regexp/parser.py	Sun Dec 12 19:02:37 2010 +0900
@@ -18,15 +18,8 @@
     >>> ast = parser.parse('^\^(A|B)?C+$')
     >>> print ast
     ((((^.'^').(('A'|'B'))?).('C')+).$)
-
-    multi byte も OK!!
-    >>> parser.parse('Aあ*い+う?B')
-    Concat(Concat(Concat(Concat((Character:'A').(Star:('あ')*)).(Plus:('い')+)).(Qmark:('う')?)).(Character:'B'))
-    >>> parser.parse('あい*う')
-    Concat(Concat((MBCharacter:'あ').(Star:('い')*)).(MBCharacter:'う'))
-    >>> parser.parse('[a-f123]')
-    CharClass[(Range:'a'-'f'),(Character:'1'),(Character:'2'),(Character:'3')]
-    >>> parser.parse('/\* *TODO')
+    >>> parser.parse('A+B?C*D{1}E{2,}F{3,4}')
+    Concat(Concat(Concat(Concat(Concat((Plus:('A')+).(Qmark:('B')?)).(Star:('C')*)).(RepMN:'D'{1})).(RepMN:'E'{2,})).(RepMN:'F'{3, 4}))
     """
     BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
@@ -43,7 +36,7 @@
 ----------------------------------------
 regexp    -> regexp UNION branch | branch
 branch    -> branch closure | closure
-closure   -> closure STAR | closure QMARK | closure PLUS | atom
+closure   -> closure STAR | closure QMARK | closure PLUS | closure REPMN | atom
 atom      -> LPAREN regexp RPAREN | LBRACKET charclass RBRACKET
            | ANYCHAR | NORMALCHAR | CARET | DOLLAR | MBCHAR | ESCAPECHAR
 charclass -> charclass cclass | cclass
@@ -61,7 +54,7 @@
 
 hairy...
 
-/*from gnu-grep, The grammar understod by the parser is as follow. (dfa.c:1221)
+/*from gnu-grep, The grammar understod by the parser is as follow.
    regexp:
      regexp OR branch
      branch
@@ -128,6 +121,33 @@
     p[0] = Plus(p[1])
 
 def p_closure4(p):
+    'closure : closure REPMN'
+    mn = p[2][1:-1].split(',')
+    m = int(mn[0])
+    if len(mn) == 1:
+        #repeat exact m
+        n = m
+    elif mn[1] == '':
+        #repeat over m
+        n = None
+    else:
+        #repeat between m to n
+        n = int(mn[1])
+
+    if type(p[1]) == Star:
+        p[0] = p[1]
+    elif type(p[1]) == Plus:
+        p[0] = Repmn(m, None, p[1].op)
+    elif type(p[1]) == Qmark:
+        p[0] = RepMN(0, n, p[1].op)
+    elif type(p[1]) == RepMN:
+        p[1].min *= m
+        p[1].max *= n
+        p[0] = p[1]
+    else:
+        p[0] = RepMN(m, n, p[1])
+
+def p_closure5(p):
     'closure : atom'
     p[0] = p[1]
 
--- a/pyrect/translator/grep_translator.py	Mon Dec 06 12:06:27 2010 +0900
+++ b/pyrect/translator/grep_translator.py	Sun Dec 12 19:02:37 2010 +0900
@@ -3,7 +3,7 @@
 import os
 from c_translator import CTranslator
 from pyrect.regexp import Regexp
-from pyrect.regexp.ast import ASTWalker, AnyChar, Character
+from pyrect.regexp.ast import ASTWalker, AnyChar, Character, SpecialInputNode
 
 class GREPTranslateExeption(Exception):
     pass
@@ -177,12 +177,23 @@
         self.emit(       "if (tmp2 == key+%d) goto next;" % (l-1))
         self.demit(    "}")
         self.demit(  "}")
-        self.emiti(  "switch(buf[%d]) {" % l)
-        for k, v in skip.iteritems():
-            self.emiti(  "case %d: /* %s */" % (ord(k), Character.ascii(k)))
-            self.emit(     "buf += %d; break;" % v), self.dedent()
-        self.emiti("default: buf += %d;" % (l+1)), self.dedent()
-        self.demit(  "}")
+        if self.table_lookup:
+            self.emiti("static const void * tbl[256] = {[0 ... 255] &&any, %s};"
+                       % ", ".join("[%d] &&add%s" % (ord(c), s) for c, s in skip.iteritems()))
+            self.emit("goto *tbl[buf[%d]];" % l)
+            defun = []
+            for s in skip.itervalues():
+                if s in defun: continue
+                defun.append(s)
+                self.emit("add%s: buf += %s; goto ends;" % (s, s))
+            self.emit("any: buf += %d; ends:;" % (l+1))
+        else:
+            self.emiti(  "switch(buf[%d]) {" % l)
+            for k, v in skip.iteritems():
+                self.emiti(  "case %d: /* %s */" % (ord(k), Character.ascii(k)))
+                self.emit(     "buf += %d; break;" % v), self.dedent()
+            self.emiti("default: buf += %d;" % (l+1)), self.dedent()
+            self.demit(  "}")
         self.demit("}")
         self.emit( "return;")
         self.emit( "next:")
@@ -194,17 +205,11 @@
         self.emit(   "UCHARP end_ = end - %d;" % (min_len-1))
         self.emit(   "if (buf > end_) return;")
         self.emiti(  "do {")
-        if self.table_lookup and False:
-            self.emit("static const void *tbl[256] = {[0 ... 255] &&ends, %s};"
-                      % ", ".join("[%d] &&ret" % ord(x) for x in chars))
-            self.emit("goto *tbl[buf[%d]];" % (min_len-1))
-            self.emit("ends:;")
-        else:
-            self.emiti(    "switch (buf[%d]) {" % (min_len-1))
-            for c in chars:
-                self.emit(   "case %d: /* %s */" % (ord(c), Character.ascii(c)))
-            self.emit(       "goto ret;")
-            self.demit(    "}")
+        self.emiti(    "switch (buf[%d]) {" % (min_len-1))
+        for c in chars:
+            self.emit(   "case %d: /* %s */" % (ord(c), Character.ascii(c)))
+        self.emit(       "goto ret;")
+        self.demit(    "}")
         self.demit(  "} while((buf += %d) <= end_);" % min_len)
         self.emit(   "ret: return %s(%s);" % (self.state_name(self.cg.start) , self.args))
         self.demit("}", 2)
@@ -289,7 +294,7 @@
             transition[eol] = "reject"
 
         for input_ in transition.keys():
-            if type(input_) in self.special_rule:
+            if isinstance(input_, SpecialInputNode):
                 self.trans_stmt.emit(input_, self.state_name(transition.pop(input_)))
 
         self.emit_switch(transition, default)