changeset 69:4de11d799dee

add boost algorithm. but it's buggy, not work.
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Sun, 07 Nov 2010 09:32:25 +0900
parents 56a997f2c121
children
files pyrect/jitgrep.py pyrect/regexp/__init__.py pyrect/regexp/analyzer.py pyrect/regexp/ast.py pyrect/translator/grep_translator.py
diffstat 5 files changed, 126 insertions(+), 61 deletions(-) [+]
line wrap: on
line diff
--- a/pyrect/jitgrep.py	Sun Nov 07 03:10:43 2010 +0900
+++ b/pyrect/jitgrep.py	Sun Nov 07 09:32:25 2010 +0900
@@ -6,13 +6,14 @@
 import time
 from optparse import OptionParser
 from pyrect.translator import *
-from pyrect.regexp import Regexp
+from pyrect.regexp import Regexp, CharCollector
 
 def main(argv):
     myusage = """%prog [--buf-size=size] [--dump]
                   [--time] [--debug] [--cc=compiler] [-c]
                   [-Olevel] regexp [file..] [--out=file]
-                  [--thread=thread_num] [--disable-filter]"""
+                  [--thread=thread_num] [--disable-filter]
+                  [--disable-booster]"""
     psr = OptionParser(usage=myusage)
 
     redirect = ""
@@ -27,6 +28,7 @@
     psr.add_option("--time", action="store_true", dest="time", default=False, help="Print compile/matching time.")
     psr.add_option("--thread", action="store", type="string", dest="thread", default="0", metavar="FILE", help="number of thread.")
     psr.add_option("--disable-filter", action="store_true", dest="no_filter", default=False, help="disable fixed-string filter(Boyer-Moore).")
+    psr.add_option("--disable-booster", action="store_true", dest="no_booster", default=False, help="disable booster.")
     psr.add_option("--debug", action="store_true", dest="debug", default=False, help="Dump commands, not evalute matching (except interactive mode).")
     psr.add_option("--label", action="store_true", dest="label", default=False, help="label implimentation in C.")
     psr.add_option("--dump", action="store_true", dest="dump", default=False, help="Dump generated grep-source.")
@@ -62,7 +64,11 @@
         return
 
     if opts.time : start_time = time.time()
+
     reg = Regexp(".*"+string)
+    reg.chars = Regexp.get_chars(string)
+    (reg.max_len, _, _) = Regexp.get_analyze(string)
+
     if cbc:
         grept = CbCGREPTranslator(reg)
     elif opts.label:
@@ -70,7 +76,9 @@
     else:
         grept = GREPTranslator(reg)
         grept.filter = not opts.no_filter
+        grept.skip_boost = not opts.no_booster
         grept.thread_line = int(opts.thread)
+
     grept.bufsize = bufsize
 
     if opts.dump:
--- a/pyrect/regexp/__init__.py	Sun Nov 07 03:10:43 2010 +0900
+++ b/pyrect/regexp/__init__.py	Sun Nov 07 09:32:25 2010 +0900
@@ -7,13 +7,14 @@
 from pyrect.regexp.dfa_translator import DFATranslator
 from pyrect.regexp.callgraph import CallGraph
 from pyrect.regexp.analyzer import Analyzer
+from pyrect.regexp.char_collector import CharCollector
 
 class Regexp(object):
     """Regexp is basic class in Pyrect.
     this contains regexp, dfa, nfa,, actually it's include all.
     >>> regexp = Regexp('(A|B)*C')
     >>> regexp.dfacg.map
-    {'1': {'A': ['1'], 'C': ['2'], 'B': ['0']}, '0': {'A': ['1'], 'C': ['2'], 'B': ['0']}, '3': {'A': ['1'], 'C': ['2'], 'B': ['0']}, '2': {}}
+    {'1': {}, '0': {(Character:'A'): '0', (Character:'B'): '0', (Character:'C'): '1'}}
     >>> regexp.matches('ABC')
     True
     >>> regexp = Regexp('(a|b)*cd*e')
@@ -33,6 +34,23 @@
         an = Analyzer()
         self.max_len, self.min_len, self.must_words =\
                       an.analyze(self.ast)
+        an = CharCollector()
+        self.chars = an.analyze(self.ast)
+
+    @classmethod
+    def get_chars(cls, regexp):
+        an = CharCollector()
+        return an.analyze(cls.parse(regexp))
+
+    @classmethod
+    def get_analyze(cls, regexp):
+        an = Analyzer()
+        return an.analyze(cls.parse(regexp))
+
+    @classmethod
+    def parse(cls, regexp):
+        psr = Parser()
+        return psr.parse(regexp)
 
     def matches(self, string):
         runtime = self.dfa.get_runtime()
--- a/pyrect/regexp/analyzer.py	Sun Nov 07 03:10:43 2010 +0900
+++ b/pyrect/regexp/analyzer.py	Sun Nov 07 09:32:25 2010 +0900
@@ -26,7 +26,7 @@
     >>> an.analyze(prs.parse('(plus)?(qmark)?'))
     (9, 0, [])
     >>> an.analyze(prs.parse('\*+ \[\['))
-    (inf, 1, [])
+    (inf, 4, ['* [['])
     """
 
     def __init__(self, ast=None):
@@ -35,7 +35,6 @@
         else:
             self.max_len = 0
             self.min_len = 0
-            self.must_words = []
 
     def analyze(self, ast=None):
         if ast:
@@ -52,15 +51,11 @@
     def visit_Character(self, ast):
         return 1, 1, [chr(ast.char)]
 
-    def visit_Concat(self, concat):
-        (max1, min1, key1) = concat.op1.accept(self)
-        (max2, min2, key2) = concat.op2.accept(self)
+    def concat(self, (max1, min1, key1), (max2, min2, key2)):
+        return max1 + max2, min1 + min2, key1[0:-1] \
+               + ([key1[-1] + key2[0]]) + key2[1:]
 
-        return max1 + max2, min1 + min2, key1[0:-1] + ([key1[-1] + key2[0]]) + key2[1:]
-
-    def visit_Union(self, union):
-        (max1, min1, _) = union.op1.accept(self)
-        (max2, min2, _) = union.op2.accept(self)
+    def union(self, (max1, min1, _), (max2, min2, __)):
         return max(max1, max2), min(min1, min2), ["", ""]
 
     def visit_Star(self, star):
--- a/pyrect/regexp/ast.py	Sun Nov 07 03:10:43 2010 +0900
+++ b/pyrect/regexp/ast.py	Sun Nov 07 09:32:25 2010 +0900
@@ -8,7 +8,32 @@
 
 class ASTWalker(object):
     def visit(self, ast):
-        pass
+        return
+
+    def visit_Star(self, star):
+        return star.op.accept(self)
+
+    def visit_Plus(self, plus):
+        return plus.op.accept(self)
+
+    def visit_Qmark(self, qmark):
+        return qmark.op.accept(self)
+
+    def visit_Concat(self, concat):
+        r1 = concat.op1.accept(self)
+        r2 = concat.op2.accept(self)
+        return self.concat(r1, r2)
+
+    def visit_Union(self, union):
+        r1 = union.op1.accept(self)
+        r2 = union.op2.accept(self)
+        return self.union(r1, r2)
+
+    def union(self, r1, r2):
+        return
+
+    def concat(self, r1, r2):
+        return
 
 # AST-Nodes
 class Node(object):
@@ -127,6 +152,10 @@
     def __hash__(self):
         return self.char.__hash__()
 
+    @classmethod
+    def ascii(cls, c):
+        return cls.ASCII[ord(c)]
+
 class MBCharacter(Character):
     def __init__(self, mbchar):
         ret = Character.__init__(self, mbchar)
--- a/pyrect/translator/grep_translator.py	Sun Nov 07 03:10:43 2010 +0900
+++ b/pyrect/translator/grep_translator.py	Sun Nov 07 09:32:25 2010 +0900
@@ -2,7 +2,7 @@
 
 import os
 from c_translator import CTranslator
-from pyrect.regexp import Regexp, Analyzer
+from pyrect.regexp import Regexp
 from pyrect.regexp.ast import ASTWalker, AnyChar, Character
 
 class GREPTranslateExeption(Exception):
@@ -28,6 +28,8 @@
         self.thread_line = 1
         self.filter = True
         self.filter_only = False
+        self.filter_prefix = False
+        self.skip_boost = True
         self.start = "matcher"
         self.interface = "UCHARP beg, UCHARP buf, UCHARP end"
         self.args = "beg, buf, end"
@@ -51,48 +53,47 @@
         self.emit("#include <fcntl.h>")
         self.emit("#include <string.h>")
 
-        #self.emit_skip()
-
-        for state in self.cg.map.iterkeys():
-            self.emit("void %s(%s);" % (self.state_name(state), self.interface))
-        self.emit('void accept(%s);' % self.interface)
         self.emit('void reject(%s);' % self.interface)
         self.emit("void matcher(%s);" % self.interface, 2)
+        self.emit('void accept(%s);' % self.interface)
 
+        key = None
         if self.filter and self.regexp.must_words:
-            self.emit_filter(self.regexp.must_words)
-        else: self.filter = False
+            key = max(self.regexp.must_words, key=len)
+            if len(self.regexp.must_words) == 1 and len(key) == self.regexp.min_len:
+                self.filter_only = True
+        else:
+            self.filter = False
+
+        if not self.filter_only:
+            for state in self.cg.map.iterkeys():
+                self.emit("void %s(%s);" % (self.state_name(state), self.interface))
+            self.emit()
+
+        if self.filter: self.emit_filter(key)
+
+        if self.skip_boost and not self.filter_only and \
+               not AnyChar() in self.regexp.chars and \
+               self.regexp.min_len > 2:
+            self.emit_booster(self.regexp.min_len, self.regexp.chars)
+        else:
+            self.skip_boost = False
 
         grepsource = open(self.BASE_DIR + "/template/grep.c")
         self.emit(grepsource.read())
 
-    #TODO: filter is faster than dfa in matching.
-    #but slower than that in compiling.
-    #We have to improve this problem.
-    def emit_filter(self, words):
-        def longest(s1, s2):
-            if len(s1) >= len(s2):
-                return s1
-            else:
-                return s2
-
-        key = reduce(longest, words)
-
-        if len(words) == 1 and len(key) == self.regexp.min_len:
-            self.filter_only = True
-        else:
-            filter_prefix = False
-
+    def emit_filter(self, key):
+        l = len(key)
         def emit_next():
-            self.emit("beg = memrchr(buf, '\\n', beg);")
             if self.filter_only:
                 self.emit("accept(%s);" % self.args)
             elif self.filter_prefix:
-                self.emit("buf -= %d;" % len(key))
-                self.emit("dfa(%s);" % self.args)
+                self.emit("buf -= %d;" % l)
+                self.emit("%s(%s);" % (self.state_name(self.cg.start) ,self.args))
             else:
+                self.emit("beg = memrchr(buf, '\\n', beg);")
                 self.emit("buf = beg;")
-                self.emit("dfa(%s);" % self.args)
+                self.emit("%s(%s);" % (self.state_name(self.cg.start), self.args))
 
         self.emit("UCHARP memrchr(UCHARP p, int c, UCHARP beg);", 2)
 
@@ -105,33 +106,46 @@
             self.emitd("}", 2)
             return
 
+        self.emit('static UCHAR key[] = "%s";' % key)
+
         skip = [str(l)] * 256
         for i in range(l - 1):
             skip[ord(key[i])] = str(l-1-i)
 
-        self.emit('static const UCHAR key[] = "%s";' % key)
-
         self.emiti(   "static const UCHAR skip[256] = {")
         for i in range(8):
             i = i * 32
             self.emit(",".join(skip[i:i+32]) + ",")
         self.emitd(   "};")
 
-        self.emit("UCHARP tmp = buf; UCHAR c;",  2)
-        self.emit("int i; buf += %d - skip[c];" % (l-1))
-        self.emiti("while ((buf += skip[c]) < end) {")
-        self.emiti(  "if ((c = *buf) == %d /*'%c'*/) {" % (ord(key[l-1]), key[l-1]))
-        self.emit(     "i = %d; tmp = buf;" % (l-1))
-        self.emit(     "while (key[--i] == *(--tmp)) {")
-        self.emit(       "if (i == 0) goto next;")
+        self.emit("UCHARP tmp1, *tmp2;",  2)
+        self.emit("int i; buf += %d;" % (l-1))
+        self.emiti("while (buf < end) {")
+        self.emiti(  "if (*buf == %d /*'%c'*/) {" % (ord(key[l-1]), key[l-1]))
+        self.emit(     "tmp1 = key+%d; tmp2 = buf;" % (l-1))
+        self.emit(     "while (*(--tmp1) == *(--tmp2)) {")
+        self.emit(       "if (tmp1 == key) goto next;")
         self.emitd(    "}")
         self.emitd(  "}")
+        self.emit(   "buf += skip[*buf];")
         self.emitd("}")
         self.emit( "return;")
         self.emit( "next:")
         emit_next()
         self.emitd("}", 2)
 
+    def emit_booster(self, min_len, chars):
+        self.emiti("void booster(%s) {" % self.interface)
+        self.emiti(  "do {")
+        self.emiti(    "switch (*(buf+%d)) {" % (min_len-1))
+        for c in chars:
+            self.emit(   "case %d: /* %s */" % (ord(c), Character.ascii(c)))
+        self.emit(       "goto ret;")
+        self.emitd(    "}")
+        self.emitd(  "} while(buf += %d);" % (min_len-1))
+        self.emit(   "ret: return %s(%s);" % (self.state_name(self.cg.start) , self.args))
+        self.emitd("}", 2)
+
     def emit_driver(self):
         self.emiti("void matcher(%s) {" % self.interface)
         if self.filter:
@@ -144,12 +158,12 @@
 
     def emit_accept_state(self):
         self.emiti("void accept(%s) {" % self.interface)
-        self.emit(   "buf--;")
         self.emit(   "UCHARP ret = (UCHARP)memchr(buf, '\\n', (buf - end));")
-        self.emit(   'if (ret == NULL) {fprintf(stderr, "memchr NULL err!"); exit(0);}')
+        if self.skip_boost or self.filter:
+            self.emit(   "beg = memrchr(buf, '\\n', beg);")
+        self.emit(   'if (ret == NULL) ret = end;')
         self.emiti(  "if (ret > end) {")
-        self.emit(     "ret--;")
-        self.emit(     "print_line(beg, ret);")
+        self.emit(     "print_line(beg, end);")
         self.emit(     "return;")
         self.emitd(  "}")
         self.emit(   "print_line(beg, ret);")
@@ -161,7 +175,7 @@
         self.emiti("void reject(%s) {" % self.interface)
         self.emit(   "if (buf >= end) return;")
         self.emit(   "beg = buf;")
-        self.emit(   "%s(%s);" % (self.start, self.args))
+        self.emit(   "return %s(%s);" % (self.start, self.args))
         self.emitd("}", 2)
 
     def emit_switch(self, case, default=None):
@@ -172,9 +186,10 @@
         self.emiti("switch(*buf++) {")
         for case, next_ in case.iteritems():
             self.trans_stmt.emit(case, self.state_name(next_))
-        if default:
-            if default == self.state_name(self.cg.start):
-                self.emit("default: return %s(%s);" % (default, self.args))
+        if default == self.state_name(self.cg.start) and self.skip_boost:
+            self.emit("default: return booster(%s);" % self.args)
+        else:
+            self.emit("default: return %s(%s);" % (default, self.args))
         self.emitd("}")
 
     def emit_state(self, cur_state, transition):
@@ -215,7 +230,7 @@
             self._emit("/* %s */" % input_node.__repr__())
 
         def visit_Character(self, char):
-            self._emit("case %d: /* match %s */" % (char.char, char))
+            self._emit("case %d: /* %s */" % (char.char, char))
             self._emit("  return %s(%s);" % (self.next, self.args))
 
         # Special Rule