changeset 57:81b44ae1cd73

add fixed-string filter(Boyer-Moore), and add option '--disable-filter'.
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Mon, 01 Nov 2010 14:41:03 +0900
parents ee9945561f80
children 81337db23999
files pyrect/jitgrep.py pyrect/regexp/__init__.py pyrect/regexp/analyzer.py pyrect/regexp/dfa_translator.py pyrect/translator/grep_translator.py pyrect/translator/template/grep.c
diffstat 6 files changed, 107 insertions(+), 36 deletions(-) [+]
line wrap: on
line diff
--- a/pyrect/jitgrep.py	Wed Oct 27 20:46:41 2010 +0900
+++ b/pyrect/jitgrep.py	Mon Nov 01 14:41:03 2010 +0900
@@ -12,7 +12,7 @@
     myusage = """%prog [--buf-size=size] [--dump]
                   [--time] [--debug] [--cc=compiler] [-c]
                   [-Olevel] regexp [file..] [--out=file]
-                  [--thread=thread_num]"""
+                  [--thread=thread_num] [--disable-filter]"""
     psr = OptionParser(usage=myusage)
 
     redirect = ""
@@ -23,9 +23,10 @@
                    help="Choose compiler (default is gcc).")
     psr.add_option("-c", action="store_true", dest="compile", default=False , help="compile only.")
     psr.add_option("--buf-size=size", action="store", type="string", dest="bufsize", default="1M" , help="Set read-buffer size (e.x. 1024, 1024K, 2M)")
-    psr.add_option("--CFLAGS", action="store", type="string", dest="cflags", default="-O3", help="Print compile/matching time.")
+    psr.add_option("--cflags", action="store", type="string", dest="cflags", default="-O3", help="Print compile/matching time.")
     psr.add_option("--time", action="store_true", dest="time", default=False, help="Print compile/matching time.")
     psr.add_option("--thread", action="store", type="string", dest="thread", default="0", metavar="FILE", help="number of thread.")
+    psr.add_option("--disable-filter", action="store_true", dest="no_filter", default=False, help="disable fixed-string filter(Boyer-Moore).")
     psr.add_option("--debug", action="store_true", dest="debug", default=False, help="Dump commands, not evalute matching (except interactive mode).")
     psr.add_option("--label", action="store_true", dest="label", default=False, help="label implimentation in C.")
     psr.add_option("--dump", action="store_true", dest="dump", default=False, help="Dump generated grep-source.")
@@ -68,8 +69,9 @@
         grept = GoToGREPTranslator(reg)
     else:
         grept = GREPTranslator(reg)
+        grept.filter = not opts.no_filter
+        grept.thread_num = int(opts.thread)
     grept.bufsize = bufsize
-    grept.thread_num = int(opts.thread)
 
     if opts.dump:
         grept.translate()
--- a/pyrect/regexp/__init__.py	Wed Oct 27 20:46:41 2010 +0900
+++ b/pyrect/regexp/__init__.py	Mon Nov 01 14:41:03 2010 +0900
@@ -6,6 +6,7 @@
 from pyrect.regexp.nfa_translator import NFATranslator
 from pyrect.regexp.dfa_translator import DFATranslator
 from pyrect.regexp.callgraph import CallGraph
+from pyrect.regexp.analyzer import Analyzer
 
 class Regexp(object):
     """Regexp is basic class in Pyrect.
@@ -29,6 +30,9 @@
         self.dfa    = DFATranslator().translate(self.nfa)
         self.nfacg  = CallGraph(self.nfa)
         self.dfacg  = CallGraph(self.dfa)
+        an = Analyzer()
+        self.max_len, self.min_len, self.must_words =\
+                      an.analyze(self.ast)
 
     def matches(self, string):
         runtime = self.dfa.get_runtime()
--- a/pyrect/regexp/analyzer.py	Wed Oct 27 20:46:41 2010 +0900
+++ b/pyrect/regexp/analyzer.py	Mon Nov 01 14:41:03 2010 +0900
@@ -16,49 +16,60 @@
     >>> prs = Parser()
     >>> an  = Analyzer()
     >>> an.analyze(prs.parse('fixed-string'))
-    12
+    (12, 12, ['fixed-string'])
     >>> an.analyze(prs.parse('(build|fndecl|gcc)'))
-    6
-    >>> an.analyze(prs.parse('(AB|CD)*123'))
-    inf
-    >>> an.analyze(prs.parse('((12)*|3)|456'))
-    inf
+    (6, 3, [])
+    >>> an.analyze(prs.parse('123(AB|CD)*456'))
+    (inf, 6, ['123', '456'])
+    >>> an.analyze(prs.parse('((12)+|3)|456'))
+    (inf, 1, [])
     >>> an.analyze(prs.parse('(plus)?(qmark)?'))
-    9
+    (9, 0, [])
     """
-    def __init__(self):
-        self.maxlen = 0
+    def __init__(self, ast=None):
+        if ast:
+            self.analyze(ast)
+        else:
+            self.max_len = 0
+            self.min_len = 0
+            self.must_words = []
 
     def analyze(self, ast=None):
         if ast:
-            self.maxlen = ast.accept(self)
-        return self.maxlen
+            self.max_len, self.min_len, self.must_words = ast.accept(self)
+            self.must_words = [x for x in self.must_words if x != ""]
+        return self.max_len, self.min_len, self.must_words
 
     def visit(self, ast):
         """Following Classes contain no-Keywords.
         Union, Star
         """
-        return 1
+        return 1, 1, [str(ast)]
+
+    def visit_Character(self, ast):
+        return 1, 1, [chr(ast.char)]
 
     def visit_Concat(self, concat):
-        a1 = concat.op1.accept(self)
-        a2 = concat.op2.accept(self)
+        (max1, min1, key1) = concat.op1.accept(self)
+        (max2, min2, key2) = concat.op2.accept(self)
 
-        return a1 + a2
+        return max1 + max2, min1 + min2, key1[0:-1] + ([key1[-1] + key2[0]]) + key2[1:]
 
     def visit_Union(self, union):
-        a1 = union.op1.accept(self)
-        a2 = union.op2.accept(self)
-        return max(a1, a2)
+        (max1, min1, _) = union.op1.accept(self)
+        (max2, min2, _) = union.op2.accept(self)
+        return max(max1, max2), min(min1, min2), ["", ""]
 
     def visit_Star(self, star):
-        return float("inf")
+        return float("inf"), 0, ["", ""]
 
     def visit_Plus(self, plus):
-        return float("inf")
+        (_, m, k) = plus.op.accept(self)
+        return float("inf"), m, ["", k, ""]
 
     def visit_Qmark(self, qmark):
-        return qmark.op.accept(self)
+        (m, _, _) = qmark.op.accept(self)
+        return m, 0, ["", ""]
 
 def test():
     import doctest
--- a/pyrect/regexp/dfa_translator.py	Wed Oct 27 20:46:41 2010 +0900
+++ b/pyrect/regexp/dfa_translator.py	Mon Nov 01 14:41:03 2010 +0900
@@ -39,16 +39,15 @@
         done = set()
 
         while que:
-            stateSet = que.pop()
+            states = que.pop()
 
-
-            for state in stateSet:
+            for state in states:
                 for k, v in nfa.map.iteritems():
                     if state == k[0] and k[1] != '':
-                        slot = map_.setdefault((stateSet, k[1]), set())
+                        slot = map_.setdefault((states, k[1]), set())
                         slot.update(nfa.epsilon_expand(v))
 
-            done.add(stateSet)
+            done.add(states)
 
             for v in map_.itervalues():
                 if not v in done:
--- a/pyrect/translator/grep_translator.py	Wed Oct 27 20:46:41 2010 +0900
+++ b/pyrect/translator/grep_translator.py	Mon Nov 01 14:41:03 2010 +0900
@@ -2,7 +2,7 @@
 
 import os
 from c_translator import CTranslator
-from pyrect.regexp import Regexp
+from pyrect.regexp import Regexp, Analyzer
 
 class GREPTranslateExeption(Exception):
     pass
@@ -25,6 +25,7 @@
         self.__bufsize = 1024 * 1024
         self.parallel_match = False
         self.thread_num = 0
+        self.filter = True
 
     def getbufsize(self,):
         return self.__bufsize
@@ -35,24 +36,71 @@
 
     def emit_initialization(self):
         CTranslator.emit_initialization(self)
+
+        if self.thread_num > 1:
+            self.emit("#define GREP paragrep")
+        else:
+            self.emit("#define GREP grep")
+
         self.emit("#define LINEBUFSIZE %d" % self.bufsize)
         self.emit("#define READBUFSIZE %d" % self.bufsize)
         self.emit('#define THREAD_NUM %d' % self.thread_num)
         self.emit('#define THREAD_BUF %d' % 3)
         self.emit('#include <pthread.h>')
-        if self.thread_num > 1:
-            self.emit("#define GREP paragrep")
-        else:
-            self.emit("#define GREP grep")
         self.emit("#include <stdlib.h>")
         self.emit("#include <string.h>")
         self.emit("char readbuf[%d];" % (self.bufsize))
         self.emit("int DFA(unsigned char* s);", 2)
+
+        if self.filter and self.regexp.must_words:
+            self.emit_filter(self.regexp.must_words)
+
         grepsource = open(self.BASE_DIR + "/template/grep.c")
         self.emit(grepsource.read())
 
-    def emit_filter(self):
-        pass
+    def emit_filter(self, words):
+        def longest(s1, s2):
+            return s1 if len(s1) >= len(s2) else s2
+        key = reduce(longest, words)
+
+        if len(words) == 1:
+            if len(key) == self.regexp.min_len:
+                self.emit("#define FILTER_ONLY 1", 1)
+        else:
+            self.emit("#define WITH_FILTER 1", 1)
+
+        self.emiti("int FILTER(unsigned char* text, int n) {")
+        l = len(key)
+        if l == 1:
+            self.emit("   return (strchr(text, %d) != NULL)" % ord(key))
+            self.emitd("}", 2)
+            return
+
+        skip = [str(l)] * 256
+        for i in range(l - 1):
+            skip[ord(key[i])] = str(l-1-i)
+
+        self.emit('static unsigned char key[] = "%s";' % key)
+        self.emiti(   "static int skip[256] = {")
+        for i in range(8):
+            i = i * 32
+            self.emit(",".join(skip[i:i+32]) + ",")
+        self.emitd(   "};")
+
+        self.emit("int i = %d, j, k, len = %d;" % (l-1 ,l))
+        self.emit("unsigned char c, tail = %d; //'%c'" % (ord(key[l-1]), key[l-1]), 2)
+        self.emiti("while (i < n) {")
+        self.emit(   "c = text[i];")
+        self.emiti(  "if (c == tail) {")
+        self.emit(     "j = len - 1; k = i;")
+        self.emiti(    "while (key[--j] == text[--k]) {")
+        self.emit(       "if (j == 0) return 1;")
+        self.emitd(    "}")
+        self.emitd(  "}")
+        self.emit(   "i += skip[c];")
+        self.emitd("}")
+        self.emit( "return 0;")
+        self.emitd("}", 2)
 
     def emit_driver(self):
         self.emiti("int DFA(unsigned char *text) {")
--- a/pyrect/translator/template/grep.c	Wed Oct 27 20:46:41 2010 +0900
+++ b/pyrect/translator/template/grep.c	Mon Nov 01 14:41:03 2010 +0900
@@ -53,7 +53,14 @@
     n = strlen(lbuf);
     if (n > 0 && buf[n-1] == '\n')
       lbuf[n-1] = '\0';
+
+#ifdef FILTER_ONLY
+    if (FILTER(buf, n-1)) {
+#elif  defined(WITH_FILTER)
+    if (FILTER(buf, n-1) && DFA(buf)) {
+#else
     if (DFA(buf)) {
+#endif
       nmatch++;
       if (name != NULL)
         printf("%s:", name);