changeset 75:e06786b3c2dc

modify filtering algorithm, unloop string-compare!!
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Mon, 08 Nov 2010 03:45:14 +0900
parents a6a0504dea7b
children dd6d2b9e48ad
files pyrect/jitgrep.py pyrect/translator/grep_translator.py
diffstat 2 files changed, 70 insertions(+), 19 deletions(-) [+]
line wrap: on
line diff
--- a/pyrect/jitgrep.py	Mon Nov 08 02:19:18 2010 +0900
+++ b/pyrect/jitgrep.py	Mon Nov 08 03:45:14 2010 +0900
@@ -12,7 +12,7 @@
     myusage = """%prog [--buf-size=size] [--dump]
                   [--time] [--debug] [--cc=compiler] [-c]
                   [-Olevel] regexp [file..] [--out=file]
-                  [--thread=thread_num] [--disable-filter]
+                  [--thread=thread_num] [--filter=algorithm]
                   [--disable-booster]"""
     psr = OptionParser(usage=myusage)
 
@@ -27,8 +27,8 @@
     psr.add_option("--cflags", action="store", type="string", dest="cflags", default="-O3", help="Print compile/matching time.")
     psr.add_option("--time", action="store_true", dest="time", default=False, help="Print compile/matching time.")
     psr.add_option("--thread", action="store", type="string", dest="thread", default="0", metavar="FILE", help="number of thread.")
-    psr.add_option("--disable-filter", action="store_true", dest="no_filter", default=False, help="disable fixed-string filter(Boyer-Moore).")
-    psr.add_option("--disable-booster", action="store_true", dest="no_booster", default=False, help="disable booster.")
+    psr.add_option("--disable-booster", action="store_true", dest="no_boost", default=False, help="disable boosetr (default: use booster).")
+    psr.add_option("--filter", action="store", type="string", dest="filter", default="bmh", help="chose filtering-algorithm bmh(default), quick, or none.")
     psr.add_option("--debug", action="store_true", dest="debug", default=False, help="Dump commands, not evalute matching (except interactive mode).")
     psr.add_option("--label", action="store_true", dest="label", default=False, help="label implimentation in C.")
     psr.add_option("--dump", action="store_true", dest="dump", default=False, help="Dump generated grep-source.")
@@ -75,8 +75,8 @@
         grept = GoToGREPTranslator(reg)
     else:
         grept = GREPTranslator(reg)
-        grept.filter = not opts.no_filter
-        grept.skip_boost = not opts.no_booster
+        grept.filter = opts.filter
+        grept.skip_boost = not opts.no_boost
         grept.thread_line = int(opts.thread)
 
     grept.bufsize = bufsize
--- a/pyrect/translator/grep_translator.py	Mon Nov 08 02:19:18 2010 +0900
+++ b/pyrect/translator/grep_translator.py	Mon Nov 08 03:45:14 2010 +0900
@@ -26,7 +26,7 @@
         self.__bufsize = 1024 * 1024
         self.thread_dfa = 1
         self.thread_line = 1
-        self.filter = True
+        self.filter = "bmh"
         self.filter_only = False
         self.filter_prefix = False
         self.skip_boost = True
@@ -39,7 +39,7 @@
     def setbufsize(self, bufsize):
         self.__bufsize = abs(bufsize)
 
-    bufsize = property(getbufsize, setbufsize)
+        bufsize = property(getbufsize, setbufsize)
 
     def emit_initialization(self):
         self.emit("#include <stdio.h>")
@@ -58,7 +58,9 @@
         self.emit('void accept(%s);' % self.interface)
 
         key = None
-        if self.filter and self.regexp.must_words:
+
+        if (self.filter == "bmh" or self.filter == "quick")\
+               and self.regexp.must_words:
             key = max(self.regexp.must_words, key=len)
             if len(self.regexp.must_words) == 1 and len(key) == self.regexp.min_len:
                 self.filter_only = True
@@ -70,7 +72,10 @@
                 self.emit("void %s(%s);" % (self.state_name(state), self.interface))
             self.emit()
 
-        if self.filter: self.emit_filter(key)
+        if self.filter == "bmh":
+            self.emit_bmh_filter(key)
+        else:
+            self.emit_quick_filter(key)
 
         if self.skip_boost and not self.filter_only and \
                not AnyChar() in self.regexp.chars and \
@@ -82,7 +87,7 @@
         grepsource = open(self.BASE_DIR + "/template/grep.c")
         self.emit(grepsource.read())
 
-    def emit_filter(self, key):
+    def emit_bmh_filter(self, key):
         l = len(key)
         def emit_next():
             if self.filter_only:
@@ -97,7 +102,7 @@
 
         self.emit("UCHARP get_line_beg(UCHARP p, UCHARP beg);", 2)
 
-        self.emiti("void bm_filter(%s) {" % self.interface)
+        self.emiti("void bmh_filter(%s) {" % self.interface)
         l = len(key)
         if l == 1:
             self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key))
@@ -112,14 +117,12 @@
         for i in range(l-1):
             skip[key[i]] = l-1-i
 
-        self.emit("UCHARP tmp1, *tmp2;",  2)
+        self.emit("UCHARP tmp;",  2)
         self.emit("int i; buf += %d;" % (l-1))
-        self.emiti("while (buf < end) {")
-        self.emiti(  "if (*buf == %d /*'%c'*/) {" % (ord(key[l-1]), key[l-1]))
-        self.emit(     "tmp1 = key+%d; tmp2 = buf;" % (l-1))
-        self.emit(     "while (*(--tmp1) == *(--tmp2)) {")
-        self.emit(       "if (tmp1 == key) goto next;")
-        self.emitd(    "}")
+        self.emiti("while ((tmp = buf) < end) {")
+        self.emiti(  "if (%s) {" %
+                     " && ".join(reversed(["*tmp-- == %d" % ord(c) for c in key])))
+        self.emit(       "goto next;")
         self.emitd(  "}")
         self.emiti(  "switch(*buf) {")
         for k, v in skip.iteritems():
@@ -133,6 +136,54 @@
         emit_next()
         self.emitd("}", 2)
 
+    def emit_quick_filter(self, key):
+        l = len(key)
+        def emit_next():
+            if self.filter_only:
+                self.emit("accept(%s);" % self.args)
+            elif self.filter_prefix:
+                self.emit("buf+%d;" % l)
+                self.emit("%s(%s);" % (self.state_name(self.cg.start) ,self.args))
+            else:
+                self.emit("beg = get_line_beg(buf, beg);")
+                self.emit("buf = beg;")
+                self.emit("%s(%s);" % (self.state_name(self.cg.start), self.args))
+
+        self.emit("UCHARP get_line_beg(UCHARP p, UCHARP beg);", 2)
+
+        self.emiti("void quick_filter(%s) {" % self.interface)
+        l = len(key)
+        if l == 1:
+            self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key))
+            self.emit("if (buf == NULL) return;")
+            emit_next()
+            self.emitd("}", 2)
+            return
+
+        self.emit('static UCHAR key[] = "%s";' % key)
+
+        skip = dict()
+        for i in range(l):
+            skip[key[i]] = l-i
+
+        self.emit("UCHARP tmp, *end_ = end - %d;" % l,  2)
+        self.emiti("while ((tmp = buf) <= end_) {")
+        self.emiti(  "if (%s) {" %
+                     " && ".join(["*tmp++ == %d" % ord(c) for c in key]))
+        self.emit(     "goto next;")
+        self.emitd(  "}")
+        self.emiti(  "switch(buf[%d]) {" % l)
+        for k, v in skip.iteritems():
+            self.emiti(  "case %d: /* %s */" % (ord(k), Character.ascii(k)))
+            self.emit(     "buf += %d; break;" % v), self.dedent()
+        self.emiti("default: buf += %d;" % (l+1)), self.dedent()
+        self.emitd(  "}")
+        self.emitd("}")
+        self.emit( "return;")
+        self.emit( "next:")
+        emit_next()
+        self.emitd("}", 2)
+
     def emit_booster(self, min_len, chars):
         self.emiti("void booster(%s) {" % self.interface)
         self.emiti(  "do {")
@@ -149,7 +200,7 @@
     def emit_driver(self):
         self.emiti("void matcher(%s) {" % self.interface)
         if self.filter:
-            self.emit(   "%s(%s);" % ("bm_filter", self.args))
+            self.emit(   "%s(%s);" % (self.filter + "_filter", self.args))
         else:
             self.emit(   "%s(%s);" % (self.state_name(self.cg.start), self.args))
         self.emit(   "return;")