# HG changeset patch # User Ryoma SHINYA # Date 1289089945 -32400 # Node ID 4de11d799dee54361e74d888104918fc1ffeeadf # Parent 56a997f2c1219fd91885293525734caac121e104 add boost algorithm. but it's buggy, not work. diff -r 56a997f2c121 -r 4de11d799dee pyrect/jitgrep.py --- a/pyrect/jitgrep.py Sun Nov 07 03:10:43 2010 +0900 +++ b/pyrect/jitgrep.py Sun Nov 07 09:32:25 2010 +0900 @@ -6,13 +6,14 @@ import time from optparse import OptionParser from pyrect.translator import * -from pyrect.regexp import Regexp +from pyrect.regexp import Regexp, CharCollector def main(argv): myusage = """%prog [--buf-size=size] [--dump] [--time] [--debug] [--cc=compiler] [-c] [-Olevel] regexp [file..] [--out=file] - [--thread=thread_num] [--disable-filter]""" + [--thread=thread_num] [--disable-filter] + [--disable-booster]""" psr = OptionParser(usage=myusage) redirect = "" @@ -27,6 +28,7 @@ psr.add_option("--time", action="store_true", dest="time", default=False, help="Print compile/matching time.") psr.add_option("--thread", action="store", type="string", dest="thread", default="0", metavar="FILE", help="number of thread.") psr.add_option("--disable-filter", action="store_true", dest="no_filter", default=False, help="disable fixed-string filter(Boyer-Moore).") + psr.add_option("--disable-booster", action="store_true", dest="no_booster", default=False, help="disable booster.") psr.add_option("--debug", action="store_true", dest="debug", default=False, help="Dump commands, not evalute matching (except interactive mode).") psr.add_option("--label", action="store_true", dest="label", default=False, help="label implimentation in C.") psr.add_option("--dump", action="store_true", dest="dump", default=False, help="Dump generated grep-source.") @@ -62,7 +64,11 @@ return if opts.time : start_time = time.time() + reg = Regexp(".*"+string) + reg.chars = Regexp.get_chars(string) + (reg.max_len, _, _) = Regexp.get_analyze(string) + if cbc: grept = CbCGREPTranslator(reg) elif opts.label: @@ -70,7 +76,9 @@ else: grept = GREPTranslator(reg) grept.filter = not opts.no_filter + grept.skip_boost = not opts.no_booster grept.thread_line = int(opts.thread) + grept.bufsize = bufsize if opts.dump: diff -r 56a997f2c121 -r 4de11d799dee pyrect/regexp/__init__.py --- a/pyrect/regexp/__init__.py Sun Nov 07 03:10:43 2010 +0900 +++ b/pyrect/regexp/__init__.py Sun Nov 07 09:32:25 2010 +0900 @@ -7,13 +7,14 @@ from pyrect.regexp.dfa_translator import DFATranslator from pyrect.regexp.callgraph import CallGraph from pyrect.regexp.analyzer import Analyzer +from pyrect.regexp.char_collector import CharCollector class Regexp(object): """Regexp is basic class in Pyrect. this contains regexp, dfa, nfa,, actually it's include all. >>> regexp = Regexp('(A|B)*C') >>> regexp.dfacg.map - {'1': {'A': ['1'], 'C': ['2'], 'B': ['0']}, '0': {'A': ['1'], 'C': ['2'], 'B': ['0']}, '3': {'A': ['1'], 'C': ['2'], 'B': ['0']}, '2': {}} + {'1': {}, '0': {(Character:'A'): '0', (Character:'B'): '0', (Character:'C'): '1'}} >>> regexp.matches('ABC') True >>> regexp = Regexp('(a|b)*cd*e') @@ -33,6 +34,23 @@ an = Analyzer() self.max_len, self.min_len, self.must_words =\ an.analyze(self.ast) + an = CharCollector() + self.chars = an.analyze(self.ast) + + @classmethod + def get_chars(cls, regexp): + an = CharCollector() + return an.analyze(cls.parse(regexp)) + + @classmethod + def get_analyze(cls, regexp): + an = Analyzer() + return an.analyze(cls.parse(regexp)) + + @classmethod + def parse(cls, regexp): + psr = Parser() + return psr.parse(regexp) def matches(self, string): runtime = self.dfa.get_runtime() diff -r 56a997f2c121 -r 4de11d799dee pyrect/regexp/analyzer.py --- a/pyrect/regexp/analyzer.py Sun Nov 07 03:10:43 2010 +0900 +++ b/pyrect/regexp/analyzer.py Sun Nov 07 09:32:25 2010 +0900 @@ -26,7 +26,7 @@ >>> an.analyze(prs.parse('(plus)?(qmark)?')) (9, 0, []) >>> an.analyze(prs.parse('\*+ \[\[')) - (inf, 1, []) + (inf, 4, ['* [[']) """ def __init__(self, ast=None): @@ -35,7 +35,6 @@ else: self.max_len = 0 self.min_len = 0 - self.must_words = [] def analyze(self, ast=None): if ast: @@ -52,15 +51,11 @@ def visit_Character(self, ast): return 1, 1, [chr(ast.char)] - def visit_Concat(self, concat): - (max1, min1, key1) = concat.op1.accept(self) - (max2, min2, key2) = concat.op2.accept(self) + def concat(self, (max1, min1, key1), (max2, min2, key2)): + return max1 + max2, min1 + min2, key1[0:-1] \ + + ([key1[-1] + key2[0]]) + key2[1:] - return max1 + max2, min1 + min2, key1[0:-1] + ([key1[-1] + key2[0]]) + key2[1:] - - def visit_Union(self, union): - (max1, min1, _) = union.op1.accept(self) - (max2, min2, _) = union.op2.accept(self) + def union(self, (max1, min1, _), (max2, min2, __)): return max(max1, max2), min(min1, min2), ["", ""] def visit_Star(self, star): diff -r 56a997f2c121 -r 4de11d799dee pyrect/regexp/ast.py --- a/pyrect/regexp/ast.py Sun Nov 07 03:10:43 2010 +0900 +++ b/pyrect/regexp/ast.py Sun Nov 07 09:32:25 2010 +0900 @@ -8,7 +8,32 @@ class ASTWalker(object): def visit(self, ast): - pass + return + + def visit_Star(self, star): + return star.op.accept(self) + + def visit_Plus(self, plus): + return plus.op.accept(self) + + def visit_Qmark(self, qmark): + return qmark.op.accept(self) + + def visit_Concat(self, concat): + r1 = concat.op1.accept(self) + r2 = concat.op2.accept(self) + return self.concat(r1, r2) + + def visit_Union(self, union): + r1 = union.op1.accept(self) + r2 = union.op2.accept(self) + return self.union(r1, r2) + + def union(self, r1, r2): + return + + def concat(self, r1, r2): + return # AST-Nodes class Node(object): @@ -127,6 +152,10 @@ def __hash__(self): return self.char.__hash__() + @classmethod + def ascii(cls, c): + return cls.ASCII[ord(c)] + class MBCharacter(Character): def __init__(self, mbchar): ret = Character.__init__(self, mbchar) diff -r 56a997f2c121 -r 4de11d799dee pyrect/translator/grep_translator.py --- a/pyrect/translator/grep_translator.py Sun Nov 07 03:10:43 2010 +0900 +++ b/pyrect/translator/grep_translator.py Sun Nov 07 09:32:25 2010 +0900 @@ -2,7 +2,7 @@ import os from c_translator import CTranslator -from pyrect.regexp import Regexp, Analyzer +from pyrect.regexp import Regexp from pyrect.regexp.ast import ASTWalker, AnyChar, Character class GREPTranslateExeption(Exception): @@ -28,6 +28,8 @@ self.thread_line = 1 self.filter = True self.filter_only = False + self.filter_prefix = False + self.skip_boost = True self.start = "matcher" self.interface = "UCHARP beg, UCHARP buf, UCHARP end" self.args = "beg, buf, end" @@ -51,48 +53,47 @@ self.emit("#include ") self.emit("#include ") - #self.emit_skip() - - for state in self.cg.map.iterkeys(): - self.emit("void %s(%s);" % (self.state_name(state), self.interface)) - self.emit('void accept(%s);' % self.interface) self.emit('void reject(%s);' % self.interface) self.emit("void matcher(%s);" % self.interface, 2) + self.emit('void accept(%s);' % self.interface) + key = None if self.filter and self.regexp.must_words: - self.emit_filter(self.regexp.must_words) - else: self.filter = False + key = max(self.regexp.must_words, key=len) + if len(self.regexp.must_words) == 1 and len(key) == self.regexp.min_len: + self.filter_only = True + else: + self.filter = False + + if not self.filter_only: + for state in self.cg.map.iterkeys(): + self.emit("void %s(%s);" % (self.state_name(state), self.interface)) + self.emit() + + if self.filter: self.emit_filter(key) + + if self.skip_boost and not self.filter_only and \ + not AnyChar() in self.regexp.chars and \ + self.regexp.min_len > 2: + self.emit_booster(self.regexp.min_len, self.regexp.chars) + else: + self.skip_boost = False grepsource = open(self.BASE_DIR + "/template/grep.c") self.emit(grepsource.read()) - #TODO: filter is faster than dfa in matching. - #but slower than that in compiling. - #We have to improve this problem. - def emit_filter(self, words): - def longest(s1, s2): - if len(s1) >= len(s2): - return s1 - else: - return s2 - - key = reduce(longest, words) - - if len(words) == 1 and len(key) == self.regexp.min_len: - self.filter_only = True - else: - filter_prefix = False - + def emit_filter(self, key): + l = len(key) def emit_next(): - self.emit("beg = memrchr(buf, '\\n', beg);") if self.filter_only: self.emit("accept(%s);" % self.args) elif self.filter_prefix: - self.emit("buf -= %d;" % len(key)) - self.emit("dfa(%s);" % self.args) + self.emit("buf -= %d;" % l) + self.emit("%s(%s);" % (self.state_name(self.cg.start) ,self.args)) else: + self.emit("beg = memrchr(buf, '\\n', beg);") self.emit("buf = beg;") - self.emit("dfa(%s);" % self.args) + self.emit("%s(%s);" % (self.state_name(self.cg.start), self.args)) self.emit("UCHARP memrchr(UCHARP p, int c, UCHARP beg);", 2) @@ -105,33 +106,46 @@ self.emitd("}", 2) return + self.emit('static UCHAR key[] = "%s";' % key) + skip = [str(l)] * 256 for i in range(l - 1): skip[ord(key[i])] = str(l-1-i) - self.emit('static const UCHAR key[] = "%s";' % key) - self.emiti( "static const UCHAR skip[256] = {") for i in range(8): i = i * 32 self.emit(",".join(skip[i:i+32]) + ",") self.emitd( "};") - self.emit("UCHARP tmp = buf; UCHAR c;", 2) - self.emit("int i; buf += %d - skip[c];" % (l-1)) - self.emiti("while ((buf += skip[c]) < end) {") - self.emiti( "if ((c = *buf) == %d /*'%c'*/) {" % (ord(key[l-1]), key[l-1])) - self.emit( "i = %d; tmp = buf;" % (l-1)) - self.emit( "while (key[--i] == *(--tmp)) {") - self.emit( "if (i == 0) goto next;") + self.emit("UCHARP tmp1, *tmp2;", 2) + self.emit("int i; buf += %d;" % (l-1)) + self.emiti("while (buf < end) {") + self.emiti( "if (*buf == %d /*'%c'*/) {" % (ord(key[l-1]), key[l-1])) + self.emit( "tmp1 = key+%d; tmp2 = buf;" % (l-1)) + self.emit( "while (*(--tmp1) == *(--tmp2)) {") + self.emit( "if (tmp1 == key) goto next;") self.emitd( "}") self.emitd( "}") + self.emit( "buf += skip[*buf];") self.emitd("}") self.emit( "return;") self.emit( "next:") emit_next() self.emitd("}", 2) + def emit_booster(self, min_len, chars): + self.emiti("void booster(%s) {" % self.interface) + self.emiti( "do {") + self.emiti( "switch (*(buf+%d)) {" % (min_len-1)) + for c in chars: + self.emit( "case %d: /* %s */" % (ord(c), Character.ascii(c))) + self.emit( "goto ret;") + self.emitd( "}") + self.emitd( "} while(buf += %d);" % (min_len-1)) + self.emit( "ret: return %s(%s);" % (self.state_name(self.cg.start) , self.args)) + self.emitd("}", 2) + def emit_driver(self): self.emiti("void matcher(%s) {" % self.interface) if self.filter: @@ -144,12 +158,12 @@ def emit_accept_state(self): self.emiti("void accept(%s) {" % self.interface) - self.emit( "buf--;") self.emit( "UCHARP ret = (UCHARP)memchr(buf, '\\n', (buf - end));") - self.emit( 'if (ret == NULL) {fprintf(stderr, "memchr NULL err!"); exit(0);}') + if self.skip_boost or self.filter: + self.emit( "beg = memrchr(buf, '\\n', beg);") + self.emit( 'if (ret == NULL) ret = end;') self.emiti( "if (ret > end) {") - self.emit( "ret--;") - self.emit( "print_line(beg, ret);") + self.emit( "print_line(beg, end);") self.emit( "return;") self.emitd( "}") self.emit( "print_line(beg, ret);") @@ -161,7 +175,7 @@ self.emiti("void reject(%s) {" % self.interface) self.emit( "if (buf >= end) return;") self.emit( "beg = buf;") - self.emit( "%s(%s);" % (self.start, self.args)) + self.emit( "return %s(%s);" % (self.start, self.args)) self.emitd("}", 2) def emit_switch(self, case, default=None): @@ -172,9 +186,10 @@ self.emiti("switch(*buf++) {") for case, next_ in case.iteritems(): self.trans_stmt.emit(case, self.state_name(next_)) - if default: - if default == self.state_name(self.cg.start): - self.emit("default: return %s(%s);" % (default, self.args)) + if default == self.state_name(self.cg.start) and self.skip_boost: + self.emit("default: return booster(%s);" % self.args) + else: + self.emit("default: return %s(%s);" % (default, self.args)) self.emitd("}") def emit_state(self, cur_state, transition): @@ -215,7 +230,7 @@ self._emit("/* %s */" % input_node.__repr__()) def visit_Character(self, char): - self._emit("case %d: /* match %s */" % (char.char, char)) + self._emit("case %d: /* %s */" % (char.char, char)) self._emit(" return %s(%s);" % (self.next, self.args)) # Special Rule