# HG changeset patch # User Ryoma SHINYA # Date 1288590063 -32400 # Node ID 81b44ae1cd730f0c47c7557e16fc94545714b82c # Parent ee9945561f80ab2bec35343c2feafc767c66dfba add fixed-string filter(Boyer-Moore), and add option '--disable-filter'. diff -r ee9945561f80 -r 81b44ae1cd73 pyrect/jitgrep.py --- a/pyrect/jitgrep.py Wed Oct 27 20:46:41 2010 +0900 +++ b/pyrect/jitgrep.py Mon Nov 01 14:41:03 2010 +0900 @@ -12,7 +12,7 @@ myusage = """%prog [--buf-size=size] [--dump] [--time] [--debug] [--cc=compiler] [-c] [-Olevel] regexp [file..] [--out=file] - [--thread=thread_num]""" + [--thread=thread_num] [--disable-filter]""" psr = OptionParser(usage=myusage) redirect = "" @@ -23,9 +23,10 @@ help="Choose compiler (default is gcc).") psr.add_option("-c", action="store_true", dest="compile", default=False , help="compile only.") psr.add_option("--buf-size=size", action="store", type="string", dest="bufsize", default="1M" , help="Set read-buffer size (e.x. 1024, 1024K, 2M)") - psr.add_option("--CFLAGS", action="store", type="string", dest="cflags", default="-O3", help="Print compile/matching time.") + psr.add_option("--cflags", action="store", type="string", dest="cflags", default="-O3", help="Print compile/matching time.") psr.add_option("--time", action="store_true", dest="time", default=False, help="Print compile/matching time.") psr.add_option("--thread", action="store", type="string", dest="thread", default="0", metavar="FILE", help="number of thread.") + psr.add_option("--disable-filter", action="store_true", dest="no_filter", default=False, help="disable fixed-string filter(Boyer-Moore).") psr.add_option("--debug", action="store_true", dest="debug", default=False, help="Dump commands, not evalute matching (except interactive mode).") psr.add_option("--label", action="store_true", dest="label", default=False, help="label implimentation in C.") psr.add_option("--dump", action="store_true", dest="dump", default=False, help="Dump generated grep-source.") @@ -68,8 +69,9 @@ grept = GoToGREPTranslator(reg) else: grept = GREPTranslator(reg) + grept.filter = not opts.no_filter + grept.thread_num = int(opts.thread) grept.bufsize = bufsize - grept.thread_num = int(opts.thread) if opts.dump: grept.translate() diff -r ee9945561f80 -r 81b44ae1cd73 pyrect/regexp/__init__.py --- a/pyrect/regexp/__init__.py Wed Oct 27 20:46:41 2010 +0900 +++ b/pyrect/regexp/__init__.py Mon Nov 01 14:41:03 2010 +0900 @@ -6,6 +6,7 @@ from pyrect.regexp.nfa_translator import NFATranslator from pyrect.regexp.dfa_translator import DFATranslator from pyrect.regexp.callgraph import CallGraph +from pyrect.regexp.analyzer import Analyzer class Regexp(object): """Regexp is basic class in Pyrect. @@ -29,6 +30,9 @@ self.dfa = DFATranslator().translate(self.nfa) self.nfacg = CallGraph(self.nfa) self.dfacg = CallGraph(self.dfa) + an = Analyzer() + self.max_len, self.min_len, self.must_words =\ + an.analyze(self.ast) def matches(self, string): runtime = self.dfa.get_runtime() diff -r ee9945561f80 -r 81b44ae1cd73 pyrect/regexp/analyzer.py --- a/pyrect/regexp/analyzer.py Wed Oct 27 20:46:41 2010 +0900 +++ b/pyrect/regexp/analyzer.py Mon Nov 01 14:41:03 2010 +0900 @@ -16,49 +16,60 @@ >>> prs = Parser() >>> an = Analyzer() >>> an.analyze(prs.parse('fixed-string')) - 12 + (12, 12, ['fixed-string']) >>> an.analyze(prs.parse('(build|fndecl|gcc)')) - 6 - >>> an.analyze(prs.parse('(AB|CD)*123')) - inf - >>> an.analyze(prs.parse('((12)*|3)|456')) - inf + (6, 3, []) + >>> an.analyze(prs.parse('123(AB|CD)*456')) + (inf, 6, ['123', '456']) + >>> an.analyze(prs.parse('((12)+|3)|456')) + (inf, 1, []) >>> an.analyze(prs.parse('(plus)?(qmark)?')) - 9 + (9, 0, []) """ - def __init__(self): - self.maxlen = 0 + def __init__(self, ast=None): + if ast: + self.analyze(ast) + else: + self.max_len = 0 + self.min_len = 0 + self.must_words = [] def analyze(self, ast=None): if ast: - self.maxlen = ast.accept(self) - return self.maxlen + self.max_len, self.min_len, self.must_words = ast.accept(self) + self.must_words = [x for x in self.must_words if x != ""] + return self.max_len, self.min_len, self.must_words def visit(self, ast): """Following Classes contain no-Keywords. Union, Star """ - return 1 + return 1, 1, [str(ast)] + + def visit_Character(self, ast): + return 1, 1, [chr(ast.char)] def visit_Concat(self, concat): - a1 = concat.op1.accept(self) - a2 = concat.op2.accept(self) + (max1, min1, key1) = concat.op1.accept(self) + (max2, min2, key2) = concat.op2.accept(self) - return a1 + a2 + return max1 + max2, min1 + min2, key1[0:-1] + ([key1[-1] + key2[0]]) + key2[1:] def visit_Union(self, union): - a1 = union.op1.accept(self) - a2 = union.op2.accept(self) - return max(a1, a2) + (max1, min1, _) = union.op1.accept(self) + (max2, min2, _) = union.op2.accept(self) + return max(max1, max2), min(min1, min2), ["", ""] def visit_Star(self, star): - return float("inf") + return float("inf"), 0, ["", ""] def visit_Plus(self, plus): - return float("inf") + (_, m, k) = plus.op.accept(self) + return float("inf"), m, ["", k, ""] def visit_Qmark(self, qmark): - return qmark.op.accept(self) + (m, _, _) = qmark.op.accept(self) + return m, 0, ["", ""] def test(): import doctest diff -r ee9945561f80 -r 81b44ae1cd73 pyrect/regexp/dfa_translator.py --- a/pyrect/regexp/dfa_translator.py Wed Oct 27 20:46:41 2010 +0900 +++ b/pyrect/regexp/dfa_translator.py Mon Nov 01 14:41:03 2010 +0900 @@ -39,16 +39,15 @@ done = set() while que: - stateSet = que.pop() + states = que.pop() - - for state in stateSet: + for state in states: for k, v in nfa.map.iteritems(): if state == k[0] and k[1] != '': - slot = map_.setdefault((stateSet, k[1]), set()) + slot = map_.setdefault((states, k[1]), set()) slot.update(nfa.epsilon_expand(v)) - done.add(stateSet) + done.add(states) for v in map_.itervalues(): if not v in done: diff -r ee9945561f80 -r 81b44ae1cd73 pyrect/translator/grep_translator.py --- a/pyrect/translator/grep_translator.py Wed Oct 27 20:46:41 2010 +0900 +++ b/pyrect/translator/grep_translator.py Mon Nov 01 14:41:03 2010 +0900 @@ -2,7 +2,7 @@ import os from c_translator import CTranslator -from pyrect.regexp import Regexp +from pyrect.regexp import Regexp, Analyzer class GREPTranslateExeption(Exception): pass @@ -25,6 +25,7 @@ self.__bufsize = 1024 * 1024 self.parallel_match = False self.thread_num = 0 + self.filter = True def getbufsize(self,): return self.__bufsize @@ -35,24 +36,71 @@ def emit_initialization(self): CTranslator.emit_initialization(self) + + if self.thread_num > 1: + self.emit("#define GREP paragrep") + else: + self.emit("#define GREP grep") + self.emit("#define LINEBUFSIZE %d" % self.bufsize) self.emit("#define READBUFSIZE %d" % self.bufsize) self.emit('#define THREAD_NUM %d' % self.thread_num) self.emit('#define THREAD_BUF %d' % 3) self.emit('#include ') - if self.thread_num > 1: - self.emit("#define GREP paragrep") - else: - self.emit("#define GREP grep") self.emit("#include ") self.emit("#include ") self.emit("char readbuf[%d];" % (self.bufsize)) self.emit("int DFA(unsigned char* s);", 2) + + if self.filter and self.regexp.must_words: + self.emit_filter(self.regexp.must_words) + grepsource = open(self.BASE_DIR + "/template/grep.c") self.emit(grepsource.read()) - def emit_filter(self): - pass + def emit_filter(self, words): + def longest(s1, s2): + return s1 if len(s1) >= len(s2) else s2 + key = reduce(longest, words) + + if len(words) == 1: + if len(key) == self.regexp.min_len: + self.emit("#define FILTER_ONLY 1", 1) + else: + self.emit("#define WITH_FILTER 1", 1) + + self.emiti("int FILTER(unsigned char* text, int n) {") + l = len(key) + if l == 1: + self.emit(" return (strchr(text, %d) != NULL)" % ord(key)) + self.emitd("}", 2) + return + + skip = [str(l)] * 256 + for i in range(l - 1): + skip[ord(key[i])] = str(l-1-i) + + self.emit('static unsigned char key[] = "%s";' % key) + self.emiti( "static int skip[256] = {") + for i in range(8): + i = i * 32 + self.emit(",".join(skip[i:i+32]) + ",") + self.emitd( "};") + + self.emit("int i = %d, j, k, len = %d;" % (l-1 ,l)) + self.emit("unsigned char c, tail = %d; //'%c'" % (ord(key[l-1]), key[l-1]), 2) + self.emiti("while (i < n) {") + self.emit( "c = text[i];") + self.emiti( "if (c == tail) {") + self.emit( "j = len - 1; k = i;") + self.emiti( "while (key[--j] == text[--k]) {") + self.emit( "if (j == 0) return 1;") + self.emitd( "}") + self.emitd( "}") + self.emit( "i += skip[c];") + self.emitd("}") + self.emit( "return 0;") + self.emitd("}", 2) def emit_driver(self): self.emiti("int DFA(unsigned char *text) {") diff -r ee9945561f80 -r 81b44ae1cd73 pyrect/translator/template/grep.c --- a/pyrect/translator/template/grep.c Wed Oct 27 20:46:41 2010 +0900 +++ b/pyrect/translator/template/grep.c Mon Nov 01 14:41:03 2010 +0900 @@ -53,7 +53,14 @@ n = strlen(lbuf); if (n > 0 && buf[n-1] == '\n') lbuf[n-1] = '\0'; + +#ifdef FILTER_ONLY + if (FILTER(buf, n-1)) { +#elif defined(WITH_FILTER) + if (FILTER(buf, n-1) && DFA(buf)) { +#else if (DFA(buf)) { +#endif nmatch++; if (name != NULL) printf("%s:", name);