Mercurial > hg > Members > shinya > pyrect
changeset 101:5e509a9c951c
modify CbC-grep.
author | Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 14 Dec 2010 04:08:09 +0900 |
parents | 6aab6b1038f0 |
children | a38b57592d45 |
files | pyrect/jitgrep.py pyrect/regexp/callgraph.py pyrect/regexp/dfa_translator.py pyrect/regexp/fa.py pyrect/translator/cbc_grep_translator.py |
diffstat | 5 files changed, 328 insertions(+), 101 deletions(-) [+] |
line wrap: on
line diff
--- a/pyrect/jitgrep.py Sun Dec 12 23:14:05 2010 +0900 +++ b/pyrect/jitgrep.py Tue Dec 14 04:08:09 2010 +0900 @@ -9,7 +9,7 @@ from pyrect.regexp import Regexp, CharCollector def main(argv): - myusage = """%prog [--buf-size=size] [--dump] + myusage = """%prog [--buf-size=size] [--dump] [--CbC] [--time] [--debug] [--cc=compiler] [-c] [-Olevel] regexp [file..] [--out=file] [--thread=thread_num] [--filter=algorithm] @@ -34,6 +34,7 @@ psr.add_option("--label", action="store_true", dest="label", default=False, help="label implimentation in C.") psr.add_option("--dump", action="store_true", dest="dump", default=False, help="Dump generated grep-source.") psr.add_option("--out", action="store", type="string", dest="out", default="", metavar="FILE", help="Output file.") + psr.add_option("--CbC", action="store_true", dest="cbc", default=False, help="emit CbC-source.") (opts, args) = psr.parse_args(argv) @@ -41,12 +42,8 @@ psr.print_usage() return - if opts.cc == "cbc": - cbc = True - opts.cc = "$CBCROOT/INSTALL_DIR/bin/gcc" - opts.cflags += " -L$CBCROOT/gcc -w" - else: - cbc = False + if opts.cbc == True: + opts.cc = "gcc-cbc" if opts.debug: print("option", opts) if opts.debug: print("args", args) @@ -70,7 +67,7 @@ reg.chars = Regexp.get_chars(string) (reg.max_len, _, _) = Regexp.get_analyze(string) - if cbc: + if opts.cbc: grept = CbCGREPTranslator(reg) else: if opts.label:
--- a/pyrect/regexp/callgraph.py Sun Dec 12 23:14:05 2010 +0900 +++ b/pyrect/regexp/callgraph.py Tue Dec 14 04:08:09 2010 +0900 @@ -1,4 +1,4 @@ -#!/Usr/bin/env python +#!/usr/bin/env python # -*- encoding: utf-8 -*- import copy
--- a/pyrect/regexp/dfa_translator.py Sun Dec 12 23:14:05 2010 +0900 +++ b/pyrect/regexp/dfa_translator.py Tue Dec 14 04:08:09 2010 +0900 @@ -2,7 +2,7 @@ #-*- encoding: utf-8 -*- from pyrect.regexp.parser import Parser -from pyrect.regexp.ast import ASTWalker, AnyChar +from pyrect.regexp.ast import ASTWalker, AnyChar, Range from pyrect.regexp.fa import Transition from pyrect.regexp.nfa_translator import NFATranslator from pyrect.regexp.nfa import NFA, SuffixNFA, SuffixTrieNFA @@ -42,13 +42,12 @@ while que: states = que.pop() + anys = set() for state in states: trans = nfa.transition[state] - if trans == None: continue - for i, n in trans.iteritems(): - if i == AnyChar(): - anys.update(nfa.epsilon_expand(n)) + if trans and trans.has_key(AnyChar()): + anys.update(nfa.epsilon_expand(trans[AnyChar()])) for state in states: trans = nfa.transition[state]
--- a/pyrect/regexp/fa.py Sun Dec 12 23:14:05 2010 +0900 +++ b/pyrect/regexp/fa.py Tue Dec 14 04:08:09 2010 +0900 @@ -1,5 +1,7 @@ #!/usr/bin/env python +from pyrect.regexp.ast import AnyChar, Range + class FA(object): def __init__(self, transition, start, accepts, states=None): self.transition = transition
--- a/pyrect/translator/cbc_grep_translator.py Sun Dec 12 23:14:05 2010 +0900 +++ b/pyrect/translator/cbc_grep_translator.py Tue Dec 14 04:08:09 2010 +0900 @@ -1,128 +1,357 @@ #!/usr/bin/env python import os +from c_translator import CTranslator from pyrect.regexp import Regexp -from pyrect.translator import CbCTranslator +from pyrect.regexp.ast import ASTWalker, AnyChar, Character, SpecialInputNode class CbCGREPTranslateExeption(Exception): pass -class CbCGREPTranslator(CbCTranslator): +class CbCGREPTranslator(CTranslator): """CbCGREPTranslator This Class can translate form DFA into grep source-code. which based on (beautiful) mini-grep introduced \"The Practice of Programming\" written by Rob Pike & Brian W. Kernighan. (see template/grep.c) >>> string = \"(build|fndecl|gcc)\" >>> reg = Regexp(string) - >>> tje = GREPTranslator(reg) + >>> tje = CbCGREPTranslator(reg) >>> tje.translate() """ BASE_DIR = os.path.dirname(os.path.abspath(__file__)) def __init__(self, regexp): - CbCTranslator.__init__(self, regexp) - self.interface = "unsigned char *s, unsigned char* cur, unsigned char* buf, FILE *f, char* filename" - self.args = "s, cur, buf, f, filename" - self.print_file = False + CTranslator.__init__(self, regexp) self.__bufsize = 1024 * 1024 - self.trans_stmt = self._trans_stmt(self.emit, self.args) + self.thread_dfa = 1 + self.thread_line = 1 + self.filter = "quick" + self.filter_only = False + self.filter_prefix = False + self.skip_boost = True + self.table_lookup = False + self.start = "entry" + self.interface = "UCHARP beg, UCHARP buf, UCHARP end, ENVP envp" + self.args = "beg, buf, end, envp" - def getbufsize(self): + def getbufsize(self,): return self.__bufsize def setbufsize(self, bufsize): self.__bufsize = abs(bufsize) - bufsize = property(getbufsize, setbufsize) - - def state_name(self, state): - if state in ("accept", "reject", "next_ptr", "next_line", "returner"): - return state - else: - return "state_" + state - - - def emit_accept_state(self): - self.emit("__code accept(%s) {\n" % self.interface) - if self.print_file: - self.emit(" printf(\"%s: %s\\n\", filename, buf);\n") - else: - self.emit(" printf(\"%s\\n\", buf);\n") - self.emit(" goto next_line(%s);\n}\n\n" % self.args) - - def emit_reject_state(self): - self.emit(""" -__code reject(%s) { - goto next_ptr(%s); -} -""" % (self.interface, self.args)) - - def emit_next_state(self): - self.emit (""" -__code next_ptr(%s) { - if(*cur++ == '\\0') - goto next_line(%s); - s = cur; - goto DFA(%s); -} -""" % (self.interface, self.args, self.args)) - - self.emit(""" -__code next_line(%s) { - if(fgets(buf, LINEBUFSIZE, f) == NULL) { - goto returner(%s); - } - int n = strlen(buf); - if (n > 0 && buf[n-1] == '\\n') - buf[n-1] = '\\0'; - cur = buf; - s = cur; - goto DFA(%s); -} -""" % (self.interface, self.args, self.args)) - self.emit(""" -__code returner(%s) { - return; -}""" % self.interface) + bufsize = property(getbufsize, setbufsize) def emit_initialization(self): self.emit("#include <stdio.h>") self.emit("#include <stdlib.h>") + self.emit("#include <sys/mman.h>") + self.emit("#include <sys/types.h>") + self.emit("#include <sys/stat.h>") + self.emit("#include <fcntl.h>") + self.emit("#include <unistd.h>") self.emit("#include <string.h>", 2) - self.emit("#define LINEBUFSIZE 1024") - self.emit("#define READBUFSIZE %d" % self.bufsize, 2) + + self.emit("typedef unsigned char UCHAR;") + self.emit("typedef unsigned char *UCHARP;") + self.emiti("typedef struct env_ {") + self.emit( "void (*ret)();") + self.demit("} ENV;") + self.emit("typedef ENV *ENVP;", 2) + + self.emit("void matcher(UCHARP beg, UCHARP buf, UCHARP end);") + self.emit('__code entry(%s);' % self.interface, 2) + + self.emit('__code accept(%s);' % self.interface) + self.emit('__code reject(%s);' % self.interface, 2) + + key = None + + if (self.filter == "bmh" or self.filter == "quick")\ + and self.regexp.must_words: + key = max(self.regexp.must_words, key=len) + if len(self.regexp.must_words) == 1 and len(key) == self.regexp.min_len: + self.filter_only = True + else: + self.filter = False + + if not self.filter_only: + for state in self.fa.transition.iterkeys(): + self.emit("__code %s(%s);" % (self.state_name(state), self.interface)) + self.emit() + + if self.filter == "bmh": + self.emit_bmh_filter(key) + elif self.filter: + self.emit_quick_filter(key) + + if self.skip_boost and not self.filter_only and \ + not AnyChar() in self.regexp.chars and \ + self.regexp.min_len > 2: + self.emit_booster(self.regexp.min_len, self.regexp.chars) + else: + self.skip_boost = False + + grepsource = open(self.BASE_DIR + "/template/grep.c") + self.emit(grepsource.read()) + + def emit_bmh_filter(self, key): + l = len(key) + def emit_next(): + if self.filter_only: + self.emit("goto accept(%s);" % self.args) + elif self.filter_prefix: + self.emit("buf++;") + self.emit("goto %s(%s);" % (self.state_name(self.fa.start), self.args)) + else: + self.emit("beg = get_line_beg(buf, beg);") + self.emit("buf = beg;") + self.emit("goto %s(%s);" % (self.state_name(self.fa.start), self.args)) + + self.emit("UCHARP get_line_beg(UCHARP p, UCHARP beg);", 2) + self.emiti("__code bmh_filter(%s) {" % self.interface) + l = len(key) + if l == 1: + self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key)) + self.emit("if (buf == NULL) (*envp->ret)();") + emit_next() + self.demit("}", 2) + return + + self.emit('static const UCHAR key[] = "%s";' % key) + + skip = dict() + for i in range(l-1): + skip[key[i]] = l-1-i + + self.emit("UCHARP tmp1, tmp2; buf += %d;" % (l-1), 2) - self.emit("__code DFA(%s);\n" % self.interface) - for state in self.cg.map.keys() + ["accept", "reject", "next_ptr", "next_line", "returner"]: - self.emit("__code %s(%s);" % (self.state_name(state), self.interface)) - self.emit() - grepsource = open(self.BASE_DIR + "/template/grep.cbc") - self.emit(grepsource.read()) - self.emit_next_state() + self.emiti("while (buf < end) {") + self.emiti( "if (*buf == %d /* %s */) {" % (ord(key[-1]), Character.ascii(key[-1]))) + self.emit( "tmp1 = buf, tmp2 = (UCHARP)key+%d;" % (l-1)) + self.emiti( "while (*(--tmp1) == *(--tmp2)) {") + self.emit( "if (tmp2 == key) goto next;") + self.demit( "}") + self.demit( "}") + self.emiti( "switch(*buf) {") + for k, v in skip.iteritems(): + self.emiti( "case %d: /* %s */" % (ord(k), Character.ascii(k))) + self.emit( "buf += %d; break;" % v), self.dedent() + self.emiti("default: buf += %d;" % l), self.dedent() + self.demit( "}") + self.demit("}") + self.emit( "(*envp->ret)();") + self.emit( "next:") + emit_next() + self.demit("}", 2) + + def emit_quick_filter(self, key): + l = len(key) + def emit_next(): + if self.filter_only: + self.emit("goto accept(%s);" % self.args) + elif self.filter_prefix: + self.emit("buf+%d;" % l) + self.emit("goto %s(%s);" % (self.state_name(self.fa.start) ,self.args)) + else: + self.emit("beg = get_line_beg(buf, beg);") + self.emit("buf = beg;") + self.emit("goto %s(%s);" % (self.state_name(self.fa.start), self.args)) + + self.emit("UCHARP get_line_beg(UCHARP p, UCHARP beg);", 2) + + self.emiti("__code quick_filter(%s) {" % self.interface) + l = len(key) + if l == 1: + self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key)) + self.emit("if (buf == NULL) (*envp->ret)();") + emit_next() + self.demit("}", 2) + return + + self.emit('static const UCHAR key[] = "%s";' % key) - def emit_filter(self): - pass + skip = dict() + for i in range(l): + skip[key[i]] = l-i + + self.emit("UCHARP tmp1, tmp2, end_ = end - %d;" % (l-1), 2) + self.emiti("while (buf < end_) {") + self.emiti( "if (*buf == %d /* %s */) {" % (ord(key[0]), Character.ascii(key[0]))) + self.emit( "tmp1 = buf, tmp2 = (UCHARP)key;") + self.emiti( "while (*(++tmp1) == *(++tmp2)){") + self.emit( "if (tmp2 == key+%d) goto next;" % (l-1)) + self.demit( "}") + self.demit( "}") + if self.table_lookup: + self.emiti("static const void * tbl[256] = {[0 ... 255] &&any, %s};" + % ", ".join("[%d] &&add%s" % (ord(c), s) for c, s in skip.iteritems())) + self.emit("goto *tbl[buf[%d]];" % l) + defun = [] + for s in skip.itervalues(): + if s in defun: continue + defun.append(s) + self.emit("add%s: buf += %s; goto ends;" % (s, s)) + self.emit("any: buf += %d; ends:;" % (l+1)) + else: + self.emiti( "switch(buf[%d]) {" % l) + for k, v in skip.iteritems(): + self.emiti( "case %d: /* %s */" % (ord(k), Character.ascii(k))) + self.emit( "buf += %d; break;" % v), self.dedent() + self.emiti("default: buf += %d;" % (l+1)), self.dedent() + self.demit( "}") + self.demit("}") + self.emit( "(*envp->ret)();") + self.emit( "next:") + emit_next() + self.demit("}", 2) + + def emit_booster(self, min_len, chars): + self.emiti("__code booster(%s) {" % self.interface) + self.emit( "UCHARP end_ = end - %d;" % (min_len-1)) + self.emit( "if (buf > end_) (*envp->ret)();") + self.emiti( "do {") + self.emiti( "switch (buf[%d]) {" % (min_len-1)) + for c in chars: + self.emit( "case %d: /* %s */" % (ord(c), Character.ascii(c))) + self.emit( "goto ret;") + self.demit( "}") + self.demit( "} while((buf += %d) <= end_);" % min_len) + self.emit( "ret: goto %s(%s);" % (self.state_name(self.fa.start) , self.args)) + self.demit("}", 2) def emit_driver(self): - self.emit(""" -int main(int argc, char* argv[]) { - grepmain(argc, argv); - return; -} -""") - self.emit(""" -__code DFA(%s) { - goto %s(%s); -} -""" % (self.interface, self.state_name(self.cg.start), self.args)) + self.emiti("void matcher(UCHARP beg, UCHARP buf, UCHARP end) {") + self.emit( "__label__ _return;") + self.emiti( "void __return() {") + self.emit( "goto _return;") + self.demit( "}") + self.emit( "ENVP envp = malloc(sizeof(ENV));") + self.emit( "envp->ret = __return;") + self.emit( "goto entry(%s);" % self.args) + self.demiti("_return:") + self.emit( "return;") + self.demit("}", 2) + + self.emiti("__code entry(%s) {" % self.interface) + if self.filter: + self.emit( "goto %s(%s);" % (self.filter + "_filter", self.args)) + else: + self.emit( "goto %s(%s);" % (self.state_name(self.fa.start), self.args)) + self.demit("}", 2) + + def emit_accept_state(self): + self.emiti("__code accept(%s) {" % self.interface) + self.emit( "UCHARP ret = (UCHARP)memchr(buf, '\\n', (buf - end));") + if self.skip_boost or self.filter: + self.emit( "beg = get_line_beg(buf, beg);") + self.emiti( "if (ret == NULL) {") + self.emit( "print_line(beg, end);") + self.emit( "(*envp->ret)();") + self.demit( "}") + self.emit( "print_line(beg, ret);") + self.emit( "beg = buf = ret + 1;") + self.emit( "goto %s(%s);" % (self.start, self.args)) + self.demit("}", 2) + + def emit_reject_state(self): + self.emiti("__code reject(%s) {" % self.interface) + self.emit( "if (buf >= end) (*envp->ret)();") + self.emit( "beg = buf;") + self.emit( "goto %s(%s);" % (self.start, self.args)) + self.demit("}", 2) + + def emit_switch(self, case, default=None): + if not case: + if default: + self.emit("goto %s(%s);" % (default, self.args)) + return + self.emiti("switch(*buf++) {") + for case, next_ in case.iteritems(): + self.trans_stmt.emit(case, self.state_name(next_)) + if default == self.state_name(self.fa.start) and self.skip_boost: + self.emit("default: goto booster(%s);" % self.args) + else: + self.emit("default: goto %s(%s);" % (default, self.args)) + self.demit("}") def emit_state(self, cur_state, transition): - if cur_state in self.cg.accepts: - self.emiti("__code %s(%s) {" % (self.state_name(cur_state), self.interface)) - self.emit( "goto accept(%s);" % self.args) - self.emitd("}") + if self.filter_only: return + + self.emiti("__code %s(%s) {" % (self.state_name(cur_state), self.interface)) + + if cur_state in self.fa.accepts: + self.emit( "goto accept(beg, buf-1, end, envp);") + self.demit("}", 2) + return + + if transition.has_key(AnyChar()): + default = self.state_name(transition.pop(AnyChar())) else: - CbCTranslator.emit_state(self, cur_state, transition) + default = self.state_name(self.fa.start) + + if self.table_lookup and (cur_state == self.fa.start or \ + self.state_name(cur_state) == default): + if self.skip_boost and default == self.state_name(self.fa.start): + default = "booster" + tbl = dict() + for eol in self.eols: + tbl[eol.char] = "reject" + for c, n in transition.iteritems(): + tbl[c.char] = self.state_name(n) + self.emit( "static __code (*%s_table[256])(UCHARP, UCHARP, UCHARP) = {[0 ... 255] = (void*)%s, %s};" + % (self.state_name(cur_state), default, + ", ".join("[%d] = %s" % (i, s) for (i, s) in tbl.items()))) + self.emit( "goto %s_table[*buf++](%s);" % (self.state_name(cur_state), self.args)) + self.demit("}", 2) + return + + for eol in self.eols: + transition[eol] = "reject" + + for input_ in transition.keys(): + if isinstance(input_, SpecialInputNode): + self.trans_stmt.emit(input_, self.state_name(transition.pop(input_))) + + self.emit_switch(transition, default) + + self.demit("}", 2) + + class _trans_stmt(ASTWalker): + def __init__(self, emit): + self._emit = emit + self.args = "beg, buf, end, envp" + + def emit(self, input_node, next_): + self.next = next_ + input_node.accept(self) + + def visit(self, input_node): + self._emit("/* UNKNOW RULE */") + self._emit("/* %s */" % input_node.__repr__()) + + def visit_Character(self, char): + self._emit("case %d: /* %s */" % (char.char, char)) + self._emit(" goto %s(%s);" % (self.next, self.args)) + + # Special Rule + def visit_BegLine(self, begline): + self._emit("/* begin of line */") + self._emit("if (buf == beg)") + self._emit(" goto %s(%s);" % (self.next, self.args), 2) + + def visit_Range(self, range): + if isinstance(range.lower, MBCharacter) and not \ + isinstance(range.upper, MBCharacter) or \ + isinstance(range.upper, MBCharacter) and not \ + isinstance(range.lower, MBCharacter): + return + + if isinstance(range.lower, MBCharacter): + self.visit(range) + else: + self._emit("if ('%s' <= *buf && *buf <= '%s')" % (range.lower.char, range.upper.char)) + self._emit(" goto %s(beg, buf+1, end, envp);" % self.next, 2) def test(): import doctest