Mercurial > hg > Members > shinya > pyrect
view pyrect/translator/grep_translator.py @ 106:8102bf4bbec6
modify range stmt.
author | Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 14 Dec 2010 15:02:25 +0900 |
parents | a38b57592d45 |
children | d591da6e2988 |
line wrap: on
line source
#!/usr/bin/env python import os from c_translator import CTranslator from pyrect.regexp import Regexp from pyrect.regexp.ast import ASTWalker, AnyChar, Character, SpecialInputNode class GREPTranslateExeption(Exception): pass class GREPTranslator(CTranslator): """GREPTranslator This Class can translate form DFA into grep source-code. which based on (beautiful) mini-grep introduced \"The Practice of Programming\" written by Rob Pike & Brian W. Kernighan. (see template/grep.c) >>> string = \"(build|fndecl|gcc)\" >>> reg = Regexp(string) >>> tje = GREPTranslator(reg) >>> tje.translate() """ BASE_DIR = os.path.dirname(os.path.abspath(__file__)) def __init__(self, regexp): CTranslator.__init__(self, regexp) self.__bufsize = 1024 * 1024 self.thread_dfa = 1 self.thread_line = 1 self.filter = "quick" self.filter_only = False self.filter_prefix = False self.skip_boost = True self.table_lookup = False self.start = "matcher" self.interface = "UCHARP beg, UCHARP buf, UCHARP end" self.args = "beg, buf, end" def getbufsize(self,): return self.__bufsize def setbufsize(self, bufsize): self.__bufsize = abs(bufsize) bufsize = property(getbufsize, setbufsize) def emit_initialization(self): self.emit("#include <stdio.h>") self.emit("#include <stdlib.h>") self.emit("#include <sys/mman.h>") self.emit("#include <sys/types.h>") self.emit("#include <sys/stat.h>") self.emit("#include <fcntl.h>") self.emit("#include <unistd.h>") self.emit("#include <string.h>", 2) self.emit("typedef unsigned char UCHAR;") self.emit("typedef unsigned char *UCHARP;", 2) self.emit('void reject(%s);' % self.interface) self.emit("void matcher(%s);" % self.interface) self.emit('void accept(%s);' % self.interface, 2) key = None if (self.filter == "bmh" or self.filter == "quick")\ and self.regexp.must_words: key = max(self.regexp.must_words, key=len) if len(self.regexp.must_words) == 1 and len(key) == self.regexp.min_len: self.filter_only = True else: self.filter = False if not self.filter_only: for state in self.fa.transition.iterkeys(): self.emit("void %s(%s);" % (self.state_name(state), self.interface)) self.emit() if self.filter == "bmh": self.emit_bmh_filter(key) elif self.filter: self.emit_quick_filter(key) if self.skip_boost and not self.filter_only and \ not AnyChar() in self.regexp.chars and \ self.regexp.min_len >= 2: self.emit_booster(self.regexp.min_len, self.regexp.chars) else: self.skip_boost = False grepsource = open(self.BASE_DIR + "/template/grep.c") self.emit(grepsource.read()) def emit_bmh_filter(self, key): l = len(key) def emit_next(): if self.filter_only: self.emit("return accept(%s);" % self.args) elif self.filter_prefix: self.emit("buf++;") self.emit("return %s(%s);" % (self.state_name(self.fa.start), self.args)) else: self.emit("beg = get_line_beg(buf, beg);") self.emit("buf = beg;") self.emit("return %s(%s);" % (self.state_name(self.fa.start), self.args)) self.emit("UCHARP get_line_beg(UCHARP p, UCHARP beg);", 2) self.emiti("void bmh_filter(%s) {" % self.interface) l = len(key) if l == 1: self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key)) self.emit("if (buf == NULL) return;") emit_next() self.demit("}", 2) return self.emit('static const UCHAR key[] = "%s";' % key) skip = dict() for i in range(l-1): skip[key[i]] = l-1-i self.emit("UCHARP tmp1, tmp2; buf += %d;" % (l-1), 2) self.emiti("while (buf < end) {") self.emiti( "if (*buf == %d /* %s */) {" % (ord(key[-1]), Character.ascii(key[-1]))) self.emit( "tmp1 = buf, tmp2 = (UCHARP)key+%d;" % (l-1)) self.emiti( "while (*(--tmp1) == *(--tmp2)) {") self.emit( "if (tmp2 == key) goto next;") self.demit( "}") self.demit( "}") self.emiti( "switch(*buf) {") for k, v in skip.iteritems(): self.emiti( "case %d: /* %s */" % (ord(k), Character.ascii(k))) self.emit( "buf += %d; break;" % v), self.dedent() self.emiti("default: buf += %d;" % l), self.dedent() self.demit( "}") self.demit("}") self.emit( "return;") self.emit( "next:") emit_next() self.demit("}", 2) def emit_quick_filter(self, key): l = len(key) def emit_next(): if self.filter_only: self.emit("return accept(%s);" % self.args) elif self.filter_prefix: self.emit("buf+%d;" % l) self.emit("return %s(%s);" % (self.state_name(self.fa.start) ,self.args)) else: self.emit("beg = get_line_beg(buf, beg);") self.emit("buf = beg;") self.emit("return %s(%s);" % (self.state_name(self.fa.start), self.args)) self.emit("UCHARP get_line_beg(UCHARP p, UCHARP beg);", 2) self.emiti("void quick_filter(%s) {" % self.interface) l = len(key) if l == 1: self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key)) self.emit("if (buf == NULL) return;") emit_next() self.demit("}", 2) return self.emit('static const UCHAR key[] = "%s";' % key) skip = dict() for i in range(l): skip[key[i]] = l-i self.emit("UCHARP tmp1, tmp2, end_ = end - %d;" % (l-1), 2) self.emiti("while (buf < end_) {") self.emiti( "if (*buf == %d /* %s */) {" % (ord(key[0]), Character.ascii(key[0]))) self.emit( "tmp1 = buf, tmp2 = (UCHARP)key;") self.emiti( "while (*(++tmp1) == *(++tmp2)){") self.emit( "if (tmp2 == key+%d) goto next;" % (l-1)) self.demit( "}") self.demit( "}") if self.table_lookup: self.emiti("static const void * tbl[256] = {[0 ... 255] &&any, %s};" % ", ".join("[%d] &&add%s" % (ord(c), s) for c, s in skip.iteritems())) self.emit("goto *tbl[buf[%d]];" % l) defun = [] for s in skip.itervalues(): if s in defun: continue defun.append(s) self.emit("add%s: buf += %s; goto ends;" % (s, s)) self.emit("any: buf += %d; ends:;" % (l+1)) else: self.emiti( "switch(buf[%d]) {" % l) for k, v in skip.iteritems(): self.emiti( "case %d: /* %s */" % (ord(k), Character.ascii(k))) self.emit( "buf += %d; break;" % v), self.dedent() self.emiti("default: buf += %d;" % (l+1)), self.dedent() self.demit( "}") self.demit("}") self.emit( "return;") self.emit( "next:") emit_next() self.demit("}", 2) def emit_booster(self, min_len, chars): self.emiti("void booster(%s) {" % self.interface) self.emit( "UCHARP end_ = end - %d;" % (min_len-1)) self.emit( "if (buf > end_) return;") self.emiti( "do {") self.emiti( "switch (buf[%d]) {" % (min_len-1)) for c in chars: self.emit( "case %d: /* %s */" % (ord(c), Character.ascii(c))) self.emit( "goto ret;") self.demit( "}") self.demit( "} while((buf += %d) <= end_);" % min_len) self.emit( "ret: return %s(%s);" % (self.state_name(self.fa.start) , self.args)) self.demit("}", 2) def emit_driver(self): self.emiti("void matcher(%s) {" % self.interface) if self.filter: self.emit( "%s(%s);" % (self.filter + "_filter", self.args)) else: self.emit( "%s(%s);" % (self.state_name(self.fa.start), self.args)) self.emit( "return;") self.demit("}", 2) def emit_accept_state(self): self.emiti("void accept(%s) {" % self.interface) self.emit( "UCHARP ret = (UCHARP)memchr(buf, '\\n', (buf - end));") if self.skip_boost or self.filter: self.emit( "beg = get_line_beg(buf, beg);") self.emiti( "if (ret == NULL) {") self.emit( "print_line(beg, end);") self.emit( "return;") self.demit( "}") self.emit( "print_line(beg, ret);") self.emit( "beg = buf = ret + 1;") self.emit( "return %s(%s);" % (self.start, self.args)) self.demit("}", 2) def emit_reject_state(self): self.emiti("void reject(%s) {" % self.interface) self.emit( "if (buf >= end) return;") self.emit( "beg = buf;") self.emit( "return %s(%s);" % (self.start, self.args)) self.demit("}", 2) def emit_switch(self, case, default=None): if not case: if default: self.emit("return %s(%s);" % (default, self.args)) return self.emiti("switch(*buf++) {") for case, next_ in case.iteritems(): self.trans_stmt.emit(case, self.state_name(next_)) if default == self.state_name(self.fa.start) and self.skip_boost: self.emit("default: return booster(%s);" % self.args) else: self.emit("default: return %s(%s);" % (default, self.args)) self.demit("}") def emit_state(self, cur_state, transition): if self.filter_only: return self.emiti("void %s(%s) {" % (self.state_name(cur_state), self.interface)) if cur_state in self.fa.accepts: self.emit( "return accept(beg, buf-1, end);") self.demit("}", 2) return if transition.has_key(AnyChar()): default = self.state_name(transition.pop(AnyChar())) else: default = self.state_name(self.fa.start) if self.table_lookup and (cur_state == self.fa.start or \ self.state_name(cur_state) == default): if self.skip_boost and default == self.state_name(self.fa.start): default = "booster" tbl = dict() for eol in self.eols: tbl[eol.char] = "reject" for c, n in transition.iteritems(): tbl[c.char] = self.state_name(n) self.emit( "static void (*%s_table[256])(UCHARP, UCHARP, UCHARP) = {[0 ... 255] = (void*)%s, %s};" % (self.state_name(cur_state), default, ", ".join("[%d] = %s" % (i, s) for (i, s) in tbl.items()))) self.emit( "return %s_table[*buf++](%s);" % (self.state_name(cur_state), self.args)) self.demit("}", 2) return for eol in self.eols: transition[eol] = "reject" for input_ in transition.keys(): if isinstance(input_, SpecialInputNode): self.trans_stmt.emit(input_, self.state_name(transition.pop(input_))) self.emit_switch(transition, default) self.demit("}", 2) class _trans_stmt(ASTWalker): def __init__(self, emit): self._emit = emit self.args = "beg, buf, end" def emit(self, input_node, next_): self.next = next_ input_node.accept(self) def visit(self, input_node): self._emit("/* UNKNOW RULE */") self._emit("/* %s */" % input_node.__repr__()) def visit_Character(self, char): self._emit("case %d: /* %s */" % (char.char, char)) self._emit(" return %s(%s);" % (self.next, self.args)) # Special Rule def visit_BegLine(self, begline): self._emit("/* begin of line */") self._emit("if (buf == beg)") self._emit(" return %s(%s);" % (self.next, self.args), 2) def visit_Range(self, range): if isinstance(range.lower, MBCharacter) and not \ isinstance(range.upper, MBCharacter) or \ isinstance(range.upper, MBCharacter) and not \ isinstance(range.lower, MBCharacter): return if isinstance(range.lower, MBCharacter): self.visit(range) else: self._emit("if ('%s' <= *buf && *buf <= '%s')" % (range.lower.char, range.upper.char)) self._emit(" return %s(beg, buf+1, end);" % self.next, 2) def test(): import doctest doctest.testmod() if __name__ == '__main__': test()