Mercurial > hg > Members > shinya > pyrect
view pyrect/translator/grep_translator.py @ 61:974ff97dd88a
modify I/O routine. use mmap. it's really faster than fgets ;-)
author | Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Fri, 05 Nov 2010 01:34:14 +0900 |
parents | fd3d0b8326fe |
children | c981dc66b258 bee3a64d6cbc b02b321d0e06 |
line wrap: on
line source
#!/usr/bin/env python import os from c_translator import CTranslator from pyrect.regexp import Regexp, Analyzer from pyrect.regexp.ast import ASTWalker, AnyChar, Character class GREPTranslateExeption(Exception): pass class GREPTranslator(CTranslator): """GREPTranslator This Class can translate form DFA into grep source-code. which based on (beautiful) mini-grep introduced \"The Practice of Programming\" written by Rob Pike & Brian W. Kernighan. (see template/grep.c) >>> string = \"(build|fndecl|gcc)\" >>> reg = Regexp(string) >>> tje = GREPTranslator(reg) >>> tje.translate() """ BASE_DIR = os.path.dirname(os.path.abspath(__file__)) def __init__(self, regexp): CTranslator.__init__(self, regexp, fa="DFA") self.__bufsize = 1024 * 1024 self.thread_dfa = 1 self.thread_line = 1 self.filter = True self.interface = "UCHARP beg, UCHARP buf, UCHARP end" self.args = "beg, buf, end" def getbufsize(self,): return self.__bufsize def setbufsize(self, bufsize): self.__bufsize = abs(bufsize) bufsize = property(getbufsize, setbufsize) def emit_initialization(self): self.emit("#include <stdio.h>") self.emit("#define GREP grep") self.emit("#define UCHARP unsigned char *") self.emit("#include <stdlib.h>") self.emit("#include <sys/mman.h>") self.emit("#include <sys/types.h>") self.emit("#include <sys/stat.h>") self.emit("#include <fcntl.h>") self.emit("#include <string.h>") self.emit_skip() for state in self.cg.map.iterkeys(): self.emit("void %s(%s);" % (self.state_name(state), self.interface)) self.emit('void accept(%s);' % self.interface) self.emit('void reject(%s);' % self.interface) self.emit("void dfa(%s);" % self.interface, 2) #if self.filter and self.regexp.must_words: # self.emit_filter(self.regexp.must_words) grepsource = open(self.BASE_DIR + "/template/grep.c") self.emit(grepsource.read()) def emit_filter(self, words): def longest(s1, s2): if len(s1) >= len(s2): return s1 else: return s2 key = reduce(longest, words) if len(words) == 1: if len(key) == self.regexp.min_len: self.emit("#define MATCH (bm_filter(beg, buf, n-1))", 1) else: self.emit("#define (bm_filter(beg, buf, n-1) && DFA(beg, buf, n-1))", 1) self.emit("#define FILTER bm_filter", 2) self.emiti("int bm_filter(unsigned char* buf, int n) {") l = len(key) if l == 1: self.emit(" return (strchr(buf, %d) != NULL)" % ord(key)) self.emitd("}", 2) return skip = [str(l)] * 256 for i in range(l - 1): skip[ord(key[i])] = str(l-1-i) self.emit('static unsigned char key[] = "%s";' % key) self.emiti( "static int skip[256] = {") for i in range(8): i = i * 32 self.emit(",".join(skip[i:i+32]) + ",") self.emitd( "};") self.emit("int i = %d, j, k, len = %d;" % (l-1 ,l)) self.emit("unsigned char c, tail = %d; //'%c'" % (ord(key[l-1]), key[l-1]), 2) self.emiti("while (i < n) {") self.emit( "c = buf[i];") self.emiti( "if (c == tail) {") self.emit( "j = len - 1; k = i;") self.emiti( "while (key[--j] == buf[--k]) {") self.emit( "if (j == 0) return 1;") self.emitd( "}") self.emitd( "}") self.emit( "i += skip[c];") self.emitd("}") self.emit( "return 0;") self.emitd("}", 2) def emit_driver(self): self.emiti("void dfa(%s) {" % self.interface) self.emit( "%s(%s);" % (self.state_name(self.cg.start), self.args)) self.emit( "return;") self.emitd("}") return def emit_accept_state(self): self.emiti("void accept(%s) {" % self.interface) self.emit( "buf--;") self.emit( "UCHARP ret = (UCHARP)memchr(buf, '\\n', (buf - end));") self.emit( 'if (ret == NULL) {fprintf(stderr, "memchr NULL err!"); exit(0);}') self.emiti( "if (ret > end) {") self.emit( "ret--;") self.emit( "print_line(beg, ret);") self.emit( "return;") self.emitd( "}") self.emit( "print_line(beg, ret);") self.emit( "beg = buf = ret + 1;") self.emit( "%s(%s);" % (self.state_name(self.cg.start), self.args)) self.emitd("}", 2) def emit_reject_state(self): self.emiti("void reject(%s) {" % self.interface) self.emit( "if (buf >= end) return;") self.emit( "beg = buf;") self.emit( "%s(%s);" % (self.state_name(self.cg.start), self.args)) self.emitd("}", 2) def emit_switch(self, case, default=None): if not case: if default: self.emit("return %s(%s);" % (default, self.args)) return self.emiti("switch(*buf++) {") for case, next_ in case.iteritems(): self.trans_stmt.emit(case, self.state_name(next_)) if default: if default == self.state_name(self.cg.start): self.emit("default: return %s(%s);" % (default, self.args)) self.emitd("}") def emit_state(self, cur_state, transition): self.emiti("void %s(%s) {" % (self.state_name(cur_state), self.interface)) if cur_state in self.cg.accepts: self.emit( "return accept(beg, buf-1, end);") self.emitd("}", 2) return default = self.state_name(self.cg.start) for eol in self.eols: transition[eol] = "reject" for input_ in transition.keys(): if type(input_) in self.special_rule: self.trans_stmt.emit(input_, self.state_name(transition.pop(input_))) elif type(input_) is AnyChar: default = self.state_name(transition.pop(input_)) self.emit_switch(transition, default) self.emitd("}", 2) class _trans_stmt(ASTWalker): def __init__(self, emit): self._emit = emit self.args = "beg, buf, end" def emit(self, input_node, next_): self.next = next_ input_node.accept(self) def visit(self, input_node): self._emit("/* UNKNOW RULE */") self._emit("/* %s */" % input_node.__repr__()) def visit_Character(self, char): self._emit("case %d: /* match %s */" % (char.char, char)) self._emit(" return %s(%s);" % (self.next, self.args)) # Special Rule def visit_BegLine(self, begline): self._emit("/* begin of line */") self._emit("if (buf == beg)") self._emit(" return %s(%s);" % (self.next, self.args), 2) def visit_Range(self, range): if isinstance(range.lower, MBCharacter) and not \ isinstance(range.upper, MBCharacter) or \ isinstance(range.upper, MBCharacter) and not \ isinstance(range.lower, MBCharacter): return if isinstance(range.lower, MBCharacter): self.visit(range) else: self._emit("if ('%s' <= *buf && *buf <= '%s')" % (range.lower.char, range.upper.char)) self._emit(" return %s(beg, buf+1, end);" % self.next, 2) def test(): import doctest doctest.testmod() if __name__ == '__main__': test()