Mercurial > hg > Members > shinya > pyrect
changeset 67:b02b321d0e06
implement bm_filter on mmap. but it's slower than dfa. ?;-(
author | Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Sun, 07 Nov 2010 01:48:19 +0900 |
parents | c657ca0f0b30 |
children | 56a997f2c121 |
files | pyrect/regexp/analyzer.py pyrect/translator/grep_translator.py pyrect/translator/template/grep.c |
diffstat | 3 files changed, 67 insertions(+), 31 deletions(-) [+] |
line wrap: on
line diff
--- a/pyrect/regexp/analyzer.py Sat Nov 06 00:55:01 2010 +0900 +++ b/pyrect/regexp/analyzer.py Sun Nov 07 01:48:19 2010 +0900 @@ -8,7 +8,7 @@ """ from pyrect.regexp.parser import Parser -from pyrect.regexp.ast import ASTWalker +from pyrect.regexp.ast import ASTWalker, Plus class Analyzer(ASTWalker): """ Extract with Visitor-Pattern. @@ -25,7 +25,10 @@ (inf, 1, []) >>> an.analyze(prs.parse('(plus)?(qmark)?')) (9, 0, []) + >>> an.analyze(prs.parse('\*+ \[\[')) + (inf, 1, []) """ + def __init__(self, ast=None): if ast: self.analyze(ast) @@ -65,7 +68,7 @@ def visit_Plus(self, plus): (_, m, k) = plus.op.accept(self) - return float("inf"), m, ["", k, ""] + return float("inf"), m, k + [""] + k def visit_Qmark(self, qmark): (m, _, _) = qmark.op.accept(self)
--- a/pyrect/translator/grep_translator.py Sat Nov 06 00:55:01 2010 +0900 +++ b/pyrect/translator/grep_translator.py Sun Nov 07 01:48:19 2010 +0900 @@ -27,6 +27,7 @@ self.thread_dfa = 1 self.thread_line = 1 self.filter = True + self.start = "matcher" self.interface = "UCHARP beg, UCHARP buf, UCHARP end" self.args = "beg, buf, end" @@ -40,6 +41,7 @@ def emit_initialization(self): self.emit("#include <stdio.h>") self.emit("#define GREP grep") + self.emit("#define UCHAR unsigned char") self.emit("#define UCHARP unsigned char *") self.emit("#include <stdlib.h>") self.emit("#include <sys/mman.h>") @@ -54,10 +56,11 @@ self.emit("void %s(%s);" % (self.state_name(state), self.interface)) self.emit('void accept(%s);' % self.interface) self.emit('void reject(%s);' % self.interface) - self.emit("void dfa(%s);" % self.interface, 2) + self.emit("void matcher(%s);" % self.interface, 2) - #if self.filter and self.regexp.must_words: - # self.emit_filter(self.regexp.must_words) + if self.filter and self.regexp.must_words: + self.emit_filter(self.regexp.must_words) + else: self.filter = False grepsource = open(self.BASE_DIR + "/template/grep.c") self.emit(grepsource.read()) @@ -71,17 +74,31 @@ key = reduce(longest, words) - if len(words) == 1: - if len(key) == self.regexp.min_len: - self.emit("#define MATCH (bm_filter(beg, buf, n-1))", 1) + if len(words) == 1 and len(key) == self.regexp.min_len: + filter_only = True else: - self.emit("#define (bm_filter(beg, buf, n-1) && DFA(beg, buf, n-1))", 1) + filter_only = False + filter_prefix = False - self.emit("#define FILTER bm_filter", 2) - self.emiti("int bm_filter(unsigned char* buf, int n) {") + def emit_next(): + self.emit("beg = memrchr(buf, '\\n', beg);") + if filter_only: + self.emit("accept(%s);" % self.args) + elif filter_prefix: + self.emit("buf -= %d;" % len(key)) + self.emit("dfa(%s);" % self.args) + else: + self.emit("buf = beg;") + self.emit("dfa(%s);" % self.args) + + self.emit("UCHARP memrchr(UCHARP p, int c, UCHARP beg);", 2) + + self.emiti("void bm_filter(%s) {" % self.interface) l = len(key) if l == 1: - self.emit(" return (strchr(buf, %d) != NULL)" % ord(key)) + self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key)) + self.emit("if (buf == NULL) return;") + emit_next() self.emitd("}", 2) return @@ -89,37 +106,46 @@ for i in range(l - 1): skip[ord(key[i])] = str(l-1-i) - self.emit('static unsigned char key[] = "%s";' % key) + self.emit('static UCHAR key[] = "%s";' % key) self.emiti( "static int skip[256] = {") for i in range(8): i = i * 32 self.emit(",".join(skip[i:i+32]) + ",") self.emitd( "};") - self.emit("int i = %d, j, k, len = %d;" % (l-1 ,l)) - self.emit("unsigned char c, tail = %d; //'%c'" % (ord(key[l-1]), key[l-1]), 2) - self.emiti("while (i < n) {") - self.emit( "c = buf[i];") - self.emiti( "if (c == tail) {") - self.emit( "j = len - 1; k = i;") - self.emiti( "while (key[--j] == buf[--k]) {") - self.emit( "if (j == 0) return 1;") + self.emit("UCHARP tmp; register UCHAR c;", 2) + self.emit("int i; buf += %d;" % (l-1)) + self.emiti("while (buf < end) {") + self.emiti( "if ((c = *buf) == %d /*'%c'*/) {" % (ord(key[l-1]), key[l-1])) + self.emit( "i = %d; tmp = buf;" % (l-1)) + self.emiti( "while (key[--i] == *(--tmp)) {") + self.emiti( "if (i == 0) {") + self.emit( "beg = memrchr(buf, '\\n', beg);") + self.emit( "goto next;") + self.emitd( "}") self.emitd( "}") self.emitd( "}") - self.emit( "i += skip[c];") + self.emit( "buf += skip[c];") self.emitd("}") - self.emit( "return 0;") + self.emit( "return;") + self.emit( "next:") + emit_next() self.emitd("}", 2) def emit_driver(self): - self.emiti("void dfa(%s) {" % self.interface) - self.emit( "%s(%s);" % (self.state_name(self.cg.start), self.args)) + self.emiti("void matcher(%s) {" % self.interface) + if self.filter: + self.emit( "%s(%s);" % ("bm_filter", self.args)) + else: + self.emit( "%s(%s);" % (self.state_name(self.cg.start), self.args)) self.emit( "return;") self.emitd("}") return def emit_accept_state(self): self.emiti("void accept(%s) {" % self.interface) + #self.emit( "printf(\"*beg = %c, *buf = %c, *end = %c; \\n\", *beg, *buf, *end);") + #self.emit( "printf(\"beg = %x, buf = %x, end = %x; \\n\", beg, buf, end);") self.emit( "buf--;") self.emit( "UCHARP ret = (UCHARP)memchr(buf, '\\n', (buf - end));") self.emit( 'if (ret == NULL) {fprintf(stderr, "memchr NULL err!"); exit(0);}') @@ -130,14 +156,14 @@ self.emitd( "}") self.emit( "print_line(beg, ret);") self.emit( "beg = buf = ret + 1;") - self.emit( "%s(%s);" % (self.state_name(self.cg.start), self.args)) + self.emit( "%s(%s);" % (self.start, self.args)) self.emitd("}", 2) def emit_reject_state(self): self.emiti("void reject(%s) {" % self.interface) self.emit( "if (buf >= end) return;") self.emit( "beg = buf;") - self.emit( "%s(%s);" % (self.state_name(self.cg.start), self.args)) + self.emit( "%s(%s);" % (self.start, self.args)) self.emitd("}", 2) def emit_switch(self, case, default=None):
--- a/pyrect/translator/template/grep.c Sat Nov 06 00:55:01 2010 +0900 +++ b/pyrect/translator/template/grep.c Sun Nov 07 01:48:19 2010 +0900 @@ -1,10 +1,17 @@ -void print_line(unsigned char *beg, unsigned char *end) { +UCHARP memrchr(UCHARP p, int c, UCHARP beg) { + while(p > beg) { + if ((*--p) == c) return p+1; + } + return beg; +} + +void print_line(UCHARP beg, UCHARP end) { fwrite(beg, sizeof(char), (end - beg + 1), stdout); } void grep(char *regexp, int fd, char *name) { caddr_t file_mmap; - unsigned char *buf, *end, *beg; + UCHARP buf, *end, *beg; off_t size; struct stat sb; @@ -21,10 +28,10 @@ exit(0); } - beg = buf = (unsigned char *) file_mmap; + beg = buf = (UCHARP) file_mmap; end = beg + size - 1; - dfa(beg, beg, end); + matcher(beg, beg, end); munmap(file_mmap, size); pthread_exit(NULL);