Mercurial > hg > Members > shinya > pyrect
changeset 59:fd3d0b8326fe fgest stable
implement regexp-syntax any-char ('.').
author | Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Thu, 04 Nov 2010 17:06:44 +0900 |
parents | 81337db23999 |
children | 5ab54a732ddb |
files | pyrect/jitgrep.py pyrect/regexp/ast.py pyrect/regexp/dfa_translator.py pyrect/regexp/nfa_translator.py pyrect/translator/grep_translator.py pyrect/translator/template/grep.c |
diffstat | 6 files changed, 70 insertions(+), 29 deletions(-) [+] |
line wrap: on
line diff
--- a/pyrect/jitgrep.py Mon Nov 01 14:50:52 2010 +0900 +++ b/pyrect/jitgrep.py Thu Nov 04 17:06:44 2010 +0900 @@ -70,7 +70,7 @@ else: grept = GREPTranslator(reg) grept.filter = not opts.no_filter - grept.thread_num = int(opts.thread) + grept.thread_line = int(opts.thread) grept.bufsize = bufsize if opts.dump:
--- a/pyrect/regexp/ast.py Mon Nov 01 14:50:52 2010 +0900 +++ b/pyrect/regexp/ast.py Thu Nov 04 17:06:44 2010 +0900 @@ -102,7 +102,7 @@ return FixedString(self, other) def __hash__(self): - return id(self) + return id(self.__str__()) def __cmp__(self, other): if self.__hash__() == other.__hash__():
--- a/pyrect/regexp/dfa_translator.py Mon Nov 01 14:50:52 2010 +0900 +++ b/pyrect/regexp/dfa_translator.py Thu Nov 04 17:06:44 2010 +0900 @@ -2,7 +2,7 @@ #-*- encoding: utf-8 -*- from pyrect.regexp.parser import Parser -from pyrect.regexp.ast import ASTWalker +from pyrect.regexp.ast import ASTWalker, AnyChar from pyrect.regexp.nfa import NFA from pyrect.regexp.nfa_translator import NFATranslator from pyrect.regexp.dfa import DFA @@ -40,13 +40,20 @@ while que: states = que.pop() + anys = set() + for state in states: + for k, v in nfa.map.iteritems(): + if state == k[0] and k[1] == AnyChar(): + anys.update(nfa.epsilon_expand(v)) for state in states: for k, v in nfa.map.iteritems(): if state == k[0] and k[1] != '': - slot = map_.setdefault((states, k[1]), set()) + default = set(anys) + slot = map_.setdefault((states, k[1]), default) slot.update(nfa.epsilon_expand(v)) + done.add(states) for v in map_.itervalues():
--- a/pyrect/regexp/nfa_translator.py Mon Nov 01 14:50:52 2010 +0900 +++ b/pyrect/regexp/nfa_translator.py Thu Nov 04 17:06:44 2010 +0900 @@ -63,6 +63,7 @@ frag2 = union.op2.accept(self) frag = frag1 | frag2 + s = self.state_id frag.connect(s, '', frag1.start) frag.connect(s, '', frag2.start)
--- a/pyrect/translator/grep_translator.py Mon Nov 01 14:50:52 2010 +0900 +++ b/pyrect/translator/grep_translator.py Thu Nov 04 17:06:44 2010 +0900 @@ -23,8 +23,8 @@ def __init__(self, regexp): CTranslator.__init__(self, regexp, fa="DFA") self.__bufsize = 1024 * 1024 - self.parallel_match = False - self.thread_num = 0 + self.thread_dfa = 1 + self.thread_line = 1 self.filter = True def getbufsize(self,): @@ -36,21 +36,24 @@ def emit_initialization(self): CTranslator.emit_initialization(self) + if self.thread_dfa > 1 and self.regexp.max_len != float("inf"): + self.emit("#define DFA paradfa") + self.emit("#define THREAD_NUM %d" % self.thread_dfa) + self.emit("#define REG_MAX_LEN %d" % self.regexp.max_len) + else: + self.emit("#define DFA dfa") + self.emit("#define THREAD_NUM 1 // no threading") + self.emit("#define REG_MAX_LEN -1") - if self.thread_num > 1: - self.emit("#define GREP paragrep") - else: - self.emit("#define GREP grep") - + self.emit("#define GREP grep") self.emit("#define LINEBUFSIZE %d" % self.bufsize) self.emit("#define READBUFSIZE %d" % self.bufsize) - self.emit('#define THREAD_NUM %d' % self.thread_num) - self.emit('#define THREAD_BUF %d' % 3) self.emit('#include <pthread.h>') self.emit("#include <stdlib.h>") self.emit("#include <string.h>") self.emit("char readbuf[%d];" % (self.bufsize)) - self.emit("int DFA(unsigned char* s);", 2) + self.emit("int dfa(unsigned char* s, int len);", 2) + self.emit("int paradfa(unsigned char* s, int len);", 2) if self.filter and self.regexp.must_words: self.emit_filter(self.regexp.must_words) @@ -73,7 +76,8 @@ else: self.emit("#define WITH_FILTER 1", 1) - self.emiti("int FILTER(unsigned char* text, int n) {") + self.emit("#define FILTER bm_filter", 2) + self.emiti("int bm_filter(unsigned char* text, int n) {") l = len(key) if l == 1: self.emit(" return (strchr(text, %d) != NULL)" % ord(key)) @@ -107,13 +111,14 @@ self.emitd("}", 2) def emit_driver(self): - self.emiti("int DFA(unsigned char *text) {") - self.emiti( "do {") - self.emiti( "if(%s(text))" % self.state_name(self.cg.start)) - self.emit( "return 1;") - self.emitd( r"} while (*text++ != '\0');") - self.emitd("return 0;") - self.emitd("}", 2) + self.emiti("int dfa(unsigned char *text, int len) {") + self.emit( "len++; //try match for n+1 times.") + self.emiti( "while (len--) {") + self.emit( "if (%s(text++)) return 1;" % self.state_name(self.cg.start)) + self.emitd( "}") + self.emit( "return 0;") + self.emitd("}") + return def emit_state(self, cur_state, transition): if cur_state in self.cg.accepts:
--- a/pyrect/translator/template/grep.c Mon Nov 01 14:50:52 2010 +0900 +++ b/pyrect/translator/template/grep.c Thu Nov 04 17:06:44 2010 +0900 @@ -1,14 +1,43 @@ typedef struct _thread_arg { unsigned char *buf; + int len; int match; } thread_arg_t; void* thread_dfa(void *arg) { thread_arg_t* targ = (thread_arg_t*)arg; - targ->match = DFA(targ->buf); + targ->match = DFA(targ->buf, targ->len); return NULL; } +int paradfa(unsigned char *text, int len) { + pthread_t hundle[THREAD_NUM]; + thread_arg_t targ[THREAD_NUM]; + + if (len + REG_MAX_LEN <= REG_MAX_LEN) + return dfa(text, len); + + int i, t_len = (len + THREAD_NUM - 1) / THREAD_NUM; + for (i = 0; i < THREAD_NUM; i++) { + targ[i].buf = text + (unsigned char)(i * t_len); + targ[i].len = t_len + REG_MAX_LEN; + } + targ[THREAD_NUM - 1].len = len - (t_len * (THREAD_NUM - 1)); + + for (i = 0; i < THREAD_NUM; i++) { + pthread_create(&hundle[i], NULL, (void *)thread_dfa, (void *)&targ[i]); + } + + for (i = 0; i < THREAD_NUM; i++) { + pthread_join(hundle[i], NULL); + } + + for (i = 0; i < THREAD_NUM; i++) { + if (targ[i].match) return 1; + } + return 0; +} + int paragrep(char *regexp, FILE *f, char *name) { int nmatch, used_buf = 0, reading = 1; @@ -25,13 +54,12 @@ n = strlen(lbuf[i]); if (n > 0 && lbuf[i][n-1] == '\n') lbuf[i][n-1] = '\0'; + targ[i].buf = (unsigned char *)lbuf[i]; + targ[i].len = n - 1; + pthread_create(&hundle[i], NULL, (void *)thread_dfa, (void *)&targ[i]); } } for (j = 0; j < i; j++) { - targ[j].buf = (unsigned char *)lbuf[j]; - pthread_create(&hundle[j], NULL, (void *)thread_dfa, (void *)&targ[j]); - } - for (j = 0; j < i; j++) { pthread_join(hundle[j], NULL); if (targ[j].match) { nmatch++; @@ -57,9 +85,9 @@ #ifdef FILTER_ONLY if (FILTER(buf, n-1)) { #elif defined(WITH_FILTER) - if (FILTER(buf, n-1) && DFA(buf)) { + if (FILTER(buf, n-1) && DFA(buf, n-1)) { #else - if (DFA(buf)) { + if (DFA(buf, n-1)) { #endif nmatch++; if (name != NULL)