Mercurial > hg > Members > shinya > pyrect
changeset 49:7f4221018adf
accept UTF-8 encoding. but some foundational bug in converting algorithm NFA. maybe, which is not too difficult.
author | Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 09 Aug 2010 04:34:13 +0900 |
parents | dd2c7e815346 |
children | d1afae06e776 |
files | pyrect/regexp/ast.py pyrect/regexp/lexer.py pyrect/regexp/parser.py pyrect/translator/c_translator.py pyrect/translator/grep_translator.py pyrect/translator/template/grep.c |
diffstat | 6 files changed, 121 insertions(+), 148 deletions(-) [+] |
line wrap: on
line diff
--- a/pyrect/regexp/ast.py Sun Aug 08 04:14:10 2010 +0900 +++ b/pyrect/regexp/ast.py Mon Aug 09 04:34:13 2010 +0900 @@ -121,10 +121,8 @@ class MBCharacter(Character): def __init__(self, mbchar): - Character.__init__(self, unicode(mbchar)) - - def __str__(self): - return "'" + self.char.encode('utf-8') + "'" + Character.__init__(self, mbchar) + self.bytes = map(ord, str(mbchar)) class EscapeCharacter(Character): def __init__(self, char):
--- a/pyrect/regexp/lexer.py Sun Aug 08 04:14:10 2010 +0900 +++ b/pyrect/regexp/lexer.py Mon Aug 09 04:34:13 2010 +0900 @@ -80,7 +80,8 @@ return t def t_MBCHAR(t): - ur'[^ -~]+' # match multi byte code. + # match multi byte code. -> see http://ja.wikipedia.org/wiki/UTF-8 + u'(\xc2-\xdf].|[\xe0-\xef]..|[\xe0-\xef]..|[\xf0-\xf7]...|[\xf8-\xfb]....|[\xfc-\xfd].....)' return t def t_error(t):
--- a/pyrect/regexp/parser.py Sun Aug 08 04:14:10 2010 +0900 +++ b/pyrect/regexp/parser.py Mon Aug 09 04:34:13 2010 +0900 @@ -20,33 +20,19 @@ ((((^.'^').(('A'|'B'))?).('C')+).$) multi byte も OK!! - >>> parser.parse('aあいc?') - Concat(Concat((Character:'a').Concat((MBCharacter:'あ').(MBCharacter:'い'))).(Qmark:('c')?)) - >>> parser.parse('[123A-Zあ]') - CharClass[(Character:'1'),(Character:'2'),(Range:'Z'-'A'),(Character:'3'),(MBCharacter:'あ')] + >>> parser.parse('Aあ*い+う?B') + Concat(Concat(Concat(Concat((Character:'A').(Star:('あ')*)).(Plus:('い')+)).(Qmark:('う')?)).(Character:'B')) + >>> parser.parse('あい*う') + Concat(Concat((MBCharacter:'あ').(Star:('い')*)).(MBCharacter:'う')) """ BASE_DIR = os.path.dirname(os.path.abspath(__file__)) - @staticmethod - def mbparse(uni_bytes): - mbchars = list() - - for mbchar in uni_bytes: - mbchars.append(MBCharacter(mbchar)) - - ret = mbchars[0] - - for mbchar in mbchars[1:]: - ret = Concat(ret, mbchar) - - return ret - def __init__(self): self.yacc = yacc.yacc(outputdir=self.BASE_DIR, debug=False) self.lexer = lex def parse(self, expression): - self.lexer.input(unicode(expression, 'utf-8')) + self.lexer.input(expression) self.ast = self.yacc.parse(lexer=self.lexer) return self.ast @@ -173,7 +159,7 @@ def p_atom7(p): 'atom : MBCHAR' - p[0] = Parser.mbparse(p[1]) + p[0] = MBCharacter(p[1]) def p_atom8(p): 'atom : ESCAPECHAR' @@ -188,13 +174,13 @@ p[0] = p[1] def p_cclass1(p): + 'cclass : cset' + p[0] = frozenset([p[1]]) + +def p_cclass2(p): 'cclass : cset DASH cset' p[0] = frozenset([Range(p[1], p[3])]) -def p_cclass2(p): - 'cclass : cset' - p[0] = frozenset([p[1]]) - def p_cset1(p): '''cset : NORMALCHAR | LPAREN
--- a/pyrect/translator/c_translator.py Sun Aug 08 04:14:10 2010 +0900 +++ b/pyrect/translator/c_translator.py Mon Aug 09 04:34:13 2010 +0900 @@ -20,7 +20,7 @@ if fa == "DFA": self.cg = regexp.dfacg self.debug = False - self.special_type = (Range, Anchor) + self.special_rule = (Range, BegLine, MBCharacter) self.trans_stmt = self._trans_stmt(self.emit) def state_name(self, name): @@ -30,24 +30,39 @@ return "state_"+str(name) def emit_accept_state(self): - self.emiti("void accept(char* s) {") - self.emit( r'printf("string matches regexp. \n\n");') + self.emiti("int accept(unsigned char* s) {") + self.emit( "return 1;") self.emitd("}", 2) def emit_reject_state(self): - self.emiti("void reject(char* s) {") - self.emit( r'printf("string matches regexp. \n\n");') + self.emiti("int reject(unsigned char* s) {") + self.emit( "return 0;") self.emitd("}", 2) + def emit_skip(self): + self.emiti("const char skip_tbl[256] = {") + self.emit("1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,") + self.emit("1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,") + self.emit("1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,") + self.emit("1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,") + self.emit("1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,") + self.emit("1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,") + self.emit("2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,") + self.emit("3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,") + self.emit("};") + self.emitd("#define SKIP(s) ((s) + skip_tbl[*(unsigned char *)s])", 2) + #self.emitd("#define SKIP(s) s+1", 2) + def emit_driver(self): - self.emiti("int main(int argc, char* argv[]) {") + self.emiti("int main(int argc, unsigned char* argv[]) {") self.emit( 'buf = argv[1];') - self.emit( 'puts("regexp: %s");"' % self.regexp.regexp) + self.emit( 'puts("regexp: %s");' % self.regexp.regexp) self.emit( 'puts("number of state: %d");' % len(self.cg.states)) - self.emit( r'printf(\"string: %s\n\", argv[1]);') - self.emit( "%s(argv[1]);" % self.state_name(self.cg.start)) - if self.cg.type == "NFA": - self.emit("reject(argv[1]);") + self.emit( r'printf("string: %s\n", argv[1]);') + self.emit0( "if (%s(argv[1]))" % self.state_name(self.cg.start)) + self.emit( r'printf("accept: regexp matches string. \n\n");') + self.emit0( "else ") + self.emit( r'printf("reject: regexp not matches string. \n\n");') self.emit( "return 0;") self.emitd("}", 2) @@ -73,13 +88,13 @@ string = string[2:] if len(string) == 1: ptr = "'%s'" % string[0] - cmp_stmt.append(("char *", offset, ptr)) + cmp_stmt.append(("unsigned char *", offset, ptr)) offset += 1 self.emit() self.emit0("if (") for stmt in cmp_stmt: - self.emit0("*(%s)((char *)s+%d) == %s" % stmt) + self.emit0("*(%s)((unsigned char *)s+%d) == %s" % stmt) if stmt != cmp_stmt[-1]: self.emit(" && ") self.emiti(")") @@ -99,12 +114,16 @@ self.emitd() def emit_strcmp3(self, string, next): - self.emit('static char* string = \"%s\";' % string) + self.emit('static unsigned char* string = \"%s\";' % string) self.emiti("if (memcmp(string, s, %d) == 0)" % len(string)) self.emit("return %s(s+%d);" % (self.state_name(next), len(string))) self.emitd() def emit_switch(self, case, default=None): + if not case: + if default: + self.emit("return %s(s);" % default) + return self.emiti("switch(*s++) {") for case, next_ in case.iteritems(): self.trans_stmt.emit(case, self.state_name(next_)) @@ -113,40 +132,47 @@ self.emitd("}") def emit_state(self, cur_state, transition): - self.emiti("int %s(char* s) {" % self.state_name(cur_state)) + self.emiti("int %s(unsigned char* s) {" % self.state_name(cur_state)) if self.debug: self.emit(r'printf("state: %s, input: %%s\n", s);' % cur_state) if self.cg.type == "NFA": + default = None if '' in transition: epsilon_transition = transition.pop('') for n in epsilon_transition: self.emit("\t%s%s(s);\n" % (self.callType, self.state_name(n))) + else: + default = "reject" + + any_ = None + + for input_ in transition.keys(): + if type(input_) in self.special_rule: + self.trans_stmt.emit(input_, self.state_name(transition.pop(input_))) + elif type(input_) is AnyChar: + any_ = (input_, self.state_name(transition.pop(input_))) + default = None if cur_state in self.cg.accepts: eol = Character(r'\0') transition[eol] = "accept" - for input_ in transition.keys(): - for special in self.special_type: - if isinstance(input_, special): - self.trans_stmt.emit(input_, self.state_name(transition.pop(input_))) + self.emit_switch(transition, default) - if transition: - if self.cg.type == "DFA": - self.emit_switch(transition, default="reject") - else: - self.emit_switch(transition) + if any_: + self.trans_stmt.emit(any_[0], any_[1]) self.emitd("}", 2) def emit_initialization(self): self.emit("#include <stdio.h>") for state in self.cg.map.iterkeys(): - self.emit("int %s(char* s);" % self.state_name(state)) - self.emit('int accept(char* s);') - self.emit('int reject(char* s);') - self.emit('char* buf;', 2) + self.emit("int %s(unsigned char* s);" % self.state_name(state)) + self.emit('int accept(unsigned char* s);') + self.emit('int reject(unsigned char* s);') + self.emit('unsigned char* buf;') + self.emit_skip() def emit_from_callgraph(self): # self.emit C-source code @@ -171,22 +197,27 @@ self._emit("/* UNKNOW RULE */") self._emit("/* %s */" % input_node.__repr__()) - def visit_MBCharacter(self, mbchar): - self.visit(mbchar) - def visit_Character(self, char): self._emit("case '%s':" % char.char) self._emit(" return %s(s);" % self.next) - def visit_BegLine(self, begline): - self._emit("if (s == buf)") - self._emit(" return %s(s);" % self.next) - def visit_EndLine(self, endline): - self._emit(r"if (*s == '\0')") + self._emit(r"case '\0':") self._emit(" return %s(s);" % self.next) # Special Rule + + def visit_MBCharacter(self, mbchar): + self._emit("/* match %s */" % mbchar) + bytes = mbchar.bytes + self._emit(" if(%s)" % \ + " && ".join(["*(s+%d) == 0x%x" % (d, x) for d, x in enumerate(bytes)])) + self._emit(" return %s(s+%d);" % (self.next, len(bytes)), 2) + + def visit_BegLine(self, begline): + self._emit("if (s == buf)") + self._emit(" return %s(s);" % self.next, 2) + def visit_Range(self, range): if isinstance(range.lower, MBCharacter) and not \ isinstance(range.upper, MBCharacter) or \ @@ -197,8 +228,13 @@ if isinstance(range.lower, MBCharacter): self.visit(range) else: - self._emit("if ('%s' <= *s && *s =< '%s')" % (range.lower.char, range.upper.char)) - self._emit(" return %s(s+1);" % self.next) + self._emit("if ('%s' <= *s && *s <= '%s')" % (range.lower.char, range.upper.char)) + self._emit(" return %s(s+1);" % self.next, 2) + + def visit_AnyChar(self, anychar): + self._emit(r"if (*s != '\0')") + self._emit( "return %s(SKIP(s));" % self.next, 2) + self._emit("return reject(s);") def test(): import doctest
--- a/pyrect/translator/grep_translator.py Sun Aug 08 04:14:10 2010 +0900 +++ b/pyrect/translator/grep_translator.py Mon Aug 09 04:34:13 2010 +0900 @@ -1,6 +1,6 @@ #!/usr/bin/env python -import copy +import os from c_translator import CTranslator from pyrect.regexp import Regexp @@ -18,12 +18,10 @@ >>> tje.translate() """ + BASE_DIR = os.path.dirname(os.path.abspath(__file__)) + def __init__(self, regexp): - CTranslator.__init__(self, regexp) - self.funType = 'int ' - self.callType = 'return ' - self.breakStatement = '' - self.begline = False + CTranslator.__init__(self, regexp, fa="DFA") self.__bufsize = 1024 def getbufsize(self,): @@ -33,83 +31,36 @@ bufsize = property(getbufsize, setbufsize) - def emit_accept_state(self): - self.emit (""" -%saccept(char* s) { -\treturn 1; -}\n""" % self.funType) - - def emit_reject_state(self): - self.emit (""" -%sreject(char* s) { -\treturn 0; -}\n""" % self.funType) - def emit_initialization(self): - self.emit("#include <stdio.h>\n") - self.emit("#include <stdlib.h>\n") - self.emit("#include <string.h>\n\n") - - self.emit("#define LINEBUFSIZE 1024\n") - self.emit("#define READBUFSIZE %d\n\n" % (self.bufsize)) - self.emit("char readbuf[%d];\n\n" % (self.bufsize)) - - self.emit("%sDFA(char* s);\n" % (self.funType)) - for state in self.cg.map.iterkeys(): - self.emit(self.funType + self.state_name(state) + "(char* s);\n") - self.emit(self.funType + 'accept(char* s);\n') - self.emit(self.funType + 'reject(char* s);\n') - grepsource = open("template/grep.c") + CTranslator.emit_initialization(self) + self.emit("#define LINEBUFSIZE 1024") + self.emit("#define READBUFSIZE %d" % (self.bufsize)) + self.emit("#include <stdlib.h>") + self.emit("#include <string.h>") + self.emit("char readbuf[%d];" % (self.bufsize)) + self.emit("int DFA(unsigned char* s);", 2) + grepsource = open(self.BASE_DIR + "/template/grep.c") self.emit(grepsource.read()) def emit_filter(self): pass - def emit_matcher(self): - self.emit("int match(char *regexp, char *text) {\n") - if self.begline: - self.emit("\t\treturn DFA(text);\n") - else: - self.emit(""" -\tdo { -\t\tif (DFA(text)) -\t\t\treturn 1; -\t} while (*text++ != '\\0'); -\treturn 0; -""") - self.emit("}\n\n") - def emit_driver(self): - self.emit(""" -int main(int argc, char* argv[]) { -\treturn grepmain(argc, argv); -}\n\n""") - self.emit_matcher() - self.emit(""" -%sDFA(char *s) { - return %s(s); -}\n\n""" % (self.funType, self.state_name(self.cg.start))) + self.emiti("int DFA(unsigned char *text) {") + self.emiti( "do {") + self.emiti( "if(%s(text))" % self.state_name(self.cg.start)) + self.emit( "return 1;") + self.emitd( r"} while (*text++ != '\0');") + self.emit("return 0;") + self.emitd("}", 2) def emit_state(self, cur_state, transition): - transition = copy.deepcopy(transition) - self.emit(self.funType + self.state_name(cur_state) + "(char* s) {\n") - strings = [x for x in transition.keys() if len(x) > 1] - if strings: - self.emit("\tchar *ls;\n") - for string in strings: - self.emit_strcmp2(string, transition.pop(string)) - if cur_state in self.cg.accepts: - self.emit("\treturn accept(s);\n") + self.emiti("int %s(unsigned char* s) {" % self.state_name(cur_state)) + self.emit( "return accept(s);") + self.emitd("}") else: - if transition: - if self.cg.type == "DFA": - self.emit_switch(transition, default="reject") - else: - self.emit_switch(transition) - else: - self.emit("\t return reject(s);\n") - self.emit("}\n\n") + CTranslator.emit_state(self, cur_state, transition) def test(): import doctest
--- a/pyrect/translator/template/grep.c Sun Aug 08 04:14:10 2010 +0900 +++ b/pyrect/translator/template/grep.c Mon Aug 09 04:34:13 2010 +0900 @@ -1,22 +1,23 @@ -int grep(char * regexp, FILE *f, char *name) { +int grep(char *regexp, FILE *f, char *name) { int n, nmatch; - char buf[LINEBUFSIZE]; + char lbuf[LINEBUFSIZE]; + buf = (unsigned char *)lbuf; nmatch = 0; - while (fgets(buf, sizeof buf, f) != NULL) { - n = strlen(buf); + while (fgets(lbuf, sizeof lbuf, f) != NULL) { + n = strlen(lbuf); if (n > 0 && buf[n-1] == '\n') - buf[n-1] = '\0'; - if (match(regexp, buf)) { + lbuf[n-1] = '\0'; + if (DFA(buf)) { nmatch++; if (name != NULL) printf("%s:", name); - printf("%s\n", buf); + printf("%s\n", lbuf); } } return nmatch; } -int grepmain(int argc, char* argv[]) { +int main(int argc, char* argv[]) { int i, nmatch; FILE *f; @@ -27,7 +28,7 @@ nmatch = 0; if (argc == 2) { if (grep(argv[1], stdin, NULL)) - nmatch; + nmatch++; } else { for (i = 2; i < argc; i++) { f = fopen(argv[i], "r");