Mercurial > hg > Members > shinya > pyrect
changeset 39:43b277a00905
add Lexer/Parser/AST classes. It can parse a Regexp into an AST. (Pyrect will shift to a more flexible/robust system.)
author | Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 13 Jul 2010 07:53:28 +0900 |
parents | 06826250198b |
children | 962ae4154724 |
files | pyrect/llgrep.py pyrect/regexp/__init__.py pyrect/regexp/ast.py pyrect/regexp/kwset.py pyrect/regexp/lexer.py pyrect/regexp/parser.py |
diffstat | 6 files changed, 306 insertions(+), 5 deletions(-) [+] |
line wrap: on
line diff
--- a/pyrect/llgrep.py Mon Jul 12 06:24:57 2010 +0900 +++ b/pyrect/llgrep.py Tue Jul 13 07:53:28 2010 +0900 @@ -14,8 +14,6 @@ psr = OptionParser(usage=myusage) redirect = "" - srcpath = "/tmp/jitgrep_dfa.c" - binpath = "/tmp/jitgrep" psr.add_option("-O", action="store_true", dest="optimize", default=False, help="Print compile/matching time.") psr.add_option("--time", action="store_true", dest="time", default=False, help="Print compile/matching time.") @@ -49,11 +47,12 @@ end_time = time.time() print("Compiling : " + str(end_time - start_time) + " Sec.") - if (opts.time): redirect = "> /dev/null" - if (opts.out): redirect = ">" + opts.out - + if (opts.time): sys.stdout = open("/dev/null", "r") + if (opts.out): sys.stdout = open(opts.out, "r") + sys.stderr = open("/dev/null", "r") if (opts.time): start_time = time.time() grept.execute() + sys.stdout = sys.__stdout__ if (opts.time): end_time = time.time() print("Matching : " + str(end_time - start_time) + " Sec.")
#!/usr/bin/env python
# pyrect/regexp/__init__.py
# BUG FIX: shebang was "#!/usr/env/bin python" (transposed path components).

"""Regexp package: exposes the Parser and the AST node classes."""

from parser import Parser
# NOTE(review): this changeset adds ast.py (which defines the Node classes)
# but no node.py -- confirm "node" is a pre-existing module and not a typo
# for "ast".
from node import *

# BUG FIX: __all__ must contain *names* (strings), not the objects
# themselves; the original listed the Parser class object.
__all__ = ["Parser", "node"]
#!/usr/bin/env python
# pyrect/regexp/ast.py

"""
General Node set.  The Parser builds an AST (composed of these Nodes)
from a Regexp.  Nodes are printable, and keyword fragments can be
collected from them (see kwset.py, which walks the tree via ASTWalker).
"""

class ASTWalker(object):
    """Base visitor.  Subclasses override visit and/or visit_<NodeClass>."""

    def visit(self, ast):
        # Default fallback when no visit_<NodeClass> method exists.
        pass

# AST-Nodes
class Node(object):
    """Abstract AST node: printable and visitable."""

    def __init__(self):
        pass

    def __str__(self):
        return str(self.__class__)

    def accept(self, visitor):
        """Visitor-pattern dispatch: call visitor.visit_<ClassName> if it
        exists, otherwise fall back to visitor.visit."""
        visit = "visit_%s" % self.__class__.__name__
        return getattr(visitor, visit, visitor.visit)(self)

class Character(Node):
    """A single literal character."""

    def __init__(self, char):
        self.char = char

    def __str__(self):
        return "'" + self.char + "'"

class Concat(Node):
    """Concatenation of two sub-expressions (op1 followed by op2)."""

    def __init__(self, op1, op2):
        self.op1 = op1
        self.op2 = op2

    def concat(self, key1, key2):
        """Join two keyword results.

        Each argument is either a plain string (an unbroken keyword) or a
        list of keyword fragments where an empty entry marks a "gap"
        (a sub-expression with no fixed text).  Returns the combined
        string or fragment list.
        """
        if isinstance(key1, str) and isinstance(key2, str):
            return key1 + key2
        elif isinstance(key1, str) and isinstance(key2, list):
            if key2[0]:
                key2[0] = key1 + key2[0]
            else:
                key2 = [key1] + key2
            return key2
        elif isinstance(key1, list) and isinstance(key2, str):
            if key1[-1]:
                key1[-1] = key1[-1] + key2
            else:
                key1 = key1 + [key2]
            return key1
        else:
            # BUG FIX: the original computed key1 + key2 here but never
            # returned it, so list+list inputs silently yielded None.
            return key1 + key2

    def __str__(self):
        return "(%s.%s)" % (self.op1, self.op2)

class Union(Node):
    """Alternation: op1 | op2."""

    def __init__(self, op1, op2):
        self.op1 = op1
        self.op2 = op2

    def __str__(self):
        return "(%s|%s)" % (self.op1, self.op2)

class Star(Node):
    """Kleene closure: op*."""

    def __init__(self, op):
        self.op = op

    def __str__(self):
        return "(%s)*" % self.op
#!/usr/bin/env python
# pyrect/regexp/kwset.py

"""
Extract keywords from an AST.  Keywords are words that every string
accepted by the regular expression must contain; they drive
fixed-string filtering (e.g. Boyer-Moore).  GNU grep uses the same
"kwset" idea.
"""

from parser import Parser
from ast import ASTWalker

class KeywordsExtractor(ASTWalker):
    """Visitor that collects keyword fragments from a Node tree.

    The AST (ast) is a tree of Node objects; each visit_* method
    returns either a plain string (an unbroken keyword) or a list of
    fragments in which an empty entry marks a gap.
    """

    def __init__(self):
        self.keywords = []

    def extract_keywords(self, ast=None):
        """Walk *ast* (when given) and return its keyword list."""
        if ast:
            self.keywords = ast.accept(self)
        return self.keywords

    def visit(self, ast):
        """Fallback for node classes that carry no keywords
        (Union, Star): a single empty fragment."""
        return ['']

    def visit_Character(self, character):
        # A literal character is itself a (one-char) keyword string.
        return character.char

    def visit_Concat(self, concat):
        left = concat.op1.accept(self)
        right = concat.op2.accept(self)

        if isinstance(left, str) and isinstance(right, str):
            return left + right
        if isinstance(left, str) and isinstance(right, list):
            # Fuse the string onto the first fragment; an empty first
            # fragment marks a gap, so insert a new fragment instead.
            if right[0]:
                right[0] = left + right[0]
            else:
                right = [left] + right
            return right
        if isinstance(left, list) and isinstance(right, str):
            # Mirror case: fuse onto (or append after) the last fragment.
            if left[-1]:
                left[-1] = left[-1] + right
            else:
                left = left + [right]
            return left
        return left + right

def extract_keywords(ast):
    """Module-level convenience wrapper around KeywordsExtractor."""
    return KeywordsExtractor().extract_keywords(ast)

def test():
    """
    >>> prs = Parser()
    >>> kex = KeywordsExtractor()
    >>> kex.extract_keywords(prs.parse('(AB|CD)*123'))
    ['', '123']
    >>> kex.extract_keywords(prs.parse('WOOO*PS!!'))
    ['WOO', '', 'PS!!']
    >>> kex.extract_keywords(prs.parse('(build|fndecl|gcc)'))
    ['']
    """
    import doctest
    doctest.testmod()

if __name__ == "__main__": test()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/regexp/lexer.py Tue Jul 13 07:53:28 2010 +0900 @@ -0,0 +1,36 @@ +from ply import lex + +tokens = ( + 'UNION', + 'STAR', + 'LPAREN', + 'RPAREN', + 'ANYCHAR', + ) + +def t_UNION(t): + ur'\|' + return t + +def t_STAR(t): + ur'\*' + return t + +def t_LPAREN(t): + ur'\(' + return t + +def t_RPAREN(t): + ur'\)' + return t + +def t_ANYCHAR(t): + ur'\\?(.)' + t.value = t.value[-1:] + return t + +def t_error(t): + print "Illegal character '%s'" % t.value[0] + raise t + +lex.lex()
#!/usr/bin/env python
# pyrect/regexp/parser.py

from ply import yacc
from lexer import lex, tokens
from ast import *

class Parser(object):
    """Parser
    This class can parse from Regexp to AST.
    if you want something to do from AST, then modify Nodes.
    >>> parser = Parser()
    >>> ast = parser.parse('(AB|CD)*123')
    >>> print ast
    (((((('A'.'B')|('C'.'D')))*.'1').'2').'3')
    """

    def __init__(self):
        # Build the LALR tables once; reuse the module-level PLY lexer.
        self.yacc = yacc.yacc()
        self.lexer = lex

    def parse(self, expression):
        """Parse *expression* (a regexp string) and return the AST root."""
        self.lexer.input(expression)
        self.ast = self.yacc.parse(lexer=self.lexer)
        return self.ast

"""Parse following language
----------------------------------------
regexp -> regexp UNION branch | branch
branch -> branch closure | closure
closure -> closure STAR | atom
atom -> LPAREN regexp RPAREN | ANYCHAR

old parse rule
(A) expression -> subexpr EOF
(B) subexpr -> seq '|' subexpr | seq
(C) seq -> subseq | ''
(D) subseq -> star subseq | star
(E) star -> factor '*' | factor
(F) factor -> '(' subexpr ')' | CHARACTER

hairy...

/*from gnu-grep, The grammar understod by the parser is as follow. (dfa.c:1221)
  regexp:
    regexp OR branch
    branch

  branch:
    branch closure
    closure

  closure:
    closure QMARK
    closure STAR
    closure PLUS
    closure REPMN
    atom

  atom:
    <normal character>
    <multibyte character>
    ANYCHAR
    MBCSET
    CSET
    BACKREF
    BEGLINE
    ENDLINE
    BEGWORD
    ENDWORD
    LIMWORD
    NOTLIMWORD
    CRANGE
    LPAREN regexp RPAREN
    <empty>
  The parser builds a parse tree in postfix form in an array of tokens. */

 *and more detail for gnu-grep's grammar, see grep/src/dfa.h .
"""

# Parsing-Rule
def p_regexp1(p):
    'regexp : regexp UNION branch'
    p[0] = Union(p[1], p[3])

def p_regexp2(p):
    'regexp : branch'
    p[0] = p[1]

def p_branch1(p):
    'branch : branch closure'
    p[0] = Concat(p[1], p[2])

def p_branch2(p):
    'branch : closure'
    p[0] = p[1]

def p_closure1(p):
    'closure : closure STAR'
    p[0] = Star(p[1])

def p_closure2(p):
    'closure : atom'
    p[0] = p[1]

def p_atom1(p):
    'atom : LPAREN regexp RPAREN'
    p[0] = p[2]

def p_atom2(p):
    'atom : ANYCHAR'
    p[0] = Character(p[1])

def p_error(p):
    # IMPROVED: the original raised a bare Exception("syntax error") with
    # no context.  Include the offending token; PLY passes p=None when
    # the error is at end of input.  SyntaxError subclasses Exception,
    # so existing `except Exception` callers still work.
    if p is None:
        raise SyntaxError("syntax error: unexpected end of pattern")
    raise SyntaxError("syntax error at %r" % p.value)

def test():
    import doctest
    doctest.testmod()

if __name__ == "__main__": test()