Mercurial > hg > Members > shinya > pyrect
view pyrect/regexp/parser.py @ 53:1f8c474ca8b3
bug fix: modify escape character parsing rule.
author | Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 25 Aug 2010 20:50:52 +0900 |
parents | c48284580d5a |
children | 5db856953793 |
line wrap: on
line source
#!/usr/bin/env python
#-*- encoding: utf-8 -*-
"""Regular-expression parser built on ply.yacc.

The module-level ``p_*`` functions below are the grammar productions.  ply
reads each production from the function's docstring, so those docstrings are
grammar text, not documentation, and must not be reworded.
"""

from ply import yacc
import os
import unicodedata
from pyrect.regexp.lexer import lex, tokens
from pyrect.regexp.ast import *


class Parser(object):
    r"""Parse a regular-expression string into an AST.

    The tree is built from the node classes used by the grammar rules in
    this module (Union, Concat, Star, ...).  If you want to do something
    with the AST, modify those node classes.

    >>> parser = Parser()
    >>> ast = parser.parse('(AB|CD)*123')
    >>> print ast
    (((((('A'.'B')|('C'.'D')))*.'1').'2').'3')
    >>> ast = parser.parse('^\^(A|B)?C+$')
    >>> print ast
    ((((^.'^').(('A'|'B'))?).('C')+).$)

    Multi-byte characters are OK too!!

    >>> parser.parse('Aあ*い+う?B')
    Concat(Concat(Concat(Concat((Character:'A').(Star:('あ')*)).(Plus:('い')+)).(Qmark:('う')?)).(Character:'B'))
    >>> parser.parse('あい*う')
    Concat(Concat((MBCharacter:'あ').(Star:('い')*)).(MBCharacter:'う'))
    >>> parser.parse('[a-f123]')
    CharClass[(Range:'a'-'f'),(Character:'1'),(Character:'2'),(Character:'3')]
    >>> parser.parse('/\* *TODO')
    """

    # NOTE(review): the docstring is raw because the doctest inputs contain
    # backslashes (``\^``, ``\*``) that must reach the parser verbatim.
    # NOTE(review): the multi-byte doctest expectations mention both
    # Character and MBCharacter nodes although p_atom7 only builds Character
    # nodes, and the last example has no expected output -- these doctests
    # look stale; confirm against the node classes' repr before relying on
    # them.

    # Directory containing this file; handed to ply as the output directory
    # for the files it generates.
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))

    def __init__(self):
        """Build the yacc parser from this module's rules and bind the lexer."""
        self.yacc = yacc.yacc(outputdir=self.BASE_DIR, debug=False)
        self.lexer = lex

    def parse(self, expression):
        """Parse *expression* and return the resulting AST.

        The result is also stored on ``self.ast``.  A syntax error raises
        the exception produced by ``p_error``.
        """
        # The lexer is fed explicitly, so yacc.parse() is called without an
        # input string and just pulls tokens from the lexer.
        self.lexer.input(expression)
        self.ast = self.yacc.parse(lexer=self.lexer)
        return self.ast


# Parse following language
# ----------------------------------------
# regexp    -> regexp UNION branch | branch
# branch    -> branch closure | closure
# closure   -> closure STAR | closure QMARK | closure PLUS | atom
# atom      -> LPAREN regexp RPAREN | LBRACKET charclass RBRACKET
#            | ANYCHAR | NORMALCHAR | CARET | DOLLAR | MBCHAR | ESCAPECHAR
# charclass -> charclass cclass | cclass
# cclass    -> cset DASH cset | cset
# cset      -> NORMALCHAR | LPAREN | RPAREN | ANYCHAR | CARET | DOLLAR
#            | PLUS | QMARK | STAR | DASH | MBCHAR
#
# old parse rule
# (A) expression -> subexpr EOF
# (B) subexpr    -> seq '|' subexpr | seq
# (C) seq        -> subseq | ''
# (D) subseq     -> star subseq | star
# (E) star       -> factor '*' | factor
# (F) factor     -> '(' subexpr ')' | CHARACTER
#
# hairy...
# /*from gnu-grep,
#   The grammar understood by the parser is as follows. (dfa.c:1221)
#
#   regexp:
#     regexp OR branch
#     branch
#
#   branch:
#     branch closure
#     closure
#
#   closure:
#     closure QMARK
#     closure STAR
#     closure PLUS
#     closure REPMN
#     atom
#
#   atom:
#     <normal character>
#     <multibyte character>
#     ANYCHAR
#     MBCSET
#     CSET
#     BACKREF
#     BEGLINE
#     ENDLINE
#     BEGWORD
#     ENDWORD
#     LIMWORD
#     NOTLIMWORD
#     CRANGE
#     LPAREN regexp RPAREN
#     <empty>
#
#   The parser builds a parse tree in postfix form in an array of tokens.
# */
# *and more detail for gnu-grep's grammar, see grep/src/dfa.h .


def _concat_bytes(mbchar):
    """Return a left-nested Concat chain, one Character per item of mbchar."""
    node = Character(mbchar[0])
    for byte in mbchar[1:]:
        node = Concat(node, Character(byte))
    return node


# Parsing-Rule
# The order of the p_* functions matters to ply: the first rule defines the
# start symbol (regexp).

# Alternation of two sub-expressions.
def p_regexp1(p):
    'regexp : regexp UNION branch'
    p[0] = Union(p[1], p[3])


def p_regexp2(p):
    'regexp : branch'
    p[0] = p[1]


# Juxtaposition of two sub-expressions means concatenation.
def p_branch1(p):
    'branch : branch closure'
    p[0] = Concat(p[1], p[2])


def p_branch2(p):
    'branch : closure'
    p[0] = p[1]


# Postfix repetition operators.
def p_closure1(p):
    'closure : closure STAR'
    p[0] = Star(p[1])


def p_closure2(p):
    'closure : closure QMARK'
    p[0] = Qmark(p[1])


def p_closure3(p):
    'closure : closure PLUS'
    p[0] = Plus(p[1])


def p_closure4(p):
    'closure : atom'
    p[0] = p[1]


# Parenthesised group: the result is the inner expression itself.
def p_atom1(p):
    'atom : LPAREN regexp RPAREN'
    p[0] = p[2]


# Outside a character class a DASH is an ordinary character.
def p_atom2(p):
    '''atom : NORMALCHAR
            | DASH'''
    p[0] = Character(p[1])


def p_atom3(p):
    'atom : ANYCHAR'
    p[0] = AnyChar()


# Character class, optionally inverted by a leading CARET.
# NOTE(review): the token order here is RBRACKET ... LBRACKET, the reverse
# of the "LBRACKET charclass RBRACKET" form in the grammar summary above.
# Whether that is intentional depends on how pyrect.regexp.lexer names the
# bracket tokens -- confirm before changing either side.
def p_atom4(p):
    '''atom : RBRACKET charclass LBRACKET
            | RBRACKET CARET charclass LBRACKET'''
    # In the first alternative p[2] is the charclass tuple, so this
    # comparison is only true for the CARET alternative.
    if p[2] == '^':
        p[0] = CharClass(p[3], inverse=True)
    else:
        p[0] = CharClass(p[2])


def p_atom5(p):
    'atom : CARET'
    p[0] = BegLine()


def p_atom6(p):
    'atom : DOLLAR'
    p[0] = EndLine()


# A multi-byte character becomes a concatenation of its items.
def p_atom7(p):
    'atom : MBCHAR'
    p[0] = _concat_bytes(p[1])


def p_atom8(p):
    'atom : ESCAPECHAR'
    p[0] = Character(p[1])


# A character class is accumulated as a tuple of members.
def p_charclass1(p):
    'charclass : charclass cclass'
    p[0] = p[1] + p[2]


def p_charclass2(p):
    'charclass : cclass'
    p[0] = p[1]


# Each member is wrapped in a 1-tuple so members can be joined with "+".
def p_cclass1(p):
    'cclass : cset'
    p[0] = (p[1],)


def p_cclass2(p):
    'cclass : cset DASH cset'
    p[0] = (Range(p[1], p[3]),)


# Inside a character class the operator tokens are plain characters.
def p_cset1(p):
    '''cset : NORMALCHAR
            | LPAREN
            | RPAREN
            | ANYCHAR
            | CARET
            | DOLLAR
            | PLUS
            | QMARK
            | STAR
            | DASH'''
    p[0] = Character(p[1])


def p_cset2(p):
    'cset : MBCHAR'
    p[0] = _concat_bytes(p[1])


# Error hook called by ply with the offending token, or with None when the
# input ended before the expression was complete.  Always raises Exception
# with a message starting with "syntax error".
def p_error(p):
    if p is None:
        raise Exception("syntax error: unexpected end of expression")
    # lexpos is read defensively in case the lexer's tokens do not carry it.
    position = getattr(p, "lexpos", "?")
    raise Exception("syntax error: unexpected %s %r at position %s"
                    % (p.type, p.value, position))


def test():
    """Run the doctests embedded in this module."""
    import doctest
    doctest.testmod()


if __name__ == "__main__":
    test()