view pyrect/regexp/parser.py @ 53:1f8c474ca8b3

bug fix: modify escape character parsing rule.
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Wed, 25 Aug 2010 20:50:52 +0900
parents c48284580d5a
children 5db856953793
line wrap: on
line source

#!/usr/bin/env python
#-*- encoding: utf-8 -*-

from ply import yacc
import os
import unicodedata
from pyrect.regexp.lexer import lex, tokens
from pyrect.regexp.ast import *

class Parser(object):
    r"""Parse a regular expression string into an AST.

    This class can parse from Regexp to AST.
    If you want to do something with the AST, modify the node classes.
    >>> parser = Parser()
    >>> ast = parser.parse('(AB|CD)*123')
    >>> print ast
    (((((('A'.'B')|('C'.'D')))*.'1').'2').'3')
    >>> ast = parser.parse('^\^(A|B)?C+$')
    >>> print ast
    ((((^.'^').(('A'|'B'))?).('C')+).$)

    Multi-byte characters are OK too!!
    >>> parser.parse('Aあ*い+う?B')
    Concat(Concat(Concat(Concat((Character:'A').(Star:('あ')*)).(Plus:('い')+)).(Qmark:('う')?)).(Character:'B'))
    >>> parser.parse('あい*う')
    Concat(Concat((MBCharacter:'あ').(Star:('い')*)).(MBCharacter:'う'))
    >>> parser.parse('[a-f123]')
    CharClass[(Range:'a'-'f'),(Character:'1'),(Character:'2'),(Character:'3')]
    >>> parser.parse('/\* *TODO')
    """
    # The docstring is raw because its examples contain backslashes (\^, \*)
    # that are not valid string escapes and must reach doctest unchanged.
    # NOTE(review): the last example lists no expected output, so doctest
    # will presumably report it as failing whenever parse() returns a node;
    # confirm and fill in the expected repr.

    # Directory that contains this module; handed to PLY as its output dir.
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))

    def __init__(self):
        """Build the PLY parser and attach the lexer.

        ``lex`` is the module-level object imported from
        ``pyrect.regexp.lexer``, so every Parser instance refers to the
        same lexer object.
        """
        self.yacc  = yacc.yacc(outputdir=self.BASE_DIR, debug=False)
        self.lexer = lex

    def parse(self, expression):
        """Parse *expression* and return the resulting AST.

        The result is also stored on ``self.ast``.
        """
        self.lexer.input(expression)
        self.ast = self.yacc.parse(lexer=self.lexer)
        return self.ast

"""Parse following language
----------------------------------------
regexp    -> regexp UNION branch | branch
branch    -> branch closure | closure
closure   -> closure STAR | closure QMARK | closure PLUS | atom
atom      -> LPAREN regexp RPAREN | RBRACKET charclass LBRACKET
           | RBRACKET CARET charclass LBRACKET
           | ANYCHAR | NORMALCHAR | DASH | CARET | DOLLAR | MBCHAR | ESCAPECHAR
charclass -> charclass cclass | cclass
cclass    -> cset DASH cset | cset
cset      -> NORMALCHAR | LPAREN | RPAREN | ANYCHAR | CARET | DOLLAR
           | PLUS | QMARK | STAR | DASH | MBCHAR

old parse rule
(A) expression -> subexpr EOF
(B) subexpr -> seq '|' subexpr | seq
(C) seq -> subseq | ''
(D) subseq -> star subseq | star
(E) star -> factor '*' | factor
(F) factor -> '(' subexpr ')' | CHARACTER

hairy...

/*from gnu-grep, The grammar understood by the parser is as follows. (dfa.c:1221)
   regexp:
     regexp OR branch
     branch

   branch:
     branch closure
     closure

   closure:
     closure QMARK
     closure STAR
     closure PLUS
     closure REPMN
     atom

   atom:
     <normal character>
     <multibyte character>
     ANYCHAR
     MBCSET
     CSET
     BACKREF
     BEGLINE
     ENDLINE
     BEGWORD
     ENDWORD
     LIMWORD
     NOTLIMWORD
     CRANGE
     LPAREN regexp RPAREN
     <empty>
     The parser builds a parse tree in postfix form in an array of tokens. */

     * For more detail on gnu-grep's grammar, see grep/src/dfa.h.
"""

# Parsing-Rule
def p_regexp1(p):
    'regexp : regexp UNION branch'
    # The docstring above is the grammar rule read by PLY; keep it verbatim.
    # p[1] and p[3] are the two alternatives, p[2] is the UNION token.
    left, right = p[1], p[3]
    p[0] = Union(left, right)

def p_regexp2(p):
    'regexp : branch'
    # A single branch is passed through unchanged as the regexp's value.
    branch = p[1]
    p[0] = branch

def p_branch1(p):
    'branch : branch closure'
    # Concatenation: the branch built so far followed by the next closure.
    prefix, following = p[1], p[2]
    p[0] = Concat(prefix, following)

def p_branch2(p):
    'branch : closure'
    # A single closure is passed through unchanged as the branch's value.
    closure = p[1]
    p[0] = closure

def p_closure1(p):
    'closure : closure STAR'
    # Wrap the operand in a Star node; the STAR token itself (p[2]) is unused.
    operand = p[1]
    p[0] = Star(operand)

def p_closure2(p):
    'closure : closure QMARK'
    # Wrap the operand in a Qmark node; the QMARK token itself (p[2]) is unused.
    operand = p[1]
    p[0] = Qmark(operand)

def p_closure3(p):
    'closure : closure PLUS'
    # Wrap the operand in a Plus node; the PLUS token itself (p[2]) is unused.
    operand = p[1]
    p[0] = Plus(operand)

def p_closure4(p):
    'closure : atom'
    # A bare atom is passed through unchanged as the closure's value.
    atom = p[1]
    p[0] = atom

def p_atom1(p):
    'atom : LPAREN regexp RPAREN'
    # Grouping: the value is the inner regexp; the parentheses are dropped.
    inner = p[2]
    p[0] = inner

def p_atom2(p):
    '''atom : NORMALCHAR
            | DASH'''
    # Outside a character class both token kinds become a Character node.
    literal = p[1]
    p[0] = Character(literal)

def p_atom3(p):
    'atom : ANYCHAR'
    # The token value is ignored; AnyChar takes no arguments here.
    node = AnyChar()
    p[0] = node

def p_atom4(p):
    '''atom : RBRACKET charclass LBRACKET
          | RBRACKET CARET charclass LBRACKET'''
    # In the second alternative p[2] is the CARET token and the class
    # members move to p[3]; in the first, p[2] holds the members directly.
    negated = p[2] == '^'
    if not negated:
        p[0] = CharClass(p[2])
        return
    p[0] = CharClass(p[3], inverse=True)

def p_atom5(p):
    'atom : CARET'
    # A CARET used as an atom becomes a BegLine node; the token value is ignored.
    node = BegLine()
    p[0] = node

def p_atom6(p):
    'atom : DOLLAR'
    # A DOLLAR used as an atom becomes an EndLine node; the token value is ignored.
    node = EndLine()
    p[0] = node

def p_atom7(p):
    'atom : MBCHAR'
    # Split the token value into its elements and chain them left to right:
    # Concat(Concat(Character(e0), Character(e1)), Character(e2)) ...
    # presumably the elements are the bytes of one multi-byte character;
    # verify against the lexer.
    units = p[1]
    node = Character(units[0])
    for unit in units[1:]:
        node = Concat(node, Character(unit))
    p[0] = node

def p_atom8(p):
    'atom : ESCAPECHAR'
    # The token value is wrapped as-is in a Character node.
    # presumably the lexer has already removed the backslash; verify there.
    escaped = p[1]
    p[0] = Character(escaped)

def p_charclass1(p):
    'charclass : charclass cclass'
    # Append the new member(s) to those collected so far; `+` builds a new
    # sequence and leaves both operands untouched.
    collected, extra = p[1], p[2]
    p[0] = collected + extra

def p_charclass2(p):
    'charclass : cclass'
    # A single cclass is passed through unchanged as the charclass value.
    members = p[1]
    p[0] = members

def p_cclass1(p):
    'cclass : cset'
    # Wrap the single member in a 1-tuple so charclass rules can join with `+`.
    member = p[1]
    p[0] = tuple([member])

def p_cclass2(p):
    'cclass : cset DASH cset'
    # Build a Range from the two endpoints (p[2] is the DASH token) and wrap
    # it in a 1-tuple, the same shape a single-member cclass has.
    low, high = p[1], p[3]
    span = Range(low, high)
    p[0] = (span,)

def p_cset1(p):
    '''cset : NORMALCHAR
            | LPAREN
            | RPAREN
            | ANYCHAR
            | CARET
            | DOLLAR
            | PLUS
            | QMARK
            | STAR
            | DASH'''
    # Inside a character class each of these tokens is taken literally:
    # its value is wrapped in a Character node.
    literal = p[1]
    p[0] = Character(literal)

def p_cset2(p):
    'cset : MBCHAR'
    # Chain the elements of the token value left to right into nested Concat
    # nodes, starting from a Character for the first element.
    # presumably the elements are the bytes of one multi-byte character;
    # verify against the lexer.
    sequence = p[1]
    leading, trailing = sequence[0], sequence[1:]
    chain = Character(leading)
    for element in trailing:
        chain = Concat(chain, Character(element))
    p[0] = chain

def p_error(p):
    # Error hook called by PLY with the offending token, or with None when
    # the input ended while more tokens were still expected.
    #
    # Always raises Exception. The message keeps the historical
    # "syntax error" prefix and adds the token value and position, so a bad
    # pattern can be located instead of producing a context-free failure.
    if p is None:
        raise Exception("syntax error: unexpected end of input")
    # getattr guards against token objects that lack one of the attributes.
    value = getattr(p, 'value', None)
    position = getattr(p, 'lexpos', '?')
    raise Exception(
        "syntax error: unexpected %r at position %s" % (value, position))

def test():
    """Run doctest.testmod() with its default arguments."""
    from doctest import testmod
    testmod()

# Run the doctests when this file is executed as a script.
if __name__ == "__main__":
    test()