Mercurial > hg > Members > nobuyasu > test
view pyrect/pyrect/regexp/analyzer.py @ 9:493c96d030c0
add pyrect
author | nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 14 Jun 2011 17:24:03 +0900 (2011-06-14) |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python """ Extract Keywords from AST. Keywords, which are necessary words to be accepted with Regular-Expression. and which are used to Fixed-String-Filtering (ex: Boyer-Moore). kwset is also used in GNU-GREP. """ from pyrect.regexp.parser import Parser from pyrect.regexp.ast import ASTWalker, Plus class Analyzer(ASTWalker): """ Extract with Visitor-Pattern. AST (ast), is represented by Node-Tree. >>> prs = Parser() >>> an = Analyzer() >>> an.analyze(prs.parse('fixed-string')) (12, 12, ['fixed-string']) >>> an.analyze(prs.parse('(build|fndecl|gcc)')) (6, 3, []) >>> an.analyze(prs.parse('123(AB|CD)*456')) (inf, 6, ['123', '456']) >>> an.analyze(prs.parse('((12)+|3)|456')) (inf, 1, []) >>> an.analyze(prs.parse('^(plus)?(qmark)?')) (9, 0, []) >>> an.analyze(prs.parse('\*+ \[\[')) (inf, 4, ['* [[']) """ def __init__(self, ast=None): if ast: self.analyze(ast) else: self.max_len = 0 self.min_len = 0 def analyze(self, ast=None): if ast: self.max_len, self.min_len, self.must_words = ast.accept(self) self.must_words = [x for x in self.must_words if x != ""] return self.max_len, self.min_len, self.must_words def visit(self, ast): """Following Classes contain no-Keywords. Union, Star """ return 1, 1, [str(ast)] def visit_Character(self, ast): return 1, 1, [chr(ast.char)] def concat(self, (max1, min1, key1), (max2, min2, key2)): return max1 + max2, min1 + min2, key1[0:-1] \ + ([key1[-1] + key2[0]]) + key2[1:] def union(self, (max1, min1, _), (max2, min2, __)): return max(max1, max2), min(min1, min2), ["", ""] def visit_Star(self, star): return float("inf"), 0, ["", ""] def visit_Plus(self, plus): (_, m, k) = plus.op.accept(self) return float("inf"), m, k + [""] + k def visit_Qmark(self, qmark): (m, _, _) = qmark.op.accept(self) return m, 0, ["", ""] def visit_CharClass(self, cclass): return 1, 1, ["", ""] def test(): import doctest doctest.testmod() if __name__ == "__main__": test()