Mercurial > hg > Members > shinya > pyrect
changeset 43:83c69d42faa8
replace converting-flow, module dfareg with module regexp. it's is substantial changing in implimentation.
author | Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 03 Aug 2010 05:35:38 +0900 |
parents | 84059ea1a2db |
children | b4cb7d72dde5 |
files | pyrect/__init__.py pyrect/c_translator.py pyrect/cbc_translator.py pyrect/cbcgrep_translator.py pyrect/converter.py pyrect/dfa_translator.py pyrect/dot_translator.py pyrect/goto_grep_translator.py pyrect/grep_translator.py pyrect/jitgrep.py pyrect/llgrep.py pyrect/llvm_bench.py pyrect/llvm_grep_translator.py pyrect/llvm_translator.py pyrect/regexp/__init__.py pyrect/regexp/callgraph.py pyrect/regexp/dfa.py pyrect/regexp/dfa_translator.py pyrect/regexp/fa.py pyrect/regexp/kwset.py pyrect/regexp/lexer.py pyrect/regexp/nfa.py pyrect/regexp/nfa_translator.py pyrect/regexp/parser.py pyrect/regexp/testall.py pyrect/translator.py |
diffstat | 25 files changed, 552 insertions(+), 150 deletions(-) [+] |
line wrap: on
line diff
--- a/pyrect/__init__.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/__init__.py Tue Aug 03 05:35:38 2010 +0900 @@ -1,5 +1,3 @@ #!/usr/bin/env python -from dfareg import Regexp -def compile(regexp): - return Regexp(regexp) +from regexp import Regexp
--- a/pyrect/c_translator.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/c_translator.py Tue Aug 03 05:35:38 2010 +0900 @@ -1,7 +1,7 @@ #!/Usr/bin/env python -from dfareg import Regexp, CallGraph -from translator import Translator +from pyrect.regexp import Regexp +from pyrect.translator import Translator class CTranslator(Translator): """CTranslator @@ -10,28 +10,22 @@ NFA: using stack, deepening depth-first search. >>> string = '(A|B)*C' >>> reg = Regexp(string) - >>> dfacg = CallGraph(reg.dfa) - >>> nfacg = CallGraph(reg.nfa) - >>> CTranslator(string, dfacg).translate() - >>> CTranslator(string, nfacg).translate() + >>> CTranslator(reg).translate() + >>> CTranslator(reg).translate() """ - def __init__(self, regexp, cg): - Translator.__init__(self, regexp, cg) + def __init__(self, regexp): + Translator.__init__(self, regexp) self.funType = 'void ' self.callType = '' self.breakStatement = '\t\t\tbreak;' self.debug = False self.eols = ('\\0', '\\n') - if self.cg.type == "DFA": - self.name_hash = self.create_name_hash() def modify_state_name(self, state_name): if state_name == "accept" or state_name == "reject": - return state_name - if self.cg.type == "DFA": - return "state_"+self.name_hash[state_name] + return str(state_name) else: - return "state_"+state_name + return "state_"+str(state_name) def emit_accept_state(self): self.emit ("""
--- a/pyrect/cbc_translator.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/cbc_translator.py Tue Aug 03 05:35:38 2010 +0900 @@ -1,7 +1,7 @@ #!/usr/bin/env python -from dfareg import Regexp, CallGraph -from c_translator import CTranslator +from pyrect.regexp import Regexp +from pyrect.c_translator import CTranslator class CbCTranslateExeption(Exception): pass @@ -11,15 +11,13 @@ CbCTranslator >>> string = \"(A|B)*C\" >>> reg = Regexp(string) - >>> dfacg = CallGraph(reg.dfa) - >>> ct = CbCTranslator(string, dfacg) + >>> ct = CbCTranslator(reg) >>> ct.translate() >>> ct.debug = True >>> ct.translate() """ - def __init__(self, regexp, cg): - if cg.type == "NFA": raise CbCTranslateExeption("can't translate CbC from NFA") - CTranslator.__init__(self, regexp, cg) + def __init__(self, regexp): + CTranslator.__init__(self, regexp) self.funType = '__code ' self.callType = 'goto ' self.breakStatement = ''
--- a/pyrect/cbcgrep_translator.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/cbcgrep_translator.py Tue Aug 03 05:35:38 2010 +0900 @@ -1,7 +1,7 @@ #!/usr/bin/env python -from grep_translator import GREPTranslator -from dfareg import Regexp, CallGraph +from pyrect.grep_translator import GREPTranslator +from pyrect.regexp import Regexp class CbCGREPTranslateExeption(Exception): pass @@ -13,14 +13,12 @@ written by Rob Pike & Brian W. Kernighan. (see template/grep.c) >>> string = \"(build|fndecl|gcc)\" >>> reg = Regexp(string) - >>> dfacg = CallGraph(reg.dfa) - >>> tje = GREPTranslator(string, dfacg) + >>> tje = GREPTranslator(reg) >>> tje.translate() """ - def __init__(self, regexp, cg): - if cg.type == "NFA": raise CbCGREPTranslateExeption("can't translate grep from NFA") - GREPTranslator.__init__(self, regexp, cg) + def __init__(self, regexp): + GREPTranslator.__init__(self, regexp) self.funType = '__code ' self.interface = "char *s, char* cur, char* buf, FILE *f, char* filename" self.args = "s, cur, buf, f, filename" @@ -29,7 +27,7 @@ self.print_file = False self.__bufsize = 1024 - def getbufsize(self,): + def getbufsize(self): return self.__bufsize def setbufsize(self, bufsize): self.__bufsize = abs(bufsize)
--- a/pyrect/converter.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/converter.py Tue Aug 03 05:35:38 2010 +0900 @@ -1,27 +1,25 @@ #!/usr/bin/env python import sys -from dfareg import Regexp, CallGraph -from c_translator import CTranslator -from cbc_translator import CbCTranslator -from dot_translator import DotTranslator -from llvm_translator import LLVMTranslator -from grep_translator import GREPTranslator +from pyrect.regexp import Regexp +from pyrect.c_translator import CTranslator +from pyrect.cbc_translator import CbCTranslator +from pyrect.dot_translator import DotTranslator +from pyrect.llvm_translator import LLVMTranslator from optparse import OptionParser def main(argv): myusage = "%prog [-C] regexp" psr = OptionParser(usage=myusage) + psr.add_option("--C", action="store_true", dest="emit_cbc", default=False, help="emit C-source (as default)") psr.add_option("--CbC", action="store_true", dest="emit_cbc", default=False, help="emit CbC-source") - psr.add_option("--grep", action="store_true", dest="emit_grep", default=False, help="emit grep-source") psr.add_option("--LLVM", action="store_true", dest="emit_llvm", default=False, help="emit LLVM-source") psr.add_option("--Dot", action="store_true", dest="emit_dot", default=False, help="emit Dot-source") psr.add_option("--from-dfa", action="store_true", dest="dfa", default=True, help="translate from DFA") psr.add_option("--from-nfa", action="store_true", dest="nfa", default=False, help="translate from NFA") psr.add_option("-o", action="store", type="string", dest="output", default=False, help="output file", metavar="FILE") - psr.add_option("-O", action="store", type="string", dest="optimize", default=False, help="do optimization (only in llvm).", metavar="FILE") + psr.add_option("-O", action="store_true", dest="optimize", default=False, help="do optimization (only in llvm).") psr.add_option("-g", action="store_true", dest="debug", default=False, help="embed debug info") - psr.add_option("-D", action="store_true", dest="emit_dot", default=False, help="emit Dot file") (opts, args) = psr.parse_args(sys.argv) if len(args) == 1: psr.print_help() @@ -33,24 +31,22 @@ output = open(opts.output,"w") if opts.nfa: - fa = reg.nfa + fa = "NFA" else: - fa = reg.dfa + fa = "DFA" if opts.emit_dot: - translator = DotTranslator(reg.regexp, CallGraph(fa)) + translator = DotTranslator(reg, fa) translator.debug = opts.debug elif opts.emit_llvm: - translator = LLVMTranslator(reg.regexp, CallGraph(fa)) + translator = LLVMTranslator(reg) translator.debug = opts.debug translator.optimize = opts.optimize - elif opts.emit_grep: - translator = GREPTranslator(reg.regexp, CallGraph(fa)) elif opts.emit_cbc: - translator = CbCTranslator(reg.regexp, CallGraph(fa)) + translator = CbCTranslator(reg) translator.debug = opts.debug else: - translator = CTranslator(reg.regexp, CallGraph(fa)) + translator = CTranslator(reg) translator.debug = opts.debug translator.translate(output)
--- a/pyrect/dfa_translator.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/dfa_translator.py Tue Aug 03 05:35:38 2010 +0900 @@ -1,11 +1,10 @@ #!/usr/bin/env python -from grep_translator import GREPTranslator -from dfareg import Regexp, CallGraph +from pyrect.grep_translator import GREPTranslator +from pyrect.regexp import Regexp -'''(build|fndecl|gcc)''' -class DFATranslator(GREPTranslator): - """DFATranslator +class GNUGREPTranslator(GREPTranslator): + """GNUGREPTranslator This class can translate from DFA into size_t DFA(char* s). which is entirely equivalent to dfaexec(..) in GNU-grep (see src/dfa.c). * but which is not work currently. (when search large-file, there is fewer @@ -13,13 +12,12 @@ * probably, there is some problem exists about buffering. >>> string = '(build|fndecl|gcc)' >>> reg = Regexp(string) - >>> dfacg = CallGraph(reg.dfa) - >>> tje = DFATranslator(string, dfacg) + >>> tje = GNUGREPTranslator(reg) >>> tje.translate() """ - def __init__(self, regexp, cg): - GREPTranslator.__init__(self, regexp, cg) + def __init__(self, regexp): + GREPTranslator.__init__(self, regexp) self.funType = 'size_t ' self.callType = 'return ' self.breakStatement = '' @@ -53,7 +51,7 @@ } } while (*s != '\\n' && *s++ != '\\0'); return (size_t) -1; -}\n\n""" % (self.regexp, self.funType, self.modify_state_name(self.cg.start))) +}\n\n""" % (self.regexp.regexp, self.funType, self.modify_state_name(self.cg.start))) def emit_state(self, cur_state, transition): self.emit(self.funType + self.modify_state_name(cur_state) + "(char* s) {\n")
--- a/pyrect/dot_translator.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/dot_translator.py Tue Aug 03 05:35:38 2010 +0900 @@ -1,8 +1,8 @@ #!/usr/bin/env python import re -from translator import Translator -from dfareg import CallGraph, Regexp +from pyrect.translator import Translator +from pyrect.regexp import Regexp class DotTranslator(Translator): """ @@ -12,21 +12,16 @@ --code/graph/makepdf.sh is generate graph script. >>> string = \"(A|B)*C\" >>> reg = Regexp(string) - >>> dfacg = CallGraph(reg.dfa) - >>> nfacg = CallGraph(reg.nfa) - >>> DotTranslator(string, dfacg).translate() - >>> DotTranslator(string, nfacg).translate() + >>> DotTranslator(reg).translate() + >>> DotTranslator(reg).translate() """ - def __init__(self, regexp, cg): - Translator.__init__(self, regexp, cg) - if self.cg.type == "DFA": - self.name_hash = self.create_name_hash() + def __init__(self, regexp, fa="DFA"): + Translator.__init__(self, regexp) + if fa == "NFA": + self.cg = regexp.nfacg def modify_state_name(self, state_name): - if self.cg.type == "DFA": - return "s"+self.name_hash[state_name] - else: - return "s"+state_name + return "s"+state_name def emit_from_callgraph(self): self.emit('''
--- a/pyrect/goto_grep_translator.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/goto_grep_translator.py Tue Aug 03 05:35:38 2010 +0900 @@ -1,7 +1,7 @@ #!/usr/bin/env python from c_translator import CTranslator -from dfareg import Regexp, CallGraph +from pyrect.regexp import Regexp class GREPTranslateExeption(Exception): pass @@ -13,14 +13,12 @@ written by Rob Pike & Brian W. Kernighan. (see template/grep.label) >>> string = \"(build|fndecl|gcc)\" >>> reg = Regexp(string) - >>> dfacg = CallGraph(reg.dfa) - >>> tje = GoToGREPTranslator(string, dfacg) + >>> tje = GoToGREPTranslator(reg) >>> tje.translate() """ - def __init__(self, regexp, cg): - if cg.type == "NFA": raise GREPTranslateExeption("can't translate grep from NFA") - CTranslator.__init__(self, regexp, cg) + def __init__(self, regexp): + CTranslator.__init__(self, regexp) self.funType = 'void ' self.callType = 'return ' self.breakStatement = ''
--- a/pyrect/grep_translator.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/grep_translator.py Tue Aug 03 05:35:38 2010 +0900 @@ -1,7 +1,7 @@ #!/usr/bin/env python -from c_translator import CTranslator -from dfareg import Regexp, CallGraph +from pyrect.c_translator import CTranslator +from pyrect.regexp import Regexp class GREPTranslateExeption(Exception): pass @@ -13,14 +13,12 @@ written by Rob Pike & Brian W. Kernighan. (see template/grep.c) >>> string = \"(build|fndecl|gcc)\" >>> reg = Regexp(string) - >>> dfacg = CallGraph(reg.dfa) - >>> tje = GREPTranslator(string, dfacg) + >>> tje = GREPTranslator(reg) >>> tje.translate() """ - def __init__(self, regexp, cg): - if cg.type == "NFA": raise GREPTranslateExeption("can't translate grep from NFA") - CTranslator.__init__(self, regexp, cg) + def __init__(self, regexp): + CTranslator.__init__(self, regexp) self.funType = 'int ' self.callType = 'return ' self.breakStatement = ''
--- a/pyrect/jitgrep.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/jitgrep.py Tue Aug 03 05:35:38 2010 +0900 @@ -5,10 +5,10 @@ import re import time from optparse import OptionParser -from grep_translator import GREPTranslator -from goto_grep_translator import GoToGREPTranslator -from cbcgrep_translator import CbCGREPTranslator -from dfareg import Regexp, CallGraph +from pyrect.grep_translator import GREPTranslator +from pyrect.goto_grep_translator import GoToGREPTranslator +from pyrect.cbcgrep_translator import CbCGREPTranslator +from pyrect.regexp import Regexp def main(argv): myusage = """%prog [--buf-size=size] [--dump] @@ -66,13 +66,12 @@ if opts.time : start_time = time.time() reg = Regexp(string) - dfacg = CallGraph(reg.dfa) if cbc: - grept = CbCGREPTranslator(string, dfacg) + grept = CbCGREPTranslator(reg) elif opts.label: - grept = GoToGREPTranslator(string, dfacg) + grept = GoToGREPTranslator(reg) else: - grept = GREPTranslator(string, dfacg) + grept = GREPTranslator(reg) grept.begline = begline grept.bufsize = bufsize
--- a/pyrect/llgrep.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/llgrep.py Tue Aug 03 05:35:38 2010 +0900 @@ -6,7 +6,7 @@ import time from optparse import OptionParser from llvm_grep_translator import LLVMGREPTranslator -from dfareg import Regexp, CallGraph +from regexp import Regexp def main(argv): myusage = """%prog [--time] [--dump] @@ -28,8 +28,7 @@ if opts.time : start_time = time.time() reg = Regexp(args[1]) - dfacg = CallGraph(reg.dfa) - grept = LLVMGREPTranslator(args[1], dfacg) + grept = LLVMGREPTranslator(reg) grept.optimize = opts.optimize grept.args = args[2:]
--- a/pyrect/llvm_bench.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/llvm_bench.py Tue Aug 03 05:35:38 2010 +0900 @@ -3,9 +3,8 @@ import sys,re from optparse import OptionParser from timeit import Timer -from dfareg import Regexp -from dfareg import Regexp, CallGraph -from llvm_translator import LLVMTranslator +from pyrect.regexp import Regexp +from pyrect.llvm_translator import LLVMTranslator myusage = "%prog [-O] [-p] [-s string] [-r regexp] [-E]" psr = OptionParser(usage=myusage) @@ -13,16 +12,16 @@ psr.add_option("-E", action="store_true", dest="execute", default=False, help="execute Module") psr.add_option("-p", action="store_true", dest="print_", default=False, help="print module") psr.add_option("-g", action="store_true", dest="debug", default=False, help="add debug stmt to module") +psr.add_option("-v", action="store_true", dest="verbose", default=False, help="verbose mode") psr.add_option("-n", action="store", type="int", dest="number", default=1000, help="number of eval", metavar="INT") psr.add_option("-r", action="store", type="string", dest="regexp", default="(A|B)*C", help="regexp", metavar="FILE") psr.add_option("-s", action="store", type="string", dest="string", default="A"+"B"*500+"C", help="string", metavar="FILE") (opts, args) = psr.parse_args(sys.argv) -print "regexp = \"%s\", string = \"%s\", loop = %d" % (opts.regexp, opts.string, opts.number) +if opts.verbose: print "regexp = \"%s\", string = \"%s\", loop = %d" % (opts.regexp, opts.string, opts.number) reg = Regexp(opts.regexp) -dfacg = CallGraph(reg.dfa) -llvmt = LLVMTranslator(reg.regexp, dfacg) +llvmt = LLVMTranslator(reg) llvmt.string = opts.string llvmt.debug = opts.debug llvmt.optimize = opts.optimize @@ -36,5 +35,4 @@ print "re :", Timer(setup='from __main__ import reg', stmt='reg.search("%s")' % opts.string).timeit(opts.number) reg = Regexp(opts.regexp) - print "dfareg:", Timer(setup='from __main__ import reg', stmt='reg.matches("%s")' % opts.string).timeit(opts.number) - + print "pyrect:", Timer(setup='from __main__ import reg', stmt='reg.matches("%s")' % opts.string).timeit(opts.number)
--- a/pyrect/llvm_grep_translator.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/llvm_grep_translator.py Tue Aug 03 05:35:38 2010 +0900 @@ -4,7 +4,7 @@ from llvm.passes import * from llvm.ee import * from llvm_translator import LLVMTranslator -from dfareg import Regexp, CallGraph +from pyrect.regexp import Regexp class LLVMGREPTranslator(LLVMTranslator): """LLVMGREPTranslator @@ -12,20 +12,19 @@ which can translate LLVM-IR, and also can execute it's self. >>> string = 'def' >>> reg = Regexp(string) - >>> dfacg = CallGraph(reg.dfa) - >>> lt = LLVMGREPTranslator(string, dfacg) + >>> lt = LLVMGREPTranslator(reg) >>> lt.translate() >>> ret = lt.execute() >>> isinstance(ret, llvm.ee.GenericValue) True """ - def __init__(self, regexp, cg): - LLVMTranslator.__init__(self, regexp, cg) + def __init__(self, regexp): + LLVMTranslator.__init__(self, regexp) llfile = file("template/grep.ll") self.llvm_module = Module.from_assembly(llfile) self.compiled = False - self.string = regexp + self.string = regexp.regexp self.args = [] def modify_state_name(self, state_name): @@ -35,7 +34,7 @@ return state_name def emit_driver(self): - self.regexp_str = self.new_str_const(self.regexp) + self.regexp_str = self.new_str_const(self.string) dfa = self.llvm_module.get_or_insert_function( Type.function(self.int_t, (self.charptr_t,)), "DFA") dfa_entry = dfa.append_basic_block("entry")
--- a/pyrect/llvm_translator.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/llvm_translator.py Tue Aug 03 05:35:38 2010 +0900 @@ -3,8 +3,8 @@ from llvm.core import * from llvm.passes import * from llvm.ee import * -from translator import Translator -from dfareg import Regexp, CallGraph +from pyrect.translator import Translator +from pyrect.regexp import Regexp class LLVMTranslator(Translator): """LLVMTranslator @@ -12,12 +12,10 @@ and also can JIT-Compile/evaluate it's self using llvm-py. >>> string = '(A|B)*C' >>> reg = Regexp(string) - >>> dfacg = CallGraph(reg.dfa) - >>> lt = LLVMTranslator(string, dfacg) + >>> lt = LLVMTranslator(reg) >>> lt.debug = True >>> lt.translate() - >>> isinstance(lt.execute(), llvm.ee.GenericValue) - True + >>> lt.execute() """ # define llvm core types, and const @@ -29,22 +27,14 @@ const_one = Constant.int(int_t, 1) llvm.GuaranteedTailCallOpt = True - def __init__(self, regexp, cg): #(self, modName, regexp, string, self.impl_label, optimized, debug): - Translator.__init__(self, regexp, cg) + def __init__(self, regexp): + Translator.__init__(self, regexp) self.optimize = False self.debug = False self.string = "ABC" self.llvm_module = Module.new(self.cg.type) - if self.cg.type == "DFA": - self.name_hash = self.create_name_hash() self.compiled = False - def modify_state_name(self, state_name): - if self.cg.type == "DFA": - return self.name_hash[state_name] - else: - return state_name - def emit_driver(self): main = self.llvm_module.add_function( Type.function(self.int_t, (self.int_t,)), "unitmain") @@ -166,8 +156,9 @@ def execute(self): if not self.compiled: self.jitcompile() - return self.ee.run_function(self.main, - (GenericValue.int(self.int_t, 0),)) + self.ee.run_function(self.main, + (GenericValue.int(self.int_t, 0),)) + return def new_str_const(self, val): '''create string(array of int) as a global value '''
--- a/pyrect/regexp/__init__.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/regexp/__init__.py Tue Aug 03 05:35:38 2010 +0900 @@ -1,6 +1,42 @@ -#!/usr/env/bin python +#!/usr/bin/env python + +from pyrect.regexp.parser import Parser +from pyrect.regexp.dfa import DFA +from pyrect.regexp.nfa import NFA +from pyrect.regexp.nfa_translator import NFATranslator +from pyrect.regexp.dfa_translator import DFATranslator +from pyrect.regexp.callgraph import CallGraph -from parser import Parser -from node import * +class Regexp(object): + """Regexp is basic class in Pyrect. + this contains regexp, dfa, nfa,, actually it's include all. + >>> regexp = Regexp('(A|B)*C') + >>> regexp.dfacg.map + {'1': {'A': ['1'], 'C': ['2'], 'B': ['0']}, '0': {'A': ['1'], 'C': ['2'], 'B': ['0']}, '3': {'A': ['1'], 'C': ['2'], 'B': ['0']}, '2': {}} + >>> regexp.matches('ABC') + True + >>> regexp = Regexp('(a|b)*cd*e') + >>> regexp.matches('abababcdddde') + True + >>> regexp.matches('ababccdeee') + False + """ + def __init__(self, regexp): + self.regexp = regexp + self.ast = Parser().parse(regexp) + self.nfa = NFATranslator().translate(self.ast) + self.dfa = DFATranslator().translate(self.nfa) + self.nfacg = CallGraph(self.nfa) + self.dfacg = CallGraph(self.dfa) -__all__ = [Parser, "node"] + def matches(self, string): + runtime = self.dfa.get_runtime() + return runtime.accept(string) + + +def test(): + import doctest + doctest.testmod() + + +if __name__ == "__main__": test()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/regexp/callgraph.py Tue Aug 03 05:35:38 2010 +0900 @@ -0,0 +1,79 @@ +#!/Usr/bin/env python + +from pyrect.regexp.parser import Parser +from pyrect.regexp.nfa_translator import NFATranslator +from pyrect.regexp.dfa_translator import DFATranslator +from pyrect.regexp.dfa import DFA +from pyrect.regexp.nfa import NFA + +# create call graph (as dictionary) +class CallGraph(object): + """CallGraph is State Transition Diagram. + it's can be create from DFA or DFA. + >>> prs = Parser() + >>> reg = \"AA*|B\" + >>> nfa = NFATranslator().translate(prs.parse(reg)) + >>> dfa = DFATranslator().translate(prs.parse(reg)) + >>> dfacg = CallGraph(dfa) + >>> nfacg = CallGraph(nfa) + >>> dfacg.map + {'1': {'A': ['1']}, '0': {}, '3': {'A': ['1']}, '2': {'A': ['3'], 'B': ['0']}} + >>> dfacg.states + ['0', '1', '2', '3'] + >>> dfacg.accepts + ['0', '1', '3'] + >>> nfacg.map + {'1': {'': ['4']}, '0': {'A': ['1']}, '3': {'': ['2']}, '2': {'A': ['3']}, '5': {'B': ['6']}, '4': {'': ['2']}, '7': {'': ['0', '5']}, '6': {}} + >>> nfacg.states + ['0', '1', '2', '3', '4', '5', '6', '7'] + >>> nfacg.accepts + """ + + def __init__(self, fa): + self.fa = fa + if isinstance(fa, DFA): + self.type = "DFA" + else: + self.type = "NFA" + self.start = str(self.fa.start) + self.states = map(str, self.fa.states) + self.accepts = map(str, self.fa.accepts) + self.inputs = set() + self.map = self.create_call_graph(self.fa) + + def create_call_graph(self, fa): + transitionCases = dict() + # transitionCases is hash table that binds a state and inputs, + # it's useful for transition definition + # fa is dfa or nfa + # dfa.map => {(1, 'C'): 2, (3, 'C'): 2, (0, 'A'): 1, ...} + # : 1 x 'C' -> 2 + # nfa.map => {(6, ''): set([5, 7]), (1, 'A'): set([2]), ....} + # : 6 x '' -> set([5, 7]) + + for (state, input) in fa.map.iterkeys(): + slot = transitionCases.setdefault(state, set()) + slot.add(input) + + callGraph = dict() + + # create CallGraph + for state in self.states: + callGraph[str(state)] = dict() + + for (state, input) in transitionCases.iteritems(): + caseMap = dict() + for case in input: + self.inputs.add(case) + if self.type == "DFA": + caseMap[case] = [str(fa.map[state, case])] + else: + caseMap[case] = map(str, fa.map[state, case]) + callGraph[str(state)] = caseMap + return callGraph + +def test(): + import doctest + doctest.testmod() + +if __name__ == "__main__": test()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/regexp/dfa.py Tue Aug 03 05:35:38 2010 +0900 @@ -0,0 +1,34 @@ +#!/usr/bin/env python + +import curses.ascii as ascii +import copy +from fa import FA + +class DFA(FA): + def __init__(self, start, accepts, map_, states): + def transition(state, input): + return map_[state, input] + FA.__init__(self, transition, start, accepts, map_, states) + + def get_runtime(self): + return DFARuntime(self) + +class DFARuntime(object): + def __init__(self, DFA): + self.DFA = DFA + self.curState = self.DFA.start + + def do_transition(self, char): + self.curState = self.DFA.transition( + self.curState, char) + + def is_accept_state(self): + return self.curState in self.DFA.accepts + + def accept(self, string): + try: + for alphabet in string: + self.do_transition(alphabet) + except KeyError: + return False + return self.is_accept_state()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/regexp/dfa_translator.py Tue Aug 03 05:35:38 2010 +0900 @@ -0,0 +1,95 @@ +#!/usr/bin/env python + +from pyrect.regexp.parser import Parser +from pyrect.regexp.ast import ASTWalker +from pyrect.regexp.nfa import NFA +from pyrect.regexp.nfa_translator import NFATranslator +from pyrect.regexp.dfa import DFA + +class DFATranslator(): + """ Create DFA from AST with Visitor-Pattern. + actually DFATranslator use NFATranslator and it convert NFA into DFA. + >>> prs = Parser() + >>> dfat = DFATranslator() + >>> dfat.translate(prs.parse('ABC')).map + {(2, 'A'): 3, (0, 'C'): 1, (3, 'B'): 0} + >>> dfat.translate(prs.parse('A|B')).map + {(1, 'A'): 2, (1, 'B'): 0} + >>> dfat.translate(prs.parse('(A|B)*C')).map + {(1, 'C'): 2, (3, 'C'): 2, (0, 'A'): 1, (3, 'B'): 0, (3, 'A'): 1, (1, 'B'): 0, (1, 'A'): 1, (0, 'B'): 0, (0, 'C'): 2} + >>> dfat.translate(prs.parse('(A|B)*C')).start + 3 + >>> dfat.translate(prs.parse('(A|B)*C')).accepts + [2] + >>> dfat.translate(prs.parse('(A|B)*C')).states + [0, 3, 1, 2] + """ + + def __init__(self): + self.dfa = None + self.nfa = None + + def translate(self, arg=None): + if isinstance(arg, NFA): + self.nfa = arg + self.dfa = self.convert(self.nfa) + elif arg: + self.nfa = NFATranslator().translate(arg) + self.dfa = self.convert(self.nfa) + return self.dfa + + def convert(self, nfa): + map_ = dict() + que = set(frozenset([nfa.epsilon_expand(set([nfa.start]))])) + done = set() + + while que: + stateSet = que.pop() + + for state in stateSet: + for k, v in nfa.map.iteritems(): + if state == k[0] and k[1] != '': + slot = map_.setdefault((stateSet, k[1]), set()) + slot.update(nfa.epsilon_expand(v)) + + done.add(stateSet) + + for v in map_.itervalues(): + if not v in done: + que.add(frozenset(v)) + + name_hash = dict() + states = set() + + states.add(nfa.epsilon_expand(frozenset([nfa.start]))) + for state in map_.itervalues(): + states.add(frozenset(state)) + for index, state in enumerate(states): + name_hash[frozenset(state)] = index + states = list() + for state in name_hash.itervalues(): + states.append(state) + + start = name_hash[nfa.epsilon_expand(frozenset([nfa.start]))] + + accepts = list() + for state in name_hash.iterkeys(): + if state & set(nfa.accepts): + accepts.append(name_hash[state]) + + map__ = dict() + for key, value in map_.iteritems(): + map__[(name_hash[frozenset(key[0])],key[1])] = name_hash[frozenset(value)] + + return DFA( + start, + accepts, + map__, + states + ) + +def test(): + import doctest + doctest.testmod() + +if __name__ == '__main__' : test()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/regexp/fa.py Tue Aug 03 05:35:38 2010 +0900 @@ -0,0 +1,13 @@ +#!/usr/bin/env python + +class FA(object): + def __init__(self, transition, start, accepts, map_, states=None): + self.transition = transition + self.start = start + self.accepts = accepts + self.map = map_ + self.states = states + + def accept(self, laungage): + return False +
--- a/pyrect/regexp/kwset.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/regexp/kwset.py Tue Aug 03 05:35:38 2010 +0900 @@ -7,8 +7,8 @@ kwset is also used in GNU-GREP. """ -from parser import Parser -from ast import ASTWalker +from pyrect.regexp.parser import Parser +from pyrect.regexp.ast import ASTWalker class KeywordsExtractor(ASTWalker): """ Extract with Visitor-Pattern.
--- a/pyrect/regexp/lexer.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/regexp/lexer.py Tue Aug 03 05:35:38 2010 +0900 @@ -1,3 +1,5 @@ +#!/usr/bin/env python + from ply import lex tokens = (
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/regexp/nfa.py Tue Aug 03 05:35:38 2010 +0900 @@ -0,0 +1,94 @@ +#!/usr/bin/env python + +import curses.ascii as ascii +import copy +from pyrect.regexp.fa import FA + +class SpecialRule(object): + """High-level state transition rule. + low-lvel : [0-9] -> input in (0,1,2,3,4,5,6,7,8,9) + high-level : [0-9] -> 0 <= input <= 9 + """ + + def __init__(self): + """ low-level accept rule. as normal character.""" + self.accepts = set() + + def accepte_to(self, input): + return self.__accepts + + def __comp__(self, other): + return self.__hash__() - other.__hash__() + + def __hash__(self): + return self.__str__().__hash__() + + def __repr__(self): + return self.__str__() + + def __str__(self): + return "" + +class Range(SpecialRule): + def __init__(self, lower, higher): + SpecialRule.__init__(self, to) + self.lower = lower + self.higher = higher + self.accepts = set([chr(x) for x in range(ord(self.lower), ord(self.higher)+1)]) + + def accept_to(self, input): + """ [0-9] -> (0,1,...) """ + input in accepts + + def __str__(self): + return "[%c-%c]" % (lower, higher) + +class NFAFragment(object): + def __init__(self): + self.start = None # integer + self.accepts = None # frozenset + self.map = dict() # transition, char or special -> frozenset([states]) + + def connect(self, from_, char, to): + slot = self.map.setdefault((from_, char), set()) + slot.add(to) + + def new_skelton(self): + # copy fragment (self), and it return + newFrag = NFAFragment(); + newFrag.map = copy.deepcopy(self.map) + return newFrag + + def __or__(self, frag): + newFrag = self.new_skelton() + for k, v in frag.map.iteritems(): + newFrag.map[k] = v.copy() + return newFrag + + def __str__(self): + return ("NFA: delta -> %s, start -> %s, accepts -> %s" % + (self.map, self.start, self.accepts)) + + def build(self): + map_ = self.map + def transition(state, char): + return frozenset(map_.get((state, char), [])) + + return NFA( + transition, + self.start, + self.accepts, + self.map + ) + +class NFA(FA): + def epsilon_expand(self, set_): + que = set(set_) + done = set() + while que: + stat = que.pop() + nexts = self.transition(stat, "") + done.add(stat) + for nextStat in nexts: + if not nextStat in done: que.add(nextStat) + return frozenset(done)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/regexp/nfa_translator.py Tue Aug 03 05:35:38 2010 +0900 @@ -0,0 +1,95 @@ +#!/usr/bin/env python + +from pyrect.regexp.parser import Parser +from pyrect.regexp.ast import ASTWalker +from pyrect.regexp.nfa import * + +class NFATranslator(ASTWalker): + """ Create NFA from AST with Visitor-Pattern. + AST (ast), is represented by Node-Tree. + >>> prs = Parser() + >>> nfat = NFATranslator() + >>> nfat.translate(prs.parse('ABC')).map + {(3, ''): set([4]), (4, 'C'): set([5]), (1, ''): set([2]), (0, 'A'): set([1]), (2, 'B'): set([3])} + >>> nfat.translate(prs.parse('A|B')).map + {(0, 'A'): set([1]), (2, 'B'): set([3]), (4, ''): set([0, 2])} + >>> nfat.translate(prs.parse('(A|B)*C')).map + {(0, 'A'): set([1]), (3, ''): set([4, 6]), (6, 'C'): set([7]), (2, 'B'): set([3]), (5, ''): set([4, 6]), (1, ''): set([4, 6]), (4, ''): set([0, 2])} + >>> nfat.translate(prs.parse('(A|B)*C')).accepts + [7] + """ + + def __init__(self): + self.ast = None + self.nfa = None + + def get_state_id(self): + self.__state_id += 1 + return self.__state_id + + state_id = property(get_state_id) + + def translate(self, ast=None): + if ast: + self.ast = ast + if self.ast: + self.__state_id = -1 + self.nfa = ast.accept(self).build() + self.nfa.states = range(self.__state_id + 1) + self.nfa.accepts = list(self.nfa.accepts) + return self.nfa + + def visit(self, ast): + return None + + def visit_Character(self, character): + frag = NFAFragment() + s1 = self.state_id + s2 = self.state_id + frag.connect(s1, character.char, s2) + frag.start = s1 + frag.accepts = frozenset([s2]) + return frag + + def visit_Union(self, union): + frag1 = union.op1.accept(self) + frag2 = union.op2.accept(self) + frag = frag1 | frag2 + s = self.state_id + frag.connect(s, '', frag1.start) + frag.connect(s, '', frag2.start) + frag.start = s + frag.accepts = frag1.accepts | frag2.accepts + return frag + + def visit_Concat(self, concat): + frag1 = concat.op1.accept(self) + frag2 = concat.op2.accept(self) + + frag = frag1 | frag2 + + for state in frag1.accepts: + frag.connect(state, '', frag2.start) + + frag.start = frag1.start + frag.accepts = frag2.accepts + return frag + + def visit_Star(self, star): + fragOrig = star.op.accept(self) + frag = fragOrig.new_skelton() + + for state in fragOrig.accepts: + frag.connect(state, '', fragOrig.start) + + s = self.state_id + frag.connect(s, '', fragOrig.start) + frag.start = s + frag.accepts = fragOrig.accepts | frozenset([s]) + return frag + +def test(): + import doctest + doctest.testmod() + +if __name__ == "__main__": test()
--- a/pyrect/regexp/parser.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/regexp/parser.py Tue Aug 03 05:35:38 2010 +0900 @@ -1,8 +1,9 @@ #!/usr/bin/env python -from ply import yacc -from lexer import lex, tokens -from ast import * +from ply import yacc +import os +from pyrect.regexp.lexer import lex, tokens +from pyrect.regexp.ast import * class Parser(object): """Parser @@ -13,9 +14,10 @@ >>> print ast (((((('A'.'B')|('C'.'D')))*.'1').'2').'3') """ + BASE_DIR = os.path.dirname(os.path.abspath(__file__)) def __init__(self): - self.yacc = yacc.yacc() + self.yacc = yacc.yacc(outputdir=self.BASE_DIR, debug=False) self.lexer = lex def parse(self, expression):
--- a/pyrect/translator.py Mon Aug 02 04:03:23 2010 +0900 +++ b/pyrect/translator.py Tue Aug 03 05:35:38 2010 +0900 @@ -3,23 +3,16 @@ import sys class Translator(object): - def __init__(self, regexp, cg): + def __init__(self, regexp): self.regexp = regexp - self.cg = cg + self.cg = regexp.dfacg self.stream = None def emit(self, string): self.stream.write(string) - def create_name_hash(self): - name_hash = dict() - states = list(self.cg.states) - for index in range(len(states)): - name_hash[states[index]] = str(index) - return name_hash - def modify_state_name(self, state_name): - return state_name + return str(state_name) def emit_from_callgraph(self): pass