Mercurial > hg > Members > shinya > pyrect
changeset 2:9816b755ac91
classify src
author | ryoma <e075725@ie.u-ryukyu.ac.jp> |
---|---|
date | Thu, 17 Jun 2010 15:53:50 +0900 |
parents | d5e81ef9afba |
children | a193b4ff3909 |
files | __init__.py benchReg.py dfareg.py reg2llvm.py src/__init__.py src/benchReg.py src/dfareg.py src/reg2llvm.py |
diffstat | 8 files changed, 695 insertions(+), 695 deletions(-) [+] |
line wrap: on
line diff
--- a/__init__.py Thu Jun 17 15:27:17 2010 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -def compile(regexp): - return Regexp(regexp)
--- a/benchReg.py Thu Jun 17 15:27:17 2010 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ -#!/usr/bin/env python - -import sys,re -from optparse import OptionParser -from timeit import Timer -from dfareg import Regexp -from reg2llvm import RegLLVM - -myusage = "%prog [-O] [-p] [-s string] [-r regexp] [-E]" -psr = OptionParser(usage=myusage) -psr.add_option("-O", action="store_true", dest="optimizeFlag", default=False, help="optimimzation") -psr.add_option("-L", action="store_true", dest="implLabel", default=False, help="impliment with label") -psr.add_option("-E", action="store_true", dest="executeFlag", default=False, help="execute Module") -psr.add_option("-p", action="store_true", dest="printFlag", default=False, help="print module") -psr.add_option("-d", action="store_true", dest="dumpFlag", default=False, help="add debug stmt to module") -psr.add_option("-n", action="store", type="int", dest="number", default=1000, help="number of eval", metavar="INT") -psr.add_option("-r", action="store", type="string", dest="regexp", default="(A|B)*C", help="regexp", metavar="FILE") -psr.add_option("-s", action="store", type="string", dest="string", default="A"+"B"*500+"C", help="string", metavar="FILE") -(opts, args) = psr.parse_args(sys.argv) - -print "regexp = \"%s\", string = \"%s\", loop = %d" % (opts.regexp, opts.string, opts.number) - -regLLVM = RegLLVM("regllvm", opts.regexp, opts.string, opts.implLabel, opts.optimizeFlag, opts.dumpFlag) - -if (opts.printFlag): regLLVM.printModule() - -if (opts.executeFlag): - print "LLVM :", Timer(setup='from __main__ import regLLVM', stmt='regLLVM.execute()').timeit(opts.number) - - reg = re.compile("^%s$" % opts.regexp) - print "re :", Timer(setup='from __main__ import reg', stmt='reg.search("%s")' % opts.string).timeit(opts.number) - - reg = Regexp(opts.regexp) - print "dfareg:", Timer(setup='from __main__ import reg', stmt='reg.matches("%s")' % opts.string).timeit(opts.number) -
--- a/dfareg.py Thu Jun 17 15:27:17 2010 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,456 +0,0 @@ -#!/usr/bin/env python - -import copy -import sys -from optparse import OptionParser - -class NondeterministicFiniteAutomaton(object): - def __init__(self, transition, start, accepts, map_): - self.transition = transition - self.start = start - self.accepts = accepts - self.map = map_ - - def epsilonExpand(self, set_): - que = set(set_) - done = set() - while que: - stat = que.pop() - nexts = self.transition(stat, "") - done.add(stat) - for nextStat in nexts: - if not nextStat in done: que.add(nextStat) - return frozenset(done) - -class DeterministicFiniteAutomaton(object): - def __init__(self, transition, start, accepts, map_): - self.transition = transition - self.start = start - self.accepts = accepts - self.map = map_ - def getRuntime(self): - return DFARuntime(self) - -class DFARuntime(object): - def __init__(self, DFA): - self.DFA = DFA - self.curState = self.DFA.start - - def doTransition(self, char): - self.curState = self.DFA.transition( - self.curState, char) - - def isAcceptState(self): - return self.curState in self.DFA.accepts - - def doesAccept(self, input): - for alphabet in input: - self.doTransition(alphabet) - return self.isAcceptState() - -class Token(object): - CHARACTER = 0 - OPE_UNION = 1 - OPE_STAR = 2 - LPAREN = 3 - RPAREN = 4 - EOF = 5 - - def __init__(self, value, kind): - self.value = value - self.kind = kind - - -class Lexer(object): - def __init__(self, str): - self.stringList = list(str) - - def scan(self): - if not self.stringList: - return Token(None, Token.EOF) - - ch = self.stringList.pop(0) - if ch == '\\': - return Token(self.stringList.pop(0), Token.CHARACTER) - elif ch == '|': - return Token(ch, Token.OPE_UNION) - elif ch == '(': - return Token(ch, Token.LPAREN) - elif ch == ')': - return Token(ch, Token.RPAREN) - elif ch == '*': - return Token(ch, Token.OPE_STAR) - else: - return Token(ch, Token.CHARACTER) - -""" - Context-free Grammer: - (A) expression -> subexpr EOF - (B) subexpr -> seq '|' subexpr | seq - (C) seq -> subseq | '' - (D) subseq -> star subseq | star - (E) star -> factor '*' | factor - (F) factor -> '(' subexpr ')' | CHARACTER -""" - -class Parser(object): - def __init__(self, lexer): - self.lexer = lexer - self.look = None - self.move() - - def match(self, tag): - if self.look.kind != tag: - raise Exeption("syntax error") - self.move() - - def move(self): - self.look = self.lexer.scan() - - def factor(self): - if self.look.kind == Token.LPAREN: - # factor -> '(' subexpr ')' - self.match(Token.LPAREN) - node = self.subexpr() - self.match(Token.RPAREN) - return node - else: - # factor -> CHARACTER - node = Character(self.look.value) - self.match(Token.CHARACTER) - return node - - def star(self): - # star -> factor '*' | factor - node = self.factor() - if self.look.kind == Token.OPE_STAR: - self.match(Token.OPE_STAR) - node = Star(node) - return node - - def seq(self): - if self.look.kind == Token.LPAREN \ - or self.look.kind == Token.CHARACTER: - # seq -> subseq - return self.subseq() - else: - # seq -> '' - return Character("") - - def subseq(self): - node1 = self.star() - if self.look.kind == Token.LPAREN \ - or self.look.kind == Token.CHARACTER: - # subseq -> star subseq - node2 = self.subseq() - node = Concat(node1, node2) - return node - else: - # subseq -> star - return node1 - - def subexpr(self): - node1 = self.seq() - if self.look.kind == Token.OPE_UNION: - self.match(Token.OPE_UNION) - # subexpr -> seq '|' subexpr - node2 = self.subexpr() - node = Union(node1, node2) - return node - else: - # subexpr -> seq - return node1 - - def expression(self): - # expression -> subexpr EOF - node = self.subexpr() - self.match(Token.EOF) - - # Create TREE, NFA - context = Context() - fragment = node.assemble(context) - return fragment.build() - -class Context(object): - def __init__(self): - self.stateCount = 0 - - def newState(self): - self.stateCount += 1 - return self.stateCount - -class NFAFragment(object): - def __init__(self): - self.start = None # integer - self.accepts = None # frozenset - self.map = dict() # transition - - def connect(self, from_, char, to): - slot = self.map.setdefault((from_, char), set()) - slot.add(to) - - def newSkelton(self): - # copy fragment (self), and it return - newFrag = NFAFragment(); - newFrag.map = copy.deepcopy(self.map) - return newFrag - - def __or__(self, frag): - newFrag = self.newSkelton() - for k, v in frag.map.iteritems(): - newFrag.map[k] = v.copy() - return newFrag - - def build(self): - map_ = self.map - def transition(state, char): - return frozenset(map_.get((state, char), [])) - - return NondeterministicFiniteAutomaton( - transition, - self.start, - self.accepts, - self.map - ) - -class Character(object): - def __init__(self, char): - self.char = char - - def assemble(self, context): - frag = NFAFragment() - s1 = context.newState() - s2 = context.newState() - frag.connect(s1, self.char, s2) - frag.start = s1 - frag.accepts = frozenset([s2]) - return frag - -class Union(object): - def __init__(self, op1, op2): - self.op1 = op1 - self.op2 = op2 - - def assemble(self, context): - frag1 = self.op1.assemble(context) - frag2 = self.op2.assemble(context) - frag = frag1 | frag2 - s = context.newState() - frag.connect(s, "", frag1.start) - frag.connect(s, "", frag2.start) - frag.start = s - frag.accepts = frag1.accepts | frag2.accepts - return frag - -class Concat(object): - def __init__(self, op1, op2): - self.op1 = op1 - self.op2 = op2 - - def assemble(self, context): - frag1 = self.op1.assemble(context) - frag2 = self.op2.assemble(context) - frag = frag1 | frag2 - - for state in frag1.accepts: - frag.connect(state, "", frag2.start) - - frag.start = frag1.start - frag.accepts = frag2.accepts - return frag - -class Star(object): - def __init__(self, op): - self.op = op - - def assemble(self, context): - fragOrig = self.op.assemble(context) - frag = fragOrig.newSkelton() - - for state in fragOrig.accepts: - frag.connect(state, "", fragOrig.start) - - s = context.newState() - frag.connect(s, "", fragOrig.start) - frag.start = s - frag.accepts = fragOrig.accepts | frozenset([s]) - return frag - -class NonDisjoinSets(object): - def __init__(self, sub): - self.sub = sub - def __contains__(self, a_set): - return a_set & self.sub - -def nfa2dfa(nfa): - def transition(set_, alpha): - ret = set() - for elem in set_: - ret |= nfa.transition(elem, alpha) - return nfa.epsilonExpand(frozenset(ret)) - - def stateSetTransition(stateSet, char): - trans = set() - for state in stateSet: - t = nfa.map.setdefault((state, char), None) - if not t == None: trans.update(nfa.epsilonExpand(t)) - return frozenset(trans) - - map_ = dict() - que = set(frozenset([nfa.epsilonExpand(set([nfa.start]))])) - done = set() - - while que: - stateSet = que.pop() - - for state in stateSet: - for k, v in nfa.map.iteritems(): - if state == k[0] and k[1] != '': - slot = map_.setdefault((stateSet, k[1]), set()) - slot.update(nfa.epsilonExpand(v)) - - done.add(stateSet) - - for v in map_.itervalues(): - if not v in done: - que.add(frozenset(v)) - - return DeterministicFiniteAutomaton( - transition, - nfa.epsilonExpand(frozenset([nfa.start])), - NonDisjoinSets(nfa.accepts), - map_ - ) - -# create call graph (as dictionary) -class CallGraph(object): - def __init__(self, dfa): - self.dfa = dfa - self.start = "state_"+self.set2name(dfa.start) - (self.map, self.accepts) = self.getCallGraph(self.dfa) - - # return state name from set - def set2name(self, set_): - l = list(set_) - l.sort() - return '_'.join(str(x) for x in l) - - def getCallGraph(self, dfa): - - funcMap = dict() - funcMap[dfa.start] = set() - - # dfa.map => {(frozenset([1, 15]), 'd'): frozenset([16]), ....} - # : frozenset(1, 15) x 'd' -> frozenset([16]) - - for v in dfa.map.itervalues(): - funcMap[frozenset(v)] = set() - - for key in dfa.map.iterkeys(): - slot = funcMap.setdefault(key[0], set()) - slot.add(key[1]) - - for v in dfa.map.itervalues(): - funcMap[frozenset(v)] = set() - - for key in dfa.map.iterkeys(): - slot = funcMap.setdefault(key[0], set()) - slot.add(key[1]) - - callGraph = dict() - accepts = list() - - # emit cbc code - for k in funcMap.iterkeys(): - callGraph["state_"+self.set2name(k)] = set() - - for k, v in funcMap.iteritems(): - caseMap = dict() - for case in v: - caseMap[case] = "state_"+self.set2name(dfa.map[(frozenset(k), case)]) - if k in dfa.accepts: - accepts.append("state_"+self.set2name(k)) - caseMap['\\0'] = "accept" - callGraph["state_"+self.set2name(k)] = caseMap - - return (callGraph, accepts) - -# emit CbC-souce, from CallGraph -def dfa2CbC(cg, regexp, emitCFlag): - if emitCFlag: - fun_type = 'void ' - call_t = '' - else: - fun_type = '__code ' - call_t = 'goto ' - - # emit cbc(or c) code - print "#include <stdio.h>\n" - for k in cg.map.iterkeys(): - print fun_type + k + "(char* s);" - print fun_type + 'accept(char* s);' - print fun_type + 'reject(char* s);' - print """ -int main(int argc, char* argv[]) { -\tputs(\"regexp: %s\"); -\tputs(\"number of state: %d\"); -\tprintf(\"string: %%s\\n\", argv[1]); -\t%s%s(argv[1]); -\treturn 0; -}\n""" % (regexp, len(cg.map), call_t, cg.start) - - for k, v in cg.map.iteritems(): - print fun_type + k + "(char* s) {" - print "\tswitch(*s++) {" - for case, next_state in v.iteritems(): - print "\t\tcase '%s': " % (case) - print "\t\t\t%s%s(s);\n\t\t\tbreak;" % (call_t, next_state) - - print "\t\tdefault: %sreject(s);\n\t}" % call_t - print "}\n" - - print """ -%saccept(char* s) { -\tprintf(\"\\nstring matches regexp. \\n\\n\"); -}\n""" % fun_type - print """ -%sreject(char* s) { -\tprintf(\"\\nstring does not match regexp. \\n\\n\"); -}\n""" % fun_type - -class Regexp(object): - def __init__(self, regexp): - self.regexp = regexp - self.nfa = None - self.dfa = None - self._compile() - - def _compile(self): - lexer_ = Lexer(self.regexp) - parser_ = Parser(lexer_) - self.nfa = parser_.expression() - self.dfa = nfa2dfa(self.nfa) - - def emitCbC(self, emitCFlag=False): - cg = CallGraph(self.dfa) - dfa2CbC(cg, self.regexp, emitCFlag) - - def matches(self, string): - runtime = self.dfa.getRuntime() - return runtime.doesAccept(string) - -def compile(regexp): - return Regexp(regexp) - -def main(argv): - myusage = "%prog [-C] regexp" - psr = OptionParser(usage=myusage) - psr.add_option("-C", action="store_true", dest="emitCFlag", default=False, help="emit C-source") - (opts, args) = psr.parse_args(sys.argv) - if len(args) == 1: - psr.print_help() - exit() - r = compile(args[1]) - r.emitCbC(opts.emitCFlag) - -if __name__ == '__main__' : main(sys.argv)
--- a/reg2llvm.py Thu Jun 17 15:27:17 2010 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,202 +0,0 @@ -#!/usr/bin/env python - -# Import the llvm-py modules. -# encodingfrom llvm import * -from llvm.core import * -from llvm.passes import * -from llvm.ee import * # new import: ee = Execution Engine -from dfareg import * - -class RegLLVM(object): - # define llvm core types, and const - int_t = Type.int() - char_t = Type.int(8) - charptr_t = Type.pointer(char_t) - charptrptr_t = Type.pointer(charptr_t) - const_zero = Constant.int(int_t, 0) - const_one = Constant.int(int_t, 1) - llvm.GuaranteedTailCallOpt = True - - def __init__(self, modName, regexp, string, impl_label, optimized, dump): - # Create a module - self.mod = Module.new(modName) - self.regexp = regexp - self.optimized = optimized - self.dump = dump - self.impl_label = impl_label - - self.matchp_str = self.new_str_const(string) - self.dump_str = self.new_str_const("state: %s, arg: %c(int %d)\n") - - def optional_func_decl(fun): - fun.calling_convertion = CC_X86_FASTCALL - fun.args[0].name = "index" - - state_list = dict() - main = self.mod.add_function( - Type.function(self.int_t, (self.int_t,)), "main") - optional_func_decl(main) - main_entry = main.append_basic_block("entry") - - - if (impl_label): - accept_state = main.append_basic_block("accpet") - reject_state = main.append_basic_block("reject") - index_ptr = Builder.new(main_entry).malloc(self.int_t) - Builder.new(accept_state).free(index_ptr) - Builder.new(reject_state).free(index_ptr) - else: - # Create function - accept and reject (final state). - accept_state = self.mod.add_function( - Type.function(self.int_t, (self.int_t,)), "accept") - optional_func_decl(accept_state) - reject_state = self.mod.add_function( - Type.function(self.int_t, (self.int_t,)), "reject") - optional_func_decl(reject_state) - - state_list["accept"] = accept_state - state_list["reject"] = reject_state - - # create Regexp Object (Regexp has dfa) - r = Regexp(regexp) - self.cg = CallGraph(r.dfa) - - # add state to module, (as function or label). - if (impl_label): - for state in self.cg.map.iterkeys(): - label = main.append_basic_block(state) - state_list[state] = label - else: - for state in self.cg.map.iterkeys(): - fun = self.mod.add_function( - Type.function(self.int_t, (self.int_t,)), state) - optional_func_decl(fun) - state_list[state] = fun - - # emit instructions - if (impl_label): emit = Builder.new(accept_state) - else: emit = Builder.new(accept_state.append_basic_block("entry")) - - if (dump): self.emit_call_printf(emit, "%s does match regexp\n", self.gep_first(emit, self.matchp_str)) - emit.ret(self.const_one) - - if (impl_label): emit = Builder.new(reject_state) - else: emit = Builder.new(reject_state.append_basic_block("entry")) - if (dump): self.emit_call_printf(emit, "%s does not match regexp\n", self.gep_first(emit, self.matchp_str)) - emit.ret(self.const_zero) - - if (impl_label): - # emit transition instruction with jump instruction - emit = Builder.new(main_entry) - emit.store(main.args[0], index_ptr) - emit.branch(state_list[self.cg.start]) - - for state, transition in self.cg.map.iteritems(): - emit = Builder.new(state_list[state]) - index = emit.load(index_ptr) - ptr = emit.gep(self.matchp_str, (self.const_zero, index)) - emit.store(emit.add(self.const_one, index), index_ptr) - char = emit.load(ptr) - si = emit.switch(char, state_list['reject'], len(transition)) - local_label = 0 - for case, next_state in transition.iteritems(): - bb = main.append_basic_block("%s_case%d" % (state, local_label)) #create default bb - emit = Builder.new(bb) - emit.branch(state_list[next_state]) - si.add_case(self.char_const(case), bb) - local_label += 1 - else: - for state, transition in self.cg.map.iteritems(): - cases = dict() - for case, next_state in transition.iteritems(): - cases[self.char_const(case)] = state_list[next_state] - state_fun = state_list[state] - emit = Builder.new(state_fun.append_basic_block("entry")) - ptr = emit.gep(self.matchp_str, (self.const_zero, state_fun.args[0])) - next_index = emit.add(state_fun.args[0], self.const_one) - char = emit.load(ptr) - - if (dump): self.emit_call_printf(emit, self.dump_str, self.gep_first(emit, self.new_str_const(fun.name)), char, char) - - label = 0 - default_bb = state_fun.append_basic_block("default") #create default bb - builder = Builder.new(default_bb) # default is reject. - ret = builder.call(reject_state, (next_index,)) - builder.ret(ret) - - si = emit.switch(char, default_bb, len(cases)) # create switch instruction with deafult case. - for case, nextFun in cases.iteritems(): - bb = state_fun.append_basic_block("case%d" % label) #create default bb - builder = Builder.new(bb) - ret = builder.call(nextFun, (next_index,)) - builder.ret(ret) - si.add_case(case, bb) - label += 1 - emit = Builder.new(main_entry) - ret = emit.call(state_list[self.cg.start], (main.args[0],)) - emit.ret(ret) - - self.mp = ModuleProvider.new(self.mod) - if (optimized): self.optimize() - self.ee = ExecutionEngine.new(self.mp) - self.main = main - - def getExecutionEngine(self): - return self.ee - - def optimize(self): - #optimization passes - pm = PassManager.new() - pm.add(TargetData.new('')) - pm.add(PASS_FUNCTION_INLINING) - pm.run(self.mod) - fp = FunctionPassManager.new(self.mp) - fp.add(TargetData.new('')) - fp.add(PASS_BLOCK_PLACEMENT) - fp.add(PASS_INSTRUCTION_COMBINING) - fp.add(PASS_TAIL_CALL_ELIMINATION) - fp.add(PASS_AGGRESSIVE_DCE) - fp.add(PASS_DEAD_INST_ELIMINATION) - fp.add(PASS_DEAD_CODE_ELIMINATION) - for fun in self.mod.functions: - fp.run(fun) - - def printModule(self): - print self.mod - - def execute(self): - return self.ee.run_function(self.main, - (GenericValue.int(self.int_t, 0),)) - - def new_str_const(self, val): - '''create string(array of int) as a global value ''' - str = self.mod.add_global_variable(Type.array(self.char_t, len(val) + 1), "") - str.initializer = Constant.stringz(val) - return str - - def gep_first(self, emit, val): - '''get pointer of array''' - return emit.gep(val, (self.const_zero, self.const_zero)) - - def char_const(self, val): - '''create constant int value''' - if isinstance(val, str): - if val == '\\0': - return Constant.int(self.char_t, 0) - else: - return Constant.int(self.char_t, ord(val)) - else: - exit('char_const: invalid argument.', val) - - def emit_call_printf(self, emit, string, *args): - '''emit libc printf function call instruction''' - try: - printf = self.mod.get_function_named("printf") - except llvm.LLVMException: - printf = self.mod.add_function( - Type.function(Type.void(), - (Type.pointer(self.char_t, 0),), 1), "printf") - if isinstance(string, str): - string = self.new_str_const(string) - emit.call(printf, - [self.gep_first(emit, string)]+list(args))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/__init__.py Thu Jun 17 15:53:50 2010 +0900 @@ -0,0 +1,2 @@ +def compile(regexp): + return Regexp(regexp)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/benchReg.py Thu Jun 17 15:53:50 2010 +0900 @@ -0,0 +1,35 @@ +#!/usr/bin/env python + +import sys,re +from optparse import OptionParser +from timeit import Timer +from dfareg import Regexp +from reg2llvm import RegLLVM + +myusage = "%prog [-O] [-p] [-s string] [-r regexp] [-E]" +psr = OptionParser(usage=myusage) +psr.add_option("-O", action="store_true", dest="optimizeFlag", default=False, help="optimimzation") +psr.add_option("-L", action="store_true", dest="implLabel", default=False, help="impliment with label") +psr.add_option("-E", action="store_true", dest="executeFlag", default=False, help="execute Module") +psr.add_option("-p", action="store_true", dest="printFlag", default=False, help="print module") +psr.add_option("-d", action="store_true", dest="dumpFlag", default=False, help="add debug stmt to module") +psr.add_option("-n", action="store", type="int", dest="number", default=1000, help="number of eval", metavar="INT") +psr.add_option("-r", action="store", type="string", dest="regexp", default="(A|B)*C", help="regexp", metavar="FILE") +psr.add_option("-s", action="store", type="string", dest="string", default="A"+"B"*500+"C", help="string", metavar="FILE") +(opts, args) = psr.parse_args(sys.argv) + +print "regexp = \"%s\", string = \"%s\", loop = %d" % (opts.regexp, opts.string, opts.number) + +regLLVM = RegLLVM("regllvm", opts.regexp, opts.string, opts.implLabel, opts.optimizeFlag, opts.dumpFlag) + +if (opts.printFlag): regLLVM.printModule() + +if (opts.executeFlag): + print "LLVM :", Timer(setup='from __main__ import regLLVM', stmt='regLLVM.execute()').timeit(opts.number) + + reg = re.compile("^%s$" % opts.regexp) + print "re :", Timer(setup='from __main__ import reg', stmt='reg.search("%s")' % opts.string).timeit(opts.number) + + reg = Regexp(opts.regexp) + print "dfareg:", Timer(setup='from __main__ import reg', stmt='reg.matches("%s")' % opts.string).timeit(opts.number) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/dfareg.py Thu Jun 17 15:53:50 2010 +0900 @@ -0,0 +1,456 @@ +#!/usr/bin/env python + +import copy +import sys +from optparse import OptionParser + +class NondeterministicFiniteAutomaton(object): + def __init__(self, transition, start, accepts, map_): + self.transition = transition + self.start = start + self.accepts = accepts + self.map = map_ + + def epsilonExpand(self, set_): + que = set(set_) + done = set() + while que: + stat = que.pop() + nexts = self.transition(stat, "") + done.add(stat) + for nextStat in nexts: + if not nextStat in done: que.add(nextStat) + return frozenset(done) + +class DeterministicFiniteAutomaton(object): + def __init__(self, transition, start, accepts, map_): + self.transition = transition + self.start = start + self.accepts = accepts + self.map = map_ + def getRuntime(self): + return DFARuntime(self) + +class DFARuntime(object): + def __init__(self, DFA): + self.DFA = DFA + self.curState = self.DFA.start + + def doTransition(self, char): + self.curState = self.DFA.transition( + self.curState, char) + + def isAcceptState(self): + return self.curState in self.DFA.accepts + + def doesAccept(self, input): + for alphabet in input: + self.doTransition(alphabet) + return self.isAcceptState() + +class Token(object): + CHARACTER = 0 + OPE_UNION = 1 + OPE_STAR = 2 + LPAREN = 3 + RPAREN = 4 + EOF = 5 + + def __init__(self, value, kind): + self.value = value + self.kind = kind + + +class Lexer(object): + def __init__(self, str): + self.stringList = list(str) + + def scan(self): + if not self.stringList: + return Token(None, Token.EOF) + + ch = self.stringList.pop(0) + if ch == '\\': + return Token(self.stringList.pop(0), Token.CHARACTER) + elif ch == '|': + return Token(ch, Token.OPE_UNION) + elif ch == '(': + return Token(ch, Token.LPAREN) + elif ch == ')': + return Token(ch, Token.RPAREN) + elif ch == '*': + return Token(ch, Token.OPE_STAR) + else: + return Token(ch, Token.CHARACTER) + +""" + Context-free Grammer: + (A) expression -> subexpr EOF + (B) subexpr -> seq '|' subexpr | seq + (C) seq -> subseq | '' + (D) subseq -> star subseq | star + (E) star -> factor '*' | factor + (F) factor -> '(' subexpr ')' | CHARACTER +""" + +class Parser(object): + def __init__(self, lexer): + self.lexer = lexer + self.look = None + self.move() + + def match(self, tag): + if self.look.kind != tag: + raise Exeption("syntax error") + self.move() + + def move(self): + self.look = self.lexer.scan() + + def factor(self): + if self.look.kind == Token.LPAREN: + # factor -> '(' subexpr ')' + self.match(Token.LPAREN) + node = self.subexpr() + self.match(Token.RPAREN) + return node + else: + # factor -> CHARACTER + node = Character(self.look.value) + self.match(Token.CHARACTER) + return node + + def star(self): + # star -> factor '*' | factor + node = self.factor() + if self.look.kind == Token.OPE_STAR: + self.match(Token.OPE_STAR) + node = Star(node) + return node + + def seq(self): + if self.look.kind == Token.LPAREN \ + or self.look.kind == Token.CHARACTER: + # seq -> subseq + return self.subseq() + else: + # seq -> '' + return Character("") + + def subseq(self): + node1 = self.star() + if self.look.kind == Token.LPAREN \ + or self.look.kind == Token.CHARACTER: + # subseq -> star subseq + node2 = self.subseq() + node = Concat(node1, node2) + return node + else: + # subseq -> star + return node1 + + def subexpr(self): + node1 = self.seq() + if self.look.kind == Token.OPE_UNION: + self.match(Token.OPE_UNION) + # subexpr -> seq '|' subexpr + node2 = self.subexpr() + node = Union(node1, node2) + return node + else: + # subexpr -> seq + return node1 + + def expression(self): + # expression -> subexpr EOF + node = self.subexpr() + self.match(Token.EOF) + + # Create TREE, NFA + context = Context() + fragment = node.assemble(context) + return fragment.build() + +class Context(object): + def __init__(self): + self.stateCount = 0 + + def newState(self): + self.stateCount += 1 + return self.stateCount + +class NFAFragment(object): + def __init__(self): + self.start = None # integer + self.accepts = None # frozenset + self.map = dict() # transition + + def connect(self, from_, char, to): + slot = self.map.setdefault((from_, char), set()) + slot.add(to) + + def newSkelton(self): + # copy fragment (self), and it return + newFrag = NFAFragment(); + newFrag.map = copy.deepcopy(self.map) + return newFrag + + def __or__(self, frag): + newFrag = self.newSkelton() + for k, v in frag.map.iteritems(): + newFrag.map[k] = v.copy() + return newFrag + + def build(self): + map_ = self.map + def transition(state, char): + return frozenset(map_.get((state, char), [])) + + return NondeterministicFiniteAutomaton( + transition, + self.start, + self.accepts, + self.map + ) + +class Character(object): + def __init__(self, char): + self.char = char + + def assemble(self, context): + frag = NFAFragment() + s1 = context.newState() + s2 = context.newState() + frag.connect(s1, self.char, s2) + frag.start = s1 + frag.accepts = frozenset([s2]) + return frag + +class Union(object): + def __init__(self, op1, op2): + self.op1 = op1 + self.op2 = op2 + + def assemble(self, context): + frag1 = self.op1.assemble(context) + frag2 = self.op2.assemble(context) + frag = frag1 | frag2 + s = context.newState() + frag.connect(s, "", frag1.start) + frag.connect(s, "", frag2.start) + frag.start = s + frag.accepts = frag1.accepts | frag2.accepts + return frag + +class Concat(object): + def __init__(self, op1, op2): + self.op1 = op1 + self.op2 = op2 + + def assemble(self, context): + frag1 = self.op1.assemble(context) + frag2 = self.op2.assemble(context) + frag = frag1 | frag2 + + for state in frag1.accepts: + frag.connect(state, "", frag2.start) + + frag.start = frag1.start + frag.accepts = frag2.accepts + return frag + +class Star(object): + def __init__(self, op): + self.op = op + + def assemble(self, context): + fragOrig = self.op.assemble(context) + frag = fragOrig.newSkelton() + + for state in fragOrig.accepts: + frag.connect(state, "", fragOrig.start) + + s = context.newState() + frag.connect(s, "", fragOrig.start) + frag.start = s + frag.accepts = fragOrig.accepts | frozenset([s]) + return frag + +class NonDisjoinSets(object): + def __init__(self, sub): + self.sub = sub + def __contains__(self, a_set): + return a_set & self.sub + +def nfa2dfa(nfa): + def transition(set_, alpha): + ret = set() + for elem in set_: + ret |= nfa.transition(elem, alpha) + return nfa.epsilonExpand(frozenset(ret)) + + def stateSetTransition(stateSet, char): + trans = set() + for state in stateSet: + t = nfa.map.setdefault((state, char), None) + if not t == None: trans.update(nfa.epsilonExpand(t)) + return frozenset(trans) + + map_ = dict() + que = set(frozenset([nfa.epsilonExpand(set([nfa.start]))])) + done = set() + + while que: + stateSet = que.pop() + + for state in stateSet: + for k, v in nfa.map.iteritems(): + if state == k[0] and k[1] != '': + slot = map_.setdefault((stateSet, k[1]), set()) + slot.update(nfa.epsilonExpand(v)) + + done.add(stateSet) + + for v in map_.itervalues(): + if not v in done: + que.add(frozenset(v)) + + return DeterministicFiniteAutomaton( + transition, + nfa.epsilonExpand(frozenset([nfa.start])), + NonDisjoinSets(nfa.accepts), + map_ + ) + +# create call graph (as dictionary) +class CallGraph(object): + def __init__(self, dfa): + self.dfa = dfa + self.start = "state_"+self.set2name(dfa.start) + (self.map, self.accepts) = self.getCallGraph(self.dfa) + + # return state name from set + def set2name(self, set_): + l = list(set_) + l.sort() + return '_'.join(str(x) for x in l) + + def getCallGraph(self, dfa): + + funcMap = dict() + funcMap[dfa.start] = set() + + # dfa.map => {(frozenset([1, 15]), 'd'): frozenset([16]), ....} + # : frozenset(1, 15) x 'd' -> frozenset([16]) + + for v in dfa.map.itervalues(): + funcMap[frozenset(v)] = set() + + for key in dfa.map.iterkeys(): + slot = funcMap.setdefault(key[0], set()) + slot.add(key[1]) + + for v in dfa.map.itervalues(): + funcMap[frozenset(v)] = set() + + for key in dfa.map.iterkeys(): + slot = funcMap.setdefault(key[0], set()) + slot.add(key[1]) + + callGraph = dict() + accepts = list() + + # emit cbc code + for k in funcMap.iterkeys(): + callGraph["state_"+self.set2name(k)] = set() + + for k, v in funcMap.iteritems(): + caseMap = dict() + for case in v: + caseMap[case] = "state_"+self.set2name(dfa.map[(frozenset(k), case)]) + if k in dfa.accepts: + accepts.append("state_"+self.set2name(k)) + caseMap['\\0'] = "accept" + callGraph["state_"+self.set2name(k)] = caseMap + + return (callGraph, accepts) + +# emit CbC-souce, from CallGraph +def dfa2CbC(cg, regexp, emitCFlag): + if emitCFlag: + fun_type = 'void ' + call_t = '' + else: + fun_type = '__code ' + call_t = 'goto ' + + # emit cbc(or c) code + print "#include <stdio.h>\n" + for k in cg.map.iterkeys(): + print fun_type + k + "(char* s);" + print fun_type + 'accept(char* s);' + print fun_type + 'reject(char* s);' + print """ +int main(int argc, char* argv[]) { +\tputs(\"regexp: %s\"); +\tputs(\"number of state: %d\"); +\tprintf(\"string: %%s\\n\", argv[1]); +\t%s%s(argv[1]); +\treturn 0; +}\n""" % (regexp, len(cg.map), call_t, cg.start) + + for k, v in cg.map.iteritems(): + print fun_type + k + "(char* s) {" + print "\tswitch(*s++) {" + for case, next_state in v.iteritems(): + print "\t\tcase '%s': " % (case) + print "\t\t\t%s%s(s);\n\t\t\tbreak;" % (call_t, next_state) + + print "\t\tdefault: %sreject(s);\n\t}" % call_t + print "}\n" + + print """ +%saccept(char* s) { +\tprintf(\"\\nstring matches regexp. \\n\\n\"); +}\n""" % fun_type + print """ +%sreject(char* s) { +\tprintf(\"\\nstring does not match regexp. \\n\\n\"); +}\n""" % fun_type + +class Regexp(object): + def __init__(self, regexp): + self.regexp = regexp + self.nfa = None + self.dfa = None + self._compile() + + def _compile(self): + lexer_ = Lexer(self.regexp) + parser_ = Parser(lexer_) + self.nfa = parser_.expression() + self.dfa = nfa2dfa(self.nfa) + + def emitCbC(self, emitCFlag=False): + cg = CallGraph(self.dfa) + dfa2CbC(cg, self.regexp, emitCFlag) + + def matches(self, string): + runtime = self.dfa.getRuntime() + return runtime.doesAccept(string) + +def compile(regexp): + return Regexp(regexp) + +def main(argv): + myusage = "%prog [-C] regexp" + psr = OptionParser(usage=myusage) + psr.add_option("-C", action="store_true", dest="emitCFlag", default=False, help="emit C-source") + (opts, args) = psr.parse_args(sys.argv) + if len(args) == 1: + psr.print_help() + exit() + r = compile(args[1]) + r.emitCbC(opts.emitCFlag) + +if __name__ == '__main__' : main(sys.argv)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/reg2llvm.py Thu Jun 17 15:53:50 2010 +0900 @@ -0,0 +1,202 @@ +#!/usr/bin/env python + +# Import the llvm-py modules. +# encodingfrom llvm import * +from llvm.core import * +from llvm.passes import * +from llvm.ee import * # new import: ee = Execution Engine +from dfareg import * + +class RegLLVM(object): + # define llvm core types, and const + int_t = Type.int() + char_t = Type.int(8) + charptr_t = Type.pointer(char_t) + charptrptr_t = Type.pointer(charptr_t) + const_zero = Constant.int(int_t, 0) + const_one = Constant.int(int_t, 1) + llvm.GuaranteedTailCallOpt = True + + def __init__(self, modName, regexp, string, impl_label, optimized, dump): + # Create a module + self.mod = Module.new(modName) + self.regexp = regexp + self.optimized = optimized + self.dump = dump + self.impl_label = impl_label + + self.matchp_str = self.new_str_const(string) + self.dump_str = self.new_str_const("state: %s, arg: %c(int %d)\n") + + def optional_func_decl(fun): + fun.calling_convertion = CC_X86_FASTCALL + fun.args[0].name = "index" + + state_list = dict() + main = self.mod.add_function( + Type.function(self.int_t, (self.int_t,)), "main") + optional_func_decl(main) + main_entry = main.append_basic_block("entry") + + + if (impl_label): + accept_state = main.append_basic_block("accpet") + reject_state = main.append_basic_block("reject") + index_ptr = Builder.new(main_entry).malloc(self.int_t) + Builder.new(accept_state).free(index_ptr) + Builder.new(reject_state).free(index_ptr) + else: + # Create function - accept and reject (final state). + accept_state = self.mod.add_function( + Type.function(self.int_t, (self.int_t,)), "accept") + optional_func_decl(accept_state) + reject_state = self.mod.add_function( + Type.function(self.int_t, (self.int_t,)), "reject") + optional_func_decl(reject_state) + + state_list["accept"] = accept_state + state_list["reject"] = reject_state + + # create Regexp Object (Regexp has dfa) + r = Regexp(regexp) + self.cg = CallGraph(r.dfa) + + # add state to module, (as function or label). + if (impl_label): + for state in self.cg.map.iterkeys(): + label = main.append_basic_block(state) + state_list[state] = label + else: + for state in self.cg.map.iterkeys(): + fun = self.mod.add_function( + Type.function(self.int_t, (self.int_t,)), state) + optional_func_decl(fun) + state_list[state] = fun + + # emit instructions + if (impl_label): emit = Builder.new(accept_state) + else: emit = Builder.new(accept_state.append_basic_block("entry")) + + if (dump): self.emit_call_printf(emit, "%s does match regexp\n", self.gep_first(emit, self.matchp_str)) + emit.ret(self.const_one) + + if (impl_label): emit = Builder.new(reject_state) + else: emit = Builder.new(reject_state.append_basic_block("entry")) + if (dump): self.emit_call_printf(emit, "%s does not match regexp\n", self.gep_first(emit, self.matchp_str)) + emit.ret(self.const_zero) + + if (impl_label): + # emit transition instruction with jump instruction + emit = Builder.new(main_entry) + emit.store(main.args[0], index_ptr) + emit.branch(state_list[self.cg.start]) + + for state, transition in self.cg.map.iteritems(): + emit = Builder.new(state_list[state]) + index = emit.load(index_ptr) + ptr = emit.gep(self.matchp_str, (self.const_zero, index)) + emit.store(emit.add(self.const_one, index), index_ptr) + char = emit.load(ptr) + si = emit.switch(char, state_list['reject'], len(transition)) + local_label = 0 + for case, next_state in transition.iteritems(): + bb = main.append_basic_block("%s_case%d" % (state, local_label)) #create default bb + emit = Builder.new(bb) + emit.branch(state_list[next_state]) + si.add_case(self.char_const(case), bb) + local_label += 1 + else: + for state, transition in self.cg.map.iteritems(): + cases = dict() + for case, next_state in transition.iteritems(): + cases[self.char_const(case)] = state_list[next_state] + state_fun = state_list[state] + emit = Builder.new(state_fun.append_basic_block("entry")) + ptr = emit.gep(self.matchp_str, (self.const_zero, state_fun.args[0])) + next_index = emit.add(state_fun.args[0], self.const_one) + char = emit.load(ptr) + + if (dump): self.emit_call_printf(emit, self.dump_str, self.gep_first(emit, self.new_str_const(fun.name)), char, char) + + label = 0 + default_bb = state_fun.append_basic_block("default") #create default bb + builder = Builder.new(default_bb) # default is reject. + ret = builder.call(reject_state, (next_index,)) + builder.ret(ret) + + si = emit.switch(char, default_bb, len(cases)) # create switch instruction with deafult case. + for case, nextFun in cases.iteritems(): + bb = state_fun.append_basic_block("case%d" % label) #create default bb + builder = Builder.new(bb) + ret = builder.call(nextFun, (next_index,)) + builder.ret(ret) + si.add_case(case, bb) + label += 1 + emit = Builder.new(main_entry) + ret = emit.call(state_list[self.cg.start], (main.args[0],)) + emit.ret(ret) + + self.mp = ModuleProvider.new(self.mod) + if (optimized): self.optimize() + self.ee = ExecutionEngine.new(self.mp) + self.main = main + + def getExecutionEngine(self): + return self.ee + + def optimize(self): + #optimization passes + pm = PassManager.new() + pm.add(TargetData.new('')) + pm.add(PASS_FUNCTION_INLINING) + pm.run(self.mod) + fp = FunctionPassManager.new(self.mp) + fp.add(TargetData.new('')) + fp.add(PASS_BLOCK_PLACEMENT) + fp.add(PASS_INSTRUCTION_COMBINING) + fp.add(PASS_TAIL_CALL_ELIMINATION) + fp.add(PASS_AGGRESSIVE_DCE) + fp.add(PASS_DEAD_INST_ELIMINATION) + fp.add(PASS_DEAD_CODE_ELIMINATION) + for fun in self.mod.functions: + fp.run(fun) + + def printModule(self): + print self.mod + + def execute(self): + return self.ee.run_function(self.main, + (GenericValue.int(self.int_t, 0),)) + + def new_str_const(self, val): + '''create string(array of int) as a global value ''' + str = self.mod.add_global_variable(Type.array(self.char_t, len(val) + 1), "") + str.initializer = Constant.stringz(val) + return str + + def gep_first(self, emit, val): + '''get pointer of array''' + return emit.gep(val, (self.const_zero, self.const_zero)) + + def char_const(self, val): + '''create constant int value''' + if isinstance(val, str): + if val == '\\0': + return Constant.int(self.char_t, 0) + else: + return Constant.int(self.char_t, ord(val)) + else: + exit('char_const: invalid argument.', val) + + def emit_call_printf(self, emit, string, *args): + '''emit libc printf function call instruction''' + try: + printf = self.mod.get_function_named("printf") + except llvm.LLVMException: + printf = self.mod.add_function( + Type.function(Type.void(), + (Type.pointer(self.char_t, 0),), 1), "printf") + if isinstance(string, str): + string = self.new_str_const(string) + emit.call(printf, + [self.gep_first(emit, string)]+list(args))