changeset 2:9816b755ac91

classify src
author ryoma <e075725@ie.u-ryukyu.ac.jp>
date Thu, 17 Jun 2010 15:53:50 +0900
parents d5e81ef9afba
children a193b4ff3909
files __init__.py benchReg.py dfareg.py reg2llvm.py src/__init__.py src/benchReg.py src/dfareg.py src/reg2llvm.py
diffstat 8 files changed, 695 insertions(+), 695 deletions(-) [+]
line wrap: on
line diff
--- a/__init__.py	Thu Jun 17 15:27:17 2010 +0900
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-def compile(regexp):
-    return Regexp(regexp)
--- a/benchReg.py	Thu Jun 17 15:27:17 2010 +0900
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,35 +0,0 @@
-#!/usr/bin/env python
-
-import sys,re
-from optparse import OptionParser
-from timeit import Timer
-from dfareg import Regexp
-from reg2llvm import RegLLVM
-
-myusage = "%prog [-O] [-p] [-s string] [-r regexp] [-E]"
-psr = OptionParser(usage=myusage)
-psr.add_option("-O", action="store_true", dest="optimizeFlag", default=False, help="optimimzation")
-psr.add_option("-L", action="store_true", dest="implLabel", default=False, help="impliment with label")
-psr.add_option("-E", action="store_true", dest="executeFlag", default=False, help="execute Module")
-psr.add_option("-p", action="store_true", dest="printFlag", default=False, help="print module")
-psr.add_option("-d", action="store_true", dest="dumpFlag", default=False, help="add debug stmt to module")
-psr.add_option("-n", action="store", type="int", dest="number", default=1000, help="number of eval", metavar="INT")
-psr.add_option("-r", action="store", type="string", dest="regexp", default="(A|B)*C", help="regexp", metavar="FILE")
-psr.add_option("-s", action="store", type="string", dest="string", default="A"+"B"*500+"C", help="string", metavar="FILE")
-(opts, args) = psr.parse_args(sys.argv)
-
-print "regexp = \"%s\", string = \"%s\", loop = %d" % (opts.regexp, opts.string, opts.number)
-
-regLLVM = RegLLVM("regllvm", opts.regexp, opts.string, opts.implLabel, opts.optimizeFlag, opts.dumpFlag)
-
-if (opts.printFlag):   regLLVM.printModule()
-
-if (opts.executeFlag):
-    print "LLVM  :", Timer(setup='from __main__ import regLLVM', stmt='regLLVM.execute()').timeit(opts.number)
-
-    reg = re.compile("^%s$" % opts.regexp)
-    print "re    :", Timer(setup='from __main__ import reg', stmt='reg.search("%s")' % opts.string).timeit(opts.number)
-
-    reg = Regexp(opts.regexp)
-    print "dfareg:", Timer(setup='from __main__ import reg', stmt='reg.matches("%s")' % opts.string).timeit(opts.number)
-
--- a/dfareg.py	Thu Jun 17 15:27:17 2010 +0900
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,456 +0,0 @@
-#!/usr/bin/env python
-
-import copy
-import sys
-from optparse import OptionParser
-
-class NondeterministicFiniteAutomaton(object):
-    def __init__(self, transition, start, accepts, map_):
-        self.transition = transition
-        self.start = start
-        self.accepts = accepts
-        self.map = map_
-
-    def epsilonExpand(self, set_):
-        que = set(set_)
-        done = set()
-        while que:
-            stat = que.pop()
-            nexts = self.transition(stat, "")
-            done.add(stat)
-            for nextStat in nexts:
-                if not nextStat in done: que.add(nextStat)
-        return frozenset(done)
-
-class DeterministicFiniteAutomaton(object):
-    def __init__(self, transition, start, accepts, map_):
-        self.transition = transition
-        self.start = start
-        self.accepts = accepts
-        self.map = map_
-    def getRuntime(self):
-        return DFARuntime(self)
-
-class DFARuntime(object):
-    def __init__(self, DFA):
-        self.DFA = DFA
-        self.curState = self.DFA.start
-
-    def doTransition(self, char):
-        self.curState = self.DFA.transition(
-            self.curState, char)
-
-    def isAcceptState(self):
-        return self.curState in self.DFA.accepts
-
-    def doesAccept(self, input):
-        for alphabet in input:
-            self.doTransition(alphabet)
-        return self.isAcceptState()
-
-class Token(object):
-    CHARACTER  = 0
-    OPE_UNION  = 1
-    OPE_STAR   = 2
-    LPAREN     = 3
-    RPAREN     = 4
-    EOF        = 5
-
-    def __init__(self, value, kind):
-        self.value = value
-        self.kind  = kind
-
-
-class Lexer(object):
-    def __init__(self, str):
-        self.stringList = list(str)
-
-    def scan(self):
-        if not self.stringList:
-            return Token(None, Token.EOF)
-
-        ch = self.stringList.pop(0)
-        if   ch == '\\':
-            return Token(self.stringList.pop(0), Token.CHARACTER)
-        elif ch == '|':
-            return Token(ch, Token.OPE_UNION)
-        elif ch == '(':
-            return Token(ch, Token.LPAREN)
-        elif ch == ')':
-            return Token(ch, Token.RPAREN)
-        elif ch == '*':
-            return Token(ch, Token.OPE_STAR)
-        else:
-            return Token(ch, Token.CHARACTER)
-
-"""
- Context-free Grammer:
- (A) expression -> subexpr EOF
- (B) subexpr -> seq '|' subexpr | seq
- (C) seq -> subseq | ''
- (D) subseq -> star subseq | star
- (E) star -> factor '*' | factor
- (F) factor -> '(' subexpr ')' | CHARACTER
-"""
-
-class Parser(object):
-    def __init__(self, lexer):
-        self.lexer = lexer
-        self.look  = None
-        self.move()
-
-    def match(self, tag):
-        if self.look.kind != tag:
-            raise Exeption("syntax error")
-        self.move()
-
-    def move(self):
-        self.look = self.lexer.scan()
-
-    def factor(self):
-        if self.look.kind == Token.LPAREN:
-            # factor -> '(' subexpr ')'
-            self.match(Token.LPAREN)
-            node = self.subexpr()
-            self.match(Token.RPAREN)
-            return node
-        else:
-            # factor -> CHARACTER
-            node = Character(self.look.value)
-            self.match(Token.CHARACTER)
-            return node
-
-    def star(self):
-        # star -> factor '*' | factor
-        node = self.factor()
-        if self.look.kind == Token.OPE_STAR:
-            self.match(Token.OPE_STAR)
-            node = Star(node)
-        return node
-
-    def seq(self):
-        if self.look.kind == Token.LPAREN \
-               or self.look.kind == Token.CHARACTER:
-            # seq -> subseq
-            return self.subseq()
-        else:
-            # seq -> ''
-            return Character("")
-
-    def subseq(self):
-        node1 = self.star()
-        if self.look.kind == Token.LPAREN \
-               or self.look.kind == Token.CHARACTER:
-            # subseq -> star subseq
-            node2 = self.subseq()
-            node  = Concat(node1, node2)
-            return node
-        else:
-            # subseq -> star
-            return node1
-
-    def subexpr(self):
-        node1 = self.seq()
-        if self.look.kind == Token.OPE_UNION:
-            self.match(Token.OPE_UNION)
-            # subexpr    -> seq '|' subexpr
-            node2 = self.subexpr()
-            node  = Union(node1, node2)
-            return node
-        else:
-            # subexpr    -> seq
-            return node1
-
-    def expression(self):
-        # expression -> subexpr EOF
-        node     = self.subexpr()
-        self.match(Token.EOF)
-
-        # Create TREE, NFA
-        context  = Context()
-        fragment = node.assemble(context)
-        return fragment.build()
-
-class Context(object):
-    def __init__(self):
-        self.stateCount = 0
-
-    def newState(self):
-        self.stateCount += 1
-        return self.stateCount
-
-class NFAFragment(object):
-    def __init__(self):
-        self.start   = None    # integer
-        self.accepts = None    # frozenset
-        self.map     = dict()  # transition
-
-    def connect(self, from_, char, to):
-        slot = self.map.setdefault((from_, char), set())
-        slot.add(to)
-
-    def newSkelton(self):
-        # copy fragment (self), and it return
-        newFrag = NFAFragment();
-        newFrag.map = copy.deepcopy(self.map)
-        return newFrag
-
-    def __or__(self, frag):
-        newFrag = self.newSkelton()
-        for k, v in frag.map.iteritems():
-            newFrag.map[k] = v.copy()
-        return newFrag
-
-    def build(self):
-        map_ = self.map
-        def transition(state, char):
-            return frozenset(map_.get((state, char), []))
-
-        return NondeterministicFiniteAutomaton(
-            transition,
-            self.start,
-            self.accepts,
-            self.map
-            )
-
-class Character(object):
-    def __init__(self, char):
-        self.char = char
-
-    def assemble(self, context):
-        frag = NFAFragment()
-        s1 = context.newState()
-        s2 = context.newState()
-        frag.connect(s1, self.char, s2)
-        frag.start = s1
-        frag.accepts = frozenset([s2])
-        return frag
-
-class Union(object):
-    def __init__(self, op1, op2):
-        self.op1 = op1
-        self.op2 = op2
-
-    def assemble(self, context):
-        frag1 = self.op1.assemble(context)
-        frag2 = self.op2.assemble(context)
-        frag = frag1 | frag2
-        s = context.newState()
-        frag.connect(s, "", frag1.start)
-        frag.connect(s, "", frag2.start)
-        frag.start = s
-        frag.accepts = frag1.accepts | frag2.accepts
-        return frag
-
-class Concat(object):
-    def __init__(self, op1, op2):
-        self.op1 = op1
-        self.op2 = op2
-
-    def assemble(self, context):
-        frag1 = self.op1.assemble(context)
-        frag2 = self.op2.assemble(context)
-        frag = frag1 | frag2
-
-        for state in frag1.accepts:
-            frag.connect(state, "", frag2.start)
-
-        frag.start   = frag1.start
-        frag.accepts = frag2.accepts
-        return frag
-
-class Star(object):
-    def __init__(self, op):
-        self.op = op
-
-    def assemble(self, context):
-        fragOrig = self.op.assemble(context)
-        frag = fragOrig.newSkelton()
-
-        for state in fragOrig.accepts:
-            frag.connect(state, "", fragOrig.start)
-
-        s = context.newState()
-        frag.connect(s, "", fragOrig.start)
-        frag.start = s
-        frag.accepts = fragOrig.accepts | frozenset([s])
-        return frag
-
-class NonDisjoinSets(object):
-    def __init__(self, sub):
-        self.sub = sub
-    def __contains__(self, a_set):
-        return a_set & self.sub
-
-def nfa2dfa(nfa):
-    def transition(set_, alpha):
-        ret = set()
-        for elem in set_:
-            ret |= nfa.transition(elem, alpha)
-        return nfa.epsilonExpand(frozenset(ret))
-
-    def stateSetTransition(stateSet, char):
-        trans = set()
-        for state in stateSet:
-            t = nfa.map.setdefault((state, char), None)
-            if not t == None: trans.update(nfa.epsilonExpand(t))
-        return frozenset(trans)
-
-    map_ = dict()
-    que =  set(frozenset([nfa.epsilonExpand(set([nfa.start]))]))
-    done = set()
-
-    while que:
-        stateSet = que.pop()
-
-        for state in stateSet:
-            for k, v in nfa.map.iteritems():
-                if state == k[0] and k[1] != '':
-                    slot = map_.setdefault((stateSet, k[1]), set())
-                    slot.update(nfa.epsilonExpand(v))
-
-        done.add(stateSet)
-
-        for v in map_.itervalues():
-            if not v in done:
-                que.add(frozenset(v))
-
-    return DeterministicFiniteAutomaton(
-        transition,
-        nfa.epsilonExpand(frozenset([nfa.start])),
-        NonDisjoinSets(nfa.accepts),
-        map_
-        )
-
-# create call graph (as dictionary)
-class CallGraph(object):
-    def __init__(self, dfa):
-        self.dfa = dfa
-        self.start = "state_"+self.set2name(dfa.start)
-        (self.map, self.accepts) = self.getCallGraph(self.dfa)
-
-    # return state name from set
-    def set2name(self, set_):
-        l = list(set_)
-        l.sort()
-        return '_'.join(str(x) for x in l)
-
-    def getCallGraph(self, dfa):
-
-        funcMap = dict()
-        funcMap[dfa.start] = set()
-
-        # dfa.map => {(frozenset([1, 15]), 'd'): frozenset([16]), ....}
-        #             : frozenset(1, 15) x 'd' -> frozenset([16])
-
-        for v in dfa.map.itervalues():
-            funcMap[frozenset(v)] = set()
-
-        for key in dfa.map.iterkeys():
-            slot = funcMap.setdefault(key[0], set())
-            slot.add(key[1])
-
-        for v in dfa.map.itervalues():
-            funcMap[frozenset(v)] = set()
-
-        for key in dfa.map.iterkeys():
-            slot = funcMap.setdefault(key[0], set())
-            slot.add(key[1])
-
-        callGraph = dict()
-        accepts = list()
-
-        # emit cbc code
-        for k in funcMap.iterkeys():
-            callGraph["state_"+self.set2name(k)] = set()
-
-        for k, v in funcMap.iteritems():
-            caseMap = dict()
-            for case in v:
-                caseMap[case] = "state_"+self.set2name(dfa.map[(frozenset(k), case)])
-            if k in dfa.accepts:
-                accepts.append("state_"+self.set2name(k))
-                caseMap['\\0'] = "accept"
-            callGraph["state_"+self.set2name(k)] = caseMap
-
-        return (callGraph, accepts)
-
-# emit CbC-souce, from CallGraph
-def dfa2CbC(cg, regexp, emitCFlag):
-    if emitCFlag:
-        fun_type = 'void '
-        call_t = ''
-    else:
-        fun_type = '__code '
-        call_t = 'goto '
-
-    # emit cbc(or c) code
-    print "#include <stdio.h>\n"
-    for k in cg.map.iterkeys():
-        print fun_type + k + "(char* s);"
-    print fun_type + 'accept(char* s);'
-    print fun_type + 'reject(char* s);'
-    print """
-int main(int argc, char* argv[]) {
-\tputs(\"regexp: %s\");
-\tputs(\"number of state: %d\");
-\tprintf(\"string: %%s\\n\", argv[1]);
-\t%s%s(argv[1]);
-\treturn 0;
-}\n""" % (regexp, len(cg.map), call_t, cg.start)
-
-    for k, v in cg.map.iteritems():
-        print fun_type + k + "(char* s) {"
-        print "\tswitch(*s++) {"
-        for case, next_state in v.iteritems():
-            print "\t\tcase '%s': " % (case)
-            print "\t\t\t%s%s(s);\n\t\t\tbreak;" % (call_t, next_state)
-
-        print "\t\tdefault: %sreject(s);\n\t}" % call_t
-        print "}\n"
-
-    print """
-%saccept(char* s) {
-\tprintf(\"\\nstring matches regexp. \\n\\n\");
-}\n""" % fun_type
-    print """
-%sreject(char* s) {
-\tprintf(\"\\nstring does not match regexp. \\n\\n\");
-}\n""" % fun_type
-
-class Regexp(object):
-    def __init__(self, regexp):
-        self.regexp = regexp
-        self.nfa    = None
-        self.dfa    = None
-        self._compile()
-
-    def _compile(self):
-        lexer_  = Lexer(self.regexp)
-        parser_ = Parser(lexer_)
-        self.nfa = parser_.expression()
-        self.dfa = nfa2dfa(self.nfa)
-
-    def emitCbC(self, emitCFlag=False):
-        cg = CallGraph(self.dfa)
-        dfa2CbC(cg, self.regexp, emitCFlag)
-
-    def matches(self, string):
-        runtime = self.dfa.getRuntime()
-        return runtime.doesAccept(string)
-
-def compile(regexp):
-    return Regexp(regexp)
-
-def main(argv):
-    myusage = "%prog [-C] regexp"
-    psr = OptionParser(usage=myusage)
-    psr.add_option("-C", action="store_true", dest="emitCFlag", default=False, help="emit C-source")
-    (opts, args) = psr.parse_args(sys.argv)
-    if len(args) == 1:
-        psr.print_help()
-        exit()
-    r = compile(args[1])
-    r.emitCbC(opts.emitCFlag)
-
-if __name__ == '__main__' : main(sys.argv)
--- a/reg2llvm.py	Thu Jun 17 15:27:17 2010 +0900
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,202 +0,0 @@
-#!/usr/bin/env python
-
-# Import the llvm-py modules.
-# encodingfrom llvm import *
-from llvm.core import *
-from llvm.passes import *
-from llvm.ee import *  # new import: ee = Execution Engine
-from dfareg import *
-
-class RegLLVM(object):
-    # define llvm core types, and const
-    int_t = Type.int()
-    char_t = Type.int(8)
-    charptr_t = Type.pointer(char_t)
-    charptrptr_t = Type.pointer(charptr_t)
-    const_zero = Constant.int(int_t, 0)
-    const_one = Constant.int(int_t, 1)
-    llvm.GuaranteedTailCallOpt = True
-
-    def __init__(self, modName, regexp, string, impl_label, optimized, dump):
-        # Create a module
-        self.mod = Module.new(modName)
-        self.regexp = regexp
-        self.optimized = optimized
-        self.dump = dump
-        self.impl_label = impl_label
-
-        self.matchp_str = self.new_str_const(string)
-        self.dump_str = self.new_str_const("state: %s, arg: %c(int %d)\n")
-
-        def optional_func_decl(fun):
-            fun.calling_convertion = CC_X86_FASTCALL
-            fun.args[0].name = "index"
-
-        state_list = dict()
-        main = self.mod.add_function(
-            Type.function(self.int_t, (self.int_t,)), "main")
-        optional_func_decl(main)
-        main_entry = main.append_basic_block("entry")
-
-
-        if (impl_label):
-            accept_state = main.append_basic_block("accpet")
-            reject_state = main.append_basic_block("reject")
-            index_ptr = Builder.new(main_entry).malloc(self.int_t)
-            Builder.new(accept_state).free(index_ptr)
-            Builder.new(reject_state).free(index_ptr)
-        else:
-            # Create function - accept and reject (final state).
-            accept_state = self.mod.add_function(
-                Type.function(self.int_t, (self.int_t,)), "accept")
-            optional_func_decl(accept_state)
-            reject_state = self.mod.add_function(
-                Type.function(self.int_t, (self.int_t,)), "reject")
-            optional_func_decl(reject_state)
-
-        state_list["accept"] = accept_state
-        state_list["reject"] = reject_state
-
-        # create Regexp Object (Regexp has dfa)
-        r = Regexp(regexp)
-        self.cg = CallGraph(r.dfa)
-
-        # add state to module, (as function or label).
-        if (impl_label):
-            for state in self.cg.map.iterkeys():
-                label = main.append_basic_block(state)
-                state_list[state] = label
-        else:
-            for state in self.cg.map.iterkeys():
-                fun = self.mod.add_function(
-                    Type.function(self.int_t, (self.int_t,)), state)
-                optional_func_decl(fun)
-                state_list[state] = fun
-
-        # emit instructions
-        if (impl_label): emit = Builder.new(accept_state)
-        else:            emit = Builder.new(accept_state.append_basic_block("entry"))
-
-        if (dump): self.emit_call_printf(emit, "%s does match regexp\n", self.gep_first(emit, self.matchp_str))
-        emit.ret(self.const_one)
-
-        if (impl_label): emit = Builder.new(reject_state)
-        else:            emit = Builder.new(reject_state.append_basic_block("entry"))
-        if (dump): self.emit_call_printf(emit, "%s does not match regexp\n", self.gep_first(emit, self.matchp_str))
-        emit.ret(self.const_zero)
-
-        if (impl_label):
-            # emit transition instruction with jump instruction
-            emit = Builder.new(main_entry)
-            emit.store(main.args[0], index_ptr)
-            emit.branch(state_list[self.cg.start])
-
-            for state, transition in self.cg.map.iteritems():
-                emit = Builder.new(state_list[state])
-                index = emit.load(index_ptr)
-                ptr = emit.gep(self.matchp_str, (self.const_zero, index))
-                emit.store(emit.add(self.const_one, index), index_ptr)
-                char = emit.load(ptr)
-                si = emit.switch(char, state_list['reject'], len(transition))
-                local_label = 0
-                for case, next_state in transition.iteritems():
-                    bb = main.append_basic_block("%s_case%d" % (state, local_label))   #create default bb
-                    emit = Builder.new(bb)
-                    emit.branch(state_list[next_state])
-                    si.add_case(self.char_const(case), bb)
-                    local_label += 1
-        else:
-            for state, transition in self.cg.map.iteritems():
-                cases = dict()
-                for case, next_state in transition.iteritems():
-                    cases[self.char_const(case)] = state_list[next_state]
-                state_fun = state_list[state]
-                emit = Builder.new(state_fun.append_basic_block("entry"))
-                ptr = emit.gep(self.matchp_str, (self.const_zero, state_fun.args[0]))
-                next_index = emit.add(state_fun.args[0], self.const_one)
-                char = emit.load(ptr)
-
-                if (dump): self.emit_call_printf(emit, self.dump_str, self.gep_first(emit, self.new_str_const(fun.name)), char, char)
-
-                label = 0
-                default_bb = state_fun.append_basic_block("default") #create default bb
-                builder = Builder.new(default_bb)              # default is reject.
-                ret = builder.call(reject_state, (next_index,))
-                builder.ret(ret)
-
-                si = emit.switch(char, default_bb, len(cases)) # create switch instruction with deafult case.
-                for case, nextFun in cases.iteritems():
-                    bb = state_fun.append_basic_block("case%d" % label)   #create default bb
-                    builder = Builder.new(bb)
-                    ret = builder.call(nextFun, (next_index,))
-                    builder.ret(ret)
-                    si.add_case(case, bb)
-                    label += 1
-            emit = Builder.new(main_entry)
-            ret = emit.call(state_list[self.cg.start], (main.args[0],))
-            emit.ret(ret)
-
-        self.mp = ModuleProvider.new(self.mod)
-        if (optimized): self.optimize()
-        self.ee = ExecutionEngine.new(self.mp)
-        self.main = main
-
-    def getExecutionEngine(self):
-        return self.ee
-
-    def optimize(self):
-        #optimization passes
-        pm = PassManager.new()
-        pm.add(TargetData.new(''))
-        pm.add(PASS_FUNCTION_INLINING)
-        pm.run(self.mod)
-        fp = FunctionPassManager.new(self.mp)
-        fp.add(TargetData.new(''))
-        fp.add(PASS_BLOCK_PLACEMENT)
-        fp.add(PASS_INSTRUCTION_COMBINING)
-        fp.add(PASS_TAIL_CALL_ELIMINATION)
-        fp.add(PASS_AGGRESSIVE_DCE)
-        fp.add(PASS_DEAD_INST_ELIMINATION)
-        fp.add(PASS_DEAD_CODE_ELIMINATION)
-        for fun in self.mod.functions:
-            fp.run(fun)
-
-    def printModule(self):
-        print self.mod
-
-    def execute(self):
-        return self.ee.run_function(self.main,
-                                    (GenericValue.int(self.int_t, 0),))
-
-    def new_str_const(self, val):
-        '''create string(array of int) as a global value '''
-        str = self.mod.add_global_variable(Type.array(self.char_t, len(val) + 1), "")
-        str.initializer = Constant.stringz(val)
-        return str
-
-    def gep_first(self, emit, val):
-        '''get pointer of array'''
-        return emit.gep(val, (self.const_zero, self.const_zero))
-
-    def char_const(self, val):
-        '''create constant int value'''
-        if isinstance(val, str):
-            if val == '\\0':
-                return Constant.int(self.char_t, 0)
-            else:
-                return Constant.int(self.char_t, ord(val))
-        else:
-            exit('char_const: invalid argument.', val)
-
-    def emit_call_printf(self, emit, string, *args):
-        '''emit libc printf function call instruction'''
-        try:
-            printf = self.mod.get_function_named("printf")
-        except llvm.LLVMException:
-            printf = self.mod.add_function(
-                Type.function(Type.void(),
-                              (Type.pointer(self.char_t, 0),), 1), "printf")
-        if isinstance(string, str):
-            string = self.new_str_const(string)
-        emit.call(printf,
-                  [self.gep_first(emit, string)]+list(args))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/__init__.py	Thu Jun 17 15:53:50 2010 +0900
@@ -0,0 +1,2 @@
+def compile(regexp):
+    return Regexp(regexp)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/benchReg.py	Thu Jun 17 15:53:50 2010 +0900
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+
+import sys,re
+from optparse import OptionParser
+from timeit import Timer
+from dfareg import Regexp
+from reg2llvm import RegLLVM
+
+myusage = "%prog [-O] [-p] [-s string] [-r regexp] [-E]"
+psr = OptionParser(usage=myusage)
+psr.add_option("-O", action="store_true", dest="optimizeFlag", default=False, help="optimimzation")
+psr.add_option("-L", action="store_true", dest="implLabel", default=False, help="impliment with label")
+psr.add_option("-E", action="store_true", dest="executeFlag", default=False, help="execute Module")
+psr.add_option("-p", action="store_true", dest="printFlag", default=False, help="print module")
+psr.add_option("-d", action="store_true", dest="dumpFlag", default=False, help="add debug stmt to module")
+psr.add_option("-n", action="store", type="int", dest="number", default=1000, help="number of eval", metavar="INT")
+psr.add_option("-r", action="store", type="string", dest="regexp", default="(A|B)*C", help="regexp", metavar="FILE")
+psr.add_option("-s", action="store", type="string", dest="string", default="A"+"B"*500+"C", help="string", metavar="FILE")
+(opts, args) = psr.parse_args(sys.argv)
+
+print "regexp = \"%s\", string = \"%s\", loop = %d" % (opts.regexp, opts.string, opts.number)
+
+regLLVM = RegLLVM("regllvm", opts.regexp, opts.string, opts.implLabel, opts.optimizeFlag, opts.dumpFlag)
+
+if (opts.printFlag):   regLLVM.printModule()
+
+if (opts.executeFlag):
+    print "LLVM  :", Timer(setup='from __main__ import regLLVM', stmt='regLLVM.execute()').timeit(opts.number)
+
+    reg = re.compile("^%s$" % opts.regexp)
+    print "re    :", Timer(setup='from __main__ import reg', stmt='reg.search("%s")' % opts.string).timeit(opts.number)
+
+    reg = Regexp(opts.regexp)
+    print "dfareg:", Timer(setup='from __main__ import reg', stmt='reg.matches("%s")' % opts.string).timeit(opts.number)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/dfareg.py	Thu Jun 17 15:53:50 2010 +0900
@@ -0,0 +1,456 @@
+#!/usr/bin/env python
+
+import copy
+import sys
+from optparse import OptionParser
+
+class NondeterministicFiniteAutomaton(object):
+    def __init__(self, transition, start, accepts, map_):
+        self.transition = transition
+        self.start = start
+        self.accepts = accepts
+        self.map = map_
+
+    def epsilonExpand(self, set_):
+        que = set(set_)
+        done = set()
+        while que:
+            stat = que.pop()
+            nexts = self.transition(stat, "")
+            done.add(stat)
+            for nextStat in nexts:
+                if not nextStat in done: que.add(nextStat)
+        return frozenset(done)
+
+class DeterministicFiniteAutomaton(object):
+    def __init__(self, transition, start, accepts, map_):
+        self.transition = transition
+        self.start = start
+        self.accepts = accepts
+        self.map = map_
+    def getRuntime(self):
+        return DFARuntime(self)
+
+class DFARuntime(object):
+    def __init__(self, DFA):
+        self.DFA = DFA
+        self.curState = self.DFA.start
+
+    def doTransition(self, char):
+        self.curState = self.DFA.transition(
+            self.curState, char)
+
+    def isAcceptState(self):
+        return self.curState in self.DFA.accepts
+
+    def doesAccept(self, input):
+        for alphabet in input:
+            self.doTransition(alphabet)
+        return self.isAcceptState()
+
+class Token(object):
+    CHARACTER  = 0
+    OPE_UNION  = 1
+    OPE_STAR   = 2
+    LPAREN     = 3
+    RPAREN     = 4
+    EOF        = 5
+
+    def __init__(self, value, kind):
+        self.value = value
+        self.kind  = kind
+
+
+class Lexer(object):
+    def __init__(self, str):
+        self.stringList = list(str)
+
+    def scan(self):
+        if not self.stringList:
+            return Token(None, Token.EOF)
+
+        ch = self.stringList.pop(0)
+        if   ch == '\\':
+            return Token(self.stringList.pop(0), Token.CHARACTER)
+        elif ch == '|':
+            return Token(ch, Token.OPE_UNION)
+        elif ch == '(':
+            return Token(ch, Token.LPAREN)
+        elif ch == ')':
+            return Token(ch, Token.RPAREN)
+        elif ch == '*':
+            return Token(ch, Token.OPE_STAR)
+        else:
+            return Token(ch, Token.CHARACTER)
+
+"""
+ Context-free Grammer:
+ (A) expression -> subexpr EOF
+ (B) subexpr -> seq '|' subexpr | seq
+ (C) seq -> subseq | ''
+ (D) subseq -> star subseq | star
+ (E) star -> factor '*' | factor
+ (F) factor -> '(' subexpr ')' | CHARACTER
+"""
+
+class Parser(object):
+    def __init__(self, lexer):
+        self.lexer = lexer
+        self.look  = None
+        self.move()
+
+    def match(self, tag):
+        if self.look.kind != tag:
+            raise Exeption("syntax error")
+        self.move()
+
+    def move(self):
+        self.look = self.lexer.scan()
+
+    def factor(self):
+        if self.look.kind == Token.LPAREN:
+            # factor -> '(' subexpr ')'
+            self.match(Token.LPAREN)
+            node = self.subexpr()
+            self.match(Token.RPAREN)
+            return node
+        else:
+            # factor -> CHARACTER
+            node = Character(self.look.value)
+            self.match(Token.CHARACTER)
+            return node
+
+    def star(self):
+        # star -> factor '*' | factor
+        node = self.factor()
+        if self.look.kind == Token.OPE_STAR:
+            self.match(Token.OPE_STAR)
+            node = Star(node)
+        return node
+
+    def seq(self):
+        if self.look.kind == Token.LPAREN \
+               or self.look.kind == Token.CHARACTER:
+            # seq -> subseq
+            return self.subseq()
+        else:
+            # seq -> ''
+            return Character("")
+
+    def subseq(self):
+        node1 = self.star()
+        if self.look.kind == Token.LPAREN \
+               or self.look.kind == Token.CHARACTER:
+            # subseq -> star subseq
+            node2 = self.subseq()
+            node  = Concat(node1, node2)
+            return node
+        else:
+            # subseq -> star
+            return node1
+
+    def subexpr(self):
+        node1 = self.seq()
+        if self.look.kind == Token.OPE_UNION:
+            self.match(Token.OPE_UNION)
+            # subexpr    -> seq '|' subexpr
+            node2 = self.subexpr()
+            node  = Union(node1, node2)
+            return node
+        else:
+            # subexpr    -> seq
+            return node1
+
+    def expression(self):
+        # expression -> subexpr EOF
+        node     = self.subexpr()
+        self.match(Token.EOF)
+
+        # Create TREE, NFA
+        context  = Context()
+        fragment = node.assemble(context)
+        return fragment.build()
+
+class Context(object):
+    def __init__(self):
+        self.stateCount = 0
+
+    def newState(self):
+        self.stateCount += 1
+        return self.stateCount
+
+class NFAFragment(object):
+    def __init__(self):
+        self.start   = None    # integer
+        self.accepts = None    # frozenset
+        self.map     = dict()  # transition
+
+    def connect(self, from_, char, to):
+        slot = self.map.setdefault((from_, char), set())
+        slot.add(to)
+
+    def newSkelton(self):
+        # copy fragment (self), and it return
+        newFrag = NFAFragment();
+        newFrag.map = copy.deepcopy(self.map)
+        return newFrag
+
+    def __or__(self, frag):
+        newFrag = self.newSkelton()
+        for k, v in frag.map.iteritems():
+            newFrag.map[k] = v.copy()
+        return newFrag
+
+    def build(self):
+        map_ = self.map
+        def transition(state, char):
+            return frozenset(map_.get((state, char), []))
+
+        return NondeterministicFiniteAutomaton(
+            transition,
+            self.start,
+            self.accepts,
+            self.map
+            )
+
+class Character(object):
+    def __init__(self, char):
+        self.char = char
+
+    def assemble(self, context):
+        frag = NFAFragment()
+        s1 = context.newState()
+        s2 = context.newState()
+        frag.connect(s1, self.char, s2)
+        frag.start = s1
+        frag.accepts = frozenset([s2])
+        return frag
+
+class Union(object):
+    def __init__(self, op1, op2):
+        self.op1 = op1
+        self.op2 = op2
+
+    def assemble(self, context):
+        frag1 = self.op1.assemble(context)
+        frag2 = self.op2.assemble(context)
+        frag = frag1 | frag2
+        s = context.newState()
+        frag.connect(s, "", frag1.start)
+        frag.connect(s, "", frag2.start)
+        frag.start = s
+        frag.accepts = frag1.accepts | frag2.accepts
+        return frag
+
+class Concat(object):
+    def __init__(self, op1, op2):
+        self.op1 = op1
+        self.op2 = op2
+
+    def assemble(self, context):
+        frag1 = self.op1.assemble(context)
+        frag2 = self.op2.assemble(context)
+        frag = frag1 | frag2
+
+        for state in frag1.accepts:
+            frag.connect(state, "", frag2.start)
+
+        frag.start   = frag1.start
+        frag.accepts = frag2.accepts
+        return frag
+
+class Star(object):
+    def __init__(self, op):
+        self.op = op
+
+    def assemble(self, context):
+        fragOrig = self.op.assemble(context)
+        frag = fragOrig.newSkelton()
+
+        for state in fragOrig.accepts:
+            frag.connect(state, "", fragOrig.start)
+
+        s = context.newState()
+        frag.connect(s, "", fragOrig.start)
+        frag.start = s
+        frag.accepts = fragOrig.accepts | frozenset([s])
+        return frag
+
+class NonDisjoinSets(object):
+    def __init__(self, sub):
+        self.sub = sub
+    def __contains__(self, a_set):
+        return a_set & self.sub
+
+def nfa2dfa(nfa):
+    def transition(set_, alpha):
+        ret = set()
+        for elem in set_:
+            ret |= nfa.transition(elem, alpha)
+        return nfa.epsilonExpand(frozenset(ret))
+
+    def stateSetTransition(stateSet, char):
+        trans = set()
+        for state in stateSet:
+            t = nfa.map.setdefault((state, char), None)
+            if not t == None: trans.update(nfa.epsilonExpand(t))
+        return frozenset(trans)
+
+    map_ = dict()
+    que =  set(frozenset([nfa.epsilonExpand(set([nfa.start]))]))
+    done = set()
+
+    while que:
+        stateSet = que.pop()
+
+        for state in stateSet:
+            for k, v in nfa.map.iteritems():
+                if state == k[0] and k[1] != '':
+                    slot = map_.setdefault((stateSet, k[1]), set())
+                    slot.update(nfa.epsilonExpand(v))
+
+        done.add(stateSet)
+
+        for v in map_.itervalues():
+            if not v in done:
+                que.add(frozenset(v))
+
+    return DeterministicFiniteAutomaton(
+        transition,
+        nfa.epsilonExpand(frozenset([nfa.start])),
+        NonDisjoinSets(nfa.accepts),
+        map_
+        )
+
+# create call graph (as dictionary)
+class CallGraph(object):
+    def __init__(self, dfa):
+        self.dfa = dfa
+        self.start = "state_"+self.set2name(dfa.start)
+        (self.map, self.accepts) = self.getCallGraph(self.dfa)
+
+    # return state name from set
+    def set2name(self, set_):
+        l = list(set_)
+        l.sort()
+        return '_'.join(str(x) for x in l)
+
+    def getCallGraph(self, dfa):
+
+        funcMap = dict()
+        funcMap[dfa.start] = set()
+
+        # dfa.map => {(frozenset([1, 15]), 'd'): frozenset([16]), ....}
+        #             : frozenset(1, 15) x 'd' -> frozenset([16])
+
+        for v in dfa.map.itervalues():
+            funcMap[frozenset(v)] = set()
+
+        for key in dfa.map.iterkeys():
+            slot = funcMap.setdefault(key[0], set())
+            slot.add(key[1])
+
+        for v in dfa.map.itervalues():
+            funcMap[frozenset(v)] = set()
+
+        for key in dfa.map.iterkeys():
+            slot = funcMap.setdefault(key[0], set())
+            slot.add(key[1])
+
+        callGraph = dict()
+        accepts = list()
+
+        # emit cbc code
+        for k in funcMap.iterkeys():
+            callGraph["state_"+self.set2name(k)] = set()
+
+        for k, v in funcMap.iteritems():
+            caseMap = dict()
+            for case in v:
+                caseMap[case] = "state_"+self.set2name(dfa.map[(frozenset(k), case)])
+            if k in dfa.accepts:
+                accepts.append("state_"+self.set2name(k))
+                caseMap['\\0'] = "accept"
+            callGraph["state_"+self.set2name(k)] = caseMap
+
+        return (callGraph, accepts)
+
+# emit CbC-souce, from CallGraph
+def dfa2CbC(cg, regexp, emitCFlag):
+    if emitCFlag:
+        fun_type = 'void '
+        call_t = ''
+    else:
+        fun_type = '__code '
+        call_t = 'goto '
+
+    # emit cbc(or c) code
+    print "#include <stdio.h>\n"
+    for k in cg.map.iterkeys():
+        print fun_type + k + "(char* s);"
+    print fun_type + 'accept(char* s);'
+    print fun_type + 'reject(char* s);'
+    print """
+int main(int argc, char* argv[]) {
+\tputs(\"regexp: %s\");
+\tputs(\"number of state: %d\");
+\tprintf(\"string: %%s\\n\", argv[1]);
+\t%s%s(argv[1]);
+\treturn 0;
+}\n""" % (regexp, len(cg.map), call_t, cg.start)
+
+    for k, v in cg.map.iteritems():
+        print fun_type + k + "(char* s) {"
+        print "\tswitch(*s++) {"
+        for case, next_state in v.iteritems():
+            print "\t\tcase '%s': " % (case)
+            print "\t\t\t%s%s(s);\n\t\t\tbreak;" % (call_t, next_state)
+
+        print "\t\tdefault: %sreject(s);\n\t}" % call_t
+        print "}\n"
+
+    print """
+%saccept(char* s) {
+\tprintf(\"\\nstring matches regexp. \\n\\n\");
+}\n""" % fun_type
+    print """
+%sreject(char* s) {
+\tprintf(\"\\nstring does not match regexp. \\n\\n\");
+}\n""" % fun_type
+
+class Regexp(object):
+    def __init__(self, regexp):
+        self.regexp = regexp
+        self.nfa    = None
+        self.dfa    = None
+        self._compile()
+
+    def _compile(self):
+        lexer_  = Lexer(self.regexp)
+        parser_ = Parser(lexer_)
+        self.nfa = parser_.expression()
+        self.dfa = nfa2dfa(self.nfa)
+
+    def emitCbC(self, emitCFlag=False):
+        cg = CallGraph(self.dfa)
+        dfa2CbC(cg, self.regexp, emitCFlag)
+
+    def matches(self, string):
+        runtime = self.dfa.getRuntime()
+        return runtime.doesAccept(string)
+
+def compile(regexp):
+    return Regexp(regexp)
+
+def main(argv):
+    myusage = "%prog [-C] regexp"
+    psr = OptionParser(usage=myusage)
+    psr.add_option("-C", action="store_true", dest="emitCFlag", default=False, help="emit C-source")
+    (opts, args) = psr.parse_args(sys.argv)
+    if len(args) == 1:
+        psr.print_help()
+        exit()
+    r = compile(args[1])
+    r.emitCbC(opts.emitCFlag)
+
+if __name__ == '__main__' : main(sys.argv)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/reg2llvm.py	Thu Jun 17 15:53:50 2010 +0900
@@ -0,0 +1,202 @@
+#!/usr/bin/env python
+
+# Import the llvm-py modules.
+# encodingfrom llvm import *
+from llvm.core import *
+from llvm.passes import *
+from llvm.ee import *  # new import: ee = Execution Engine
+from dfareg import *
+
+class RegLLVM(object):
+    # define llvm core types, and const
+    int_t = Type.int()
+    char_t = Type.int(8)
+    charptr_t = Type.pointer(char_t)
+    charptrptr_t = Type.pointer(charptr_t)
+    const_zero = Constant.int(int_t, 0)
+    const_one = Constant.int(int_t, 1)
+    llvm.GuaranteedTailCallOpt = True
+
+    def __init__(self, modName, regexp, string, impl_label, optimized, dump):
+        # Create a module
+        self.mod = Module.new(modName)
+        self.regexp = regexp
+        self.optimized = optimized
+        self.dump = dump
+        self.impl_label = impl_label
+
+        self.matchp_str = self.new_str_const(string)
+        self.dump_str = self.new_str_const("state: %s, arg: %c(int %d)\n")
+
+        def optional_func_decl(fun):
+            fun.calling_convertion = CC_X86_FASTCALL
+            fun.args[0].name = "index"
+
+        state_list = dict()
+        main = self.mod.add_function(
+            Type.function(self.int_t, (self.int_t,)), "main")
+        optional_func_decl(main)
+        main_entry = main.append_basic_block("entry")
+
+
+        if (impl_label):
+            accept_state = main.append_basic_block("accpet")
+            reject_state = main.append_basic_block("reject")
+            index_ptr = Builder.new(main_entry).malloc(self.int_t)
+            Builder.new(accept_state).free(index_ptr)
+            Builder.new(reject_state).free(index_ptr)
+        else:
+            # Create function - accept and reject (final state).
+            accept_state = self.mod.add_function(
+                Type.function(self.int_t, (self.int_t,)), "accept")
+            optional_func_decl(accept_state)
+            reject_state = self.mod.add_function(
+                Type.function(self.int_t, (self.int_t,)), "reject")
+            optional_func_decl(reject_state)
+
+        state_list["accept"] = accept_state
+        state_list["reject"] = reject_state
+
+        # create Regexp Object (Regexp has dfa)
+        r = Regexp(regexp)
+        self.cg = CallGraph(r.dfa)
+
+        # add state to module, (as function or label).
+        if (impl_label):
+            for state in self.cg.map.iterkeys():
+                label = main.append_basic_block(state)
+                state_list[state] = label
+        else:
+            for state in self.cg.map.iterkeys():
+                fun = self.mod.add_function(
+                    Type.function(self.int_t, (self.int_t,)), state)
+                optional_func_decl(fun)
+                state_list[state] = fun
+
+        # emit instructions
+        if (impl_label): emit = Builder.new(accept_state)
+        else:            emit = Builder.new(accept_state.append_basic_block("entry"))
+
+        if (dump): self.emit_call_printf(emit, "%s does match regexp\n", self.gep_first(emit, self.matchp_str))
+        emit.ret(self.const_one)
+
+        if (impl_label): emit = Builder.new(reject_state)
+        else:            emit = Builder.new(reject_state.append_basic_block("entry"))
+        if (dump): self.emit_call_printf(emit, "%s does not match regexp\n", self.gep_first(emit, self.matchp_str))
+        emit.ret(self.const_zero)
+
+        if (impl_label):
+            # emit transition instruction with jump instruction
+            emit = Builder.new(main_entry)
+            emit.store(main.args[0], index_ptr)
+            emit.branch(state_list[self.cg.start])
+
+            for state, transition in self.cg.map.iteritems():
+                emit = Builder.new(state_list[state])
+                index = emit.load(index_ptr)
+                ptr = emit.gep(self.matchp_str, (self.const_zero, index))
+                emit.store(emit.add(self.const_one, index), index_ptr)
+                char = emit.load(ptr)
+                si = emit.switch(char, state_list['reject'], len(transition))
+                local_label = 0
+                for case, next_state in transition.iteritems():
+                    bb = main.append_basic_block("%s_case%d" % (state, local_label))   #create default bb
+                    emit = Builder.new(bb)
+                    emit.branch(state_list[next_state])
+                    si.add_case(self.char_const(case), bb)
+                    local_label += 1
+        else:
+            for state, transition in self.cg.map.iteritems():
+                cases = dict()
+                for case, next_state in transition.iteritems():
+                    cases[self.char_const(case)] = state_list[next_state]
+                state_fun = state_list[state]
+                emit = Builder.new(state_fun.append_basic_block("entry"))
+                ptr = emit.gep(self.matchp_str, (self.const_zero, state_fun.args[0]))
+                next_index = emit.add(state_fun.args[0], self.const_one)
+                char = emit.load(ptr)
+
+                if (dump): self.emit_call_printf(emit, self.dump_str, self.gep_first(emit, self.new_str_const(fun.name)), char, char)
+
+                label = 0
+                default_bb = state_fun.append_basic_block("default") #create default bb
+                builder = Builder.new(default_bb)              # default is reject.
+                ret = builder.call(reject_state, (next_index,))
+                builder.ret(ret)
+
+                si = emit.switch(char, default_bb, len(cases)) # create switch instruction with deafult case.
+                for case, nextFun in cases.iteritems():
+                    bb = state_fun.append_basic_block("case%d" % label)   #create default bb
+                    builder = Builder.new(bb)
+                    ret = builder.call(nextFun, (next_index,))
+                    builder.ret(ret)
+                    si.add_case(case, bb)
+                    label += 1
+            emit = Builder.new(main_entry)
+            ret = emit.call(state_list[self.cg.start], (main.args[0],))
+            emit.ret(ret)
+
+        self.mp = ModuleProvider.new(self.mod)
+        if (optimized): self.optimize()
+        self.ee = ExecutionEngine.new(self.mp)
+        self.main = main
+
+    def getExecutionEngine(self):
+        return self.ee
+
+    def optimize(self):
+        #optimization passes
+        pm = PassManager.new()
+        pm.add(TargetData.new(''))
+        pm.add(PASS_FUNCTION_INLINING)
+        pm.run(self.mod)
+        fp = FunctionPassManager.new(self.mp)
+        fp.add(TargetData.new(''))
+        fp.add(PASS_BLOCK_PLACEMENT)
+        fp.add(PASS_INSTRUCTION_COMBINING)
+        fp.add(PASS_TAIL_CALL_ELIMINATION)
+        fp.add(PASS_AGGRESSIVE_DCE)
+        fp.add(PASS_DEAD_INST_ELIMINATION)
+        fp.add(PASS_DEAD_CODE_ELIMINATION)
+        for fun in self.mod.functions:
+            fp.run(fun)
+
+    def printModule(self):
+        print self.mod
+
+    def execute(self):
+        return self.ee.run_function(self.main,
+                                    (GenericValue.int(self.int_t, 0),))
+
+    def new_str_const(self, val):
+        '''create string(array of int) as a global value '''
+        str = self.mod.add_global_variable(Type.array(self.char_t, len(val) + 1), "")
+        str.initializer = Constant.stringz(val)
+        return str
+
+    def gep_first(self, emit, val):
+        '''get pointer of array'''
+        return emit.gep(val, (self.const_zero, self.const_zero))
+
+    def char_const(self, val):
+        '''create constant int value'''
+        if isinstance(val, str):
+            if val == '\\0':
+                return Constant.int(self.char_t, 0)
+            else:
+                return Constant.int(self.char_t, ord(val))
+        else:
+            exit('char_const: invalid argument.', val)
+
+    def emit_call_printf(self, emit, string, *args):
+        '''emit libc printf function call instruction'''
+        try:
+            printf = self.mod.get_function_named("printf")
+        except llvm.LLVMException:
+            printf = self.mod.add_function(
+                Type.function(Type.void(),
+                              (Type.pointer(self.char_t, 0),), 1), "printf")
+        if isinstance(string, str):
+            string = self.new_str_const(string)
+        emit.call(printf,
+                  [self.gep_first(emit, string)]+list(args))