changeset 62:70947ddafad7

added test code, CbC-example/regexp
author ryoma <e075725@ie.u-ryukyu.ac.jp>
date Sun, 25 Apr 2010 17:46:01 +0900
parents 60c1b2f8487a
children b362627d71ba
files CbC-examples/regexp/dfareg CbC-examples/regexp/dfareg.c CbC-examples/regexp/dfareg.py
diffstat 3 files changed, 596 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
Binary file CbC-examples/regexp/dfareg has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CbC-examples/regexp/dfareg.c	Sun Apr 25 17:46:01 2010 +0900
@@ -0,0 +1,182 @@
+#include <stdio.h>
+
+#include <stdlib.h>
+
+__code state_20_21(char* s);
+__code state_1(char* s);
+__code state_3_8_9_13_23_24(char* s);
+__code state_6_7(char* s);
+__code state_2_3_9_13_23_24_25(char* s);
+__code state_10_11(char* s);
+__code state_18_19(char* s);
+__code state_4_5(char* s);
+__code state_14_15(char* s);
+__code state_3_9_13_22_23_24(char* s);
+__code state_3_9_12_13_23_24(char* s);
+__code state_16_17(char* s);
+__code accept();
+__code reject();
+
+int main(int argc, char* argv[]) {
+	if (argc == 1) {
+		printf("usage: %s text\n", argv[0]);
+		exit(0);
+}
+
+	puts("regexp: P(erl|HP|ython)*");
+	puts("number of state: 12");
+	printf("string: %s\n", argv[1]);
+	goto state_1(argv[1]);
+	return 0;
+}
+
+__code state_20_21(char* s) {
+	switch(*s++) {
+		case 'n': 
+			goto state_3_9_13_22_23_24(s);
+			break;
+		default: goto reject();
+	}
+}
+
+__code state_1(char* s) {
+	switch(*s++) {
+		case 'P': 
+			goto state_2_3_9_13_23_24_25(s);
+			break;
+		default: goto reject();
+	}
+}
+
+__code state_3_8_9_13_23_24(char* s) {
+	switch(*s++) {
+		case 'y': 
+			goto state_14_15(s);
+			break;
+		case 'H': 
+			goto state_10_11(s);
+			break;
+		case 'e': 
+			goto state_4_5(s);
+			break;
+		case '\0': goto accept();
+			break;
+		default: goto reject();
+	}
+}
+
+__code state_6_7(char* s) {
+	switch(*s++) {
+		case 'l': 
+			goto state_3_8_9_13_23_24(s);
+			break;
+		default: goto reject();
+	}
+}
+
+__code state_2_3_9_13_23_24_25(char* s) {
+	switch(*s++) {
+		case 'y': 
+			goto state_14_15(s);
+			break;
+		case 'H': 
+			goto state_10_11(s);
+			break;
+		case 'e': 
+			goto state_4_5(s);
+			break;
+		case '\0': goto accept();
+			break;
+		default: goto reject();
+	}
+}
+
+__code state_10_11(char* s) {
+	switch(*s++) {
+		case 'P': 
+			goto state_3_9_12_13_23_24(s);
+			break;
+		default: goto reject();
+	}
+}
+
+__code state_18_19(char* s) {
+	switch(*s++) {
+		case 'o': 
+			goto state_20_21(s);
+			break;
+		default: goto reject();
+	}
+}
+
+__code state_4_5(char* s) {
+	switch(*s++) {
+		case 'r': 
+			goto state_6_7(s);
+			break;
+		default: goto reject();
+	}
+}
+
+__code state_14_15(char* s) {
+	switch(*s++) {
+		case 't': 
+			goto state_16_17(s);
+			break;
+		default: goto reject();
+	}
+}
+
+__code state_3_9_13_22_23_24(char* s) {
+	switch(*s++) {
+		case 'y': 
+			goto state_14_15(s);
+			break;
+		case 'H': 
+			goto state_10_11(s);
+			break;
+		case 'e': 
+			goto state_4_5(s);
+			break;
+		case '\0': goto accept();
+			break;
+		default: goto reject();
+	}
+}
+
+__code state_3_9_12_13_23_24(char* s) {
+	switch(*s++) {
+		case 'y': 
+			goto state_14_15(s);
+			break;
+		case 'H': 
+			goto state_10_11(s);
+			break;
+		case 'e': 
+			goto state_4_5(s);
+			break;
+		case '\0': goto accept();
+			break;
+		default: goto reject();
+	}
+}
+
+__code state_16_17(char* s) {
+	switch(*s++) {
+		case 'h': 
+			goto state_18_19(s);
+			break;
+		default: goto reject();
+	}
+}
+
+
+__code accept() {
+	printf("\nstring matches regexp. \n\n");
+}
+
+
+__code reject() {
+	printf("\nstring does not match regexp. \n\n");
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CbC-examples/regexp/dfareg.py	Sun Apr 25 17:46:01 2010 +0900
@@ -0,0 +1,414 @@
+#!/usr/bin/env python
+
+import copy
+import sys
+
+class NondeterministicFiniteAutomaton(object):
+    def __init__(self, transition, start, accepts, map_):
+        self.transition = transition
+        self.start = start
+        self.accepts = accepts
+        self.map = map_
+
+    def epsilonExpand(self, set_):
+        que = set(set_)
+        done = set()
+        while que:
+            stat = que.pop()
+            nexts = self.transition(stat, "")
+            done.add(stat)
+            for nextStat in nexts:
+                if not nextStat in done: que.add(nextStat)
+        return frozenset(done)
+
+class DeterministicFiniteAutomaton(object):
+    def __init__(self, transition, start, accepts, map_):
+        self.transition = transition
+        self.start = start
+        self.accepts = accepts
+        self.map = map_
+    def getRuntime(self):
+        return DFARuntime(self)
+
+class DFARuntime(object):
+    def __init__(self, DFA):
+        self.DFA = DFA
+        self.curState = self.DFA.start
+
+    def doTransition(self, char):
+        self.curState = self.DFA.transition(
+            self.curState, char)
+
+    def isAcceptState(self):
+        return self.curState in self.DFA.accepts
+
+    def doesAccept(self, input):
+        for alphabet in input:
+            self.doTransition(alphabet)
+        return self.isAcceptState()
+
+class Token(object):
+    CHARACTER  = 0
+    OPE_UNION  = 1
+    OPE_STAR   = 2
+    LPAREN     = 3
+    RPAREN     = 4
+    EOF        = 5
+
+    def __init__(self, value, kind):
+        self.value = value
+        self.kind  = kind
+
+
+class Lexer(object):
+    def __init__(self, str):
+        self.stringList = list(str)
+
+    def scan(self):
+        if not self.stringList:
+            return Token(None, Token.EOF)
+
+        ch = self.stringList.pop(0)
+        if   ch == '\\':
+            return Token(self.stringList.pop(0), Token.CHARACTER)
+        elif ch == '|':
+            return Token(ch, Token.OPE_UNION)
+        elif ch == '(':
+            return Token(ch, Token.LPAREN)
+        elif ch == ')':
+            return Token(ch, Token.RPAREN)
+        elif ch == '*':
+            return Token(ch, Token.OPE_STAR)
+        else:
+            return Token(ch, Token.CHARACTER)
+
+"""
+ Context-free Grammer:
+ (A) expression -> subexpr EOF
+ (B) subexpr -> seq '|' subexpr | seq
+ (C) seq -> subseq | ''
+ (D) subseq -> star subseq | star
+ (E) star -> factor '*' | factor
+ (F) factor -> '(' subexpr ')' | CHARACTER
+"""
+
+class Parser(object):
+    def __init__(self, lexer):
+        self.lexer = lexer
+        self.look  = None
+        self.move()
+
+    def match(self, tag):
+        if self.look.kind != tag:
+            raise Exeption("syntax error")
+        self.move()
+
+    def move(self):
+        self.look = self.lexer.scan()
+
+    def factor(self):
+        if self.look.kind == Token.LPAREN:
+            # factor -> '(' subexpr ')'
+            self.match(Token.LPAREN)
+            node = self.subexpr()
+            self.match(Token.RPAREN)
+            return node
+        else:
+            # factor -> CHARACTER
+            node = Character(self.look.value)
+            self.match(Token.CHARACTER)
+            return node
+
+    def star(self):
+        # star -> factor '*' | factor
+        node = self.factor()
+        if self.look.kind == Token.OPE_STAR:
+            self.match(Token.OPE_STAR)
+            node = Star(node)
+        return node
+
+    def seq(self):
+        if self.look.kind == Token.LPAREN \
+               or self.look.kind == Token.CHARACTER:
+            # seq -> subseq
+            return self.subseq()
+        else:
+            # seq -> ''
+            return Character("")
+
+    def subseq(self):
+        node1 = self.star()
+        if self.look.kind == Token.LPAREN \
+               or self.look.kind == Token.CHARACTER:
+            # subseq -> star subseq
+            node2 = self.subseq()
+            node  = Concat(node1, node2)
+            return node
+        else:
+            # subseq -> star
+            return node1
+
+    def subexpr(self):
+        node1 = self.seq()
+        if self.look.kind == Token.OPE_UNION:
+            self.match(Token.OPE_UNION)
+            # subexpr    -> seq '|' subexpr
+            node2 = self.subexpr()
+            node  = Union(node1, node2)
+            return node
+        else:
+            # subexpr    -> seq
+            return node1
+
+    def expression(self):
+        # expression -> subexpr EOF
+        node     = self.subexpr()
+        self.match(Token.EOF)
+
+        # Create TREE, NFA
+        context  = Context()
+        fragment = node.assemble(context)
+        return fragment.build()
+
+class Context(object):
+    def __init__(self):
+        self.stateCount = 0
+
+    def newState(self):
+        self.stateCount += 1
+        return self.stateCount
+
+class NFAFragment(object):
+    def __init__(self):
+        self.start   = None    # integer
+        self.accepts = None    # frozenset
+        self.map     = dict()  # transition
+
+    def connect(self, from_, char, to):
+        slot = self.map.setdefault((from_, char), set())
+        slot.add(to)
+
+    def newSkelton(self):
+        # copy fragment (self), and it return
+        newFrag = NFAFragment();
+        newFrag.map = copy.deepcopy(self.map)
+        return newFrag
+
+    def __or__(self, frag):
+        newFrag = self.newSkelton()
+        for k, v in frag.map.iteritems():
+            newFrag.map[k] = v.copy()
+        return newFrag
+
+    def build(self):
+        map_ = self.map
+        def transition(state, char):
+            return frozenset(map_.get((state, char), []))
+
+        return NondeterministicFiniteAutomaton(
+            transition,
+            self.start,
+            self.accepts,
+            self.map
+            )
+
+class Character(object):
+    def __init__(self, char):
+        self.char = char
+
+    def assemble(self, context):
+        frag = NFAFragment()
+        s1 = context.newState()
+        s2 = context.newState()
+        frag.connect(s1, self.char, s2)
+        frag.start = s1
+        frag.accepts = frozenset([s2])
+        return frag
+
+class Union(object):
+    def __init__(self, op1, op2):
+        self.op1 = op1
+        self.op2 = op2
+
+    def assemble(self, context):
+        frag1 = self.op1.assemble(context)
+        frag2 = self.op2.assemble(context)
+        frag = frag1 | frag2
+        s = context.newState()
+        frag.connect(s, "", frag1.start)
+        frag.connect(s, "", frag2.start)
+        frag.start = s
+        frag.accepts = frag1.accepts | frag2.accepts
+        return frag
+
+class Concat(object):
+    def __init__(self, op1, op2):
+        self.op1 = op1
+        self.op2 = op2
+
+    def assemble(self, context):
+        frag1 = self.op1.assemble(context)
+        frag2 = self.op2.assemble(context)
+        frag = frag1 | frag2
+
+        for state in frag1.accepts:
+            frag.connect(state, "", frag2.start)
+
+        frag.start   = frag1.start
+        frag.accepts = frag2.accepts
+        return frag
+
+class Star(object):
+    def __init__(self, op):
+        self.op = op
+
+    def assemble(self, context):
+        fragOrig = self.op.assemble(context)
+        frag = fragOrig.newSkelton()
+
+        for state in fragOrig.accepts:
+            frag.connect(state, "", fragOrig.start)
+
+        s = context.newState()
+        frag.connect(s, "", fragOrig.start)
+        frag.start = s
+        frag.accepts = fragOrig.accepts | frozenset([s])
+        return frag
+
+class NonDisjoinSets(object):
+    def __init__(self, sub):
+        self.sub = sub
+    def __contains__(self, a_set):
+        return a_set & self.sub
+
+def nfa2dfa(nfa):
+    def transition(set_, alpha):
+        ret = set()
+        for elem in set_:
+            ret |= nfa.transition(elem, alpha)
+        return nfa.epsilonExpand(frozenset(ret))
+
+    def stateSetTransition(stateSet, char):
+        trans = set()
+        for state in stateSet:
+            t = nfa.map.setdefault((state, char), None)
+            if not t == None: trans.update(nfa.epsilonExpand(t))
+        return frozenset(trans)
+
+    map_ = dict()
+    que =  set(frozenset([nfa.epsilonExpand(set([nfa.start]))]))
+    done = set()
+
+    while que:
+        stateSet = que.pop()
+
+        for state in stateSet:
+            for k, v in nfa.map.iteritems():
+                if state == k[0] and k[1] != '':
+                    slot = map_.setdefault((stateSet, k[1]), set())
+                    slot.update(nfa.epsilonExpand(v))
+
+        done.add(stateSet)
+
+        for v in map_.itervalues():
+            if not v in done:
+                que.add(frozenset(v))
+
+    return DeterministicFiniteAutomaton(
+        transition,
+        nfa.epsilonExpand(frozenset([nfa.start])),
+        NonDisjoinSets(nfa.accepts),
+        map_
+        )
+
+# emit CbC-souce, from dfa
+def dfa2CbC(dfa, regexp):
+
+    # return state name from set
+    def set2name(set_):
+        l = list(set_)
+        l.sort()
+        return '_'.join(str(x) for x in l)
+
+    funcMap = dict()
+    funcMap[dfa.start] = set()
+
+    # dfa.map => {(frozenset([1, 15]), 'd'): frozenset([16]), ....}
+    #             : frozenset(1, 15) x 'd' -> frozenset([16])
+
+    for v in dfa.map.itervalues():
+        funcMap[frozenset(v)] = set()
+
+    for key in dfa.map.iterkeys():
+        slot = funcMap.setdefault(key[0], set())
+        slot.add(key[1])
+
+    # emit cbc code
+    print "#include <stdio.h>\n"
+    print "#include <stdlib.h>\n"
+    for k in funcMap.iterkeys():
+        print '__code state_' + set2name(k) + "(char* s);"
+    print '__code accept();'
+    print '__code reject();'
+    print """
+int main(int argc, char* argv[]) {
+\tif (argc == 1) {
+\t\tprintf(\"usage: %%s text\\n\", argv[0]);
+\t\texit(0);
+}
+
+\tputs(\"regexp: %s\");
+\tputs(\"number of state: %d\");
+\tprintf(\"string: %%s\\n\", argv[1]);
+\tgoto state_%s(argv[1]);
+\treturn 0;
+}\n""" % (regexp, len(funcMap), set2name(dfa.start))
+
+    for k, v in funcMap.iteritems():
+        print '__code state_' + set2name(k) + "(char* s) {"
+        print "\tswitch(*s++) {"
+        for case in v:
+            print "\t\tcase '%c': " % (case)
+            print "\t\t\tgoto state_%s(s);\n\t\t\tbreak;" % set2name(dfa.map[(frozenset(k), case)])
+        if k in dfa.accepts: print "\t\tcase '\\0': goto accept();\n\t\t\tbreak;"
+        print "\t\tdefault: goto reject();\n\t}"
+        print "}\n"
+
+    print """
+__code accept() {
+\tprintf(\"\\nstring matches regexp. \\n\\n\");
+}\n"""
+    print """
+__code reject() {
+\tprintf(\"\\nstring does not match regexp. \\n\\n\");
+}\n"""
+
+class Regexp(object):
+    def __init__(self, regexp):
+        self.regexp = regexp
+        self.nfa    = None
+        self.dfa    = None
+        self._compile()
+
+    def _compile(self):
+        lexer_  = Lexer(self.regexp)
+        parser_ = Parser(lexer_)
+        self.nfa = parser_.expression()
+        self.dfa = nfa2dfa(self.nfa)
+
+    def emitCbC(self):
+        dfa2CbC(self.dfa, self.regexp)
+
+    def matches(self, string):
+        runtime = self.dfa.getRuntime()
+        return runtime.doesAccept(string)
+
+def compile(regexp):
+    return Regexp(regexp)
+
+def main(argv):
+    if len(argv) == 1 : exit("usage: dfareg.py regexp [> file]")
+    r = compile(argv[1])
+    r.emitCbC()
+
+if __name__ == '__main__' : main(sys.argv)