view src/dfareg.py @ 9:973b596bd124

remove Regexp.emitDot()
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Sat, 03 Jul 2010 03:49:41 +0900
parents 11fba907c0af
children 2d00b46ce34d
line wrap: on
line source

#!/usr/bin/env python

import copy
import re
from optparse import OptionParser

class NondeterministicFiniteAutomaton(object):
    def __init__(self, transition, start, accepts, map_):
        self.transition = transition
        self.start = start
        self.accepts = accepts
        self.map = map_

    def epsilonExpand(self, set_):
        que = set(set_)
        done = set()
        while que:
            stat = que.pop()
            nexts = self.transition(stat, "")
            done.add(stat)
            for nextStat in nexts:
                if not nextStat in done: que.add(nextStat)
        return frozenset(done)

class DeterministicFiniteAutomaton(object):
    def __init__(self, transition, start, accepts, map_):
        self.transition = transition
        self.start = start
        self.accepts = accepts
        self.map = map_
    def getRuntime(self):
        return DFARuntime(self)

class DFARuntime(object):
    def __init__(self, DFA):
        self.DFA = DFA
        self.curState = self.DFA.start

    def doTransition(self, char):
        self.curState = self.DFA.transition(
            self.curState, char)

    def isAcceptState(self):
        return self.curState in self.DFA.accepts

    def doesAccept(self, input):
        for alphabet in input:
            self.doTransition(alphabet)
        return self.isAcceptState()

class Token(object):
    CHARACTER  = 0
    OPE_UNION  = 1
    OPE_STAR   = 2
    LPAREN     = 3
    RPAREN     = 4
    EOF        = 5

    def __init__(self, value, kind):
        self.value = value
        self.kind  = kind


class Lexer(object):
    def __init__(self, str):
        self.stringList = list(str)

    def scan(self):
        if not self.stringList:
            return Token(None, Token.EOF)

        ch = self.stringList.pop(0)
        if   ch == '\\':
            return Token(self.stringList.pop(0), Token.CHARACTER)
        elif ch == '|':
            return Token(ch, Token.OPE_UNION)
        elif ch == '(':
            return Token(ch, Token.LPAREN)
        elif ch == ')':
            return Token(ch, Token.RPAREN)
        elif ch == '*':
            return Token(ch, Token.OPE_STAR)
        else:
            return Token(ch, Token.CHARACTER)

"""Context-free Grammer:
 (A) expression -> subexpr EOF
 (B) subexpr -> seq '|' subexpr | seq
 (C) seq -> subseq | ''
 (D) subseq -> star subseq | star
 (E) star -> factor '*' | factor
 (F) factor -> '(' subexpr ')' | CHARACTER
"""

class Parser(object):
    def __init__(self, lexer):
        self.lexer = lexer
        self.look  = None
        self.move()

    def match(self, tag):
        if self.look.kind != tag:
            raise Exeption("syntax error")
        self.move()

    def move(self):
        self.look = self.lexer.scan()

    def factor(self):
        if self.look.kind == Token.LPAREN:
            # factor -> '(' subexpr ')'
            self.match(Token.LPAREN)
            node = self.subexpr()
            self.match(Token.RPAREN)
            return node
        else:
            # factor -> CHARACTER
            node = Character(self.look.value)
            self.match(Token.CHARACTER)
            return node

    def star(self):
        # star -> factor '*' | factor
        node = self.factor()
        if self.look.kind == Token.OPE_STAR:
            self.match(Token.OPE_STAR)
            node = Star(node)
        return node

    def seq(self):
        if self.look.kind == Token.LPAREN \
               or self.look.kind == Token.CHARACTER:
            # seq -> subseq
            return self.subseq()
        else:
            # seq -> ''
            return Character("")

    def subseq(self):
        node1 = self.star()
        if self.look.kind == Token.LPAREN \
               or self.look.kind == Token.CHARACTER:
            # subseq -> star subseq
            node2 = self.subseq()
            node  = Concat(node1, node2)
            return node
        else:
            # subseq -> star
            return node1

    def subexpr(self):
        node1 = self.seq()
        if self.look.kind == Token.OPE_UNION:
            self.match(Token.OPE_UNION)
            # subexpr    -> seq '|' subexpr
            node2 = self.subexpr()
            node  = Union(node1, node2)
            return node
        else:
            # subexpr    -> seq
            return node1

    def expression(self):
        # expression -> subexpr EOF
        node     = self.subexpr()
        self.match(Token.EOF)

        # Create TREE, NFA
        context  = Context()
        fragment = node.assemble(context)
        return fragment.build()

class Context(object):
    def __init__(self):
        self.stateCount = 0

    def newState(self):
        self.stateCount += 1
        return self.stateCount

class NFAFragment(object):
    def __init__(self):
        self.start   = None    # integer
        self.accepts = None    # frozenset
        self.map     = dict()  # transition

    def connect(self, from_, char, to):
        slot = self.map.setdefault((from_, char), set())
        slot.add(to)

    def newSkelton(self):
        # copy fragment (self), and it return
        newFrag = NFAFragment();
        newFrag.map = copy.deepcopy(self.map)
        return newFrag

    def __or__(self, frag):
        newFrag = self.newSkelton()
        for k, v in frag.map.iteritems():
            newFrag.map[k] = v.copy()
        return newFrag

    def build(self):
        map_ = self.map
        def transition(state, char):
            return frozenset(map_.get((state, char), []))

        return NondeterministicFiniteAutomaton(
            transition,
            self.start,
            self.accepts,
            self.map
            )

class Character(object):
    def __init__(self, char):
        self.char = char

    def assemble(self, context):
        frag = NFAFragment()
        s1 = context.newState()
        s2 = context.newState()
        frag.connect(s1, self.char, s2)
        frag.start = s1
        frag.accepts = frozenset([s2])
        return frag

class Union(object):
    def __init__(self, op1, op2):
        self.op1 = op1
        self.op2 = op2

    def assemble(self, context):
        frag1 = self.op1.assemble(context)
        frag2 = self.op2.assemble(context)
        frag = frag1 | frag2
        s = context.newState()
        frag.connect(s, "", frag1.start)
        frag.connect(s, "", frag2.start)
        frag.start = s
        frag.accepts = frag1.accepts | frag2.accepts
        return frag

class Concat(object):
    def __init__(self, op1, op2):
        self.op1 = op1
        self.op2 = op2

    def assemble(self, context):
        frag1 = self.op1.assemble(context)
        frag2 = self.op2.assemble(context)
        frag = frag1 | frag2

        for state in frag1.accepts:
            frag.connect(state, "", frag2.start)

        frag.start   = frag1.start
        frag.accepts = frag2.accepts
        return frag

class Star(object):
    def __init__(self, op):
        self.op = op

    def assemble(self, context):
        fragOrig = self.op.assemble(context)
        frag = fragOrig.newSkelton()

        for state in fragOrig.accepts:
            frag.connect(state, "", fragOrig.start)

        s = context.newState()
        frag.connect(s, "", fragOrig.start)
        frag.start = s
        frag.accepts = fragOrig.accepts | frozenset([s])
        return frag

class NonDisjoinSets(object):
    def __init__(self, sub):
        self.sub = sub
    def __contains__(self, a_set):
        return a_set & self.sub

def nfa2dfa(nfa):
    def transition(set_, alpha):
        ret = set()
        for elem in set_:
            ret |= nfa.transition(elem, alpha)
        return nfa.epsilonExpand(frozenset(ret))

    def stateSetTransition(stateSet, char):
        trans = set()
        for state in stateSet:
            t = nfa.map.setdefault((state, char), None)
            if not t == None: trans.update(nfa.epsilonExpand(t))
        return frozenset(trans)

    map_ = dict()
    que =  set(frozenset([nfa.epsilonExpand(set([nfa.start]))]))
    done = set()

    while que:
        stateSet = que.pop()

        for state in stateSet:
            for k, v in nfa.map.iteritems():
                if state == k[0] and k[1] != '':
                    slot = map_.setdefault((stateSet, k[1]), set())
                    slot.update(nfa.epsilonExpand(v))

        done.add(stateSet)

        for v in map_.itervalues():
            if not v in done:
                que.add(frozenset(v))

    return DeterministicFiniteAutomaton(
        transition,
        nfa.epsilonExpand(frozenset([nfa.start])),
        NonDisjoinSets(nfa.accepts),
        map_
        )

# create call graph (as dictionary)
class CallGraph(object):
    """CallGraph is State Transition Diagram.
    it's can be create from DFA or DFA.
    >>> reg = Regexp(\"AA*|B\")
    >>> dfacg = CallGraph(reg.dfa)
    >>> nfacg = CallGraph(reg.nfa)
    >>> dfacg.map
    {'1_6_8': {'A': ['2_3_5'], 'B': ['7']}, '2_3_5': {'A': ['3_4']}, '7': {}, '3_4': {'A': ['3_4']}}
    >>> dfacg.states
    set(['1_6_8', '2_3_5', '7', '3_4'])
    >>> dfacg.accepts
    set(['2_3_5', '7', '3_4'])
    >>> nfacg.map
    {'1': {'A': ['2']}, '3': {'A': ['4']}, '2': {'': ['5']}, '5': {'': ['3']}, '4': {'': ['3']}, '7': {}, '6': {'B': ['7']}, '8': {'': ['1', '6']}}
    >>> nfacg.states
    set(['1', '3', '2', '5', '4', '7', '6', '8'])
    >>> nfacg.accepts
    set(['5', '4', '7'])
    """

    def __init__(self, fa):
        self.fa = fa
        if isinstance(fa, DeterministicFiniteAutomaton):
            self.type = "DFA"
            self.start = self.set2name(fa.start)
        else:
            self.type = "NFA"
            self.start = str(fa.start)
        self.inputs = set()
        self.states = set()
        (self.map, self.accepts) = self.getCallGraph(self.fa)

    # return state name from set
    def set2name(self, set_):
        l = list(set_)
        l.sort()
        return ('_'.join(str(x) for x in l))

    def getCallGraph(self, fa):
        transitionCases = dict()
        if self.type is "DFA":
            transitionCases[fa.start] = set()
        else:
            transitionCases[frozenset([fa.start])] = set()

        # transitionCases is hash table that binds a state and inputs,
        # it's useful for transition definition

        # fa is dfa or nfa
        # dfa.map => {(frozenset([1, 15]), 'd'): frozenset([16]), ....}
        #             : frozenset(1, 15) x 'd' -> frozenset([16])
        # nfa.map => {(6, ''): set([5, 7]), (1, 'A'): set([2]),   ....}
        #             : 6 x '' -> set([5, 7])

        for nextState in fa.map.itervalues():
            if self.type is "DFA":
                transitionCases[frozenset(nextState)] = set()
            else:
                for nextState_ in nextState:
                    transitionCases[frozenset([nextState_])] = set()

        for (state, input) in fa.map.iterkeys():
            if self.type is "NFA":
                state = [state]
            slot = transitionCases.setdefault(frozenset(state), set())
            slot.add(input)

        callGraph = dict()
        accepts = set()

        # create CallGraph
        for state in transitionCases.iterkeys():
            callGraph[self.set2name(state)] = set()

        for (state, input) in transitionCases.iteritems():
            caseMap = dict()
            self.states.add(self.set2name(state))
            if self.type is "DFA":
                for case in input:
                    self.inputs.add(case)
                    caseMap[case] = [self.set2name(fa.map[frozenset(state), case])]
                if state in fa.accepts:
                    accepts.add(self.set2name(state))
            else:
                self.states.add(str(list(state)[0]))
                for case in input:
                    self.inputs.add(case)
                    caseMap[case] = [str(next_) for next_ in fa.map[list(state)[0], case]]
                if list(state)[0] in fa.accepts:
                    accepts.add(self.set2name(state))
            callGraph[self.set2name(state)] = caseMap
        return (callGraph, accepts)


class Regexp(object):
    def __init__(self, regexp):
        self.regexp = regexp
        self.nfa    = None
        self.dfa    = None
        self._compile()

    def _compile(self):
        lexer_  = Lexer(self.regexp)
        parser_ = Parser(lexer_)
        self.nfa = parser_.expression()
        self.dfa = nfa2dfa(self.nfa)

    def matches(self, string):
        runtime = self.dfa.getRuntime()
        return runtime.doesAccept(string)

def compile(regexp):
    return Regexp(regexp)

def test():
    import doctest
    doctest.testmod()

if __name__ == '__main__' : test()