Mercurial > hg > CbC > CbC_gcc
comparison CbC-examples/regexp/dfareg.py @ 62:70947ddafad7
added test code, CbC-example/regexp
author | ryoma <e075725@ie.u-ryukyu.ac.jp> |
---|---|
date | Sun, 25 Apr 2010 17:46:01 +0900 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
61:60c1b2f8487a | 62:70947ddafad7 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import copy | |
4 import sys | |
5 | |
6 class NondeterministicFiniteAutomaton(object): | |
7 def __init__(self, transition, start, accepts, map_): | |
8 self.transition = transition | |
9 self.start = start | |
10 self.accepts = accepts | |
11 self.map = map_ | |
12 | |
13 def epsilonExpand(self, set_): | |
14 que = set(set_) | |
15 done = set() | |
16 while que: | |
17 stat = que.pop() | |
18 nexts = self.transition(stat, "") | |
19 done.add(stat) | |
20 for nextStat in nexts: | |
21 if not nextStat in done: que.add(nextStat) | |
22 return frozenset(done) | |
23 | |
24 class DeterministicFiniteAutomaton(object): | |
25 def __init__(self, transition, start, accepts, map_): | |
26 self.transition = transition | |
27 self.start = start | |
28 self.accepts = accepts | |
29 self.map = map_ | |
30 def getRuntime(self): | |
31 return DFARuntime(self) | |
32 | |
33 class DFARuntime(object): | |
34 def __init__(self, DFA): | |
35 self.DFA = DFA | |
36 self.curState = self.DFA.start | |
37 | |
38 def doTransition(self, char): | |
39 self.curState = self.DFA.transition( | |
40 self.curState, char) | |
41 | |
42 def isAcceptState(self): | |
43 return self.curState in self.DFA.accepts | |
44 | |
45 def doesAccept(self, input): | |
46 for alphabet in input: | |
47 self.doTransition(alphabet) | |
48 return self.isAcceptState() | |
49 | |
50 class Token(object): | |
51 CHARACTER = 0 | |
52 OPE_UNION = 1 | |
53 OPE_STAR = 2 | |
54 LPAREN = 3 | |
55 RPAREN = 4 | |
56 EOF = 5 | |
57 | |
58 def __init__(self, value, kind): | |
59 self.value = value | |
60 self.kind = kind | |
61 | |
62 | |
63 class Lexer(object): | |
64 def __init__(self, str): | |
65 self.stringList = list(str) | |
66 | |
67 def scan(self): | |
68 if not self.stringList: | |
69 return Token(None, Token.EOF) | |
70 | |
71 ch = self.stringList.pop(0) | |
72 if ch == '\\': | |
73 return Token(self.stringList.pop(0), Token.CHARACTER) | |
74 elif ch == '|': | |
75 return Token(ch, Token.OPE_UNION) | |
76 elif ch == '(': | |
77 return Token(ch, Token.LPAREN) | |
78 elif ch == ')': | |
79 return Token(ch, Token.RPAREN) | |
80 elif ch == '*': | |
81 return Token(ch, Token.OPE_STAR) | |
82 else: | |
83 return Token(ch, Token.CHARACTER) | |
84 | |
85 """ | |
86 Context-free Grammer: | |
87 (A) expression -> subexpr EOF | |
88 (B) subexpr -> seq '|' subexpr | seq | |
89 (C) seq -> subseq | '' | |
90 (D) subseq -> star subseq | star | |
91 (E) star -> factor '*' | factor | |
92 (F) factor -> '(' subexpr ')' | CHARACTER | |
93 """ | |
94 | |
95 class Parser(object): | |
96 def __init__(self, lexer): | |
97 self.lexer = lexer | |
98 self.look = None | |
99 self.move() | |
100 | |
101 def match(self, tag): | |
102 if self.look.kind != tag: | |
103 raise Exeption("syntax error") | |
104 self.move() | |
105 | |
106 def move(self): | |
107 self.look = self.lexer.scan() | |
108 | |
109 def factor(self): | |
110 if self.look.kind == Token.LPAREN: | |
111 # factor -> '(' subexpr ')' | |
112 self.match(Token.LPAREN) | |
113 node = self.subexpr() | |
114 self.match(Token.RPAREN) | |
115 return node | |
116 else: | |
117 # factor -> CHARACTER | |
118 node = Character(self.look.value) | |
119 self.match(Token.CHARACTER) | |
120 return node | |
121 | |
122 def star(self): | |
123 # star -> factor '*' | factor | |
124 node = self.factor() | |
125 if self.look.kind == Token.OPE_STAR: | |
126 self.match(Token.OPE_STAR) | |
127 node = Star(node) | |
128 return node | |
129 | |
130 def seq(self): | |
131 if self.look.kind == Token.LPAREN \ | |
132 or self.look.kind == Token.CHARACTER: | |
133 # seq -> subseq | |
134 return self.subseq() | |
135 else: | |
136 # seq -> '' | |
137 return Character("") | |
138 | |
139 def subseq(self): | |
140 node1 = self.star() | |
141 if self.look.kind == Token.LPAREN \ | |
142 or self.look.kind == Token.CHARACTER: | |
143 # subseq -> star subseq | |
144 node2 = self.subseq() | |
145 node = Concat(node1, node2) | |
146 return node | |
147 else: | |
148 # subseq -> star | |
149 return node1 | |
150 | |
151 def subexpr(self): | |
152 node1 = self.seq() | |
153 if self.look.kind == Token.OPE_UNION: | |
154 self.match(Token.OPE_UNION) | |
155 # subexpr -> seq '|' subexpr | |
156 node2 = self.subexpr() | |
157 node = Union(node1, node2) | |
158 return node | |
159 else: | |
160 # subexpr -> seq | |
161 return node1 | |
162 | |
163 def expression(self): | |
164 # expression -> subexpr EOF | |
165 node = self.subexpr() | |
166 self.match(Token.EOF) | |
167 | |
168 # Create TREE, NFA | |
169 context = Context() | |
170 fragment = node.assemble(context) | |
171 return fragment.build() | |
172 | |
173 class Context(object): | |
174 def __init__(self): | |
175 self.stateCount = 0 | |
176 | |
177 def newState(self): | |
178 self.stateCount += 1 | |
179 return self.stateCount | |
180 | |
181 class NFAFragment(object): | |
182 def __init__(self): | |
183 self.start = None # integer | |
184 self.accepts = None # frozenset | |
185 self.map = dict() # transition | |
186 | |
187 def connect(self, from_, char, to): | |
188 slot = self.map.setdefault((from_, char), set()) | |
189 slot.add(to) | |
190 | |
191 def newSkelton(self): | |
192 # copy fragment (self), and it return | |
193 newFrag = NFAFragment(); | |
194 newFrag.map = copy.deepcopy(self.map) | |
195 return newFrag | |
196 | |
197 def __or__(self, frag): | |
198 newFrag = self.newSkelton() | |
199 for k, v in frag.map.iteritems(): | |
200 newFrag.map[k] = v.copy() | |
201 return newFrag | |
202 | |
203 def build(self): | |
204 map_ = self.map | |
205 def transition(state, char): | |
206 return frozenset(map_.get((state, char), [])) | |
207 | |
208 return NondeterministicFiniteAutomaton( | |
209 transition, | |
210 self.start, | |
211 self.accepts, | |
212 self.map | |
213 ) | |
214 | |
215 class Character(object): | |
216 def __init__(self, char): | |
217 self.char = char | |
218 | |
219 def assemble(self, context): | |
220 frag = NFAFragment() | |
221 s1 = context.newState() | |
222 s2 = context.newState() | |
223 frag.connect(s1, self.char, s2) | |
224 frag.start = s1 | |
225 frag.accepts = frozenset([s2]) | |
226 return frag | |
227 | |
228 class Union(object): | |
229 def __init__(self, op1, op2): | |
230 self.op1 = op1 | |
231 self.op2 = op2 | |
232 | |
233 def assemble(self, context): | |
234 frag1 = self.op1.assemble(context) | |
235 frag2 = self.op2.assemble(context) | |
236 frag = frag1 | frag2 | |
237 s = context.newState() | |
238 frag.connect(s, "", frag1.start) | |
239 frag.connect(s, "", frag2.start) | |
240 frag.start = s | |
241 frag.accepts = frag1.accepts | frag2.accepts | |
242 return frag | |
243 | |
244 class Concat(object): | |
245 def __init__(self, op1, op2): | |
246 self.op1 = op1 | |
247 self.op2 = op2 | |
248 | |
249 def assemble(self, context): | |
250 frag1 = self.op1.assemble(context) | |
251 frag2 = self.op2.assemble(context) | |
252 frag = frag1 | frag2 | |
253 | |
254 for state in frag1.accepts: | |
255 frag.connect(state, "", frag2.start) | |
256 | |
257 frag.start = frag1.start | |
258 frag.accepts = frag2.accepts | |
259 return frag | |
260 | |
261 class Star(object): | |
262 def __init__(self, op): | |
263 self.op = op | |
264 | |
265 def assemble(self, context): | |
266 fragOrig = self.op.assemble(context) | |
267 frag = fragOrig.newSkelton() | |
268 | |
269 for state in fragOrig.accepts: | |
270 frag.connect(state, "", fragOrig.start) | |
271 | |
272 s = context.newState() | |
273 frag.connect(s, "", fragOrig.start) | |
274 frag.start = s | |
275 frag.accepts = fragOrig.accepts | frozenset([s]) | |
276 return frag | |
277 | |
278 class NonDisjoinSets(object): | |
279 def __init__(self, sub): | |
280 self.sub = sub | |
281 def __contains__(self, a_set): | |
282 return a_set & self.sub | |
283 | |
284 def nfa2dfa(nfa): | |
285 def transition(set_, alpha): | |
286 ret = set() | |
287 for elem in set_: | |
288 ret |= nfa.transition(elem, alpha) | |
289 return nfa.epsilonExpand(frozenset(ret)) | |
290 | |
291 def stateSetTransition(stateSet, char): | |
292 trans = set() | |
293 for state in stateSet: | |
294 t = nfa.map.setdefault((state, char), None) | |
295 if not t == None: trans.update(nfa.epsilonExpand(t)) | |
296 return frozenset(trans) | |
297 | |
298 map_ = dict() | |
299 que = set(frozenset([nfa.epsilonExpand(set([nfa.start]))])) | |
300 done = set() | |
301 | |
302 while que: | |
303 stateSet = que.pop() | |
304 | |
305 for state in stateSet: | |
306 for k, v in nfa.map.iteritems(): | |
307 if state == k[0] and k[1] != '': | |
308 slot = map_.setdefault((stateSet, k[1]), set()) | |
309 slot.update(nfa.epsilonExpand(v)) | |
310 | |
311 done.add(stateSet) | |
312 | |
313 for v in map_.itervalues(): | |
314 if not v in done: | |
315 que.add(frozenset(v)) | |
316 | |
317 return DeterministicFiniteAutomaton( | |
318 transition, | |
319 nfa.epsilonExpand(frozenset([nfa.start])), | |
320 NonDisjoinSets(nfa.accepts), | |
321 map_ | |
322 ) | |
323 | |
324 # emit CbC-souce, from dfa | |
325 def dfa2CbC(dfa, regexp): | |
326 | |
327 # return state name from set | |
328 def set2name(set_): | |
329 l = list(set_) | |
330 l.sort() | |
331 return '_'.join(str(x) for x in l) | |
332 | |
333 funcMap = dict() | |
334 funcMap[dfa.start] = set() | |
335 | |
336 # dfa.map => {(frozenset([1, 15]), 'd'): frozenset([16]), ....} | |
337 # : frozenset(1, 15) x 'd' -> frozenset([16]) | |
338 | |
339 for v in dfa.map.itervalues(): | |
340 funcMap[frozenset(v)] = set() | |
341 | |
342 for key in dfa.map.iterkeys(): | |
343 slot = funcMap.setdefault(key[0], set()) | |
344 slot.add(key[1]) | |
345 | |
346 # emit cbc code | |
347 print "#include <stdio.h>\n" | |
348 print "#include <stdlib.h>\n" | |
349 for k in funcMap.iterkeys(): | |
350 print '__code state_' + set2name(k) + "(char* s);" | |
351 print '__code accept();' | |
352 print '__code reject();' | |
353 print """ | |
354 int main(int argc, char* argv[]) { | |
355 \tif (argc == 1) { | |
356 \t\tprintf(\"usage: %%s text\\n\", argv[0]); | |
357 \t\texit(0); | |
358 } | |
359 | |
360 \tputs(\"regexp: %s\"); | |
361 \tputs(\"number of state: %d\"); | |
362 \tprintf(\"string: %%s\\n\", argv[1]); | |
363 \tgoto state_%s(argv[1]); | |
364 \treturn 0; | |
365 }\n""" % (regexp, len(funcMap), set2name(dfa.start)) | |
366 | |
367 for k, v in funcMap.iteritems(): | |
368 print '__code state_' + set2name(k) + "(char* s) {" | |
369 print "\tswitch(*s++) {" | |
370 for case in v: | |
371 print "\t\tcase '%c': " % (case) | |
372 print "\t\t\tgoto state_%s(s);\n\t\t\tbreak;" % set2name(dfa.map[(frozenset(k), case)]) | |
373 if k in dfa.accepts: print "\t\tcase '\\0': goto accept();\n\t\t\tbreak;" | |
374 print "\t\tdefault: goto reject();\n\t}" | |
375 print "}\n" | |
376 | |
377 print """ | |
378 __code accept() { | |
379 \tprintf(\"\\nstring matches regexp. \\n\\n\"); | |
380 }\n""" | |
381 print """ | |
382 __code reject() { | |
383 \tprintf(\"\\nstring does not match regexp. \\n\\n\"); | |
384 }\n""" | |
385 | |
386 class Regexp(object): | |
387 def __init__(self, regexp): | |
388 self.regexp = regexp | |
389 self.nfa = None | |
390 self.dfa = None | |
391 self._compile() | |
392 | |
393 def _compile(self): | |
394 lexer_ = Lexer(self.regexp) | |
395 parser_ = Parser(lexer_) | |
396 self.nfa = parser_.expression() | |
397 self.dfa = nfa2dfa(self.nfa) | |
398 | |
399 def emitCbC(self): | |
400 dfa2CbC(self.dfa, self.regexp) | |
401 | |
402 def matches(self, string): | |
403 runtime = self.dfa.getRuntime() | |
404 return runtime.doesAccept(string) | |
405 | |
406 def compile(regexp): | |
407 return Regexp(regexp) | |
408 | |
409 def main(argv): | |
410 if len(argv) == 1 : exit("usage: dfareg.py regexp [> file]") | |
411 r = compile(argv[1]) | |
412 r.emitCbC() | |
413 | |
414 if __name__ == '__main__' : main(sys.argv) |