Mercurial > hg > Applications > Grep
comparison c/regexParser/createRegexTree.cc @ 82:1d9bbf922bb6
add createRegexTree.cc
author | Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 07 Oct 2015 18:13:27 +0900 |
parents | |
children | 5072a44ed842 |
comparison
equal
deleted
inserted
replaced
81:27883946b2dc | 82:1d9bbf922bb6 |
---|---|
1 #include <stdlib.h> | |
2 #include "regexParser.h" | |
3 | |
4 NodePtr createNode(unsigned char,NodePtr,NodePtr); | |
5 NodePtr charClass(RegexInfoPtr); | |
6 NodePtr group(RegexInfoPtr); | |
7 void token(RegexInfoPtr); | |
8 NodePtr regexAtom(RegexInfoPtr); | |
9 NodePtr regex(RegexInfoPtr); | |
10 | |
11 /** | |
12 * Create a node of regex parse tree. | |
13 * tokenType | |
14 * regexPosition(state) | |
15 * stateTransitionTable | |
16 */ | |
17 NodePtr createNode(unsigned char character, NodePtr left, NodePtr right) { | |
18 NodePtr n = (NodePtr)malloc(sizeof(Node)); | |
19 n->self = n; | |
20 n->Value.character = character; | |
21 n->left = left; | |
22 n->right = right; | |
23 return n; | |
24 } | |
25 | |
26 // <charClass> ::= '['<literal>'-'<literal>']' | |
27 NodePtr charClass(RegexInfoPtr ri) { | |
28 NodePtr n = (NodePtr)malloc(sizeof(Node)); | |
29 unsigned char startChar = ri->ptr[0]; | |
30 while (ri->ptr[0] == '-') { | |
31 ri->ptr++; | |
32 } | |
33 unsigned char endChar = ri->ptr[0]; | |
34 unsigned char *charTable = (unsigned char*)malloc(sizeof(char)*256); | |
35 | |
36 return n; | |
37 } | |
38 | |
39 // <literal> ::= [a-z][A-Z][0-9] | |
40 NodePtr literal(RegexInfoPtr ri) { | |
41 NodePtr n = createNode(ri->ptr[0],0,0); | |
42 ri->ptr++; | |
43 return n; | |
44 } | |
45 | |
46 // <group> ::= '('<regex>')' | |
47 NodePtr group(RegexInfoPtr ri) { | |
48 return regex(ri); | |
49 } | |
50 | |
51 | |
52 | |
53 void token(RegexInfoPtr ri) { | |
54 while (ri->ptr[0] != '\0') { | |
55 if (ri->ptr[0] == '('){ | |
56 ri->ptr++; | |
57 ri->tokenType = '('; | |
58 ri->tokenValue = 0; | |
59 if (ri->ptr[1] == ')') { | |
60 ri->ptr++; | |
61 } | |
62 return; | |
63 } else if (ri->ptr[0] == ')') { | |
64 ri->ptr++; | |
65 ri->tokenType = ')'; | |
66 ri->tokenValue = ri->ptr[0]; | |
67 return; | |
68 } else if (ri->ptr[0] == '[') { | |
69 ri->ptr++; | |
70 ri->tokenType = '['; | |
71 ri->tokenValue = ri->ptr[0]; | |
72 if (ri->ptr[1] == ']') { | |
73 ri->ptr++; | |
74 } | |
75 return; | |
76 } else if (ri->ptr[0] == '|'){ | |
77 ri->ptr++; | |
78 ri->tokenType = '|'; | |
79 ri->tokenValue = 0; | |
80 return; | |
81 } else if (ri->ptr[0] == '*'){ | |
82 ri->ptr++; | |
83 ri->tokenType = '*'; | |
84 ri->tokenValue = 0; | |
85 return; | |
86 } else if (ri->ptr[0] == '\\'){ | |
87 // need more proccesing | |
88 /* | |
89 \277 | |
90 \0xa5 | |
91 \[ | |
92 \\ | |
93 \utf-8 etc... | |
94 */ | |
95 } else { | |
96 ri->tokenType = 'a'; | |
97 ri->tokenValue = ri->ptr[0]; | |
98 return; | |
99 } | |
100 } | |
101 | |
102 ri->tokenType = 0; | |
103 ri->tokenValue = 0; | |
104 return; | |
105 } | |
106 | |
107 // <regexAtom> ::= <literal>|<charClass>|<group> | |
108 NodePtr regexAtom(RegexInfoPtr ri) { | |
109 | |
110 token(ri); | |
111 NodePtr n = NULL; | |
112 if (ri->tokenType == 'a') n = literal(ri); | |
113 else if (ri->tokenType == '[') n = charClass(ri); | |
114 else if (ri->tokenType == '(') n = group(ri); | |
115 | |
116 return n; | |
117 } | |
118 | |
119 // <regex> ::= <regexAtom>|<regexAtom>'*'|<regexAtom>'|'<regex>|<regexAtom><regex> | |
120 NodePtr regex(RegexInfoPtr ri) { | |
121 NodePtr n = regexAtom(ri); | |
122 while (ri->ptr[0]) { | |
123 token(ri); | |
124 if (ri->tokenType == '*') { | |
125 n = createNode('*',n,0); | |
126 } else if (ri->tokenType == '|') { | |
127 NodePtr n1 = regex(ri); | |
128 n = createNode('|',n,n1); | |
129 } else if (ri->tokenType == ')') { | |
130 return n; | |
131 } else { | |
132 NodePtr n1 = regex(ri); | |
133 n = createNode('+',n,n1); | |
134 } | |
135 } return n; | |
136 } | |
137 |