Mercurial > hg > Applications > Grep
comparison c/regexParser/regexParser.cc @ 121:aa266a4db47c pairPro
merge
author | Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Thu, 26 Nov 2015 21:17:26 +0900 |
parents | 2f0653f8eabb 5d29b6a1b50f |
children | 188d866227a4 |
comparison
equal
deleted
inserted
replaced
119:2f0653f8eabb | 121:aa266a4db47c |
---|---|
1 #include <stdlib.h> | 1 #include <stdlib.h> |
2 #include <stdio.h> | 2 #include <stdio.h> |
3 #include "regexParser.h" | 3 #include "regexParser.h" |
4 #include "error.h" | 4 #include "error.h" |
5 | 5 |
6 static NodePtr createNode(RegexInfoPtr,NodePtr,NodePtr); | 6 static NodePtr allocateNode(); |
7 static NodePtr createNode(RegexInfoPtr,unsigned char*,NodePtr,NodePtr); | |
7 static NodePtr charClass(RegexInfoPtr); | 8 static NodePtr charClass(RegexInfoPtr); |
8 static NodePtr group(RegexInfoPtr); | 9 static NodePtr group(RegexInfoPtr); |
9 static void token(RegexInfoPtr); | 10 static void token(RegexInfoPtr); |
10 static NodePtr regexAtom(RegexInfoPtr); | 11 static NodePtr regexAtom(RegexInfoPtr); |
11 NodePtr regex(RegexInfoPtr); | 12 NodePtr regex(RegexInfoPtr); |
15 * tokenType | 16 * tokenType |
16 * regexPosition(state) | 17 * regexPosition(state) |
17 * stateTransitionTable | 18 * stateTransitionTable |
18 */ | 19 */ |
19 | 20 |
21 static | |
20 NodePtr allocateNode() { | 22 NodePtr allocateNode() { |
21 NodePtr n = (NodePtr)malloc(sizeof(node)); | 23 NodePtr n = (NodePtr)malloc(sizeof(node)); |
22 n->cc = (CharClassPtr)malloc(sizeof(CharClass)); | 24 n->cc = (CharClassPtr)malloc(sizeof(CharClass)); |
23 n->cc->cond = (ConditionList)malloc(sizeof(Condition)); | 25 n->cc->cond = (ConditionList)malloc(sizeof(Condition)); |
24 return n; | 26 return n; |
25 } | 27 } |
26 | 28 |
27 static | 29 static |
28 NodePtr createNode(RegexInfoPtr ri, NodePtr left, NodePtr right) { | 30 NodePtr createNode(RegexInfoPtr ri,unsigned char *character, NodePtr left, NodePtr right) { |
29 NodePtr n = allocateNode(); | 31 NodePtr n = allocateNode(); |
30 if (n == NULL) { | 32 if (n == NULL) { |
31 mallocFailedMessage(); | 33 mallocFailedMessage(); |
32 } | 34 } |
33 | 35 |
34 n->tokenType = ri->tokenType; | 36 n->tokenType = ri->tokenType; |
35 n->cc->cond->character = ri->tokenValue; | |
36 n->left = left; | 37 n->left = left; |
37 n->right = right; | 38 n->right = right; |
39 n->nodeNumber = ri->nodeNumber; | |
40 ri->nodeNumber++; | |
38 | 41 |
39 if (ri->tokenType == 'a') { | 42 if (ri->tokenType == 'a') { |
40 n->nodeNumber = ri->nodeNumber; | |
41 ri->nodeNumber++; | |
42 ri->tokenType = 0; | 43 ri->tokenType = 0; |
44 n->cc->cond->w = getWord(ri->tokenValue); | |
45 ri->ptr += n->cc->cond->w->length-1; | |
46 } else { | |
47 WordPtr w = (WordPtr)malloc(sizeof(Word)); | |
48 w->word = character; | |
49 w->length = 1; | |
50 n->cc->cond->w = w; | |
43 } | 51 } |
44 return n; | 52 return n; |
45 } | 53 } |
46 | 54 |
47 // <charClass> ::= '['<literal>'-'<literal>']' | 55 // <charClass> ::= '['<literal>'-'<literal>']' |
58 } | 66 } |
59 | 67 |
60 // <literal> ::= [a-z][A-Z][0-9] | 68 // <literal> ::= [a-z][A-Z][0-9] |
61 static | 69 static |
62 NodePtr literal(RegexInfoPtr ri) { | 70 NodePtr literal(RegexInfoPtr ri) { |
63 NodePtr n = createNode(ri,0,0); | 71 NodePtr n = createNode(ri,ri->ptr,0,0); |
64 ri->ptr++; | |
65 return n; | 72 return n; |
66 } | 73 } |
67 | 74 |
68 // <group> ::= '('<regex>')' | 75 // <group> ::= '('<regex>')' |
69 static | 76 static |
75 void token(RegexInfoPtr ri) { | 82 void token(RegexInfoPtr ri) { |
76 while (ri->ptr[0] != '\0') { | 83 while (ri->ptr[0] != '\0') { |
77 if (ri->ptr[0] == '('){ | 84 if (ri->ptr[0] == '('){ |
78 ri->ptr++; | 85 ri->ptr++; |
79 ri->tokenType = '('; | 86 ri->tokenType = '('; |
80 ri->tokenValue = 0; | 87 ri->tokenValue = NULL; |
81 if (ri->ptr[1] == ')') { | 88 if (ri->ptr[1] == ')') { |
82 ri->ptr++; | 89 ri->ptr++; |
83 } | 90 } |
84 return; | 91 return; |
85 } else if (ri->ptr[0] == ')') { | 92 } else if (ri->ptr[0] == ')') { |
86 ri->ptr++; | 93 ri->ptr++; |
87 ri->tokenType = ')'; | 94 ri->tokenType = ')'; |
88 ri->tokenValue = ri->ptr[0]; | 95 ri->tokenValue = ri->ptr; |
89 return; | 96 return; |
90 } else if (ri->ptr[0] == '[') { | 97 } else if (ri->ptr[0] == '[') { |
91 ri->ptr++; | 98 ri->ptr++; |
92 ri->tokenType = '['; | 99 ri->tokenType = '['; |
93 ri->tokenValue = ri->ptr[0]; | 100 ri->tokenValue = ri->ptr; |
94 if (ri->ptr[1] == ']') { | 101 if (ri->ptr[1] == ']') { |
95 ri->ptr++; | 102 ri->ptr++; |
96 } | 103 } |
97 return; | 104 return; |
98 } else if (ri->ptr[0] == '|'){ | 105 } else if (ri->ptr[0] == '|'){ |
99 ri->ptr++; | 106 ri->ptr++; |
100 ri->tokenType = '|'; | 107 ri->tokenType = '|'; |
101 ri->tokenValue = '|'; | 108 ri->tokenValue = NULL; |
102 return; | 109 return; |
103 } else if (ri->ptr[0] == '*'){ | 110 } else if (ri->ptr[0] == '*'){ |
104 ri->ptr++; | 111 ri->ptr++; |
105 ri->tokenType = '*'; | 112 ri->tokenType = '*'; |
106 ri->tokenValue = '*'; | 113 ri->tokenValue = NULL; |
107 return; | 114 return; |
108 } else if (ri->ptr[0] == '\\'){ | 115 } else if (ri->ptr[0] == '\\'){ |
109 // need more processing | 116 // need more processing |
110 /* | 117 /* |
111 \277 | 118 \277 |
114 \\ | 121 \\ |
115 \utf-8 etc... | 122 \utf-8 etc... |
116 */ | 123 */ |
117 } else { | 124 } else { |
118 ri->tokenType = 'a'; | 125 ri->tokenType = 'a'; |
119 ri->tokenValue = ri->ptr[0]; | 126 ri->tokenValue = ri->ptr; |
127 ri->ptr++; | |
120 return; | 128 return; |
121 } | 129 } |
122 } | 130 } |
123 ri->tokenType = 0; | |
124 ri->tokenValue = 0; | |
125 return; | 131 return; |
126 } | 132 } |
127 | 133 |
128 // <regexAtom> ::= <literal>|<charClass>|<group> | 134 // <regexAtom> ::= <literal>|<charClass>|<group> |
129 static | 135 static |
142 NodePtr regex(RegexInfoPtr ri) { | 148 NodePtr regex(RegexInfoPtr ri) { |
143 NodePtr n = regexAtom(ri); | 149 NodePtr n = regexAtom(ri); |
144 while (ri->ptr[0]) { | 150 while (ri->ptr[0]) { |
145 token(ri); | 151 token(ri); |
146 if (ri->tokenType == '*') { | 152 if (ri->tokenType == '*') { |
147 n = createNode(ri,n,0); | 153 unsigned char *syntax = (unsigned char*)malloc(sizeof(unsigned char)); |
154 syntax[0] = '*'; | |
155 n = createNode(ri,syntax,n,0); | |
148 } else if (ri->tokenType == '|') { | 156 } else if (ri->tokenType == '|') { |
149 NodePtr n1 = regex(ri); | 157 NodePtr n1 = regex(ri); |
150 ri->tokenValue = '|'; | 158 unsigned char *syntax = (unsigned char*)malloc(sizeof(unsigned char)); |
151 n = createNode(ri,n,n1); | 159 syntax[0] = '|'; |
160 n = createNode(ri,syntax,n,n1); | |
152 } else if (ri->tokenType == ')') { | 161 } else if (ri->tokenType == ')') { |
153 return n; | 162 return n; |
154 } else { | 163 } else { |
155 NodePtr n1 = regex(ri); | 164 NodePtr n1 = regex(ri); |
156 ri->tokenValue = '+'; | 165 unsigned char *syntax = (unsigned char*)malloc(sizeof(unsigned char)); |
157 n = createNode(ri,n,n1); | 166 syntax[0] = '+'; |
167 n = createNode(ri,syntax,n,n1); | |
158 } | 168 } |
159 } return n; | 169 } return n; |
160 } | 170 } |