Mercurial > hg > Applications > Grep
changeset 82:1d9bbf922bb6
add createRegexTree.cc
author | Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 07 Oct 2015 18:13:27 +0900 |
parents | 27883946b2dc |
children | d19afac949a5 |
files | c/regexParser/createRegexTree.cc c/regexParser/main.cc |
diffstat | 2 files changed, 138 insertions(+), 133 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/c/regexParser/createRegexTree.cc Wed Oct 07 18:13:27 2015 +0900 @@ -0,0 +1,137 @@ +#include <stdlib.h> +#include "regexParser.h" + +NodePtr createNode(unsigned char,NodePtr,NodePtr); +NodePtr charClass(RegexInfoPtr); +NodePtr group(RegexInfoPtr); +void token(RegexInfoPtr); +NodePtr regexAtom(RegexInfoPtr); +NodePtr regex(RegexInfoPtr); + +/** + * Create a node of regex parse tree. + * tokenType + * regexPosition(state) + * stateTransitionTable + */ +NodePtr createNode(unsigned char character, NodePtr left, NodePtr right) { + NodePtr n = (NodePtr)malloc(sizeof(Node)); + n->self = n; + n->Value.character = character; + n->left = left; + n->right = right; + return n; +} + +// <charClass> ::= '['<literal>'-'<literal>']' +NodePtr charClass(RegexInfoPtr ri) { + NodePtr n = (NodePtr)malloc(sizeof(Node)); + unsigned char startChar = ri->ptr[0]; + while (ri->ptr[0] == '-') { + ri->ptr++; + } + unsigned char endChar = ri->ptr[0]; + unsigned char *charTable = (unsigned char*)malloc(sizeof(char)*256); + + return n; +} + +// <literal> ::= [a-z][A-Z][0-9] +NodePtr literal(RegexInfoPtr ri) { + NodePtr n = createNode(ri->ptr[0],0,0); + ri->ptr++; + return n; +} + +// <group> ::= '('<regex>')' +NodePtr group(RegexInfoPtr ri) { + return regex(ri); +} + + + +void token(RegexInfoPtr ri) { + while (ri->ptr[0] != '\0') { + if (ri->ptr[0] == '('){ + ri->ptr++; + ri->tokenType = '('; + ri->tokenValue = 0; + if (ri->ptr[1] == ')') { + ri->ptr++; + } + return; + } else if (ri->ptr[0] == ')') { + ri->ptr++; + ri->tokenType = ')'; + ri->tokenValue = ri->ptr[0]; + return; + } else if (ri->ptr[0] == '[') { + ri->ptr++; + ri->tokenType = '['; + ri->tokenValue = ri->ptr[0]; + if (ri->ptr[1] == ']') { + ri->ptr++; + } + return; + } else if (ri->ptr[0] == '|'){ + ri->ptr++; + ri->tokenType = '|'; + ri->tokenValue = 0; + return; + } else if (ri->ptr[0] == '*'){ + ri->ptr++; + ri->tokenType = '*'; + ri->tokenValue = 0; + return; + } else if (ri->ptr[0] == '\\'){ + // need more proccesing + /* + \277 + \0xa5 + \[ + \\ + \utf-8 etc... + */ + } else { + ri->tokenType = 'a'; + ri->tokenValue = ri->ptr[0]; + return; + } + } + + ri->tokenType = 0; + ri->tokenValue = 0; + return; +} + +// <regexAtom> ::= <literal>|<charClass>|<group> +NodePtr regexAtom(RegexInfoPtr ri) { + + token(ri); + NodePtr n = NULL; + if (ri->tokenType == 'a') n = literal(ri); + else if (ri->tokenType == '[') n = charClass(ri); + else if (ri->tokenType == '(') n = group(ri); + + return n; +} + +// <regex> ::= <regexAtom>|<regexAtom>'*'|<regexAtom>'|'<regex>|<regexAtom><regex> +NodePtr regex(RegexInfoPtr ri) { + NodePtr n = regexAtom(ri); + while (ri->ptr[0]) { + token(ri); + if (ri->tokenType == '*') { + n = createNode('*',n,0); + } else if (ri->tokenType == '|') { + NodePtr n1 = regex(ri); + n = createNode('|',n,n1); + } else if (ri->tokenType == ')') { + return n; + } else { + NodePtr n1 = regex(ri); + n = createNode('+',n,n1); + } + } return n; +} +
--- a/c/regexParser/main.cc Wed Oct 07 16:47:03 2015 +0900 +++ b/c/regexParser/main.cc Wed Oct 07 18:13:27 2015 +0900 @@ -11,141 +11,9 @@ #include <string.h> #include "regexParser.h" -NodePtr createNode(unsigned char,NodePtr,NodePtr); -NodePtr charClass(RegexInfoPtr); -NodePtr group(RegexInfoPtr); -void token(RegexInfoPtr); -NodePtr regexAtom(RegexInfoPtr); -NodePtr regex(RegexInfoPtr); +extern NodePtr regex(RegexInfoPtr); extern void printTree(NodePtr); -/** - * Create a node of regex parse tree. - * tokenType - * regexPosition(state) - * stateTransitionTable - */ -NodePtr createNode(unsigned char character, NodePtr left, NodePtr right) { - NodePtr n = (NodePtr)malloc(sizeof(Node)); - n->self = n; - n->Value.character = character; - n->left = left; - n->right = right; - return n; -} - -// <charClass> ::= '['<literal>'-'<literal>']' -NodePtr charClass(RegexInfoPtr ri) { - NodePtr n = (NodePtr)malloc(sizeof(Node)); - unsigned char startChar = ri->ptr[0]; - while (ri->ptr[0] == '-') { - ri->ptr++; - } - unsigned char endChar = ri->ptr[0]; - unsigned char *charTable = (unsigned char*)malloc(sizeof(char)*256); - - return n; -} - -// <literal> ::= [a-z][A-Z][0-9] -NodePtr literal(RegexInfoPtr ri) { - NodePtr n = createNode(ri->ptr[0],0,0); - ri->ptr++; - return n; -} - -// <group> ::= '('<regex>')' -NodePtr group(RegexInfoPtr ri) { - return regex(ri); -} - - - -void token(RegexInfoPtr ri) { - while (ri->ptr[0] != '\0') { - if (ri->ptr[0] == '('){ - ri->ptr++; - ri->tokenType = '('; - ri->tokenValue = 0; - if (ri->ptr[1] == ')') { - ri->ptr++; - } - return; - } else if (ri->ptr[0] == ')') { - ri->ptr++; - ri->tokenType = ')'; - ri->tokenValue = ri->ptr[0]; - return; - } else if (ri->ptr[0] == '[') { - ri->ptr++; - ri->tokenType = '['; - ri->tokenValue = ri->ptr[0]; - if (ri->ptr[1] == ']') { - ri->ptr++; - } - return; - } else if (ri->ptr[0] == '|'){ - ri->ptr++; - ri->tokenType = '|'; - ri->tokenValue = 0; - return; - } else if (ri->ptr[0] == '*'){ - ri->ptr++; - ri->tokenType = '*'; - ri->tokenValue = 0; - return; - } else if (ri->ptr[0] == '\\'){ - // need more proccesing - /* - \277 - \0xa5 - \[ - \\ - \utf-8 etc... - */ - } else { - ri->tokenType = 'a'; - ri->tokenValue = ri->ptr[0]; - return; - } - } - - ri->tokenType = 0; - ri->tokenValue = 0; - return; -} - -// <regexAtom> ::= <literal>|<charClass>|<group> -NodePtr regexAtom(RegexInfoPtr ri) { - - token(ri); - NodePtr n = NULL; - if (ri->tokenType == 'a') n = literal(ri); - else if (ri->tokenType == '[') n = charClass(ri); - else if (ri->tokenType == '(') n = group(ri); - - return n; -} - -// <regex> ::= <regexAtom>|<regexAtom>'*'|<regexAtom>'|'<regex>|<regexAtom><regex> -NodePtr regex(RegexInfoPtr ri) { - NodePtr n = regexAtom(ri); - while (ri->ptr[0]) { - token(ri); - if (ri->tokenType == '*') { - n = createNode('*',n,0); - } else if (ri->tokenType == '|') { - NodePtr n1 = regex(ri); - n = createNode('|',n,n1); - } else if (ri->tokenType == ')') { - return n; - } else { - NodePtr n1 = regex(ri); - n = createNode('+',n,n1); - } - } return n; -} - int main(int argc, char **argv) {