Mercurial > hg > Applications > Grep
diff c/regexParser/createRegexTree.cc @ 82:1d9bbf922bb6
add createRegexTree.cc
author | Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 07 Oct 2015 18:13:27 +0900 |
parents | |
children | 5072a44ed842 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/c/regexParser/createRegexTree.cc Wed Oct 07 18:13:27 2015 +0900 @@ -0,0 +1,137 @@ +#include <stdlib.h> +#include "regexParser.h" + +NodePtr createNode(unsigned char,NodePtr,NodePtr); +NodePtr charClass(RegexInfoPtr); +NodePtr group(RegexInfoPtr); +void token(RegexInfoPtr); +NodePtr regexAtom(RegexInfoPtr); +NodePtr regex(RegexInfoPtr); + +/** + * Create a node of regex parse tree. + * tokenType + * regexPosition(state) + * stateTransitionTable + */ +NodePtr createNode(unsigned char character, NodePtr left, NodePtr right) { + NodePtr n = (NodePtr)malloc(sizeof(Node)); + n->self = n; + n->Value.character = character; + n->left = left; + n->right = right; + return n; +} + +// <charClass> ::= '['<literal>'-'<literal>']' +NodePtr charClass(RegexInfoPtr ri) { + NodePtr n = (NodePtr)malloc(sizeof(Node)); + unsigned char startChar = ri->ptr[0]; + while (ri->ptr[0] == '-') { + ri->ptr++; + } + unsigned char endChar = ri->ptr[0]; + unsigned char *charTable = (unsigned char*)malloc(sizeof(char)*256); + + return n; +} + +// <literal> ::= [a-z][A-Z][0-9] +NodePtr literal(RegexInfoPtr ri) { + NodePtr n = createNode(ri->ptr[0],0,0); + ri->ptr++; + return n; +} + +// <group> ::= '('<regex>')' +NodePtr group(RegexInfoPtr ri) { + return regex(ri); +} + + + +void token(RegexInfoPtr ri) { + while (ri->ptr[0] != '\0') { + if (ri->ptr[0] == '('){ + ri->ptr++; + ri->tokenType = '('; + ri->tokenValue = 0; + if (ri->ptr[1] == ')') { + ri->ptr++; + } + return; + } else if (ri->ptr[0] == ')') { + ri->ptr++; + ri->tokenType = ')'; + ri->tokenValue = ri->ptr[0]; + return; + } else if (ri->ptr[0] == '[') { + ri->ptr++; + ri->tokenType = '['; + ri->tokenValue = ri->ptr[0]; + if (ri->ptr[1] == ']') { + ri->ptr++; + } + return; + } else if (ri->ptr[0] == '|'){ + ri->ptr++; + ri->tokenType = '|'; + ri->tokenValue = 0; + return; + } else if (ri->ptr[0] == '*'){ + ri->ptr++; + ri->tokenType = '*'; + ri->tokenValue = 0; + return; + } else if (ri->ptr[0] == '\\'){ + // need more proccesing + /* + \277 + \0xa5 + \[ + \\ + \utf-8 etc... + */ + } else { + ri->tokenType = 'a'; + ri->tokenValue = ri->ptr[0]; + return; + } + } + + ri->tokenType = 0; + ri->tokenValue = 0; + return; +} + +// <regexAtom> ::= <literal>|<charClass>|<group> +NodePtr regexAtom(RegexInfoPtr ri) { + + token(ri); + NodePtr n = NULL; + if (ri->tokenType == 'a') n = literal(ri); + else if (ri->tokenType == '[') n = charClass(ri); + else if (ri->tokenType == '(') n = group(ri); + + return n; +} + +// <regex> ::= <regexAtom>|<regexAtom>'*'|<regexAtom>'|'<regex>|<regexAtom><regex> +NodePtr regex(RegexInfoPtr ri) { + NodePtr n = regexAtom(ri); + while (ri->ptr[0]) { + token(ri); + if (ri->tokenType == '*') { + n = createNode('*',n,0); + } else if (ri->tokenType == '|') { + NodePtr n1 = regex(ri); + n = createNode('|',n,n1); + } else if (ri->tokenType == ')') { + return n; + } else { + NodePtr n1 = regex(ri); + n = createNode('+',n,n1); + } + } return n; +} +