Mercurial > hg > Applications > Grep
diff c/regexParser/main.cc @ 63:8fd3d35e9861
add token function
author | masa |
---|---|
date | Thu, 23 Jul 2015 18:01:02 +0900 |
parents | a49b4a8b8c14 |
children | e0ad6c145f89 |
line wrap: on
line diff
--- a/c/regexParser/main.cc Tue Jul 14 16:45:07 2015 +0900 +++ b/c/regexParser/main.cc Thu Jul 23 18:01:02 2015 +0900 @@ -12,9 +12,22 @@ #include <stdlib.h> #include <string.h> +typedef struct charClass { + unsigned char table[256]; + struct utf8Range { + unsigned char *begin; + unsigned char *end; + struct utf8Range next; + } *rangeList; +} + typedef struct node { + unsigned char type; + union value { + charClass *cc; + unsigned char *string; + } struct node *self; - char character; struct node *left; struct node *right; } Node, *NodePtr; @@ -25,16 +38,18 @@ NodePtr charClass(); NodePtr string(); NodePtr group(); -NodePtr _or(); +NodePtr orexp(); NodePtr asterisk(); NodePtr regex(); NodePtr createNode(char,NodePtr,NodePtr); bool isLiteral(char c) { - if (('a'<=c && c<='z')||('A'<=c && c<='Z')||('0'<=c && c<='9')) { - return true; - } - return false; + if (*ptr > 0x7f) return true; + else if (*ptr == '(') return false; + else if (*ptr == '[') return false; + else if (*ptr == '|') return false; + else if (*ptr == '*') return false; + return true; } void printNodeDate(NodePtr n) { @@ -85,54 +100,86 @@ return n; } -// <group> ::= '('<regex>')' | '('<regex>'|'<regex>')' +// <group> ::= '('<regex>')' NodePtr group() { - NodePtr n; + token(); + NodePtr n = regex(); + token(); if (*ptr == ')') { - n = createNode(0,0,0); - ptr++; + n = createNode('(',n,0); } else { - ptr++; - n = regex(); + // ) reqiured } return n; } - -// <or> ::= <regex>'|'<regex> -NodePtr _or() { - ptr++; - NodePtr n = createNode('|',regexHeadNode,regex()); - return n; +// <regex> ::= <regexAtom>|<regexAtom>'*'|<regexAtom>'|'<regex>|<regexAtom><regex> +NodePtr regex() { + NodePtr n = regexAtom(); + while (*ptr) { + token(); + if (tokenType == '*') { + n = createNode('*',n,0); + } else if (tokenType == '|') { + NodePtr n1 = regex(); + n = createNode('|',n,n1); + } else { + NodePtr n1 = regex(); + n = createNode('+',n,n1); + } + } } -// <*> ::= <regex>'*' -NodePtr asterisk() { - ptr++; - NodePtr n = createNode('*',regexHeadNode,regex()); +// <regexAtom> ::= <literal>|<charClass>|<group> +NodePtr regexAtom() { + + token(); + NodePter n; + if (tokenType == 'a') n = literal(); + else if (tokenType == '[') n = charClass(); + else if (tokenType == '(') n = group(); + return n; } -// <regex> ::= <string>|<or>|<charClass>|<group>|<*> -NodePtr regex() { - - NodePtr n; - +void token() { while (*ptr != '\0') { if ((*ptr == '(') || (*ptr == ')')) { - n = group(); + tokenType = *ptr++; + tokenValue = 0; + return ; } else if (*ptr == '[') { - n = charClass(); + tokenType = '['; + tokenValue = ptr; + if (ptr[1] == ']') { + ptr++; + } + while (*ptr != ']') ptr++; + ptr++; + return; } else if (*ptr == '|'){ - n = _or(); + tokenType = '|'; + tokenValue = 0; + return; } else if (*ptr == '*'){ - n = asterisk(); - } else { - n = string(); - regexHeadNode = n; + tokenType = '*'; + tokenValue = 0; + return; } + + tokenType = 'a'; + tokenValue = ptr; + + if (*ptr == '\\') ptr++; // need more proccesing + /* + \277 + \0xa5 + \[ + \\ + \utf-8 etc... + */ + } - return n; } int main(int argc, char **argv)