Mercurial > hg > Members > masakoha > testcode
view regexParser/regexParser.cc @ 308:1188debbef10
separate CharClass
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 08 Feb 2016 12:45:45 +0900 |
parents | 3e78631a6222 |
children | a4484c02cba5 |
line wrap: on
line source
#include <stdlib.h> #include <stdio.h> #include <string.h> #include <ctype.h> #include "regexParser.h" #include "CharClass.h" static NodePtr charClass(RegexInfoPtr); static void token(RegexInfoPtr); static NodePtr regexAtom(RegexInfoPtr); /** * Create a node of regex parse tree. * tokenType * regexPosition(state) * stateTransitionTable */ static NodePtr allocateNode() { NodePtr n = NEW(Node); n->cc = NULL; n->stateNum = 0; n->nextStateNum = 0; n->state = NULL; n->nextState = NULL; n->left = NULL; n->right = NULL; return n; } NodePtr createNode(RegexInfoPtr ri,unsigned char type,CharClassPtr cc, NodePtr left, NodePtr right) { NodePtr n = allocateNode(); n->tokenType = type; n->cc = cc; n->state = NULL; n->left = left; n->right = right; return n; } // <charClass> ::= '['<literal>'-'<literal>']' static NodePtr charClass(RegexInfoPtr ri) { CharClassPtr cc = NULL; NodePtr n = createNode(ri,'c',cc,0,0); unsigned char begin = *ri->ptr; unsigned char end = *ri->ptr; for (ri->ptr++; *ri->ptr && *ri->ptr != ']'; ri->ptr++) { if (*ri->ptr == '-') { end = *(ri->ptr + 1); cc = insertCharClass(cc, begin, end); ri->ptr++; continue; } else { cc = insertCharClass(cc, begin, end); } if (ri->ptr[0] == 0 || ri->ptr[0] == ']') break; begin = *ri->ptr; end = *ri->ptr; } n->cc = insertCharClass(cc, begin, end); // TODO literal support // merge rangeList here if (*ri->ptr) ri->ptr++; token(ri); return n; } // <literal> ::= [a-z][A-Z][0-9] static NodePtr literal(RegexInfoPtr ri) { CharClassPtr cc = createCharClassWord(ri); token(ri); NodePtr n = createNode(ri,'a',cc,0,0); return n; } static void token(RegexInfoPtr ri) { while (ri->ptr[0] != '\0') { if (ri->ptr[0] == '('){ ri->ptr++; ri->tokenType = '('; ri->tokenValue = NULL; return; } else if (ri->ptr[0] == ')') { ri->ptr++; ri->tokenType = ')'; ri->tokenValue = ri->ptr; return; } else if (ri->ptr[0] == ']') { ri->ptr++; ri->tokenType = ']'; ri->tokenValue = ri->ptr; return; } else if (ri->ptr[0] == '|'){ ri->ptr++; ri->tokenType = '|'; ri->tokenValue = NULL; return; } else if (ri->ptr[0] == '*'){ ri->ptr++; ri->tokenType = '*'; ri->tokenValue = NULL; return; } else if (ri->ptr[0] == '\\'){ // need more proccesing /* \277 \0xa5 \[ \\ \utf-8 etc... */ ri->ptr++; ri->tokenType = 'a'; ri->tokenValue = ri->ptr; ri->ptr++; return; } else if (ri->ptr[0] == '[') { ri->ptr++; ri->tokenType = 'c'; ri->tokenValue = ri->ptr; return; } else { ri->tokenType = 'a'; ri->tokenValue = ri->ptr; // if (isalnum(ri->ptr[0])) { ri->ptr++; // } return; } } ri->tokenType = 0; ri->tokenValue = NULL; return; } // <regexAtom> ::= <literal>|<charClass>|<group> static NodePtr regexAtom(RegexInfoPtr ri) { NodePtr n = NULL; if (ri->tokenType == 'c') n = charClass(ri); else if (ri->tokenType == 'a') n = literal(ri); else if (ri->tokenType == '(') { n = regex(ri); if (ri->tokenType != ')') { // error fprintf(stderr,"unclosed ')' before %s \n", ri->ptr); return createNode(ri,0,0,0,0); } token(ri); } if (ri->tokenType == '*') { n = createNode(ri,'*',0,n,0); token(ri); } return n; } // <regex> ::= <regexAtom> | <regexAtom>'*'<regex> | <regexAtom>'|'<regex> | <regexAtom><regexAtom>'*' | <regexAtom><regex> NodePtr regex(RegexInfoPtr ri) { token(ri); NodePtr n = regexAtom(ri); while (ri->tokenType) { if (ri->tokenType == '*') { n = createNode(ri,'*',0,n,0); token(ri); return n; } else if (ri->tokenType == '|') { n = createNode(ri,'|',0,n,0); NodePtr n1 = regex(ri); n->right = n1; } else if (ri->tokenType == ')') { return n; } else if (ri->tokenType == ']') { // error return n; } else { n = createNode(ri,'+',0,n,0); NodePtr n1 = regexAtom(ri); n->right = n1; } } return n; }