Mercurial > hg > Members > masakoha > testcode
view c/regexParser/regexParser.cc @ 127:b061cd8205cc pairPro
merge
author | masa |
---|---|
date | Tue, 01 Dec 2015 21:50:09 +0900 |
parents | 639b0b437ebf c363a66dc1a7 |
children | f827682d4687 |
line wrap: on
line source
#include <stdlib.h> #include <stdio.h> #include <string.h> #include <ctype.h> #include "regexParser.h" #include "error.h" static NodePtr allocateNode(); static NodePtr createNode(RegexInfoPtr,unsigned char,NodePtr,NodePtr); static NodePtr charClass(RegexInfoPtr); static NodePtr group(RegexInfoPtr); static void token(RegexInfoPtr); static NodePtr regexAtom(RegexInfoPtr); NodePtr regex(RegexInfoPtr); /** * Create a node of regex parse tree. * tokenType * regexPosition(state) * stateTransitionTable */ static NodePtr allocateNode() { NodePtr n = (NodePtr)malloc(sizeof(node)); n->cc = NULL; n->left = NULL; n->right = NULL; return n; } static CharClassPtr createCharClassWord(RegexInfoPtr ri) { CharClassPtr cc = NEW(CharClass); cc->type = 'a'; cc->cond = NEW(Condition); cc->cond->w = NEW(Word); cc->cond->w->word = ri->tokenValue; cc->cond->w->length = ri->ptr - ri->tokenValue; return cc; } static NodePtr createNode(RegexInfoPtr ri,unsigned char type, NodePtr left, NodePtr right) { NodePtr n = allocateNode(); n->tokenType = type; n->left = left; n->right = right; n->nodeNumber = ri->nodeNumber; ri->nodeNumber++; if (type == 'a') { n->cc = createCharClassWord(ri); } return n; } // <charClass> ::= '['<literal>'-'<literal>']' static NodePtr charClass(RegexInfoPtr ri) { NodePtr n = allocateNode(); n->tokenType = 'c'; n->nodeNumber = ri->nodeNumber; ri->nodeNumber++; CharClassPtr cc = NEW(CharClass); cc->type = 'r'; cc->cond = NEW(Condition); cc->cond->range = NEW(RangeList); cc->cond->range->begin = ri->ptr; cc->cond->range->end = ri->ptr + 1; cc->cond->range->next = NULL; int i = 0; RangeListPtr rangeList = cc->cond->range; while (ri->ptr[i] != ']') { if (ri->ptr[i] == '-') i++; rangeList->end = ri->ptr + i; rangeList->next = NEW(RangeList); rangeList = rangeList->next; rangeList->begin = ri->ptr+i+1; rangeList->next = NULL; i++; } // TODO literal support rangeList->end = ri->ptr + i - 1; return n; } // <literal> ::= [a-z][A-Z][0-9] static NodePtr literal(RegexInfoPtr ri) { NodePtr n = createNode(ri,'a',0,0); return n; } // <group> ::= '('<regex>')' static NodePtr group(RegexInfoPtr ri) { return regex(ri); } static void token(RegexInfoPtr ri) { while (ri->ptr[0] != '\0') { if (ri->ptr[0] == '('){ ri->ptr++; ri->tokenType = '('; ri->tokenValue = NULL; return; } else if (ri->ptr[0] == ')') { ri->ptr++; ri->tokenType = ')'; ri->tokenValue = ri->ptr; return; } else if (ri->ptr[0] == '[') { ri->ptr++; ri->tokenType = 'c'; ri->tokenValue = ri->ptr; return; } else if (ri->ptr[0] == '|'){ ri->ptr++; ri->tokenType = '|'; ri->tokenValue = NULL; return; } else if (ri->ptr[0] == '*'){ ri->ptr++; ri->tokenType = '*'; ri->tokenValue = NULL; return; } else if (ri->ptr[0] == '\\'){ // need more proccesing /* \277 \0xa5 \[ \\ \utf-8 etc... */ } else { ri->tokenType = 'a'; ri->tokenValue = ri->ptr; while (isalnum(ri->ptr[0])) { ri->ptr++; } return; } } return; } // <regexAtom> ::= <literal>|<charClass> static NodePtr regexAtom(RegexInfoPtr ri) { token(ri); NodePtr n = NULL; if (ri->tokenType == 'c') n = charClass(ri); return n; } // <regex> ::= <regexAtom> | <regexAtom>'*' | <regexAtom>'|'<regex> | <regexAtom><regex> | '(' regex ')' NodePtr regex(RegexInfoPtr ri) { NodePtr n = NULL; while (ri->ptr[0]) { token(ri); if (ri->tokenType == '*') { n = createNode(ri,'*',n,0); } else if (ri->tokenType == '|') { NodePtr n1 = regex(ri); n = createNode(ri,'|',n,n1); } else if (ri->tokenType == ')') { return n; } else if (ri->tokenType == 'a') { NodePtr n1 = literal(ri); unsigned char *syntax = (unsigned char*)malloc(sizeof(unsigned char)); syntax[0] = '+'; n = createNode(ri,syntax,n,n1); } else { // return NULL NodePtr n1 = regex(ri); n = createNode(ri,'a',n,n1); } } return n; }