view c/regexParser/createRegexParser.cc @ 97:0b6940588e88 impl-bitvector

add node->parent
author Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
date Tue, 10 Nov 2015 16:18:06 +0900
parents 1cdad0468484
children 1e5b56e8263b
line wrap: on
line source

#include <stdlib.h>
#include <stdio.h>
#include "regexParser.h"

NodePtr createNode(RegexInfoPtr,unsigned char,NodePtr,NodePtr);
NodePtr charClass(RegexInfoPtr);
NodePtr group(RegexInfoPtr);
void token(RegexInfoPtr);
NodePtr regexAtom(RegexInfoPtr);
NodePtr regex(RegexInfoPtr);

/**
 * Create a node of regex parse tree.
 *     tokenType
 *     regexPosition(state)
 *     stateTransitionTable
 */
NodePtr createNode(RegexInfoPtr ri,unsigned char character, NodePtr left, NodePtr right) {
    NodePtr n = (NodePtr)malloc(sizeof(Node));
    n->tokenType = ri->tokenType;
    n->self = n;
    n->Value.character = character;
    n->left = left;
    n->right = right;

    if (ri->tokenType != 'a') {
        n->right = right;
        n->left->parent = n->right->parent = n->self;
    }

    if (ri->tokenType == 'a') {
        n->nodeNumber = ri->nodeNumber;
        ri->nodeNumber++;
        ri->tokenType = 0;
    }
    return n;
}

// <charClass> ::= '['<literal>'-'<literal>']'
NodePtr charClass(RegexInfoPtr ri) {
    NodePtr n = (NodePtr)malloc(sizeof(Node));
    unsigned char startChar = ri->ptr[0];
    while (ri->ptr[0] == '-') {
        ri->ptr++;
    }
    unsigned char endChar = ri->ptr[0];
    unsigned char *charTable = (unsigned char*)malloc(sizeof(char)*256);

    return n;
}

// <literal> ::= [a-z][A-Z][0-9]
NodePtr literal(RegexInfoPtr ri) {
    unsigned char *top = ri->ptr;
    NodePtr n = createNode(ri,ri->ptr[0],0,0);
    ri->ptr++;
    return n;
}

// <group> ::= '('<regex>')'
NodePtr group(RegexInfoPtr ri) {
    return regex(ri);
}



void token(RegexInfoPtr ri) {
    while (ri->ptr[0] != '\0') {
        if (ri->ptr[0] == '('){
            ri->ptr++;
            ri->tokenType = '(';
            ri->tokenValue = 0;
            if (ri->ptr[1] == ')') {
                ri->ptr++;
            }
            return;
        } else if (ri->ptr[0] == ')') {
            ri->ptr++;
            ri->tokenType = ')';
            ri->tokenValue = ri->ptr[0];
            return;
        } else if (ri->ptr[0] == '[') {
            ri->ptr++;
            ri->tokenType = '[';
            ri->tokenValue = ri->ptr[0];
            if (ri->ptr[1] == ']') {
                ri->ptr++;
            }
            return;
        } else if (ri->ptr[0] == '|'){
            ri->ptr++;
            ri->tokenType = '|';
            ri->tokenValue = 0;
            return;
        } else if (ri->ptr[0] == '*'){
            ri->ptr++;
            ri->tokenType = '*';
            ri->tokenValue = 0;
            return;
        } else if (ri->ptr[0] == '\\'){
            // need more proccesing 
            /*
                \277
                \0xa5
                \[
                \\
                \utf-8 etc...
            */
        } else {
            ri->tokenType = 'a';
            ri->tokenValue = ri->ptr[0];
            return;
        }
    }
    ri->tokenType = 0;
    ri->tokenValue = 0;
    return;
}

// <regexAtom> ::= <literal>|<charClass>|<group>
NodePtr regexAtom(RegexInfoPtr ri) {

    token(ri);
    NodePtr n = NULL;
    if (ri->tokenType == 'a') n = literal(ri);
    else if (ri->tokenType == '[') n = charClass(ri);
    else if (ri->tokenType == '(') n = group(ri);

    return n;
}

// <regex> ::= <regexAtom>|<regexAtom>'*'|<regexAtom>'|'<regex>|<regexAtom><regex>
NodePtr regex(RegexInfoPtr ri) {
    NodePtr n = regexAtom(ri);
    while (ri->ptr[0]) {
        token(ri);
        if (ri->tokenType == '*') {
            n = createNode(ri,'*',n,0);
        } else if (ri->tokenType == '|') {
            NodePtr n1 = regex(ri);
            n = createNode(ri,'|',n,n1);
        } else if (ri->tokenType == ')') {
            return n;
        } else {
            NodePtr n1 = regex(ri);
            n = createNode(ri,'+',n,n1);
        }
    } return n;
}