view regexParser/regexParser.cc @ 185:d25f4f3b4c34 pairPro

add comment
author Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
date Thu, 24 Dec 2015 20:27:49 +0900
parents 7ae0a3070647
children ef798db705e9
line wrap: on
line source

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "regexParser.h"

static NodePtr charClass(RegexInfoPtr);
static void token(RegexInfoPtr);
static NodePtr regexAtom(RegexInfoPtr);
NodePtr regex(RegexInfoPtr);

/**
 * Create a node of regex parse tree.
 *     tokenType
 *     regexPosition(state)
 *     stateTransitionTable
 */

static
NodePtr allocateNode() {
    NodePtr n = NEW(Node);
    n->cc = NULL;
    n->left = NULL;
    n->right = NULL;
    return n;
}

static
NodePtr createNode(unsigned char type,CharClassPtr cc, NodePtr left, NodePtr right) {
    NodePtr n = allocateNode();
    n->tokenType = type;
    n->cc = cc;
    n->state = NULL;
    n->left = left;
    n->right = right;
    return n;
}

CharClassPtr createCharClassWord(RegexInfoPtr ri) {
    CharClassPtr cc = NEW(CharClass);
    cc->type = 'a';
    cc->cond.w.word = ri->tokenValue;
    cc->cond.w.length = ri->ptr - ri->tokenValue;
    cc->cond.range.begin = cc->cond.range.end = *ri->tokenValue;
    return cc;
}

/*
    cond.range.begin  cond.range.end
           |----------------|
  1.b---e
  2.b------e
  3.b------------e
  4.b-----------------------e
  5.b----------------------------e

           |----------------|
  6.       b---------e
  7.       b----------------e
  8.       b---------------------e

           |----------------|
  9.               b-----e
  10.              b--------e
  11.              b-------------e

           |----------------|
  12.                       b-----e

           |----------------|
  13.                          b--e

 */
CharClassPtr insertCharClass(CharClassPtr cc, unsigned long begin, unsigned long end) {
    if (cc == NULL) {
        createCharClassRange(begin,end,0,0);
    }
    if (end < cc->cond.range.begin ) { // 1
        if (cc->left) {
            cc->left = insertCharClass(cc->left,begin,end);
        } else {
            cc->left = createCharClassRange(begin,end,0,0);
        }
        return cc;
    } else if (end == cc->cond.range.begin ) { // 2
        cc->cond.range.begin = begin;
        return cc;
    } else if (end <= cc->cond.range.end) {  // 3,4,6,7,9,10
        if (begin < cc->cond.range.begin) {  // 3,4
            cc->cond.range.begin = begin;
        }
        return cc;
    } else if (begin > cc->cond.range.end ) { // 13
        if (cc->right) {
            cc->right = insertCharClass(cc->right,begin,end);
        } else {
            cc->right = createCharClassRange(begin,end,0,0);
        }
        return cc;
    }
    if (cc->right) {
        CharClassPtr right = cc->right;
        begin = cc->cond.range.begin;
        free(cc);
        return insertCharClass(right,begin,end);
    }
    if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) { // 12
        if (end > cc->cond.range.end) cc->cond.range.end = end; // 11,8
    } else if (begin < cc->cond.range.begin) { // 5
        cc->cond.range.begin = begin;
        cc->cond.range.end = end; 
    } else {
        printf("insertCharClass Error : begin %lu end %lu cc->begin %lu cc->end %lu\n", begin,end,cc->cond.range.begin,cc->cond.range.end);
    }
    return cc;
}

// <charClass> ::= '['<literal>'-'<literal>']'
static
NodePtr charClass(RegexInfoPtr ri) {
    CharClassPtr cc = NEW(CharClass);
    NodePtr n = createNode(ri,'c',cc,0,0);
    cc->type = 'r';
    cc->nextState.bitContainer = 0;
    cc->left = NULL;
    cc->right = NULL;
    RangeListPtr rangeList = &cc->cond.range;
    rangeList->begin = *ri->ptr;
    rangeList->end = *ri->ptr;
    rangeList->next = NULL;

    for (ri->ptr++; *ri->ptr && *ri->ptr != ']'; ri->ptr++) {
        if (*ri->ptr == '-') {
            rangeList->end = *(ri->ptr + 1);
            ri->ptr++;
            continue;
        }
        if (ri->ptr[0] == 0 || ri->ptr[0] == ']') break;
        rangeList->next = NEW(RangeList);
        rangeList = rangeList->next;
        rangeList->begin = *ri->ptr;
        rangeList->end = *ri->ptr;
        rangeList->next = NULL;
    }

    RangeListPtr r = cc->cond.range.next;
    cc->cond.range.next = 0;
    for (; r; r = r->next) {
        cc = insertCharClass(cc, r->begin, r->end);
    }

    // TODO literal support
    // merge rangeList here
    if (*ri->ptr) ri->ptr++;
    token(ri);
    return n;
}

// <literal> ::= [a-z][A-Z][0-9]
static
NodePtr literal(RegexInfoPtr ri) {
    CharClassPtr cc = createCharClassWord(ri);
    token(ri);
    NodePtr n = createNode(ri,'a',cc,0,0);
    return n;
}

static
void token(RegexInfoPtr ri) {
    while (ri->ptr[0] != '\0') {
        if (ri->ptr[0] == '('){
            ri->ptr++;
            ri->tokenType = '(';
            ri->tokenValue = NULL;
            return;
        } else if (ri->ptr[0] == ')') {
            ri->ptr++;
            ri->tokenType = ')';
            ri->tokenValue = ri->ptr;
            return;
        } else if (ri->ptr[0] == ']') {
            ri->ptr++;
            ri->tokenType = ']';
            ri->tokenValue = ri->ptr;
            return;
        } else if (ri->ptr[0] == '|'){
            ri->ptr++;
            ri->tokenType = '|';
            ri->tokenValue = NULL;
            return;
        } else if (ri->ptr[0] == '*'){
            ri->ptr++;
            ri->tokenType = '*';
            ri->tokenValue = NULL;
            return;
        } else if (ri->ptr[0] == '\\'){
            // need more proccesing 
            /*
                \277
                \0xa5
                \[
                \\
                \utf-8 etc...
            */
        } else if (ri->ptr[0] == '[') {
            ri->ptr++;
            ri->tokenType = 'c';
            ri->tokenValue = ri->ptr;
            return;
        } else {
            ri->tokenType = 'a';
            ri->tokenValue = ri->ptr;
            if (isalnum(ri->ptr[0])) {
                ri->ptr++;
            }
            return;
        }
    }
    ri->tokenType = 0;
    ri->tokenValue = NULL;
    return;
}

// <regexAtom> ::= <literal>|<charClass>|<group>
static
NodePtr regexAtom(RegexInfoPtr ri) {

    NodePtr n = NULL;
    if (ri->tokenType == 'c') n = charClass(ri);
    else if (ri->tokenType == 'a') n = literal(ri);
    else if (ri->tokenType == '(') {
        n = regex(ri);
        if (ri->tokenType != ')') {
            // error
        }
        token(ri);
    }
    if (ri->tokenType == '*') {
        n = createNode(ri,'*',0,n,0);
        token(ri);
    }
    return n;
}

RegexInfoPtr createRegexInfo (RegexInfoPtr ri) {
    ri->stateNumber++;
    return ri;
}


// <regex> ::= <regexAtom> | <regexAtom>'*'<regex> | <regexAtom>'|'<regex> | <regexAtom><regexAtom>'*' | <regexAtom><regex>
NodePtr regex(RegexInfoPtr ri) {
    token(ri);
    NodePtr n = regexAtom(ri);
    while (ri->tokenType) {
        if (ri->tokenType == '*') {
            n = createNode(ri,'*',0,n,0);
            token(ri);
            return n;
        } else if (ri->tokenType == '|') {
            n = createNode(ri,'|',0,n,0);
            NodePtr n1 = regex(ri);
            n->right = n1;
        } else if (ri->tokenType == ')') {
            return n;
        } else {
            n = createNode(ri,'+',0,n,0);
            NodePtr n1 = regexAtom(ri);
            n->right = n1;
        }
    }
    return n;
}