changeset 310:df27e6cab846

CharClassMerge with Word ( no match implementation )
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Mon, 08 Feb 2016 19:23:37 +0900
parents 058c87665213
children 1d79e61a9365
files regexParser/CharClass.cc regexParser/regexParser.h regexParser/test/ccMerge.cc
diffstat 3 files changed, 73 insertions(+), 41 deletions(-) [+]
line wrap: on
line diff
--- a/regexParser/CharClass.cc	Mon Feb 08 18:04:28 2016 +0900
+++ b/regexParser/CharClass.cc	Mon Feb 08 19:23:37 2016 +0900
@@ -16,6 +16,7 @@
     cc->cond.range.end = end;
     cc->cond.w.word = NULL;
     cc->cond.w.length = 0;
+    cc->cond.w.next = NULL;
     cc->left = left;
     cc->right = right;
     cc->nextState.bitContainer = state;
@@ -28,10 +29,41 @@
     cc->right = NULL;
     cc->cond.w.word = ri->tokenValue;
     cc->cond.w.length = ri->ptr - ri->tokenValue;
+    cc->cond.w.next = NULL;
     cc->cond.range.begin = cc->cond.range.end = *ri->tokenValue;
     return cc;
 }
 
+CharClassPtr createCharClassRangeWord(unsigned long begin, unsigned long end,CharClassPtr cc,CharClassPtr m, CharClassPtr left, CharClassPtr right) {
+    // same range and seme nextState?
+    CharClassPtr ncc = NEW(CharClass);
+    ncc->cond.range.begin = begin;
+    ncc->cond.range.end = end;
+    ncc->cond.w = cc->cond.w;
+    ncc->left = left;
+    ncc->right = right;
+    if (!m) {
+        ncc->nextState.bitContainer = cc->nextState.bitContainer;
+        return ncc;
+    }
+    if (m->cond.w.word) {
+        WordPtr w = &m->cond.w;
+        WordPtr *next = &ncc->cond.w.next;
+        while(w->next) {   // insert sort?
+            WordPtr n = NEW(Word);
+            n->word = w->word;
+            n->length = w->length;
+            *next = n;
+            next = &n->next;
+            w = w->next;
+        }
+        *next = &m->cond.w;
+    } 
+    ncc->nextState.bitContainer = m->nextState.bitContainer | cc->nextState.bitContainer;
+    return ncc;
+}
+
+
 /*
     cond.range.begin  cond.range.end
            |----------------|
@@ -107,14 +139,14 @@
 
 
 
-CharClassPtr charClassMerge(CharClassPtr cc,unsigned long begin, unsigned long end, BitVector nextState) ;
+CharClassPtr charClassMerge(CharClassPtr cc,unsigned long begin, unsigned long end, CharClassPtr m) ;
 
-CharClassPtr mergeCCTree(CharClassPtr cc,unsigned char mBegin,unsigned char mEnd,BitVector nextState) {
+CharClassPtr mergeCCTree(CharClassPtr cc,unsigned char mBegin,unsigned char mEnd,CharClassPtr m) {
     CharClassPtr cc1;
     if (cc) {
-        cc1 = charClassMerge(cc,mBegin,mEnd,nextState);
+        cc1 = charClassMerge(cc,mBegin,mEnd,m);
     } else {
-        cc1 = createCharClassRange(mBegin,mEnd,nextState.bitContainer,NULL,NULL);
+        cc1 = createCharClassRangeWord(mBegin,mEnd,m,NULL,NULL,NULL);
     }
     return cc1;
 }
@@ -146,74 +178,74 @@
 
  */
 
-CharClassPtr charClassMerge(CharClassPtr cc,unsigned long begin, unsigned long end, BitVector nextState) {
+CharClassPtr charClassMerge(CharClassPtr cc,unsigned long begin, unsigned long end, CharClassPtr m) {
     // 重なっているccの領域を分割する
     // 必要ならばnextStateを重ねあわせる
     // 変更があった場合は新しくリストを作って返す
     if (end < cc->cond.range.begin ) { // 1
         if (cc->left) {
-            return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,nextState.bitContainer,charClassMerge(cc->left,begin,end,nextState),cc->right);
+            return createCharClassRangeWord(cc->cond.range.begin,cc->cond.range.end,m,NULL,charClassMerge(cc->left,begin,end,m),cc->right);
         } else {
-            return createCharClassRange(begin,end,nextState.bitContainer,NULL,cc);
+            return createCharClassRangeWord(begin,end,m,NULL,NULL,cc);
         }
     } else if (end == cc->cond.range.begin && begin != end ) { // 2
-        CharClassPtr cc1 = mergeCCTree(cc->left,begin,end-1,nextState);
+        CharClassPtr cc1 = mergeCCTree(cc->left,begin,end-1,m);
         if (cc->cond.range.begin == cc->cond.range.end) {
-            return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,
-                cc->nextState.bitContainer | nextState.bitContainer, cc1,cc->right);
+            return createCharClassRangeWord(cc->cond.range.begin,cc->cond.range.end,
+                cc,m, cc1,cc->right);
         }
-        CharClassPtr cc3 = createCharClassRange(cc->cond.range.begin+1,cc->cond.range.end,cc->nextState.bitContainer,cc->left,cc->right);
-        return createCharClassRange(cc->cond.range.begin,cc->cond.range.begin,
-            cc->nextState.bitContainer | nextState.bitContainer,cc1,cc3);
+        CharClassPtr cc3 = createCharClassRangeWord(cc->cond.range.begin+1,cc->cond.range.end,cc,NULL,cc->left,cc->right);
+        return createCharClassRangeWord(cc->cond.range.begin,cc->cond.range.begin,
+            cc,m,cc1,cc3);
     } else if (end < cc->cond.range.end) { // range.begin < end
         if (begin < cc->cond.range.begin) {  // 3
-            CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,nextState);
-            CharClassPtr cc3 = createCharClassRange(end+1,cc->cond.range.end,cc->nextState.bitContainer,cc->left,cc->right);
-            return createCharClassRange(cc->cond.range.begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc1,cc3);
+            CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,m);
+            CharClassPtr cc3 = createCharClassRangeWord(end+1,cc->cond.range.end,cc,NULL,cc->left,cc->right);
+            return createCharClassRangeWord(cc->cond.range.begin,end,m,cc,cc1,cc3);
         }
         if (begin == cc->cond.range.begin) {  // 6
-            CharClassPtr cc2 = createCharClassRange(end+1,cc->cond.range.end,cc->nextState.bitContainer,NULL,cc->right);
-            return createCharClassRange(begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc->left,cc2);
+            CharClassPtr cc2 = createCharClassRangeWord(end+1,cc->cond.range.end,cc,NULL,NULL,cc->right);
+            return createCharClassRangeWord(begin,end,cc,m,cc->left,cc2);
         }
         // 9
-        CharClassPtr cc2 = createCharClassRange(cc->cond.range.begin,begin-1,cc->nextState.bitContainer,cc->left,NULL);
-        CharClassPtr cc3 = createCharClassRange(end+1,cc->cond.range.end,cc->nextState.bitContainer,NULL,cc->right);
-        return createCharClassRange(begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc2,cc3);
+        CharClassPtr cc2 = createCharClassRangeWord(cc->cond.range.begin,begin-1,cc,NULL,cc->left,NULL);
+        CharClassPtr cc3 = createCharClassRangeWord(end+1,cc->cond.range.end,cc,NULL,NULL,cc->right);
+        return createCharClassRangeWord(begin,end,cc,m,cc2,cc3);
     } else if (end == cc->cond.range.end) {
         if (begin == cc->cond.range.begin) { // 7
-            return createCharClassRange(begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc->left,cc->right);
+            return createCharClassRangeWord(begin,end,cc,m,cc->left,cc->right);
         } else if (begin < cc->cond.range.begin) { // 4
-            CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,nextState);
-            return createCharClassRange(cc->cond.range.begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc1,cc->right);
+            CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,m);
+            return createCharClassRangeWord(cc->cond.range.begin,end,cc,m,cc1,cc->right);
         }
         // 10 cond.range.begin < begin
-        CharClassPtr cc2 = createCharClassRange(begin,cc->cond.range.end,cc->nextState.bitContainer | nextState.bitContainer,NULL,cc->right);
-        return createCharClassRange(cc->cond.range.begin,begin-1,cc->nextState.bitContainer,cc->left,cc2);
+        CharClassPtr cc2 = createCharClassRangeWord(begin,cc->cond.range.end,cc,m,NULL,cc->right);
+        return createCharClassRangeWord(cc->cond.range.begin,begin-1,cc,NULL,cc->left,cc2);
     }
     if (begin > cc->cond.range.end ) { // 13
         if (cc->right) {
-            return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,cc->nextState.bitContainer,cc->left,charClassMerge(cc->right,begin,end,nextState));
+            return createCharClassRangeWord(cc->cond.range.begin,cc->cond.range.end,cc,NULL,cc->left,charClassMerge(cc->right,begin,end,m));
         } else {
-            return createCharClassRange(begin,end,nextState.bitContainer,cc,NULL);
+            return createCharClassRangeWord(begin,end,m,NULL,cc,NULL);
         }
     }
     if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) {
         if (end > cc->cond.range.end) { // cond.range.end < end
             if (begin == cc->cond.range.begin) {    // 8
-                CharClassPtr cc1 = mergeCCTree(cc->right,cc->cond.range.end+1,end,nextState);
-                return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,
-                    cc->nextState.bitContainer | nextState.bitContainer,cc->left,cc1);
+                CharClassPtr cc1 = mergeCCTree(cc->right,cc->cond.range.end+1,end,m);
+                return createCharClassRangeWord(cc->cond.range.begin,cc->cond.range.end,
+                    cc,m,cc->left,cc1);
             }
             if (begin > cc->cond.range.begin) {  // 11,12
-                CharClassPtr cc1 = mergeCCTree(cc->right,cc->cond.range.end+1,end,nextState);
-                CharClassPtr cc3 = createCharClassRange(cc->cond.range.begin,begin-1,cc->nextState.bitContainer,cc->left,NULL);
-                return createCharClassRange(begin,cc->cond.range.end,cc->nextState.bitContainer | nextState.bitContainer,cc3,cc1);
+                CharClassPtr cc1 = mergeCCTree(cc->right,cc->cond.range.end+1,end,m);
+                CharClassPtr cc3 = createCharClassRangeWord(cc->cond.range.begin,begin-1,cc,m,cc->left,NULL);
+                return createCharClassRangeWord(begin,cc->cond.range.end,cc,m,cc3,cc1);
             }
         }
     } else if (begin < cc->cond.range.begin) { // 5
-        CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,nextState);
-        CharClassPtr cc3 = mergeCCTree(cc->right,cc->cond.range.end+1,end,nextState);
-        return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,cc->nextState.bitContainer | nextState.bitContainer,cc1,cc3);
+        CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,m);
+        CharClassPtr cc3 = mergeCCTree(cc->right,cc->cond.range.end+1,end,m);
+        return createCharClassRangeWord(cc->cond.range.begin,cc->cond.range.end,cc,m,cc1,cc3);
     } else {
         printf("charClassMerge Error : begin %lu end %lu cc->begin %lu cc->end %lu\n", begin,end,cc->cond.range.begin,cc->cond.range.end);
     }
@@ -294,13 +326,11 @@
     }
     CharClassWalkerPtr walk = createCharClassWalker(x->cc);
     CharClassPtr ccy = y;
-    BitVector bi;
     while (hasNext(walk)) {
         CharClassPtr cc = getNext(walk);
         unsigned long begin = cc->cond.range.begin;
         unsigned long end = cc->cond.range.end;
-        bi = cc->nextState;
-        ccy = charClassMerge(ccy,begin,end,bi);
+        ccy = charClassMerge(ccy,begin,end,cc);
     }
     free(walk);
     return ccy;
--- a/regexParser/regexParser.h	Mon Feb 08 18:04:28 2016 +0900
+++ b/regexParser/regexParser.h	Mon Feb 08 19:23:37 2016 +0900
@@ -13,7 +13,7 @@
     int length;
     // look up table for BM search.
     // BitVector nextState;
-    // struct word *next;
+    struct word *next;
 } Word, *WordPtr;
 
 typedef struct utf8Range {
--- a/regexParser/test/ccMerge.cc	Mon Feb 08 18:04:28 2016 +0900
+++ b/regexParser/test/ccMerge.cc	Mon Feb 08 19:23:37 2016 +0900
@@ -24,6 +24,8 @@
     NodePtr n = NULL;
     StatePtr s = NULL;
     TGValue tgv = createTGValue();
+    State dummy;
+    tgv.tg->stateEnd = &dummy;
     for (int i = 1; i < argc; i++) {
         if (strcmp(argv[i],"-regex") == 0) {
             ri.ptr = (unsigned char*)argv[i+1]; i++;