changeset 293:948428caf616

NFA maximum match worked
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Tue, 02 Feb 2016 10:38:45 +0900
parents 868f01f1ba8e
children bcb3b0cd5604
files regexParser/CeriumGrep.cc regexParser/TODO regexParser/cerium/CeriumMain.cc regexParser/cerium/ppe/Exec.cc regexParser/cerium/ppe/Print.cc regexParser/fileread.cc regexParser/grepWalk.cc regexParser/grepWalk.h regexParser/subsetConstruction.cc regexParser/threadedSearch.cc
diffstat 10 files changed, 92 insertions(+), 29 deletions(-) [+]
line wrap: on
line diff
--- a/regexParser/CeriumGrep.cc	Mon Feb 01 21:52:57 2016 +0900
+++ b/regexParser/CeriumGrep.cc	Tue Feb 02 10:38:45 2016 +0900
@@ -65,7 +65,7 @@
         st_mmap_t st_mmap = createSt_mmap(filename,fd);
         Buffer buff = createBuffer(st_mmap);
         if (ts) threadedSearch(tgv.tg,buff);
-        else grepWalk(tgv.tg,buff);
+        else grepWalk(tgv.tg,buff.buffptr,buff);
         close(fd);
     }
 
--- a/regexParser/TODO	Mon Feb 01 21:52:57 2016 +0900
+++ b/regexParser/TODO	Tue Feb 02 10:38:45 2016 +0900
@@ -1,3 +1,58 @@
+Tue Feb  2 09:55:40 JST 2016
+
+    % ./regexParser -subst -regex '(a|b)*a(a|b)(a|b)'
+    ---Print Node----
+                        a(1)->(1)
+                    |
+                        b(1)->(1)
+                *
+            +
+                a(4)->(4)
+        +
+                a(4)->(8)
+            |
+                b(4)->(8)
+     +
+            a(8)->(2)
+        |
+            b(8)->(2)
+    -----------------
+    state : 1 
+    node : + 1 -> 1
+                        [a-a] (5)
+                    [b-b] (1)
+
+    state : 2*
+    node : e 2 -> 1
+
+    state : 4 
+    node : | 4 -> 1
+                    [a-a] (8)
+                        [b-b] (8)
+
+    state : 8 
+    node : | 8 -> 1
+                    [a-a] (2)
+                        [b-b] (2)
+
+    state : 5 
+                    [a-a] (1)
+                        [b-b] (9)
+
+    state : 9 
+                    [a-a] (1)          <---- 間違い  2 とmergeしているはずだが...
+                        [b-b] (3)
+
+    state : 3*
+                        [a-a] (5)
+                    [b-b] (1)
+
+    やはり charClassMerge のbugだった。
+
+    createCharClassRangeで、同じものだったら新しく作らないってのがあると良い
+    charClassMerg が同じものを返す場合があるってことね
+
+
 Mon Feb  1 01:51:10 JST 2016 kono
 
     非決定性がある時の maxmum match がよろしくない
--- a/regexParser/cerium/CeriumMain.cc	Mon Feb 01 21:52:57 2016 +0900
+++ b/regexParser/cerium/CeriumMain.cc	Tue Feb 02 10:38:45 2016 +0900
@@ -34,7 +34,7 @@
         ResultPtr r = NEW(Result);
         r->continued = false;
         r->begin = tsv.matchBegin;
-        r->end = tsv.buff.buffptr;
+        r->end = tsv.matchEnd;
         *tsv.blk->resultEnd = r;
         r->next = NULL;
         tsv.blk->resultEnd = &r->next;
--- a/regexParser/cerium/ppe/Exec.cc	Mon Feb 01 21:52:57 2016 +0900
+++ b/regexParser/cerium/ppe/Exec.cc	Tue Feb 02 10:38:45 2016 +0900
@@ -26,7 +26,7 @@
     tsv.blk->resultEnd = &result;
     unsigned char *end = tsv.buff.buffend;
     tsv.buff.buffend = tsv.buff.buff+1;
-    tsv.matchBegin = NULL;
+    tsv.matchBegin = tsv.buff.buffptr;
     tsv.matchEnd = NULL;
     tsv = tSearch(tsv);
     tsv.blk->blockBegin = tsv.current;
--- a/regexParser/cerium/ppe/Print.cc	Mon Feb 01 21:52:57 2016 +0900
+++ b/regexParser/cerium/ppe/Print.cc	Tue Feb 02 10:38:45 2016 +0900
@@ -37,7 +37,7 @@
 #endif
             if ((prevBlockEnd->bitState.bitContainer & ~blockBegin->bitState.bitContainer)==0) {
                 // 前のブロックの matchBegin から最初 result の end までがマッチ
-                fwrite(prev->begin,r->end - prev->begin-1,1,stdout);
+                fwrite(prev->begin,r->end - prev->begin,1,stdout);
 // printf("####");
                 if (!r->continued) puts("");
             }
--- a/regexParser/fileread.cc	Mon Feb 01 21:52:57 2016 +0900
+++ b/regexParser/fileread.cc	Tue Feb 02 10:38:45 2016 +0900
@@ -32,7 +32,7 @@
 
 Buffer createBuffer(st_mmap_t st_mmap) {
     Buffer buff;
-    buff.buff = buff.buffptr = buff.matchBegin = st_mmap.file_mmap;
+    buff.buff = buff.buffptr = st_mmap.file_mmap;
     buff.buffend = buff.buff + st_mmap.size;
     return buff;
 }
--- a/regexParser/grepWalk.cc	Mon Feb 01 21:52:57 2016 +0900
+++ b/regexParser/grepWalk.cc	Tue Feb 02 10:38:45 2016 +0900
@@ -3,17 +3,24 @@
 #include "grepWalk.h"
 #include "subsetConstruction.h"
 
-void grepMatch(TransitionGeneratorPtr tg,Buffer buff);
-void grep(TransitionGeneratorPtr tg,Buffer buff);
-void grepSkip(TransitionGeneratorPtr tg,Buffer buff);
+void grep(TransitionGeneratorPtr tg,unsigned char *matchBegin,Buffer buff,unsigned long d) ;
 
-void grepMatch(TransitionGeneratorPtr tg,Buffer buff) {
-    fwrite(buff.matchBegin,buff.buffptr-buff.matchBegin-1,1,stdout);
-    puts("\n");
-    grepSkip(tg,buff);
+void grepSkip(TransitionGeneratorPtr tg,unsigned char *matchBegin, Buffer buff) {
+    matchBegin = buff.buffptr;
+    grep(tg,matchBegin,buff,1); // 1 is initState
 }
 
-void grep(TransitionGeneratorPtr tg,Buffer buff,unsigned long d) {
+void grepWalk(TransitionGeneratorPtr tg, unsigned char *matchBegin, Buffer buff) {
+    grepSkip(tg,matchBegin,buff);
+}
+
+void grepMatch(TransitionGeneratorPtr tg,unsigned char *matchBegin, Buffer buff) {
+    fwrite(matchBegin,buff.buffptr-matchBegin,1,stdout);
+    puts("\n");
+    grepSkip(tg,matchBegin,buff);
+}
+
+void grep(TransitionGeneratorPtr tg,unsigned char *matchBegin,Buffer buff,unsigned long d) {
     unsigned char c = *buff.buffptr++;
     if (c=='\0') return;
     StatePtr state = tg->stateList;
@@ -37,19 +44,11 @@
     }
 
     if (found == false) {
-        grepSkip(tg,buff);
+        grepSkip(tg,matchBegin,buff);
     } else if (found == true && (cc->nextState.bitContainer | 2)) { // Accept
-        grepMatch(tg,buff);
+        grepMatch(tg,matchBegin,buff);
     } else {
-        grep(tg,buff,cc->nextState.bitContainer);
+        grep(tg,matchBegin,buff,cc->nextState.bitContainer);
     }
 }
 
-void grepSkip(TransitionGeneratorPtr tg,Buffer buff) {
-    buff.matchBegin = buff.buffptr;
-    grep(tg,buff,1); // 1 is initState
-}
-
-void grepWalk(TransitionGeneratorPtr tg, Buffer buff) {
-    grepSkip(tg,buff);
-}
--- a/regexParser/grepWalk.h	Mon Feb 01 21:52:57 2016 +0900
+++ b/regexParser/grepWalk.h	Tue Feb 02 10:38:45 2016 +0900
@@ -1,3 +1,3 @@
 #include "regexParser.h"
 
-extern void grepWalk(TransitionGeneratorPtr tg, Buffer buff);
+extern void grepWalk(TransitionGeneratorPtr tg, unsigned char *matchBegin, Buffer buff);
--- a/regexParser/subsetConstruction.cc	Mon Feb 01 21:52:57 2016 +0900
+++ b/regexParser/subsetConstruction.cc	Tue Feb 02 10:38:45 2016 +0900
@@ -107,7 +107,7 @@
     }
     if (begin > cc->cond.range.end ) { // 13
         if (cc->right) {
-            return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,nextState.bitContainer,cc->left,charClassMerge(cc->right,begin,end,nextState));
+            return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,cc->nextState.bitContainer,cc->left,charClassMerge(cc->right,begin,end,nextState));
         } else {
             return createCharClassRange(begin,end,nextState.bitContainer,cc,NULL);
         }
--- a/regexParser/threadedSearch.cc	Mon Feb 01 21:52:57 2016 +0900
+++ b/regexParser/threadedSearch.cc	Tue Feb 02 10:38:45 2016 +0900
@@ -82,8 +82,12 @@
     return state->tState;
 }
 
+#define DEBUG 0
+
 TSValue tSearch(TSValue tsv) {
-    TSValuePtr tsvp = &tsv;
+#if DEBUG
+    TSValuePtr tsvp = &tsv;   // make tsv visible in lldb
+#endif
     next: while (tsv.buff.buffptr < tsv.buff.buffend) {
         unsigned char c = *tsv.buff.buffptr++;
 //        printState(tsv.current->state);
@@ -99,21 +103,25 @@
                     // match the word.
                     // if (not match) continue;
                 }
+                tsv = tsv.current->stateMatch(tsv);
                 if (ccv->tState) {
                     tsv.current = ccv->tState;
                 } else {
                     tsv.current = nextTState(ccv->state,tsv.tg);
                     ccv->tState = tsv.current;
                 }
-                tsv = tsv.current->stateMatch(tsv);
                 goto next;
             }
         }
         tsv = tsv.current->stateSkip(tsv);
         tsv.matchBegin = tsv.buff.buffptr;
     }
+#if DEBUG
     *tsvp = tsv;
     return *tsvp;
+#else
+    return tsv;
+#endif
 }
 
 void threadedSearch(TransitionGeneratorPtr tg, Buffer buff) {
@@ -124,7 +132,8 @@
     tsv.tg->stateSkip = stateSkip;
     tsv.tg->stateMatch = stateMatch;
     tsv.tg->stateNothing = stateNothing;
-    tsv.matchBegin = tsv.matchEnd = NULL;
+    tsv.matchBegin = buff.buffptr;
+    tsv.matchEnd = NULL;
     tsv.current = generateTState(tg->stateList,tg);
     tg->stateStart = NEW(State);
     *tg->stateStart = *tg->stateList;