Mercurial > hg > Applications > Grep
changeset 293:948428caf616
NFA maximum match worked
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 02 Feb 2016 10:38:45 +0900 |
parents | 868f01f1ba8e |
children | bcb3b0cd5604 |
files | regexParser/CeriumGrep.cc regexParser/TODO regexParser/cerium/CeriumMain.cc regexParser/cerium/ppe/Exec.cc regexParser/cerium/ppe/Print.cc regexParser/fileread.cc regexParser/grepWalk.cc regexParser/grepWalk.h regexParser/subsetConstruction.cc regexParser/threadedSearch.cc |
diffstat | 10 files changed, 92 insertions(+), 29 deletions(-) [+] |
line wrap: on
line diff
--- a/regexParser/CeriumGrep.cc Mon Feb 01 21:52:57 2016 +0900 +++ b/regexParser/CeriumGrep.cc Tue Feb 02 10:38:45 2016 +0900 @@ -65,7 +65,7 @@ st_mmap_t st_mmap = createSt_mmap(filename,fd); Buffer buff = createBuffer(st_mmap); if (ts) threadedSearch(tgv.tg,buff); - else grepWalk(tgv.tg,buff); + else grepWalk(tgv.tg,buff.buffptr,buff); close(fd); }
--- a/regexParser/TODO Mon Feb 01 21:52:57 2016 +0900 +++ b/regexParser/TODO Tue Feb 02 10:38:45 2016 +0900 @@ -1,3 +1,58 @@ +Tue Feb 2 09:55:40 JST 2016 + + % ./regexParser -subst -regex '(a|b)*a(a|b)(a|b)' + ---Print Node---- + a(1)->(1) + | + b(1)->(1) + * + + + a(4)->(4) + + + a(4)->(8) + | + b(4)->(8) + + + a(8)->(2) + | + b(8)->(2) + ----------------- + state : 1 + node : + 1 -> 1 + [a-a] (5) + [b-b] (1) + + state : 2* + node : e 2 -> 1 + + state : 4 + node : | 4 -> 1 + [a-a] (8) + [b-b] (8) + + state : 8 + node : | 8 -> 1 + [a-a] (2) + [b-b] (2) + + state : 5 + [a-a] (1) + [b-b] (9) + + state : 9 + [a-a] (1) <---- 間違い 2 とmergeしているはずだが... + [b-b] (3) + + state : 3* + [a-a] (5) + [b-b] (1) + + やはり charClassMerge のbugだった。 + + createCharClassRangeで、同じものだったら新しく作らないってのがあると良い + charClassMerg が同じものを返す場合があるってことね + + Mon Feb 1 01:51:10 JST 2016 kono 非決定性がある時の maxmum match がよろしくない
--- a/regexParser/cerium/CeriumMain.cc Mon Feb 01 21:52:57 2016 +0900 +++ b/regexParser/cerium/CeriumMain.cc Tue Feb 02 10:38:45 2016 +0900 @@ -34,7 +34,7 @@ ResultPtr r = NEW(Result); r->continued = false; r->begin = tsv.matchBegin; - r->end = tsv.buff.buffptr; + r->end = tsv.matchEnd; *tsv.blk->resultEnd = r; r->next = NULL; tsv.blk->resultEnd = &r->next;
--- a/regexParser/cerium/ppe/Exec.cc Mon Feb 01 21:52:57 2016 +0900 +++ b/regexParser/cerium/ppe/Exec.cc Tue Feb 02 10:38:45 2016 +0900 @@ -26,7 +26,7 @@ tsv.blk->resultEnd = &result; unsigned char *end = tsv.buff.buffend; tsv.buff.buffend = tsv.buff.buff+1; - tsv.matchBegin = NULL; + tsv.matchBegin = tsv.buff.buffptr; tsv.matchEnd = NULL; tsv = tSearch(tsv); tsv.blk->blockBegin = tsv.current;
--- a/regexParser/cerium/ppe/Print.cc Mon Feb 01 21:52:57 2016 +0900 +++ b/regexParser/cerium/ppe/Print.cc Tue Feb 02 10:38:45 2016 +0900 @@ -37,7 +37,7 @@ #endif if ((prevBlockEnd->bitState.bitContainer & ~blockBegin->bitState.bitContainer)==0) { // 前のブロックの matchBegin から最初 result の end までがマッチ - fwrite(prev->begin,r->end - prev->begin-1,1,stdout); + fwrite(prev->begin,r->end - prev->begin,1,stdout); // printf("####"); if (!r->continued) puts(""); }
--- a/regexParser/fileread.cc Mon Feb 01 21:52:57 2016 +0900 +++ b/regexParser/fileread.cc Tue Feb 02 10:38:45 2016 +0900 @@ -32,7 +32,7 @@ Buffer createBuffer(st_mmap_t st_mmap) { Buffer buff; - buff.buff = buff.buffptr = buff.matchBegin = st_mmap.file_mmap; + buff.buff = buff.buffptr = st_mmap.file_mmap; buff.buffend = buff.buff + st_mmap.size; return buff; }
--- a/regexParser/grepWalk.cc Mon Feb 01 21:52:57 2016 +0900 +++ b/regexParser/grepWalk.cc Tue Feb 02 10:38:45 2016 +0900 @@ -3,17 +3,24 @@ #include "grepWalk.h" #include "subsetConstruction.h" -void grepMatch(TransitionGeneratorPtr tg,Buffer buff); -void grep(TransitionGeneratorPtr tg,Buffer buff); -void grepSkip(TransitionGeneratorPtr tg,Buffer buff); +void grep(TransitionGeneratorPtr tg,unsigned char *matchBegin,Buffer buff,unsigned long d) ; -void grepMatch(TransitionGeneratorPtr tg,Buffer buff) { - fwrite(buff.matchBegin,buff.buffptr-buff.matchBegin-1,1,stdout); - puts("\n"); - grepSkip(tg,buff); +void grepSkip(TransitionGeneratorPtr tg,unsigned char *matchBegin, Buffer buff) { + matchBegin = buff.buffptr; + grep(tg,matchBegin,buff,1); // 1 is initState } -void grep(TransitionGeneratorPtr tg,Buffer buff,unsigned long d) { +void grepWalk(TransitionGeneratorPtr tg, unsigned char *matchBegin, Buffer buff) { + grepSkip(tg,matchBegin,buff); +} + +void grepMatch(TransitionGeneratorPtr tg,unsigned char *matchBegin, Buffer buff) { + fwrite(matchBegin,buff.buffptr-matchBegin,1,stdout); + puts("\n"); + grepSkip(tg,matchBegin,buff); +} + +void grep(TransitionGeneratorPtr tg,unsigned char *matchBegin,Buffer buff,unsigned long d) { unsigned char c = *buff.buffptr++; if (c=='\0') return; StatePtr state = tg->stateList; @@ -37,19 +44,11 @@ } if (found == false) { - grepSkip(tg,buff); + grepSkip(tg,matchBegin,buff); } else if (found == true && (cc->nextState.bitContainer | 2)) { // Accept - grepMatch(tg,buff); + grepMatch(tg,matchBegin,buff); } else { - grep(tg,buff,cc->nextState.bitContainer); + grep(tg,matchBegin,buff,cc->nextState.bitContainer); } } -void grepSkip(TransitionGeneratorPtr tg,Buffer buff) { - buff.matchBegin = buff.buffptr; - grep(tg,buff,1); // 1 is initState -} - -void grepWalk(TransitionGeneratorPtr tg, Buffer buff) { - grepSkip(tg,buff); -}
--- a/regexParser/grepWalk.h Mon Feb 01 21:52:57 2016 +0900 +++ b/regexParser/grepWalk.h Tue Feb 02 10:38:45 2016 +0900 @@ -1,3 +1,3 @@ #include "regexParser.h" -extern void grepWalk(TransitionGeneratorPtr tg, Buffer buff); +extern void grepWalk(TransitionGeneratorPtr tg, unsigned char *matchBegin, Buffer buff);
--- a/regexParser/subsetConstruction.cc Mon Feb 01 21:52:57 2016 +0900 +++ b/regexParser/subsetConstruction.cc Tue Feb 02 10:38:45 2016 +0900 @@ -107,7 +107,7 @@ } if (begin > cc->cond.range.end ) { // 13 if (cc->right) { - return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,nextState.bitContainer,cc->left,charClassMerge(cc->right,begin,end,nextState)); + return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,cc->nextState.bitContainer,cc->left,charClassMerge(cc->right,begin,end,nextState)); } else { return createCharClassRange(begin,end,nextState.bitContainer,cc,NULL); }
--- a/regexParser/threadedSearch.cc Mon Feb 01 21:52:57 2016 +0900 +++ b/regexParser/threadedSearch.cc Tue Feb 02 10:38:45 2016 +0900 @@ -82,8 +82,12 @@ return state->tState; } +#define DEBUG 0 + TSValue tSearch(TSValue tsv) { - TSValuePtr tsvp = &tsv; +#if DEBUG + TSValuePtr tsvp = &tsv; // make tsv visible in lldb +#endif next: while (tsv.buff.buffptr < tsv.buff.buffend) { unsigned char c = *tsv.buff.buffptr++; // printState(tsv.current->state); @@ -99,21 +103,25 @@ // match the word. // if (not match) continue; } + tsv = tsv.current->stateMatch(tsv); if (ccv->tState) { tsv.current = ccv->tState; } else { tsv.current = nextTState(ccv->state,tsv.tg); ccv->tState = tsv.current; } - tsv = tsv.current->stateMatch(tsv); goto next; } } tsv = tsv.current->stateSkip(tsv); tsv.matchBegin = tsv.buff.buffptr; } +#if DEBUG *tsvp = tsv; return *tsvp; +#else + return tsv; +#endif } void threadedSearch(TransitionGeneratorPtr tg, Buffer buff) { @@ -124,7 +132,8 @@ tsv.tg->stateSkip = stateSkip; tsv.tg->stateMatch = stateMatch; tsv.tg->stateNothing = stateNothing; - tsv.matchBegin = tsv.matchEnd = NULL; + tsv.matchBegin = buff.buffptr; + tsv.matchEnd = NULL; tsv.current = generateTState(tg->stateList,tg); tg->stateStart = NEW(State); *tg->stateStart = *tg->stateList;