changeset 63:020ba001c58a

modify I/O routine. use mmap. it's really faster than fgets ;-)
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Fri, 05 Nov 2010 01:39:42 +0900
parents 5ab54a732ddb
children c981dc66b258 bee3a64d6cbc c657ca0f0b30
files pyrect/jitgrep.py pyrect/regexp/ast.py pyrect/translator/c_translator.py pyrect/translator/grep_translator.py pyrect/translator/template/grep.c
diffstat 5 files changed, 169 insertions(+), 123 deletions(-) [+]
line wrap: on
line diff
--- a/pyrect/jitgrep.py	Thu Nov 04 22:04:34 2010 +0900
+++ b/pyrect/jitgrep.py	Fri Nov 05 01:39:42 2010 +0900
@@ -62,7 +62,7 @@
         return
 
     if opts.time : start_time = time.time()
-    reg = Regexp(string)
+    reg = Regexp(".*"+string)
     if cbc:
         grept = CbCGREPTranslator(reg)
     elif opts.label:
--- a/pyrect/regexp/ast.py	Thu Nov 04 22:04:34 2010 +0900
+++ b/pyrect/regexp/ast.py	Fri Nov 05 01:39:42 2010 +0900
@@ -113,15 +113,16 @@
             return -1
 
 class Character(InputNode):
+    import curses.ascii as ascii
+    ASCII = ascii.controlnames + \
+            ["'"+chr(c)+"'" for c in range(33, 127)]\
+            + ['DEL'] + [r"\x%x" % c for c in range(128, 256)]
+
     def __init__(self, char):
         self.char = ord(char)
 
     def __str__(self):
-        if not self.char in range(33, 127): # not Ascii
-            c = r"\\x%x" % self.char
-        else:
-            c = chr(self.char)
-        return "'" + c + "'"
+        return self.ASCII[self.char]
 
     def __hash__(self):
         return self.char.__hash__()
--- a/pyrect/translator/c_translator.py	Thu Nov 04 22:04:34 2010 +0900
+++ b/pyrect/translator/c_translator.py	Fri Nov 05 01:39:42 2010 +0900
@@ -19,6 +19,7 @@
         if fa == "DFA":
             self.cg = regexp.dfacg
         self.debug = False
+        self.eols = (Character('\0'), Character('\n'), Character('\r'))
         self.special_rule = (Range, BegLine, MBCharacter)
         self.trans_stmt = self._trans_stmt(self.emit)
 
@@ -143,24 +144,21 @@
         else:
             default = "reject"
 
-        any_ = None
-
         for input_ in transition.keys():
             if type(input_) in self.special_rule:
                 self.trans_stmt.emit(input_, self.state_name(transition.pop(input_)))
             elif type(input_) is AnyChar:
-                any_ = (input_, self.state_name(transition.pop(input_)))
-                default = None
+                default = self.state_name(transition.pop(input_))
 
         if cur_state in self.cg.accepts:
-            eol = Character('\0')
-            transition[eol] = "accept"
+            for eol in self.eols:
+                transition[eol] = "accept"
+        elif default != "reject":
+            for eol in self.eols:
+                transition[eol] = "reject"
 
         self.emit_switch(transition, default)
 
-        if any_:
-            self.trans_stmt.emit(any_[0], any_[1])
-
         self.emitd("}", 2)
 
     def emit_initialization(self):
@@ -196,7 +194,7 @@
             self._emit("/* %s */" % input_node.__repr__())
 
         def visit_Character(self, char):
-            self._emit("case %d: /* match %s */" % (char.char, chr(char.char)))
+            self._emit("case %d: /* match %s */" % (char.char, char))
             self._emit("  return %s(s);" % self.next)
 
         def visit_EndLine(self, endline):
@@ -231,11 +229,6 @@
                 self._emit("if ('%s' <= *s && *s <= '%s')" % (range.lower.char, range.upper.char))
                 self._emit("  return %s(s+1);" % self.next, 2)
 
-        def visit_AnyChar(self, anychar):
-            self._emit(r"if (*s != '\0')")
-            self._emit(   "return %s(SKIP(s));" % self.next, 2)
-            self._emit("return reject(s);")
-
 def test():
     import doctest
     doctest.testmod()
--- a/pyrect/translator/grep_translator.py	Thu Nov 04 22:04:34 2010 +0900
+++ b/pyrect/translator/grep_translator.py	Fri Nov 05 01:39:42 2010 +0900
@@ -3,6 +3,7 @@
 import os
 from c_translator import CTranslator
 from pyrect.regexp import Regexp, Analyzer
+from pyrect.regexp.ast import ASTWalker, AnyChar, Character
 
 class GREPTranslateExeption(Exception):
     pass
@@ -26,6 +27,8 @@
         self.thread_dfa = 1
         self.thread_line = 1
         self.filter = True
+        self.interface = "UCHARP beg, UCHARP buf, UCHARP end"
+        self.args = "beg, buf, end"
 
     def getbufsize(self,):
         return self.__bufsize
@@ -35,28 +38,26 @@
     bufsize = property(getbufsize, setbufsize)
 
     def emit_initialization(self):
-        CTranslator.emit_initialization(self)
-        if self.thread_dfa > 1 and self.regexp.max_len != float("inf"):
-            self.emit("#define DFA paradfa")
-            self.emit("#define THREAD_NUM %d" % self.thread_dfa)
-            self.emit("#define REG_MAX_LEN %d" % self.regexp.max_len)
-        else:
-            self.emit("#define DFA dfa")
-            self.emit("#define THREAD_NUM 1 // no threading")
-            self.emit("#define REG_MAX_LEN -1")
-
+        self.emit("#include <stdio.h>")
         self.emit("#define GREP grep")
-        self.emit("#define LINEBUFSIZE %d" % self.bufsize)
-        self.emit("#define READBUFSIZE %d" % self.bufsize)
-        self.emit('#include <pthread.h>')
+        self.emit("#define UCHARP unsigned char *")
         self.emit("#include <stdlib.h>")
+        self.emit("#include <sys/mman.h>")
+        self.emit("#include <sys/types.h>")
+        self.emit("#include <sys/stat.h>")
+        self.emit("#include <fcntl.h>")
         self.emit("#include <string.h>")
-        self.emit("char readbuf[%d];" % (self.bufsize))
-        self.emit("int dfa(unsigned char* s, int len);", 2)
-        self.emit("int paradfa(unsigned char* s, int len);", 2)
+
+        self.emit_skip()
 
-        if self.filter and self.regexp.must_words:
-            self.emit_filter(self.regexp.must_words)
+        for state in self.cg.map.iterkeys():
+            self.emit("void %s(%s);" % (self.state_name(state), self.interface))
+        self.emit('void accept(%s);' % self.interface)
+        self.emit('void reject(%s);' % self.interface)
+        self.emit("void dfa(%s);" % self.interface, 2)
+
+        #if self.filter and self.regexp.must_words:
+        #    self.emit_filter(self.regexp.must_words)
 
         grepsource = open(self.BASE_DIR + "/template/grep.c")
         self.emit(grepsource.read())
@@ -72,15 +73,15 @@
 
         if len(words) == 1:
             if len(key) == self.regexp.min_len:
-                self.emit("#define FILTER_ONLY 1", 1)
+                self.emit("#define MATCH (bm_filter(beg, buf, n-1))", 1)
         else:
-            self.emit("#define WITH_FILTER 1", 1)
+            self.emit("#define (bm_filter(beg, buf, n-1) && DFA(beg, buf, n-1))", 1)
 
         self.emit("#define FILTER bm_filter", 2)
-        self.emiti("int bm_filter(unsigned char* text, int n) {")
+        self.emiti("int bm_filter(unsigned char* buf, int n) {")
         l = len(key)
         if l == 1:
-            self.emit("   return (strchr(text, %d) != NULL)" % ord(key))
+            self.emit("   return (strchr(buf, %d) != NULL)" % ord(key))
             self.emitd("}", 2)
             return
 
@@ -98,10 +99,10 @@
         self.emit("int i = %d, j, k, len = %d;" % (l-1 ,l))
         self.emit("unsigned char c, tail = %d; //'%c'" % (ord(key[l-1]), key[l-1]), 2)
         self.emiti("while (i < n) {")
-        self.emit(   "c = text[i];")
+        self.emit(   "c = buf[i];")
         self.emiti(  "if (c == tail) {")
         self.emit(     "j = len - 1; k = i;")
-        self.emiti(    "while (key[--j] == text[--k]) {")
+        self.emiti(    "while (key[--j] == buf[--k]) {")
         self.emit(       "if (j == 0) return 1;")
         self.emitd(    "}")
         self.emitd(  "}")
@@ -111,22 +112,104 @@
         self.emitd("}", 2)
 
     def emit_driver(self):
-        self.emiti("int dfa(unsigned char *text, int len) {")
-        self.emit(   "len++; //try match for n+1 times.")
-        self.emiti(  "while (len--) {")
-        self.emit(     "if (%s(text++)) return 1;" % self.state_name(self.cg.start))
-        self.emitd(  "}")
-        self.emit(   "return 0;")
+        self.emiti("void dfa(%s) {" % self.interface)
+        self.emit(   "%s(%s);" % (self.state_name(self.cg.start), self.args))
+        self.emit(   "return;")
         self.emitd("}")
         return
 
+    def emit_accept_state(self):
+        self.emiti("void accept(%s) {" % self.interface)
+        self.emit(   "buf--;")
+        self.emit(   "UCHARP ret = (UCHARP)memchr(buf, '\\n', (buf - end));")
+        self.emit(   'if (ret == NULL) {fprintf(stderr, "memchr NULL err!"); exit(0);}')
+        self.emiti(  "if (ret > end) {")
+        self.emit(     "ret--;")
+        self.emit(     "print_line(beg, ret);")
+        self.emit(     "return;")
+        self.emitd(  "}")
+        self.emit(   "print_line(beg, ret);")
+        self.emit(   "beg = buf = ret + 1;")
+        self.emit(   "%s(%s);" % (self.state_name(self.cg.start), self.args))
+        self.emitd("}", 2)
+
+    def emit_reject_state(self):
+        self.emiti("void reject(%s) {" % self.interface)
+        self.emit(   "if (buf >= end) return;")
+        self.emit(   "beg = buf;")
+        self.emit(   "%s(%s);" % (self.state_name(self.cg.start), self.args))
+        self.emitd("}", 2)
+
+    def emit_switch(self, case, default=None):
+        if not case:
+            if default:
+                self.emit("return %s(%s);" % (default, self.args))
+            return
+        self.emiti("switch(*buf++) {")
+        for case, next_ in case.iteritems():
+            self.trans_stmt.emit(case, self.state_name(next_))
+        if default:
+            if default == self.state_name(self.cg.start):
+                self.emit("default: return %s(%s);" % (default, self.args))
+        self.emitd("}")
+
     def emit_state(self, cur_state, transition):
+        self.emiti("void %s(%s) {" % (self.state_name(cur_state), self.interface))
+
         if cur_state in self.cg.accepts:
-            self.emiti("int %s(unsigned char* s) {" % self.state_name(cur_state))
-            self.emit(   "return accept(s);")
-            self.emitd("}")
-        else:
-            CTranslator.emit_state(self, cur_state, transition)
+            self.emit(   "return accept(beg, buf-1, end);")
+            self.emitd("}", 2)
+            return
+
+        default = self.state_name(self.cg.start)
+        for eol in self.eols:
+            transition[eol] = "reject"
+
+        for input_ in transition.keys():
+            if type(input_) in self.special_rule:
+                self.trans_stmt.emit(input_, self.state_name(transition.pop(input_)))
+            elif type(input_) is AnyChar:
+                default = self.state_name(transition.pop(input_))
+
+        self.emit_switch(transition, default)
+
+        self.emitd("}", 2)
+
+    class _trans_stmt(ASTWalker):
+        def __init__(self, emit):
+            self._emit = emit
+            self.args = "beg, buf, end"
+
+        def emit(self, input_node, next_):
+            self.next = next_
+            input_node.accept(self)
+
+        def visit(self, input_node):
+            self._emit("/* UNKNOW RULE */")
+            self._emit("/* %s */" % input_node.__repr__())
+
+        def visit_Character(self, char):
+            self._emit("case %d: /* match %s */" % (char.char, char))
+            self._emit("  return %s(%s);" % (self.next, self.args))
+
+        # Special Rule
+        def visit_BegLine(self, begline):
+            self._emit("/* begin of line  */")
+            self._emit("if (buf == beg)")
+            self._emit("  return %s(%s);" % (self.next, self.args), 2)
+
+        def visit_Range(self, range):
+            if isinstance(range.lower, MBCharacter) and not \
+               isinstance(range.upper, MBCharacter) or  \
+               isinstance(range.upper, MBCharacter) and not \
+               isinstance(range.lower, MBCharacter):
+                return
+
+            if isinstance(range.lower, MBCharacter):
+                self.visit(range)
+            else:
+                self._emit("if ('%s' <= *buf && *buf <= '%s')" % (range.lower.char, range.upper.char))
+                self._emit("  return %s(beg, buf+1, end);" % self.next, 2)
 
 def test():
     import doctest
--- a/pyrect/translator/template/grep.c	Thu Nov 04 22:04:34 2010 +0900
+++ b/pyrect/translator/template/grep.c	Fri Nov 05 01:39:42 2010 +0900
@@ -1,3 +1,4 @@
+/*
 typedef struct _thread_arg {
   unsigned char *buf;
   int len;
@@ -38,92 +39,60 @@
   return 0;
 }
 
-int paragrep(char *regexp, FILE *f, char *name) {
-  int nmatch, used_buf = 0,
-    reading = 1;
-  char lbuf[THREAD_NUM][LINEBUFSIZE];
-  pthread_t hundle[THREAD_NUM];
-  thread_arg_t targ[THREAD_NUM];
-  int i, j, n, m;
-  do {
-    for (i = 0; i < THREAD_NUM; i++) {
-      if (fgets(lbuf[i], sizeof lbuf[i], f) == NULL) {
-        reading = 0;
-        break;
-      } else {
-        n = strlen(lbuf[i]);
-        if (n > 0 && lbuf[i][n-1] == '\n')
-          lbuf[i][n-1] = '\0';
-        targ[i].buf = (unsigned char *)lbuf[i];
-        targ[i].len = n - 1;
-        pthread_create(&hundle[i], NULL, (void *)thread_dfa, (void *)&targ[i]);
-      }
-    }
-    for (j = 0; j < i; j++) {
-      pthread_join(hundle[j], NULL);
-      if (targ[j].match) {
-        nmatch++;
-        if (name != NULL)
-          printf("%s:", name);
-        printf("%s\n", targ[j].buf);
-      }
-    }
-  } while (i != 0);
-  return nmatch;
+*/
+
+void print_line(unsigned char *beg, unsigned char *end) {
+  fwrite(beg, sizeof(char), (end - beg + 1), stdout);
 }
 
-int grep(char *regexp, FILE *f, char *name) {
-  int n, nmatch;
-  char lbuf[LINEBUFSIZE];
-  buf = (unsigned char *)lbuf;
-  nmatch = 0;
-  while (fgets(lbuf, sizeof lbuf, f) != NULL) {
-    n = strlen(lbuf);
-    if (n > 0 && buf[n-1] == '\n')
-      lbuf[n-1] = '\0';
+void grep(char *regexp, int fd, char *name) {
+  caddr_t file_mmap;
+  unsigned char *buf, *end, *beg;
+  off_t size;
+  struct stat sb;
+
+  if (fstat(fd, &sb)) {
+    fprintf(stderr, "can't fstat %s\n", name);
+    exit(0);
+  }
 
-#ifdef FILTER_ONLY
-    if (FILTER(buf, n-1)) {
-#elif  defined(WITH_FILTER)
-    if (FILTER(buf, n-1) && DFA(buf, n-1)) {
-#else
-    if (DFA(buf, n-1)) {
-#endif
-      nmatch++;
-      if (name != NULL)
-        printf("%s:", name);
-      printf("%s\n", lbuf);
-    }
+  size = sb.st_size;
+  file_mmap = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, (off_t)0);
+
+  if (file_mmap == (caddr_t)-1) {
+    fprintf(stderr, "can't mmap %s\n", name);
+    exit(0);
   }
-  return nmatch;
+
+  beg = buf = (unsigned char *) file_mmap;
+  end = beg + size - 1;
+
+  dfa(beg, beg, end);
+
+  munmap(file_mmap, size);
+  return;
 }
 
 int main(int argc, char* argv[]) {
-  int i, nmatch;
-  FILE *f;
+  int i, fd;
 
   if (argc < 2) {
     fprintf(stderr, "usage: grep regexp [file ...]");
     exit(0);
   }
-  nmatch = 0;
   if (argc == 2) {
-    if (GREP(argv[1], stdin, NULL))
-      nmatch++;
+    return;
   } else {
     for (i = 2; i < argc; i++) {
-      f = fopen(argv[i], "r");
-      if (f == NULL) {
+      fd = open(argv[i], O_RDONLY, 0666);
+      if (fd == 0) {
         fprintf(stderr, "can't open %s:", argv[i]);
         continue;
       }
-      if (READBUFSIZE > 0)
-        setvbuf(f, readbuf, _IOFBF, READBUFSIZE);
-      if (GREP(argv[1], f, argc > 3 ? argv[i] : NULL) > 0)
-        nmatch++;
-      fclose(f);
+      GREP(argv[1], fd, argc > 3 ? argv[i] : NULL);
+      close(fd);
     }
   }
 
-  return nmatch;
+  return 0;
 }