0
|
1 /* m_getfld.c - read/parse a message */
|
|
2 #ifndef lint
|
|
3 static char ident[] = "@(#)$Id$";
|
|
4 #endif /* lint */
|
|
5
|
|
6 #include "../h/mh.h"
|
|
7 #include <stdio.h>
|
|
8 #include "../zotnet/mts.h"
|
|
9 #include <ctype.h>
|
|
10
|
|
11
|
|
12 /* This module has a long and checkered history. First, it didn't burst
|
|
13 maildrops correctly because it considered two CTRL-A:s in a row to be
|
|
14 an inter-message delimiter. It really is four CTRL-A:s followed by a
|
|
15 newline. Unfortunately, MMDF will convert this delimiter *inside* a
|
|
16 message to a CTRL-B followed by three CTRL-A:s and a newline. This
|
|
17 caused the old version of m_getfld() to declare eom prematurely. The
|
|
18 fix was a lot slower than
|
|
19
|
|
20 c == '\001' && peekc (iob) == '\001'
|
|
21
|
|
22 but it worked, and to increase generality, UUCP style maildrops could
|
|
23 be parsed as well. Unfortunately the speed issue finally caught up with
|
|
24 us since this routine is at the very heart of MH.
|
|
25
|
|
26 To speed things up considerably, the routine Eom() was made an auxiliary
|
|
27 function called by the macro eom(). Unless we are bursting a maildrop,
|
|
28 the eom() macro returns FALSE saying we aren't at the end of the
|
|
29 message.
|
|
30
|
|
31 The next thing to do is to read the mtstailor file and initialize
|
|
32 delimiter[] and delimlen accordingly...
|
|
33
|
|
34 After mhl was made a built-in in msh, m_getfld() worked just fine
|
|
35 (using m_unknown() at startup). Until one day: a message which was
|
|
36 the result of a bursting was shown. Then, since the burst boundaries
|
|
37 aren't CTRL-A:s, m_getfld() would blindly plunge on past the boundary.
|
|
38 Very sad. The solution: introduce m_eomsbr(). This hook gets called
|
|
39 after the end of each line (since testing for eom involves an fseek()).
|
|
40 This worked fine, until one day: a message with no body portion arrived.
|
|
41 Then the
|
|
42
|
|
43 while (eom (c = Getc (iob), iob))
|
|
44 continue;
|
|
45
|
|
46 loop caused m_getfld() to return FMTERR. So, that logic was changed to
|
|
47 check for (*eom_action) and act accordingly.
|
|
48
|
|
49 This worked fine, until one day: someone didn't use four CTRL:A's as
|
|
50 their delimiters. So, the bullet got bit and we read mts.h and
|
|
51 continue to struggle on. It's not that bad though, since the only time
|
|
52 the code gets executed is when inc (or msh) calls it, and both of these
|
|
53 have already called mts_init().
|
|
54
|
|
55 ------------------------
|
|
56 (Written by Van Jacobson for the mh6 m_getfld, January, 1986):
|
|
57
|
|
58 This routine was accounting for 60% of the cpu time used by most mh
|
|
59 programs. I spent a bit of time tuning and it now accounts for <10%
|
|
60 of the time used. Like any heavily tuned routine, it's a bit
|
|
61 complex and you want to be sure you understand everything that it's
|
|
62 doing before you start hacking on it. Let me try to emphasize
|
|
63 that: every line in this atrocity depends on every other line,
|
|
64 sometimes in subtle ways. You should understand it all, in detail,
|
|
65 before trying to change any part. If you do change it, test the
|
|
66 result thoroughly (I use a hand-constructed test file that exercises
|
|
67 all the ways a header name, header body, header continuation,
|
|
68 header-body separator, body line and body eom can align themselves
|
|
69 with respect to a buffer boundary). "Minor" bugs in this routine
|
|
70 result in garbaged or lost mail.
|
|
71
|
|
72 If you hack on this and slow it down, I, my children and my
|
|
73 children's children will curse you.
|
|
74
|
|
75 This routine gets used on three different types of files: normal,
|
|
76 single msg files, "packed" unix or mmdf mailboxes (when used by inc)
|
|
77 and packed, directoried bulletin board files (when used by msh).
|
|
78 The biggest impact of different file types is in "eom" testing. The
|
|
79 code has been carefully organized to test for eom at appropriate
|
|
80 times and at no other times (since the check is quite expensive).
|
|
81 I have tried to arrange things so that the eom check need only be
|
|
82 done on entry to this routine. Since an eom can only occur after a
|
|
83 newline, this is easy to manage for header fields. For the msg
|
|
84 body, we try to efficiently search the input buffer to see if it
|
|
85 contains the eom delimiter. If it does, we take up to the
|
|
86 delimiter, otherwise we take everything in the buffer. (The change
|
|
87 to the body eom/copy processing produced the most noticeable
|
|
88 performance difference, particularly for "inc" and "show".)
|
|
89
|
|
90 There are three qualitatively different things this routine busts
|
|
91 out of a message: field names, field text and msg bodies. Field
|
|
92 names are typically short (~8 char) and the loop that extracts them
|
|
93 might terminate on a colon, newline or max width. I considered
|
|
94 using a Vax "scanc" to locate the end of the field followed by a
|
|
95 "bcopy" but the routine call overhead on a Vax is too large for this
|
|
96 to work on short names. If Berkeley ever makes "inline" part of the
|
|
97 C optimiser (so things like "scanc" turn into inline instructions) a
|
|
98 change here would be worthwhile.
|
|
99
|
|
100 Field text is typically 60 - 100 characters so there's (barely)
|
|
101 a win in doing a routine call to something that does a "locc"
|
|
102 followed by a "bmove". About 30% of the fields have continuations
|
|
103 (usually the 822 "received:" lines) and each continuation generates
|
|
104 another routine call. "Inline" would be a big win here, as well.
|
|
105
|
|
106 Messages, as of this writing, seem to come in two flavors: small
|
|
107 (~1K) and long (>2K). Most messages have 400 - 600 bytes of headers
|
|
108 so message bodies average at least a few hundred characters.
|
|
109 Assuming your system uses reasonably sized stdio buffers (1K or
|
|
110 more), this routine should be able to remove the body in large
|
|
111 (>500 byte) chunks. This makes the cost of a call to "bcopy"
|
|
112 small but there is a premium on checking for the eom in packed
|
|
113 maildrops. The eom pattern is always a simple string so we can
|
|
114 construct an efficient pattern matcher for it (e.g., a Vax "matchc"
|
|
115 instruction). Some thought went into recognizing the start of
|
|
116 an eom that has been split across two buffers.
|
|
117
|
|
118 This routine wants to deal with large chunks of data so, rather
|
|
119 than "getc" into a local buffer, it uses stdio's buffer. If
|
|
120 you try to use it on a non-buffered file, you'll get what you
|
|
121 deserve. This routine "knows" that struct FILEs have a _ptr
|
|
122 and a _cnt to describe the current state of the buffer and
|
|
123 it knows that _filbuf ignores the _ptr & _cnt and simply fills
|
|
124 the buffer. If stdio on your system doesn't work this way, you
|
|
125 may have to make small changes in this routine.
|
|
126
|
|
127 This routine also "knows" that an EOF indication on a stream is
|
|
128 "sticky" (i.e., you will keep getting EOF until you reposition the
|
|
129 stream). If your system doesn't work this way it is broken and you
|
|
130 should complain to the vendor. As a consequence of the sticky
|
|
131 EOF, this routine will never return any kind of EOF status when
|
|
132 there is data in "name" or "buf").
|
|
133 */
|
|
134
|
|
135
|
|
/*
 * Map this file's idea of the stdio buffer fields (_ptr, _cnt, _filbuf)
 * onto the names used by the local stdio implementation.
 */
#ifdef	FILE__PTR
#define	_ptr	__ptr
#define	_cnt	__cnt
#endif

#ifdef	_FSTDIO
#define	_ptr	_p		/* Gag */
#define	_cnt	_r		/* Retch */
#define	_filbuf	__srget		/* Puke */
#endif

/* Globals shared with other modules (all defined in sbr/m_msgdef.c). */
extern int  msg_count;		/* initially 0; m_getfld stores here the
				 * number of characters it put in the
				 * caller's buffer on the last call (a hack
				 * for "inc", see uip/scansbr.c) */
extern int  msg_style;		/* initially MS_DEFAULT */
extern char *msg_delim;		/* initially "" */

/*
 * Delimiter bookkeeping for packed maildrops.
 *
 * The "full" delimiter is a newline followed by the actual delimiter,
 * e.g. "\n\nFrom " for a Unix maildrop.
 *   fdelim     start of the full string; searched for in the BODY case
 *              of m_getfld.
 *   msg_delim  first character of the actual delimiter (fdelim + 1).
 *   edelim     second character of the actual delimiter; m_Eom starts
 *              comparing here because the first character has already
 *              been read and matched by the eom() macro.
 *   delimend   msg_delim + edelimlen.
 */
static unsigned char *fdelim;
static int  fdelimlen;
static unsigned char *edelim;
static int  edelimlen;
static unsigned char *delimend;

/* Per-byte table of positions inside the delimiter, built by m_unknown. */
static unsigned char **pat_map;

#ifdef CONTENT_LENGTH
static int  content_length = -1;	/* -1 means "no value pending" */
static long end_of_contents = -1;	/* -1 means "not known" */
#endif

/* Optional end-of-message hook installed through m_eomsbr(). */
static int  (*eom_action) () = NULL;

/* Helpers defined near the end of this file. */
static unsigned char *matchc();
static unsigned char *locc();

/* Getc is currently plain getc. */
#define Getc(iob)	getc(iob)

/*
 * eom(c,iob) is false outright for MS_DEFAULT streams.  Otherwise it is
 * true when c equals the first delimiter byte and m_Eom() confirms the
 * rest, or when an installed eom_action hook returns non-zero for c.
 */
#define eom(c,iob)	(msg_style != MS_DEFAULT && \
			 (((c) == *msg_delim && m_Eom(c,iob)) ||\
			  (eom_action && (*eom_action)(c))))
|
|
189
|
|
190 /* */
|
|
191
|
|
192 m_getfld (state, name, buf, bufsz, iob)
|
|
193 int state;
|
|
194 int bufsz;
|
|
195 unsigned char *name,
|
|
196 *buf;
|
|
197 register FILE *iob;
|
|
198 {
|
|
199 register unsigned char *cp;
|
|
200 register unsigned char *bp;
|
|
201 register unsigned char *ep;
|
|
202 register unsigned char *sp;
|
|
203 register int cnt;
|
|
204 register int c;
|
|
205 register int i;
|
|
206 register int j;
|
|
207
|
|
208 #ifdef CONTENT_LENGTH
|
|
209 /*
|
|
210 * When starting to read from a new file, we have to reset the state,
|
|
211 * but only if the state wasn't reset. That may save us a number of
|
|
212 * lseeks.
|
|
213 */
|
|
214 if (state == FLD &&
|
|
215 (content_length != -1 || end_of_contents != -1) &&
|
|
216 ftell(iob) == 0)
|
|
217 end_of_contents = content_length = -1;
|
|
218 #endif
|
|
219 if ((c = Getc(iob)) < 0) {
|
|
220 msg_count = 0;
|
|
221 *buf = 0;
|
|
222 return FILEEOF;
|
|
223 }
|
|
224 if (eom (c, iob)) {
|
|
225 if (! eom_action) {
|
|
226 /* flush null messages */
|
|
227 while ((c = Getc(iob)) >= 0 && eom (c, iob))
|
|
228 ;
|
|
229 if (c >= 0)
|
|
230 (void) ungetc(c, iob);
|
|
231 }
|
|
232 msg_count = 0;
|
|
233 *buf = 0;
|
|
234 return FILEEOF;
|
|
235 }
|
|
236
|
|
237 switch (state) {
|
|
238 case FLDEOF:
|
|
239 case BODYEOF:
|
|
240 case FLD:
|
|
241 if (c == '\n' || c == '-') {
|
|
242 /* we hit the header/body separator */
|
|
243 while (c != '\n' && (c = Getc(iob)) >= 0)
|
|
244 ;
|
|
245
|
|
246 #ifdef CONTENT_LENGTH
|
|
247 /*
|
|
248 * When we've found a content-length header, we're
|
|
249 * going to use it to tell where the message boundary
|
|
250 * is, if it is a valid mesage boundary.
|
|
251 * There can be a number of cases:
|
|
252 * - no bytes after <content-length> bytes: the usual format
|
|
253 * of a message in an MH folder.
|
|
254 * - only a newline - last message in mail drop.
|
|
255 * - "\nFrom " - beginning of next message
|
|
256 * - other - ignore Content-Length header, but issue warning
|
|
257 */
|
|
258 if (msg_style == MS_UUCP && content_length != -1) {
|
|
259 long here = ftell(iob);
|
|
260 static char delim[] = "\nFrom ";
|
|
261 char buf[sizeof(delim)-1];
|
|
262 int cnt;
|
|
263
|
|
264 /* compute position of character after file */
|
|
265 end_of_contents = here + content_length + 1;
|
|
266 content_length = -1;
|
|
267 /* And see whether this is a From header or eof. */
|
|
268 fseek(iob, end_of_contents - 1, 0);
|
|
269 cnt = fread(buf, sizeof(char), sizeof(buf), iob);
|
|
270 if (cnt != 0 && (cnt != 1 || buf[0] != '\n') &&
|
|
271 (cnt != sizeof(buf) ||
|
|
272 strncmp(buf,delim, sizeof(buf)) != 0)) {
|
|
273 advise (NULLCP, "invalid Content-Length: header\n");
|
|
274 end_of_contents = -1;
|
|
275 }
|
|
276 fseek(iob, here, 0);
|
|
277 }
|
|
278 #endif
|
|
279 if (c < 0 || (c = Getc(iob)) < 0 || eom (c, iob)) {
|
|
280 if (! eom_action) {
|
|
281 /* flush null messages */
|
|
282 while ((c = Getc(iob)) >= 0 && eom (c, iob))
|
|
283 ;
|
|
284 if (c >= 0)
|
|
285 (void) ungetc(c, iob);
|
|
286 }
|
|
287 msg_count = 0;
|
|
288 *buf = 0;
|
|
289 return FILEEOF;
|
|
290 }
|
|
291 state = BODY;
|
|
292 goto body;
|
|
293 }
|
|
294 /*
|
|
295 * get the name of this component. take characters up
|
|
296 * to a ':', a newline or NAMESZ-1 characters, whichever
|
|
297 * comes first.
|
|
298 */
|
|
299 cp = name; i = NAMESZ - 1;
|
|
300 for (;;) {
|
|
301 #ifdef _STDIO_USES_IOSTREAM
|
|
302 bp = sp = (unsigned char *) iob->_IO_read_ptr - 1;
|
|
303 j = (cnt = ((long) iob->_IO_read_end - (long) iob->_IO_read_ptr) + 1) < i?
|
|
304 cnt: i;
|
|
305 #else
|
|
306 bp = sp = (unsigned char *) iob->_ptr - 1;
|
|
307 j = (cnt = iob->_cnt+1) < i ? cnt : i;
|
|
308 #endif
|
|
309 while (--j >= 0 && (c = *bp++) != ':' && c != '\n')
|
|
310 *cp++ = c;
|
|
311
|
|
312 j = bp - sp;
|
|
313 if ((cnt -= j) <= 0) {
|
|
314 #ifdef _STDIO_USES_IOSTREAM
|
|
315 iob->_IO_read_ptr = iob->_IO_read_end;
|
|
316 if (__underflow((struct _IO_FILE *) iob) == EOF) {
|
|
317 #else
|
|
318 #ifdef FILBUF_ADJ
|
|
319 iob -> _ptr += iob -> _cnt;
|
|
320 iob -> _cnt = 0;
|
|
321 #endif /* FILBUF_ADJ */
|
|
322 if (_filbuf(iob) == EOF) {
|
|
323 #endif
|
|
324 *cp = *buf = 0;
|
|
325 advise (NULLCP, "eof encountered in field \"%s\"",
|
|
326 name);
|
|
327 return FMTERR;
|
|
328 }
|
|
329 #ifdef _STDIO_USES_IOSTREAM
|
|
330 iob->_IO_read_ptr++; /* NOT automatic in __underflow()! */
|
|
331 #endif
|
|
332 } else {
|
|
333 #ifdef _STDIO_USES_IOSTREAM
|
|
334 iob->_IO_read_ptr = bp + 1;
|
|
335 #else
|
|
336 iob->_ptr = bp + 1;
|
|
337 iob->_cnt = cnt - 1;
|
|
338 #endif
|
|
339 }
|
|
340 if (c == ':')
|
|
341 break;
|
|
342
|
|
343 /*
|
|
344 * something went wrong. possibilities are:
|
|
345 * . hit a newline (error)
|
|
346 * . got more than namesz chars. (error)
|
|
347 * . hit the end of the buffer. (loop)
|
|
348 */
|
|
349 if (c == '\n') {
|
|
350 *cp = *buf = 0;
|
|
351 advise (NULLCP, "eol encountered in field \"%s\"", name);
|
|
352 state = FMTERR;
|
|
353 goto finish;
|
|
354 }
|
|
355 if ((i -= j) <= 0) {
|
|
356 *cp = *buf = 0;
|
|
357 advise (NULLCP, "field name \"%s\" exceeds %d bytes",
|
|
358 name, NAMESZ - 1);
|
|
359 state = LENERR;
|
|
360 goto finish;
|
|
361 }
|
|
362 }
|
|
363
|
|
364 while (isspace (*--cp) && cp >= name)
|
|
365 ;
|
|
366 *++cp = 0;
|
|
367 /* fall through */
|
|
368
|
|
369 case FLDPLUS:
|
|
370 /*
|
|
371 * get (more of) the text of a field. take
|
|
372 * characters up to the end of this field (newline
|
|
373 * followed by non-blank) or bufsz-1 characters.
|
|
374 */
|
|
375 cp = buf; i = bufsz-1;
|
|
376 for (;;) {
|
|
377 #ifdef _STDIO_USES_IOSTREAM
|
|
378 cnt = (long) iob->_IO_read_end - (long) iob->_IO_read_ptr;
|
|
379 bp = (unsigned char *) --iob->_IO_read_ptr;
|
|
380 #else
|
|
381 cnt = iob->_cnt++; bp = (unsigned char *) --iob->_ptr;
|
|
382 #endif
|
|
383 c = cnt < i ? cnt : i;
|
|
384 while (ep = locc( c, bp, '\n' )) {
|
|
385 /*
|
|
386 * if we hit the end of this field, return.
|
|
387 */
|
|
388 if ((j = *++ep) != ' ' && j != '\t') {
|
|
389 #ifdef _STDIO_USES_IOSTREAM
|
|
390 j = ep - (unsigned char *) iob->_IO_read_ptr;
|
|
391 (void) bcopy( iob->_IO_read_ptr, cp, j);
|
|
392 iob->_IO_read_ptr = ep;
|
|
393 #else
|
|
394 j = ep - (unsigned char *) iob->_ptr;
|
|
395 (void) bcopy( iob->_ptr, cp, j);
|
|
396 iob->_ptr = ep; iob->_cnt -= j;
|
|
397 #endif
|
|
398 cp += j;
|
|
399 state = FLD;
|
|
400 goto finish;
|
|
401 }
|
|
402 c -= ep - bp; bp = ep;
|
|
403 }
|
|
404 /*
|
|
405 * end of input or dest buffer - copy what we've found.
|
|
406 */
|
|
407 #ifdef _STDIO_USES_IOSTREAM
|
|
408 c += bp - (unsigned char *) iob->_IO_read_ptr;
|
|
409 (void) bcopy( iob->_IO_read_ptr, cp, c);
|
|
410 #else
|
|
411 c += bp - (unsigned char *) iob->_ptr;
|
|
412 (void) bcopy( iob->_ptr, cp, c);
|
|
413 #endif
|
|
414 i -= c; cp += c;
|
|
415 if (i <= 0) {
|
|
416 /* the dest buffer is full */
|
|
417 #ifdef _STDIO_USES_IOSTREAM
|
|
418 iob->_IO_read_ptr += c;
|
|
419 #else
|
|
420 iob->_cnt -= c; iob->_ptr += c;
|
|
421 #endif
|
|
422 state = FLDPLUS;
|
|
423 break;
|
|
424 }
|
|
425 /*
|
|
426 * There's one character left in the input buffer.
|
|
427 * Copy it & fill the buffer. If the last char
|
|
428 * was a newline and the next char is not whitespace,
|
|
429 * this is the end of the field. Otherwise loop.
|
|
430 */
|
|
431 --i;
|
|
432 #ifdef _STDIO_USES_IOSTREAM
|
|
433 *cp++ = j = *(iob->_IO_read_ptr + c);
|
|
434 iob->_IO_read_ptr = iob->_IO_read_end;
|
|
435 c = __underflow((struct _IO_FILE *) iob);
|
|
436 iob->_IO_read_ptr++; /* NOT automatic! */
|
|
437 #else
|
|
438 *cp++ = j = *(iob->_ptr + c);
|
|
439 #ifdef FILBUF_ADJ
|
|
440 iob -> _ptr += iob -> _cnt;
|
|
441 iob -> _cnt = 0;
|
|
442 #endif /* FILBUF_ADJ */
|
|
443 c = _filbuf(iob);
|
|
444 #endif
|
|
445 /* bugfix, 03/1998.
|
|
446 * If we encounter EOF halfway through reading the value (ie there is
|
|
447 * no trailing \n in the field) then __underflow() above returns EOF.
|
|
448 * Previously we didn't check for this, with the result that we then
|
|
449 * attempt to read from the stream and wind up segfaulting doing a
|
|
450 * bcopy() with length parameter -1. Instead, we just append a newline
|
|
451 * to what we've read, so the following conditional will pick it up
|
|
452 * and return the field value. Then the EOF is actually dealt with
|
|
453 * the next time this function is called.
|
|
454 * The reason for appending \n is because the callers are known to
|
|
455 * work with 'name: value\n', and setting j to '\n' is known to exit
|
|
456 * in the right way, and the less we change the less likely
|
|
457 * we are to introduce new bugs. And I'm scared of the curse in the
|
|
458 * comments at the top of this file :->
|
|
459 * -- PMM (pmaydell@chiark.greenend.org.uk)
|
|
460 */
|
|
461 if (c == EOF && j != '\0' && j != '\n') {
|
|
462 *cp++ = j = '\n';
|
|
463 advise (NULLCP, "file missing final eol");
|
|
464 }
|
|
465 /* bugfix end */
|
|
466 if ((j == '\0' || j == '\n') && c != ' ' && c != '\t') {
|
|
467 if (c != EOF)
|
|
468 #ifdef _STDIO_USES_IOSTREAM
|
|
469 --iob->_IO_read_ptr;
|
|
470 #else
|
|
471 --iob->_ptr, ++iob->_cnt;
|
|
472 #endif
|
|
473 state = FLD;
|
|
474 break;
|
|
475 }
|
|
476 }
|
|
477 break;
|
|
478
|
|
479 case BODY:
|
|
480 body:
|
|
481 /*
|
|
482 * get the message body up to bufsz characters or the
|
|
483 * end of the message. Sleazy hack: if bufsz is negative
|
|
484 * we assume that we were called to copy directly into
|
|
485 * the output buffer and we don't add an eos.
|
|
486 */
|
|
487 i = (bufsz < 0) ? -bufsz : bufsz-1;
|
|
488 #ifdef _STDIO_USES_IOSTREAM
|
|
489 bp = (unsigned char *) --iob->_IO_read_ptr;
|
|
490 cnt = (long) iob->_IO_read_end - (long) iob->_IO_read_ptr;
|
|
491 #else
|
|
492 bp = (unsigned char *) --iob->_ptr; cnt = ++iob->_cnt;
|
|
493 #endif
|
|
494 c = (cnt < i ? cnt : i);
|
|
495 if (msg_style != MS_DEFAULT && c > 1) {
|
|
496 /*
|
|
497 * packed maildrop - only take up to the (possible)
|
|
498 * start of the next message. This "matchc" should
|
|
499 * probably be a Boyer-Moore matcher for non-vaxen,
|
|
500 * particularly since we have the alignment table
|
|
501 * all built for the end-of-buffer test (next).
|
|
502 * But our vax timings indicate that the "matchc"
|
|
503 * instruction is 50% faster than a carefully coded
|
|
504 * B.M. matcher for most strings. (So much for elegant
|
|
505 * algorithms vs. brute force.) Since I (currently)
|
|
506 * run MH on a vax, we use the matchc instruction. --vj
|
|
507 */
|
|
508 if (ep = matchc( fdelimlen, fdelim, c, bp ) )
|
|
509 c = ep - bp + 1;
|
|
510 else {
|
|
511 /*
|
|
512 * There's no delim in the buffer but there may be
|
|
513 * a partial one at the end. If so, we want to leave
|
|
514 * it so the "eom" check on the next call picks it up.
|
|
515 * Use a modified Boyer-Moore matcher to make this
|
|
516 * check relatively cheap. The first "if" figures
|
|
517 * out what position in the pattern matches the last
|
|
518 * character in the buffer. The inner "while" matches
|
|
519 * the pattern against the buffer, backwards starting
|
|
520 * at that position. Note that unless the buffer
|
|
521 * ends with one of the characters in the pattern
|
|
522 * (excluding the first and last), we do only one test.
|
|
523 */
|
|
524 ep = bp + c - 1;
|
|
525 if (sp = pat_map[*ep & 0x00ff]) {
|
|
526 do {
|
|
527 cp = sp;
|
|
528 while (*--ep == *--cp)
|
|
529 ;
|
|
530 if (cp < fdelim) {
|
|
531 if (ep >= bp)
|
|
532 /*
|
|
533 * ep < bp means that all the buffer
|
|
534 * contains is a prefix of delim.
|
|
535 * If this prefix is really a delim, the
|
|
536 * m_eom call at entry should have found
|
|
537 * it. Thus it's not a delim and we can
|
|
538 * take all of it.
|
|
539 */
|
|
540 c = (ep - bp) + 2;
|
|
541 break;
|
|
542 }
|
|
543 /* try matching one less char of delim string */
|
|
544 ep = bp + c - 1;
|
|
545 } while (--sp > fdelim);
|
|
546 }
|
|
547 }
|
|
548 }
|
|
549 (void) bcopy( bp, buf, c );
|
|
550 #ifdef _STDIO_USES_IOSTREAM
|
|
551 iob->_IO_read_ptr += c;
|
|
552 #else
|
|
553 iob->_cnt -= c;
|
|
554 iob->_ptr += c;
|
|
555 #endif
|
|
556 if (bufsz < 0) {
|
|
557 msg_count = c;
|
|
558 return (state);
|
|
559 }
|
|
560 cp = buf + c;
|
|
561 break;
|
|
562
|
|
563 default:
|
|
564 adios (NULLCP, "m_getfld() called with bogus state of %d", state);
|
|
565 }
|
|
566 finish:;
|
|
567 *cp = 0;
|
|
568 msg_count = cp - buf;
|
|
569
|
|
570 #ifdef CONTENT_LENGTH
|
|
571 /* Check whether this was a Content-Length header */
|
|
572 if (msg_style == MS_UUCP && state == FLD &&
|
|
573 uleq((char*)"content-length", (char*) name)) {
|
|
574 content_length = atoi(buf);
|
|
575 /* This value is computed when end-of-headers is detected */
|
|
576 end_of_contents = -1;
|
|
577 }
|
|
578 #endif
|
|
579 return (state);
|
|
580 }
|
|
581
|
|
582 /* */
|
|
583
|
|
584 #ifdef RPATHS
|
|
585 static char unixbuf[BUFSIZ] = "";
|
|
586 #endif /* RPATHS */
|
|
587
|
|
588 void
|
|
589 m_unknown(iob)
|
|
590 register FILE *iob;
|
|
591 {
|
|
592 register int c;
|
|
593 register long pos;
|
|
594 char text[10];
|
|
595 register char *cp;
|
|
596 register char *delimstr;
|
|
597
|
|
598 msg_style = MS_UNKNOWN;
|
|
599
|
|
600 /* Figure out what the message delimitter string is for this
|
|
601 * maildrop. (This used to be part of m_Eom but I didn't like
|
|
602 * the idea of an "if" statement that could only succeed on the
|
|
603 * first call to m_Eom getting executed on each call, i.e., at
|
|
604 * every newline in the message).
|
|
605 *
|
|
606 * If the first line of the maildrop is a Unix "from" line, we say the
|
|
607 * style is UUCP and eat the rest of the line. Otherwise we say the style
|
|
608 * is MMDF & look for the delimiter string specified when MH was built
|
|
609 * (or from the mtstailor file).
|
|
610 */
|
|
611 pos = ftell (iob);
|
|
612 if (fread (text, sizeof *text, 5, iob) == 5
|
|
613 && strncmp (text, "From ", 5) == 0) {
|
|
614 msg_style = MS_UUCP;
|
|
615 delimstr = "\nFrom ";
|
|
616 #ifndef RPATHS
|
|
617 while ((c = getc (iob)) != '\n' && c >= 0)
|
|
618 ;
|
|
619 #else /* RPATHS */
|
|
620 cp = unixbuf;
|
|
621 while ((c = getc (iob)) != '\n')
|
|
622 *cp++ = c;
|
|
623 *cp = 0;
|
|
624 #endif /* RPATHS */
|
|
625 } else {
|
|
626 /* not a Unix style maildrop */
|
|
627 (void) fseek (iob, pos, 0);
|
|
628 if (mmdlm2 == NULLCP || *mmdlm2 == 0)
|
|
629 mmdlm2 = "\001\001\001\001\n";
|
|
630 delimstr = mmdlm2;
|
|
631 msg_style = MS_MMDF;
|
|
632 }
|
|
633 c = strlen (delimstr);
|
|
634 fdelim = (unsigned char *)malloc((unsigned)c + 3);
|
|
635 *fdelim++ = '\0';
|
|
636 *fdelim = '\n';
|
|
637 msg_delim = (char *)fdelim+1;
|
|
638 edelim = (unsigned char *)msg_delim+1;
|
|
639 fdelimlen = c + 1;
|
|
640 edelimlen = c - 1;
|
|
641 (void)strcpy(msg_delim, delimstr);
|
|
642 delimend = (unsigned char *)msg_delim + edelimlen;
|
|
643 if (edelimlen <= 1)
|
|
644 adios (NULLCP, "maildrop delimiter must be at least 2 bytes");
|
|
645 /*
|
|
646 * build a Boyer-Moore end-position map for the matcher in m_getfld.
|
|
647 * N.B. - we don't match just the first char (since it's the newline
|
|
648 * separator) or the last char (since the matchc would have found it
|
|
649 * if it was a real delim).
|
|
650 */
|
|
651 pat_map = (unsigned char **) calloc (256, sizeof (unsigned char *));
|
|
652
|
|
653 for (cp = (char *)fdelim + 1; cp < (char *)delimend; cp++ )
|
|
654 pat_map[*cp] = (unsigned char *)cp;
|
|
655
|
|
656 if (msg_style == MS_MMDF) {
|
|
657 /* flush extra msg hdrs */
|
|
658 while ((c = Getc(iob)) >= 0 && eom (c, iob))
|
|
659 ;
|
|
660 if (c >= 0)
|
|
661 (void) ungetc(c, iob);
|
|
662 }
|
|
663 }
|
|
664
|
|
665
|
|
666 void m_eomsbr (action)
|
|
667 int (*action) ();
|
|
668 {
|
|
669 if (eom_action = action) {
|
|
670 msg_style = MS_MSH;
|
|
671 *msg_delim = 0;
|
|
672 fdelimlen = 1;
|
|
673 delimend = fdelim;
|
|
674 } else {
|
|
675 msg_style = MS_MMDF;
|
|
676 msg_delim = (char *)fdelim + 1;
|
|
677 fdelimlen = strlen((char *)fdelim);
|
|
678 delimend = (unsigned char *)(msg_delim + edelimlen);
|
|
679 }
|
|
680 }
|
|
681
|
|
682 /* */
|
|
683
|
|
684 /* test for msg delimiter string */
|
|
685
|
|
686 int m_Eom (c, iob)
|
|
687 register int c;
|
|
688 register FILE *iob;
|
|
689 {
|
|
690 register long pos = 0L;
|
|
691 register int i;
|
|
692 char text[10];
|
|
693 #ifdef RPATHS
|
|
694 register char *cp;
|
|
695 #endif /* RPATHS */
|
|
696
|
|
697 pos = ftell (iob);
|
|
698
|
|
699 #ifdef CONTENT_LENGTH
|
|
700 if (msg_style == MS_UUCP && end_of_contents != -1) {
|
|
701 if (end_of_contents == pos) {
|
|
702 end_of_contents = -1;
|
|
703 if ((fread (text, sizeof *text, edelimlen, iob) == edelimlen)
|
|
704 && (strncmp (text, (char *)edelim, edelimlen) == 0)) {
|
|
705 #ifndef RPATHS
|
|
706 while ((c = getc (iob)) != '\n')
|
|
707 if (c < 0)
|
|
708 break;
|
|
709 #else /* RPATHS */
|
|
710 cp = unixbuf;
|
|
711 while ((c = getc (iob)) != '\n' && c >= 0)
|
|
712 *cp++ = c;
|
|
713 *cp = 0;
|
|
714 #endif /* RPATHS */
|
|
715 }
|
|
716 return 1;
|
|
717 }
|
|
718 /* we've read past the end of a message, this should never happen
|
|
719 * because of the other checks we do */
|
|
720 if (end_of_contents < pos) {
|
|
721 end_of_contents = -1;
|
|
722 adios(NULLCP,
|
|
723 "Content-Length: header broken, can't read mailbox\n");
|
|
724 }
|
|
725 return 0;
|
|
726 }
|
|
727 #endif
|
|
728
|
|
729 if ((i = fread (text, sizeof *text, edelimlen, iob)) != edelimlen
|
|
730 || strncmp (text, (char *)edelim, edelimlen)) {
|
|
731 if (i == 0 && msg_style == MS_UUCP)
|
|
732 /* the final newline in the (brain damaged) unix-format
|
|
733 * maildrop is part of the delimitter - delete it.
|
|
734 */
|
|
735 return 1;
|
|
736
|
|
737 #ifdef notdef
|
|
738 (void) fseek (iob, pos, 0);
|
|
739 #else
|
|
740 (void) fseek (iob, (long)(pos-1), 0);
|
|
741 (void) getc (iob); /* should be OK */
|
|
742 #endif /* !notdef */
|
|
743 return 0;
|
|
744 }
|
|
745
|
|
746 #ifdef CONTENT_LENGTH
|
|
747 /* There's one extra special case to be considered here:
|
|
748 * content_length > 0. That we got here is because the
|
|
749 * message body starts with "From "
|
|
750 */
|
|
751 if (msg_style == MS_UUCP && content_length > 0) {
|
|
752 (void) fseek (iob, (long)(pos-1), 0);
|
|
753 (void) getc (iob); /* should be OK */
|
|
754 return 0;
|
|
755 }
|
|
756 #endif
|
|
757
|
|
758 if (msg_style == MS_UUCP) {
|
|
759 #ifndef RPATHS
|
|
760 while ((c = getc (iob)) != '\n')
|
|
761 if (c < 0)
|
|
762 break;
|
|
763 #else /* RPATHS */
|
|
764 cp = unixbuf;
|
|
765 while ((c = getc (iob)) != '\n' && c >= 0)
|
|
766 *cp++ = c;
|
|
767 *cp = 0;
|
|
768 #endif /* RPATHS */
|
|
769 }
|
|
770
|
|
771 return 1;
|
|
772 }
|
|
773
|
|
774 /* */
|
|
775
|
|
776 #ifdef RPATHS
|
|
777 char *unixline () {
|
|
778 register char *cp,
|
|
779 *dp,
|
|
780 *pp;
|
|
781 static char unixfrom[BUFSIZ];
|
|
782 int i;
|
|
783
|
|
784 pp = unixfrom;
|
|
785 if (cp = dp = index (unixbuf, ' ')) {
|
|
786 while (cp = index (cp + 1, 'r'))
|
|
787 if (strncmp (cp, "remote from ", 12) == 0) {
|
|
788 *cp = 0;
|
|
789 (void) sprintf (pp, "%s!", cp + 12);
|
|
790 pp += strlen (pp);
|
|
791 break;
|
|
792 }
|
|
793 if (cp == NULL)
|
|
794 cp = unixbuf + strlen (unixbuf);
|
|
795 #if 0
|
|
796 if ((cp -= 25) >= dp)
|
|
797 #else
|
|
798 /* On most of BSD systems, the date field length of UNIX From line
|
|
799 is 25, but it's not suitable for other systems. We should not
|
|
800 use this length. */
|
|
801 while (cp > dp && *--cp != ':')
|
|
802 ;
|
|
803 for (i = 0; i < 4 && cp > dp; i++) {
|
|
804 while (!isspace(*--cp))
|
|
805 ;
|
|
806 while (isspace(*(cp - 1)))
|
|
807 --cp;
|
|
808 }
|
|
809 if (cp >= dp)
|
|
810 #endif
|
|
811 *cp = 0;
|
|
812 }
|
|
813
|
|
814 (void) sprintf (pp, "%s\n", unixbuf);
|
|
815 unixbuf[0] = 0;
|
|
816 return unixfrom;
|
|
817 }
|
|
818 #endif /* RPATHS */
|
|
819
|
|
820 /* */
|
|
821
|
|
822 /* matchc: find the first occurrence of string pat in string str.
|
|
823 * We can't use the C library routine strstr because the string
|
|
824 * won't have a trailing NUL. See also the note about using a
|
|
825 * Boyer-Moore search on non-Vaxen (in the only place this fn
|
|
826 * is used...)
|
|
827 */
|
|
828 #if (vax && !lint)
|
|
829 asm(".align 1");
|
|
830 asm("_matchc: .word 0");
|
|
831 asm(" movq 4(ap),r0");
|
|
832 asm(" movq 12(ap),r2");
|
|
833 asm(" matchc r0,(r1),r2,(r3)");
|
|
834 asm(" beql 1f");
|
|
835 asm(" movl 4(ap),r3");
|
|
836 asm("1: subl3 4(ap),r3,r0");
|
|
837 asm(" ret");
|
|
838 #else
|
|
/*
 * matchc - find the first occurrence of the patln bytes at pat within
 * the strln bytes at str.  Neither buffer needs a trailing NUL.
 *
 * Returns a pointer to the start of the match, or 0 if there is none,
 * if the pattern is longer than the text, or if patln is not positive.
 */
static unsigned char *
matchc( patln, pat, strln, str )
int patln;
unsigned char *pat;
int strln;
register unsigned char *str;
{
    register unsigned char *last;	/* last position a match can start */
    register unsigned char *sp;
    register unsigned char *pp;
    unsigned char *pend;
    unsigned char first;

    /* Settle the "cannot fit" cases before forming any pointer, so
     * nothing ever points in front of str. */
    if (patln <= 0 || strln < patln)
	return (unsigned char *)0;

    last = str + (strln - patln);
    pend = pat + patln;
    first = *pat++;

    for (; str <= last; str++) {
	if (*str != first)
	    continue;

	/* first byte matches; compare the rest.  It fits in the text
	 * because str <= last. */
	sp = str + 1;
	pp = pat;
	while (pp < pend && *sp == *pp) {
	    sp++;
	    pp++;
	}
	if (pp >= pend)			/* whole pattern matched */
	    return str;
    }
    return (unsigned char *)0;
}
|
|
886 #endif
|
|
887
|
|
888 /* */
|
|
889
|
|
890 /*
|
|
891 * Locate character "term" in the next "cnt" characters of "src".
|
|
892 * If found, return its address, otherwise return 0.
|
|
893 */
|
|
894 #if (vax && !lint)
|
|
895 asm(".align 1");
|
|
896 asm("_locc: .word 0");
|
|
897 asm(" movq 4(ap),r0");
|
|
898 asm(" locc 12(ap),r0,(r1)");
|
|
899 asm(" beql 1f");
|
|
900 asm(" movl r1,r0");
|
|
901 asm("1: ret");
|
|
902 #else
|
|
/*
 * locc - look for the byte term in the first cnt bytes at src.
 * Returns the address of the first occurrence, or 0 if there is none.
 * Nothing is read from src when cnt is zero or negative.
 */
static unsigned char *
locc( cnt, src, term )
register int cnt;
register unsigned char *src;
register unsigned char term;
{
    for (; cnt > 0; cnt--, src++)
	if (*src == term)
	    return src;

    return (unsigned char *)0;
}
|
|
913 #endif
|
|
914
|
|
915 /* */
|
|
916
|
|
917 #if !defined (BSD42) && !defined (bcopy)
|
|
/*
 * bcmp - compare the first length bytes of b1 and b2.
 * Returns 0 when they are all equal (or length is not positive) and 1
 * at the first difference.
 */
int
bcmp (b1, b2, length)
register char *b1,
	      *b2;
register int length;
{
    register int k;

    for (k = 0; k < length; k++)
	if (b1[k] != b2[k])
	    return 1;

    return 0;
}
|
|
929
|
|
930
|
|
/*
 * bcopy - copy length bytes from b1 to b2.
 *
 * The two regions may overlap: when the destination starts inside the
 * source the bytes are copied from the end backwards, so source bytes
 * are not overwritten before they have been read.  A length that is
 * not positive copies nothing.  Always returns 0; the return type is
 * now written out as int, where it used to be left implicit.
 */
int
bcopy (b1, b2, length)
register char *b1,
	      *b2;
register int length;
{
    if (length <= 0)
	return 0;

    if (b2 > b1 && b2 < b1 + length) {
	/* destination overlaps the tail of the source: go backwards */
	b1 += length;
	b2 += length;
	while (length-- > 0)
	    *--b2 = *--b1;
    } else {
	while (length-- > 0)
	    *b2++ = *b1++;
    }
    return 0;
}
|
|
939
|
|
940
|
|
/*
 * bzero - set the first length bytes at b to zero.
 * A length that is not positive leaves b untouched.  Always returns 0;
 * the return type is now written out as int, where it used to be left
 * implicit.
 */
int
bzero (b, length)
register char *b;
register int length;
{
    while (length-- > 0)
	*b++ = 0;

    return 0;
}
|
|
948 #endif /* not BSD42 */
|