xref: /illumos-gate/usr/src/cmd/awk/b.c (revision 3ee4fc2a)
1 /*
2  * Copyright (C) Lucent Technologies 1997
3  * All Rights Reserved
4  *
5  * Permission to use, copy, modify, and distribute this software and
6  * its documentation for any purpose and without fee is hereby
7  * granted, provided that the above copyright notice appear in all
8  * copies and that both that the copyright notice and this
9  * permission notice and warranty disclaimer appear in supporting
10  * documentation, and that the name Lucent Technologies or any of
11  * its entities not be used in advertising or publicity pertaining
12  * to distribution of the software without specific, written prior
13  * permission.
14  *
15  * LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16  * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17  * IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21  * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22  * THIS SOFTWARE.
23  */
24 
25 /*
26  * CDDL HEADER START
27  *
28  * The contents of this file are subject to the terms of the
29  * Common Development and Distribution License, Version 1.0 only
30  * (the "License").  You may not use this file except in compliance
31  * with the License.
32  *
33  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
34  * or http://www.opensolaris.org/os/licensing.
35  * See the License for the specific language governing permissions
36  * and limitations under the License.
37  *
38  * When distributing Covered Code, include this CDDL HEADER in each
39  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
40  * If applicable, add the following below this CDDL HEADER, with the
41  * fields enclosed by brackets "[]" replaced with your own identifying
42  * information: Portions Copyright [yyyy] [name of copyright owner]
43  *
44  * CDDL HEADER END
45  */
46 
47 /*
48  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
49  * Use is subject to license terms.
50  */
51 
52 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
53 /*	  All Rights Reserved  	*/
54 
55 /* lasciate ogne speranza, voi ch'intrate. */
56 
57 #define	DEBUG
58 
59 #include "awk.h"
60 #include "y.tab.h"
61 
62 #define	HAT	(NCHARS-1)	/* matches ^ in regular expr */
63 				/* NCHARS is 2**n */
64 #define	MAXLIN (3 * LINE_MAX)
65 
66 #define	type(v)		(v)->nobj	/* badly overloaded here */
67 #define	info(v)		(v)->ntype	/* badly overloaded here */
68 #define	left(v)		(v)->narg[0]
69 #define	right(v)	(v)->narg[1]
70 #define	parent(v)	(v)->nnext
71 
72 #define	LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
73 #define	ELEAF	case EMPTYRE:		/* empty string in regexp */
74 #define	UNARY	case STAR: case PLUS: case QUEST:
75 
76 /*
77  * encoding in tree Nodes:
78  *	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
79  *		left is index, right contains value or pointer to value
80  *	unary (STAR, PLUS, QUEST): left is child, right is null
81  *	binary (CAT, OR): left and right are children
82  *	parent contains pointer to parent
83  */
84 
85 int	*setvec;
86 int	*tmpset;
87 int	maxsetvec = 0;
88 
89 int	rtok;		/* next token in current re */
90 int	rlxval;
91 static uschar	*rlxstr;
92 static uschar	*prestr;	/* current position in current re */
93 static uschar	*lastre;	/* origin of last re */
94 
95 static	int setcnt;
96 static	int poscnt;
97 
98 char	*patbeg;
99 int	patlen;
100 
101 #define	NFA	20	/* cache this many dynamic fa's */
102 fa	*fatab[NFA];
103 int	nfatab	= 0;	/* entries in fatab */
104 
105 static fa	*mkdfa(const char *, int);
106 static int	makeinit(fa *, int);
107 static void	penter(Node *);
108 static void	freetr(Node *);
109 static void	overflo(const char *);
110 static void	growvec(const char *);
111 static void	cfoll(fa *, Node *);
112 static void	follow(Node *);
113 static Node	*reparse(const char *);
114 static int	relex(void);
115 static void	freefa(fa *);
116 static int	cgoto(fa *, int, int);
117 
118 fa *
makedfa(const char * s,int anchor)119 makedfa(const char *s, int anchor)	/* returns dfa for reg expr s */
120 {
121 	int i, use, nuse;
122 	fa *pfa;
123 	static int now = 1;
124 
125 	if (setvec == NULL) {	/* first time through any RE */
126 		maxsetvec = MAXLIN;
127 		setvec = (int *)malloc(maxsetvec * sizeof (int));
128 		tmpset = (int *)malloc(maxsetvec * sizeof (int));
129 		if (setvec == NULL || tmpset == NULL)
130 			overflo("out of space initializing makedfa");
131 	}
132 
133 	if (compile_time)	/* a constant for sure */
134 		return (mkdfa(s, anchor));
135 	for (i = 0; i < nfatab; i++) {	/* is it there already? */
136 		if (fatab[i]->anchor == anchor &&
137 		    strcmp((const char *)fatab[i]->restr, s) == 0) {
138 			fatab[i]->use = now++;
139 			return (fatab[i]);
140 		}
141 	}
142 	pfa = mkdfa(s, anchor);
143 	if (nfatab < NFA) {	/* room for another */
144 		fatab[nfatab] = pfa;
145 		fatab[nfatab]->use = now++;
146 		nfatab++;
147 		return (pfa);
148 	}
149 	use = fatab[0]->use;	/* replace least-recently used */
150 	nuse = 0;
151 	for (i = 1; i < nfatab; i++)
152 		if (fatab[i]->use < use) {
153 			use = fatab[i]->use;
154 			nuse = i;
155 		}
156 	freefa(fatab[nuse]);
157 	fatab[nuse] = pfa;
158 	pfa->use = now++;
159 	return (pfa);
160 }
161 
162 /*
163  * does the real work of making a dfa
164  * anchor = 1 for anchored matches, else 0
165  */
166 fa *
mkdfa(const char * s,int anchor)167 mkdfa(const char *s, int anchor)
168 {
169 	Node *p, *p1;
170 	fa *f;
171 
172 	p = reparse(s);
173 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
174 		/* put ALL STAR in front of reg.  exp. */
175 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
176 		/* put FINAL after reg.  exp. */
177 
178 	poscnt = 0;
179 	penter(p1);	/* enter parent pointers and leaf indices */
180 	if ((f = (fa *)calloc(1, sizeof (fa) + poscnt * sizeof (rrow))) == NULL)
181 		overflo("out of space for fa");
182 	/* penter has computed number of positions in re */
183 	f->accept = poscnt-1;
184 	cfoll(f, p1);	/* set up follow sets */
185 	freetr(p1);
186 	if ((f->posns[0] =
187 	    (int *)calloc(1, *(f->re[0].lfollow) * sizeof (int))) == NULL) {
188 			overflo("out of space in makedfa");
189 	}
190 	if ((f->posns[1] = (int *)calloc(1, sizeof (int))) == NULL)
191 		overflo("out of space in makedfa");
192 	*f->posns[1] = 0;
193 	f->initstat = makeinit(f, anchor);
194 	f->anchor = anchor;
195 	f->restr = (uschar *)tostring(s);
196 	return (f);
197 }
198 
199 static int
makeinit(fa * f,int anchor)200 makeinit(fa *f, int anchor)
201 {
202 	int i, k;
203 
204 	f->curstat = 2;
205 	f->out[2] = 0;
206 	f->reset = 0;
207 	k = *(f->re[0].lfollow);
208 	xfree(f->posns[2]);
209 	if ((f->posns[2] = (int *)calloc(1, (k+1) * sizeof (int))) == NULL)
210 		overflo("out of space in makeinit");
211 	for (i = 0; i <= k; i++) {
212 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
213 	}
214 	if ((f->posns[2])[1] == f->accept)
215 		f->out[2] = 1;
216 	for (i = 0; i < NCHARS; i++)
217 		f->gototab[2][i] = 0;
218 	f->curstat = cgoto(f, 2, HAT);
219 	if (anchor) {
220 		*f->posns[2] = k-1;	/* leave out position 0 */
221 		for (i = 0; i < k; i++) {
222 			(f->posns[0])[i] = (f->posns[2])[i];
223 		}
224 
225 		f->out[0] = f->out[2];
226 		if (f->curstat != 2)
227 			--(*f->posns[f->curstat]);
228 	}
229 	return (f->curstat);
230 }
231 
232 void
penter(Node * p)233 penter(Node *p)	/* set up parent pointers and leaf indices */
234 {
235 	switch (type(p)) {
236 	ELEAF
237 	LEAF
238 		info(p) = poscnt;
239 		poscnt++;
240 		break;
241 	UNARY
242 		penter(left(p));
243 		parent(left(p)) = p;
244 		break;
245 	case CAT:
246 	case OR:
247 		penter(left(p));
248 		penter(right(p));
249 		parent(left(p)) = p;
250 		parent(right(p)) = p;
251 		break;
252 	default:	/* can't happen */
253 		FATAL("can't happen: unknown type %d in penter", type(p));
254 		break;
255 	}
256 }
257 
258 static void
freetr(Node * p)259 freetr(Node *p)	/* free parse tree */
260 {
261 	switch (type(p)) {
262 	ELEAF
263 	LEAF
264 		xfree(p);
265 		break;
266 	UNARY
267 		freetr(left(p));
268 		xfree(p);
269 		break;
270 	case CAT:
271 	case OR:
272 		freetr(left(p));
273 		freetr(right(p));
274 		xfree(p);
275 		break;
276 	default:	/* can't happen */
277 		FATAL("can't happen: unknown type %d in freetr", type(p));
278 		break;
279 	}
280 }
281 
282 static void
growvec(const char * msg)283 growvec(const char *msg)
284 {
285 	maxsetvec *= 4;
286 	setvec = (int *)realloc(setvec, maxsetvec * sizeof (int));
287 	tmpset = (int *)realloc(tmpset, maxsetvec * sizeof (int));
288 	if (setvec == NULL || tmpset == NULL)
289 		overflo(msg);
290 }
291 
292 /*
293  * in the parsing of regular expressions, metacharacters like . have
294  * to be seen literally; \056 is not a metacharacter.
295  */
296 
297 /*
298  * find and eval hex string at pp, return new p; only pick up one 8-bit
299  * byte (2 chars).
300  */
301 int
hexstr(uschar ** pp)302 hexstr(uschar **pp)
303 {
304 	uschar *p;
305 	int n = 0;
306 	int i;
307 
308 	for (i = 0, p = (uschar *)*pp; i < 2 && isxdigit(*p); i++, p++) {
309 		if (isdigit(*p))
310 			n = 16 * n + *p - '0';
311 		else if (*p >= 'a' && *p <= 'f')
312 			n = 16 * n + *p - 'a' + 10;
313 		else if (*p >= 'A' && *p <= 'F')
314 			n = 16 * n + *p - 'A' + 10;
315 	}
316 	*pp = (uschar *)p;
317 	return (n);
318 }
319 
320 #define	isoctdigit(c) ((c) >= '0' && (c) <= '7')
321 
322 /* pick up next thing after a \\ and increment *pp */
323 int
quoted(uschar ** pp)324 quoted(uschar **pp)
325 {
326 	uschar *p = *pp;
327 	int c;
328 
329 	if ((c = *p++) == 't')
330 		c = '\t';
331 	else if (c == 'n')
332 		c = '\n';
333 	else if (c == 'f')
334 		c = '\f';
335 	else if (c == 'r')
336 		c = '\r';
337 	else if (c == 'b')
338 		c = '\b';
339 	else if (c == '\\')
340 		c = '\\';
341 	else if (c == 'x') {	/* hexadecimal goo follows */
342 		c = hexstr(&p);	/* this adds a null if number is invalid */
343 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
344 		int n = c - '0';
345 		if (isoctdigit(*p)) {
346 			n = 8 * n + *p++ - '0';
347 			if (isoctdigit(*p))
348 				n = 8 * n + *p++ - '0';
349 		}
350 		c = n;
351 	} /* else */
352 		/* c = c; */
353 	*pp = p;
354 	return (c);
355 }
356 
357 char *
cclenter(const char * argp)358 cclenter(const char *argp)	/* add a character class */
359 {
360 	int i, c, c2;
361 	uschar *p = (uschar *)argp;
362 	uschar *op, *bp;
363 	static uschar *buf = NULL;
364 	static size_t bufsz = 100;
365 
366 	op = p;
367 	if (buf == NULL && (buf = (uschar *)malloc(bufsz)) == NULL)
368 		FATAL("out of space for character class [%.10s...] 1", p);
369 	bp = buf;
370 	for (i = 0; (c = *p++) != 0; ) {
371 		if (c == '\\') {
372 			c = quoted(&p);
373 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
374 			if (*p != 0) {
375 				c = bp[-1];
376 				c2 = *p++;
377 				if (c2 == '\\')
378 					c2 = quoted(&p);
379 				if (c > c2) {	/* empty; ignore */
380 					bp--;
381 					i--;
382 					continue;
383 				}
384 				while (c < c2) {
385 					if (!adjbuf((char **)&buf, &bufsz,
386 					    bp-buf+2, 100, (char **)&bp,
387 					    "cclenter1")) {
388 						FATAL(
389 			"out of space for character class [%.10s...] 2", p);
390 					}
391 					*bp++ = ++c;
392 					i++;
393 				}
394 				continue;
395 			}
396 		}
397 		if (!adjbuf((char **)&buf, &bufsz, bp-buf+2, 100, (char **)&bp,
398 		    "cclenter2"))
399 			FATAL(
400 			    "out of space for character class [%.10s...] 3", p);
401 		*bp++ = c;
402 		i++;
403 	}
404 	*bp = '\0';
405 	dprintf(("cclenter: in = |%s|, out = |%s|\n", op, buf));
406 	xfree(op);
407 	return ((char *)tostring((char *)buf));
408 }
409 
410 static void
overflo(const char * s)411 overflo(const char *s)
412 {
413 	FATAL("regular expression too big: %.30s...", gettext((char *)s));
414 }
415 
416 /* enter follow set of each leaf of vertex v into lfollow[leaf] */
417 static void
cfoll(fa * f,Node * v)418 cfoll(fa *f, Node *v)
419 {
420 	int i;
421 	int *p;
422 
423 	switch (type(v)) {
424 	ELEAF
425 	LEAF
426 		f->re[info(v)].ltype = type(v);
427 		f->re[info(v)].lval.np = right(v);
428 		while (f->accept >= maxsetvec) {	/* guessing here! */
429 			growvec("out of space in cfoll()");
430 		}
431 		for (i = 0; i <= f->accept; i++)
432 			setvec[i] = 0;
433 		setcnt = 0;
434 		follow(v);	/* computes setvec and setcnt */
435 		if ((p = (int *)calloc(1, (setcnt+1) * sizeof (int))) == NULL)
436 			overflo("out of space building follow set");
437 		f->re[info(v)].lfollow = p;
438 		*p = setcnt;
439 		for (i = f->accept; i >= 0; i--) {
440 			if (setvec[i] == 1)
441 				*++p = i;
442 		}
443 		break;
444 	UNARY
445 		cfoll(f, left(v));
446 		break;
447 	case CAT:
448 	case OR:
449 		cfoll(f, left(v));
450 		cfoll(f, right(v));
451 		break;
452 	default:	/* can't happen */
453 		FATAL("can't happen: unknown type %d in cfoll", type(v));
454 	}
455 }
456 
457 /*
458  * collects initially active leaves of p into setvec
459  * returns 0 or 1 depending on whether p matches empty string
460  */
461 static int
first(Node * p)462 first(Node *p)
463 {
464 	int b, lp;
465 
466 	switch (type(p)) {
467 	ELEAF
468 	LEAF
469 		lp = info(p);	/* look for high-water mark of subscripts */
470 		while (setcnt >= maxsetvec || lp >= maxsetvec) {
471 			/* guessing here! */
472 			growvec("out of space in first()");
473 		}
474 		if (type(p) == EMPTYRE) {
475 			setvec[lp] = 0;
476 			return (0);
477 		}
478 		if (setvec[lp] != 1) {
479 			setvec[lp] = 1;
480 			setcnt++;
481 		}
482 		if (type(p) == CCL && (*(char *)right(p)) == '\0')
483 			return (0);		/* empty CCL */
484 		else
485 			return (1);
486 	case PLUS:
487 		if (first(left(p)) == 0)
488 			return (0);
489 		return (1);
490 	case STAR:
491 	case QUEST:
492 		(void) first(left(p));
493 		return (0);
494 	case CAT:
495 		if (first(left(p)) == 0 && first(right(p)) == 0)
496 			return (0);
497 		return (1);
498 	case OR:
499 		b = first(right(p));
500 		if (first(left(p)) == 0 || b == 0)
501 			return (0);
502 		return (1);
503 	}
504 	FATAL("can't happen: unknown type %d in first", type(p));
505 }
506 
507 /* collects leaves that can follow v into setvec */
508 static void
follow(Node * v)509 follow(Node *v)
510 {
511 	Node *p;
512 
513 	if (type(v) == FINAL)
514 		return;
515 	p = parent(v);
516 	switch (type(p)) {
517 	case STAR:
518 	case PLUS:
519 		(void) first(v);
520 		follow(p);
521 		return;
522 
523 	case OR:
524 	case QUEST:
525 		follow(p);
526 		return;
527 
528 	case CAT:
529 		if (v == left(p)) {	/* v is left child of p */
530 			if (first(right(p)) == 0) {
531 				follow(p);
532 				return;
533 			}
534 		} else		/* v is right child */
535 			follow(p);
536 		return;
537 	default:
538 		FATAL("unknown type %d in follow", type(p));
539 		break;
540 	}
541 }
542 
543 static int
member(int c,const char * sarg)544 member(int c, const char *sarg)	/* is c in s? */
545 {
546 	uschar *s = (uschar *)sarg;
547 
548 	while (*s)
549 		if (c == *s++)
550 			return (1);
551 	return (0);
552 }
553 
554 
555 int
match(fa * f,const char * p0)556 match(fa *f, const char *p0)	/* shortest match ? */
557 {
558 	int s, ns;
559 	uschar *p = (uschar *)p0;
560 
561 	s = f->reset ? makeinit(f, 0) : f->initstat;
562 	if (f->out[s])
563 		return (1);
564 	do {
565 		if ((ns = f->gototab[s][*p]) != 0)
566 			s = ns;
567 		else
568 			s = cgoto(f, s, *p);
569 		if (f->out[s])
570 			return (1);
571 	} while (*p++ != 0);
572 	return (0);
573 }
574 
575 int
pmatch(fa * f,const char * p0)576 pmatch(fa *f, const char *p0)	/* longest match, for sub */
577 {
578 	int s, ns;
579 	uschar *p = (uschar *)p0;
580 	uschar *q;
581 	int i, k;
582 
583 	if (f->reset) {
584 		f->initstat = s = makeinit(f, 1);
585 	} else {
586 		s = f->initstat;
587 	}
588 	patbeg = (char *)p;
589 	patlen = -1;
590 	do {
591 		q = p;
592 		do {
593 			if (f->out[s])		/* final state */
594 				patlen = q-p;
595 			if ((ns = f->gototab[s][*q]) != 0)
596 				s = ns;
597 			else
598 				s = cgoto(f, s, *q);
599 			if (s == 1) {	/* no transition */
600 				if (patlen >= 0) {
601 					patbeg = (char *)p;
602 					return (1);
603 				} else {
604 					goto nextin;	/* no match */
605 				}
606 			}
607 		} while (*q++ != 0);
608 		if (f->out[s])
609 			patlen = q - p - 1;	/* don't count $ */
610 		if (patlen >= 0) {
611 			patbeg = (char *)p;
612 			return (1);
613 		}
614 	nextin:
615 		s = 2;
616 		if (f->reset) {
617 			for (i = 2; i <= f->curstat; i++)
618 				xfree(f->posns[i]);
619 			k = *f->posns[0];
620 			if ((f->posns[2] =
621 			    (int *)calloc(k + 1, sizeof (int))) == NULL) {
622 				overflo("out of space in pmatch");
623 			}
624 			for (i = 0; i <= k; i++)
625 				(f->posns[2])[i] = (f->posns[0])[i];
626 			f->initstat = f->curstat = 2;
627 			f->out[2] = f->out[0];
628 			for (i = 0; i < NCHARS; i++)
629 				f->gototab[2][i] = 0;
630 		}
631 	} while (*p++ != 0);
632 	return (0);
633 }
634 
635 int
nematch(fa * f,const char * p0)636 nematch(fa *f, const char *p0)	/* non-empty match, for sub */
637 {
638 	int s, ns;
639 	uschar *p = (uschar *)p0;
640 	uschar *q;
641 	int i, k;
642 
643 	if (f->reset) {
644 		f->initstat = s = makeinit(f, 1);
645 	} else {
646 		s = f->initstat;
647 	}
648 	patlen = -1;
649 	while (*p) {
650 		q = p;
651 		do {
652 			if (f->out[s])		/* final state */
653 				patlen = q-p;
654 			if ((ns = f->gototab[s][*q]) != 0)
655 				s = ns;
656 			else
657 				s = cgoto(f, s, *q);
658 			if (s == 1) {	/* no transition */
659 				if (patlen > 0) {
660 					patbeg = (char *)p;
661 					return (1);
662 				} else
663 					goto nnextin;	/* no nonempty match */
664 			}
665 		} while (*q++ != 0);
666 		if (f->out[s])
667 			patlen = q - p - 1;	/* don't count $ */
668 		if (patlen > 0) {
669 			patbeg = (char *)p;
670 			return (1);
671 		}
672 	nnextin:
673 		s = 2;
674 		if (f->reset) {
675 			for (i = 2; i <= f->curstat; i++)
676 				xfree(f->posns[i]);
677 			k = *f->posns[0];
678 			if ((f->posns[2] =
679 			    (int *)calloc(k + 1, sizeof (int))) == NULL) {
680 				overflo("out of state space");
681 			}
682 			for (i = 0; i <= k; i++)
683 				(f->posns[2])[i] = (f->posns[0])[i];
684 			f->initstat = f->curstat = 2;
685 			f->out[2] = f->out[0];
686 			for (i = 0; i < NCHARS; i++)
687 				f->gototab[2][i] = 0;
688 		}
689 		p++;
690 	}
691 	return (0);
692 }
693 
694 static Node *regexp(void), *primary(void), *concat(Node *);
695 static Node *alt(Node *), *unary(Node *);
696 
697 /* parses regular expression pointed to by p */
698 /* uses relex() to scan regular expression */
699 static Node *
reparse(const char * p)700 reparse(const char *p)
701 {
702 	Node *np;
703 
704 	dprintf(("reparse <%s>\n", p));
705 
706 	/* prestr points to string to be parsed */
707 	lastre = prestr = (uschar *)p;
708 	rtok = relex();
709 	/* GNU compatibility: an empty regexp matches anything */
710 	if (rtok == '\0') {
711 		return (op2(EMPTYRE, NIL, NIL));
712 	}
713 	np = regexp();
714 	if (rtok != '\0')
715 		FATAL("syntax error in regular expression %s at %s",
716 		    lastre, prestr);
717 	return (np);
718 }
719 
720 static Node *
regexp(void)721 regexp(void)	/* top-level parse of reg expr */
722 {
723 	return (alt(concat(primary())));
724 }
725 
726 static Node *
primary(void)727 primary(void)
728 {
729 	Node *np;
730 
731 	switch (rtok) {
732 	case CHAR:
733 		np = op2(CHAR, NIL, itonp(rlxval));
734 		rtok = relex();
735 		return (unary(np));
736 	case ALL:
737 		rtok = relex();
738 		return (unary(op2(ALL, NIL, NIL)));
739 	case EMPTYRE:
740 		rtok = relex();
741 		return (unary(op2(ALL, NIL, NIL)));
742 	case DOT:
743 		rtok = relex();
744 		return (unary(op2(DOT, NIL, NIL)));
745 	case CCL:
746 		/*LINTED align*/
747 		np = op2(CCL, NIL, (Node *)cclenter((char *)rlxstr));
748 		rtok = relex();
749 		return (unary(np));
750 	case NCCL:
751 		/*LINTED align*/
752 		np = op2(NCCL, NIL, (Node *)cclenter((char *)rlxstr));
753 		rtok = relex();
754 		return (unary(np));
755 	case '^':
756 		rtok = relex();
757 		return (unary(op2(CHAR, NIL, itonp(HAT))));
758 	case '$':
759 		rtok = relex();
760 		return (unary(op2(CHAR, NIL, NIL)));
761 	case '(':
762 		rtok = relex();
763 		if (rtok == ')') {	/* special pleading for () */
764 			rtok = relex();
765 			return (unary(op2(CCL, NIL,
766 			    /*LINTED align*/
767 			    (Node *)tostring(""))));
768 		}
769 		np = regexp();
770 		if (rtok == ')') {
771 			rtok = relex();
772 			return (unary(np));
773 		} else {
774 			FATAL("syntax error in regular expression %s at %s",
775 			    lastre, prestr);
776 		}
777 		/* FALLTHROUGH */
778 	default:
779 		FATAL("illegal primary in regular expression %s at %s",
780 		    lastre, prestr);
781 	}
782 	/*NOTREACHED*/
783 	return (NULL);
784 }
785 
786 static Node *
concat(Node * np)787 concat(Node *np)
788 {
789 	switch (rtok) {
790 	case EMPTYRE:
791 	case CHAR:
792 	case DOT:
793 	case ALL:
794 	case CCL:
795 	case NCCL:
796 	case '$':
797 	case '(':
798 		return (concat(op2(CAT, np, primary())));
799 	default:
800 		return (np);
801 	}
802 }
803 
804 static Node *
alt(Node * np)805 alt(Node *np)
806 {
807 	if (rtok == OR) {
808 		rtok = relex();
809 		return (alt(op2(OR, np, concat(primary()))));
810 	}
811 	return (np);
812 }
813 
814 static Node *
unary(Node * np)815 unary(Node *np)
816 {
817 	switch (rtok) {
818 	case STAR:
819 		rtok = relex();
820 		return (unary(op2(STAR, np, NIL)));
821 	case PLUS:
822 		rtok = relex();
823 		return (unary(op2(PLUS, np, NIL)));
824 	case QUEST:
825 		rtok = relex();
826 		return (unary(op2(QUEST, np, NIL)));
827 	default:
828 		return (np);
829 	}
830 }
831 
832 /*
833  * Character class definitions conformant to the POSIX locale as
834  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
835  * and operating character sets are both ASCII (ISO646) or supersets
836  * thereof.
837  *
838  * Note that to avoid overflowing the temporary buffer used in
839  * relex(), the expanded character class (prior to range expansion)
840  * must be less than twice the size of their full name.
841  */
842 
843 struct charclass {
844 	const char *cc_name;
845 	int cc_namelen;
846 	int (*cc_func)(int);
847 } charclasses[] = {
848 	{ "alnum",	5,	isalnum },
849 	{ "alpha",	5,	isalpha },
850 	{ "blank",	5,	isblank },
851 	{ "cntrl",	5,	iscntrl },
852 	{ "digit",	5,	isdigit },
853 	{ "graph",	5,	isgraph },
854 	{ "lower",	5,	islower },
855 	{ "print",	5,	isprint },
856 	{ "punct",	5,	ispunct },
857 	{ "space",	5,	isspace },
858 	{ "upper",	5,	isupper },
859 	{ "xdigit",	6,	isxdigit },
860 	{ NULL,		0,	NULL },
861 };
862 
863 
864 static int
relex(void)865 relex(void)		/* lexical analyzer for reparse */
866 {
867 	int c, n;
868 	int cflag;
869 	static uschar *buf = 0;
870 	static size_t bufsz = 100;
871 	uschar *bp;
872 	struct charclass *cc;
873 	int i;
874 
875 	switch (c = *prestr++) {
876 	case '|': return OR;
877 	case '*': return STAR;
878 	case '+': return PLUS;
879 	case '?': return QUEST;
880 	case '.': return DOT;
881 	case '\0': prestr--; return '\0';
882 	case '^':
883 	case '$':
884 	case '(':
885 	case ')':
886 		return (c);
887 	case '\\':
888 		rlxval = quoted(&prestr);
889 		return (CHAR);
890 	default:
891 		rlxval = c;
892 		return (CHAR);
893 	case '[':
894 		if (buf == NULL && (buf = (uschar *)malloc(bufsz)) == NULL)
895 			FATAL("out of space in reg expr %.10s..", lastre);
896 		bp = buf;
897 		if (*prestr == '^') {
898 			cflag = 1;
899 			prestr++;
900 		} else
901 			cflag = 0;
902 		n = 2 * strlen((const char *)prestr) + 1;
903 		if (!adjbuf((char **)&buf, &bufsz, n, n, (char **)&bp,
904 		    "relex1"))
905 			FATAL("out of space for reg expr %.10s...", lastre);
906 		for (;;) {
907 			if ((c = *prestr++) == '\\') {
908 				*bp++ = '\\';
909 				if ((c = *prestr++) == '\0') {
910 					FATAL("nonterminated character class "
911 					    "%.20s...", lastre);
912 				}
913 				*bp++ = c;
914 			} else if (c == '[' && *prestr == ':') {
915 				/*
916 				 * Handle POSIX character class names.
917 				 * Dag-Erling Smorgrav, des@ofug.org
918 				 */
919 				for (cc = charclasses; cc->cc_name; cc++)
920 					if (strncmp((const char *)prestr + 1,
921 					    (const char *)cc->cc_name,
922 					    cc->cc_namelen) == 0)
923 						break;
924 
925 				if (cc->cc_name == NULL ||
926 				    prestr[1 + cc->cc_namelen] != ':' ||
927 				    prestr[2 + cc->cc_namelen] != ']') {
928 					*bp++ = c;
929 					continue;
930 				}
931 
932 				prestr += cc->cc_namelen + 3;
933 				/*
934 				 * BUG: We begin at 1, instead of 0, since we
935 				 * would otherwise prematurely terminate the
936 				 * string for classes like [[:cntrl:]]. This
937 				 * means that we can't match the NUL character,
938 				 * not without first adapting the entire
939 				 * program to track each string's length.
940 				 */
941 				for (i = 1; i < NCHARS; i++) {
942 					(void) adjbuf((char **)&buf, &bufsz,
943 					    bp - buf + 1, 100, (char **)&bp,
944 					    "relex2");
945 					if (cc->cc_func(i)) {
946 						*bp++ = i;
947 						n++;
948 					}
949 				}
950 			} else if (c == '\0') {
951 				FATAL("nonterminated character class %.20s",
952 				    lastre);
953 			} else if (bp == buf) {	/* 1st char is special */
954 				*bp++ = c;
955 			} else if (c == ']') {
956 				*bp++ = '\0';
957 				rlxstr = (uschar *)tostring((char *)buf);
958 				if (cflag == 0)
959 					return (CCL);
960 				else
961 					return (NCCL);
962 			} else
963 				*bp++ = c;
964 		}
965 		/*NOTREACHED*/
966 	}
967 	return (0);
968 }
969 
970 static int
cgoto(fa * f,int s,int c)971 cgoto(fa *f, int s, int c)
972 {
973 	int i, j, k;
974 	int *p, *q;
975 
976 	assert(c == HAT || c < NCHARS);
977 	while (f->accept >= maxsetvec) {	/* guessing here! */
978 		growvec("out of space in cgoto()");
979 	}
980 	for (i = 0; i <= f->accept; i++)
981 		setvec[i] = 0;
982 	setcnt = 0;
983 	/* compute positions of gototab[s,c] into setvec */
984 	p = f->posns[s];
985 	for (i = 1; i <= *p; i++) {
986 		if ((k = f->re[p[i]].ltype) != FINAL) {
987 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) ||
988 			    (k == DOT && c != 0 && c != HAT) ||
989 			    (k == ALL && c != 0) ||
990 			    (k == EMPTYRE && c != 0) ||
991 			    (k == CCL &&
992 			    member(c, (char *)f->re[p[i]].lval.up)) ||
993 			    (k == NCCL &&
994 			    !member(c, (char *)f->re[p[i]].lval.up) &&
995 			    c != 0 && c != HAT)) {
996 				q = f->re[p[i]].lfollow;
997 				for (j = 1; j <= *q; j++) {
998 					if (q[j] >= maxsetvec) {
999 						growvec("cgoto overflow");
1000 					}
1001 					if (setvec[q[j]] == 0) {
1002 						setcnt++;
1003 						setvec[q[j]] = 1;
1004 					}
1005 				}
1006 			}
1007 		}
1008 	}
1009 	/* determine if setvec is a previous state */
1010 	tmpset[0] = setcnt;
1011 	j = 1;
1012 	for (i = f->accept; i >= 0; i--)
1013 		if (setvec[i]) {
1014 			tmpset[j++] = i;
1015 		}
1016 	/* tmpset == previous state? */
1017 	for (i = 1; i <= f->curstat; i++) {
1018 		p = f->posns[i];
1019 		if ((k = tmpset[0]) != p[0])
1020 			goto different;
1021 		for (j = 1; j <= k; j++)
1022 			if (tmpset[j] != p[j])
1023 				goto different;
1024 		/* setvec is state i */
1025 		f->gototab[s][c] = i;
1026 		return (i);
1027 	different:;
1028 	}
1029 
1030 	/* add tmpset to current set of states */
1031 	if (f->curstat >= NSTATES-1) {
1032 		f->curstat = 2;
1033 		f->reset = 1;
1034 		for (i = 2; i < NSTATES; i++)
1035 			xfree(f->posns[i]);
1036 	} else
1037 		++(f->curstat);
1038 	for (i = 0; i < NCHARS; i++)
1039 		f->gototab[f->curstat][i] = 0;
1040 	xfree(f->posns[f->curstat]);
1041 	if ((p = (int *)calloc(1, (setcnt + 1) * sizeof (int))) == NULL)
1042 		overflo("out of space in cgoto");
1043 
1044 	f->posns[f->curstat] = p;
1045 	f->gototab[s][c] = f->curstat;
1046 	for (i = 0; i <= setcnt; i++)
1047 		p[i] = tmpset[i];
1048 	if (setvec[f->accept])
1049 		f->out[f->curstat] = 1;
1050 	else
1051 		f->out[f->curstat] = 0;
1052 	return (f->curstat);
1053 }
1054 
1055 static void
freefa(fa * f)1056 freefa(fa *f)	/* free a finite automaton */
1057 {
1058 	int i;
1059 
1060 	if (f == NULL)
1061 		return;
1062 	for (i = 0; i <= f->curstat; i++)
1063 		xfree(f->posns[i]);
1064 	for (i = 0; i <= f->accept; i++) {
1065 		xfree(f->re[i].lfollow);
1066 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1067 			xfree((f->re[i].lval.np));
1068 	}
1069 	xfree(f->restr);
1070 	xfree(f);
1071 }
1072