xref: /illumos-gate/usr/src/tools/cscope-fast/cgrep.c (revision dc1259b6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*	Copyright (c) 1990 AT&T	*/
23 /*	  All Rights Reserved	*/
24 
25 /*
26  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 /*
31  *	cscope - interactive C symbol or text cross-reference
32  *
33  *	text searching functions
34  */
35 
36 #include <fcntl.h>
37 #include <setjmp.h>
38 #include <stdio.h>
39 #include <string.h>
40 #include <memory.h>
41 #include <ctype.h>
42 #include <unistd.h>
43 #include "library.h"
44 
45 typedef enum {
46 	NO = 0,
47 	YES = 1
48 } BOOL;
49 
50 typedef struct re_bm {
51 	int delta0[256];
52 	int *delta2;
53 	uchar_t *cmap;
54 	uchar_t *bmpat;
55 	int patlen;
56 } re_bm;
57 
58 typedef struct Link {
59 	uchar_t lit;
60 	struct Node *node;
61 	struct Link *next;
62 } Link;
63 
64 typedef struct Node {
65 	short out;
66 	short d;
67 	short shift1;
68 	short shift2;
69 	long id;
70 	struct Node **tab;
71 	Link *alts;
72 	struct Node *fail;
73 } Node;
74 
75 typedef struct re_cw {
76 	int maxdepth, mindepth;
77 	long nodeid;
78 	int step[256];
79 	uchar_t *cmap;
80 	struct Node *root;
81 } re_cw;
82 
83 typedef enum {
84 	/* lit expression types */
85 	Literal, Dot, Charclass, EOP,
86 
87 	/* non-lit expression types */
88 	Cat, Alternate, Star, Plus, Quest,
89 
90 	/* not really expression types, just helping */
91 	Lpar, Rpar, Backslash
92 
93 } Exprtype;
94 
95 typedef int ID;
96 
97 typedef struct Expr {
98 	Exprtype type;	/* Type of expression (in grammar) */
99 	ID id;		/* unique ID of lit expression  */
100 	int lit;	/* Literal character or tag */
101 	int flen;	/* Number of following lit expressions */
102 	ID *follow;	/* Array of IDs of following lit expressions */
103 	struct Expr *l; /* pointer to Left child (or ccl count) */
104 	struct Expr *r; /* pointer to Right child (or ccl mask) */
105 	struct Expr *parent; /* pointer to Parent */
106 } Expr;
107 
108 typedef struct State {
109 	struct State *tab[256];
110 	int cnt; /* number of matched chars on full match, -1 otherwise  */
111 	int npos;	/* number of IDs in position set for this state. */
112 	int pos;	/* index into posbase for beginning of IDs */
113 } State;
114 
115 typedef struct FID {
116 	ID	id;	/* Lit Expression id */
117 	int	fcount; /* Number of Lit exp matches before this one. */
118 } FID;
119 
120 typedef struct Positionset {
121 	int count;	/* Number of lit exps in position set */
122 	ID last;	/* ID of last lit exp in position set */
123 	FID *base;	/* array of MAXID FIDS */
124 			/* 0 means not in position set */
125 			/* -1 means first in position set */
126 			/* n (>0) is ID of prev member of position set. */
127 } Positionset;
128 
129 typedef struct re_re {
130 	FID  *posbase;	/* Array of IDs from all states */
131 	int nposalloc;	/* Allocated size of posbase */
132 	int posnext;	/* Index into free space in posbase */
133 	int posreset;	/* Index into end of IDs for initial state in posbase */
134 	int maxid;	/* Number of (also maximum ID of) lit expressions */
135 	Expr *root;	/* Pointer to root (EOP) expression */
136 	Expr **ptr;	/* Pointer to array of ptrs to lit expressions. */
137 	uchar_t *cmap;	/* Character mapping array */
138 	Positionset firstpos;
139 	Positionset tmp;
140 	int nstates;	/* Number of current states defined */
141 	int statelim;	/* Limit on number of states before flushing */
142 	State *states;	/* Array of states */
143 	State istate;	/* Initial state */
144 } re_re;
145 
146 typedef struct {
147 	uchar_t *cmap;
148 	re_re *re_ptr;
149 	re_bm *bm_ptr;
150 	re_cw *cw_ptr;
151 	BOOL fullmatch;
152 	BOOL (*procfn)();
153 	BOOL (*succfn)();
154 	uchar_t *loc1;
155 	uchar_t *loc2;
156 	uchar_t *expression;
157 } PATTERN;
158 
159 typedef enum {
160 	BEGIN,		/* File is not yet in buffer at all */
161 	MORE,		/* File is partly in buffer */
162 	NO_MORE		/* File has been completely read into buffer */
163 } FILE_STAT;
164 
165 typedef struct {
166 	uchar_t	*prntbuf; /* current line of input from data file */
167 	uchar_t	*newline; /* end of line (real or sentinel \n) */
168 	long	ln;	/* line number */
169 } LINE;
170 
171 #define	NL '\n'
172 
173 #define	CCL_SIZ		32
174 #define	CCL_SET(a, c)	((a)[(c) >> 3] |= bittab[(c) & 07])
175 #define	CCL_CLR(a, c)	((a)[(c) >> 3] &= ~bittab[(c) & 07])
176 #define	CCL_CHK(a, c)	((a)[(c) >> 3] & bittab[(c) & 07])
177 
178 #define	ESIZE (BUFSIZ)
179 #define	MAXBUFSIZE (64*BUFSIZ)
180 
181 #define	MAXMALLOCS	1024
182 #define	MAXLIT	256	/* is plenty big enough */
183 #define	LARGE	MAXBUFSIZE+ESIZE+2
184 
185 #define	CLEAR(r, rps)	(void) memset((char *)(rps)->base, 0, \
186 			    (int)((r)->maxid * sizeof (FID))), \
187 			    (rps)->count = 0, (rps)->last = -1
188 #define	SET(rps, n, cnt) { \
189 	if ((rps)->base[n].id == 0) {\
190 		(rps)->count++;\
191 		(rps)->base[n].fcount = (cnt);\
192 		(rps)->base[n].id = (rps)->last;\
193 		(rps)->last = (n);\
194 	} else if ((cnt) > (rps)->base[n].fcount) {\
195 		(rps)->base[n].fcount = (cnt);\
196 	}}
197 
198 #define	START	{ _p = tmp; }
199 #define	ADDL(c)	{ if (_p >= &tmp[MAXLIT]) _p--; *_p++ = c; }
200 #define	FINISH	{ ADDL(0) if ((_p-tmp) > bestlen) \
201 		    (void) memmove(best, tmp, bestlen = _p-tmp); }
202 
203 
204 #define	NEW(N)	(froot?(t = froot, froot = froot->next, t->next = NULL, \
205 		    t->node = N, t): newlink(0, N))
206 #define	ADD(N)	if (qtail) qtail = qtail->next = NEW(N); \
207 			else qtail = qhead = NEW(N)
208 #define	DEL()	{ Link *_l = qhead; if ((qhead = qhead->next) == NULL) \
209 			{ qtail = NULL; } _l->next = froot; froot = _l; }
210 
211 static uchar_t	*buffer;
212 static uchar_t	*bufend;
213 static FILE	*output;
214 static char	*format;
215 static char	*file;
216 static int	file_desc;
217 static FILE_STAT file_stat;
218 static PATTERN	match_pattern;
219 static uchar_t	char_map[2][256];
220 static int	iflag;
221 static Exprtype	toktype;
222 static int	toklit, parno, maxid;
223 static uchar_t	tmp[MAXLIT], best[MAXLIT];
224 static uchar_t	*_p;
225 static int	bestlen;
226 static Node	*next_node;
227 static Link	*froot, *next_link;
228 static jmp_buf	env;
229 
230 static int nmalloc;
231 static uchar_t	*mallocs[MAXMALLOCS];
232 
233 static uchar_t	bittab[] = { 1, 2, 4, 8, 16, 32, 64, 128 };
234 
235 #ifdef	DEBUG
236 #define		TRACE(n)	(n < debug)
237 #define		EPRINTSIZE	50000
238 static void spr(int c, int *p, uchar_t *buf);
239 void epr(Expr *e, uchar_t *res);
240 static int debug = 12;
241 #endif
242 
243 static void init_file(LINE *cur_ptr);
244 static void get_line(LINE *cur_ptr, uchar_t *s);
245 static void get_ncount(LINE *cur_ptr, uchar_t *s);
246 static int execute(void);
247 static State *startstate(re_re *r);
248 static State *stateof(re_re *r, Positionset *ps);
249 static State *nextstate(re_re *r, State *s, int a);
250 static State *addstate(re_re *r, Positionset *ps, int cnt);
251 static BOOL match(Expr *e, int a);
252 static BOOL first_lit(Positionset *fpos, Expr *e);
253 static void eptr(re_re *r, Expr *e);
254 static void efollow(re_re *r, Positionset *fpos, Expr *e);
255 static void follow(Positionset *fpos, Expr *e);
256 static void followstate(re_re *r, State *s, int a, Positionset *fpos);
257 static Expr *eall(re_re *r, PATTERN *pat);
258 static Expr *d0(re_re *r, PATTERN *pat);
259 static Expr *d1(re_re *r, PATTERN *pat);
260 static Expr *d2(re_re *r, PATTERN *pat);
261 static Expr *d3(re_re *r, PATTERN *pat);
262 static Expr *newexpr(Exprtype t, int lit, Expr *left, Expr *right);
263 static void lex(re_re *r, PATTERN *pat);
264 static int re_lit(PATTERN *pat, uchar_t **b, uchar_t **e);
265 static void traverse(PATTERN *pat, Expr *e);
266 static int ccl(PATTERN *pat, uchar_t *tab);
267 static BOOL altlist(), word();
268 static BOOL altlist(Expr *e, uchar_t *buf, re_cw *pat);
269 static Node *newnode(re_cw *c, int d);
270 static Link *newlink(uchar_t lit, Node *n);
271 static void fail(Node *root);
272 static void zeroroot(Node *root, Node *n);
273 static void shift(re_cw *c);
274 static void shifttab(Node *n);
275 static void shiftprop(re_cw *c, Node *n);
276 static void delta_2(re_bm *b);
277 static int getstate(re_re *r, Positionset *ps);
278 static void savestate(re_re *r);
279 static void stateinit(re_re *r);
280 static re_bm *re_bmcomp(uchar_t *pb, uchar_t *pe, uchar_t *cmap);
281 static re_cw *re_cwinit(uchar_t *cmap);
282 static re_cw *re_recw(re_re *r, uchar_t *map);
283 static re_re *egprep(PATTERN *pat);
284 static void re_cwadd(re_cw *c, uchar_t *s, uchar_t *e);
285 static void re_cwcomp(re_cw *c);
286 static void eginit(re_re *r);
287 static BOOL re_bmexec(PATTERN *pat, uchar_t *s, uchar_t *e, uchar_t **mb,
288     uchar_t **me);
289 static BOOL re_cwexec(PATTERN *pat, uchar_t *rs, uchar_t *re, uchar_t **mb,
290     uchar_t **me);
291 static BOOL re_reexec(PATTERN *pat, uchar_t *b, uchar_t *e, uchar_t **mb,
292     uchar_t **me);
293 static uchar_t *egmalloc(size_t n);
294 static void fgetfile(LINE *cur_ptr);
295 static void dogre(PATTERN *pat);
296 static BOOL pattern_match(PATTERN *pat, LINE *lptr);
297 static BOOL fixloc(uchar_t **mb, uchar_t **me);
298 static BOOL grepmatch(PATTERN *pat, uchar_t **mb, uchar_t **me);
299 static void err(char *s);
300 
301 static char *message;
302 
303 void
egrepcaseless(int i)304 egrepcaseless(int i)
305 {
306 	iflag = i;	/* simulate "egrep -i" */
307 }
308 
309 char *
egrepinit(char * expression)310 egrepinit(char *expression)
311 {
312 	static int firsttime = 1;
313 	int i;
314 
315 	if (firsttime) {
316 		firsttime = 0;
317 		for (i = 0; i < 256; i++) {
318 			char_map[0][i] = (uchar_t)i;
319 			char_map[1][i] = tolower(i);
320 		}
321 	}
322 	for (i = 0; i < nmalloc; i ++)
323 		free(mallocs[i]);
324 	nmalloc = 0;
325 	message = NULL;
326 
327 	match_pattern.expression = (uchar_t *)expression;
328 	match_pattern.cmap = char_map[iflag];
329 	if (setjmp(env) == 0) {
330 		dogre(&match_pattern);
331 #ifdef	DEBUG
332 		{
333 		PATTERN *p = match_pattern;
334 		if (p->procfn == re_bmexec)
335 			if (!p->fullmatch)
336 				if (p->succfn == re_reexec)
337 					printf("PARTIAL BOYER_MOORE\n");
338 				else
339 					printf("PARTIAL B_M with GREP\n");
340 			else
341 				printf("FULL BOYER_MOORE\n");
342 		else if (p->procfn == re_cwexec)
343 			printf("C_W\n");
344 		else
345 			printf("GENERAL\n");
346 		}
347 #endif
348 	}
349 	return (message);
350 }
351 
352 static void
dogre(PATTERN * pat)353 dogre(PATTERN *pat)
354 {
355 	uchar_t *lb, *le;
356 
357 #ifdef	DEBUG
358 	printf("PATTERN %s\n", pat->expression);
359 #endif
360 	pat->re_ptr = egprep(pat);
361 	bestlen = re_lit(pat, &lb, &le);
362 
363 	if (bestlen && pat->fullmatch) { /* Full Boyer Moore */
364 #ifdef	DEBUG
365 		printf("BESTLEN %d\n", bestlen);
366 		{
367 			uchar_t *p;
368 			for (p = lb; p < le; p++) printf("%c", *p);
369 			printf("\n");
370 		}
371 #endif
372 		pat->bm_ptr = re_bmcomp(lb, le, pat->cmap);
373 		pat->procfn = re_bmexec;
374 		return;
375 	}
376 	if (bestlen > 1) {
377 			/* Partial Boyer Moore */
378 		pat->bm_ptr = re_bmcomp(lb, le, pat->cmap);
379 		pat->procfn = re_bmexec;
380 		pat->fullmatch = NO;
381 	} else {
382 		pat->fullmatch = YES;
383 		if ((pat->cw_ptr = re_recw(pat->re_ptr, pat->cmap)) != NULL) {
384 			pat->procfn = re_cwexec; /* CW */
385 			return;
386 		}
387 	}
388 	/* general egrep regular expression */
389 	pat->succfn = re_reexec;
390 
391 	if (pat->fullmatch) {
392 		pat->procfn = pat->succfn;
393 		pat->succfn = NULL;
394 	}
395 }
396 
397 static BOOL
fixloc(uchar_t ** mb,uchar_t ** me)398 fixloc(uchar_t **mb, uchar_t **me)
399 {
400 	/* Handle match to null string */
401 
402 	while (*me <= *mb)
403 		(*me)++;
404 
405 	if (*(*me - 1) != NL)
406 		while (**me != NL)
407 			(*me)++;
408 
409 	/* Handle match to new-line only */
410 
411 	if (*mb == *me - 1 && **mb == NL) {
412 		(*me)++;
413 	}
414 
415 	/* Handle match including beginning or ending new-line */
416 
417 	if (**mb == NL)
418 		(*mb)++;
419 	if (*(*me - 1) == NL)
420 		(*me)--;
421 	return (YES);
422 }
423 
424 static BOOL
grepmatch(PATTERN * pat,uchar_t ** mb,uchar_t ** me)425 grepmatch(PATTERN *pat, uchar_t **mb, uchar_t **me)
426 {
427 	uchar_t *s, *f;
428 
429 	if (pat->fullmatch)
430 		return (fixloc(mb, me));
431 
432 	for (f = *me - 1; *f != NL; f++) {
433 	}
434 	f++;
435 	for (s = *mb; *s != NL; s--) {
436 	}
437 
438 	if ((*pat->succfn)(pat, s, f, mb, me)) {
439 		return (YES);
440 	} else {
441 		*mb = f;
442 		return (NO);
443 	}
444 }
445 
446 static void
eginit(re_re * r)447 eginit(re_re *r)
448 {
449 	unsigned int n;
450 
451 	r->ptr = (Expr **)egmalloc(r->maxid * sizeof (Expr *));
452 	eptr(r, r->root);
453 	n = r->maxid * sizeof (FID);
454 	r->firstpos.base = (FID *)egmalloc(n);
455 	r->tmp.base = (FID *)egmalloc(n);
456 	CLEAR(r, &r->firstpos);
457 	if (!first_lit(&r->firstpos, r->root->l)) {
458 		/*
459 		 * This expression matches the null string!!!!
460 		 * Add EOP to beginning position set.
461 		 */
462 		SET(&r->firstpos, r->root->id, 0)
463 		/* (void) printf("first of root->l == 0, b=%s\n", b); */
464 	}
465 	stateinit(r);
466 	(void) addstate(r, &r->firstpos, 0);
467 	savestate(r);
468 }
469 
470 static void
eptr(re_re * r,Expr * e)471 eptr(re_re *r, Expr *e)
472 {
473 	if ((e->id < 0) || (e->id >= r->maxid)) {
474 		err("internal error");
475 	}
476 	r->ptr[e->id] = e;
477 	if (e->type != Charclass) {
478 		if (e->l) eptr(r, e->l);
479 		if (e->r) eptr(r, e->r);
480 	}
481 }
482 
483 static BOOL
re_reexec(PATTERN * pat,uchar_t * b,uchar_t * e,uchar_t ** mb,uchar_t ** me)484 re_reexec(PATTERN *pat, uchar_t *b, uchar_t *e, uchar_t **mb, uchar_t **me)
485 {
486 	re_re *r = pat->re_ptr;
487 	State *s, *t;
488 
489 #ifdef	DEBUG
490 	if (TRACE(10)) {
491 		uchar_t buf[EPRINTSIZE];
492 		epr(r->root, buf);
493 		(void) printf("expr='%s'\n", buf);
494 	}
495 #endif
496 	s = startstate(r);
497 
498 	for (;;) {
499 		uchar_t c;
500 
501 		if (s->cnt >= 0) {
502 #ifdef	DEBUG
503 			if (TRACE(6))
504 				(void) printf("match at input '%s'\n", b);
505 #endif
506 			*mb = b - s->cnt;
507 			*me = b;
508 			if (fixloc(mb, me))
509 				return (YES);
510 		}
511 
512 		if (b >= e) break;
513 		c = pat->cmap[*b];
514 #ifdef	DEBUG
515 		if (TRACE(4))
516 			(void) printf("state %d: char '%c'\n", s-r->states, *b);
517 #endif
518 		if ((t = s->tab[c]) != NULL) s = t;
519 		else s = nextstate(r, s, (int)c);
520 		b++;
521 	}
522 #ifdef	DEBUG
523 	if (TRACE(3)) {
524 		uchar_t buf[EPRINTSIZE];
525 
526 		epr(r->root, buf);
527 		(void) printf("pat = %s\n", buf);
528 	}
529 #endif
530 	return (NO);
531 }
532 
533 static BOOL
match(Expr * e,int a)534 match(Expr *e, int a)
535 {
536 	switch (e->type) {
537 	case Dot:	return ((BOOL)(a != NL));
538 	case Literal:	return ((BOOL)(a == e->lit));
539 	case Charclass:	return ((BOOL)(CCL_CHK((uchar_t *)e->r, a)));
540 	default:	return (NO);
541 	}
542 }
543 
544 /*
545  * generates the followset for a node in fpos
546  */
547 static void
follow(Positionset * fpos,Expr * e)548 follow(Positionset *fpos, Expr *e)
549 {
550 	Expr *p;
551 
552 	if (e->type == EOP)
553 		return;
554 	else
555 		p = e->parent;
556 	switch (p->type) {
557 	case EOP:
558 		SET(fpos, p->id, 0)
559 		break;
560 	case Plus:
561 	case Star:
562 		(void) first_lit(fpos, e);
563 		follow(fpos, p);
564 		break;
565 	case Quest:
566 	case Alternate:
567 		follow(fpos, p);
568 		break;
569 	case Cat:
570 		if (e == p->r || !first_lit(fpos, p->r))
571 			follow(fpos, p);
572 		break;
573 	default:
574 		break;
575 	}
576 }
577 
578 /*
579  * first_lit returns NO if e is nullable and in the process,
580  * ets up fpos.
581  */
582 static BOOL
first_lit(Positionset * fpos,Expr * e)583 first_lit(Positionset *fpos, Expr *e)
584 {
585 	BOOL k;
586 
587 	switch (e->type) {
588 	case Literal:
589 	case Dot:
590 	case Charclass:
591 		SET(fpos, e->id, 0)
592 		return (YES);
593 	case EOP:
594 	case Star:
595 	case Quest:
596 		(void) first_lit(fpos, e->l);
597 		return (NO);
598 	case Plus:
599 		return (first_lit(fpos, e->l));
600 	case Cat:
601 		return ((BOOL)(first_lit(fpos, e->l) || first_lit(fpos, e->r)));
602 	case Alternate:
603 		k = first_lit(fpos, e->r);
604 		return ((BOOL)(first_lit(fpos, e->l) && k));
605 	default:
606 		err("internal error");
607 	}
608 	return (NO);
609 }
610 
611 static void
efollow(re_re * r,Positionset * fpos,Expr * e)612 efollow(re_re *r, Positionset *fpos, Expr *e)
613 {
614 	ID i, *p;
615 
616 	CLEAR(r, fpos);
617 	follow(fpos, e);
618 	e->flen = fpos->count;
619 	e->follow = (ID *)egmalloc(e->flen * sizeof (ID));
620 	p = e->follow;
621 #ifdef	DEBUG
622 	printf("ID = %d LIT %c FLEN = %d\n", e->id, e->lit, e->flen);
623 #endif
624 	for (i = fpos->last; i > 0; i = fpos->base[i].id) {
625 		*p++ = i;
626 #ifdef	DEBUG
627 	printf("FOLLOW ID = %d LIT %c\n", r->ptr[i]->id, r->ptr[i]->lit);
628 #endif
629 	}
630 	if (p != e->follow + e->flen) {
631 		err("internal error");
632 	}
633 }
634 
635 static State *
addstate(re_re * r,Positionset * ps,int cnt)636 addstate(re_re *r, Positionset *ps, int cnt)
637 {
638 	ID j;
639 	FID *p, *q;
640 	State *s;
641 
642 	if (cnt) {
643 		s = r->states + getstate(r, ps);
644 		(void) memset((char *)s->tab, 0, sizeof (s->tab));
645 		s->cnt = r->istate.cnt;
646 	} else {
647 		s = &r->istate;
648 		s->cnt = -1;
649 	}
650 	s->pos = r->posnext;
651 	r->posnext += ps->count;
652 	s->npos = ps->count;
653 	p = r->posbase + s->pos;
654 	for (j = ps->last; j > 0; p++, j = q->id) {
655 		q = &ps->base[j];
656 		p->id = j;
657 		p->fcount = q->fcount;
658 		if (p->id == r->root->id && s->cnt < p->fcount)
659 			s->cnt = p->fcount;
660 	}
661 #ifdef	DEBUG
662 	if (TRACE(3)) {
663 		uchar_t buf[2000];
664 		spr(s->npos, s->pos+r->posbase, buf);
665 		(void) printf("new state[%d] %s%s\n", s-r->states, buf,
666 		    s->cnt?" cnt":"");
667 	}
668 #endif
669 	return (s);
670 }
671 
672 static State *
nextstate(re_re * r,State * s,int a)673 nextstate(re_re *r, State *s, int a)
674 {
675 	State *news;
676 
677 	CLEAR(r, &r->tmp);
678 	followstate(r, s, a, &r->tmp);
679 	if (s != &r->istate) followstate(r, &r->istate, a, &r->tmp);
680 
681 #ifdef	DEBUG
682 	if (TRACE(5)) {
683 		uchar_t buf[2000];
684 		ppr(&r->tmp, buf);
685 		(void) printf("nextstate(%d, '%c'): found %s\n", s-r->states,
686 		    a, buf);
687 	}
688 #endif
689 	if (r->tmp.count == 0)
690 		news = &r->istate;
691 	else if ((news = stateof(r, &r->tmp)) == NULL)
692 		news = addstate(r, &r->tmp, 1);
693 	s->tab[a] = news;
694 #ifdef	DEBUG
695 	if (TRACE(5)) {
696 		(void) printf("nextstate(%d, '%c'): returning %ld\n",
697 		    s-r->states, a, news);
698 	}
699 #endif
700 	return (news);
701 }
702 
703 static void
followstate(re_re * r,State * s,int a,Positionset * fpos)704 followstate(re_re *r, State *s, int a, Positionset *fpos)
705 {
706 	int j;
707 	ID *q, *eq;
708 	Expr *e;
709 
710 	for (j = s->pos; j < (s->pos + s->npos); j++) {
711 		e = r->ptr[r->posbase[j].id];
712 		if (e->type == EOP) continue;
713 		if (match(e, a)) {
714 			if (e->follow == NULL) efollow(r, &r->firstpos, e);
715 			for (q = e->follow, eq = q + e->flen; q < eq; q++) {
716 				SET(fpos, *q, r->posbase[j].fcount + 1)
717 #ifdef	DEBUG
718 				printf("CHAR %c FC %c COUNT %d\n", a,
719 				    r->ptr[*q]->lit, r->posbase[j].fcount+1);
720 #endif
721 			}
722 		}
723 	}
724 }
725 
726 static uchar_t *
egmalloc(size_t n)727 egmalloc(size_t n)
728 {
729 	uchar_t *x;
730 
731 	x = (uchar_t *)mymalloc(n);
732 	mallocs[nmalloc++] = x;
733 	if (nmalloc >= MAXMALLOCS)
734 		nmalloc = MAXMALLOCS - 1;
735 	return (x);
736 }
737 
738 #ifdef	DEBUG
739 void
ppr(Positionse * ps,char * p)740 ppr(Positionse *ps, char *p)
741 {
742 	ID n;
743 
744 	if (ps->count < 1) {
745 		(void) sprintf(p, "{}");
746 		return;
747 	}
748 	*p++ = '{';
749 	for (n = ps->last; n > 0; n = ps->base[n].id) {
750 		(void) sprintf(p, "%d,", n);
751 		p = strchr(p, 0);
752 	}
753 	p[-1] = '}';
754 }
755 
756 void
epr(Expr * e,uchar_t * res)757 epr(Expr *e, uchar_t *res)
758 {
759 	uchar_t r1[EPRINTSIZE], r2[EPRINTSIZE];
760 	int i;
761 
762 	if (e == NULL) {
763 		(void) sprintf(res, "!0!");
764 		return;
765 	}
766 	switch (e->type) {
767 	case Dot:
768 	case Literal:
769 		spr(e->flen, e->follow, r1);
770 		(void) sprintf(res, "%c%s", e->lit, r1);
771 		break;
772 	case Charclass:
773 		*res++ = '[';
774 		for (i = 0; i < 256; i++)
775 			if (CCL_CHK((uchar_t *)e->r, i)) {
776 				*res++ = i;
777 			}
778 		*res++ = ']';
779 		*res = '\0';
780 		break;
781 	case Cat:
782 		epr(e->l, r1);
783 		epr(e->r, r2);
784 		(void) sprintf(res, "%s%s", r1, r2);
785 		break;
786 	case Alternate:
787 		epr(e->l, r1);
788 		epr(e->r, r2);
789 		(void) sprintf(res, "(%s|%s)", r1, r2);
790 		break;
791 	case Star:
792 		epr(e->l, r1);
793 		(void) sprintf(res, "(%s)*", r1);
794 		break;
795 	case Plus:
796 		epr(e->l, r1);
797 		(void) sprintf(res, "(%s)+", r1);
798 		break;
799 	case Quest:
800 		epr(e->l, r1);
801 		(void) sprintf(res, "(%s)?", r1);
802 		break;
803 	case EOP:
804 		epr(e->l, r1);
805 		(void) sprintf(res, "%s<EOP>", r1);
806 		break;
807 	default:
808 		(void) sprintf(res, "<undef type %d>", e->type);
809 		err(res);
810 		break;
811 	}
812 }
813 
814 static void
spr(int c,int * p,uchar_t * buf)815 spr(int c, int *p, uchar_t *buf)
816 {
817 	if (c > 0) {
818 		*buf++ = '{';
819 		*buf = '\0';
820 		while (--c > 0) {
821 			(void) sprintf(buf, "%d,", *p++);
822 			buf = strchr(buf, 0);
823 		}
824 		(void) sprintf(buf, "%d}", *p);
825 	} else
826 		(void) sprintf(buf, "{}");
827 }
828 #endif
829 
830 static void
stateinit(re_re * r)831 stateinit(re_re *r)
832 {
833 	/* CONSTANTCONDITION */
834 	r->statelim = (sizeof (int) < 4 ? 32 : 128);
835 	r->states = (State *)egmalloc(r->statelim * sizeof (State));
836 
837 	/* CONSTANTCONDITION */
838 	r->nposalloc = (sizeof (int) < 4 ? 2048 : 8192);
839 	r->posbase = (FID *)egmalloc(r->nposalloc * sizeof (FID));
840 	r->nstates = r->posnext = 0;
841 }
842 
843 static void
clrstates(re_re * r)844 clrstates(re_re *r)
845 {
846 	r->nstates = 0;		/* reclaim space for states and positions */
847 	r->posnext = r->posreset;
848 	(void) memset((char *)r->istate.tab, 0, sizeof (r->istate.tab));
849 }
850 
851 static void
savestate(re_re * r)852 savestate(re_re *r)
853 {
854 	r->posreset = r->posnext;	/* save for reset */
855 }
856 
857 static State *
startstate(re_re * r)858 startstate(re_re *r)
859 {
860 	return (&r->istate);
861 }
862 
863 static int
getstate(re_re * r,Positionset * ps)864 getstate(re_re *r, Positionset *ps)
865 {
866 	if (r->nstates >= r->statelim ||
867 	    r->posnext + ps->count >= r->nposalloc) {
868 		clrstates(r);
869 #ifdef	DEBUG
870 		printf("%d STATES FLUSHED\n", r->statelim);
871 #endif
872 	}
873 	return (r->nstates++);
874 }
875 
876 static State *
stateof(re_re * r,Positionset * ps)877 stateof(re_re *r, Positionset *ps)
878 {
879 	State *s;
880 	int i;
881 	FID *p, *e;
882 
883 	for (i = 0, s = r->states; i < r->nstates; i++, s++) {
884 		if (s->npos == ps->count) {
885 			for (p = s->pos+r->posbase, e = p+s->npos; p < e; p++)
886 				if (ps->base[p->id].id == 0 ||
887 				    ps->base[p->id].fcount != p->fcount)
888 					goto next;
889 			return (s);
890 		}
891 	next:;
892 	}
893 	return (NULL);
894 }
895 
896 static re_re *
egprep(PATTERN * pat)897 egprep(PATTERN *pat)
898 {
899 	re_re *r;
900 
901 	r = (re_re *)egmalloc(sizeof (re_re));
902 	(void) memset((char *)r, 0, sizeof (re_re));
903 
904 	pat->loc1 = pat->expression;
905 	pat->loc2 = pat->expression + strlen((char *)pat->expression);
906 
907 	parno = 0;
908 	maxid = 1;
909 	r->cmap = pat->cmap;
910 	lex(r, pat);
911 	r->root = newexpr(EOP, '#', eall(r, pat), (Expr *)NULL);
912 	r->maxid = maxid;
913 
914 	eginit(r);
915 	return (r);
916 }
917 
918 static Expr *
newexpr(Exprtype t,int lit,Expr * left,Expr * right)919 newexpr(Exprtype t, int lit, Expr *left, Expr *right)
920 {
921 	Expr *e = (Expr *)egmalloc(sizeof (Expr));
922 
923 	e->type = t;
924 	e->parent = NULL;
925 	e->lit = lit;
926 
927 	if (e->lit) e->id = maxid++;
928 	else e->id = 0;
929 
930 	if ((e->l = left) != NULL) {
931 		left->parent = e;
932 	}
933 	if ((e->r = right) != NULL) {
934 		right->parent = e;
935 	}
936 	e->follow = NULL;
937 	e->flen = 0;
938 	return (e);
939 }
940 
941 static void
lex(re_re * r,PATTERN * pat)942 lex(re_re *r, PATTERN *pat)
943 {
944 	if (pat->loc1 == pat->loc2) {
945 		toktype = EOP;
946 		toklit = -1;
947 	} else switch (toklit = *pat->loc1++) {
948 	case '.':	toktype = Dot; break;
949 	case '*':	toktype = Star; break;
950 	case '+':	toktype = Plus; break;
951 	case '?':	toktype = Quest; break;
952 	case '[':	toktype = Charclass; break;
953 	case '|':	toktype = Alternate; break;
954 	case '(':	toktype = Lpar; break;
955 	case ')':	toktype = Rpar; break;
956 	case '\\':	toktype = Backslash;
957 			if (pat->loc1 == pat->loc2) {
958 				err("syntax error - missing character "
959 				    "after \\");
960 			} else {
961 				toklit = r->cmap[*pat->loc1++];
962 			}
963 			break;
964 	case '^': case '$':	toktype = Literal; toklit = NL; break;
965 	default:	toktype = Literal; toklit = r->cmap[toklit]; break;
966 	}
967 }
968 
969 static int
ccl(PATTERN * pat,uchar_t * tab)970 ccl(PATTERN *pat, uchar_t *tab)
971 {
972 	int i;
973 	int range = 0;
974 	int lastc = -1;
975 	int count = 0;
976 	BOOL comp = NO;
977 
978 	(void) memset((char *)tab, 0, CCL_SIZ * sizeof (uchar_t));
979 	if (*pat->loc1 == '^') {
980 		pat->loc1++;
981 		comp = YES;
982 	}
983 	if (*pat->loc1 == ']') {
984 		uchar_t c = pat->cmap[*pat->loc1];
985 		CCL_SET(tab, c);
986 		lastc = *pat->loc1++;
987 	}
988 	/* scan for chars */
989 	for (; (pat->loc1 < pat->loc2) && (*pat->loc1 != ']'); pat->loc1++) {
990 		if (*pat->loc1 == '-') {
991 			if (lastc < 0) CCL_SET(tab, pat->cmap['-']);
992 			else range = 1;
993 			continue;
994 		}
995 		if (range) {
996 			for (i = *pat->loc1; i >= lastc; i--) {
997 				CCL_SET(tab, pat->cmap[i]);
998 			}
999 		} else {
1000 			uchar_t c = pat->cmap[*pat->loc1];
1001 			CCL_SET(tab, c);
1002 		}
1003 
1004 		range = 0;
1005 
1006 		lastc = *pat->loc1;
1007 	}
1008 	if (range) CCL_SET(tab, pat->cmap['-']);
1009 
1010 	if (pat->loc1 < pat->loc2) pat->loc1++;
1011 	else err("syntax error - missing ]");
1012 
1013 	if (comp) {
1014 		CCL_SET(tab, pat->cmap[NL]);
1015 		for (i = 0; i < CCL_SIZ; i++) tab[i] ^= 0xff;
1016 	}
1017 	for (i = 0; i < 256; i++) {
1018 		if (pat->cmap[i] != i) CCL_CLR(tab, i);
1019 		if (CCL_CHK(tab, i)) {
1020 			lastc = i;
1021 			count++;
1022 		}
1023 	}
1024 	if (count == 1)
1025 		*tab = (char)lastc;
1026 	return (count);
1027 }
1028 
1029 /*
1030  * egrep patterns:
1031  *
1032  * Alternation:	d0:	d1 { '|' d1 }*
1033  * Concatenation:	d1:	d2 { d2 }*
1034  * Repetition:	d2:	d3 { '*' | '?' | '+' }
1035  * Literal:	d3:	lit | '.' | '[]' | '(' d0 ')'
1036  */
1037 
1038 static Expr *
d3(re_re * r,PATTERN * pat)1039 d3(re_re *r, PATTERN *pat)
1040 {
1041 	Expr *e;
1042 	uchar_t *tab;
1043 	int count;
1044 
1045 	switch (toktype) {
1046 	case Backslash:
1047 	case Literal:
1048 		e = newexpr(Literal, toklit, (Expr *)NULL, (Expr *)NULL);
1049 		lex(r, pat);
1050 		break;
1051 	case Dot:
1052 		e = newexpr(Dot, '.', (Expr *)NULL, (Expr *)NULL);
1053 		lex(r, pat);
1054 		break;
1055 	case Charclass:
1056 		tab = egmalloc(CCL_SIZ * sizeof (uchar_t));
1057 		count = ccl(pat, tab);
1058 		if (count == 1) {
1059 			toklit = *tab;
1060 			e = newexpr(Literal, toklit, (Expr *)NULL,
1061 			    (Expr *)NULL);
1062 		} else {
1063 			e = newexpr(Charclass, '[', (Expr *)NULL, (Expr *)NULL);
1064 			e->l = (Expr *)count;	/* number of chars */
1065 			e->r = (Expr *)tab;	/* bitmap of chars */
1066 		}
1067 		lex(r, pat);
1068 		break;
1069 	case Lpar:
1070 		lex(r, pat);
1071 		count = ++parno;
1072 		e = d0(r, pat);
1073 		if (toktype == Rpar)
1074 			lex(r, pat);
1075 		else
1076 			err("syntax error - missing )");
1077 		return (e);
1078 	default:
1079 		err("syntax error");
1080 		e = NULL;
1081 	}
1082 	return (e);
1083 }
1084 
1085 static Expr *
d2(re_re * r,PATTERN * pat)1086 d2(re_re *r, PATTERN *pat)
1087 {
1088 	Expr *e;
1089 	Exprtype t;
1090 
1091 	e = d3(r, pat);
1092 	while ((toktype == Star) || (toktype == Plus) || (toktype == Quest)) {
1093 		t = toktype;
1094 		lex(r, pat);
1095 		e = newexpr(t, 0, e, (Expr *)NULL);
1096 	}
1097 	return (e);
1098 }
1099 
1100 static Expr *
d1(re_re * r,PATTERN * pat)1101 d1(re_re *r, PATTERN *pat)
1102 {
1103 	Expr *e, *f;
1104 
1105 	e = d2(r, pat);
1106 	while ((toktype == Literal) || (toktype == Dot) || (toktype == Lpar) ||
1107 	    (toktype == Backslash) || (toktype == Charclass)) {
1108 		f = d2(r, pat);
1109 		e = newexpr(Cat, 0, e, f);
1110 	}
1111 	return (e);
1112 }
1113 
1114 static Expr *
d0(re_re * r,PATTERN * pat)1115 d0(re_re *r, PATTERN *pat)
1116 {
1117 	Expr *e, *f;
1118 
1119 	e = d1(r, pat);
1120 	while (toktype == Alternate) {
1121 		lex(r, pat);
1122 		if (toktype == EOP)
1123 			continue;
1124 		f = d1(r, pat);
1125 		e = newexpr(Alternate, 0, e, f);
1126 	}
1127 	return (e);
1128 }
1129 
1130 static Expr *
eall(re_re * r,PATTERN * pat)1131 eall(re_re *r, PATTERN *pat)
1132 {
1133 	Expr *e;
1134 
1135 	while (toktype == Alternate)	/* bogus but user-friendly */
1136 		lex(r, pat);
1137 	e = d0(r, pat);
1138 	while (toktype == Alternate)	/* bogus but user-friendly */
1139 		lex(r, pat);
1140 	if (toktype != EOP)
1141 		err("syntax error");
1142 	return (e);
1143 }
1144 
1145 static void
err(char * s)1146 err(char *s)
1147 {
1148 	message = s;
1149 	longjmp(env, 1);
1150 }
1151 
1152 static int
re_lit(PATTERN * pat,uchar_t ** b,uchar_t ** e)1153 re_lit(PATTERN *pat, uchar_t **b, uchar_t **e)
1154 {
1155 	bestlen = 0;
1156 	pat->fullmatch = YES;
1157 	START
1158 	traverse(pat, pat->re_ptr->root->l);
1159 	FINISH
1160 	if (bestlen < 2)
1161 		return (0);
1162 	*b = egmalloc(bestlen * sizeof (uchar_t));
1163 	(void) memmove(*b, best, bestlen);
1164 	*e = *b + bestlen - 1;
1165 	return (bestlen - 1);
1166 }
1167 
1168 static void
traverse(PATTERN * pat,Expr * e)1169 traverse(PATTERN *pat, Expr *e)
1170 {
1171 	switch (e->type) {
1172 	case Literal:
1173 		ADDL(e->lit)
1174 		break;
1175 	case Cat:
1176 		traverse(pat, e->l);
1177 		traverse(pat, e->r);
1178 		break;
1179 	case Plus:
1180 		traverse(pat, e->l);
1181 		FINISH	/* can't go on past a + */
1182 		pat->fullmatch = NO;
1183 		START	/* but we can start with one! */
1184 		traverse(pat, e->l);
1185 		break;
1186 	default:
1187 		FINISH
1188 		pat->fullmatch = NO;
1189 		START
1190 		break;
1191 	}
1192 }
1193 
1194 static re_bm *
re_bmcomp(uchar_t * pb,uchar_t * pe,uchar_t * cmap)1195 re_bmcomp(uchar_t *pb, uchar_t *pe, uchar_t *cmap)
1196 {
1197 	int j;
1198 	int delta[256];
1199 	re_bm *b;
1200 
1201 	b = (re_bm *)egmalloc(sizeof (*b));
1202 
1203 	b->patlen = pe - pb;
1204 	b->cmap = cmap;
1205 	b->bmpat = pb;
1206 
1207 	delta_2(b);
1208 
1209 	for (j = 0; j < 256; j++)
1210 		delta[j] = b->patlen;
1211 
1212 	for (pe--; pb < pe; pb++)
1213 		delta[b->cmap[*pb]] = pe - pb;
1214 
1215 	delta[b->cmap[*pb]] = LARGE;
1216 
1217 	for (j = 0; j < 256; j++)
1218 		b->delta0[j] = delta[b->cmap[j]];
1219 	return (b);
1220 }
1221 
1222 static void
delta_2(re_bm * b)1223 delta_2(re_bm *b)
1224 {
1225 	int m = b->patlen;
1226 	int i, k, j;
1227 
1228 	b->delta2 = (int *)egmalloc(m * sizeof (int));
1229 
1230 	for (j = 0; j < m; j++) {
1231 		k = 1;
1232 again:
1233 		while (k <= j && b->bmpat[j-k] == b->bmpat[j]) k++;
1234 
1235 		for (i = j + 1; i < m; i++) {
1236 			if (k <= i && b->bmpat[i-k] != b->bmpat[i]) {
1237 				k++;
1238 				goto again;
1239 			}
1240 		}
1241 		b->delta2[j] = k + m - j - 1;
1242 	}
1243 }
1244 
1245 static BOOL
re_bmexec(PATTERN * pat,uchar_t * s,uchar_t * e,uchar_t ** mb,uchar_t ** me)1246 re_bmexec(PATTERN *pat, uchar_t *s, uchar_t *e, uchar_t **mb, uchar_t **me)
1247 {
1248 	re_bm *b = pat->bm_ptr;
1249 	uchar_t *sp;
1250 	int k;
1251 
1252 	s += b->patlen - 1;
1253 	while ((unsigned long)s < (unsigned long)e) {
1254 		while ((unsigned long)(s += b->delta0[*s]) < (unsigned long)e)
1255 			;
1256 		if ((unsigned long)s < (unsigned long)(e + b->patlen))
1257 			return (NO); /* no match */
1258 		s -= LARGE;
1259 		for (k = b->patlen-2, sp = s-1; k >= 0; k--) {
1260 			if (b->cmap[*sp--] != b->bmpat[k]) break;
1261 		}
1262 		if (k < 0) {
1263 			*mb = ++sp;
1264 			*me = s+1;
1265 			if (grepmatch(pat, mb, me))
1266 				return (YES);
1267 			s = *mb;
1268 		} else if (k < 0) {
1269 			s++;
1270 		} else {
1271 			int j;
1272 			j = b->delta2[k];
1273 			k = b->delta0[*++sp];
1274 			if ((j > k) || (k == (int)LARGE))
1275 				k = j;
1276 			s = sp + k;
1277 		}
1278 	}
1279 	return (NO);
1280 }
1281 
1282 static re_cw *
re_recw(re_re * r,uchar_t * map)1283 re_recw(re_re *r, uchar_t *map)
1284 {
1285 	Expr *e, *root = r->root;
1286 	re_cw *pat;
1287 	uchar_t *buf;
1288 
1289 	if (root->type != EOP)
1290 		return (NULL);
1291 	e = root->l;
1292 	pat = re_cwinit(map);
1293 	buf = (uchar_t *)egmalloc(20000 * sizeof (uchar_t));
1294 	if (!altlist(e, buf, pat)) {
1295 		return (NULL);
1296 	}
1297 	re_cwcomp(pat);
1298 	return (pat);
1299 }
1300 
1301 static BOOL
altlist(Expr * e,uchar_t * buf,re_cw * pat)1302 altlist(Expr *e, uchar_t *buf, re_cw *pat)
1303 {
1304 	if (e->type == Alternate)
1305 		return ((BOOL)(altlist(e->l, buf, pat) &&
1306 		    altlist(e->r, buf, pat)));
1307 	return (word(e, buf, pat));
1308 }
1309 
1310 static BOOL
word(Expr * e,uchar_t * buf,re_cw * pat)1311 word(Expr *e, uchar_t *buf, re_cw *pat)
1312 {
1313 	static uchar_t *p;
1314 
1315 	if (buf) p = buf;
1316 
1317 	if (e->type == Cat) {
1318 		if (!word(e->l, (uchar_t *)NULL, pat))
1319 			return (NO);
1320 		if (!word(e->r, (uchar_t *)NULL, pat))
1321 			return (NO);
1322 	} else if (e->type == Literal)
1323 		*p++ = e->lit;
1324 	else
1325 		return (NO);
1326 
1327 	if (buf)
1328 		re_cwadd(pat, buf, p);
1329 	return (YES);
1330 }
1331 
1332 static re_cw *
re_cwinit(uchar_t * cmap)1333 re_cwinit(uchar_t *cmap)
1334 {
1335 	re_cw *c;
1336 
1337 	froot = NULL;
1338 	next_node = NULL;
1339 	next_link = NULL;
1340 	c = (re_cw *)egmalloc(sizeof (*c));
1341 	c->nodeid = 0;
1342 	c->maxdepth = 0;
1343 	c->mindepth = 10000;
1344 	c->root = newnode(c, 0);
1345 	c->cmap = cmap;
1346 	return (c);
1347 }
1348 
1349 static void
re_cwadd(re_cw * c,uchar_t * s,uchar_t * e)1350 re_cwadd(re_cw *c, uchar_t *s, uchar_t *e)
1351 {
1352 	Node *p, *state;
1353 	Link *l;
1354 	int depth;
1355 
1356 	state = c->root;
1357 	while (s <= --e) {
1358 		for (l = state->alts; l; l = l->next)
1359 			if (l->lit == *e)
1360 				break;
1361 		if (l == NULL)
1362 			break;
1363 		else
1364 			state = l->node;
1365 	}
1366 	if (s <= e) {
1367 		depth = state->d+1;
1368 		l = newlink(*e--, p = newnode(c, depth++));
1369 		l->next = state->alts;
1370 		state->alts = l;
1371 		state = p;
1372 		while (s <= e) {
1373 			state->alts = newlink(*e--, p = newnode(c, depth++));
1374 			state = p;
1375 		}
1376 		if (c->mindepth >= depth) c->mindepth = depth-1;
1377 	}
1378 	state->out = 1;
1379 }
1380 
1381 #ifdef	DEBUG
1382 static
pr(Node * n)1383 pr(Node *n)
1384 {
1385 	Link *l;
1386 
1387 	printf("%d[%d,%d]: ", n->id, n->shift1, n->shift2);
1388 	for (l = n->alts; l; l = l->next) {
1389 		printf("edge from \"%d\" to \"%d\" label {\"%c\"};",
1390 		    n->id, l->node->id, l->lit);
1391 		if (l->node->out) {
1392 			printf(" draw \"%d\" as Doublecircle;", l->node->id);
1393 		}
1394 		if (l->node->fail) {
1395 			printf(" edge from \"%d\" to \"%d\" dashed;",
1396 			    l->node->id, l->node->fail->id);
1397 		}
1398 		printf("\n");
1399 		pr(l->node);
1400 	}
1401 }
1402 #endif
1403 
1404 static void
fail(Node * root)1405 fail(Node *root)
1406 {
1407 	Link *qhead = NULL, *qtail = NULL;
1408 	Link *l, *ll;
1409 	Link *t;
1410 	Node *state, *r, *s;
1411 	int a;
1412 
1413 	for (l = root->alts; l; l = l->next) {
1414 		ADD(l->node);
1415 		l->node->fail = root;
1416 	}
1417 	while (qhead) {
1418 		r = qhead->node;
1419 		DEL();
1420 		for (l = r->alts; l; l = l->next) {
1421 			s = l->node;
1422 			a = l->lit;
1423 			ADD(s);
1424 			state = r->fail;
1425 			while (state) {
1426 				for (ll = state->alts; ll; ll = ll->next)
1427 					if (ll->lit == a)
1428 						break;
1429 				if (ll || (state == root)) {
1430 					if (ll)
1431 						state = ll->node;
1432 					/*
1433 					 *	do it here as only other exit is
1434 					 *	state 0
1435 					 */
1436 					if (state->out) {
1437 						s->out = 1;
1438 					}
1439 					break;
1440 				} else
1441 					state = state->fail;
1442 			}
1443 			s->fail = state;
1444 		}
1445 	}
1446 	zeroroot(root, root);
1447 }
1448 
1449 static void
zeroroot(Node * root,Node * n)1450 zeroroot(Node *root, Node *n)
1451 {
1452 	Link *l;
1453 
1454 	if (n->fail == root)
1455 		n->fail = NULL;
1456 	for (l = n->alts; l; l = l->next)
1457 		zeroroot(root, l->node);
1458 }
1459 
1460 static void
shift(re_cw * c)1461 shift(re_cw *c)
1462 {
1463 	Link *qhead = NULL, *qtail = NULL;
1464 	Link *l;
1465 	Link *t;
1466 	Node *state, *r, *s;
1467 	int k;
1468 
1469 	for (k = 0; k < 256; k++)
1470 		c->step[k] = c->mindepth+1;
1471 	c->root->shift1 = 1;
1472 	c->root->shift2 = c->mindepth;
1473 	for (l = c->root->alts; l; l = l->next) {
1474 		l->node->shift2 = c->root->shift2;
1475 		ADD(l->node);
1476 		l->node->fail = NULL;
1477 	}
1478 	while (qhead) {
1479 		r = qhead->node;
1480 		DEL();
1481 		r->shift1 = c->mindepth;
1482 		r->shift2 = c->mindepth;
1483 		if ((state = r->fail) != NULL) {
1484 			do {
1485 				k = r->d - state->d;
1486 				if (k < state->shift1) {
1487 					state->shift1 = k;
1488 				}
1489 				if (r->out && (k < state->shift2)) {
1490 					state->shift2 = k;
1491 				}
1492 			} while ((state = state->fail) != NULL);
1493 		}
1494 		for (l = r->alts; l; l = l->next) {
1495 			s = l->node;
1496 			ADD(s);
1497 		}
1498 	}
1499 	shiftprop(c, c->root);
1500 	shifttab(c->root);
1501 	c->step[0] = 1;
1502 }
1503 
1504 static void
shifttab(Node * n)1505 shifttab(Node *n)
1506 {
1507 	Link *l;
1508 	Node **nn;
1509 
1510 	n->tab = nn = (Node **)egmalloc(256 * sizeof (Node *));
1511 	(void) memset((char *)n->tab, 0, 256 * sizeof (Node *));
1512 	for (l = n->alts; l; l = l->next)
1513 		nn[l->lit] = l->node;
1514 }
1515 
1516 static void
shiftprop(re_cw * c,Node * n)1517 shiftprop(re_cw *c, Node *n)
1518 {
1519 	Link *l;
1520 	Node *nn;
1521 
1522 	for (l = n->alts; l; l = l->next) {
1523 		if (c->step[l->lit] > l->node->d)
1524 			c->step[l->lit] = l->node->d;
1525 		nn = l->node;
1526 		if (n->shift2 < nn->shift2)
1527 			nn->shift2 = n->shift2;
1528 		shiftprop(c, nn);
1529 	}
1530 }
1531 
1532 static void
re_cwcomp(re_cw * c)1533 re_cwcomp(re_cw *c)
1534 {
1535 	fail(c->root);
1536 	shift(c);
1537 }
1538 
1539 static BOOL
re_cwexec(PATTERN * pat,uchar_t * rs,uchar_t * re,uchar_t ** mb,uchar_t ** me)1540 re_cwexec(PATTERN *pat, uchar_t *rs, uchar_t *re, uchar_t **mb, uchar_t **me)
1541 {
1542 	Node *state;
1543 	Link *l;
1544 	uchar_t *sp;
1545 	uchar_t *s;
1546 	uchar_t *e;
1547 	Node *ostate;
1548 	int k;
1549 	re_cw *c = pat->cw_ptr;
1550 	uchar_t fake[1];
1551 	uchar_t mappedsp;
1552 
1553 	fake[0] = 0;
1554 	state = c->root;
1555 
1556 	s = rs+c->mindepth-1;
1557 	e = re;
1558 	while (s < e) {
1559 		/* scan */
1560 		for (sp = s; (ostate = state) != NULL; ) {
1561 			mappedsp = c->cmap[*sp];
1562 			if (state->tab) {
1563 				if ((state = state->tab[mappedsp]) == NULL)
1564 					goto nomatch;
1565 			} else {
1566 				for (l = state->alts; ; l = l->next) {
1567 					if (l == NULL)
1568 						goto nomatch;
1569 					if (l->lit == mappedsp) {
1570 						state = l->node;
1571 						break;
1572 					}
1573 				}
1574 			}
1575 			if (state->out) {
1576 				*mb = sp;
1577 				*me = s+1;
1578 				if (fixloc(mb, me))
1579 					return (YES);
1580 			}
1581 			if (--sp < rs) {
1582 				sp = fake;
1583 				break;
1584 			}
1585 		}
1586 	nomatch:
1587 		k = c->step[c->cmap[*sp]] - ostate->d - 1;
1588 		if (k < ostate->shift1)
1589 			k = ostate->shift1;
1590 		if (k > ostate->shift2)
1591 			k = ostate->shift2;
1592 		s += k;
1593 		state = c->root;
1594 	}
1595 	return (NO);
1596 }
1597 
1598 static Node *
newnode(re_cw * c,int d)1599 newnode(re_cw *c, int d)
1600 {
1601 	static Node *lim = NULL;
1602 	static int incr = 256;
1603 
1604 	if (!next_node) lim = NULL;
1605 	if (next_node == lim) {
1606 		next_node = (Node *)egmalloc(incr * sizeof (Node));
1607 		lim = next_node + incr;
1608 	}
1609 	next_node->d = d;
1610 	if (d > c->maxdepth) c->maxdepth = d;
1611 	next_node->id = c->nodeid++;
1612 	next_node->alts = NULL;
1613 	next_node->tab = NULL;
1614 	next_node->out = 0;
1615 	return (next_node++);
1616 }
1617 
1618 static Link *
newlink(uchar_t lit,Node * n)1619 newlink(uchar_t lit, Node *n)
1620 {
1621 	static Link *lim = NULL;
1622 	static int incr = 256;
1623 
1624 	if (!next_link) lim = NULL;
1625 	if (next_link == lim) {
1626 		next_link = (Link *)egmalloc(incr * sizeof (Link));
1627 		lim = next_link + incr;
1628 	}
1629 	next_link->lit = lit;
1630 	next_link->node = n;
1631 	next_link->next = NULL;
1632 	return (next_link++);
1633 }
1634 
1635 int
egrep(char * f,FILE * o,char * fo)1636 egrep(char *f, FILE *o, char *fo)
1637 {
1638 	uchar_t		buff[MAXBUFSIZE + ESIZE];
1639 
1640 	buffer = buff;
1641 	*buffer++ = NL;  /* New line precedes buffer to prevent runover */
1642 	file = f;
1643 	output = o;
1644 	format = fo;
1645 	return (execute());
1646 }
1647 
1648 static int
execute(void)1649 execute(void)
1650 {
1651 	LINE		current;
1652 	BOOL		matched;
1653 	BOOL	all_searched; /* file being matched until end */
1654 
1655 	if ((file_desc = open(file, O_RDONLY)) < 0) {
1656 		return (-1);
1657 	}
1658 	init_file(&current);
1659 		/* while there is more get more text from the data stream */
1660 	for (;;) {
1661 		all_searched = NO;
1662 
1663 		/*
1664 		 * Find next new-line in buffer.
1665 		 * Begin after previous new line character
1666 		 */
1667 
1668 		current.prntbuf = current.newline + 1;
1669 		current.ln++; /* increment line number */
1670 
1671 		if (current.prntbuf < bufend) {
1672 			/* There is more text in buffer */
1673 
1674 			/*
1675 			 * Take our next
1676 			 * "line" as the entire remaining buffer.
1677 			 * However, if there is more of the file yet
1678 			 * to be read in, exclude any incomplete
1679 			 * line at end.
1680 			 */
1681 			if (file_stat == NO_MORE) {
1682 				all_searched = YES;
1683 				current.newline = bufend - 1;
1684 				if (*current.newline != NL) {
1685 					current.newline = bufend;
1686 				}
1687 			} else {
1688 				/*
1689 				 * Find end of the last
1690 				 * complete line in the buffer.
1691 				 */
1692 				current.newline = bufend;
1693 				while (*--current.newline != NL) {
1694 				}
1695 				if (current.newline < current.prntbuf) {
1696 					/* end not found */
1697 					current.newline = bufend;
1698 				}
1699 			}
1700 		} else {
1701 			/* There is no more text in the buffer. */
1702 			current.newline = bufend;
1703 		}
1704 		if (current.newline >= bufend) {
1705 			/*
1706 			 * There is no more text in the buffer,
1707 			 * or no new line was found.
1708 			 */
1709 			switch (file_stat) {
1710 			case MORE:	/* file partly unread */
1711 			case BEGIN:
1712 				fgetfile(&current);
1713 
1714 				current.ln--;
1715 				continue; /* with while loop */
1716 			case NO_MORE:
1717 				break;
1718 			}
1719 			/* Nothing more to read in for this file. */
1720 			if (current.newline <= current.prntbuf) {
1721 				/* Nothing in the buffer, either */
1722 				/* We are done with the file. */
1723 				current.ln--;
1724 				break; /* out of while loop */
1725 			}
1726 			/* There is no NL at the end of the file */
1727 		}
1728 
1729 		matched = pattern_match(&match_pattern, &current);
1730 		if (matched) {
1731 			int nc;
1732 
1733 			get_ncount(&current, match_pattern.loc1);
1734 			get_line(&current, match_pattern.loc2);
1735 
1736 			nc = current.newline + 1 - current.prntbuf;
1737 			(void) fprintf(output, format, file, current.ln);
1738 			(void) fwrite((char *)current.prntbuf, 1, nc, output);
1739 		} else {
1740 			if (all_searched)
1741 				break; /* out of while loop */
1742 
1743 			get_ncount(&current, current.newline + 1);
1744 		}
1745 	}
1746 
1747 	(void) close(file_desc);
1748 	return (0);
1749 }
1750 
1751 static void
init_file(LINE * cur_ptr)1752 init_file(LINE *cur_ptr)
1753 {
1754 	file_stat = BEGIN;
1755 	cur_ptr->ln = 0;
1756 	bufend = buffer;
1757 	cur_ptr->newline = buffer - 1;
1758 }
1759 
1760 static BOOL
pattern_match(PATTERN * pat,LINE * lptr)1761 pattern_match(PATTERN *pat, LINE *lptr)
1762 {
1763 	if ((*pat->procfn)(pat, lptr->prntbuf - 1, lptr->newline + 1,
1764 	    &pat->loc1, &pat->loc2)) {
1765 		return (YES);
1766 	} else {
1767 		pat->loc1 = lptr->prntbuf;
1768 		pat->loc2 = lptr->newline - 1;
1769 		return (NO);
1770 	}
1771 } /* end of function pattern_match() */
1772 
1773 static void
fgetfile(LINE * cur_ptr)1774 fgetfile(LINE *cur_ptr)
1775 {
1776 	/*
1777 	 * This function reads as much of the current file into the buffer
1778 	 * as will fit.
1779 	 */
1780 	int	bytes;	  /* bytes read in current buffer */
1781 	int	bufsize = MAXBUFSIZE; /* free space in data buffer */
1782 	int	save_current;
1783 	uchar_t	*begin = cur_ptr->prntbuf;
1784 
1785 	/*
1786 	 * Bytes of current incomplete line, if any, save_current in buffer.
1787 	 * These must be saved.
1788 	 */
1789 	save_current = (int)(bufend - begin);
1790 
1791 	if (file_stat == MORE) {
1792 		/*
1793 		 * A portion of the file fills the buffer. We must clear
1794 		 * out the dead wood to make room for more of the file.
1795 		 */
1796 
1797 		int k = 0;
1798 
1799 		k = begin - buffer;
1800 		if (!k) {
1801 			/*
1802 			 * We have one humungous current line,
1803 			 * which fills the whole buffer.
1804 			 * Toss it.
1805 			 */
1806 			begin = bufend;
1807 			k = begin - buffer;
1808 			save_current = 0;
1809 		}
1810 
1811 		begin = buffer;
1812 		bufend = begin + save_current;
1813 
1814 		bufsize = MAXBUFSIZE - save_current;
1815 
1816 		if (save_current > 0) {
1817 			/*
1818 			 * Must save portion of current line.
1819 			 * Copy to beginning of buffer.
1820 			 */
1821 			(void) memmove(buffer, buffer + k, save_current);
1822 		}
1823 	}
1824 
1825 	/* Now read in the file. */
1826 
1827 	do {
1828 		bytes = read(file_desc, (char *)bufend, (unsigned int)bufsize);
1829 		if (bytes < 0) {
1830 			/* can't read any more of file */
1831 			bytes = 0;
1832 		}
1833 		bufend += bytes;
1834 		bufsize -= bytes;
1835 	} while (bytes > 0 && bufsize > 0);
1836 
1837 
1838 	if (begin >= bufend) {
1839 		/* No new lines or incomplete line in buffer */
1840 		file_stat = NO_MORE;
1841 	} else if (bufsize) {
1842 		/* Still space in the buffer, so we have read entire file */
1843 		file_stat = NO_MORE;
1844 	} else {
1845 		/* We filled entire buffer, so there may be more yet to read */
1846 		file_stat = MORE;
1847 	}
1848 		/* Note: bufend is 1 past last good char */
1849 	*bufend = NL;	/* Sentinel new-line character */
1850 	/* Set newline to character preceding the current line */
1851 	cur_ptr->newline = begin - 1;
1852 }
1853 
1854 static void
get_line(LINE * cur_ptr,uchar_t * s)1855 get_line(LINE *cur_ptr, uchar_t *s)
1856 {
1857 	while (*s++ != NL) {
1858 	}
1859 	cur_ptr->newline = --s;
1860 	cur_ptr->ln++;
1861 }
1862 
1863 static void
get_ncount(LINE * cur_ptr,uchar_t * s)1864 get_ncount(LINE *cur_ptr, uchar_t *s)
1865 {
1866 	uchar_t	*p = cur_ptr->prntbuf;
1867 
1868 	while (*--s != NL) {
1869 	}
1870 	cur_ptr->newline = s++;
1871 	while ((s > p) &&
1872 	    (p = (uchar_t *)memchr((char *)p, NL, s - p)) != NULL) {
1873 		cur_ptr->ln++;
1874 		++p;
1875 	}
1876 	cur_ptr->ln--;
1877 	cur_ptr->prntbuf = s;
1878 }
1879