1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright (c) 1997, by Sun Microsystems, Inc.
24 * All rights reserved.
25 */
26
27/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28/*	  All Rights Reserved  	*/
29
30
31/* 	Portions Copyright(c) 1988, Sun Microsystems Inc.	*/
32/*	All Rights Reserved					*/
33
34#pragma ident	"%Z%%M%	%I%	%E% SMI"	/* SVr4.0 1.1	*/
35
36/*LINTLIBRARY*/
37
38/*
39 * routines to do regular expression matching
40 *
41 * Entry points:
42 *
43 *	re_comp(s)
44 *		char *s;
45 *	 ... returns 0 if the string s was compiled successfully,
46 *		     a pointer to an error message otherwise.
47 *	     If passed 0 or a null string returns without changing
48 *           the currently compiled re (see note 11 below).
49 *
50 *	re_exec(s)
51 *		char *s;
52 *	 ... returns 1 if the string s matches the last compiled regular
53 *		       expression,
54 *		     0 if the string s failed to match the last compiled
55 *		       regular expression, and
56 *		    -1 if the compiled regular expression was invalid
57 *		       (indicating an internal error).
58 *
59 * The strings passed to both re_comp and re_exec may have trailing or
60 * embedded newline characters; they are terminated by nulls.
61 *
62 * The identity of the author of these routines is lost in antiquity;
63 * this is essentially the same as the re code in the original V6 ed.
64 *
65 * The regular expressions recognized are described below. This description
66 * is essentially the same as that for ed.
67 *
68 *	A regular expression specifies a set of strings of characters.
69 *	A member of this set of strings is said to be matched by
70 *	the regular expression.  In the following specification for
71 *	regular expressions the word `character' means any character but NUL.
72 *
73 *	1.  Any character except a special character matches itself.
74 *	    Special characters are the regular expression delimiter plus
75 *	    \ [ . and sometimes ^ * $.
76 *	2.  A . matches any character.
77 *	3.  A \ followed by any character except a digit or ( )
78 *	    matches that character.
79 *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
80 *	    character in (or not in) s. In s, \ has no special meaning,
81 *	    and ] may only appear as the first letter. A substring
82 *	    a-b, with a and b in ascending ASCII order, stands for
83 *	    the inclusive range of ASCII characters.
84 *	5.  A regular expression of form 1-4 followed by * matches a
85 *	    sequence of 0 or more matches of the regular expression.
86 *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
87 *	    matches what x matches.
88 *	7.  A \ followed by a digit n matches a copy of the string that the
89 *	    bracketed regular expression beginning with the nth \( matched.
90 *	8.  A regular expression of form 1-8, x, followed by a regular
91 *	    expression of form 1-7, y matches a match for x followed by
92 *	    a match for y, with the x match being as long as possible
93 *	    while still permitting a y match.
94 *	9.  A regular expression of form 1-8 preceded by ^ (or followed
95 *	    by $), is constrained to matches that begin at the left
96 *	    (or end at the right) end of a line.
97 *	10. A regular expression of form 1-9 picks out the longest among
98 *	    the leftmost matches in a line.
99 *	11. An empty regular expression stands for a copy of the last
100 *	    regular expression encountered.
101 */
102
103#include <sys/types.h>
104#include <stdlib.h>
105#include <stddef.h>
106
107/*
108 * constants for re's
109 */
110#define	CBRA	1
111#define	CCHR	2
112#define	CDOT	4
113#define	CCL	6
114#define	NCCL	8
115#define	CDOL	10
116#define	CEOF	11
117#define	CKET	12
118#define	CBACK	18
119
120#define	CSTAR	01
121
122#define	ESIZE	512
123#define	NBRA	9
124
125static struct re_globals {
126	char	_expbuf[ESIZE];
127	char	*_braslist[NBRA], *_braelist[NBRA];
128	char	_circf;
129} *re_globals;
130#define	expbuf (_re->_expbuf)
131#define	braslist (_re->_braslist)
132#define	braelist (_re->_braelist)
133#define	circf (_re->_circf)
134
135/*
136 * forward declarations
137 */
138static int backref(int, char *);
139static int advance(char *, char *);
140static int cclass(char *, char, int);
141
142/*
143 * compile the regular expression argument into a dfa
144 */
145char *
146re_comp(char *sp)
147{
148	char	c;
149	struct re_globals *_re = re_globals;
150	char	*ep;
151	int	cclcnt, numbra = 0;
152	char	*lastep = 0;
153	char	bracket[NBRA];
154	char	*bracketp = &bracket[0];
155	char	*retoolong = "Regular expression too long";
156
157	if (_re == 0) {
158		_re = (struct re_globals *)calloc(1, sizeof (*_re));
159		if (_re == 0)
160			return ("Out of memory");
161		re_globals = _re;
162	}
163	ep = expbuf;
164
165#define	comerr(msg) {expbuf[0] = 0; return (msg); }
166
167	if (sp == 0 || *sp == '\0') {
168		if (*ep == 0)
169			return ("No previous regular expression");
170		return (0);
171	}
172	if (*sp == '^') {
173		circf = 1;
174		sp++;
175	}
176	else
177		circf = 0;
178	for (;;) {
179		if (ep >= &expbuf[ESIZE])
180			comerr(retoolong);
181		if ((c = *sp++) == '\0') {
182			if (bracketp != bracket)
183				comerr("unmatched \\(");
184			*ep++ = CEOF;
185			*ep++ = 0;
186			return (0);
187		}
188		if (c != '*')
189			lastep = ep;
190		switch (c) {
191
192		case '.':
193			*ep++ = CDOT;
194			continue;
195
196		case '*':
197			if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
198				goto defchar;
199			*lastep |= CSTAR;
200			continue;
201
202		case '$':
203			if (*sp != '\0')
204				goto defchar;
205			*ep++ = CDOL;
206			continue;
207
208		case '[':
209			*ep++ = CCL;
210			*ep++ = 0;
211			cclcnt = 1;
212			if ((c = *sp++) == '^') {
213				c = *sp++;
214				ep[-2] = NCCL;
215			}
216			do {
217				if (c == '\0')
218					comerr("missing ]");
219				if (c == '-' && ep [-1] != 0) {
220					if ((c = *sp++) == ']') {
221						*ep++ = '-';
222						cclcnt++;
223						break;
224					}
225					while (ep[-1] < c) {
226						*ep = ep[-1] + 1;
227						ep++;
228						cclcnt++;
229						if (ep >= &expbuf[ESIZE])
230							comerr(retoolong);
231					}
232				}
233				*ep++ = c;
234				cclcnt++;
235				if (ep >= &expbuf[ESIZE])
236					comerr(retoolong);
237			} while ((c = *sp++) != ']');
238			lastep[1] = (char)cclcnt;
239			continue;
240
241		case '\\':
242			if ((c = *sp++) == '(') {
243				if (numbra >= NBRA)
244					comerr("too many \\(\\) pairs");
245				*bracketp++ = (char)numbra;
246				*ep++ = CBRA;
247				*ep++ = numbra++;
248				continue;
249			}
250			if (c == ')') {
251				if (bracketp <= bracket)
252					comerr("unmatched \\)");
253				*ep++ = CKET;
254				*ep++ = *--bracketp;
255				continue;
256			}
257			if (c >= '1' && c < ('1' + NBRA)) {
258				*ep++ = CBACK;
259				*ep++ = c - '1';
260				continue;
261			}
262			*ep++ = CCHR;
263			*ep++ = c;
264			continue;
265
266		defchar:
267		default:
268			*ep++ = CCHR;
269			*ep++ = c;
270		}
271	}
272}
273
274/*
275 * match the argument string against the compiled re
276 */
277int
278re_exec(char *p1)
279{
280	struct re_globals *_re = re_globals;
281	char	*p2;
282	int	c;
283	int	rv;
284
285	if (_re == 0)
286		return (0);
287	p2 = expbuf;
288	for (c = 0; c < NBRA; c++) {
289		braslist[c] = 0;
290		braelist[c] = 0;
291	}
292	if (circf)
293		return ((advance(p1, p2)));
294	/*
295	 * fast check for first character
296	 */
297	if (*p2 == CCHR) {
298		c = p2[1];
299		do {
300			if (*p1 != c)
301				continue;
302			if (rv = advance(p1, p2))
303				return (rv);
304		} while (*p1++);
305		return (0);
306	}
307	/*
308	 * regular algorithm
309	 */
310	do
311		if (rv = advance(p1, p2))
312			return (rv);
313	while (*p1++);
314	return (0);
315}
316
317/*
318 * try to match the next thing in the dfa
319 */
320static int
321advance(char *lp, char *ep)
322{
323	char	*curlp;
324	int	i;
325	ptrdiff_t	ct;
326	int	rv;
327	struct re_globals *_re = re_globals;
328
329	for (;;)
330		switch (*ep++) {
331
332		case CCHR:
333			if (*ep++ == *lp++)
334				continue;
335			return (0);
336
337		case CDOT:
338			if (*lp++)
339				continue;
340			return (0);
341
342		case CDOL:
343			if (*lp == '\0')
344				continue;
345			return (0);
346
347		case CEOF:
348			return (1);
349
350		case CCL:
351			if (cclass(ep, *lp++, 1)) {
352				ep += *ep;
353				continue;
354			}
355			return (0);
356
357		case NCCL:
358			if (cclass(ep, *lp++, 0)) {
359				ep += *ep;
360				continue;
361			}
362			return (0);
363
364		case CBRA:
365			braslist[*ep++] = lp;
366			continue;
367
368		case CKET:
369			braelist[*ep++] = lp;
370			continue;
371
372		case CBACK:
373			if (braelist[i = *ep++] == 0)
374				return (-1);
375			if (backref(i, lp)) {
376				lp += braelist[i] - braslist[i];
377				continue;
378			}
379			return (0);
380
381		case CBACK|CSTAR:
382			if (braelist[i = *ep++] == 0)
383				return (-1);
384			curlp = lp;
385			ct = braelist[i] - braslist[i];
386			while (backref(i, lp))
387				lp += ct;
388			while (lp >= curlp) {
389				if (rv = advance(lp, ep))
390					return (rv);
391				lp -= ct;
392			}
393			continue;
394
395		case CDOT|CSTAR:
396			curlp = lp;
397			while (*lp++)
398				;
399			goto star;
400
401		case CCHR|CSTAR:
402			curlp = lp;
403			while (*lp++ == *ep)
404				;
405			ep++;
406			goto star;
407
408		case CCL|CSTAR:
409		case NCCL|CSTAR:
410			curlp = lp;
411			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
412				;
413			ep += *ep;
414			goto star;
415
416		star:
417			do {
418				lp--;
419				if (rv = advance(lp, ep))
420					return (rv);
421			} while (lp > curlp);
422			return (0);
423
424		default:
425			return (-1);
426		}
427}
428
429static int
430backref(int i, char *lp)
431{
432	char	*bp;
433	struct re_globals *_re = re_globals;
434
435	bp = braslist[i];
436	while (*bp++ == *lp++)
437		if (bp >= braelist[i])
438			return (1);
439	return (0);
440}
441
442static int
443cclass(char *set, char c, int af)
444{
445	int	n;
446
447	if (c == 0)
448		return (0);
449	n = *set++;
450	while (--n)
451		if (*set++ == c)
452			return (af);
453	return (! af);
454}
455