1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28/*	  All Rights Reserved  	*/
29
30#pragma ident	"%Z%%M%	%I%	%E% SMI"
31
32/*
33 * routines to do regular expression matching
34 *
35 * Entry points:
36 *
37 *	re_comp(s)
38 *		char *s;
39 *	 ... returns 0 if the string s was compiled successfully,
40 *		     a pointer to an error message otherwise.
41 *	     If passed 0 or a null string returns without changing
42 *           the currently compiled re (see note 11 below).
43 *
44 *	re_exec(s)
45 *		char *s;
46 *	 ... returns 1 if the string s matches the last compiled regular
47 *		       expression,
48 *		     0 if the string s failed to match the last compiled
49 *		       regular expression, and
50 *		    -1 if the compiled regular expression was invalid
51 *		       (indicating an internal error).
52 *
53 * The strings passed to both re_comp and re_exec may have trailing or
54 * embedded newline characters; they are terminated by nulls.
55 *
56 * The identity of the author of these routines is lost in antiquity;
57 * this is essentially the same as the re code in the original V6 ed.
58 *
59 * The regular expressions recognized are described below. This description
60 * is essentially the same as that for ed.
61 *
62 *	A regular expression specifies a set of strings of characters.
63 *	A member of this set of strings is said to be matched by
64 *	the regular expression.  In the following specification for
65 *	regular expressions the word `character' means any character but NUL.
66 *
67 *	1.  Any character except a special character matches itself.
68 *	    Special characters are the regular expression delimiter plus
69 *	    \ [ . and sometimes ^ * $.
70 *	2.  A . matches any character.
71 *	3.  A \ followed by any character except a digit or ( )
72 *	    matches that character.
73 *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
74 *	    character in (or not in) s. In s, \ has no special meaning,
75 *	    and ] may only appear as the first letter. A substring
76 *	    a-b, with a and b in ascending ASCII order, stands for
77 *	    the inclusive range of ASCII characters.
78 *	5.  A regular expression of form 1-4 followed by * matches a
79 *	    sequence of 0 or more matches of the regular expression.
80 *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
81 *	    matches what x matches.
82 *	7.  A \ followed by a digit n matches a copy of the string that the
83 *	    bracketed regular expression beginning with the nth \( matched.
84 *	8.  A regular expression of form 1-8, x, followed by a regular
85 *	    expression of form 1-7, y matches a match for x followed by
86 *	    a match for y, with the x match being as long as possible
87 *	    while still permitting a y match.
88 *	9.  A regular expression of form 1-8 preceded by ^ (or followed
89 *	    by $), is constrained to matches that begin at the left
90 *	    (or end at the right) end of a line.
91 *	10. A regular expression of form 1-9 picks out the longest among
92 *	    the leftmost matches in a line.
93 *	11. An empty regular expression stands for a copy of the last
94 *	    regular expression encountered.
95 */
96
97#include "lint.h"
98
99#include <stdlib.h>
100#include <re_comp.h>
101#include <stddef.h>
102#include <sys/types.h>
103
104/*
105 * constants for re's
106 */
107#define	CBRA	1
108#define	CCHR	2
109#define	CDOT	4
110#define	CCL	6
111#define	NCCL	8
112#define	CDOL	10
113#define	CEOF	11
114#define	CKET	12
115#define	CBACK	18
116
117#define	CSTAR	01
118
119#define	ESIZE	512
120#define	NBRA	9
121
122static struct re_globals {
123	char	_expbuf[ESIZE];
124	char	*_braslist[NBRA], *_braelist[NBRA];
125	char	_circf;
126} *re_globals;
127#define	expbuf (_re->_expbuf)
128#define	braslist (_re->_braslist)
129#define	braelist (_re->_braelist)
130#define	circf (_re->_circf)
131
132static int advance(const char *, char *);
133static int backref(int, const char *);
134static int cclass(char *, char, int);
135
136/*
137 * compile the regular expression argument into a dfa
138 */
139char *
140re_comp(const char *sp)
141{
142	char	c;
143	struct re_globals *_re = re_globals;
144	char	*ep;
145	char	cclcnt, numbra = 0;
146	char	*lastep = NULL;
147	char	bracket[NBRA];
148	char	*bracketp = &bracket[0];
149	char	*retoolong = "Regular expression too long";
150
151	if (_re == NULL) {
152		_re = (struct re_globals *)calloc(1, sizeof (*_re));
153		if (_re == NULL)
154			return ("Out of memory");
155		re_globals = _re;
156	}
157	ep = expbuf;
158
159#define	comerr(msg) {expbuf[0] = 0; return (msg); }
160
161	if (sp == NULL || *sp == '\0') {
162		if (*ep == 0)
163			return ("No previous regular expression");
164		return (NULL);
165	}
166	if (*sp == '^') {
167		circf = 1;
168		sp++;
169	}
170	else
171		circf = 0;
172	for (;;) {
173		if (ep >= &expbuf[ESIZE])
174			comerr(retoolong);
175		if ((c = *sp++) == '\0') {
176			if (bracketp != bracket)
177				comerr("unmatched \\(");
178			*ep++ = CEOF;
179			*ep++ = 0;
180			return (NULL);
181		}
182		if (c != '*')
183			lastep = ep;
184		switch (c) {
185
186		case '.':
187			*ep++ = CDOT;
188			continue;
189
190		case '*':
191			if (lastep == NULL || *lastep == CBRA ||
192			    *lastep == CKET)
193				goto defchar;
194			*lastep |= CSTAR;
195			continue;
196
197		case '$':
198			if (*sp != '\0')
199				goto defchar;
200			*ep++ = CDOL;
201			continue;
202
203		case '[':
204			*ep++ = CCL;
205			*ep++ = 0;
206			cclcnt = 1;
207			if ((c = *sp++) == '^') {
208				c = *sp++;
209				ep[-2] = NCCL;
210			}
211			do {
212				if (c == '\0')
213					comerr("missing ]");
214				if (c == '-' && ep [-1] != 0) {
215					if ((c = *sp++) == ']') {
216						*ep++ = '-';
217						cclcnt++;
218						break;
219					}
220					while (ep[-1] < c) {
221						*ep = ep[-1] + 1;
222						ep++;
223						cclcnt++;
224						if (ep >= &expbuf[ESIZE])
225							comerr(retoolong);
226					}
227				}
228				*ep++ = c;
229				cclcnt++;
230				if (ep >= &expbuf[ESIZE])
231					comerr(retoolong);
232			} while ((c = *sp++) != ']');
233			lastep[1] = cclcnt;
234			continue;
235
236		case '\\':
237			if ((c = *sp++) == '(') {
238				if (numbra >= NBRA)
239					comerr("too many \\(\\) pairs");
240				*bracketp++ = numbra;
241				*ep++ = CBRA;
242				*ep++ = numbra++;
243				continue;
244			}
245			if (c == ')') {
246				if (bracketp <= bracket)
247					comerr("unmatched \\)");
248				*ep++ = CKET;
249				*ep++ = *--bracketp;
250				continue;
251			}
252			if (c >= '1' && c < ('1' + NBRA)) {
253				*ep++ = CBACK;
254				*ep++ = c - '1';
255				continue;
256			}
257			*ep++ = CCHR;
258			*ep++ = c;
259			continue;
260
261		defchar:
262		default:
263			*ep++ = CCHR;
264			*ep++ = c;
265		}
266	}
267}
268
269/*
270 * match the argument string against the compiled re
271 */
272int
273re_exec(const char *p1)
274{
275	struct re_globals *_re = re_globals;
276	char	*p2;
277	int	c;
278	int	rv;
279
280	if (_re == NULL)
281		return (0);
282	p2 = expbuf;
283	for (c = 0; c < NBRA; c++) {
284		braslist[c] = 0;
285		braelist[c] = 0;
286	}
287	if (circf)
288		return ((advance(p1, p2)));
289	/*
290	 * fast check for first character
291	 */
292	if (*p2 == CCHR) {
293		c = p2[1];
294		do {
295			if (*p1 != c)
296				continue;
297			if (rv = advance(p1, p2))
298				return (rv);
299		} while (*p1++);
300		return (0);
301	}
302	/*
303	 * regular algorithm
304	 */
305	do {
306		if (rv = advance(p1, p2))
307			return (rv);
308	} while (*p1++);
309	return (0);
310}
311
312/*
313 * try to match the next thing in the dfa
314 */
315static int
316advance(const char *lp, char *ep)
317{
318	const char	*curlp;
319	ptrdiff_t	ct;
320	int		i;
321	int		rv;
322	struct re_globals *_re = re_globals;
323
324	for (;;)
325		switch (*ep++) {
326
327		case CCHR:
328			if (*ep++ == *lp++)
329				continue;
330			return (0);
331
332		case CDOT:
333			if (*lp++)
334				continue;
335			return (0);
336
337		case CDOL:
338			if (*lp == '\0')
339				continue;
340			return (0);
341
342		case CEOF:
343			return (1);
344
345		case CCL:
346			if (cclass(ep, *lp++, 1)) {
347				ep += *ep;
348				continue;
349			}
350			return (0);
351
352		case NCCL:
353			if (cclass(ep, *lp++, 0)) {
354				ep += *ep;
355				continue;
356			}
357			return (0);
358
359		case CBRA:
360			braslist[*ep++] = (char *)lp;
361			continue;
362
363		case CKET:
364			braelist[*ep++] = (char *)lp;
365			continue;
366
367		case CBACK:
368			if (braelist[i = *ep++] == NULL)
369				return (-1);
370			if (backref(i, lp)) {
371				lp += braelist[i] - braslist[i];
372				continue;
373			}
374			return (0);
375
376		case CBACK|CSTAR:
377			if (braelist[i = *ep++] == NULL)
378				return (-1);
379			curlp = lp;
380			ct = braelist[i] - braslist[i];
381			while (backref(i, lp))
382				lp += ct;
383			while (lp >= curlp) {
384				if (rv = advance(lp, ep))
385					return (rv);
386				lp -= ct;
387			}
388			continue;
389
390		case CDOT|CSTAR:
391			curlp = lp;
392			while (*lp++)
393				;
394			goto star;
395
396		case CCHR|CSTAR:
397			curlp = lp;
398			while (*lp++ == *ep)
399				;
400			ep++;
401			goto star;
402
403		case CCL|CSTAR:
404		case NCCL|CSTAR:
405			curlp = lp;
406			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
407				;
408			ep += *ep;
409			goto star;
410
411		star:
412			do {
413				lp--;
414				if (rv = advance(lp, ep))
415					return (rv);
416			} while (lp > curlp);
417			return (0);
418
419		default:
420			return (-1);
421		}
422}
423
424static int
425backref(int i, const char *lp)
426{
427	char	*bp;
428	struct re_globals *_re = re_globals;
429
430	bp = braslist[i];
431	while (*bp++ == *lp++)
432		if (bp >= braelist[i])
433			return (1);
434	return (0);
435}
436
437static int
438cclass(char *set, char c, int af)
439{
440	int	n;
441
442	if (c == 0)
443		return (0);
444	n = *set++;
445	while (--n)
446		if (*set++ == c)
447			return (af);
448	return (! af);
449}
450