xref: /illumos-gate/usr/src/ucblib/libucb/port/gen/regex.c (revision 7c478bd95313f5f23a4c958a745db2134aa03244)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (c) 1997, by Sun Microsystems, Inc.
24  * All rights reserved.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 
31 /* 	Portions Copyright(c) 1988, Sun Microsystems Inc.	*/
32 /*	All Rights Reserved					*/
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"	/* SVr4.0 1.1	*/
35 
36 /*LINTLIBRARY*/
37 
38 /*
39  * routines to do regular expression matching
40  *
41  * Entry points:
42  *
43  *	re_comp(s)
44  *		char *s;
45  *	 ... returns 0 if the string s was compiled successfully,
46  *		     a pointer to an error message otherwise.
47  *	     If passed 0 or a null string returns without changing
48  *           the currently compiled re (see note 11 below).
49  *
50  *	re_exec(s)
51  *		char *s;
52  *	 ... returns 1 if the string s matches the last compiled regular
53  *		       expression,
54  *		     0 if the string s failed to match the last compiled
55  *		       regular expression, and
56  *		    -1 if the compiled regular expression was invalid
57  *		       (indicating an internal error).
58  *
59  * The strings passed to both re_comp and re_exec may have trailing or
60  * embedded newline characters; they are terminated by nulls.
61  *
62  * The identity of the author of these routines is lost in antiquity;
63  * this is essentially the same as the re code in the original V6 ed.
64  *
65  * The regular expressions recognized are described below. This description
66  * is essentially the same as that for ed.
67  *
68  *	A regular expression specifies a set of strings of characters.
69  *	A member of this set of strings is said to be matched by
70  *	the regular expression.  In the following specification for
71  *	regular expressions the word `character' means any character but NUL.
72  *
73  *	1.  Any character except a special character matches itself.
74  *	    Special characters are the regular expression delimiter plus
75  *	    \ [ . and sometimes ^ * $.
76  *	2.  A . matches any character.
77  *	3.  A \ followed by any character except a digit or ( )
78  *	    matches that character.
79  *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
80  *	    character in (or not in) s. In s, \ has no special meaning,
81  *	    and ] may only appear as the first letter. A substring
82  *	    a-b, with a and b in ascending ASCII order, stands for
83  *	    the inclusive range of ASCII characters.
84  *	5.  A regular expression of form 1-4 followed by * matches a
85  *	    sequence of 0 or more matches of the regular expression.
86  *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
87  *	    matches what x matches.
88  *	7.  A \ followed by a digit n matches a copy of the string that the
89  *	    bracketed regular expression beginning with the nth \( matched.
90  *	8.  A regular expression of form 1-8, x, followed by a regular
91  *	    expression of form 1-7, y matches a match for x followed by
92  *	    a match for y, with the x match being as long as possible
93  *	    while still permitting a y match.
94  *	9.  A regular expression of form 1-8 preceded by ^ (or followed
95  *	    by $), is constrained to matches that begin at the left
96  *	    (or end at the right) end of a line.
97  *	10. A regular expression of form 1-9 picks out the longest among
98  *	    the leftmost matches in a line.
99  *	11. An empty regular expression stands for a copy of the last
100  *	    regular expression encountered.
101  */
102 
103 #include <sys/types.h>
104 #include <stdlib.h>
105 #include <stddef.h>
106 
107 /*
108  * constants for re's
109  */
110 #define	CBRA	1
111 #define	CCHR	2
112 #define	CDOT	4
113 #define	CCL	6
114 #define	NCCL	8
115 #define	CDOL	10
116 #define	CEOF	11
117 #define	CKET	12
118 #define	CBACK	18
119 
120 #define	CSTAR	01
121 
122 #define	ESIZE	512
123 #define	NBRA	9
124 
125 static struct re_globals {
126 	char	_expbuf[ESIZE];
127 	char	*_braslist[NBRA], *_braelist[NBRA];
128 	char	_circf;
129 } *re_globals;
130 #define	expbuf (_re->_expbuf)
131 #define	braslist (_re->_braslist)
132 #define	braelist (_re->_braelist)
133 #define	circf (_re->_circf)
134 
135 /*
136  * forward declarations
137  */
138 static int backref(int, char *);
139 static int advance(char *, char *);
140 static int cclass(char *, char, int);
141 
142 /*
143  * compile the regular expression argument into a dfa
144  */
145 char *
146 re_comp(char *sp)
147 {
148 	char	c;
149 	struct re_globals *_re = re_globals;
150 	char	*ep;
151 	int	cclcnt, numbra = 0;
152 	char	*lastep = 0;
153 	char	bracket[NBRA];
154 	char	*bracketp = &bracket[0];
155 	char	*retoolong = "Regular expression too long";
156 
157 	if (_re == 0) {
158 		_re = (struct re_globals *)calloc(1, sizeof (*_re));
159 		if (_re == 0)
160 			return ("Out of memory");
161 		re_globals = _re;
162 	}
163 	ep = expbuf;
164 
165 #define	comerr(msg) {expbuf[0] = 0; return (msg); }
166 
167 	if (sp == 0 || *sp == '\0') {
168 		if (*ep == 0)
169 			return ("No previous regular expression");
170 		return (0);
171 	}
172 	if (*sp == '^') {
173 		circf = 1;
174 		sp++;
175 	}
176 	else
177 		circf = 0;
178 	for (;;) {
179 		if (ep >= &expbuf[ESIZE])
180 			comerr(retoolong);
181 		if ((c = *sp++) == '\0') {
182 			if (bracketp != bracket)
183 				comerr("unmatched \\(");
184 			*ep++ = CEOF;
185 			*ep++ = 0;
186 			return (0);
187 		}
188 		if (c != '*')
189 			lastep = ep;
190 		switch (c) {
191 
192 		case '.':
193 			*ep++ = CDOT;
194 			continue;
195 
196 		case '*':
197 			if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
198 				goto defchar;
199 			*lastep |= CSTAR;
200 			continue;
201 
202 		case '$':
203 			if (*sp != '\0')
204 				goto defchar;
205 			*ep++ = CDOL;
206 			continue;
207 
208 		case '[':
209 			*ep++ = CCL;
210 			*ep++ = 0;
211 			cclcnt = 1;
212 			if ((c = *sp++) == '^') {
213 				c = *sp++;
214 				ep[-2] = NCCL;
215 			}
216 			do {
217 				if (c == '\0')
218 					comerr("missing ]");
219 				if (c == '-' && ep [-1] != 0) {
220 					if ((c = *sp++) == ']') {
221 						*ep++ = '-';
222 						cclcnt++;
223 						break;
224 					}
225 					while (ep[-1] < c) {
226 						*ep = ep[-1] + 1;
227 						ep++;
228 						cclcnt++;
229 						if (ep >= &expbuf[ESIZE])
230 							comerr(retoolong);
231 					}
232 				}
233 				*ep++ = c;
234 				cclcnt++;
235 				if (ep >= &expbuf[ESIZE])
236 					comerr(retoolong);
237 			} while ((c = *sp++) != ']');
238 			lastep[1] = (char)cclcnt;
239 			continue;
240 
241 		case '\\':
242 			if ((c = *sp++) == '(') {
243 				if (numbra >= NBRA)
244 					comerr("too many \\(\\) pairs");
245 				*bracketp++ = (char)numbra;
246 				*ep++ = CBRA;
247 				*ep++ = numbra++;
248 				continue;
249 			}
250 			if (c == ')') {
251 				if (bracketp <= bracket)
252 					comerr("unmatched \\)");
253 				*ep++ = CKET;
254 				*ep++ = *--bracketp;
255 				continue;
256 			}
257 			if (c >= '1' && c < ('1' + NBRA)) {
258 				*ep++ = CBACK;
259 				*ep++ = c - '1';
260 				continue;
261 			}
262 			*ep++ = CCHR;
263 			*ep++ = c;
264 			continue;
265 
266 		defchar:
267 		default:
268 			*ep++ = CCHR;
269 			*ep++ = c;
270 		}
271 	}
272 }
273 
274 /*
275  * match the argument string against the compiled re
276  */
277 int
278 re_exec(char *p1)
279 {
280 	struct re_globals *_re = re_globals;
281 	char	*p2;
282 	int	c;
283 	int	rv;
284 
285 	if (_re == 0)
286 		return (0);
287 	p2 = expbuf;
288 	for (c = 0; c < NBRA; c++) {
289 		braslist[c] = 0;
290 		braelist[c] = 0;
291 	}
292 	if (circf)
293 		return ((advance(p1, p2)));
294 	/*
295 	 * fast check for first character
296 	 */
297 	if (*p2 == CCHR) {
298 		c = p2[1];
299 		do {
300 			if (*p1 != c)
301 				continue;
302 			if (rv = advance(p1, p2))
303 				return (rv);
304 		} while (*p1++);
305 		return (0);
306 	}
307 	/*
308 	 * regular algorithm
309 	 */
310 	do
311 		if (rv = advance(p1, p2))
312 			return (rv);
313 	while (*p1++);
314 	return (0);
315 }
316 
317 /*
318  * try to match the next thing in the dfa
319  */
320 static int
321 advance(char *lp, char *ep)
322 {
323 	char	*curlp;
324 	int	i;
325 	ptrdiff_t	ct;
326 	int	rv;
327 	struct re_globals *_re = re_globals;
328 
329 	for (;;)
330 		switch (*ep++) {
331 
332 		case CCHR:
333 			if (*ep++ == *lp++)
334 				continue;
335 			return (0);
336 
337 		case CDOT:
338 			if (*lp++)
339 				continue;
340 			return (0);
341 
342 		case CDOL:
343 			if (*lp == '\0')
344 				continue;
345 			return (0);
346 
347 		case CEOF:
348 			return (1);
349 
350 		case CCL:
351 			if (cclass(ep, *lp++, 1)) {
352 				ep += *ep;
353 				continue;
354 			}
355 			return (0);
356 
357 		case NCCL:
358 			if (cclass(ep, *lp++, 0)) {
359 				ep += *ep;
360 				continue;
361 			}
362 			return (0);
363 
364 		case CBRA:
365 			braslist[*ep++] = lp;
366 			continue;
367 
368 		case CKET:
369 			braelist[*ep++] = lp;
370 			continue;
371 
372 		case CBACK:
373 			if (braelist[i = *ep++] == 0)
374 				return (-1);
375 			if (backref(i, lp)) {
376 				lp += braelist[i] - braslist[i];
377 				continue;
378 			}
379 			return (0);
380 
381 		case CBACK|CSTAR:
382 			if (braelist[i = *ep++] == 0)
383 				return (-1);
384 			curlp = lp;
385 			ct = braelist[i] - braslist[i];
386 			while (backref(i, lp))
387 				lp += ct;
388 			while (lp >= curlp) {
389 				if (rv = advance(lp, ep))
390 					return (rv);
391 				lp -= ct;
392 			}
393 			continue;
394 
395 		case CDOT|CSTAR:
396 			curlp = lp;
397 			while (*lp++)
398 				;
399 			goto star;
400 
401 		case CCHR|CSTAR:
402 			curlp = lp;
403 			while (*lp++ == *ep)
404 				;
405 			ep++;
406 			goto star;
407 
408 		case CCL|CSTAR:
409 		case NCCL|CSTAR:
410 			curlp = lp;
411 			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
412 				;
413 			ep += *ep;
414 			goto star;
415 
416 		star:
417 			do {
418 				lp--;
419 				if (rv = advance(lp, ep))
420 					return (rv);
421 			} while (lp > curlp);
422 			return (0);
423 
424 		default:
425 			return (-1);
426 		}
427 }
428 
429 static int
430 backref(int i, char *lp)
431 {
432 	char	*bp;
433 	struct re_globals *_re = re_globals;
434 
435 	bp = braslist[i];
436 	while (*bp++ == *lp++)
437 		if (bp >= braelist[i])
438 			return (1);
439 	return (0);
440 }
441 
442 static int
443 cclass(char *set, char c, int af)
444 {
445 	int	n;
446 
447 	if (c == 0)
448 		return (0);
449 	n = *set++;
450 	while (--n)
451 		if (*set++ == c)
452 			return (af);
453 	return (! af);
454 }
455