xref: /illumos-gate/usr/src/lib/libc/port/regex/regcomp.c (revision 14247eb6)
1 /*
2  * Copyright 2023 Bill Sommerfeld <sommerfeld@hamachi.org>
3  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
4  * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
5  * Copyright 2012 Milan Jurik. All rights reserved.
6  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
7  * Copyright (c) 1992, 1993, 1994
8  *	The Regents of the University of California.  All rights reserved.
9  *
10  * This code is derived from software contributed to Berkeley by
11  * Henry Spencer.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 
38 #include "lint.h"
39 #include "file64.h"
40 #include <sys/types.h>
41 #include <stdio.h>
42 #include <string.h>
43 #include <ctype.h>
44 #include <limits.h>
45 #include <regex.h>
46 #include <stdlib.h>
47 #include <stdbool.h>
48 #include <wchar.h>
49 #include <wctype.h>
50 
51 #include "../locale/runetype.h"
52 #include "../locale/collate.h"
53 
54 #include "utils.h"
55 #include "regex2.h"
56 
57 #include "cname.h"
58 #include "../locale/mblocal.h"
59 
60 /*
61  * Branching context, used to keep track of branch state for all of the branch-
62  * aware functions. In addition to keeping track of branch positions for the
63  * p_branch_* functions, we use this to simplify some clumsiness in BREs for
64  * detection of whether ^ is acting as an anchor or being used erroneously and
65  * also for whether we're in a sub-expression or not.
66  */
67 struct branchc {
68 	sopno start;
69 	sopno back;
70 	sopno fwd;
71 
72 	int nbranch;
73 	int nchain;
74 	bool outer;
75 	bool terminate;
76 };
77 
78 /*
79  * parse structure, passed up and down to avoid global variables and
80  * other clumsinesses
81  */
82 struct parse {
83 	const char *next;	/* next character in RE */
84 	const char *end;	/* end of string (-> NUL normally) */
85 	int error;		/* has an error been seen? */
86 	sop *strip;		/* malloced strip */
87 	sopno ssize;		/* malloced strip size (allocated) */
88 	sopno slen;		/* malloced strip length (used) */
89 	int ncsalloc;		/* number of csets allocated */
90 	wint_t nc;		/* size of small-char bitmap in cset */
91 	struct re_guts *g;
92 #define	NPAREN	10		/* we need to remember () 1-9 for back refs */
93 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
94 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
95 	bool allowbranch;	/* can this expression branch? */
96 	bool bre;		/* convenience; is this a BRE? */
97 	bool (*parse_expr)(struct parse *, struct branchc *);
98 	void (*pre_parse)(struct parse *, struct branchc *);
99 	void (*post_parse)(struct parse *, struct branchc *);
100 };
101 
102 /* ========= begin header generated by ./mkh ========= */
103 #ifdef __cplusplus
104 extern "C" {
105 #endif
106 
107 /* === regcomp.c === */
108 static bool p_ere_exp(struct parse *p, struct branchc *bc);
109 static void p_str(struct parse *p);
110 static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
111 static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
112 static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
113 static bool p_branch_empty(struct parse *p, struct branchc *bc);
114 static bool p_branch_do(struct parse *p, struct branchc *bc);
115 static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
116 static void p_bre_post_parse(struct parse *p, struct branchc *bc);
117 static void p_re(struct parse *p, int end1, int end2);
118 static bool p_simp_re(struct parse *p, struct branchc *bc);
119 static int p_count(struct parse *p);
120 static void p_bracket(struct parse *p);
121 static void p_b_term(struct parse *p, cset *cs);
122 static void p_b_cclass(struct parse *p, cset *cs);
123 static void p_b_eclass(struct parse *p, cset *cs);
124 static wint_t p_b_symbol(struct parse *p);
125 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
126 static wint_t othercase(wint_t ch);
127 static void bothcases(struct parse *p, wint_t ch);
128 static void ordinary(struct parse *p, wint_t ch);
129 static void nonnewline(struct parse *p);
130 static void repeat(struct parse *p, sopno start, int from, int to);
131 static int seterr(struct parse *p, int e);
132 static cset *allocset(struct parse *p);
133 static void freeset(struct parse *p, cset *cs);
134 static void CHadd(struct parse *p, cset *cs, wint_t ch);
135 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
136 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
137 static wint_t singleton(struct parse *p, cset *cs);
138 static sopno dupl(struct parse *p, sopno start, sopno finish);
139 static void doemit(struct parse *p, sop op, size_t opnd);
140 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
141 static void dofwd(struct parse *p, sopno pos, sop value);
142 static int enlarge(struct parse *p, sopno size);
143 static void stripsnug(struct parse *p, struct re_guts *g);
144 static void findmust(struct parse *p, struct re_guts *g);
145 static int altoffset(sop *scan, int offset);
146 static void computejumps(struct parse *p, struct re_guts *g);
147 static void computematchjumps(struct parse *p, struct re_guts *g);
148 static sopno pluscount(struct parse *p, struct re_guts *g);
149 static wint_t wgetnext(struct parse *p);
150 
151 #ifdef __cplusplus
152 }
153 #endif
154 /* ========= end header generated by ./mkh ========= */
155 
156 static char nuls[10];		/* place to point scanner in event of error */
157 
158 /*
159  * macros for use with parse structure
160  * BEWARE:  these know that the parse structure is named `p' !!!
161  */
162 #define	PEEK()	(*p->next)
163 #define	PEEK2()	(*(p->next+1))
164 #define	MORE()	(p->next < p->end)
165 #define	MORE2()	(p->next+1 < p->end)
166 #define	SEE(c)	(MORE() && PEEK() == (c))
167 #define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
168 #define	SEESPEC(a)	(p->bre ? SEETWO('\\', a) : SEE(a))
169 #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
170 #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
171 #define	NEXT()	(p->next++)
172 #define	NEXT2()	(p->next += 2)
173 #define	NEXTn(n)	(p->next += (n))
174 #define	GETNEXT()	(*p->next++)
175 #define	WGETNEXT()	wgetnext(p)
176 #define	SETERROR(e)	((void)seterr(p, (e)))
177 #define	REQUIRE(co, e)	((co) || seterr(p, e))
178 #define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
179 #define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
180 #define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
181 #define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
182 #define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
183 #define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
184 #define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
185 #define	HERE()		(p->slen)
186 #define	THERE()		(p->slen - 1)
187 #define	THERETHERE()	(p->slen - 2)
188 #define	DROP(n)	(p->slen -= (n))
189 
190 #ifndef NDEBUG
191 static int never = 0;		/* for use in asserts; shuts lint up */
192 #else
193 #define	never	0		/* some <assert.h>s have bugs too */
194 #endif
195 
196 /*
197  * regcomp - interface for parser and compilation
198  */
199 int				/* 0 success, otherwise REG_something */
regcomp(regex_t * _RESTRICT_KYWD preg,const char * _RESTRICT_KYWD pattern,int cflags)200 regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern,
201     int cflags)
202 {
203 	struct parse pa;
204 	struct re_guts *g;
205 	struct parse *p = &pa;
206 	int i;
207 	size_t len;
208 	size_t maxlen;
209 #ifdef REDEBUG
210 #define	GOODFLAGS(f)	(f)
211 #else
212 #define	GOODFLAGS(f)	((f)&~REG_DUMP)
213 #endif
214 
215 	/*
216 	 * Which character values are kept in cset bitmaps?
217 	 *
218 	 * Character sets store their members as a bitmap (for low-codepoint
219 	 * characters) or as elements of an array; pa.nc sets the dividing
220 	 * point between them.
221 	 *
222 	 * The value of MB_CUR_MAX depends on the current locale; fetching the
223 	 * current locale can be expensive, so we pick a value now and stick
224 	 * with it for the lifetime of the compiled regex.
225 	 */
226 	pa.nc = ((MB_CUR_MAX) == 1 ? (NC_MAX) : (NC_WIDE));
227 
228 	/* We had REG_INVARG, but we don't have that on Solaris. */
229 	cflags = GOODFLAGS(cflags);
230 	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
231 		return (REG_EFATAL);
232 
233 	if (cflags&REG_PEND) {
234 		if (preg->re_endp < pattern)
235 			return (REG_EFATAL);
236 		len = preg->re_endp - pattern;
237 	} else
238 		len = strlen(pattern);
239 
240 	/* do the mallocs early so failure handling is easy */
241 	g = (struct re_guts *)malloc(sizeof (struct re_guts));
242 	if (g == NULL)
243 		return (REG_ESPACE);
244 	g->mb_cur_max = MB_CUR_MAX;
245 	/*
246 	 * Limit the pattern space to avoid a 32-bit overflow on buffer
247 	 * extension.  Also avoid any signed overflow in case of conversion
248 	 * so make the real limit based on a 31-bit overflow.
249 	 *
250 	 * Likely not applicable on 64-bit systems but handle the case
251 	 * generically (who are we to stop people from using ~715MB+
252 	 * patterns?).
253 	 */
254 	maxlen = ((size_t)-1 >> 1) / sizeof (sop) * 2 / 3;
255 	if (len >= maxlen) {
256 		free((char *)g);
257 		return (REG_ESPACE);
258 	}
259 	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
260 	assert(p->ssize >= len);
261 
262 	p->strip = (sop *)malloc(p->ssize * sizeof (sop));
263 	p->slen = 0;
264 	if (p->strip == NULL) {
265 		free((char *)g);
266 		return (REG_ESPACE);
267 	}
268 
269 	/* set things up */
270 	p->g = g;
271 	p->next = pattern;	/* convenience; we do not modify it */
272 	p->end = p->next + len;
273 	p->error = 0;
274 	p->ncsalloc = 0;
275 	for (i = 0; i < NPAREN; i++) {
276 		p->pbegin[i] = 0;
277 		p->pend[i] = 0;
278 	}
279 	if (cflags & REG_EXTENDED) {
280 		p->allowbranch = true;
281 		p->bre = false;
282 		p->parse_expr = p_ere_exp;
283 		p->pre_parse = NULL;
284 		p->post_parse = NULL;
285 	} else {
286 		p->allowbranch = false;
287 		p->bre = true;
288 		p->parse_expr = p_simp_re;
289 		p->pre_parse = p_bre_pre_parse;
290 		p->post_parse = p_bre_post_parse;
291 	}
292 	g->sets = NULL;
293 	g->ncsets = 0;
294 	g->cflags = cflags;
295 	g->iflags = 0;
296 	g->nbol = 0;
297 	g->neol = 0;
298 	g->must = NULL;
299 	g->moffset = -1;
300 	g->charjump = NULL;
301 	g->matchjump = NULL;
302 	g->mlen = 0;
303 	g->nsub = 0;
304 	g->backrefs = 0;
305 
306 	/* do it */
307 	EMIT(OEND, 0);
308 	g->firststate = THERE();
309 	if (cflags & REG_NOSPEC)
310 		p_str(p);
311 	else
312 		p_re(p, OUT, OUT);
313 	EMIT(OEND, 0);
314 	g->laststate = THERE();
315 
316 	/* tidy up loose ends and fill things in */
317 	stripsnug(p, g);
318 	findmust(p, g);
319 	/*
320 	 * only use Boyer-Moore algorithm if the pattern is bigger
321 	 * than three characters
322 	 */
323 	if (g->mlen > 3) {
324 		computejumps(p, g);
325 		computematchjumps(p, g);
326 		if (g->matchjump == NULL && g->charjump != NULL) {
327 			free(g->charjump);
328 			g->charjump = NULL;
329 		}
330 	}
331 	g->nplus = pluscount(p, g);
332 	g->magic = MAGIC2;
333 	preg->re_nsub = g->nsub;
334 	preg->re_g = g;
335 	preg->re_magic = MAGIC1;
336 #ifndef REDEBUG
337 	/* not debugging, so can't rely on the assert() in regexec() */
338 	if (g->iflags&BAD)
339 		SETERROR(REG_EFATAL);
340 #endif
341 
342 	/* win or lose, we're done */
343 	if (p->error != 0)	/* lose */
344 		regfree(preg);
345 	return (p->error);
346 }
347 
348 /*
349  * Parse one subERE, an atom possibly followed by a repetition op,
350  * return whether we should terminate or not.
351  */
352 static bool
p_ere_exp(struct parse * p,struct branchc * bc)353 p_ere_exp(struct parse *p, struct branchc *bc)
354 {
355 	char c;
356 	wint_t wc;
357 	sopno pos;
358 	int count;
359 	int count2;
360 	sopno subno;
361 	int wascaret = 0;
362 
363 	(void) bc;
364 	assert(MORE());		/* caller should have ensured this */
365 	c = GETNEXT();
366 
367 	pos = HERE();
368 	switch (c) {
369 	case '(':
370 		(void) REQUIRE(MORE(), REG_EPAREN);
371 		p->g->nsub++;
372 		subno = p->g->nsub;
373 		if (subno < NPAREN)
374 			p->pbegin[subno] = HERE();
375 		EMIT(OLPAREN, subno);
376 		if (!SEE(')'))
377 			p_re(p, ')', IGN);
378 		if (subno < NPAREN) {
379 			p->pend[subno] = HERE();
380 			assert(p->pend[subno] != 0);
381 		}
382 		EMIT(ORPAREN, subno);
383 		(void) MUSTEAT(')', REG_EPAREN);
384 		break;
385 #ifndef POSIX_MISTAKE
386 	case ')':		/* happens only if no current unmatched ( */
387 		/*
388 		 * You may ask, why the ifndef?  Because I didn't notice
389 		 * this until slightly too late for 1003.2, and none of the
390 		 * other 1003.2 regular-expression reviewers noticed it at
391 		 * all.  So an unmatched ) is legal POSIX, at least until
392 		 * we can get it fixed.
393 		 */
394 		SETERROR(REG_EPAREN);
395 		break;
396 #endif
397 	case '^':
398 		EMIT(OBOL, 0);
399 		p->g->iflags |= USEBOL;
400 		p->g->nbol++;
401 		wascaret = 1;
402 		break;
403 	case '$':
404 		EMIT(OEOL, 0);
405 		p->g->iflags |= USEEOL;
406 		p->g->neol++;
407 		break;
408 	case '|':
409 		SETERROR(REG_BADPAT);
410 		break;
411 	case '*':
412 	case '+':
413 	case '?':
414 	case '{':
415 		SETERROR(REG_BADRPT);
416 		break;
417 	case '.':
418 		if (p->g->cflags&REG_NEWLINE)
419 			nonnewline(p);
420 		else
421 			EMIT(OANY, 0);
422 		break;
423 	case '[':
424 		p_bracket(p);
425 		break;
426 	case '\\':
427 		(void) REQUIRE(MORE(), REG_EESCAPE);
428 		wc = WGETNEXT();
429 		switch (wc) {
430 		case '<':
431 			EMIT(OBOW, 0);
432 			break;
433 		case '>':
434 			EMIT(OEOW, 0);
435 			break;
436 		default:
437 			ordinary(p, wc);
438 			break;
439 		}
440 		break;
441 	default:
442 		if (p->error != 0)
443 			return (false);
444 		p->next--;
445 		wc = WGETNEXT();
446 		ordinary(p, wc);
447 		break;
448 	}
449 
450 	if (!MORE())
451 		return (false);
452 	c = PEEK();
453 	/* we call { a repetition if followed by a digit */
454 	if (!(c == '*' || c == '+' || c == '?' || c == '{'))
455 		return (false);		/* no repetition, we're done */
456 	else if (c == '{')
457 		(void) REQUIRE(MORE2() && \
458 		    (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
459 	NEXT();
460 
461 	(void) REQUIRE(!wascaret, REG_BADRPT);
462 	switch (c) {
463 	case '*':	/* implemented as +? */
464 		/* this case does not require the (y|) trick, noKLUDGE */
465 		INSERT(OPLUS_, pos);
466 		ASTERN(O_PLUS, pos);
467 		INSERT(OQUEST_, pos);
468 		ASTERN(O_QUEST, pos);
469 		break;
470 	case '+':
471 		INSERT(OPLUS_, pos);
472 		ASTERN(O_PLUS, pos);
473 		break;
474 	case '?':
475 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
476 		INSERT(OCH_, pos);		/* offset slightly wrong */
477 		ASTERN(OOR1, pos);		/* this one's right */
478 		AHEAD(pos);			/* fix the OCH_ */
479 		EMIT(OOR2, 0);			/* offset very wrong... */
480 		AHEAD(THERE());			/* ...so fix it */
481 		ASTERN(O_CH, THERETHERE());
482 		break;
483 	case '{':
484 		count = p_count(p);
485 		if (EAT(',')) {
486 			if (isdigit((uch)PEEK())) {
487 				count2 = p_count(p);
488 				(void) REQUIRE(count <= count2, REG_BADBR);
489 			} else		/* single number with comma */
490 				count2 = INFINITY;
491 		} else		/* just a single number */
492 			count2 = count;
493 		repeat(p, pos, count, count2);
494 		if (!EAT('}')) {	/* error heuristics */
495 			while (MORE() && PEEK() != '}')
496 				NEXT();
497 			(void) REQUIRE(MORE(), REG_EBRACE);
498 			SETERROR(REG_BADBR);
499 		}
500 		break;
501 	}
502 
503 	if (!MORE())
504 		return (false);
505 	c = PEEK();
506 	if (!(c == '*' || c == '+' || c == '?' ||
507 	    (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
508 		return (false);
509 	SETERROR(REG_BADRPT);
510 	return (false);
511 }
512 
513 /*
514  * p_str - string (no metacharacters) "parser"
515  */
516 static void
p_str(struct parse * p)517 p_str(struct parse *p)
518 {
519 	(void) REQUIRE(MORE(), REG_BADPAT);
520 	while (MORE())
521 		ordinary(p, WGETNEXT());
522 }
523 
524 /*
525  * Eat consecutive branch delimiters for the kind of expression that we are
526  * parsing, return the number of delimiters that we ate.
527  */
528 static int
p_branch_eat_delim(struct parse * p,struct branchc * bc)529 p_branch_eat_delim(struct parse *p, struct branchc *bc)
530 {
531 	int nskip;
532 
533 	(void) bc;
534 	nskip = 0;
535 	while (EAT('|'))
536 		++nskip;
537 	return (nskip);
538 }
539 
540 /*
541  * Insert necessary branch book-keeping operations. This emits a
542  * bogus 'next' offset, since we still have more to parse
543  */
544 static void
p_branch_ins_offset(struct parse * p,struct branchc * bc)545 p_branch_ins_offset(struct parse *p, struct branchc *bc)
546 {
547 	if (bc->nbranch == 0) {
548 		INSERT(OCH_, bc->start);	/* offset is wrong */
549 		bc->fwd = bc->start;
550 		bc->back = bc->start;
551 	}
552 
553 	ASTERN(OOR1, bc->back);
554 	bc->back = THERE();
555 	AHEAD(bc->fwd);			/* fix previous offset */
556 	bc->fwd = HERE();
557 	EMIT(OOR2, 0);			/* offset is very wrong */
558 	++bc->nbranch;
559 }
560 
561 /*
562  * Fix the offset of the tail branch, if we actually had any branches.
563  * This is to correct the bogus placeholder offset that we use.
564  */
565 static void
p_branch_fix_tail(struct parse * p,struct branchc * bc)566 p_branch_fix_tail(struct parse *p, struct branchc *bc)
567 {
568 	/* Fix bogus offset at the tail if we actually have branches */
569 	if (bc->nbranch > 0) {
570 		AHEAD(bc->fwd);
571 		ASTERN(O_CH, bc->back);
572 	}
573 }
574 
575 /*
576  * Signal to the parser that an empty branch has been encountered; this will,
577  * in the future, be used to allow for more permissive behavior with empty
578  * branches. The return value should indicate whether parsing may continue
579  * or not.
580  */
581 static bool
p_branch_empty(struct parse * p,struct branchc * bc)582 p_branch_empty(struct parse *p, struct branchc *bc)
583 {
584 	(void) bc;
585 	SETERROR(REG_BADPAT);
586 	return (false);
587 }
588 
589 /*
590  * Take care of any branching requirements. This includes inserting the
591  * appropriate branching instructions as well as eating all of the branch
592  * delimiters until we either run out of pattern or need to parse more pattern.
593  */
594 static bool
p_branch_do(struct parse * p,struct branchc * bc)595 p_branch_do(struct parse *p, struct branchc *bc)
596 {
597 	int ate = 0;
598 
599 	ate = p_branch_eat_delim(p, bc);
600 	if (ate == 0)
601 		return (false);
602 	else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
603 		/*
604 		 * Halt parsing only if we have an empty branch and
605 		 * p_branch_empty indicates that we must not continue.
606 		 * In the future, this will not  necessarily be an error.
607 		 */
608 		return (false);
609 	p_branch_ins_offset(p, bc);
610 
611 	return (true);
612 }
613 
614 static void
p_bre_pre_parse(struct parse * p,struct branchc * bc)615 p_bre_pre_parse(struct parse *p, struct branchc *bc)
616 {
617 	(void) bc;
618 	/*
619 	 * Does not move cleanly into expression parser because of
620 	 * ordinary interpration of * at the beginning position of
621 	 * an expression.
622 	 */
623 	if (EAT('^')) {
624 		EMIT(OBOL, 0);
625 		p->g->iflags |= USEBOL;
626 		p->g->nbol++;
627 	}
628 }
629 
630 static void
p_bre_post_parse(struct parse * p,struct branchc * bc)631 p_bre_post_parse(struct parse *p, struct branchc *bc)
632 {
633 	/* Expression is terminating due to EOL token */
634 	if (bc->terminate) {
635 		DROP(1);
636 		EMIT(OEOL, 0);
637 		p->g->iflags |= USEEOL;
638 		p->g->neol++;
639 	}
640 }
641 
642 /*
643  * Top level parser, concatenation and BRE anchoring.
644  * Giving end1 as OUT essentially eliminates the end1/end2 check.
645  *
646  * This implementation is a bit of a kludge, in that a trailing $ is first
647  * taken as an ordinary character and then revised to be an anchor.
648  * The amount of lookahead needed to avoid this kludge is excessive.
649  */
650 static void
p_re(struct parse * p,int end1,int end2)651 p_re(struct parse *p,
652     int end1,	/* first terminating character */
653     int end2)	/* second terminating character; ignored for EREs */
654 {
655 	struct branchc bc;
656 
657 	bc.nbranch = 0;
658 	if (end1 == OUT && end2 == OUT)
659 		bc.outer = true;
660 	else
661 		bc.outer = false;
662 #define	SEEEND()	(!p->bre ? SEE(end1) : SEETWO(end1, end2))
663 	for (;;) {
664 		bc.start = HERE();
665 		bc.nchain = 0;
666 		bc.terminate = false;
667 		if (p->pre_parse != NULL)
668 			p->pre_parse(p, &bc);
669 		while (MORE() && (!p->allowbranch || !SEESPEC('|')) &&
670 		    !SEEEND()) {
671 			bc.terminate = p->parse_expr(p, &bc);
672 			++bc.nchain;
673 		}
674 		if (p->post_parse != NULL)
675 			p->post_parse(p, &bc);
676 		(void) REQUIRE(HERE() != bc.start, REG_BADPAT);
677 		if (!p->allowbranch)
678 			break;
679 		/*
680 		 * p_branch_do's return value indicates whether we should
681 		 * continue parsing or not. This is both for correctness and
682 		 * a slight optimization, because it will check if we've
683 		 * encountered an empty branch or the end of the string
684 		 * immediately following a branch delimiter.
685 		 */
686 		if (!p_branch_do(p, &bc))
687 			break;
688 	}
689 #undef SEE_END
690 	if (p->allowbranch)
691 		p_branch_fix_tail(p, &bc);
692 	assert(!MORE() || SEE(end1));
693 }
694 
695 /*
696  * p_simp_re - parse a simple RE, an atom possibly followed by a repetition
697  */
698 static bool			/* was the simple RE an unbackslashed $? */
p_simp_re(struct parse * p,struct branchc * bc)699 p_simp_re(struct parse *p, struct branchc *bc)
700 {
701 	int c;
702 	int count;
703 	int count2;
704 	sopno pos;
705 	int i;
706 	wint_t wc;
707 	sopno subno;
708 #define	BACKSL	(1<<CHAR_BIT)
709 
710 	pos = HERE();		/* repetition op, if any, covers from here */
711 
712 	assert(MORE());		/* caller should have ensured this */
713 	c = GETNEXT();
714 	if (c == '\\') {
715 		(void) REQUIRE(MORE(), REG_EESCAPE);
716 		c = BACKSL | GETNEXT();
717 	}
718 	switch (c) {
719 	case '.':
720 		if (p->g->cflags&REG_NEWLINE)
721 			nonnewline(p);
722 		else
723 			EMIT(OANY, 0);
724 		break;
725 	case '[':
726 		p_bracket(p);
727 		break;
728 	case BACKSL|'<':
729 		EMIT(OBOW, 0);
730 		break;
731 	case BACKSL|'>':
732 		EMIT(OEOW, 0);
733 		break;
734 	case BACKSL|'{':
735 		SETERROR(REG_BADRPT);
736 		break;
737 	case BACKSL|'(':
738 		p->g->nsub++;
739 		subno = p->g->nsub;
740 		if (subno < NPAREN)
741 			p->pbegin[subno] = HERE();
742 		EMIT(OLPAREN, subno);
743 		/* the MORE here is an error heuristic */
744 		if (MORE() && !SEETWO('\\', ')'))
745 			p_re(p, '\\', ')');
746 		if (subno < NPAREN) {
747 			p->pend[subno] = HERE();
748 			assert(p->pend[subno] != 0);
749 		}
750 		EMIT(ORPAREN, subno);
751 		(void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
752 		break;
753 	case BACKSL|')':	/* should not get here -- must be user */
754 		SETERROR(REG_EPAREN);
755 		break;
756 	case BACKSL|'1':
757 	case BACKSL|'2':
758 	case BACKSL|'3':
759 	case BACKSL|'4':
760 	case BACKSL|'5':
761 	case BACKSL|'6':
762 	case BACKSL|'7':
763 	case BACKSL|'8':
764 	case BACKSL|'9':
765 		i = (c&~BACKSL) - '0';
766 		assert(i < NPAREN);
767 		if (p->pend[i] != 0) {
768 			assert(i <= p->g->nsub);
769 			EMIT(OBACK_, i);
770 			assert(p->pbegin[i] != 0);
771 			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
772 			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
773 			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
774 			EMIT(O_BACK, i);
775 		} else
776 			SETERROR(REG_ESUBREG);
777 		p->g->backrefs = 1;
778 		break;
779 	case '*':
780 		/*
781 		 * Ordinary if used as the first character beyond BOL anchor of
782 		 * a (sub-)expression, counts as a bad repetition operator if it
783 		 * appears otherwise.
784 		 */
785 		(void) REQUIRE(bc->nchain == 0, REG_BADRPT);
786 		/* FALLTHROUGH */
787 	default:
788 		if (p->error != 0)
789 			return (false);	/* Definitely not $... */
790 		p->next--;
791 		wc = WGETNEXT();
792 		ordinary(p, wc);
793 		break;
794 	}
795 
796 	if (EAT('*')) {		/* implemented as +? */
797 		/* this case does not require the (y|) trick, noKLUDGE */
798 		INSERT(OPLUS_, pos);
799 		ASTERN(O_PLUS, pos);
800 		INSERT(OQUEST_, pos);
801 		ASTERN(O_QUEST, pos);
802 	} else if (EATTWO('\\', '{')) {
803 		count = p_count(p);
804 		if (EAT(',')) {
805 			if (MORE() && isdigit((uch)PEEK())) {
806 				count2 = p_count(p);
807 				(void) REQUIRE(count <= count2, REG_BADBR);
808 			} else		/* single number with comma */
809 				count2 = INFINITY;
810 		} else		/* just a single number */
811 			count2 = count;
812 		repeat(p, pos, count, count2);
813 		if (!EATTWO('\\', '}')) {	/* error heuristics */
814 			while (MORE() && !SEETWO('\\', '}'))
815 				NEXT();
816 			(void) REQUIRE(MORE(), REG_EBRACE);
817 			SETERROR(REG_BADBR);
818 		}
819 	} else if (c == '$')	/* $ (but not \$) ends it */
820 		return (true);
821 
822 	return (false);
823 }
824 
825 /*
826  * p_count - parse a repetition count
827  */
828 static int			/* the value */
p_count(struct parse * p)829 p_count(struct parse *p)
830 {
831 	int count = 0;
832 	int ndigits = 0;
833 
834 	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
835 		count = count*10 + (GETNEXT() - '0');
836 		ndigits++;
837 	}
838 
839 	(void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
840 	return (count);
841 }
842 
843 /*
844  * p_bracket - parse a bracketed character list
845  */
846 static void
p_bracket(struct parse * p)847 p_bracket(struct parse *p)
848 {
849 	cset *cs;
850 	wint_t ch;
851 
852 	/* Dept of Truly Sickening Special-Case Kludges */
853 	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
854 		EMIT(OBOW, 0);
855 		NEXTn(6);
856 		return;
857 	}
858 	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
859 		EMIT(OEOW, 0);
860 		NEXTn(6);
861 		return;
862 	}
863 
864 	if ((cs = allocset(p)) == NULL)
865 		return;
866 
867 	if (p->g->cflags&REG_ICASE)
868 		cs->icase = 1;
869 	if (EAT('^'))
870 		cs->invert = 1;
871 	if (EAT(']'))
872 		CHadd(p, cs, ']');
873 	else if (EAT('-'))
874 		CHadd(p, cs, '-');
875 	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
876 		p_b_term(p, cs);
877 	if (EAT('-'))
878 		CHadd(p, cs, '-');
879 	(void) MUSTEAT(']', REG_EBRACK);
880 
881 	if (p->error != 0)	/* don't mess things up further */
882 		return;
883 
884 	if (cs->invert && p->g->cflags&REG_NEWLINE)
885 		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
886 
887 	if ((ch = singleton(p, cs)) != OUT) {	/* optimize singleton sets */
888 		ordinary(p, ch);
889 		freeset(p, cs);
890 	} else
891 		EMIT(OANYOF, (int)(cs - p->g->sets));
892 }
893 
894 /*
895  * p_b_term - parse one term of a bracketed character list
896  */
897 static void
p_b_term(struct parse * p,cset * cs)898 p_b_term(struct parse *p, cset *cs)
899 {
900 	char c;
901 	wint_t start, finish;
902 	wint_t i;
903 	locale_t loc = uselocale(NULL);
904 
905 	/* classify what we've got */
906 	switch ((MORE()) ? PEEK() : '\0') {
907 	case '[':
908 		c = (MORE2()) ? PEEK2() : '\0';
909 		break;
910 	case '-':
911 		SETERROR(REG_ERANGE);
912 		return;			/* NOTE RETURN */
913 	default:
914 		c = '\0';
915 		break;
916 	}
917 
918 	switch (c) {
919 	case ':':		/* character class */
920 		NEXT2();
921 		(void) REQUIRE(MORE(), REG_EBRACK);
922 		c = PEEK();
923 		(void) REQUIRE(c != '-' && c != ']', REG_ECTYPE);
924 		p_b_cclass(p, cs);
925 		(void) REQUIRE(MORE(), REG_EBRACK);
926 		(void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
927 		break;
928 	case '=':		/* equivalence class */
929 		NEXT2();
930 		(void) REQUIRE(MORE(), REG_EBRACK);
931 		c = PEEK();
932 		(void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
933 		p_b_eclass(p, cs);
934 		(void) REQUIRE(MORE(), REG_EBRACK);
935 		(void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
936 		break;
937 	default:		/* symbol, ordinary character, or range */
938 		start = p_b_symbol(p);
939 		if (SEE('-') && MORE2() && PEEK2() != ']') {
940 			/* range */
941 			NEXT();
942 			if (EAT('-'))
943 				finish = '-';
944 			else
945 				finish = p_b_symbol(p);
946 		} else
947 			finish = start;
948 		if (start == finish)
949 			CHadd(p, cs, start);
950 		else {
951 			if (loc->collate->lc_is_posix) {
952 				(void) REQUIRE((uch)start <= (uch)finish,
953 				    REG_ERANGE);
954 				CHaddrange(p, cs, start, finish);
955 			} else {
956 				(void) REQUIRE(_collate_range_cmp(start,
957 				    finish, loc) <= 0, REG_ERANGE);
958 				for (i = 0; i <= UCHAR_MAX; i++) {
959 					if (_collate_range_cmp(start, i, loc)
960 					    <= 0 &&
961 					    _collate_range_cmp(i, finish, loc)
962 					    <= 0)
963 						CHadd(p, cs, i);
964 				}
965 			}
966 		}
967 		break;
968 	}
969 }
970 
971 /*
972  * p_b_cclass - parse a character-class name and deal with it
973  */
974 static void
p_b_cclass(struct parse * p,cset * cs)975 p_b_cclass(struct parse *p, cset *cs)
976 {
977 	const char *sp = p->next;
978 	size_t len;
979 	wctype_t wct;
980 	char clname[16];
981 
982 	while (MORE() && isalpha((uch)PEEK()))
983 		NEXT();
984 	len = p->next - sp;
985 	if (len >= sizeof (clname) - 1) {
986 		SETERROR(REG_ECTYPE);
987 		return;
988 	}
989 	(void) memcpy(clname, sp, len);
990 	clname[len] = '\0';
991 	if ((wct = wctype(clname)) == 0) {
992 		SETERROR(REG_ECTYPE);
993 		return;
994 	}
995 	CHaddtype(p, cs, wct);
996 }
997 
998 /*
999  * p_b_eclass - parse an equivalence-class name and deal with it
1000  *
1001  * This implementation is incomplete. xxx
1002  */
1003 static void
p_b_eclass(struct parse * p,cset * cs)1004 p_b_eclass(struct parse *p, cset *cs)
1005 {
1006 	wint_t c;
1007 
1008 	c = p_b_coll_elem(p, '=');
1009 	CHadd(p, cs, c);
1010 }
1011 
1012 /*
1013  * p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1014  */
1015 static wint_t			/* value of symbol */
p_b_symbol(struct parse * p)1016 p_b_symbol(struct parse *p)
1017 {
1018 	wint_t value;
1019 
1020 	(void) REQUIRE(MORE(), REG_EBRACK);
1021 	if (!EATTWO('[', '.'))
1022 		return (WGETNEXT());
1023 
1024 	/* collating symbol */
1025 	value = p_b_coll_elem(p, '.');
1026 	(void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1027 	return (value);
1028 }
1029 
1030 /*
1031  * p_b_coll_elem - parse a collating-element name and look it up
1032  */
1033 static wint_t			/* value of collating element */
p_b_coll_elem(struct parse * p,wint_t endc)1034 p_b_coll_elem(struct parse *p,
1035     wint_t endc)		/* name ended by endc,']' */
1036 {
1037 	const char *sp = p->next;
1038 	struct cname *cp;
1039 	mbstate_t mbs;
1040 	wchar_t wc;
1041 	size_t clen, len;
1042 
1043 	while (MORE() && !SEETWO(endc, ']'))
1044 		NEXT();
1045 	if (!MORE()) {
1046 		SETERROR(REG_EBRACK);
1047 		return (0);
1048 	}
1049 	len = p->next - sp;
1050 	for (cp = cnames; cp->name != NULL; cp++)
1051 		if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1052 			return (cp->code);	/* known name */
1053 	(void) memset(&mbs, 0, sizeof (mbs));
1054 	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1055 		return (wc);			/* single character */
1056 	else if (clen == (size_t)-1 || clen == (size_t)-2)
1057 		SETERROR(REG_ECHAR);
1058 	else
1059 		SETERROR(REG_ECOLLATE);		/* neither */
1060 	return (0);
1061 }
1062 
1063 /*
1064  * othercase - return the case counterpart of an alphabetic
1065  */
1066 static wint_t			/* if no counterpart, return ch */
othercase(wint_t ch)1067 othercase(wint_t ch)
1068 {
1069 	assert(iswalpha(ch));
1070 	if (iswupper(ch))
1071 		return (towlower(ch));
1072 	else if (iswlower(ch))
1073 		return (towupper(ch));
1074 	else			/* peculiar, but could happen */
1075 		return (ch);
1076 }
1077 
1078 /*
1079  * bothcases - emit a dualcase version of a two-case character
1080  *
1081  * Boy, is this implementation ever a kludge...
1082  */
1083 static void
bothcases(struct parse * p,wint_t ch)1084 bothcases(struct parse *p, wint_t ch)
1085 {
1086 	const char *oldnext = p->next;
1087 	const char *oldend = p->end;
1088 	char bracket[3 + MB_LEN_MAX];
1089 	size_t n;
1090 	mbstate_t mbs;
1091 
1092 	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
1093 	p->next = bracket;
1094 	(void) memset(&mbs, 0, sizeof (mbs));
1095 	n = wcrtomb(bracket, ch, &mbs);
1096 	assert(n != (size_t)-1);
1097 	bracket[n] = ']';
1098 	bracket[n + 1] = '\0';
1099 	p->end = bracket+n+1;
1100 	p_bracket(p);
1101 	assert(p->next == p->end);
1102 	p->next = oldnext;
1103 	p->end = oldend;
1104 }
1105 
1106 /*
1107  * ordinary - emit an ordinary character
1108  */
1109 static void
ordinary(struct parse * p,wint_t ch)1110 ordinary(struct parse *p, wint_t ch)
1111 {
1112 	cset *cs;
1113 
1114 	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1115 		bothcases(p, ch);
1116 	else if ((ch & OPDMASK) == ch)
1117 		EMIT(OCHAR, ch);
1118 	else {
1119 		/*
1120 		 * Kludge: character is too big to fit into an OCHAR operand.
1121 		 * Emit a singleton set.
1122 		 */
1123 		if ((cs = allocset(p)) == NULL)
1124 			return;
1125 		CHadd(p, cs, ch);
1126 		EMIT(OANYOF, (int)(cs - p->g->sets));
1127 	}
1128 }
1129 
1130 /*
1131  * nonnewline - emit REG_NEWLINE version of OANY
1132  *
1133  * Boy, is this implementation ever a kludge...
1134  */
1135 static void
nonnewline(struct parse * p)1136 nonnewline(struct parse *p)
1137 {
1138 	const char *oldnext = p->next;
1139 	const char *oldend = p->end;
1140 	char bracket[4];
1141 
1142 	p->next = bracket;
1143 	p->end = bracket+3;
1144 	bracket[0] = '^';
1145 	bracket[1] = '\n';
1146 	bracket[2] = ']';
1147 	bracket[3] = '\0';
1148 	p_bracket(p);
1149 	assert(p->next == bracket+3);
1150 	p->next = oldnext;
1151 	p->end = oldend;
1152 }
1153 
1154 /*
1155  * repeat - generate code for a bounded repetition, recursively if needed
1156  */
1157 static void
repeat(struct parse * p,sopno start,int from,int to)1158 repeat(struct parse *p,
1159     sopno start,		/* operand from here to end of strip */
1160     int from,			/* repeated from this number */
1161     int to)			/* to this number of times (maybe INFINITY) */
1162 {
1163 	sopno finish = HERE();
1164 #define	N	2
1165 #define	INF	3
1166 #define	REP(f, t)	((f)*8 + (t))
1167 #define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1168 	sopno copy;
1169 
1170 	if (p->error != 0)	/* head off possible runaway recursion */
1171 		return;
1172 
1173 	assert(from <= to);
1174 
1175 	switch (REP(MAP(from), MAP(to))) {
1176 	case REP(0, 0):			/* must be user doing this */
1177 		DROP(finish-start);	/* drop the operand */
1178 		break;
1179 	case REP(0, 1):			/* as x{1,1}? */
1180 	case REP(0, N):			/* as x{1,n}? */
1181 	case REP(0, INF):		/* as x{1,}? */
1182 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1183 		INSERT(OCH_, start);		/* offset is wrong... */
1184 		repeat(p, start+1, 1, to);
1185 		ASTERN(OOR1, start);
1186 		AHEAD(start);			/* ... fix it */
1187 		EMIT(OOR2, 0);
1188 		AHEAD(THERE());
1189 		ASTERN(O_CH, THERETHERE());
1190 		break;
1191 	case REP(1, 1):			/* trivial case */
1192 		/* done */
1193 		break;
1194 	case REP(1, N):			/* as x?x{1,n-1} */
1195 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1196 		INSERT(OCH_, start);
1197 		ASTERN(OOR1, start);
1198 		AHEAD(start);
1199 		EMIT(OOR2, 0);			/* offset very wrong... */
1200 		AHEAD(THERE());			/* ...so fix it */
1201 		ASTERN(O_CH, THERETHERE());
1202 		copy = dupl(p, start+1, finish+1);
1203 		assert(copy == finish+4);
1204 		repeat(p, copy, 1, to-1);
1205 		break;
1206 	case REP(1, INF):		/* as x+ */
1207 		INSERT(OPLUS_, start);
1208 		ASTERN(O_PLUS, start);
1209 		break;
1210 	case REP(N, N):			/* as xx{m-1,n-1} */
1211 		copy = dupl(p, start, finish);
1212 		repeat(p, copy, from-1, to-1);
1213 		break;
1214 	case REP(N, INF):		/* as xx{n-1,INF} */
1215 		copy = dupl(p, start, finish);
1216 		repeat(p, copy, from-1, to);
1217 		break;
1218 	default:			/* "can't happen" */
1219 		SETERROR(REG_EFATAL);	/* just in case */
1220 		break;
1221 	}
1222 }
1223 
1224 /*
1225  * wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1226  * character from the parse struct, signals a REG_ILLSEQ error if the
1227  * character can't be converted. Returns the number of bytes consumed.
1228  */
1229 static wint_t
wgetnext(struct parse * p)1230 wgetnext(struct parse *p)
1231 {
1232 	mbstate_t mbs;
1233 	wchar_t wc;
1234 	size_t n;
1235 
1236 	(void) memset(&mbs, 0, sizeof (mbs));
1237 	n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1238 	if (n == (size_t)-1 || n == (size_t)-2) {
1239 		SETERROR(REG_ECHAR);
1240 		return (0);
1241 	}
1242 	if (n == 0)
1243 		n = 1;
1244 	p->next += n;
1245 	return (wc);
1246 }
1247 
1248 /*
1249  * seterr - set an error condition
1250  */
1251 static int			/* useless but makes type checking happy */
seterr(struct parse * p,int e)1252 seterr(struct parse *p, int e)
1253 {
1254 	if (p->error == 0)	/* keep earliest error condition */
1255 		p->error = e;
1256 	p->next = nuls;		/* try to bring things to a halt */
1257 	p->end = nuls;
1258 	return (0);		/* make the return value well-defined */
1259 }
1260 
1261 /*
1262  * allocset - allocate a set of characters for []
1263  */
1264 static cset *
allocset(struct parse * p)1265 allocset(struct parse *p)
1266 {
1267 	cset *cs, *ncs;
1268 
1269 	ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof (*ncs));
1270 	if (ncs == NULL) {
1271 		SETERROR(REG_ESPACE);
1272 		return (NULL);
1273 	}
1274 	p->g->sets = ncs;
1275 	cs = &p->g->sets[p->g->ncsets++];
1276 	(void) memset(cs, 0, sizeof (*cs));
1277 
1278 	return (cs);
1279 }
1280 
1281 /*
1282  * freeset - free a now-unused set
1283  */
1284 static void
freeset(struct parse * p,cset * cs)1285 freeset(struct parse *p, cset *cs)
1286 {
1287 	cset *top = &p->g->sets[p->g->ncsets];
1288 
1289 	free(cs->wides);
1290 	free(cs->ranges);
1291 	free(cs->types);
1292 	(void) memset(cs, 0, sizeof (*cs));
1293 	if (cs == top-1)	/* recover only the easy case */
1294 		p->g->ncsets--;
1295 }
1296 
1297 /*
1298  * singleton - Determine whether a set contains only one character,
1299  * returning it if so, otherwise returning OUT.
1300  */
1301 static wint_t
singleton(struct parse * p,cset * cs)1302 singleton(struct parse *p, cset *cs)
1303 {
1304 	wint_t i, s, n;
1305 
1306 	/* Exclude the complicated cases we don't want to deal with */
1307 	if (cs->nranges != 0 || cs->ntypes != 0 || cs->icase != 0)
1308 		return (OUT);
1309 
1310 	if (cs->nwides > 1)
1311 		return (OUT);
1312 
1313 	/* Count the number of characters present in the bitmap */
1314 	for (i = n = 0; i < p->nc; i++)
1315 		if (CHIN(p->nc, cs, i)) {
1316 			n++;
1317 			s = i;
1318 		}
1319 
1320 	if (n > 1)
1321 		return (OUT);
1322 
1323 	if (n == 1) {
1324 		if (cs->nwides == 0)
1325 			return (s);
1326 		else
1327 			return (OUT);
1328 	}
1329 	if (cs->nwides == 1)
1330 		return (cs->wides[0]);
1331 
1332 	return (OUT);
1333 }
1334 
1335 /*
1336  * CHadd - add character to character set.
1337  */
1338 static void
CHadd(struct parse * p,cset * cs,wint_t ch)1339 CHadd(struct parse *p, cset *cs, wint_t ch)
1340 {
1341 	wint_t nch, *newwides;
1342 	assert(ch >= 0);
1343 	if (ch < p->nc)
1344 		cs->bmp[ch >> 3] |= 1 << (ch & 7);
1345 	else {
1346 		newwides = realloc(cs->wides, (cs->nwides + 1) *
1347 		    sizeof (*cs->wides));
1348 		if (newwides == NULL) {
1349 			SETERROR(REG_ESPACE);
1350 			return;
1351 		}
1352 		cs->wides = newwides;
1353 		cs->wides[cs->nwides++] = ch;
1354 	}
1355 	if (cs->icase) {
1356 		if ((nch = towlower(ch)) < p->nc)
1357 			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1358 		if ((nch = towupper(ch)) < p->nc)
1359 			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1360 	}
1361 }
1362 
1363 /*
1364  * CHaddrange - add all characters in the range [min,max] to a character set.
1365  */
1366 static void
CHaddrange(struct parse * p,cset * cs,wint_t min,wint_t max)1367 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1368 {
1369 	crange *newranges;
1370 
1371 	for (; min < p->nc && min <= max; min++)
1372 		CHadd(p, cs, min);
1373 	if (min >= max)
1374 		return;
1375 	newranges = realloc(cs->ranges, (cs->nranges + 1) *
1376 	    sizeof (*cs->ranges));
1377 	if (newranges == NULL) {
1378 		SETERROR(REG_ESPACE);
1379 		return;
1380 	}
1381 	cs->ranges = newranges;
1382 	cs->ranges[cs->nranges].min = min;
1383 	cs->ranges[cs->nranges].max = max;
1384 	cs->nranges++;
1385 }
1386 
1387 /*
1388  * CHaddtype - add all characters of a certain type to a character set.
1389  */
1390 static void
CHaddtype(struct parse * p,cset * cs,wctype_t wct)1391 CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1392 {
1393 	wint_t i;
1394 	wctype_t *newtypes;
1395 
1396 	for (i = 0; i < p->nc; i++)
1397 		if (iswctype(i, wct))
1398 			CHadd(p, cs, i);
1399 	newtypes = realloc(cs->types, (cs->ntypes + 1) *
1400 	    sizeof (*cs->types));
1401 	if (newtypes == NULL) {
1402 		SETERROR(REG_ESPACE);
1403 		return;
1404 	}
1405 	cs->types = newtypes;
1406 	cs->types[cs->ntypes++] = wct;
1407 }
1408 
1409 /*
1410  * dupl - emit a duplicate of a bunch of sops
1411  */
1412 static sopno			/* start of duplicate */
dupl(struct parse * p,sopno start,sopno finish)1413 dupl(struct parse *p,
1414     sopno start,		/* from here */
1415     sopno finish)		/* to this less one */
1416 {
1417 	sopno ret = HERE();
1418 	sopno len = finish - start;
1419 
1420 	assert(finish >= start);
1421 	if (len == 0)
1422 		return (ret);
1423 	if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1424 		return (ret);
1425 	assert(p->ssize >= p->slen + len);
1426 	(void) memcpy((char *)(p->strip + p->slen),
1427 	    (char *)(p->strip + start), (size_t)len*sizeof (sop));
1428 	p->slen += len;
1429 	return (ret);
1430 }
1431 
1432 /*
1433  * doemit - emit a strip operator
1434  *
1435  * It might seem better to implement this as a macro with a function as
1436  * hard-case backup, but it's just too big and messy unless there are
1437  * some changes to the data structures.  Maybe later.
1438  */
1439 static void
doemit(struct parse * p,sop op,size_t opnd)1440 doemit(struct parse *p, sop op, size_t opnd)
1441 {
1442 	/* avoid making error situations worse */
1443 	if (p->error != 0)
1444 		return;
1445 
1446 	/* deal with oversize operands ("can't happen", more or less) */
1447 	assert(opnd < 1<<OPSHIFT);
1448 
1449 	/* deal with undersized strip */
1450 	if (p->slen >= p->ssize)
1451 		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
1452 			return;
1453 
1454 	/* finally, it's all reduced to the easy case */
1455 	p->strip[p->slen++] = SOP(op, opnd);
1456 }
1457 
1458 /*
1459  * doinsert - insert a sop into the strip
1460  */
1461 static void
doinsert(struct parse * p,sop op,size_t opnd,sopno pos)1462 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1463 {
1464 	sopno sn;
1465 	sop s;
1466 	int i;
1467 
1468 	/* avoid making error situations worse */
1469 	if (p->error != 0)
1470 		return;
1471 
1472 	sn = HERE();
1473 	EMIT(op, opnd);		/* do checks, ensure space */
1474 	assert(HERE() == sn+1);
1475 	s = p->strip[sn];
1476 
1477 	/* adjust paren pointers */
1478 	assert(pos > 0);
1479 	for (i = 1; i < NPAREN; i++) {
1480 		if (p->pbegin[i] >= pos) {
1481 			p->pbegin[i]++;
1482 		}
1483 		if (p->pend[i] >= pos) {
1484 			p->pend[i]++;
1485 		}
1486 	}
1487 
1488 	(void) memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1489 	    (HERE()-pos-1)*sizeof (sop));
1490 	p->strip[pos] = s;
1491 }
1492 
1493 /*
1494  * dofwd - complete a forward reference
1495  */
1496 static void
dofwd(struct parse * p,sopno pos,sop value)1497 dofwd(struct parse *p, sopno pos, sop value)
1498 {
1499 	/* avoid making error situations worse */
1500 	if (p->error != 0)
1501 		return;
1502 
1503 	assert(value < 1<<OPSHIFT);
1504 	p->strip[pos] = OP(p->strip[pos]) | value;
1505 }
1506 
1507 /*
1508  * enlarge - enlarge the strip
1509  */
1510 static int
enlarge(struct parse * p,sopno size)1511 enlarge(struct parse *p, sopno size)
1512 {
1513 	sop *sp;
1514 
1515 	if (p->ssize >= size)
1516 		return (1);
1517 
1518 	sp = (sop *)realloc(p->strip, size*sizeof (sop));
1519 	if (sp == NULL) {
1520 		SETERROR(REG_ESPACE);
1521 		return (0);
1522 	}
1523 	p->strip = sp;
1524 	p->ssize = size;
1525 	return (1);
1526 }
1527 
1528 /*
1529  * stripsnug - compact the strip
1530  */
1531 static void
stripsnug(struct parse * p,struct re_guts * g)1532 stripsnug(struct parse *p, struct re_guts *g)
1533 {
1534 	g->nstates = p->slen;
1535 	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof (sop));
1536 	if (g->strip == NULL) {
1537 		SETERROR(REG_ESPACE);
1538 		g->strip = p->strip;
1539 	}
1540 }
1541 
1542 /*
1543  * findmust - fill in must and mlen with longest mandatory literal string
1544  *
1545  * This algorithm could do fancy things like analyzing the operands of |
1546  * for common subsequences.  Someday.  This code is simple and finds most
1547  * of the interesting cases.
1548  *
1549  * Note that must and mlen got initialized during setup.
1550  */
1551 static void
findmust(struct parse * p,struct re_guts * g)1552 findmust(struct parse *p, struct re_guts *g)
1553 {
1554 	sop *scan;
1555 	sop *start = NULL;
1556 	sop *newstart = NULL;
1557 	sopno newlen;
1558 	sop s;
1559 	char *cp;
1560 	int offset;
1561 	char buf[MB_LEN_MAX];
1562 	size_t clen;
1563 	mbstate_t mbs;
1564 	locale_t loc = uselocale(NULL);
1565 
1566 	/* avoid making error situations worse */
1567 	if (p->error != 0)
1568 		return;
1569 
1570 	/*
1571 	 * It's not generally safe to do a ``char'' substring search on
1572 	 * multibyte character strings, but it's safe for at least
1573 	 * UTF-8 (see RFC 3629).
1574 	 */
1575 	if (g->mb_cur_max > 1 &&
1576 	    strcmp(loc->runelocale->__encoding, "UTF-8") != 0)
1577 		return;
1578 
1579 	/* find the longest OCHAR sequence in strip */
1580 	newlen = 0;
1581 	offset = 0;
1582 	g->moffset = 0;
1583 	scan = g->strip + 1;
1584 	do {
1585 		s = *scan++;
1586 		switch (OP(s)) {
1587 		case OCHAR:		/* sequence member */
1588 			if (newlen == 0) {		/* new sequence */
1589 				(void) memset(&mbs, 0, sizeof (mbs));
1590 				newstart = scan - 1;
1591 			}
1592 			clen = wcrtomb(buf, OPND(s), &mbs);
1593 			if (clen == (size_t)-1)
1594 				goto toohard;
1595 			newlen += clen;
1596 			break;
1597 		case OPLUS_:		/* things that don't break one */
1598 		case OLPAREN:
1599 		case ORPAREN:
1600 			break;
1601 		case OQUEST_:		/* things that must be skipped */
1602 		case OCH_:
1603 			offset = altoffset(scan, offset);
1604 			scan--;
1605 			do {
1606 				scan += OPND(s);
1607 				s = *scan;
1608 				/* assert() interferes w debug printouts */
1609 				if (OP(s) != (sop)O_QUEST &&
1610 				    OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) {
1611 					g->iflags |= BAD;
1612 					return;
1613 				}
1614 			} while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1615 			/* FALLTHROUGH */
1616 		case OBOW:		/* things that break a sequence */
1617 		case OEOW:
1618 		case OBOL:
1619 		case OEOL:
1620 		case O_QUEST:
1621 		case O_CH:
1622 		case OEND:
1623 			if (newlen > (sopno)g->mlen) {		/* ends one */
1624 				start = newstart;
1625 				g->mlen = newlen;
1626 				if (offset > -1) {
1627 					g->moffset += offset;
1628 					offset = newlen;
1629 				} else
1630 					g->moffset = offset;
1631 			} else {
1632 				if (offset > -1)
1633 					offset += newlen;
1634 			}
1635 			newlen = 0;
1636 			break;
1637 		case OANY:
1638 			if (newlen > (sopno)g->mlen) {		/* ends one */
1639 				start = newstart;
1640 				g->mlen = newlen;
1641 				if (offset > -1) {
1642 					g->moffset += offset;
1643 					offset = newlen;
1644 				} else
1645 					g->moffset = offset;
1646 			} else {
1647 				if (offset > -1)
1648 					offset += newlen;
1649 			}
1650 			if (offset > -1)
1651 				offset++;
1652 			newlen = 0;
1653 			break;
1654 		case OANYOF:		/* may or may not invalidate offset */
1655 			/* First, everything as OANY */
1656 			if (newlen > (sopno)g->mlen) {		/* ends one */
1657 				start = newstart;
1658 				g->mlen = newlen;
1659 				if (offset > -1) {
1660 					g->moffset += offset;
1661 					offset = newlen;
1662 				} else
1663 					g->moffset = offset;
1664 			} else {
1665 				if (offset > -1)
1666 					offset += newlen;
1667 			}
1668 			if (offset > -1)
1669 				offset++;
1670 			newlen = 0;
1671 			break;
1672 		toohard:
1673 		default:
1674 			/*
1675 			 * Anything here makes it impossible or too hard
1676 			 * to calculate the offset -- so we give up;
1677 			 * save the last known good offset, in case the
1678 			 * must sequence doesn't occur later.
1679 			 */
1680 			if (newlen > (sopno)g->mlen) {		/* ends one */
1681 				start = newstart;
1682 				g->mlen = newlen;
1683 				if (offset > -1)
1684 					g->moffset += offset;
1685 				else
1686 					g->moffset = offset;
1687 			}
1688 			offset = -1;
1689 			newlen = 0;
1690 			break;
1691 		}
1692 	} while (OP(s) != OEND);
1693 
1694 	if (g->mlen == 0) {		/* there isn't one */
1695 		g->moffset = -1;
1696 		return;
1697 	}
1698 
1699 	/* turn it into a character string */
1700 	g->must = malloc((size_t)g->mlen + 1);
1701 	if (g->must == NULL) {		/* argh; just forget it */
1702 		g->mlen = 0;
1703 		g->moffset = -1;
1704 		return;
1705 	}
1706 	cp = g->must;
1707 	scan = start;
1708 	(void) memset(&mbs, 0, sizeof (mbs));
1709 	while (cp < g->must + g->mlen) {
1710 		while (OP(s = *scan++) != OCHAR)
1711 			continue;
1712 		clen = wcrtomb(cp, OPND(s), &mbs);
1713 		assert(clen != (size_t)-1);
1714 		cp += clen;
1715 	}
1716 	assert(cp == g->must + g->mlen);
1717 	*cp++ = '\0';		/* just on general principles */
1718 }
1719 
1720 /*
1721  * altoffset - choose biggest offset among multiple choices
1722  *
1723  * Compute, recursively if necessary, the largest offset among multiple
1724  * re paths.
1725  */
1726 static int
altoffset(sop * scan,int offset)1727 altoffset(sop *scan, int offset)
1728 {
1729 	int largest;
1730 	int try;
1731 	sop s;
1732 
1733 	/* If we gave up already on offsets, return */
1734 	if (offset == -1)
1735 		return (-1);
1736 
1737 	largest = 0;
1738 	try = 0;
1739 	s = *scan++;
1740 	while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) {
1741 		switch (OP(s)) {
1742 		case OOR1:
1743 			if (try > largest)
1744 				largest = try;
1745 			try = 0;
1746 			break;
1747 		case OQUEST_:
1748 		case OCH_:
1749 			try = altoffset(scan, try);
1750 			if (try == -1)
1751 				return (-1);
1752 			scan--;
1753 			do {
1754 				scan += OPND(s);
1755 				s = *scan;
1756 				if (OP(s) != (sop)O_QUEST &&
1757 				    OP(s) != (sop)O_CH && OP(s) != (sop)OOR2)
1758 					return (-1);
1759 			} while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1760 			/*
1761 			 * We must skip to the next position, or we'll
1762 			 * leave altoffset() too early.
1763 			 */
1764 			scan++;
1765 			break;
1766 		case OANYOF:
1767 		case OCHAR:
1768 		case OANY:
1769 			try++;
1770 			/*FALLTHRU*/
1771 		case OBOW:
1772 		case OEOW:
1773 		case OLPAREN:
1774 		case ORPAREN:
1775 		case OOR2:
1776 			break;
1777 		default:
1778 			try = -1;
1779 			break;
1780 		}
1781 		if (try == -1)
1782 			return (-1);
1783 		s = *scan++;
1784 	}
1785 
1786 	if (try > largest)
1787 		largest = try;
1788 
1789 	return (largest+offset);
1790 }
1791 
1792 /*
1793  * computejumps - compute char jumps for BM scan
1794  *
1795  * This algorithm assumes g->must exists and is has size greater than
1796  * zero. It's based on the algorithm found on Computer Algorithms by
1797  * Sara Baase.
1798  *
1799  * A char jump is the number of characters one needs to jump based on
1800  * the value of the character from the text that was mismatched.
1801  */
1802 static void
computejumps(struct parse * p,struct re_guts * g)1803 computejumps(struct parse *p, struct re_guts *g)
1804 {
1805 	int ch;
1806 	int mindex;
1807 
1808 	/* Avoid making errors worse */
1809 	if (p->error != 0)
1810 		return;
1811 
1812 	g->charjump = (int *)malloc((NC_MAX + 1) * sizeof (int));
1813 	if (g->charjump == NULL)	/* Not a fatal error */
1814 		return;
1815 	/* Adjust for signed chars, if necessary */
1816 	g->charjump = &g->charjump[-(CHAR_MIN)];
1817 
1818 	/*
1819 	 * If the character does not exist in the pattern, the jump
1820 	 * is equal to the number of characters in the pattern.
1821 	 */
1822 	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1823 		g->charjump[ch] = g->mlen;
1824 
1825 	/*
1826 	 * If the character does exist, compute the jump that would
1827 	 * take us to the last character in the pattern equal to it
1828 	 * (notice that we match right to left, so that last character
1829 	 * is the first one that would be matched).
1830 	 */
1831 	for (mindex = 0; mindex < g->mlen; mindex++)
1832 		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1833 }
1834 
1835 /*
1836  * computematchjumps - compute match jumps for BM scan
1837  *
1838  * This algorithm assumes g->must exists and is has size greater than
1839  * zero. It's based on the algorithm found on Computer Algorithms by
1840  * Sara Baase.
1841  *
1842  * A match jump is the number of characters one needs to advance based
1843  * on the already-matched suffix.
1844  * Notice that all values here are minus (g->mlen-1), because of the way
1845  * the search algorithm works.
1846  */
1847 static void
computematchjumps(struct parse * p,struct re_guts * g)1848 computematchjumps(struct parse *p, struct re_guts *g)
1849 {
1850 	int mindex;		/* General "must" iterator */
1851 	int suffix;		/* Keeps track of matching suffix */
1852 	int ssuffix;		/* Keeps track of suffixes' suffix */
1853 	int *pmatches;
1854 				/*
1855 				 * pmatches[k] points to the next i
1856 				 * such that i+1...mlen is a substring
1857 				 * of k+1...k+mlen-i-1
1858 				 */
1859 
1860 	/* Avoid making errors worse */
1861 	if (p->error != 0)
1862 		return;
1863 
1864 	pmatches = (int *)malloc(g->mlen * sizeof (unsigned int));
1865 	if (pmatches == NULL) {
1866 		g->matchjump = NULL;
1867 		return;
1868 	}
1869 
1870 	g->matchjump = (int *)malloc(g->mlen * sizeof (unsigned int));
1871 	if (g->matchjump == NULL) {	/* Not a fatal error */
1872 		free(pmatches);
1873 		return;
1874 	}
1875 
1876 	/* Set maximum possible jump for each character in the pattern */
1877 	for (mindex = 0; mindex < g->mlen; mindex++)
1878 		g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1879 
1880 	/* Compute pmatches[] */
1881 	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1882 	    mindex--, suffix--) {
1883 		pmatches[mindex] = suffix;
1884 
1885 		/*
1886 		 * If a mismatch is found, interrupting the substring,
1887 		 * compute the matchjump for that position. If no
1888 		 * mismatch is found, then a text substring mismatched
1889 		 * against the suffix will also mismatch against the
1890 		 * substring.
1891 		 */
1892 		while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) {
1893 			g->matchjump[suffix] = MIN(g->matchjump[suffix],
1894 			    g->mlen - mindex - 1);
1895 			suffix = pmatches[suffix];
1896 		}
1897 	}
1898 
1899 	/*
1900 	 * Compute the matchjump up to the last substring found to jump
1901 	 * to the beginning of the largest must pattern prefix matching
1902 	 * it's own suffix.
1903 	 */
1904 	for (mindex = 0; mindex <= suffix; mindex++)
1905 		g->matchjump[mindex] = MIN(g->matchjump[mindex],
1906 		    g->mlen + suffix - mindex);
1907 
1908 	ssuffix = pmatches[suffix];
1909 	while (suffix < g->mlen) {
1910 		while (suffix <= ssuffix && suffix < g->mlen) {
1911 			g->matchjump[suffix] = MIN(g->matchjump[suffix],
1912 			    g->mlen + ssuffix - suffix);
1913 			suffix++;
1914 		}
1915 		if (suffix < g->mlen)
1916 			ssuffix = pmatches[ssuffix];
1917 	}
1918 
1919 	free(pmatches);
1920 }
1921 
1922 /*
1923  * pluscount - count + nesting
1924  */
1925 static sopno			/* nesting depth */
pluscount(struct parse * p,struct re_guts * g)1926 pluscount(struct parse *p, struct re_guts *g)
1927 {
1928 	sop *scan;
1929 	sop s;
1930 	sopno plusnest = 0;
1931 	sopno maxnest = 0;
1932 
1933 	if (p->error != 0)
1934 		return (0);	/* there may not be an OEND */
1935 
1936 	scan = g->strip + 1;
1937 	do {
1938 		s = *scan++;
1939 		switch (OP(s)) {
1940 		case OPLUS_:
1941 			plusnest++;
1942 			break;
1943 		case O_PLUS:
1944 			if (plusnest > maxnest)
1945 				maxnest = plusnest;
1946 			plusnest--;
1947 			break;
1948 		}
1949 	} while (OP(s) != OEND);
1950 	if (plusnest != 0)
1951 		g->iflags |= BAD;
1952 	return (maxnest);
1953 }
1954