xref: /illumos-gate/usr/src/lib/libc/port/regex/regcomp.c (revision 1603eda2)
1 /*
2  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
4  * Copyright 2012 Milan Jurik. All rights reserved.
5  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
6  * Copyright (c) 1992, 1993, 1994
7  *	The Regents of the University of California.  All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * Henry Spencer.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 #include "lint.h"
38 #include "file64.h"
39 #include <sys/types.h>
40 #include <stdio.h>
41 #include <string.h>
42 #include <ctype.h>
43 #include <limits.h>
44 #include <regex.h>
45 #include <stdlib.h>
46 #include <stdbool.h>
47 #include <wchar.h>
48 #include <wctype.h>
49 
50 #include "../locale/runetype.h"
51 #include "../locale/collate.h"
52 
53 #include "utils.h"
54 #include "regex2.h"
55 
56 #include "cname.h"
57 #include "../locale/mblocal.h"
58 
59 /*
60  * Branching context, used to keep track of branch state for all of the branch-
61  * aware functions. In addition to keeping track of branch positions for the
62  * p_branch_* functions, we use this to simplify some clumsiness in BREs for
63  * detection of whether ^ is acting as an anchor or being used erroneously and
64  * also for whether we're in a sub-expression or not.
65  */
66 struct branchc {
67 	sopno start;
68 	sopno back;
69 	sopno fwd;
70 
71 	int nbranch;
72 	int nchain;
73 	bool outer;
74 	bool terminate;
75 };
76 
77 /*
78  * parse structure, passed up and down to avoid global variables and
79  * other clumsinesses
80  */
81 struct parse {
82 	const char *next;	/* next character in RE */
83 	const char *end;	/* end of string (-> NUL normally) */
84 	int error;		/* has an error been seen? */
85 	sop *strip;		/* malloced strip */
86 	sopno ssize;		/* malloced strip size (allocated) */
87 	sopno slen;		/* malloced strip length (used) */
88 	int ncsalloc;		/* number of csets allocated */
89 	struct re_guts *g;
90 #define	NPAREN	10		/* we need to remember () 1-9 for back refs */
91 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
92 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
93 	bool allowbranch;	/* can this expression branch? */
94 	bool bre;		/* convenience; is this a BRE? */
95 	bool (*parse_expr)(struct parse *, struct branchc *);
96 	void (*pre_parse)(struct parse *, struct branchc *);
97 	void (*post_parse)(struct parse *, struct branchc *);
98 };
99 
100 /* ========= begin header generated by ./mkh ========= */
101 #ifdef __cplusplus
102 extern "C" {
103 #endif
104 
105 /* === regcomp.c === */
106 static bool p_ere_exp(struct parse *p, struct branchc *bc);
107 static void p_str(struct parse *p);
108 static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
109 static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
110 static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
111 static bool p_branch_empty(struct parse *p, struct branchc *bc);
112 static bool p_branch_do(struct parse *p, struct branchc *bc);
113 static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
114 static void p_bre_post_parse(struct parse *p, struct branchc *bc);
115 static void p_re(struct parse *p, int end1, int end2);
116 static bool p_simp_re(struct parse *p, struct branchc *bc);
117 static int p_count(struct parse *p);
118 static void p_bracket(struct parse *p);
119 static void p_b_term(struct parse *p, cset *cs);
120 static void p_b_cclass(struct parse *p, cset *cs);
121 static void p_b_eclass(struct parse *p, cset *cs);
122 static wint_t p_b_symbol(struct parse *p);
123 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
124 static wint_t othercase(wint_t ch);
125 static void bothcases(struct parse *p, wint_t ch);
126 static void ordinary(struct parse *p, wint_t ch);
127 static void nonnewline(struct parse *p);
128 static void repeat(struct parse *p, sopno start, int from, int to);
129 static int seterr(struct parse *p, int e);
130 static cset *allocset(struct parse *p);
131 static void freeset(struct parse *p, cset *cs);
132 static void CHadd(struct parse *p, cset *cs, wint_t ch);
133 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
134 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
135 static wint_t singleton(cset *cs);
136 static sopno dupl(struct parse *p, sopno start, sopno finish);
137 static void doemit(struct parse *p, sop op, size_t opnd);
138 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
139 static void dofwd(struct parse *p, sopno pos, sop value);
140 static int enlarge(struct parse *p, sopno size);
141 static void stripsnug(struct parse *p, struct re_guts *g);
142 static void findmust(struct parse *p, struct re_guts *g);
143 static int altoffset(sop *scan, int offset);
144 static void computejumps(struct parse *p, struct re_guts *g);
145 static void computematchjumps(struct parse *p, struct re_guts *g);
146 static sopno pluscount(struct parse *p, struct re_guts *g);
147 static wint_t wgetnext(struct parse *p);
148 
149 #ifdef __cplusplus
150 }
151 #endif
152 /* ========= end header generated by ./mkh ========= */
153 
154 static char nuls[10];		/* place to point scanner in event of error */
155 
156 /*
157  * macros for use with parse structure
158  * BEWARE:  these know that the parse structure is named `p' !!!
159  */
160 #define	PEEK()	(*p->next)
161 #define	PEEK2()	(*(p->next+1))
162 #define	MORE()	(p->next < p->end)
163 #define	MORE2()	(p->next+1 < p->end)
164 #define	SEE(c)	(MORE() && PEEK() == (c))
165 #define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
166 #define	SEESPEC(a)	(p->bre ? SEETWO('\\', a) : SEE(a))
167 #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
168 #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
169 #define	NEXT()	(p->next++)
170 #define	NEXT2()	(p->next += 2)
171 #define	NEXTn(n)	(p->next += (n))
172 #define	GETNEXT()	(*p->next++)
173 #define	WGETNEXT()	wgetnext(p)
174 #define	SETERROR(e)	((void)seterr(p, (e)))
175 #define	REQUIRE(co, e)	((co) || seterr(p, e))
176 #define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
177 #define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
178 #define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
179 #define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
180 #define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
181 #define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
182 #define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
183 #define	HERE()		(p->slen)
184 #define	THERE()		(p->slen - 1)
185 #define	THERETHERE()	(p->slen - 2)
186 #define	DROP(n)	(p->slen -= (n))
187 
188 #ifndef NDEBUG
189 static int never = 0;		/* for use in asserts; shuts lint up */
190 #else
191 #define	never	0		/* some <assert.h>s have bugs too */
192 #endif
193 
194 /*
195  * regcomp - interface for parser and compilation
196  */
197 int				/* 0 success, otherwise REG_something */
regcomp(regex_t * _RESTRICT_KYWD preg,const char * _RESTRICT_KYWD pattern,int cflags)198 regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern,
199     int cflags)
200 {
201 	struct parse pa;
202 	struct re_guts *g;
203 	struct parse *p = &pa;
204 	int i;
205 	size_t len;
206 	size_t maxlen;
207 #ifdef REDEBUG
208 #define	GOODFLAGS(f)	(f)
209 #else
210 #define	GOODFLAGS(f)	((f)&~REG_DUMP)
211 #endif
212 
213 	/* We had REG_INVARG, but we don't have that on Solaris. */
214 	cflags = GOODFLAGS(cflags);
215 	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
216 		return (REG_EFATAL);
217 
218 	if (cflags&REG_PEND) {
219 		if (preg->re_endp < pattern)
220 			return (REG_EFATAL);
221 		len = preg->re_endp - pattern;
222 	} else
223 		len = strlen(pattern);
224 
225 	/* do the mallocs early so failure handling is easy */
226 	g = (struct re_guts *)malloc(sizeof (struct re_guts));
227 	if (g == NULL)
228 		return (REG_ESPACE);
229 	/*
230 	 * Limit the pattern space to avoid a 32-bit overflow on buffer
231 	 * extension.  Also avoid any signed overflow in case of conversion
232 	 * so make the real limit based on a 31-bit overflow.
233 	 *
234 	 * Likely not applicable on 64-bit systems but handle the case
235 	 * generically (who are we to stop people from using ~715MB+
236 	 * patterns?).
237 	 */
238 	maxlen = ((size_t)-1 >> 1) / sizeof (sop) * 2 / 3;
239 	if (len >= maxlen) {
240 		free((char *)g);
241 		return (REG_ESPACE);
242 	}
243 	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
244 	assert(p->ssize >= len);
245 
246 	p->strip = (sop *)malloc(p->ssize * sizeof (sop));
247 	p->slen = 0;
248 	if (p->strip == NULL) {
249 		free((char *)g);
250 		return (REG_ESPACE);
251 	}
252 
253 	/* set things up */
254 	p->g = g;
255 	p->next = pattern;	/* convenience; we do not modify it */
256 	p->end = p->next + len;
257 	p->error = 0;
258 	p->ncsalloc = 0;
259 	for (i = 0; i < NPAREN; i++) {
260 		p->pbegin[i] = 0;
261 		p->pend[i] = 0;
262 	}
263 	if (cflags & REG_EXTENDED) {
264 		p->allowbranch = true;
265 		p->bre = false;
266 		p->parse_expr = p_ere_exp;
267 		p->pre_parse = NULL;
268 		p->post_parse = NULL;
269 	} else {
270 		p->allowbranch = false;
271 		p->bre = true;
272 		p->parse_expr = p_simp_re;
273 		p->pre_parse = p_bre_pre_parse;
274 		p->post_parse = p_bre_post_parse;
275 	}
276 	g->sets = NULL;
277 	g->ncsets = 0;
278 	g->cflags = cflags;
279 	g->iflags = 0;
280 	g->nbol = 0;
281 	g->neol = 0;
282 	g->must = NULL;
283 	g->moffset = -1;
284 	g->charjump = NULL;
285 	g->matchjump = NULL;
286 	g->mlen = 0;
287 	g->nsub = 0;
288 	g->backrefs = 0;
289 
290 	/* do it */
291 	EMIT(OEND, 0);
292 	g->firststate = THERE();
293 	if (cflags & REG_NOSPEC)
294 		p_str(p);
295 	else
296 		p_re(p, OUT, OUT);
297 	EMIT(OEND, 0);
298 	g->laststate = THERE();
299 
300 	/* tidy up loose ends and fill things in */
301 	stripsnug(p, g);
302 	findmust(p, g);
303 	/*
304 	 * only use Boyer-Moore algorithm if the pattern is bigger
305 	 * than three characters
306 	 */
307 	if (g->mlen > 3) {
308 		computejumps(p, g);
309 		computematchjumps(p, g);
310 		if (g->matchjump == NULL && g->charjump != NULL) {
311 			free(g->charjump);
312 			g->charjump = NULL;
313 		}
314 	}
315 	g->nplus = pluscount(p, g);
316 	g->magic = MAGIC2;
317 	preg->re_nsub = g->nsub;
318 	preg->re_g = g;
319 	preg->re_magic = MAGIC1;
320 #ifndef REDEBUG
321 	/* not debugging, so can't rely on the assert() in regexec() */
322 	if (g->iflags&BAD)
323 		SETERROR(REG_EFATAL);
324 #endif
325 
326 	/* win or lose, we're done */
327 	if (p->error != 0)	/* lose */
328 		regfree(preg);
329 	return (p->error);
330 }
331 
332 /*
333  * Parse one subERE, an atom possibly followed by a repetition op,
334  * return whether we should terminate or not.
335  */
336 static bool
p_ere_exp(struct parse * p,struct branchc * bc)337 p_ere_exp(struct parse *p, struct branchc *bc)
338 {
339 	char c;
340 	wint_t wc;
341 	sopno pos;
342 	int count;
343 	int count2;
344 	sopno subno;
345 	int wascaret = 0;
346 
347 	(void) bc;
348 	assert(MORE());		/* caller should have ensured this */
349 	c = GETNEXT();
350 
351 	pos = HERE();
352 	switch (c) {
353 	case '(':
354 		(void) REQUIRE(MORE(), REG_EPAREN);
355 		p->g->nsub++;
356 		subno = p->g->nsub;
357 		if (subno < NPAREN)
358 			p->pbegin[subno] = HERE();
359 		EMIT(OLPAREN, subno);
360 		if (!SEE(')'))
361 			p_re(p, ')', IGN);
362 		if (subno < NPAREN) {
363 			p->pend[subno] = HERE();
364 			assert(p->pend[subno] != 0);
365 		}
366 		EMIT(ORPAREN, subno);
367 		(void) MUSTEAT(')', REG_EPAREN);
368 		break;
369 #ifndef POSIX_MISTAKE
370 	case ')':		/* happens only if no current unmatched ( */
371 		/*
372 		 * You may ask, why the ifndef?  Because I didn't notice
373 		 * this until slightly too late for 1003.2, and none of the
374 		 * other 1003.2 regular-expression reviewers noticed it at
375 		 * all.  So an unmatched ) is legal POSIX, at least until
376 		 * we can get it fixed.
377 		 */
378 		SETERROR(REG_EPAREN);
379 		break;
380 #endif
381 	case '^':
382 		EMIT(OBOL, 0);
383 		p->g->iflags |= USEBOL;
384 		p->g->nbol++;
385 		wascaret = 1;
386 		break;
387 	case '$':
388 		EMIT(OEOL, 0);
389 		p->g->iflags |= USEEOL;
390 		p->g->neol++;
391 		break;
392 	case '|':
393 		SETERROR(REG_BADPAT);
394 		break;
395 	case '*':
396 	case '+':
397 	case '?':
398 	case '{':
399 		SETERROR(REG_BADRPT);
400 		break;
401 	case '.':
402 		if (p->g->cflags&REG_NEWLINE)
403 			nonnewline(p);
404 		else
405 			EMIT(OANY, 0);
406 		break;
407 	case '[':
408 		p_bracket(p);
409 		break;
410 	case '\\':
411 		(void) REQUIRE(MORE(), REG_EESCAPE);
412 		wc = WGETNEXT();
413 		switch (wc) {
414 		case '<':
415 			EMIT(OBOW, 0);
416 			break;
417 		case '>':
418 			EMIT(OEOW, 0);
419 			break;
420 		default:
421 			ordinary(p, wc);
422 			break;
423 		}
424 		break;
425 	default:
426 		if (p->error != 0)
427 			return (false);
428 		p->next--;
429 		wc = WGETNEXT();
430 		ordinary(p, wc);
431 		break;
432 	}
433 
434 	if (!MORE())
435 		return (false);
436 	c = PEEK();
437 	/* we call { a repetition if followed by a digit */
438 	if (!(c == '*' || c == '+' || c == '?' || c == '{'))
439 		return (false);		/* no repetition, we're done */
440 	else if (c == '{')
441 		(void) REQUIRE(MORE2() && \
442 		    (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
443 	NEXT();
444 
445 	(void) REQUIRE(!wascaret, REG_BADRPT);
446 	switch (c) {
447 	case '*':	/* implemented as +? */
448 		/* this case does not require the (y|) trick, noKLUDGE */
449 		INSERT(OPLUS_, pos);
450 		ASTERN(O_PLUS, pos);
451 		INSERT(OQUEST_, pos);
452 		ASTERN(O_QUEST, pos);
453 		break;
454 	case '+':
455 		INSERT(OPLUS_, pos);
456 		ASTERN(O_PLUS, pos);
457 		break;
458 	case '?':
459 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
460 		INSERT(OCH_, pos);		/* offset slightly wrong */
461 		ASTERN(OOR1, pos);		/* this one's right */
462 		AHEAD(pos);			/* fix the OCH_ */
463 		EMIT(OOR2, 0);			/* offset very wrong... */
464 		AHEAD(THERE());			/* ...so fix it */
465 		ASTERN(O_CH, THERETHERE());
466 		break;
467 	case '{':
468 		count = p_count(p);
469 		if (EAT(',')) {
470 			if (isdigit((uch)PEEK())) {
471 				count2 = p_count(p);
472 				(void) REQUIRE(count <= count2, REG_BADBR);
473 			} else		/* single number with comma */
474 				count2 = INFINITY;
475 		} else		/* just a single number */
476 			count2 = count;
477 		repeat(p, pos, count, count2);
478 		if (!EAT('}')) {	/* error heuristics */
479 			while (MORE() && PEEK() != '}')
480 				NEXT();
481 			(void) REQUIRE(MORE(), REG_EBRACE);
482 			SETERROR(REG_BADBR);
483 		}
484 		break;
485 	}
486 
487 	if (!MORE())
488 		return (false);
489 	c = PEEK();
490 	if (!(c == '*' || c == '+' || c == '?' ||
491 	    (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
492 		return (false);
493 	SETERROR(REG_BADRPT);
494 	return (false);
495 }
496 
497 /*
498  * p_str - string (no metacharacters) "parser"
499  */
500 static void
p_str(struct parse * p)501 p_str(struct parse *p)
502 {
503 	(void) REQUIRE(MORE(), REG_BADPAT);
504 	while (MORE())
505 		ordinary(p, WGETNEXT());
506 }
507 
508 /*
509  * Eat consecutive branch delimiters for the kind of expression that we are
510  * parsing, return the number of delimiters that we ate.
511  */
512 static int
p_branch_eat_delim(struct parse * p,struct branchc * bc)513 p_branch_eat_delim(struct parse *p, struct branchc *bc)
514 {
515 	int nskip;
516 
517 	(void) bc;
518 	nskip = 0;
519 	while (EAT('|'))
520 		++nskip;
521 	return (nskip);
522 }
523 
524 /*
525  * Insert necessary branch book-keeping operations. This emits a
526  * bogus 'next' offset, since we still have more to parse
527  */
528 static void
p_branch_ins_offset(struct parse * p,struct branchc * bc)529 p_branch_ins_offset(struct parse *p, struct branchc *bc)
530 {
531 	if (bc->nbranch == 0) {
532 		INSERT(OCH_, bc->start);	/* offset is wrong */
533 		bc->fwd = bc->start;
534 		bc->back = bc->start;
535 	}
536 
537 	ASTERN(OOR1, bc->back);
538 	bc->back = THERE();
539 	AHEAD(bc->fwd);			/* fix previous offset */
540 	bc->fwd = HERE();
541 	EMIT(OOR2, 0);			/* offset is very wrong */
542 	++bc->nbranch;
543 }
544 
545 /*
546  * Fix the offset of the tail branch, if we actually had any branches.
547  * This is to correct the bogus placeholder offset that we use.
548  */
549 static void
p_branch_fix_tail(struct parse * p,struct branchc * bc)550 p_branch_fix_tail(struct parse *p, struct branchc *bc)
551 {
552 	/* Fix bogus offset at the tail if we actually have branches */
553 	if (bc->nbranch > 0) {
554 		AHEAD(bc->fwd);
555 		ASTERN(O_CH, bc->back);
556 	}
557 }
558 
559 /*
560  * Signal to the parser that an empty branch has been encountered; this will,
561  * in the future, be used to allow for more permissive behavior with empty
562  * branches. The return value should indicate whether parsing may continue
563  * or not.
564  */
565 static bool
p_branch_empty(struct parse * p,struct branchc * bc)566 p_branch_empty(struct parse *p, struct branchc *bc)
567 {
568 	(void) bc;
569 	SETERROR(REG_BADPAT);
570 	return (false);
571 }
572 
573 /*
574  * Take care of any branching requirements. This includes inserting the
575  * appropriate branching instructions as well as eating all of the branch
576  * delimiters until we either run out of pattern or need to parse more pattern.
577  */
578 static bool
p_branch_do(struct parse * p,struct branchc * bc)579 p_branch_do(struct parse *p, struct branchc *bc)
580 {
581 	int ate = 0;
582 
583 	ate = p_branch_eat_delim(p, bc);
584 	if (ate == 0)
585 		return (false);
586 	else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
587 		/*
588 		 * Halt parsing only if we have an empty branch and
589 		 * p_branch_empty indicates that we must not continue.
590 		 * In the future, this will not  necessarily be an error.
591 		 */
592 		return (false);
593 	p_branch_ins_offset(p, bc);
594 
595 	return (true);
596 }
597 
598 static void
p_bre_pre_parse(struct parse * p,struct branchc * bc)599 p_bre_pre_parse(struct parse *p, struct branchc *bc)
600 {
601 	(void) bc;
602 	/*
603 	 * Does not move cleanly into expression parser because of
604 	 * ordinary interpration of * at the beginning position of
605 	 * an expression.
606 	 */
607 	if (EAT('^')) {
608 		EMIT(OBOL, 0);
609 		p->g->iflags |= USEBOL;
610 		p->g->nbol++;
611 	}
612 }
613 
614 static void
p_bre_post_parse(struct parse * p,struct branchc * bc)615 p_bre_post_parse(struct parse *p, struct branchc *bc)
616 {
617 	/* Expression is terminating due to EOL token */
618 	if (bc->terminate) {
619 		DROP(1);
620 		EMIT(OEOL, 0);
621 		p->g->iflags |= USEEOL;
622 		p->g->neol++;
623 	}
624 }
625 
626 /*
627  * Top level parser, concatenation and BRE anchoring.
628  * Giving end1 as OUT essentially eliminates the end1/end2 check.
629  *
630  * This implementation is a bit of a kludge, in that a trailing $ is first
631  * taken as an ordinary character and then revised to be an anchor.
632  * The amount of lookahead needed to avoid this kludge is excessive.
633  */
634 static void
p_re(struct parse * p,int end1,int end2)635 p_re(struct parse *p,
636     int end1,	/* first terminating character */
637     int end2)	/* second terminating character; ignored for EREs */
638 {
639 	struct branchc bc;
640 
641 	bc.nbranch = 0;
642 	if (end1 == OUT && end2 == OUT)
643 		bc.outer = true;
644 	else
645 		bc.outer = false;
646 #define	SEEEND()	(!p->bre ? SEE(end1) : SEETWO(end1, end2))
647 	for (;;) {
648 		bc.start = HERE();
649 		bc.nchain = 0;
650 		bc.terminate = false;
651 		if (p->pre_parse != NULL)
652 			p->pre_parse(p, &bc);
653 		while (MORE() && (!p->allowbranch || !SEESPEC('|')) &&
654 		    !SEEEND()) {
655 			bc.terminate = p->parse_expr(p, &bc);
656 			++bc.nchain;
657 		}
658 		if (p->post_parse != NULL)
659 			p->post_parse(p, &bc);
660 		(void) REQUIRE(HERE() != bc.start, REG_BADPAT);
661 		if (!p->allowbranch)
662 			break;
663 		/*
664 		 * p_branch_do's return value indicates whether we should
665 		 * continue parsing or not. This is both for correctness and
666 		 * a slight optimization, because it will check if we've
667 		 * encountered an empty branch or the end of the string
668 		 * immediately following a branch delimiter.
669 		 */
670 		if (!p_branch_do(p, &bc))
671 			break;
672 	}
673 #undef SEE_END
674 	if (p->allowbranch)
675 		p_branch_fix_tail(p, &bc);
676 	assert(!MORE() || SEE(end1));
677 }
678 
679 /*
680  * p_simp_re - parse a simple RE, an atom possibly followed by a repetition
681  */
682 static bool			/* was the simple RE an unbackslashed $? */
p_simp_re(struct parse * p,struct branchc * bc)683 p_simp_re(struct parse *p, struct branchc *bc)
684 {
685 	int c;
686 	int count;
687 	int count2;
688 	sopno pos;
689 	int i;
690 	wint_t wc;
691 	sopno subno;
692 #define	BACKSL	(1<<CHAR_BIT)
693 
694 	pos = HERE();		/* repetition op, if any, covers from here */
695 
696 	assert(MORE());		/* caller should have ensured this */
697 	c = GETNEXT();
698 	if (c == '\\') {
699 		(void) REQUIRE(MORE(), REG_EESCAPE);
700 		c = BACKSL | GETNEXT();
701 	}
702 	switch (c) {
703 	case '.':
704 		if (p->g->cflags&REG_NEWLINE)
705 			nonnewline(p);
706 		else
707 			EMIT(OANY, 0);
708 		break;
709 	case '[':
710 		p_bracket(p);
711 		break;
712 	case BACKSL|'<':
713 		EMIT(OBOW, 0);
714 		break;
715 	case BACKSL|'>':
716 		EMIT(OEOW, 0);
717 		break;
718 	case BACKSL|'{':
719 		SETERROR(REG_BADRPT);
720 		break;
721 	case BACKSL|'(':
722 		p->g->nsub++;
723 		subno = p->g->nsub;
724 		if (subno < NPAREN)
725 			p->pbegin[subno] = HERE();
726 		EMIT(OLPAREN, subno);
727 		/* the MORE here is an error heuristic */
728 		if (MORE() && !SEETWO('\\', ')'))
729 			p_re(p, '\\', ')');
730 		if (subno < NPAREN) {
731 			p->pend[subno] = HERE();
732 			assert(p->pend[subno] != 0);
733 		}
734 		EMIT(ORPAREN, subno);
735 		(void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
736 		break;
737 	case BACKSL|')':	/* should not get here -- must be user */
738 		SETERROR(REG_EPAREN);
739 		break;
740 	case BACKSL|'1':
741 	case BACKSL|'2':
742 	case BACKSL|'3':
743 	case BACKSL|'4':
744 	case BACKSL|'5':
745 	case BACKSL|'6':
746 	case BACKSL|'7':
747 	case BACKSL|'8':
748 	case BACKSL|'9':
749 		i = (c&~BACKSL) - '0';
750 		assert(i < NPAREN);
751 		if (p->pend[i] != 0) {
752 			assert(i <= p->g->nsub);
753 			EMIT(OBACK_, i);
754 			assert(p->pbegin[i] != 0);
755 			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
756 			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
757 			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
758 			EMIT(O_BACK, i);
759 		} else
760 			SETERROR(REG_ESUBREG);
761 		p->g->backrefs = 1;
762 		break;
763 	case '*':
764 		/*
765 		 * Ordinary if used as the first character beyond BOL anchor of
766 		 * a (sub-)expression, counts as a bad repetition operator if it
767 		 * appears otherwise.
768 		 */
769 		(void) REQUIRE(bc->nchain == 0, REG_BADRPT);
770 		/* FALLTHROUGH */
771 	default:
772 		if (p->error != 0)
773 			return (false);	/* Definitely not $... */
774 		p->next--;
775 		wc = WGETNEXT();
776 		ordinary(p, wc);
777 		break;
778 	}
779 
780 	if (EAT('*')) {		/* implemented as +? */
781 		/* this case does not require the (y|) trick, noKLUDGE */
782 		INSERT(OPLUS_, pos);
783 		ASTERN(O_PLUS, pos);
784 		INSERT(OQUEST_, pos);
785 		ASTERN(O_QUEST, pos);
786 	} else if (EATTWO('\\', '{')) {
787 		count = p_count(p);
788 		if (EAT(',')) {
789 			if (MORE() && isdigit((uch)PEEK())) {
790 				count2 = p_count(p);
791 				(void) REQUIRE(count <= count2, REG_BADBR);
792 			} else		/* single number with comma */
793 				count2 = INFINITY;
794 		} else		/* just a single number */
795 			count2 = count;
796 		repeat(p, pos, count, count2);
797 		if (!EATTWO('\\', '}')) {	/* error heuristics */
798 			while (MORE() && !SEETWO('\\', '}'))
799 				NEXT();
800 			(void) REQUIRE(MORE(), REG_EBRACE);
801 			SETERROR(REG_BADBR);
802 		}
803 	} else if (c == '$')	/* $ (but not \$) ends it */
804 		return (true);
805 
806 	return (false);
807 }
808 
809 /*
810  * p_count - parse a repetition count
811  */
812 static int			/* the value */
p_count(struct parse * p)813 p_count(struct parse *p)
814 {
815 	int count = 0;
816 	int ndigits = 0;
817 
818 	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
819 		count = count*10 + (GETNEXT() - '0');
820 		ndigits++;
821 	}
822 
823 	(void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
824 	return (count);
825 }
826 
827 /*
828  * p_bracket - parse a bracketed character list
829  */
830 static void
p_bracket(struct parse * p)831 p_bracket(struct parse *p)
832 {
833 	cset *cs;
834 	wint_t ch;
835 
836 	/* Dept of Truly Sickening Special-Case Kludges */
837 	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
838 		EMIT(OBOW, 0);
839 		NEXTn(6);
840 		return;
841 	}
842 	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
843 		EMIT(OEOW, 0);
844 		NEXTn(6);
845 		return;
846 	}
847 
848 	if ((cs = allocset(p)) == NULL)
849 		return;
850 
851 	if (p->g->cflags&REG_ICASE)
852 		cs->icase = 1;
853 	if (EAT('^'))
854 		cs->invert = 1;
855 	if (EAT(']'))
856 		CHadd(p, cs, ']');
857 	else if (EAT('-'))
858 		CHadd(p, cs, '-');
859 	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
860 		p_b_term(p, cs);
861 	if (EAT('-'))
862 		CHadd(p, cs, '-');
863 	(void) MUSTEAT(']', REG_EBRACK);
864 
865 	if (p->error != 0)	/* don't mess things up further */
866 		return;
867 
868 	if (cs->invert && p->g->cflags&REG_NEWLINE)
869 		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
870 
871 	if ((ch = singleton(cs)) != OUT) {	/* optimize singleton sets */
872 		ordinary(p, ch);
873 		freeset(p, cs);
874 	} else
875 		EMIT(OANYOF, (int)(cs - p->g->sets));
876 }
877 
878 /*
879  * p_b_term - parse one term of a bracketed character list
880  */
881 static void
p_b_term(struct parse * p,cset * cs)882 p_b_term(struct parse *p, cset *cs)
883 {
884 	char c;
885 	wint_t start, finish;
886 	wint_t i;
887 	locale_t loc = uselocale(NULL);
888 
889 	/* classify what we've got */
890 	switch ((MORE()) ? PEEK() : '\0') {
891 	case '[':
892 		c = (MORE2()) ? PEEK2() : '\0';
893 		break;
894 	case '-':
895 		SETERROR(REG_ERANGE);
896 		return;			/* NOTE RETURN */
897 	default:
898 		c = '\0';
899 		break;
900 	}
901 
902 	switch (c) {
903 	case ':':		/* character class */
904 		NEXT2();
905 		(void) REQUIRE(MORE(), REG_EBRACK);
906 		c = PEEK();
907 		(void) REQUIRE(c != '-' && c != ']', REG_ECTYPE);
908 		p_b_cclass(p, cs);
909 		(void) REQUIRE(MORE(), REG_EBRACK);
910 		(void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
911 		break;
912 	case '=':		/* equivalence class */
913 		NEXT2();
914 		(void) REQUIRE(MORE(), REG_EBRACK);
915 		c = PEEK();
916 		(void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
917 		p_b_eclass(p, cs);
918 		(void) REQUIRE(MORE(), REG_EBRACK);
919 		(void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
920 		break;
921 	default:		/* symbol, ordinary character, or range */
922 		start = p_b_symbol(p);
923 		if (SEE('-') && MORE2() && PEEK2() != ']') {
924 			/* range */
925 			NEXT();
926 			if (EAT('-'))
927 				finish = '-';
928 			else
929 				finish = p_b_symbol(p);
930 		} else
931 			finish = start;
932 		if (start == finish)
933 			CHadd(p, cs, start);
934 		else {
935 			if (loc->collate->lc_is_posix) {
936 				(void) REQUIRE((uch)start <= (uch)finish,
937 				    REG_ERANGE);
938 				CHaddrange(p, cs, start, finish);
939 			} else {
940 				(void) REQUIRE(_collate_range_cmp(start,
941 				    finish, loc) <= 0, REG_ERANGE);
942 				for (i = 0; i <= UCHAR_MAX; i++) {
943 					if (_collate_range_cmp(start, i, loc)
944 					    <= 0 &&
945 					    _collate_range_cmp(i, finish, loc)
946 					    <= 0)
947 						CHadd(p, cs, i);
948 				}
949 			}
950 		}
951 		break;
952 	}
953 }
954 
955 /*
956  * p_b_cclass - parse a character-class name and deal with it
957  */
958 static void
p_b_cclass(struct parse * p,cset * cs)959 p_b_cclass(struct parse *p, cset *cs)
960 {
961 	const char *sp = p->next;
962 	size_t len;
963 	wctype_t wct;
964 	char clname[16];
965 
966 	while (MORE() && isalpha((uch)PEEK()))
967 		NEXT();
968 	len = p->next - sp;
969 	if (len >= sizeof (clname) - 1) {
970 		SETERROR(REG_ECTYPE);
971 		return;
972 	}
973 	(void) memcpy(clname, sp, len);
974 	clname[len] = '\0';
975 	if ((wct = wctype(clname)) == 0) {
976 		SETERROR(REG_ECTYPE);
977 		return;
978 	}
979 	CHaddtype(p, cs, wct);
980 }
981 
982 /*
983  * p_b_eclass - parse an equivalence-class name and deal with it
984  *
985  * This implementation is incomplete. xxx
986  */
987 static void
p_b_eclass(struct parse * p,cset * cs)988 p_b_eclass(struct parse *p, cset *cs)
989 {
990 	wint_t c;
991 
992 	c = p_b_coll_elem(p, '=');
993 	CHadd(p, cs, c);
994 }
995 
996 /*
997  * p_b_symbol - parse a character or [..]ed multicharacter collating symbol
998  */
999 static wint_t			/* value of symbol */
p_b_symbol(struct parse * p)1000 p_b_symbol(struct parse *p)
1001 {
1002 	wint_t value;
1003 
1004 	(void) REQUIRE(MORE(), REG_EBRACK);
1005 	if (!EATTWO('[', '.'))
1006 		return (WGETNEXT());
1007 
1008 	/* collating symbol */
1009 	value = p_b_coll_elem(p, '.');
1010 	(void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1011 	return (value);
1012 }
1013 
1014 /*
1015  * p_b_coll_elem - parse a collating-element name and look it up
1016  */
1017 static wint_t			/* value of collating element */
p_b_coll_elem(struct parse * p,wint_t endc)1018 p_b_coll_elem(struct parse *p,
1019     wint_t endc)		/* name ended by endc,']' */
1020 {
1021 	const char *sp = p->next;
1022 	struct cname *cp;
1023 	mbstate_t mbs;
1024 	wchar_t wc;
1025 	size_t clen, len;
1026 
1027 	while (MORE() && !SEETWO(endc, ']'))
1028 		NEXT();
1029 	if (!MORE()) {
1030 		SETERROR(REG_EBRACK);
1031 		return (0);
1032 	}
1033 	len = p->next - sp;
1034 	for (cp = cnames; cp->name != NULL; cp++)
1035 		if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1036 			return (cp->code);	/* known name */
1037 	(void) memset(&mbs, 0, sizeof (mbs));
1038 	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1039 		return (wc);			/* single character */
1040 	else if (clen == (size_t)-1 || clen == (size_t)-2)
1041 		SETERROR(REG_ECHAR);
1042 	else
1043 		SETERROR(REG_ECOLLATE);		/* neither */
1044 	return (0);
1045 }
1046 
1047 /*
1048  * othercase - return the case counterpart of an alphabetic
1049  */
1050 static wint_t			/* if no counterpart, return ch */
othercase(wint_t ch)1051 othercase(wint_t ch)
1052 {
1053 	assert(iswalpha(ch));
1054 	if (iswupper(ch))
1055 		return (towlower(ch));
1056 	else if (iswlower(ch))
1057 		return (towupper(ch));
1058 	else			/* peculiar, but could happen */
1059 		return (ch);
1060 }
1061 
1062 /*
1063  * bothcases - emit a dualcase version of a two-case character
1064  *
1065  * Boy, is this implementation ever a kludge...
1066  */
1067 static void
bothcases(struct parse * p,wint_t ch)1068 bothcases(struct parse *p, wint_t ch)
1069 {
1070 	const char *oldnext = p->next;
1071 	const char *oldend = p->end;
1072 	char bracket[3 + MB_LEN_MAX];
1073 	size_t n;
1074 	mbstate_t mbs;
1075 
1076 	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
1077 	p->next = bracket;
1078 	(void) memset(&mbs, 0, sizeof (mbs));
1079 	n = wcrtomb(bracket, ch, &mbs);
1080 	assert(n != (size_t)-1);
1081 	bracket[n] = ']';
1082 	bracket[n + 1] = '\0';
1083 	p->end = bracket+n+1;
1084 	p_bracket(p);
1085 	assert(p->next == p->end);
1086 	p->next = oldnext;
1087 	p->end = oldend;
1088 }
1089 
1090 /*
1091  * ordinary - emit an ordinary character
1092  */
1093 static void
ordinary(struct parse * p,wint_t ch)1094 ordinary(struct parse *p, wint_t ch)
1095 {
1096 	cset *cs;
1097 
1098 	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1099 		bothcases(p, ch);
1100 	else if ((ch & OPDMASK) == ch)
1101 		EMIT(OCHAR, ch);
1102 	else {
1103 		/*
1104 		 * Kludge: character is too big to fit into an OCHAR operand.
1105 		 * Emit a singleton set.
1106 		 */
1107 		if ((cs = allocset(p)) == NULL)
1108 			return;
1109 		CHadd(p, cs, ch);
1110 		EMIT(OANYOF, (int)(cs - p->g->sets));
1111 	}
1112 }
1113 
1114 /*
1115  * nonnewline - emit REG_NEWLINE version of OANY
1116  *
1117  * Boy, is this implementation ever a kludge...
1118  */
1119 static void
nonnewline(struct parse * p)1120 nonnewline(struct parse *p)
1121 {
1122 	const char *oldnext = p->next;
1123 	const char *oldend = p->end;
1124 	char bracket[4];
1125 
1126 	p->next = bracket;
1127 	p->end = bracket+3;
1128 	bracket[0] = '^';
1129 	bracket[1] = '\n';
1130 	bracket[2] = ']';
1131 	bracket[3] = '\0';
1132 	p_bracket(p);
1133 	assert(p->next == bracket+3);
1134 	p->next = oldnext;
1135 	p->end = oldend;
1136 }
1137 
1138 /*
1139  * repeat - generate code for a bounded repetition, recursively if needed
1140  */
1141 static void
repeat(struct parse * p,sopno start,int from,int to)1142 repeat(struct parse *p,
1143     sopno start,		/* operand from here to end of strip */
1144     int from,			/* repeated from this number */
1145     int to)			/* to this number of times (maybe INFINITY) */
1146 {
1147 	sopno finish = HERE();
1148 #define	N	2
1149 #define	INF	3
1150 #define	REP(f, t)	((f)*8 + (t))
1151 #define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1152 	sopno copy;
1153 
1154 	if (p->error != 0)	/* head off possible runaway recursion */
1155 		return;
1156 
1157 	assert(from <= to);
1158 
1159 	switch (REP(MAP(from), MAP(to))) {
1160 	case REP(0, 0):			/* must be user doing this */
1161 		DROP(finish-start);	/* drop the operand */
1162 		break;
1163 	case REP(0, 1):			/* as x{1,1}? */
1164 	case REP(0, N):			/* as x{1,n}? */
1165 	case REP(0, INF):		/* as x{1,}? */
1166 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1167 		INSERT(OCH_, start);		/* offset is wrong... */
1168 		repeat(p, start+1, 1, to);
1169 		ASTERN(OOR1, start);
1170 		AHEAD(start);			/* ... fix it */
1171 		EMIT(OOR2, 0);
1172 		AHEAD(THERE());
1173 		ASTERN(O_CH, THERETHERE());
1174 		break;
1175 	case REP(1, 1):			/* trivial case */
1176 		/* done */
1177 		break;
1178 	case REP(1, N):			/* as x?x{1,n-1} */
1179 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1180 		INSERT(OCH_, start);
1181 		ASTERN(OOR1, start);
1182 		AHEAD(start);
1183 		EMIT(OOR2, 0);			/* offset very wrong... */
1184 		AHEAD(THERE());			/* ...so fix it */
1185 		ASTERN(O_CH, THERETHERE());
1186 		copy = dupl(p, start+1, finish+1);
1187 		assert(copy == finish+4);
1188 		repeat(p, copy, 1, to-1);
1189 		break;
1190 	case REP(1, INF):		/* as x+ */
1191 		INSERT(OPLUS_, start);
1192 		ASTERN(O_PLUS, start);
1193 		break;
1194 	case REP(N, N):			/* as xx{m-1,n-1} */
1195 		copy = dupl(p, start, finish);
1196 		repeat(p, copy, from-1, to-1);
1197 		break;
1198 	case REP(N, INF):		/* as xx{n-1,INF} */
1199 		copy = dupl(p, start, finish);
1200 		repeat(p, copy, from-1, to);
1201 		break;
1202 	default:			/* "can't happen" */
1203 		SETERROR(REG_EFATAL);	/* just in case */
1204 		break;
1205 	}
1206 }
1207 
1208 /*
1209  * wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1210  * character from the parse struct, signals a REG_ILLSEQ error if the
1211  * character can't be converted. Returns the number of bytes consumed.
1212  */
1213 static wint_t
wgetnext(struct parse * p)1214 wgetnext(struct parse *p)
1215 {
1216 	mbstate_t mbs;
1217 	wchar_t wc;
1218 	size_t n;
1219 
1220 	(void) memset(&mbs, 0, sizeof (mbs));
1221 	n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1222 	if (n == (size_t)-1 || n == (size_t)-2) {
1223 		SETERROR(REG_ECHAR);
1224 		return (0);
1225 	}
1226 	if (n == 0)
1227 		n = 1;
1228 	p->next += n;
1229 	return (wc);
1230 }
1231 
1232 /*
1233  * seterr - set an error condition
1234  */
1235 static int			/* useless but makes type checking happy */
seterr(struct parse * p,int e)1236 seterr(struct parse *p, int e)
1237 {
1238 	if (p->error == 0)	/* keep earliest error condition */
1239 		p->error = e;
1240 	p->next = nuls;		/* try to bring things to a halt */
1241 	p->end = nuls;
1242 	return (0);		/* make the return value well-defined */
1243 }
1244 
1245 /*
1246  * allocset - allocate a set of characters for []
1247  */
1248 static cset *
allocset(struct parse * p)1249 allocset(struct parse *p)
1250 {
1251 	cset *cs, *ncs;
1252 
1253 	ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof (*ncs));
1254 	if (ncs == NULL) {
1255 		SETERROR(REG_ESPACE);
1256 		return (NULL);
1257 	}
1258 	p->g->sets = ncs;
1259 	cs = &p->g->sets[p->g->ncsets++];
1260 	(void) memset(cs, 0, sizeof (*cs));
1261 
1262 	return (cs);
1263 }
1264 
1265 /*
1266  * freeset - free a now-unused set
1267  */
1268 static void
freeset(struct parse * p,cset * cs)1269 freeset(struct parse *p, cset *cs)
1270 {
1271 	cset *top = &p->g->sets[p->g->ncsets];
1272 
1273 	free(cs->wides);
1274 	free(cs->ranges);
1275 	free(cs->types);
1276 	(void) memset(cs, 0, sizeof (*cs));
1277 	if (cs == top-1)	/* recover only the easy case */
1278 		p->g->ncsets--;
1279 }
1280 
1281 /*
1282  * singleton - Determine whether a set contains only one character,
1283  * returning it if so, otherwise returning OUT.
1284  */
1285 static wint_t
singleton(cset * cs)1286 singleton(cset *cs)
1287 {
1288 	wint_t i, s, n;
1289 
1290 	for (i = n = 0; i < NC; i++)
1291 		if (CHIN(cs, i)) {
1292 			n++;
1293 			s = i;
1294 		}
1295 	if (n == 1)
1296 		return (s);
1297 	if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1298 	    cs->icase == 0)
1299 		return (cs->wides[0]);
1300 	/* Don't bother handling the other cases. */
1301 	return (OUT);
1302 }
1303 
1304 /*
1305  * CHadd - add character to character set.
1306  */
1307 static void
CHadd(struct parse * p,cset * cs,wint_t ch)1308 CHadd(struct parse *p, cset *cs, wint_t ch)
1309 {
1310 	wint_t nch, *newwides;
1311 	assert(ch >= 0);
1312 	if (ch < NC)
1313 		cs->bmp[ch >> 3] |= 1 << (ch & 7);
1314 	else {
1315 		newwides = realloc(cs->wides, (cs->nwides + 1) *
1316 		    sizeof (*cs->wides));
1317 		if (newwides == NULL) {
1318 			SETERROR(REG_ESPACE);
1319 			return;
1320 		}
1321 		cs->wides = newwides;
1322 		cs->wides[cs->nwides++] = ch;
1323 	}
1324 	if (cs->icase) {
1325 		if ((nch = towlower(ch)) < NC)
1326 			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1327 		if ((nch = towupper(ch)) < NC)
1328 			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1329 	}
1330 }
1331 
1332 /*
1333  * CHaddrange - add all characters in the range [min,max] to a character set.
1334  */
1335 static void
CHaddrange(struct parse * p,cset * cs,wint_t min,wint_t max)1336 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1337 {
1338 	crange *newranges;
1339 
1340 	for (; min < NC && min <= max; min++)
1341 		CHadd(p, cs, min);
1342 	if (min >= max)
1343 		return;
1344 	newranges = realloc(cs->ranges, (cs->nranges + 1) *
1345 	    sizeof (*cs->ranges));
1346 	if (newranges == NULL) {
1347 		SETERROR(REG_ESPACE);
1348 		return;
1349 	}
1350 	cs->ranges = newranges;
1351 	cs->ranges[cs->nranges].min = min;
1352 	cs->ranges[cs->nranges].max = max;
1353 	cs->nranges++;
1354 }
1355 
1356 /*
1357  * CHaddtype - add all characters of a certain type to a character set.
1358  */
1359 static void
CHaddtype(struct parse * p,cset * cs,wctype_t wct)1360 CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1361 {
1362 	wint_t i;
1363 	wctype_t *newtypes;
1364 
1365 	for (i = 0; i < NC; i++)
1366 		if (iswctype(i, wct))
1367 			CHadd(p, cs, i);
1368 	newtypes = realloc(cs->types, (cs->ntypes + 1) *
1369 	    sizeof (*cs->types));
1370 	if (newtypes == NULL) {
1371 		SETERROR(REG_ESPACE);
1372 		return;
1373 	}
1374 	cs->types = newtypes;
1375 	cs->types[cs->ntypes++] = wct;
1376 }
1377 
1378 /*
1379  * dupl - emit a duplicate of a bunch of sops
1380  */
1381 static sopno			/* start of duplicate */
dupl(struct parse * p,sopno start,sopno finish)1382 dupl(struct parse *p,
1383     sopno start,		/* from here */
1384     sopno finish)		/* to this less one */
1385 {
1386 	sopno ret = HERE();
1387 	sopno len = finish - start;
1388 
1389 	assert(finish >= start);
1390 	if (len == 0)
1391 		return (ret);
1392 	if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1393 		return (ret);
1394 	assert(p->ssize >= p->slen + len);
1395 	(void) memcpy((char *)(p->strip + p->slen),
1396 	    (char *)(p->strip + start), (size_t)len*sizeof (sop));
1397 	p->slen += len;
1398 	return (ret);
1399 }
1400 
1401 /*
1402  * doemit - emit a strip operator
1403  *
1404  * It might seem better to implement this as a macro with a function as
1405  * hard-case backup, but it's just too big and messy unless there are
1406  * some changes to the data structures.  Maybe later.
1407  */
1408 static void
doemit(struct parse * p,sop op,size_t opnd)1409 doemit(struct parse *p, sop op, size_t opnd)
1410 {
1411 	/* avoid making error situations worse */
1412 	if (p->error != 0)
1413 		return;
1414 
1415 	/* deal with oversize operands ("can't happen", more or less) */
1416 	assert(opnd < 1<<OPSHIFT);
1417 
1418 	/* deal with undersized strip */
1419 	if (p->slen >= p->ssize)
1420 		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
1421 			return;
1422 
1423 	/* finally, it's all reduced to the easy case */
1424 	p->strip[p->slen++] = SOP(op, opnd);
1425 }
1426 
1427 /*
1428  * doinsert - insert a sop into the strip
1429  */
1430 static void
doinsert(struct parse * p,sop op,size_t opnd,sopno pos)1431 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1432 {
1433 	sopno sn;
1434 	sop s;
1435 	int i;
1436 
1437 	/* avoid making error situations worse */
1438 	if (p->error != 0)
1439 		return;
1440 
1441 	sn = HERE();
1442 	EMIT(op, opnd);		/* do checks, ensure space */
1443 	assert(HERE() == sn+1);
1444 	s = p->strip[sn];
1445 
1446 	/* adjust paren pointers */
1447 	assert(pos > 0);
1448 	for (i = 1; i < NPAREN; i++) {
1449 		if (p->pbegin[i] >= pos) {
1450 			p->pbegin[i]++;
1451 		}
1452 		if (p->pend[i] >= pos) {
1453 			p->pend[i]++;
1454 		}
1455 	}
1456 
1457 	(void) memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1458 	    (HERE()-pos-1)*sizeof (sop));
1459 	p->strip[pos] = s;
1460 }
1461 
1462 /*
1463  * dofwd - complete a forward reference
1464  */
1465 static void
dofwd(struct parse * p,sopno pos,sop value)1466 dofwd(struct parse *p, sopno pos, sop value)
1467 {
1468 	/* avoid making error situations worse */
1469 	if (p->error != 0)
1470 		return;
1471 
1472 	assert(value < 1<<OPSHIFT);
1473 	p->strip[pos] = OP(p->strip[pos]) | value;
1474 }
1475 
1476 /*
1477  * enlarge - enlarge the strip
1478  */
1479 static int
enlarge(struct parse * p,sopno size)1480 enlarge(struct parse *p, sopno size)
1481 {
1482 	sop *sp;
1483 
1484 	if (p->ssize >= size)
1485 		return (1);
1486 
1487 	sp = (sop *)realloc(p->strip, size*sizeof (sop));
1488 	if (sp == NULL) {
1489 		SETERROR(REG_ESPACE);
1490 		return (0);
1491 	}
1492 	p->strip = sp;
1493 	p->ssize = size;
1494 	return (1);
1495 }
1496 
1497 /*
1498  * stripsnug - compact the strip
1499  */
1500 static void
stripsnug(struct parse * p,struct re_guts * g)1501 stripsnug(struct parse *p, struct re_guts *g)
1502 {
1503 	g->nstates = p->slen;
1504 	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof (sop));
1505 	if (g->strip == NULL) {
1506 		SETERROR(REG_ESPACE);
1507 		g->strip = p->strip;
1508 	}
1509 }
1510 
1511 /*
1512  * findmust - fill in must and mlen with longest mandatory literal string
1513  *
1514  * This algorithm could do fancy things like analyzing the operands of |
1515  * for common subsequences.  Someday.  This code is simple and finds most
1516  * of the interesting cases.
1517  *
1518  * Note that must and mlen got initialized during setup.
1519  */
1520 static void
findmust(struct parse * p,struct re_guts * g)1521 findmust(struct parse *p, struct re_guts *g)
1522 {
1523 	sop *scan;
1524 	sop *start = NULL;
1525 	sop *newstart = NULL;
1526 	sopno newlen;
1527 	sop s;
1528 	char *cp;
1529 	int offset;
1530 	char buf[MB_LEN_MAX];
1531 	size_t clen;
1532 	mbstate_t mbs;
1533 	locale_t loc = uselocale(NULL);
1534 
1535 	/* avoid making error situations worse */
1536 	if (p->error != 0)
1537 		return;
1538 
1539 	/*
1540 	 * It's not generally safe to do a ``char'' substring search on
1541 	 * multibyte character strings, but it's safe for at least
1542 	 * UTF-8 (see RFC 3629).
1543 	 */
1544 	if (MB_CUR_MAX > 1 &&
1545 	    strcmp(loc->runelocale->__encoding, "UTF-8") != 0)
1546 		return;
1547 
1548 	/* find the longest OCHAR sequence in strip */
1549 	newlen = 0;
1550 	offset = 0;
1551 	g->moffset = 0;
1552 	scan = g->strip + 1;
1553 	do {
1554 		s = *scan++;
1555 		switch (OP(s)) {
1556 		case OCHAR:		/* sequence member */
1557 			if (newlen == 0) {		/* new sequence */
1558 				(void) memset(&mbs, 0, sizeof (mbs));
1559 				newstart = scan - 1;
1560 			}
1561 			clen = wcrtomb(buf, OPND(s), &mbs);
1562 			if (clen == (size_t)-1)
1563 				goto toohard;
1564 			newlen += clen;
1565 			break;
1566 		case OPLUS_:		/* things that don't break one */
1567 		case OLPAREN:
1568 		case ORPAREN:
1569 			break;
1570 		case OQUEST_:		/* things that must be skipped */
1571 		case OCH_:
1572 			offset = altoffset(scan, offset);
1573 			scan--;
1574 			do {
1575 				scan += OPND(s);
1576 				s = *scan;
1577 				/* assert() interferes w debug printouts */
1578 				if (OP(s) != (sop)O_QUEST &&
1579 				    OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) {
1580 					g->iflags |= BAD;
1581 					return;
1582 				}
1583 			} while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1584 			/* FALLTHROUGH */
1585 		case OBOW:		/* things that break a sequence */
1586 		case OEOW:
1587 		case OBOL:
1588 		case OEOL:
1589 		case O_QUEST:
1590 		case O_CH:
1591 		case OEND:
1592 			if (newlen > (sopno)g->mlen) {		/* ends one */
1593 				start = newstart;
1594 				g->mlen = newlen;
1595 				if (offset > -1) {
1596 					g->moffset += offset;
1597 					offset = newlen;
1598 				} else
1599 					g->moffset = offset;
1600 			} else {
1601 				if (offset > -1)
1602 					offset += newlen;
1603 			}
1604 			newlen = 0;
1605 			break;
1606 		case OANY:
1607 			if (newlen > (sopno)g->mlen) {		/* ends one */
1608 				start = newstart;
1609 				g->mlen = newlen;
1610 				if (offset > -1) {
1611 					g->moffset += offset;
1612 					offset = newlen;
1613 				} else
1614 					g->moffset = offset;
1615 			} else {
1616 				if (offset > -1)
1617 					offset += newlen;
1618 			}
1619 			if (offset > -1)
1620 				offset++;
1621 			newlen = 0;
1622 			break;
1623 		case OANYOF:		/* may or may not invalidate offset */
1624 			/* First, everything as OANY */
1625 			if (newlen > (sopno)g->mlen) {		/* ends one */
1626 				start = newstart;
1627 				g->mlen = newlen;
1628 				if (offset > -1) {
1629 					g->moffset += offset;
1630 					offset = newlen;
1631 				} else
1632 					g->moffset = offset;
1633 			} else {
1634 				if (offset > -1)
1635 					offset += newlen;
1636 			}
1637 			if (offset > -1)
1638 				offset++;
1639 			newlen = 0;
1640 			break;
1641 		toohard:
1642 		default:
1643 			/*
1644 			 * Anything here makes it impossible or too hard
1645 			 * to calculate the offset -- so we give up;
1646 			 * save the last known good offset, in case the
1647 			 * must sequence doesn't occur later.
1648 			 */
1649 			if (newlen > (sopno)g->mlen) {		/* ends one */
1650 				start = newstart;
1651 				g->mlen = newlen;
1652 				if (offset > -1)
1653 					g->moffset += offset;
1654 				else
1655 					g->moffset = offset;
1656 			}
1657 			offset = -1;
1658 			newlen = 0;
1659 			break;
1660 		}
1661 	} while (OP(s) != OEND);
1662 
1663 	if (g->mlen == 0) {		/* there isn't one */
1664 		g->moffset = -1;
1665 		return;
1666 	}
1667 
1668 	/* turn it into a character string */
1669 	g->must = malloc((size_t)g->mlen + 1);
1670 	if (g->must == NULL) {		/* argh; just forget it */
1671 		g->mlen = 0;
1672 		g->moffset = -1;
1673 		return;
1674 	}
1675 	cp = g->must;
1676 	scan = start;
1677 	(void) memset(&mbs, 0, sizeof (mbs));
1678 	while (cp < g->must + g->mlen) {
1679 		while (OP(s = *scan++) != OCHAR)
1680 			continue;
1681 		clen = wcrtomb(cp, OPND(s), &mbs);
1682 		assert(clen != (size_t)-1);
1683 		cp += clen;
1684 	}
1685 	assert(cp == g->must + g->mlen);
1686 	*cp++ = '\0';		/* just on general principles */
1687 }
1688 
1689 /*
1690  * altoffset - choose biggest offset among multiple choices
1691  *
1692  * Compute, recursively if necessary, the largest offset among multiple
1693  * re paths.
1694  */
1695 static int
altoffset(sop * scan,int offset)1696 altoffset(sop *scan, int offset)
1697 {
1698 	int largest;
1699 	int try;
1700 	sop s;
1701 
1702 	/* If we gave up already on offsets, return */
1703 	if (offset == -1)
1704 		return (-1);
1705 
1706 	largest = 0;
1707 	try = 0;
1708 	s = *scan++;
1709 	while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) {
1710 		switch (OP(s)) {
1711 		case OOR1:
1712 			if (try > largest)
1713 				largest = try;
1714 			try = 0;
1715 			break;
1716 		case OQUEST_:
1717 		case OCH_:
1718 			try = altoffset(scan, try);
1719 			if (try == -1)
1720 				return (-1);
1721 			scan--;
1722 			do {
1723 				scan += OPND(s);
1724 				s = *scan;
1725 				if (OP(s) != (sop)O_QUEST &&
1726 				    OP(s) != (sop)O_CH && OP(s) != (sop)OOR2)
1727 					return (-1);
1728 			} while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1729 			/*
1730 			 * We must skip to the next position, or we'll
1731 			 * leave altoffset() too early.
1732 			 */
1733 			scan++;
1734 			break;
1735 		case OANYOF:
1736 		case OCHAR:
1737 		case OANY:
1738 			try++;
1739 			/*FALLTHRU*/
1740 		case OBOW:
1741 		case OEOW:
1742 		case OLPAREN:
1743 		case ORPAREN:
1744 		case OOR2:
1745 			break;
1746 		default:
1747 			try = -1;
1748 			break;
1749 		}
1750 		if (try == -1)
1751 			return (-1);
1752 		s = *scan++;
1753 	}
1754 
1755 	if (try > largest)
1756 		largest = try;
1757 
1758 	return (largest+offset);
1759 }
1760 
1761 /*
1762  * computejumps - compute char jumps for BM scan
1763  *
1764  * This algorithm assumes g->must exists and is has size greater than
1765  * zero. It's based on the algorithm found on Computer Algorithms by
1766  * Sara Baase.
1767  *
1768  * A char jump is the number of characters one needs to jump based on
1769  * the value of the character from the text that was mismatched.
1770  */
1771 static void
computejumps(struct parse * p,struct re_guts * g)1772 computejumps(struct parse *p, struct re_guts *g)
1773 {
1774 	int ch;
1775 	int mindex;
1776 
1777 	/* Avoid making errors worse */
1778 	if (p->error != 0)
1779 		return;
1780 
1781 	g->charjump = (int *)malloc((NC_MAX + 1) * sizeof (int));
1782 	if (g->charjump == NULL)	/* Not a fatal error */
1783 		return;
1784 	/* Adjust for signed chars, if necessary */
1785 	g->charjump = &g->charjump[-(CHAR_MIN)];
1786 
1787 	/*
1788 	 * If the character does not exist in the pattern, the jump
1789 	 * is equal to the number of characters in the pattern.
1790 	 */
1791 	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1792 		g->charjump[ch] = g->mlen;
1793 
1794 	/*
1795 	 * If the character does exist, compute the jump that would
1796 	 * take us to the last character in the pattern equal to it
1797 	 * (notice that we match right to left, so that last character
1798 	 * is the first one that would be matched).
1799 	 */
1800 	for (mindex = 0; mindex < g->mlen; mindex++)
1801 		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1802 }
1803 
1804 /*
1805  * computematchjumps - compute match jumps for BM scan
1806  *
1807  * This algorithm assumes g->must exists and is has size greater than
1808  * zero. It's based on the algorithm found on Computer Algorithms by
1809  * Sara Baase.
1810  *
1811  * A match jump is the number of characters one needs to advance based
1812  * on the already-matched suffix.
1813  * Notice that all values here are minus (g->mlen-1), because of the way
1814  * the search algorithm works.
1815  */
1816 static void
computematchjumps(struct parse * p,struct re_guts * g)1817 computematchjumps(struct parse *p, struct re_guts *g)
1818 {
1819 	int mindex;		/* General "must" iterator */
1820 	int suffix;		/* Keeps track of matching suffix */
1821 	int ssuffix;		/* Keeps track of suffixes' suffix */
1822 	int *pmatches;
1823 				/*
1824 				 * pmatches[k] points to the next i
1825 				 * such that i+1...mlen is a substring
1826 				 * of k+1...k+mlen-i-1
1827 				 */
1828 
1829 	/* Avoid making errors worse */
1830 	if (p->error != 0)
1831 		return;
1832 
1833 	pmatches = (int *)malloc(g->mlen * sizeof (unsigned int));
1834 	if (pmatches == NULL) {
1835 		g->matchjump = NULL;
1836 		return;
1837 	}
1838 
1839 	g->matchjump = (int *)malloc(g->mlen * sizeof (unsigned int));
1840 	if (g->matchjump == NULL) {	/* Not a fatal error */
1841 		free(pmatches);
1842 		return;
1843 	}
1844 
1845 	/* Set maximum possible jump for each character in the pattern */
1846 	for (mindex = 0; mindex < g->mlen; mindex++)
1847 		g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1848 
1849 	/* Compute pmatches[] */
1850 	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1851 	    mindex--, suffix--) {
1852 		pmatches[mindex] = suffix;
1853 
1854 		/*
1855 		 * If a mismatch is found, interrupting the substring,
1856 		 * compute the matchjump for that position. If no
1857 		 * mismatch is found, then a text substring mismatched
1858 		 * against the suffix will also mismatch against the
1859 		 * substring.
1860 		 */
1861 		while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) {
1862 			g->matchjump[suffix] = MIN(g->matchjump[suffix],
1863 			    g->mlen - mindex - 1);
1864 			suffix = pmatches[suffix];
1865 		}
1866 	}
1867 
1868 	/*
1869 	 * Compute the matchjump up to the last substring found to jump
1870 	 * to the beginning of the largest must pattern prefix matching
1871 	 * it's own suffix.
1872 	 */
1873 	for (mindex = 0; mindex <= suffix; mindex++)
1874 		g->matchjump[mindex] = MIN(g->matchjump[mindex],
1875 		    g->mlen + suffix - mindex);
1876 
1877 	ssuffix = pmatches[suffix];
1878 	while (suffix < g->mlen) {
1879 		while (suffix <= ssuffix && suffix < g->mlen) {
1880 			g->matchjump[suffix] = MIN(g->matchjump[suffix],
1881 			    g->mlen + ssuffix - suffix);
1882 			suffix++;
1883 		}
1884 		if (suffix < g->mlen)
1885 			ssuffix = pmatches[ssuffix];
1886 	}
1887 
1888 	free(pmatches);
1889 }
1890 
1891 /*
1892  * pluscount - count + nesting
1893  */
1894 static sopno			/* nesting depth */
pluscount(struct parse * p,struct re_guts * g)1895 pluscount(struct parse *p, struct re_guts *g)
1896 {
1897 	sop *scan;
1898 	sop s;
1899 	sopno plusnest = 0;
1900 	sopno maxnest = 0;
1901 
1902 	if (p->error != 0)
1903 		return (0);	/* there may not be an OEND */
1904 
1905 	scan = g->strip + 1;
1906 	do {
1907 		s = *scan++;
1908 		switch (OP(s)) {
1909 		case OPLUS_:
1910 			plusnest++;
1911 			break;
1912 		case O_PLUS:
1913 			if (plusnest > maxnest)
1914 				maxnest = plusnest;
1915 			plusnest--;
1916 			break;
1917 		}
1918 	} while (OP(s) != OEND);
1919 	if (plusnest != 0)
1920 		g->iflags |= BAD;
1921 	return (maxnest);
1922 }
1923