1/*
2 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3 * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
4 * Copyright 2012 Milan Jurik. All rights reserved.
5 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
6 * Copyright (c) 1992, 1993, 1994
7 *	The Regents of the University of California.  All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * Henry Spencer.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 */
36
37#include "lint.h"
38#include "file64.h"
39#include <sys/types.h>
40#include <stdio.h>
41#include <string.h>
42#include <ctype.h>
43#include <limits.h>
44#include <regex.h>
45#include <stdlib.h>
46#include <stdbool.h>
47#include <wchar.h>
48#include <wctype.h>
49
50#include "../locale/runetype.h"
51#include "../locale/collate.h"
52
53#include "utils.h"
54#include "regex2.h"
55
56#include "cname.h"
57#include "../locale/mblocal.h"
58
59/*
60 * Branching context, used to keep track of branch state for all of the branch-
61 * aware functions. In addition to keeping track of branch positions for the
62 * p_branch_* functions, we use this to simplify some clumsiness in BREs for
63 * detection of whether ^ is acting as an anchor or being used erroneously and
64 * also for whether we're in a sub-expression or not.
65 */
66struct branchc {
67	sopno start;
68	sopno back;
69	sopno fwd;
70
71	int nbranch;
72	int nchain;
73	bool outer;
74	bool terminate;
75};
76
77/*
78 * parse structure, passed up and down to avoid global variables and
79 * other clumsinesses
80 */
81struct parse {
82	const char *next;	/* next character in RE */
83	const char *end;	/* end of string (-> NUL normally) */
84	int error;		/* has an error been seen? */
85	sop *strip;		/* malloced strip */
86	sopno ssize;		/* malloced strip size (allocated) */
87	sopno slen;		/* malloced strip length (used) */
88	int ncsalloc;		/* number of csets allocated */
89	struct re_guts *g;
90#define	NPAREN	10		/* we need to remember () 1-9 for back refs */
91	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
92	sopno pend[NPAREN];	/* -> ) ([0] unused) */
93	bool allowbranch;	/* can this expression branch? */
94	bool bre;		/* convenience; is this a BRE? */
95	bool (*parse_expr)(struct parse *, struct branchc *);
96	void (*pre_parse)(struct parse *, struct branchc *);
97	void (*post_parse)(struct parse *, struct branchc *);
98};
99
100/* ========= begin header generated by ./mkh ========= */
101#ifdef __cplusplus
102extern "C" {
103#endif
104
105/* === regcomp.c === */
106static bool p_ere_exp(struct parse *p, struct branchc *bc);
107static void p_str(struct parse *p);
108static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
109static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
110static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
111static bool p_branch_empty(struct parse *p, struct branchc *bc);
112static bool p_branch_do(struct parse *p, struct branchc *bc);
113static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
114static void p_bre_post_parse(struct parse *p, struct branchc *bc);
115static void p_re(struct parse *p, int end1, int end2);
116static bool p_simp_re(struct parse *p, struct branchc *bc);
117static int p_count(struct parse *p);
118static void p_bracket(struct parse *p);
119static void p_b_term(struct parse *p, cset *cs);
120static void p_b_cclass(struct parse *p, cset *cs);
121static void p_b_eclass(struct parse *p, cset *cs);
122static wint_t p_b_symbol(struct parse *p);
123static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
124static wint_t othercase(wint_t ch);
125static void bothcases(struct parse *p, wint_t ch);
126static void ordinary(struct parse *p, wint_t ch);
127static void nonnewline(struct parse *p);
128static void repeat(struct parse *p, sopno start, int from, int to);
129static int seterr(struct parse *p, int e);
130static cset *allocset(struct parse *p);
131static void freeset(struct parse *p, cset *cs);
132static void CHadd(struct parse *p, cset *cs, wint_t ch);
133static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
134static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
135static wint_t singleton(cset *cs);
136static sopno dupl(struct parse *p, sopno start, sopno finish);
137static void doemit(struct parse *p, sop op, size_t opnd);
138static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
139static void dofwd(struct parse *p, sopno pos, sop value);
140static int enlarge(struct parse *p, sopno size);
141static void stripsnug(struct parse *p, struct re_guts *g);
142static void findmust(struct parse *p, struct re_guts *g);
143static int altoffset(sop *scan, int offset);
144static void computejumps(struct parse *p, struct re_guts *g);
145static void computematchjumps(struct parse *p, struct re_guts *g);
146static sopno pluscount(struct parse *p, struct re_guts *g);
147static wint_t wgetnext(struct parse *p);
148
149#ifdef __cplusplus
150}
151#endif
152/* ========= end header generated by ./mkh ========= */
153
154static char nuls[10];		/* place to point scanner in event of error */
155
156/*
157 * macros for use with parse structure
158 * BEWARE:  these know that the parse structure is named `p' !!!
159 */
160#define	PEEK()	(*p->next)
161#define	PEEK2()	(*(p->next+1))
162#define	MORE()	(p->next < p->end)
163#define	MORE2()	(p->next+1 < p->end)
164#define	SEE(c)	(MORE() && PEEK() == (c))
165#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
166#define	SEESPEC(a)	(p->bre ? SEETWO('\\', a) : SEE(a))
167#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
168#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
169#define	NEXT()	(p->next++)
170#define	NEXT2()	(p->next += 2)
171#define	NEXTn(n)	(p->next += (n))
172#define	GETNEXT()	(*p->next++)
173#define	WGETNEXT()	wgetnext(p)
174#define	SETERROR(e)	((void)seterr(p, (e)))
175#define	REQUIRE(co, e)	((co) || seterr(p, e))
176#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
177#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
178#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
179#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
180#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
181#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
182#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
183#define	HERE()		(p->slen)
184#define	THERE()		(p->slen - 1)
185#define	THERETHERE()	(p->slen - 2)
186#define	DROP(n)	(p->slen -= (n))
187
188#ifndef NDEBUG
189static int never = 0;		/* for use in asserts; shuts lint up */
190#else
191#define	never	0		/* some <assert.h>s have bugs too */
192#endif
193
194/*
195 * regcomp - interface for parser and compilation
196 */
197int				/* 0 success, otherwise REG_something */
198regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern,
199    int cflags)
200{
201	struct parse pa;
202	struct re_guts *g;
203	struct parse *p = &pa;
204	int i;
205	size_t len;
206	size_t maxlen;
207#ifdef REDEBUG
208#define	GOODFLAGS(f)	(f)
209#else
210#define	GOODFLAGS(f)	((f)&~REG_DUMP)
211#endif
212
213	/* We had REG_INVARG, but we don't have that on Solaris. */
214	cflags = GOODFLAGS(cflags);
215	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
216		return (REG_EFATAL);
217
218	if (cflags&REG_PEND) {
219		if (preg->re_endp < pattern)
220			return (REG_EFATAL);
221		len = preg->re_endp - pattern;
222	} else
223		len = strlen(pattern);
224
225	/* do the mallocs early so failure handling is easy */
226	g = (struct re_guts *)malloc(sizeof (struct re_guts));
227	if (g == NULL)
228		return (REG_ESPACE);
229	/*
230	 * Limit the pattern space to avoid a 32-bit overflow on buffer
231	 * extension.  Also avoid any signed overflow in case of conversion
232	 * so make the real limit based on a 31-bit overflow.
233	 *
234	 * Likely not applicable on 64-bit systems but handle the case
235	 * generically (who are we to stop people from using ~715MB+
236	 * patterns?).
237	 */
238	maxlen = ((size_t)-1 >> 1) / sizeof (sop) * 2 / 3;
239	if (len >= maxlen) {
240		free((char *)g);
241		return (REG_ESPACE);
242	}
243	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
244	assert(p->ssize >= len);
245
246	p->strip = (sop *)malloc(p->ssize * sizeof (sop));
247	p->slen = 0;
248	if (p->strip == NULL) {
249		free((char *)g);
250		return (REG_ESPACE);
251	}
252
253	/* set things up */
254	p->g = g;
255	p->next = pattern;	/* convenience; we do not modify it */
256	p->end = p->next + len;
257	p->error = 0;
258	p->ncsalloc = 0;
259	for (i = 0; i < NPAREN; i++) {
260		p->pbegin[i] = 0;
261		p->pend[i] = 0;
262	}
263	if (cflags & REG_EXTENDED) {
264		p->allowbranch = true;
265		p->bre = false;
266		p->parse_expr = p_ere_exp;
267		p->pre_parse = NULL;
268		p->post_parse = NULL;
269	} else {
270		p->allowbranch = false;
271		p->bre = true;
272		p->parse_expr = p_simp_re;
273		p->pre_parse = p_bre_pre_parse;
274		p->post_parse = p_bre_post_parse;
275	}
276	g->sets = NULL;
277	g->ncsets = 0;
278	g->cflags = cflags;
279	g->iflags = 0;
280	g->nbol = 0;
281	g->neol = 0;
282	g->must = NULL;
283	g->moffset = -1;
284	g->charjump = NULL;
285	g->matchjump = NULL;
286	g->mlen = 0;
287	g->nsub = 0;
288	g->backrefs = 0;
289
290	/* do it */
291	EMIT(OEND, 0);
292	g->firststate = THERE();
293	if (cflags & REG_NOSPEC)
294		p_str(p);
295	else
296		p_re(p, OUT, OUT);
297	EMIT(OEND, 0);
298	g->laststate = THERE();
299
300	/* tidy up loose ends and fill things in */
301	stripsnug(p, g);
302	findmust(p, g);
303	/*
304	 * only use Boyer-Moore algorithm if the pattern is bigger
305	 * than three characters
306	 */
307	if (g->mlen > 3) {
308		computejumps(p, g);
309		computematchjumps(p, g);
310		if (g->matchjump == NULL && g->charjump != NULL) {
311			free(g->charjump);
312			g->charjump = NULL;
313		}
314	}
315	g->nplus = pluscount(p, g);
316	g->magic = MAGIC2;
317	preg->re_nsub = g->nsub;
318	preg->re_g = g;
319	preg->re_magic = MAGIC1;
320#ifndef REDEBUG
321	/* not debugging, so can't rely on the assert() in regexec() */
322	if (g->iflags&BAD)
323		SETERROR(REG_EFATAL);
324#endif
325
326	/* win or lose, we're done */
327	if (p->error != 0)	/* lose */
328		regfree(preg);
329	return (p->error);
330}
331
332/*
333 * Parse one subERE, an atom possibly followed by a repetition op,
334 * return whether we should terminate or not.
335 */
336static bool
337p_ere_exp(struct parse *p, struct branchc *bc)
338{
339	char c;
340	wint_t wc;
341	sopno pos;
342	int count;
343	int count2;
344	sopno subno;
345	int wascaret = 0;
346
347	(void) bc;
348	assert(MORE());		/* caller should have ensured this */
349	c = GETNEXT();
350
351	pos = HERE();
352	switch (c) {
353	case '(':
354		(void) REQUIRE(MORE(), REG_EPAREN);
355		p->g->nsub++;
356		subno = p->g->nsub;
357		if (subno < NPAREN)
358			p->pbegin[subno] = HERE();
359		EMIT(OLPAREN, subno);
360		if (!SEE(')'))
361			p_re(p, ')', IGN);
362		if (subno < NPAREN) {
363			p->pend[subno] = HERE();
364			assert(p->pend[subno] != 0);
365		}
366		EMIT(ORPAREN, subno);
367		(void) MUSTEAT(')', REG_EPAREN);
368		break;
369#ifndef POSIX_MISTAKE
370	case ')':		/* happens only if no current unmatched ( */
371		/*
372		 * You may ask, why the ifndef?  Because I didn't notice
373		 * this until slightly too late for 1003.2, and none of the
374		 * other 1003.2 regular-expression reviewers noticed it at
375		 * all.  So an unmatched ) is legal POSIX, at least until
376		 * we can get it fixed.
377		 */
378		SETERROR(REG_EPAREN);
379		break;
380#endif
381	case '^':
382		EMIT(OBOL, 0);
383		p->g->iflags |= USEBOL;
384		p->g->nbol++;
385		wascaret = 1;
386		break;
387	case '$':
388		EMIT(OEOL, 0);
389		p->g->iflags |= USEEOL;
390		p->g->neol++;
391		break;
392	case '|':
393		SETERROR(REG_BADPAT);
394		break;
395	case '*':
396	case '+':
397	case '?':
398	case '{':
399		SETERROR(REG_BADRPT);
400		break;
401	case '.':
402		if (p->g->cflags&REG_NEWLINE)
403			nonnewline(p);
404		else
405			EMIT(OANY, 0);
406		break;
407	case '[':
408		p_bracket(p);
409		break;
410	case '\\':
411		(void) REQUIRE(MORE(), REG_EESCAPE);
412		wc = WGETNEXT();
413		switch (wc) {
414		case '<':
415			EMIT(OBOW, 0);
416			break;
417		case '>':
418			EMIT(OEOW, 0);
419			break;
420		default:
421			ordinary(p, wc);
422			break;
423		}
424		break;
425	default:
426		if (p->error != 0)
427			return (false);
428		p->next--;
429		wc = WGETNEXT();
430		ordinary(p, wc);
431		break;
432	}
433
434	if (!MORE())
435		return (false);
436	c = PEEK();
437	/* we call { a repetition if followed by a digit */
438	if (!(c == '*' || c == '+' || c == '?' || c == '{'))
439		return (false);		/* no repetition, we're done */
440	else if (c == '{')
441		(void) REQUIRE(MORE2() && \
442		    (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
443	NEXT();
444
445	(void) REQUIRE(!wascaret, REG_BADRPT);
446	switch (c) {
447	case '*':	/* implemented as +? */
448		/* this case does not require the (y|) trick, noKLUDGE */
449		INSERT(OPLUS_, pos);
450		ASTERN(O_PLUS, pos);
451		INSERT(OQUEST_, pos);
452		ASTERN(O_QUEST, pos);
453		break;
454	case '+':
455		INSERT(OPLUS_, pos);
456		ASTERN(O_PLUS, pos);
457		break;
458	case '?':
459		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
460		INSERT(OCH_, pos);		/* offset slightly wrong */
461		ASTERN(OOR1, pos);		/* this one's right */
462		AHEAD(pos);			/* fix the OCH_ */
463		EMIT(OOR2, 0);			/* offset very wrong... */
464		AHEAD(THERE());			/* ...so fix it */
465		ASTERN(O_CH, THERETHERE());
466		break;
467	case '{':
468		count = p_count(p);
469		if (EAT(',')) {
470			if (isdigit((uch)PEEK())) {
471				count2 = p_count(p);
472				(void) REQUIRE(count <= count2, REG_BADBR);
473			} else		/* single number with comma */
474				count2 = INFINITY;
475		} else		/* just a single number */
476			count2 = count;
477		repeat(p, pos, count, count2);
478		if (!EAT('}')) {	/* error heuristics */
479			while (MORE() && PEEK() != '}')
480				NEXT();
481			(void) REQUIRE(MORE(), REG_EBRACE);
482			SETERROR(REG_BADBR);
483		}
484		break;
485	}
486
487	if (!MORE())
488		return (false);
489	c = PEEK();
490	if (!(c == '*' || c == '+' || c == '?' ||
491	    (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
492		return (false);
493	SETERROR(REG_BADRPT);
494	return (false);
495}
496
497/*
498 * p_str - string (no metacharacters) "parser"
499 */
500static void
501p_str(struct parse *p)
502{
503	(void) REQUIRE(MORE(), REG_BADPAT);
504	while (MORE())
505		ordinary(p, WGETNEXT());
506}
507
508/*
509 * Eat consecutive branch delimiters for the kind of expression that we are
510 * parsing, return the number of delimiters that we ate.
511 */
512static int
513p_branch_eat_delim(struct parse *p, struct branchc *bc)
514{
515	int nskip;
516
517	(void) bc;
518	nskip = 0;
519	while (EAT('|'))
520		++nskip;
521	return (nskip);
522}
523
524/*
525 * Insert necessary branch book-keeping operations. This emits a
526 * bogus 'next' offset, since we still have more to parse
527 */
528static void
529p_branch_ins_offset(struct parse *p, struct branchc *bc)
530{
531	if (bc->nbranch == 0) {
532		INSERT(OCH_, bc->start);	/* offset is wrong */
533		bc->fwd = bc->start;
534		bc->back = bc->start;
535	}
536
537	ASTERN(OOR1, bc->back);
538	bc->back = THERE();
539	AHEAD(bc->fwd);			/* fix previous offset */
540	bc->fwd = HERE();
541	EMIT(OOR2, 0);			/* offset is very wrong */
542	++bc->nbranch;
543}
544
545/*
546 * Fix the offset of the tail branch, if we actually had any branches.
547 * This is to correct the bogus placeholder offset that we use.
548 */
549static void
550p_branch_fix_tail(struct parse *p, struct branchc *bc)
551{
552	/* Fix bogus offset at the tail if we actually have branches */
553	if (bc->nbranch > 0) {
554		AHEAD(bc->fwd);
555		ASTERN(O_CH, bc->back);
556	}
557}
558
559/*
560 * Signal to the parser that an empty branch has been encountered; this will,
561 * in the future, be used to allow for more permissive behavior with empty
562 * branches. The return value should indicate whether parsing may continue
563 * or not.
564 */
565static bool
566p_branch_empty(struct parse *p, struct branchc *bc)
567{
568	(void) bc;
569	SETERROR(REG_BADPAT);
570	return (false);
571}
572
573/*
574 * Take care of any branching requirements. This includes inserting the
575 * appropriate branching instructions as well as eating all of the branch
576 * delimiters until we either run out of pattern or need to parse more pattern.
577 */
578static bool
579p_branch_do(struct parse *p, struct branchc *bc)
580{
581	int ate = 0;
582
583	ate = p_branch_eat_delim(p, bc);
584	if (ate == 0)
585		return (false);
586	else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
587		/*
588		 * Halt parsing only if we have an empty branch and
589		 * p_branch_empty indicates that we must not continue.
590		 * In the future, this will not  necessarily be an error.
591		 */
592		return (false);
593	p_branch_ins_offset(p, bc);
594
595	return (true);
596}
597
598static void
599p_bre_pre_parse(struct parse *p, struct branchc *bc)
600{
601	(void) bc;
602	/*
603	 * Does not move cleanly into expression parser because of
604	 * ordinary interpration of * at the beginning position of
605	 * an expression.
606	 */
607	if (EAT('^')) {
608		EMIT(OBOL, 0);
609		p->g->iflags |= USEBOL;
610		p->g->nbol++;
611	}
612}
613
614static void
615p_bre_post_parse(struct parse *p, struct branchc *bc)
616{
617	/* Expression is terminating due to EOL token */
618	if (bc->terminate) {
619		DROP(1);
620		EMIT(OEOL, 0);
621		p->g->iflags |= USEEOL;
622		p->g->neol++;
623	}
624}
625
626/*
627 * Top level parser, concatenation and BRE anchoring.
628 * Giving end1 as OUT essentially eliminates the end1/end2 check.
629 *
630 * This implementation is a bit of a kludge, in that a trailing $ is first
631 * taken as an ordinary character and then revised to be an anchor.
632 * The amount of lookahead needed to avoid this kludge is excessive.
633 */
634static void
635p_re(struct parse *p,
636    int end1,	/* first terminating character */
637    int end2)	/* second terminating character; ignored for EREs */
638{
639	struct branchc bc;
640
641	bc.nbranch = 0;
642	if (end1 == OUT && end2 == OUT)
643		bc.outer = true;
644	else
645		bc.outer = false;
646#define	SEEEND()	(!p->bre ? SEE(end1) : SEETWO(end1, end2))
647	for (;;) {
648		bc.start = HERE();
649		bc.nchain = 0;
650		bc.terminate = false;
651		if (p->pre_parse != NULL)
652			p->pre_parse(p, &bc);
653		while (MORE() && (!p->allowbranch || !SEESPEC('|')) &&
654		    !SEEEND()) {
655			bc.terminate = p->parse_expr(p, &bc);
656			++bc.nchain;
657		}
658		if (p->post_parse != NULL)
659			p->post_parse(p, &bc);
660		(void) REQUIRE(HERE() != bc.start, REG_BADPAT);
661		if (!p->allowbranch)
662			break;
663		/*
664		 * p_branch_do's return value indicates whether we should
665		 * continue parsing or not. This is both for correctness and
666		 * a slight optimization, because it will check if we've
667		 * encountered an empty branch or the end of the string
668		 * immediately following a branch delimiter.
669		 */
670		if (!p_branch_do(p, &bc))
671			break;
672	}
673#undef SEE_END
674	if (p->allowbranch)
675		p_branch_fix_tail(p, &bc);
676	assert(!MORE() || SEE(end1));
677}
678
679/*
680 * p_simp_re - parse a simple RE, an atom possibly followed by a repetition
681 */
682static bool			/* was the simple RE an unbackslashed $? */
683p_simp_re(struct parse *p, struct branchc *bc)
684{
685	int c;
686	int count;
687	int count2;
688	sopno pos;
689	int i;
690	wint_t wc;
691	sopno subno;
692#define	BACKSL	(1<<CHAR_BIT)
693
694	pos = HERE();		/* repetition op, if any, covers from here */
695
696	assert(MORE());		/* caller should have ensured this */
697	c = GETNEXT();
698	if (c == '\\') {
699		(void) REQUIRE(MORE(), REG_EESCAPE);
700		c = BACKSL | GETNEXT();
701	}
702	switch (c) {
703	case '.':
704		if (p->g->cflags&REG_NEWLINE)
705			nonnewline(p);
706		else
707			EMIT(OANY, 0);
708		break;
709	case '[':
710		p_bracket(p);
711		break;
712	case BACKSL|'<':
713		EMIT(OBOW, 0);
714		break;
715	case BACKSL|'>':
716		EMIT(OEOW, 0);
717		break;
718	case BACKSL|'{':
719		SETERROR(REG_BADRPT);
720		break;
721	case BACKSL|'(':
722		p->g->nsub++;
723		subno = p->g->nsub;
724		if (subno < NPAREN)
725			p->pbegin[subno] = HERE();
726		EMIT(OLPAREN, subno);
727		/* the MORE here is an error heuristic */
728		if (MORE() && !SEETWO('\\', ')'))
729			p_re(p, '\\', ')');
730		if (subno < NPAREN) {
731			p->pend[subno] = HERE();
732			assert(p->pend[subno] != 0);
733		}
734		EMIT(ORPAREN, subno);
735		(void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
736		break;
737	case BACKSL|')':	/* should not get here -- must be user */
738		SETERROR(REG_EPAREN);
739		break;
740	case BACKSL|'1':
741	case BACKSL|'2':
742	case BACKSL|'3':
743	case BACKSL|'4':
744	case BACKSL|'5':
745	case BACKSL|'6':
746	case BACKSL|'7':
747	case BACKSL|'8':
748	case BACKSL|'9':
749		i = (c&~BACKSL) - '0';
750		assert(i < NPAREN);
751		if (p->pend[i] != 0) {
752			assert(i <= p->g->nsub);
753			EMIT(OBACK_, i);
754			assert(p->pbegin[i] != 0);
755			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
756			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
757			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
758			EMIT(O_BACK, i);
759		} else
760			SETERROR(REG_ESUBREG);
761		p->g->backrefs = 1;
762		break;
763	case '*':
764		/*
765		 * Ordinary if used as the first character beyond BOL anchor of
766		 * a (sub-)expression, counts as a bad repetition operator if it
767		 * appears otherwise.
768		 */
769		(void) REQUIRE(bc->nchain == 0, REG_BADRPT);
770		/* FALLTHROUGH */
771	default:
772		if (p->error != 0)
773			return (false);	/* Definitely not $... */
774		p->next--;
775		wc = WGETNEXT();
776		ordinary(p, wc);
777		break;
778	}
779
780	if (EAT('*')) {		/* implemented as +? */
781		/* this case does not require the (y|) trick, noKLUDGE */
782		INSERT(OPLUS_, pos);
783		ASTERN(O_PLUS, pos);
784		INSERT(OQUEST_, pos);
785		ASTERN(O_QUEST, pos);
786	} else if (EATTWO('\\', '{')) {
787		count = p_count(p);
788		if (EAT(',')) {
789			if (MORE() && isdigit((uch)PEEK())) {
790				count2 = p_count(p);
791				(void) REQUIRE(count <= count2, REG_BADBR);
792			} else		/* single number with comma */
793				count2 = INFINITY;
794		} else		/* just a single number */
795			count2 = count;
796		repeat(p, pos, count, count2);
797		if (!EATTWO('\\', '}')) {	/* error heuristics */
798			while (MORE() && !SEETWO('\\', '}'))
799				NEXT();
800			(void) REQUIRE(MORE(), REG_EBRACE);
801			SETERROR(REG_BADBR);
802		}
803	} else if (c == '$')	/* $ (but not \$) ends it */
804		return (true);
805
806	return (false);
807}
808
809/*
810 * p_count - parse a repetition count
811 */
812static int			/* the value */
813p_count(struct parse *p)
814{
815	int count = 0;
816	int ndigits = 0;
817
818	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
819		count = count*10 + (GETNEXT() - '0');
820		ndigits++;
821	}
822
823	(void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
824	return (count);
825}
826
827/*
828 * p_bracket - parse a bracketed character list
829 */
830static void
831p_bracket(struct parse *p)
832{
833	cset *cs;
834	wint_t ch;
835
836	/* Dept of Truly Sickening Special-Case Kludges */
837	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
838		EMIT(OBOW, 0);
839		NEXTn(6);
840		return;
841	}
842	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
843		EMIT(OEOW, 0);
844		NEXTn(6);
845		return;
846	}
847
848	if ((cs = allocset(p)) == NULL)
849		return;
850
851	if (p->g->cflags&REG_ICASE)
852		cs->icase = 1;
853	if (EAT('^'))
854		cs->invert = 1;
855	if (EAT(']'))
856		CHadd(p, cs, ']');
857	else if (EAT('-'))
858		CHadd(p, cs, '-');
859	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
860		p_b_term(p, cs);
861	if (EAT('-'))
862		CHadd(p, cs, '-');
863	(void) MUSTEAT(']', REG_EBRACK);
864
865	if (p->error != 0)	/* don't mess things up further */
866		return;
867
868	if (cs->invert && p->g->cflags&REG_NEWLINE)
869		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
870
871	if ((ch = singleton(cs)) != OUT) {	/* optimize singleton sets */
872		ordinary(p, ch);
873		freeset(p, cs);
874	} else
875		EMIT(OANYOF, (int)(cs - p->g->sets));
876}
877
878/*
879 * p_b_term - parse one term of a bracketed character list
880 */
881static void
882p_b_term(struct parse *p, cset *cs)
883{
884	char c;
885	wint_t start, finish;
886	wint_t i;
887	locale_t loc = uselocale(NULL);
888
889	/* classify what we've got */
890	switch ((MORE()) ? PEEK() : '\0') {
891	case '[':
892		c = (MORE2()) ? PEEK2() : '\0';
893		break;
894	case '-':
895		SETERROR(REG_ERANGE);
896		return;			/* NOTE RETURN */
897	default:
898		c = '\0';
899		break;
900	}
901
902	switch (c) {
903	case ':':		/* character class */
904		NEXT2();
905		(void) REQUIRE(MORE(), REG_EBRACK);
906		c = PEEK();
907		(void) REQUIRE(c != '-' && c != ']', REG_ECTYPE);
908		p_b_cclass(p, cs);
909		(void) REQUIRE(MORE(), REG_EBRACK);
910		(void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
911		break;
912	case '=':		/* equivalence class */
913		NEXT2();
914		(void) REQUIRE(MORE(), REG_EBRACK);
915		c = PEEK();
916		(void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
917		p_b_eclass(p, cs);
918		(void) REQUIRE(MORE(), REG_EBRACK);
919		(void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
920		break;
921	default:		/* symbol, ordinary character, or range */
922		start = p_b_symbol(p);
923		if (SEE('-') && MORE2() && PEEK2() != ']') {
924			/* range */
925			NEXT();
926			if (EAT('-'))
927				finish = '-';
928			else
929				finish = p_b_symbol(p);
930		} else
931			finish = start;
932		if (start == finish)
933			CHadd(p, cs, start);
934		else {
935			if (loc->collate->lc_is_posix) {
936				(void) REQUIRE((uch)start <= (uch)finish,
937				    REG_ERANGE);
938				CHaddrange(p, cs, start, finish);
939			} else {
940				(void) REQUIRE(_collate_range_cmp(start,
941				    finish, loc) <= 0, REG_ERANGE);
942				for (i = 0; i <= UCHAR_MAX; i++) {
943					if (_collate_range_cmp(start, i, loc)
944					    <= 0 &&
945					    _collate_range_cmp(i, finish, loc)
946					    <= 0)
947						CHadd(p, cs, i);
948				}
949			}
950		}
951		break;
952	}
953}
954
955/*
956 * p_b_cclass - parse a character-class name and deal with it
957 */
958static void
959p_b_cclass(struct parse *p, cset *cs)
960{
961	const char *sp = p->next;
962	size_t len;
963	wctype_t wct;
964	char clname[16];
965
966	while (MORE() && isalpha((uch)PEEK()))
967		NEXT();
968	len = p->next - sp;
969	if (len >= sizeof (clname) - 1) {
970		SETERROR(REG_ECTYPE);
971		return;
972	}
973	(void) memcpy(clname, sp, len);
974	clname[len] = '\0';
975	if ((wct = wctype(clname)) == 0) {
976		SETERROR(REG_ECTYPE);
977		return;
978	}
979	CHaddtype(p, cs, wct);
980}
981
982/*
983 * p_b_eclass - parse an equivalence-class name and deal with it
984 *
985 * This implementation is incomplete. xxx
986 */
987static void
988p_b_eclass(struct parse *p, cset *cs)
989{
990	wint_t c;
991
992	c = p_b_coll_elem(p, '=');
993	CHadd(p, cs, c);
994}
995
996/*
997 * p_b_symbol - parse a character or [..]ed multicharacter collating symbol
998 */
999static wint_t			/* value of symbol */
1000p_b_symbol(struct parse *p)
1001{
1002	wint_t value;
1003
1004	(void) REQUIRE(MORE(), REG_EBRACK);
1005	if (!EATTWO('[', '.'))
1006		return (WGETNEXT());
1007
1008	/* collating symbol */
1009	value = p_b_coll_elem(p, '.');
1010	(void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1011	return (value);
1012}
1013
1014/*
1015 * p_b_coll_elem - parse a collating-element name and look it up
1016 */
1017static wint_t			/* value of collating element */
1018p_b_coll_elem(struct parse *p,
1019    wint_t endc)		/* name ended by endc,']' */
1020{
1021	const char *sp = p->next;
1022	struct cname *cp;
1023	mbstate_t mbs;
1024	wchar_t wc;
1025	size_t clen, len;
1026
1027	while (MORE() && !SEETWO(endc, ']'))
1028		NEXT();
1029	if (!MORE()) {
1030		SETERROR(REG_EBRACK);
1031		return (0);
1032	}
1033	len = p->next - sp;
1034	for (cp = cnames; cp->name != NULL; cp++)
1035		if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1036			return (cp->code);	/* known name */
1037	(void) memset(&mbs, 0, sizeof (mbs));
1038	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1039		return (wc);			/* single character */
1040	else if (clen == (size_t)-1 || clen == (size_t)-2)
1041		SETERROR(REG_ECHAR);
1042	else
1043		SETERROR(REG_ECOLLATE);		/* neither */
1044	return (0);
1045}
1046
1047/*
1048 * othercase - return the case counterpart of an alphabetic
1049 */
1050static wint_t			/* if no counterpart, return ch */
1051othercase(wint_t ch)
1052{
1053	assert(iswalpha(ch));
1054	if (iswupper(ch))
1055		return (towlower(ch));
1056	else if (iswlower(ch))
1057		return (towupper(ch));
1058	else			/* peculiar, but could happen */
1059		return (ch);
1060}
1061
1062/*
1063 * bothcases - emit a dualcase version of a two-case character
1064 *
1065 * Boy, is this implementation ever a kludge...
1066 */
1067static void
1068bothcases(struct parse *p, wint_t ch)
1069{
1070	const char *oldnext = p->next;
1071	const char *oldend = p->end;
1072	char bracket[3 + MB_LEN_MAX];
1073	size_t n;
1074	mbstate_t mbs;
1075
1076	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
1077	p->next = bracket;
1078	(void) memset(&mbs, 0, sizeof (mbs));
1079	n = wcrtomb(bracket, ch, &mbs);
1080	assert(n != (size_t)-1);
1081	bracket[n] = ']';
1082	bracket[n + 1] = '\0';
1083	p->end = bracket+n+1;
1084	p_bracket(p);
1085	assert(p->next == p->end);
1086	p->next = oldnext;
1087	p->end = oldend;
1088}
1089
1090/*
1091 * ordinary - emit an ordinary character
1092 */
1093static void
1094ordinary(struct parse *p, wint_t ch)
1095{
1096	cset *cs;
1097
1098	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1099		bothcases(p, ch);
1100	else if ((ch & OPDMASK) == ch)
1101		EMIT(OCHAR, ch);
1102	else {
1103		/*
1104		 * Kludge: character is too big to fit into an OCHAR operand.
1105		 * Emit a singleton set.
1106		 */
1107		if ((cs = allocset(p)) == NULL)
1108			return;
1109		CHadd(p, cs, ch);
1110		EMIT(OANYOF, (int)(cs - p->g->sets));
1111	}
1112}
1113
1114/*
1115 * nonnewline - emit REG_NEWLINE version of OANY
1116 *
1117 * Boy, is this implementation ever a kludge...
1118 */
1119static void
1120nonnewline(struct parse *p)
1121{
1122	const char *oldnext = p->next;
1123	const char *oldend = p->end;
1124	char bracket[4];
1125
1126	p->next = bracket;
1127	p->end = bracket+3;
1128	bracket[0] = '^';
1129	bracket[1] = '\n';
1130	bracket[2] = ']';
1131	bracket[3] = '\0';
1132	p_bracket(p);
1133	assert(p->next == bracket+3);
1134	p->next = oldnext;
1135	p->end = oldend;
1136}
1137
1138/*
1139 * repeat - generate code for a bounded repetition, recursively if needed
1140 */
1141static void
1142repeat(struct parse *p,
1143    sopno start,		/* operand from here to end of strip */
1144    int from,			/* repeated from this number */
1145    int to)			/* to this number of times (maybe INFINITY) */
1146{
1147	sopno finish = HERE();
1148#define	N	2
1149#define	INF	3
1150#define	REP(f, t)	((f)*8 + (t))
1151#define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1152	sopno copy;
1153
1154	if (p->error != 0)	/* head off possible runaway recursion */
1155		return;
1156
1157	assert(from <= to);
1158
1159	switch (REP(MAP(from), MAP(to))) {
1160	case REP(0, 0):			/* must be user doing this */
1161		DROP(finish-start);	/* drop the operand */
1162		break;
1163	case REP(0, 1):			/* as x{1,1}? */
1164	case REP(0, N):			/* as x{1,n}? */
1165	case REP(0, INF):		/* as x{1,}? */
1166		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1167		INSERT(OCH_, start);		/* offset is wrong... */
1168		repeat(p, start+1, 1, to);
1169		ASTERN(OOR1, start);
1170		AHEAD(start);			/* ... fix it */
1171		EMIT(OOR2, 0);
1172		AHEAD(THERE());
1173		ASTERN(O_CH, THERETHERE());
1174		break;
1175	case REP(1, 1):			/* trivial case */
1176		/* done */
1177		break;
1178	case REP(1, N):			/* as x?x{1,n-1} */
1179		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1180		INSERT(OCH_, start);
1181		ASTERN(OOR1, start);
1182		AHEAD(start);
1183		EMIT(OOR2, 0);			/* offset very wrong... */
1184		AHEAD(THERE());			/* ...so fix it */
1185		ASTERN(O_CH, THERETHERE());
1186		copy = dupl(p, start+1, finish+1);
1187		assert(copy == finish+4);
1188		repeat(p, copy, 1, to-1);
1189		break;
1190	case REP(1, INF):		/* as x+ */
1191		INSERT(OPLUS_, start);
1192		ASTERN(O_PLUS, start);
1193		break;
1194	case REP(N, N):			/* as xx{m-1,n-1} */
1195		copy = dupl(p, start, finish);
1196		repeat(p, copy, from-1, to-1);
1197		break;
1198	case REP(N, INF):		/* as xx{n-1,INF} */
1199		copy = dupl(p, start, finish);
1200		repeat(p, copy, from-1, to);
1201		break;
1202	default:			/* "can't happen" */
1203		SETERROR(REG_EFATAL);	/* just in case */
1204		break;
1205	}
1206}
1207
1208/*
1209 * wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1210 * character from the parse struct, signals a REG_ILLSEQ error if the
1211 * character can't be converted. Returns the number of bytes consumed.
1212 */
1213static wint_t
1214wgetnext(struct parse *p)
1215{
1216	mbstate_t mbs;
1217	wchar_t wc;
1218	size_t n;
1219
1220	(void) memset(&mbs, 0, sizeof (mbs));
1221	n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1222	if (n == (size_t)-1 || n == (size_t)-2) {
1223		SETERROR(REG_ECHAR);
1224		return (0);
1225	}
1226	if (n == 0)
1227		n = 1;
1228	p->next += n;
1229	return (wc);
1230}
1231
1232/*
1233 * seterr - set an error condition
1234 */
1235static int			/* useless but makes type checking happy */
1236seterr(struct parse *p, int e)
1237{
1238	if (p->error == 0)	/* keep earliest error condition */
1239		p->error = e;
1240	p->next = nuls;		/* try to bring things to a halt */
1241	p->end = nuls;
1242	return (0);		/* make the return value well-defined */
1243}
1244
1245/*
1246 * allocset - allocate a set of characters for []
1247 */
1248static cset *
1249allocset(struct parse *p)
1250{
1251	cset *cs, *ncs;
1252
1253	ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof (*ncs));
1254	if (ncs == NULL) {
1255		SETERROR(REG_ESPACE);
1256		return (NULL);
1257	}
1258	p->g->sets = ncs;
1259	cs = &p->g->sets[p->g->ncsets++];
1260	(void) memset(cs, 0, sizeof (*cs));
1261
1262	return (cs);
1263}
1264
1265/*
1266 * freeset - free a now-unused set
1267 */
1268static void
1269freeset(struct parse *p, cset *cs)
1270{
1271	cset *top = &p->g->sets[p->g->ncsets];
1272
1273	free(cs->wides);
1274	free(cs->ranges);
1275	free(cs->types);
1276	(void) memset(cs, 0, sizeof (*cs));
1277	if (cs == top-1)	/* recover only the easy case */
1278		p->g->ncsets--;
1279}
1280
1281/*
1282 * singleton - Determine whether a set contains only one character,
1283 * returning it if so, otherwise returning OUT.
1284 */
1285static wint_t
1286singleton(cset *cs)
1287{
1288	wint_t i, s, n;
1289
1290	for (i = n = 0; i < NC; i++)
1291		if (CHIN(cs, i)) {
1292			n++;
1293			s = i;
1294		}
1295	if (n == 1)
1296		return (s);
1297	if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1298	    cs->icase == 0)
1299		return (cs->wides[0]);
1300	/* Don't bother handling the other cases. */
1301	return (OUT);
1302}
1303
1304/*
1305 * CHadd - add character to character set.
1306 */
1307static void
1308CHadd(struct parse *p, cset *cs, wint_t ch)
1309{
1310	wint_t nch, *newwides;
1311	assert(ch >= 0);
1312	if (ch < NC)
1313		cs->bmp[ch >> 3] |= 1 << (ch & 7);
1314	else {
1315		newwides = realloc(cs->wides, (cs->nwides + 1) *
1316		    sizeof (*cs->wides));
1317		if (newwides == NULL) {
1318			SETERROR(REG_ESPACE);
1319			return;
1320		}
1321		cs->wides = newwides;
1322		cs->wides[cs->nwides++] = ch;
1323	}
1324	if (cs->icase) {
1325		if ((nch = towlower(ch)) < NC)
1326			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1327		if ((nch = towupper(ch)) < NC)
1328			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1329	}
1330}
1331
1332/*
1333 * CHaddrange - add all characters in the range [min,max] to a character set.
1334 */
1335static void
1336CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1337{
1338	crange *newranges;
1339
1340	for (; min < NC && min <= max; min++)
1341		CHadd(p, cs, min);
1342	if (min >= max)
1343		return;
1344	newranges = realloc(cs->ranges, (cs->nranges + 1) *
1345	    sizeof (*cs->ranges));
1346	if (newranges == NULL) {
1347		SETERROR(REG_ESPACE);
1348		return;
1349	}
1350	cs->ranges = newranges;
1351	cs->ranges[cs->nranges].min = min;
1352	cs->ranges[cs->nranges].max = max;
1353	cs->nranges++;
1354}
1355
1356/*
1357 * CHaddtype - add all characters of a certain type to a character set.
1358 */
1359static void
1360CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1361{
1362	wint_t i;
1363	wctype_t *newtypes;
1364
1365	for (i = 0; i < NC; i++)
1366		if (iswctype(i, wct))
1367			CHadd(p, cs, i);
1368	newtypes = realloc(cs->types, (cs->ntypes + 1) *
1369	    sizeof (*cs->types));
1370	if (newtypes == NULL) {
1371		SETERROR(REG_ESPACE);
1372		return;
1373	}
1374	cs->types = newtypes;
1375	cs->types[cs->ntypes++] = wct;
1376}
1377
1378/*
1379 * dupl - emit a duplicate of a bunch of sops
1380 */
1381static sopno			/* start of duplicate */
1382dupl(struct parse *p,
1383    sopno start,		/* from here */
1384    sopno finish)		/* to this less one */
1385{
1386	sopno ret = HERE();
1387	sopno len = finish - start;
1388
1389	assert(finish >= start);
1390	if (len == 0)
1391		return (ret);
1392	if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1393		return (ret);
1394	assert(p->ssize >= p->slen + len);
1395	(void) memcpy((char *)(p->strip + p->slen),
1396	    (char *)(p->strip + start), (size_t)len*sizeof (sop));
1397	p->slen += len;
1398	return (ret);
1399}
1400
1401/*
1402 * doemit - emit a strip operator
1403 *
1404 * It might seem better to implement this as a macro with a function as
1405 * hard-case backup, but it's just too big and messy unless there are
1406 * some changes to the data structures.  Maybe later.
1407 */
1408static void
1409doemit(struct parse *p, sop op, size_t opnd)
1410{
1411	/* avoid making error situations worse */
1412	if (p->error != 0)
1413		return;
1414
1415	/* deal with oversize operands ("can't happen", more or less) */
1416	assert(opnd < 1<<OPSHIFT);
1417
1418	/* deal with undersized strip */
1419	if (p->slen >= p->ssize)
1420		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
1421			return;
1422
1423	/* finally, it's all reduced to the easy case */
1424	p->strip[p->slen++] = SOP(op, opnd);
1425}
1426
1427/*
1428 * doinsert - insert a sop into the strip
1429 */
1430static void
1431doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1432{
1433	sopno sn;
1434	sop s;
1435	int i;
1436
1437	/* avoid making error situations worse */
1438	if (p->error != 0)
1439		return;
1440
1441	sn = HERE();
1442	EMIT(op, opnd);		/* do checks, ensure space */
1443	assert(HERE() == sn+1);
1444	s = p->strip[sn];
1445
1446	/* adjust paren pointers */
1447	assert(pos > 0);
1448	for (i = 1; i < NPAREN; i++) {
1449		if (p->pbegin[i] >= pos) {
1450			p->pbegin[i]++;
1451		}
1452		if (p->pend[i] >= pos) {
1453			p->pend[i]++;
1454		}
1455	}
1456
1457	(void) memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1458	    (HERE()-pos-1)*sizeof (sop));
1459	p->strip[pos] = s;
1460}
1461
1462/*
1463 * dofwd - complete a forward reference
1464 */
1465static void
1466dofwd(struct parse *p, sopno pos, sop value)
1467{
1468	/* avoid making error situations worse */
1469	if (p->error != 0)
1470		return;
1471
1472	assert(value < 1<<OPSHIFT);
1473	p->strip[pos] = OP(p->strip[pos]) | value;
1474}
1475
1476/*
1477 * enlarge - enlarge the strip
1478 */
1479static int
1480enlarge(struct parse *p, sopno size)
1481{
1482	sop *sp;
1483
1484	if (p->ssize >= size)
1485		return (1);
1486
1487	sp = (sop *)realloc(p->strip, size*sizeof (sop));
1488	if (sp == NULL) {
1489		SETERROR(REG_ESPACE);
1490		return (0);
1491	}
1492	p->strip = sp;
1493	p->ssize = size;
1494	return (1);
1495}
1496
1497/*
1498 * stripsnug - compact the strip
1499 */
1500static void
1501stripsnug(struct parse *p, struct re_guts *g)
1502{
1503	g->nstates = p->slen;
1504	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof (sop));
1505	if (g->strip == NULL) {
1506		SETERROR(REG_ESPACE);
1507		g->strip = p->strip;
1508	}
1509}
1510
1511/*
1512 * findmust - fill in must and mlen with longest mandatory literal string
1513 *
1514 * This algorithm could do fancy things like analyzing the operands of |
1515 * for common subsequences.  Someday.  This code is simple and finds most
1516 * of the interesting cases.
1517 *
1518 * Note that must and mlen got initialized during setup.
1519 */
1520static void
1521findmust(struct parse *p, struct re_guts *g)
1522{
1523	sop *scan;
1524	sop *start = NULL;
1525	sop *newstart = NULL;
1526	sopno newlen;
1527	sop s;
1528	char *cp;
1529	int offset;
1530	char buf[MB_LEN_MAX];
1531	size_t clen;
1532	mbstate_t mbs;
1533	locale_t loc = uselocale(NULL);
1534
1535	/* avoid making error situations worse */
1536	if (p->error != 0)
1537		return;
1538
1539	/*
1540	 * It's not generally safe to do a ``char'' substring search on
1541	 * multibyte character strings, but it's safe for at least
1542	 * UTF-8 (see RFC 3629).
1543	 */
1544	if (MB_CUR_MAX > 1 &&
1545	    strcmp(loc->runelocale->__encoding, "UTF-8") != 0)
1546		return;
1547
1548	/* find the longest OCHAR sequence in strip */
1549	newlen = 0;
1550	offset = 0;
1551	g->moffset = 0;
1552	scan = g->strip + 1;
1553	do {
1554		s = *scan++;
1555		switch (OP(s)) {
1556		case OCHAR:		/* sequence member */
1557			if (newlen == 0) {		/* new sequence */
1558				(void) memset(&mbs, 0, sizeof (mbs));
1559				newstart = scan - 1;
1560			}
1561			clen = wcrtomb(buf, OPND(s), &mbs);
1562			if (clen == (size_t)-1)
1563				goto toohard;
1564			newlen += clen;
1565			break;
1566		case OPLUS_:		/* things that don't break one */
1567		case OLPAREN:
1568		case ORPAREN:
1569			break;
1570		case OQUEST_:		/* things that must be skipped */
1571		case OCH_:
1572			offset = altoffset(scan, offset);
1573			scan--;
1574			do {
1575				scan += OPND(s);
1576				s = *scan;
1577				/* assert() interferes w debug printouts */
1578				if (OP(s) != (sop)O_QUEST &&
1579				    OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) {
1580					g->iflags |= BAD;
1581					return;
1582				}
1583			} while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1584			/* FALLTHROUGH */
1585		case OBOW:		/* things that break a sequence */
1586		case OEOW:
1587		case OBOL:
1588		case OEOL:
1589		case O_QUEST:
1590		case O_CH:
1591		case OEND:
1592			if (newlen > (sopno)g->mlen) {		/* ends one */
1593				start = newstart;
1594				g->mlen = newlen;
1595				if (offset > -1) {
1596					g->moffset += offset;
1597					offset = newlen;
1598				} else
1599					g->moffset = offset;
1600			} else {
1601				if (offset > -1)
1602					offset += newlen;
1603			}
1604			newlen = 0;
1605			break;
1606		case OANY:
1607			if (newlen > (sopno)g->mlen) {		/* ends one */
1608				start = newstart;
1609				g->mlen = newlen;
1610				if (offset > -1) {
1611					g->moffset += offset;
1612					offset = newlen;
1613				} else
1614					g->moffset = offset;
1615			} else {
1616				if (offset > -1)
1617					offset += newlen;
1618			}
1619			if (offset > -1)
1620				offset++;
1621			newlen = 0;
1622			break;
1623		case OANYOF:		/* may or may not invalidate offset */
1624			/* First, everything as OANY */
1625			if (newlen > (sopno)g->mlen) {		/* ends one */
1626				start = newstart;
1627				g->mlen = newlen;
1628				if (offset > -1) {
1629					g->moffset += offset;
1630					offset = newlen;
1631				} else
1632					g->moffset = offset;
1633			} else {
1634				if (offset > -1)
1635					offset += newlen;
1636			}
1637			if (offset > -1)
1638				offset++;
1639			newlen = 0;
1640			break;
1641		toohard:
1642		default:
1643			/*
1644			 * Anything here makes it impossible or too hard
1645			 * to calculate the offset -- so we give up;
1646			 * save the last known good offset, in case the
1647			 * must sequence doesn't occur later.
1648			 */
1649			if (newlen > (sopno)g->mlen) {		/* ends one */
1650				start = newstart;
1651				g->mlen = newlen;
1652				if (offset > -1)
1653					g->moffset += offset;
1654				else
1655					g->moffset = offset;
1656			}
1657			offset = -1;
1658			newlen = 0;
1659			break;
1660		}
1661	} while (OP(s) != OEND);
1662
1663	if (g->mlen == 0) {		/* there isn't one */
1664		g->moffset = -1;
1665		return;
1666	}
1667
1668	/* turn it into a character string */
1669	g->must = malloc((size_t)g->mlen + 1);
1670	if (g->must == NULL) {		/* argh; just forget it */
1671		g->mlen = 0;
1672		g->moffset = -1;
1673		return;
1674	}
1675	cp = g->must;
1676	scan = start;
1677	(void) memset(&mbs, 0, sizeof (mbs));
1678	while (cp < g->must + g->mlen) {
1679		while (OP(s = *scan++) != OCHAR)
1680			continue;
1681		clen = wcrtomb(cp, OPND(s), &mbs);
1682		assert(clen != (size_t)-1);
1683		cp += clen;
1684	}
1685	assert(cp == g->must + g->mlen);
1686	*cp++ = '\0';		/* just on general principles */
1687}
1688
1689/*
1690 * altoffset - choose biggest offset among multiple choices
1691 *
1692 * Compute, recursively if necessary, the largest offset among multiple
1693 * re paths.
1694 */
1695static int
1696altoffset(sop *scan, int offset)
1697{
1698	int largest;
1699	int try;
1700	sop s;
1701
1702	/* If we gave up already on offsets, return */
1703	if (offset == -1)
1704		return (-1);
1705
1706	largest = 0;
1707	try = 0;
1708	s = *scan++;
1709	while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) {
1710		switch (OP(s)) {
1711		case OOR1:
1712			if (try > largest)
1713				largest = try;
1714			try = 0;
1715			break;
1716		case OQUEST_:
1717		case OCH_:
1718			try = altoffset(scan, try);
1719			if (try == -1)
1720				return (-1);
1721			scan--;
1722			do {
1723				scan += OPND(s);
1724				s = *scan;
1725				if (OP(s) != (sop)O_QUEST &&
1726				    OP(s) != (sop)O_CH && OP(s) != (sop)OOR2)
1727					return (-1);
1728			} while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1729			/*
1730			 * We must skip to the next position, or we'll
1731			 * leave altoffset() too early.
1732			 */
1733			scan++;
1734			break;
1735		case OANYOF:
1736		case OCHAR:
1737		case OANY:
1738			try++;
1739			/*FALLTHRU*/
1740		case OBOW:
1741		case OEOW:
1742		case OLPAREN:
1743		case ORPAREN:
1744		case OOR2:
1745			break;
1746		default:
1747			try = -1;
1748			break;
1749		}
1750		if (try == -1)
1751			return (-1);
1752		s = *scan++;
1753	}
1754
1755	if (try > largest)
1756		largest = try;
1757
1758	return (largest+offset);
1759}
1760
1761/*
1762 * computejumps - compute char jumps for BM scan
1763 *
1764 * This algorithm assumes g->must exists and is has size greater than
1765 * zero. It's based on the algorithm found on Computer Algorithms by
1766 * Sara Baase.
1767 *
1768 * A char jump is the number of characters one needs to jump based on
1769 * the value of the character from the text that was mismatched.
1770 */
1771static void
1772computejumps(struct parse *p, struct re_guts *g)
1773{
1774	int ch;
1775	int mindex;
1776
1777	/* Avoid making errors worse */
1778	if (p->error != 0)
1779		return;
1780
1781	g->charjump = (int *)malloc((NC_MAX + 1) * sizeof (int));
1782	if (g->charjump == NULL)	/* Not a fatal error */
1783		return;
1784	/* Adjust for signed chars, if necessary */
1785	g->charjump = &g->charjump[-(CHAR_MIN)];
1786
1787	/*
1788	 * If the character does not exist in the pattern, the jump
1789	 * is equal to the number of characters in the pattern.
1790	 */
1791	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1792		g->charjump[ch] = g->mlen;
1793
1794	/*
1795	 * If the character does exist, compute the jump that would
1796	 * take us to the last character in the pattern equal to it
1797	 * (notice that we match right to left, so that last character
1798	 * is the first one that would be matched).
1799	 */
1800	for (mindex = 0; mindex < g->mlen; mindex++)
1801		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1802}
1803
1804/*
1805 * computematchjumps - compute match jumps for BM scan
1806 *
1807 * This algorithm assumes g->must exists and is has size greater than
1808 * zero. It's based on the algorithm found on Computer Algorithms by
1809 * Sara Baase.
1810 *
1811 * A match jump is the number of characters one needs to advance based
1812 * on the already-matched suffix.
1813 * Notice that all values here are minus (g->mlen-1), because of the way
1814 * the search algorithm works.
1815 */
1816static void
1817computematchjumps(struct parse *p, struct re_guts *g)
1818{
1819	int mindex;		/* General "must" iterator */
1820	int suffix;		/* Keeps track of matching suffix */
1821	int ssuffix;		/* Keeps track of suffixes' suffix */
1822	int *pmatches;
1823				/*
1824				 * pmatches[k] points to the next i
1825				 * such that i+1...mlen is a substring
1826				 * of k+1...k+mlen-i-1
1827				 */
1828
1829	/* Avoid making errors worse */
1830	if (p->error != 0)
1831		return;
1832
1833	pmatches = (int *)malloc(g->mlen * sizeof (unsigned int));
1834	if (pmatches == NULL) {
1835		g->matchjump = NULL;
1836		return;
1837	}
1838
1839	g->matchjump = (int *)malloc(g->mlen * sizeof (unsigned int));
1840	if (g->matchjump == NULL) {	/* Not a fatal error */
1841		free(pmatches);
1842		return;
1843	}
1844
1845	/* Set maximum possible jump for each character in the pattern */
1846	for (mindex = 0; mindex < g->mlen; mindex++)
1847		g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1848
1849	/* Compute pmatches[] */
1850	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1851	    mindex--, suffix--) {
1852		pmatches[mindex] = suffix;
1853
1854		/*
1855		 * If a mismatch is found, interrupting the substring,
1856		 * compute the matchjump for that position. If no
1857		 * mismatch is found, then a text substring mismatched
1858		 * against the suffix will also mismatch against the
1859		 * substring.
1860		 */
1861		while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) {
1862			g->matchjump[suffix] = MIN(g->matchjump[suffix],
1863			    g->mlen - mindex - 1);
1864			suffix = pmatches[suffix];
1865		}
1866	}
1867
1868	/*
1869	 * Compute the matchjump up to the last substring found to jump
1870	 * to the beginning of the largest must pattern prefix matching
1871	 * it's own suffix.
1872	 */
1873	for (mindex = 0; mindex <= suffix; mindex++)
1874		g->matchjump[mindex] = MIN(g->matchjump[mindex],
1875		    g->mlen + suffix - mindex);
1876
1877	ssuffix = pmatches[suffix];
1878	while (suffix < g->mlen) {
1879		while (suffix <= ssuffix && suffix < g->mlen) {
1880			g->matchjump[suffix] = MIN(g->matchjump[suffix],
1881			    g->mlen + ssuffix - suffix);
1882			suffix++;
1883		}
1884		if (suffix < g->mlen)
1885			ssuffix = pmatches[ssuffix];
1886	}
1887
1888	free(pmatches);
1889}
1890
1891/*
1892 * pluscount - count + nesting
1893 */
1894static sopno			/* nesting depth */
1895pluscount(struct parse *p, struct re_guts *g)
1896{
1897	sop *scan;
1898	sop s;
1899	sopno plusnest = 0;
1900	sopno maxnest = 0;
1901
1902	if (p->error != 0)
1903		return (0);	/* there may not be an OEND */
1904
1905	scan = g->strip + 1;
1906	do {
1907		s = *scan++;
1908		switch (OP(s)) {
1909		case OPLUS_:
1910			plusnest++;
1911			break;
1912		case O_PLUS:
1913			if (plusnest > maxnest)
1914				maxnest = plusnest;
1915			plusnest--;
1916			break;
1917		}
1918	} while (OP(s) != OEND);
1919	if (plusnest != 0)
1920		g->iflags |= BAD;
1921	return (maxnest);
1922}
1923