1/***********************************************************************
2*                                                                      *
3*               This software is part of the ast package               *
4*          Copyright (c) 1985-2010 AT&T Intellectual Property          *
5*                      and is licensed under the                       *
6*                  Common Public License, Version 1.0                  *
7*                    by AT&T Intellectual Property                     *
8*                                                                      *
9*                A copy of the License is available at                 *
10*            http://www.opensource.org/licenses/cpl1.0.txt             *
11*         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
12*                                                                      *
13*              Information and Software Systems Research               *
14*                            AT&T Research                             *
15*                           Florham Park NJ                            *
16*                                                                      *
17*                 Glenn Fowler <gsf@research.att.com>                  *
18*                  David Korn <dgk@research.att.com>                   *
19*                   Phong Vo <kpv@research.att.com>                    *
20*                                                                      *
21***********************************************************************/
22#pragma prototyped
23
24/*
25 * posix regex compiler
26 */
27
28#include "reglib.h"
29
30#if _PACKAGE_ast
31#include "lclib.h"
32#endif
33
34#define serialize		re_serialize	/* hp.ia64 <unistd.h>! */
35
36#define C_ESC			(-1)
37#define C_MB			(-2)
38
39#if _AST_REGEX_DEBUG
40
41#define DEBUG_TEST(f,y,n)	((debug&(debug_flag=f))?(y):(n))
42#define DEBUG_CODE(f,y,n)	do if(debug&(f)){y}else{n} while(0)
43#define DEBUG_INIT()		do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_comp_debug")) debug |= strtoul(t, NiL, 0); } } while (0)
44
45static unsigned long	debug;
46static unsigned long	debug_flag;
47
48#else
49
50#define DEBUG_INIT()
51#define DEBUG_TEST(f,y,n)	(n)
52#define DEBUG_CODE(f,y,n)	do {n} while(0)
53
54#endif
55
56#if _PACKAGE_ast
57
58typedef struct Cchr_s
59{
60	Dtlink_t	lnk;
61	unsigned char	nam[2];
62	Ckey_t		key;
63} Cchr_t;
64
65#endif
66
67#define eat(p)		do{if ((p)->token.push)(p)->token.push=0;else (p)->cursor+=(p)->token.len;}while (0)
68
69/*
70 * determine whether greedy matching will work, i.e. produce
71 * the best match first.  such expressions are "easy", and
72 * need no backtracking once a complete match is found.
73 * if an expression has backreferences or alts it's hard
74 * else if it has only one closure it's easy
75 * else if all closures are simple (i.e. one-character) it's easy
76 * else it's hard.
77 */
78
79typedef struct Stats_s
80{
81	unsigned long	l;	/* min length to left of x		*/
82	unsigned long	k;	/* min length to left of y		*/
83	unsigned long	m;	/* min length				*/
84	unsigned long	n;	/* max length				*/
85	unsigned short	a;	/* number of alternations		*/
86	unsigned short	b;	/* number of backrefs			*/
87	unsigned short	c;	/* number of closures			*/
88	unsigned short	e;	/* $					*/
89	unsigned short	i;	/* number of negations			*/
90	unsigned short	p;	/* number of named subexpressions	*/
91	unsigned short	s;	/* number of simple closures		*/
92	unsigned short	t;	/* number of tries			*/
93	unsigned short	u;	/* number of unnamed subexpressions	*/
94	Rex_t*		x;	/* max length REX_STRING		*/
95	Rex_t*		y;	/* max length REX_TRIE			*/
96} Stats_t;
97
98typedef struct Token_s
99{
100	unsigned long	min;
101	unsigned long	max;
102	short		lex;
103	short		len;
104	short		esc;
105	short		att;
106	short		push;
107} Token_t;
108
109typedef struct Cenv_s
110{
111	int		delimiter;	/* pattern delimiter		*/
112	int		error;		/* last error			*/
113	int		explicit;	/* explicit match on this char	*/
114	int		mappeddot;	/* inverse mapped '.'		*/
115	int		mappednewline;	/* inverse mapped '\n'		*/
116	int		mappedslash;	/* inverse mapped '/'		*/
117	regflags_t	flags;		/* flags arg to regcomp		*/
118	int		type;		/* BRE,ERE,ARE,SRE,KRE		*/
119	unsigned char*	cursor;		/* curent point in re		*/
120	unsigned char*	pattern;	/* the original pattern		*/
121	unsigned char*	literal;	/* literal restart pattern	*/
122	int		parno;		/* number of last open paren	*/
123	int		parnest;	/* paren nest count		*/
124	int		posixkludge; 	/* to make * nonspecial		*/
125	int		regexp; 	/* <regexp.h> compatibility	*/
126	Token_t		token;		/* token lookahead		*/
127	Stats_t		stats;		/* RE statistics		*/
128	int		terminator;	/* pattern terminator		*/
129	Rex_t*		paren[2*(BACK_REF_MAX+2)];
130					/* paren[i]!=0 if \i defined	*/
131	regex_t*	regex;		/* user handle			*/
132	regdisc_t*	disc;		/* user discipline		*/
133	unsigned char*	map;		/* external to native ccode map	*/
134	unsigned char*	MAP;		/* fold and/or map		*/
135} Cenv_t;
136
137/*
138 * allocate a new Rex_t node
139 */
140
141static Rex_t*
142node(Cenv_t* env, int type, int lo, int hi, size_t extra)
143{
144	register Rex_t*	e;
145
146	if (e = (Rex_t*)alloc(env->disc, 0, sizeof(Rex_t) + extra))
147	{
148		memset(e, 0, sizeof(Rex_t) + extra);
149		e->type = type;
150		e->marked = 0;
151		e->lo = lo;
152		e->hi = hi;
153		e->flags = env->flags;
154		e->map = (e->flags & REG_ICASE) ? env->MAP : env->map;
155		e->explicit = env->explicit;
156		if (extra)
157			e->re.data = (char*)e + sizeof(Rex_t);
158	}
159	return e;
160}
161
162/*
163 * free a Trie_node_t node
164 */
165
166static void
167triedrop(regdisc_t* disc, Trie_node_t* e)
168{
169	if (e)
170	{
171		triedrop(disc, e->son);
172		triedrop(disc, e->sib);
173		alloc(disc, e, 0);
174	}
175}
176
177/*
178 * free a Rex_t node
179 */
180
181void
182drop(regdisc_t* disc, Rex_t* e)
183{
184	int	i;
185	Rex_t*	x;
186
187	if (e && !(disc->re_flags & REG_NOFREE))
188		do
189		{
190			switch (e->type)
191			{
192			case REX_ALT:
193			case REX_CONJ:
194				drop(disc, e->re.group.expr.binary.left);
195				drop(disc, e->re.group.expr.binary.right);
196				break;
197			case REX_GROUP:
198			case REX_GROUP_AHEAD:
199			case REX_GROUP_AHEAD_NOT:
200			case REX_GROUP_BEHIND:
201			case REX_GROUP_BEHIND_NOT:
202			case REX_GROUP_CUT:
203			case REX_NEG:
204			case REX_REP:
205				drop(disc, e->re.group.expr.rex);
206				break;
207			case REX_TRIE:
208				for (i = 0; i <= UCHAR_MAX; i++)
209					triedrop(disc, e->re.trie.root[i]);
210				break;
211			}
212			x = e->next;
213			alloc(disc, e, 0);
214		} while (e = x);
215}
216
217/*
218 * mark e and descendants minimal
219 */
220
221static void
222mark(register Rex_t* e, int set)
223{
224	if (e && !e->marked)
225		do
226		{
227			e->marked = 1;
228			if (set)
229				e->flags |= REG_MINIMAL;
230			else
231				e->flags &= ~REG_MINIMAL;
232			switch (e->type)
233			{
234			case REX_ALT:
235			case REX_CONJ:
236			case REX_GROUP_COND:
237				if (e->re.group.expr.binary.left)
238					mark(e->re.group.expr.binary.left, set);
239				if (e->re.group.expr.binary.right)
240					mark(e->re.group.expr.binary.right, set);
241				break;
242			case REX_GROUP:
243			case REX_GROUP_AHEAD:
244			case REX_GROUP_AHEAD_NOT:
245			case REX_GROUP_BEHIND:
246			case REX_GROUP_BEHIND_NOT:
247			case REX_GROUP_CUT:
248			case REX_NEG:
249			case REX_REP:
250			case REX_TRIE:
251				mark(e->re.group.expr.rex, set);
252				break;
253			}
254		} while (e = e->next);
255}
256
257/*
258 * assign subexpression numbers by a preorder tree walk
259 */
260
261static int
262serialize(Cenv_t* env, Rex_t* e, int n)
263{
264	do
265	{
266		e->serial = n++;
267		switch (e->type)
268		{
269		case REX_ALT:
270		case REX_GROUP_COND:
271			if (e->re.group.expr.binary.left)
272				n = serialize(env, e->re.group.expr.binary.left, n);
273			e->re.group.expr.binary.serial = n++;
274			if (e->re.group.expr.binary.right)
275				n = serialize(env, e->re.group.expr.binary.right, n);
276			break;
277		case REX_CONJ:
278			n = serialize(env, e->re.group.expr.binary.left, n);
279			n = serialize(env, e->re.group.expr.binary.right, n);
280			break;
281		case REX_GROUP:
282		case REX_GROUP_AHEAD:
283		case REX_GROUP_AHEAD_NOT:
284		case REX_GROUP_BEHIND:
285		case REX_GROUP_BEHIND_NOT:
286		case REX_GROUP_CUT:
287		case REX_NEG:
288		case REX_REP:
289			n = serialize(env, e->re.group.expr.rex, n);
290			break;
291		}
292	} while (e = e->next);
293	return n;
294}
295
296/*
297 * catenate e and f into a sequence, collapsing them if possible
298 */
299
300static Rex_t*
301cat(Cenv_t* env, Rex_t* e, Rex_t* f)
302{
303	Rex_t*	g;
304
305	if (!f)
306	{
307		drop(env->disc, e);
308		return 0;
309	}
310	if (e->type == REX_NULL)
311	{
312		drop(env->disc, e);
313		return f;
314	}
315	if (f->type == REX_NULL)
316	{
317		g = f->next;
318		f->next = 0;
319		drop(env->disc, f);
320		f = g;
321	}
322	else if (e->type == REX_DOT && f->type == REX_DOT)
323	{
324		unsigned int	m = e->lo + f->lo;
325		unsigned int	n = e->hi + f->hi;
326
327		if (m <= RE_DUP_MAX)
328		{
329			if (e->hi > RE_DUP_MAX || f->hi > RE_DUP_MAX)
330			{
331				n = RE_DUP_INF;
332				goto combine;
333			}
334			else if (n <= RE_DUP_MAX)
335			{
336			combine:
337				e->lo = m;
338				e->hi = n;
339				g = f->next;
340				f->next = 0;
341				drop(env->disc, f);
342				f = g;
343			}
344		}
345	}
346	e->next = f;
347	return e;
348}
349
350/*
351 * collect re statistics
352 */
353
354static int
355stats(register Cenv_t* env, register Rex_t* e)
356{
357	register unsigned long	n;
358	register unsigned long	m;
359	unsigned long		cm;
360	unsigned long		nm;
361	unsigned long		cn;
362	unsigned long		nn;
363	unsigned long		l;
364	unsigned long		k;
365	unsigned long		t;
366	Rex_t*			q;
367	Rex_t*			x;
368	Rex_t*			y;
369	unsigned char		c;
370	unsigned char		b;
371
372	do
373	{
374		switch (e->type)
375		{
376		case REX_ALT:
377			x = env->stats.x;
378			l = env->stats.l;
379			y = env->stats.y;
380			k = env->stats.k;
381			t = env->stats.t;
382			if (++env->stats.a <= 0)
383				return 1;
384			cm = env->stats.m;
385			env->stats.m = 0;
386			cn = env->stats.n;
387			env->stats.n = 0;
388			if (stats(env, e->re.group.expr.binary.left))
389				return 1;
390			m = env->stats.m;
391			env->stats.m = 0;
392			n = env->stats.n;
393			env->stats.n = 0;
394			if (e->re.group.expr.binary.right && stats(env, e->re.group.expr.binary.right))
395				return 1;
396			if (env->stats.m > m)
397				env->stats.m = m;
398			else
399				m = env->stats.m;
400			if ((env->stats.m += cm) < m)
401				return 1;
402			if (env->stats.n < n)
403				env->stats.n = n;
404			else
405				n = env->stats.n;
406			if ((env->stats.n += cn) < n)
407				return 1;
408			env->stats.x = x;
409			env->stats.l = l;
410			env->stats.y = y;
411			env->stats.k = k;
412			env->stats.t = t;
413			break;
414		case REX_BACK:
415			if (++env->stats.b <= 0)
416				return 1;
417			break;
418		case REX_CLASS:
419		case REX_COLL_CLASS:
420		case REX_DOT:
421		case REX_ONECHAR:
422			n = env->stats.m;
423			if ((env->stats.m += e->lo) < n)
424				return 1;
425			if (e->hi != RE_DUP_INF)
426			{
427				n = env->stats.n;
428				if ((env->stats.n += e->hi) < n)
429					return 1;
430			}
431			if (e->lo != e->hi)
432			{
433				if (++env->stats.c <= 0)
434					return 1;
435				if (++env->stats.s <= 0)
436					return 1;
437			}
438			break;
439		case REX_CONJ:
440			cm = env->stats.m;
441			env->stats.m = 0;
442			cn = env->stats.n;
443			env->stats.n = 0;
444			if (stats(env, e->re.group.expr.binary.left))
445				return 1;
446			nm = env->stats.m;
447			env->stats.m = 0;
448			nn = env->stats.n;
449			env->stats.n = 0;
450			if (stats(env, e->re.group.expr.binary.right))
451				return 1;
452			if (env->stats.m < nm)
453				env->stats.m = nm;
454			else
455				nm = env->stats.m;
456			if ((env->stats.m += cm) < nm)
457				return 1;
458			if (env->stats.n < nn)
459				env->stats.n = nn;
460			else
461				nn = env->stats.n;
462			if ((env->stats.n += cn) < nn)
463				return 1;
464			break;
465		case REX_END:
466			env->stats.e = 1;
467			break;
468		case REX_GROUP:
469			if (e->re.group.number && ++env->stats.p <= 0 || !e->re.group.number && ++env->stats.u <= 0)
470				return 1;
471			if (stats(env, e->re.group.expr.rex))
472				return 1;
473			break;
474		case REX_GROUP_AHEAD:
475		case REX_GROUP_AHEAD_NOT:
476		case REX_GROUP_BEHIND:
477		case REX_GROUP_BEHIND_NOT:
478			m = env->stats.m;
479			n = env->stats.n;
480			x = env->stats.x;
481			y = env->stats.y;
482			if (stats(env, e->re.group.expr.rex))
483				return 1;
484			env->stats.m = m;
485			env->stats.n = n;
486			env->stats.x = x;
487			env->stats.y = y;
488			switch (e->type)
489			{
490			case REX_GROUP_AHEAD:
491			case REX_GROUP_BEHIND:
492				if (++env->stats.u <= 0)
493					return 1;
494				break;
495			}
496			break;
497		case REX_GROUP_COND:
498			if (++env->stats.u <= 0)
499				return 1;
500			m = env->stats.m;
501			n = env->stats.n;
502			x = env->stats.x;
503			y = env->stats.y;
504			if (e->re.group.size > 0 && ++env->stats.b <= 0)
505				return 1;
506			if (e->re.group.expr.binary.left && stats(env, e->re.group.expr.binary.left))
507				return 1;
508			if (q = e->re.group.expr.binary.right)
509			{
510				if (q->re.group.expr.binary.left && stats(env, q->re.group.expr.binary.left))
511					return 1;
512				if (q->re.group.expr.binary.right && stats(env, q->re.group.expr.binary.right))
513					return 1;
514			}
515			env->stats.m = m;
516			env->stats.n = n;
517			env->stats.x = x;
518			env->stats.y = y;
519			break;
520		case REX_GROUP_CUT:
521			if (++env->stats.u <= 0)
522				return 1;
523			m = env->stats.m;
524			n = env->stats.n;
525			x = env->stats.x;
526			y = env->stats.y;
527			if (stats(env, e->re.group.expr.rex))
528				return 1;
529			env->stats.m = m;
530			env->stats.n = n;
531			env->stats.x = x;
532			env->stats.y = y;
533			break;
534		case REX_NEG:
535			env->stats.i++;
536			x = env->stats.x;
537			l = env->stats.l;
538			y = env->stats.y;
539			k = env->stats.k;
540			t = env->stats.t;
541			cm = env->stats.m;
542			env->stats.m = 0;
543			if (stats(env, e->re.group.expr.rex))
544				return 1;
545			env->stats.m = !env->stats.m;
546			if ((env->stats.m += cm) < cm)
547				return 1;
548			env->stats.x = x;
549			env->stats.l = l;
550			env->stats.y = y;
551			env->stats.k = k;
552			env->stats.t = t;
553			break;
554		case REX_REP:
555			x = env->stats.x;
556			l = env->stats.l;
557			y = env->stats.y;
558			k = env->stats.k;
559			t = env->stats.t;
560			if (++env->stats.c <= 0)
561				return 1;
562			b = env->stats.b;
563			c = env->stats.c;
564			cm = env->stats.m;
565			env->stats.m = 0;
566			if (stats(env, e->re.group.expr.rex))
567				return 1;
568			if (env->stats.m == 1 && b == env->stats.b && c == env->stats.c && ++env->stats.s <= 0)
569				return 1;
570			if (e->lo < 1)
571			{
572				env->stats.x = x;
573				env->stats.l = l;
574				env->stats.y = y;
575				env->stats.k = k;
576				env->stats.t = t;
577				env->stats.m = cm;
578			}
579			else
580			{
581				m = env->stats.m;
582				if ((env->stats.m *= e->lo) > 0 && env->stats.m < m)
583					return 1;
584				m = env->stats.m;
585				if ((env->stats.m += cm) < m)
586					return 1;
587				if (env->stats.x != x)
588					env->stats.l = cm;
589				if (env->stats.y != y)
590					env->stats.k = cm;
591			}
592			break;
593		case REX_STRING:
594			if (!e->map)
595			{
596				cm = env->stats.m;
597				if ((env->stats.m += e->re.string.size) < cm)
598					return 1;
599				cn = env->stats.n;
600				if ((env->stats.n += e->re.string.size) < cn)
601					return 1;
602				if (!env->stats.x || env->stats.x->re.string.size < e->re.string.size)
603				{
604					env->stats.x = e;
605					env->stats.l = cm;
606				}
607			}
608			break;
609		case REX_TRIE:
610			if (++env->stats.s <= 0)
611				return 1;
612			cm = env->stats.m;
613			if ((env->stats.m += e->re.trie.min) < cm)
614				return 1;
615			cn = env->stats.n;
616			if ((env->stats.n += e->re.trie.max) < cn)
617				return 1;
618			env->stats.t++;
619			if (!env->stats.y || env->stats.y->re.trie.min < e->re.trie.min)
620			{
621				env->stats.y = e;
622				env->stats.k = cm;
623			}
624			break;
625		}
626	} while (e = e->next);
627	return 0;
628}
629
630static int	token(Cenv_t*);
631
632static int
633magic(register Cenv_t* env, register int c, int escaped)
634{
635	register char*	sp;
636	register int	n;
637	int		o = c;
638	int		e = env->error;
639	int		l = env->token.len;
640	short*		mp;
641	char*		ep;
642
643	if (mp = state.magic[c])
644	{
645		c = mp[env->type+escaped];
646		if (c >= T_META)
647		{
648			sp = (char*)env->cursor + env->token.len;
649			switch (c)
650			{
651			case T_LEFT:
652				n = 0;
653				ep = sp;
654				while (*sp >= '0' && *sp <= '9')
655				{
656					if (n > (INT_MAX / 10))
657					{
658						env->error = REG_BADBR;
659						goto bad;
660					}
661					n = n * 10 + *sp++ - '0';
662				}
663				if (sp == ep)
664				{
665					if (env->type < SRE || *sp != ',')
666					{
667						env->error = *sp ? REG_BADBR : REG_EBRACE;
668						goto bad;
669					}
670				}
671				else if (n > RE_DUP_MAX)
672				{
673					env->error = REG_BADBR;
674					goto bad;
675				}
676				env->token.min = n;
677				if (*sp == ',')
678				{
679					n = 0;
680					ep = ++sp;
681					while (*sp >= '0' && *sp <= '9')
682					{
683						if (n > (INT_MAX / 10))
684						{
685							env->error = REG_BADBR;
686							goto bad;
687						}
688						n = n * 10 + *sp++ - '0';
689					}
690					if (sp == ep)
691						n = RE_DUP_INF;
692					else if (n < env->token.min)
693					{
694						env->error = REG_BADBR;
695						goto bad;
696					}
697				}
698				env->token.max = n;
699				switch (*sp)
700				{
701				case 0:
702					env->error = REG_EBRACE;
703					goto bad;
704				case '\\':
705					if (!escaped)
706					{
707						env->error = REG_BADBR;
708						goto bad;
709					}
710					sp++;
711					break;
712				default:
713					if (escaped)
714					{
715						env->error = REG_BADBR;
716						goto bad;
717					}
718					break;
719				}
720				switch (*sp++)
721				{
722				case 0:
723					env->error = REG_EBRACE;
724					goto bad;
725				case '}':
726					break;
727				default:
728					env->error = REG_BADBR;
729					goto bad;
730				}
731				env->token.len = sp - (char*)env->cursor;
732				break;
733			case T_RIGHT:
734				env->error = REG_EBRACE;
735				goto bad;
736			case T_OPEN:
737				if (env->type < SRE && *sp == '?')
738				{
739					env->token.len++;
740					env->token.lex = 0;
741					goto group;
742				}
743				break;
744			case T_ESCAPE:
745				c = chresc(sp - 2, &ep);
746				if (ep < sp)
747					goto bad;
748				env->token.len += ep - sp;
749				if (c >= T_META)
750				{
751					env->token.lex = c;
752					c = C_ESC;
753				}
754				return c;
755			case T_BACK+0:
756			case T_BACK+1:
757			case T_BACK+2:
758			case T_BACK+3:
759			case T_BACK+4:
760			case T_BACK+5:
761			case T_BACK+6:
762			case T_BACK+7:
763				n = chresc(sp - 2, &ep);
764				if (ep > sp + 1)
765				{
766					env->token.len += ep - sp;
767					return n;
768				}
769				/*FALLTHROUGH*/
770			case T_BACK+8:
771			case T_BACK+9:
772				if (env->type == SRE || c == T_BACK && !(env->flags & REG_LENIENT))
773				{
774					env->error = REG_BADESC;
775					goto bad;
776				}
777				if ((env->flags & REG_MULTIREF) && isdigit(*sp))
778				{
779					c = (c - T_BACK) * 10 + (*sp - '0');
780					if (c > 0 && c <= env->parno && env->paren[c])
781						c += T_BACK;
782					else
783						c = chresc(sp - 2, &ep);
784					env->token.len++;
785				}
786				if (c == T_BACK)
787					c = 0;
788				break;
789			case T_BAD:
790				if (escaped == 1 && (env->flags & REG_LENIENT) && (c = mp[env->type+escaped+2]) >= T_META)
791					return c;
792				goto bad;
793			}
794			if (env->type >= SRE)
795			{
796				if (c == T_DOT)
797					c = '.';
798				else if (c < T_OPEN)
799				{
800					if (env->type == KRE && *(env->cursor + env->token.len) == '-' && *(env->cursor + env->token.len + 1) == '(')
801					{
802						env->token.len++;
803						env->token.att = 1;
804					}
805					if (env->type == KRE && *(env->cursor + env->token.len) == '(')
806					{
807						env->token.len++;
808						switch (c)
809						{
810						case T_AT:
811							break;
812						case T_PERCENT:
813							env->token.lex = c;
814							goto group;
815						case T_TILDE:
816							env->token.lex = 0;
817							goto group;
818						default:
819							env->token.lex = c;
820							break;
821						}
822						c = T_OPEN;
823					}
824					else if (c == T_STAR)
825						c = T_DOTSTAR;
826					else if (c == T_QUES)
827						c = T_DOT;
828					else
829					{
830						c = o;
831						env->token.len = l;
832					}
833				}
834				else if (c > T_BACK)
835				{
836					c = (c - T_BACK) * 2 - 1;
837					c = (c > env->parno || !env->paren[c]) ? o : T_BACK + c;
838				}
839				else if (env->type == KRE && !env->parnest && (env->flags & REG_SHELL_GROUP))
840				{
841					if (c == T_AND)
842						c = '&';
843					else if (c == T_BAR)
844						c = '|';
845					else if (c == T_OPEN)
846						c = '(';
847				}
848			}
849		}
850	}
851	else if (escaped == 2)
852	{
853		if (env->type >= SRE && !(env->flags & REG_SHELL_ESCAPED) || (env->flags & REG_ESCAPE) && (c == '[' || c == '-' || c == ']' || env->delimiter && c == env->delimiter))
854			/*ok*/;
855		else
856		{
857			env->error = REG_BADESC;
858			goto bad;
859		}
860	}
861	else if (escaped && !(env->flags & REG_LENIENT) && c != ']')
862	{
863		env->error = REG_BADESC;
864		goto bad;
865	}
866	return c;
867 group:
868	sp = (char*)env->cursor + env->token.len;
869	switch (*sp++)
870	{
871	case ')':
872		break;
873	case '#':
874		for (;;)
875		{
876			switch (*sp++)
877			{
878			case 0:
879				env->error = REG_EPAREN;
880				return T_BAD;
881			case ')':
882				break;
883			default:
884				continue;
885			}
886			break;
887		}
888		break;
889	default:
890		return T_GROUP;
891	}
892	env->cursor = (unsigned char*)sp;
893	return token(env);
894 bad:
895	if (escaped == 2)
896		env->error = e;
897	else if (env->flags & REG_LENIENT)
898		return o;
899	else if (escaped == 1 && !env->error)
900	{
901		if (mp || o == ']')
902			return o;
903		env->error = REG_BADESC;
904	}
905	return T_BAD;
906}
907
908static int
909token(register Cenv_t* env)
910{
911	int	c;
912	int	posixkludge;
913
914	if (env->token.push)
915		return env->token.lex;
916	env->token.att = env->token.esc = 0;
917	if ((env->token.len = MBSIZE(env->cursor)) > 1)
918		return env->token.lex = C_MB;
919	env->token.lex = 0;
920	for (;;)
921	{
922		c = *env->cursor;
923		if (c == 0 || c == env->delimiter || c == env->terminator)
924			return T_END;
925		if (!(env->flags & REG_COMMENT))
926			break;
927		if (c == '#')
928		{
929			do
930			{
931				c = *++env->cursor;
932				if (c == 0 || c == env->delimiter)
933					return T_END;
934			} while (c != '\n');
935		}
936		else if (!isspace(c))
937			break;
938		env->cursor++;
939	}
940	if (c == '\n' && (env->flags & REG_MULTIPLE) && !env->delimiter)
941	{
942		if (env->parnest)
943		{
944			env->error = REG_EPAREN;
945			return T_BAD;
946		}
947		env->parno = 0;
948		env->pattern = env->cursor + 1;
949		return T_BAR;
950	}
951	if (env->flags & REG_LITERAL)
952		return c;
953	if (posixkludge = env->posixkludge)
954	{
955		env->posixkludge = 0;
956		if (c == '*')
957			return c;
958	}
959	if (c == '\\')
960	{
961		if (env->flags & REG_SHELL_ESCAPED)
962			return c;
963		if (!(c = *(env->cursor + 1)) || c == env->terminator)
964		{
965			if (env->flags & REG_LENIENT)
966			{
967				if (c)
968				{
969					env->token.esc = env->token.len;
970					env->token.len += MBSIZE(env->cursor + 1);
971					return c;
972				}
973				return '\\';
974			}
975			env->error = REG_EESCAPE;
976			return T_BAD;
977		}
978		env->token.esc = env->token.len;
979		env->token.len += MBSIZE(env->cursor + 1);
980		if (env->delimiter && c == 'n')
981			return '\n';
982		else if (c == env->delimiter)
983			return magic(env, c, 0);
984		else if (c == '(' && env->type == BRE)
985			env->posixkludge = 1;
986		else if (c == ')' && env->type == BRE && env->parnest <= 0)
987		{
988			env->error = REG_EPAREN;
989			return T_BAD;
990		}
991		else if (isspace(c) && (env->flags & REG_COMMENT))
992			return c;
993		return magic(env, c, 1);
994	}
995	else if (c == '$')
996	{
997		if (env->type == BRE && (*(env->cursor + 1) == 0 || *(env->cursor + 1) == env->delimiter || *(env->cursor + 1) == env->terminator || *(env->cursor + 1) == '\\' && *(env->cursor + 2) == ')') || (env->flags & REG_MULTIPLE) && *(env->cursor + 1) == '\n')
998			return T_DOLL;
999	}
1000	else if (c == '^')
1001	{
1002		if (env->type == BRE && (env->cursor == env->pattern || posixkludge == 1))
1003		{
1004			env->posixkludge = 2;
1005			return T_CFLX;
1006		}
1007	}
1008	else if (c == ')')
1009	{
1010		if (env->type != BRE && env->parnest <= 0)
1011			return c;
1012	}
1013	else if (c == '/' && env->explicit == env->mappedslash)
1014	{
1015		while (*(env->cursor + env->token.len) == c)
1016			env->token.len++;
1017		return T_SLASHPLUS;
1018	}
1019	return magic(env, c, 0);
1020}
1021
1022static Celt_t*
1023col(Celt_t* ce, int ic, unsigned char* bp, int bw, int bc, unsigned char* ep, int ew, int ec)
1024{
1025	register char*		s;
1026	register unsigned char*	k;
1027	register unsigned char*	e;
1028	register int		c;
1029	register int		cc;
1030	int			bt;
1031	int			et;
1032	Ckey_t			key;
1033
1034	cc = 0;
1035	for (;;)
1036	{
1037		k = key;
1038		if (bw == 1)
1039		{
1040			c = bc;
1041			if (ic)
1042			{
1043				if (isupper(c))
1044				{
1045					c = tolower(c);
1046					cc = -1;
1047				}
1048				else if (islower(c))
1049				{
1050					c = toupper(c);
1051					cc = -1;
1052				}
1053			}
1054			*k++ = c;
1055		}
1056		else if (bw < COLL_KEY_MAX)
1057		{
1058			s = (char*)bp;
1059			if (ic)
1060			{
1061				c = mbchar(s);
1062				if (iswupper(c))
1063				{
1064					c = towlower(c);
1065					cc = 1;
1066				}
1067				else if (iswlower(c))
1068				{
1069					c = towupper(c);
1070					cc = 1;
1071				}
1072			}
1073			if (cc > 0)
1074			{
1075				cc = -1;
1076				k += mbconv((char*)k, c);
1077			}
1078			else
1079				for (e = k + bw; k < e; *k++ = *s++);
1080		}
1081		*k = 0;
1082		mbxfrm(ce->beg, key, COLL_KEY_MAX);
1083		if (ep)
1084		{
1085			k = key;
1086			c = mbchar(k);
1087			if (iswupper(c))
1088				bt = COLL_range_uc;
1089			else if (iswlower(c))
1090				bt = COLL_range_lc;
1091			else
1092				bt = COLL_range;
1093			k = key;
1094			if (ew == 1)
1095			{
1096				c = ec;
1097				if (ic)
1098				{
1099					if (isupper(c))
1100					{
1101						c = tolower(c);
1102						cc = -1;
1103					}
1104					else if (islower(c))
1105					{
1106						c = toupper(c);
1107						cc = -1;
1108					}
1109				}
1110				*k++ = c;
1111			}
1112			else if (ew < COLL_KEY_MAX)
1113			{
1114				s = (char*)ep;
1115				if (ic)
1116				{
1117					c = mbchar(s);
1118					if (iswupper(c))
1119					{
1120						c = towlower(c);
1121						cc = 1;
1122					}
1123					else if (iswlower(c))
1124					{
1125						c = towupper(c);
1126						cc = 1;
1127					}
1128				}
1129				if (cc > 0)
1130				{
1131					cc = -1;
1132					k += mbconv((char*)k, c);
1133				}
1134				else
1135					for (e = k + ew; k < e; *k++ = *s++);
1136			}
1137			*k = 0;
1138			mbxfrm(ce->end, key, COLL_KEY_MAX);
1139			k = key;
1140			c = mbchar(k);
1141			if (iswupper(c))
1142				et = COLL_range_uc;
1143			else if (iswlower(c))
1144				et = COLL_range_lc;
1145			else
1146				et = COLL_range;
1147			ce->typ = bt == et ? bt : COLL_range;
1148		}
1149		else
1150			ce->typ = COLL_char;
1151		ce++;
1152		if (!ic || !cc)
1153			break;
1154		ic = 0;
1155	}
1156	return ce;
1157}
1158
1159static Rex_t*
1160bra(Cenv_t* env)
1161{
1162	Rex_t*		e;
1163	int		c;
1164	int		i;
1165	int		w;
1166	int		neg;
1167	int		last;
1168	int		inrange;
1169	int		complicated;
1170	int		collate;
1171	int		elements;
1172	unsigned char*	first;
1173	unsigned char*	start;
1174	unsigned char*	begin;
1175	unsigned char*	s;
1176	regclass_t	f;
1177	unsigned char	buf[4 * (COLL_KEY_MAX + 1)];
1178#if _PACKAGE_ast
1179	int		ic;
1180	char		mbc[COLL_KEY_MAX + 1];
1181#endif
1182
1183	if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t))))
1184		return 0;
1185	collate = complicated = elements = 0;
1186	if (*env->cursor == '^' || env->type >= SRE && *env->cursor == '!')
1187	{
1188		env->cursor++;
1189		neg = 1;
1190	}
1191	else
1192		neg = 0;
1193	first = env->cursor;
1194	start = first + MBSIZE(first);
1195	if (*env->cursor == 0 || *(env->cursor + 1) == 0 || *env->cursor == env->terminator || *(env->cursor + 1) == env->terminator || (env->flags & REG_ESCAPE) && (*env->cursor == env->delimiter || *env->cursor != '\\' && *(env->cursor + 1) == env->delimiter))
1196		goto error;
1197	begin = env->cursor + MBSIZE(env->cursor);
1198
1199	/*
1200	 * inrange: 0=no, 1=possibly, 2=definitely
1201	 */
1202
1203	inrange = 0;
1204	for (;;)
1205	{
1206		if (!(c = *env->cursor) || c == env->terminator || c == env->delimiter && (env->flags & REG_ESCAPE))
1207			goto error;
1208		env->cursor += (w = MBSIZE(env->cursor));
1209		if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || env->type >= SRE && env->parnest || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)))
1210		{
1211			if (*env->cursor)
1212			{
1213				if (*env->cursor == 'n')
1214				{
1215					env->cursor++;
1216					c = '\n';
1217				}
1218				else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED))
1219				{
1220					env->token.len = 1;
1221					w = magic(env, *env->cursor, 2);
1222					if (env->token.len > 1 || w != T_BAD)
1223					{
1224						if (env->token.len == 1 && (f = classfun(w)))
1225						{
1226							if (inrange > 1)
1227							{
1228								if (env->type < SRE && !(env->flags & REG_LENIENT))
1229									goto erange;
1230								inrange = 0;
1231							}
1232							env->cursor++;
1233							for (c = 0; c <= UCHAR_MAX; c++)
1234								if ((*f)(c))
1235									setadd(e->re.charclass, c);
1236							complicated++;
1237							elements++;
1238							continue;
1239						}
1240						if (env->token.len > 1 || w >= 0 && w < T_META)
1241						{
1242							c = w;
1243							if (c > UCHAR_MAX)
1244							{
1245								if (env->type < SRE && !(env->flags & REG_LENIENT) && !mbwide())
1246									goto erange;
1247								c = UCHAR_MAX;
1248							}
1249							env->cursor += env->token.len;
1250						}
1251					}
1252				}
1253			}
1254		}
1255		else if (c == ']')
1256		{
1257			if (env->cursor == begin)
1258			{
1259				last = c;
1260				inrange = 1;
1261				continue;
1262			}
1263			if (inrange != 0)
1264			{
1265				setadd(e->re.charclass, last);
1266				elements++;
1267				if (inrange == 2)
1268				{
1269					setadd(e->re.charclass, '-');
1270					elements++;
1271				}
1272			}
1273			break;
1274		}
1275		else if (c == '-')
1276		{
1277			if (!inrange && env->cursor != begin && *env->cursor != ']')
1278			{
1279				if (env->type < SRE && !(env->flags & REG_LENIENT))
1280					goto erange;
1281				continue;
1282			}
1283			else if (inrange == 1)
1284			{
1285				inrange = 2;
1286				complicated++;
1287				continue;
1288			}
1289		}
1290		else if (c == '[')
1291		{
1292			switch (*env->cursor)
1293			{
1294			case 0:
1295				goto error;
1296			case ':':
1297				if (env->regexp)
1298					goto normal;
1299				if (inrange == 1)
1300				{
1301					setadd(e->re.charclass, last);
1302					elements++;
1303				}
1304				if (!(f = regclass((char*)env->cursor, (char**)&env->cursor)))
1305				{
1306					if (env->cursor == start && (c = *(env->cursor + 1)))
1307					{
1308						s = start = env->cursor + 1;
1309						while (*++s && *s != ':');
1310						if (*s == ':' && *(s + 1) == ']' && *(s + 2) == ']')
1311						{
1312							if ((i = (s - start)) == 1)
1313							{
1314								switch (c)
1315								{
1316								case '<':
1317									i = REX_WBEG;
1318									break;
1319								case '>':
1320									i = REX_WEND;
1321									break;
1322								default:
1323									i = 0;
1324									break;
1325								}
1326								if (i)
1327								{
1328									env->cursor = s + 3;
1329									drop(env->disc, e);
1330									return node(env, i, 0, 0, 0);
1331								}
1332							}
1333						}
1334					}
1335					env->error = REG_ECTYPE;
1336					goto error;
1337				}
1338				for (c = 0; c <= UCHAR_MAX; c++)
1339					if ((*f)(c))
1340						setadd(e->re.charclass, c);
1341				inrange = 0;
1342				complicated++;
1343				elements++;
1344				continue;
1345			case '=':
1346				if (env->regexp)
1347					goto normal;
1348				if (inrange == 2)
1349					goto erange;
1350				if (inrange == 1)
1351				{
1352					setadd(e->re.charclass, last);
1353					elements++;
1354				}
1355				if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0)
1356					goto ecollate;
1357				if (c > 1)
1358					collate++;
1359				else
1360					setadd(e->re.charclass, buf[0]);
1361				c = buf[0];
1362				inrange = 0;
1363				complicated++;
1364				elements++;
1365				continue;
1366			case '.':
1367				if (env->regexp)
1368					goto normal;
1369				if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0)
1370					goto ecollate;
1371				if (c > 1)
1372					collate++;
1373				c = buf[0];
1374				complicated++;
1375				break;
1376			default:
1377			normal:
1378				if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))
1379					goto error;
1380				break;
1381			}
1382		}
1383		else if (w > 1)
1384			complicated++;
1385		if (inrange == 2)
1386		{
1387			if (last <= c)
1388			{
1389				for (i = last; i <= c; i++)
1390					setadd(e->re.charclass, i);
1391				inrange = env->type >= SRE || (env->flags & REG_LENIENT);
1392				elements += 2;
1393			}
1394			else if (env->type >= SRE)
1395			{
1396				setadd(e->re.charclass, last);
1397				setadd(e->re.charclass, c);
1398				elements += 2;
1399				inrange = 1;
1400			}
1401			else if (!(env->flags & REG_LENIENT))
1402				goto erange;
1403			else
1404				inrange = 0;
1405		}
1406		else if (inrange == 1)
1407		{
1408			setadd(e->re.charclass, last);
1409			elements++;
1410		}
1411		else
1412			inrange = 1;
1413		last = c;
1414	}
1415#if _PACKAGE_ast
1416	if (complicated && mbcoll())
1417	{
1418		Dt_t*			dt;
1419		Cchr_t*			cc;
1420		Cchr_t*			tc;
1421		Cchr_t*			xc;
1422		Celt_t*			ce;
1423		Cchr_t			key;
1424		int			rw;
1425		int			rc;
1426		int			wc;
1427		unsigned char*		rp;
1428		unsigned char*		pp;
1429		char*			wp;
1430		char			cb[2][COLL_KEY_MAX+1];
1431
1432		static Dtdisc_t		disc;
1433
1434		static const char	primary[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
1435
1436		if (!(dt = (Dt_t*)LCINFO(AST_LC_COLLATE)->data))
1437		{
1438			disc.key = offsetof(Cchr_t, key);
1439			if ((cc = newof(0, Cchr_t, elementsof(primary), 0)) && (dt = dtopen(&disc, Dttree)))
1440			{
1441				for (i = 0; i < elementsof(primary) - 1; i++, cc++)
1442				{
1443					cc->nam[0] = primary[i];
1444					mbxfrm(cc->key, cc->nam, COLL_KEY_MAX);
1445					dtinsert(dt, cc);
1446				}
1447				for (i = 0; i < elementsof(cc->key); i++)
1448					cc->key[i] = ~0;
1449				dtinsert(dt, cc);
1450				LCINFO(AST_LC_COLLATE)->data = (void*)dt;
1451			}
1452			else
1453			{
1454				if (cc)
1455					free(cc);
1456				drop(env->disc, e);
1457				return 0;
1458			}
1459		}
1460		if (dt)
1461		{
1462			drop(env->disc, e);
1463			if (ic = env->flags & REG_ICASE)
1464				elements *= 2;
1465			if (!(e = node(env, REX_COLL_CLASS, 1, 1, (elements + 2) * sizeof(Celt_t))))
1466				return 0;
1467			ce = (Celt_t*)e->re.data;
1468			e->re.collate.invert = neg;
1469			e->re.collate.elements = ce;
1470			env->cursor = first;
1471			inrange = 0;
1472			for (;;)
1473			{
1474				if ((c = *env->cursor) == 0 || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter)
1475					goto error;
1476				pp = env->cursor;
1477				env->cursor += (w = MBSIZE(env->cursor));
1478				if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || env->type >= SRE && env->parnest || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)))
1479				{
1480					if (*env->cursor)
1481					{
1482						if (*env->cursor == 'n')
1483						{
1484							pp = env->cursor++;
1485							c = '\n';
1486						}
1487						else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED))
1488						{
1489							env->token.len = 1;
1490							w = magic(env, *env->cursor, 2);
1491							if (env->token.len > 1 || w != T_BAD)
1492							{
1493								if (env->token.len == 1 && (f = classfun(w)))
1494								{
1495									if (inrange > 1)
1496									{
1497										if (env->type < SRE && !(env->flags & REG_LENIENT))
1498											goto erange;
1499										inrange = 0;
1500									}
1501									env->cursor++;
1502									ce->fun = f;
1503									ce->typ = COLL_call;
1504									ce++;
1505									continue;
1506								}
1507								if (env->token.len > 1 || w >= 0 && w < T_META)
1508								{
1509									c = w;
1510									w = mbconv(mbc, c);
1511									pp = (unsigned char*)mbc;
1512									env->cursor += env->token.len;
1513								}
1514							}
1515						}
1516					}
1517				}
1518				else if (c == ']')
1519				{
1520					if (env->cursor == begin)
1521					{
1522						rp = pp;
1523						rw = w;
1524						inrange = 1;
1525						continue;
1526					}
1527					if (inrange != 0)
1528					{
1529						ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1530						if (inrange == 2)
1531							ce = col(ce, ic, NiL, 1, '-', NiL, 0, 0);
1532					}
1533					break;
1534				}
1535				else if (c == '-')
1536				{
1537					if (!inrange && env->cursor != begin && *env->cursor != ']')
1538					{
1539						if (env->type < SRE && !(env->flags & REG_LENIENT))
1540							goto erange;
1541						continue;
1542					}
1543					else if (inrange == 1)
1544					{
1545						inrange = 2;
1546						continue;
1547					}
1548				}
1549				else if (c == '[')
1550				{
1551					switch (*env->cursor)
1552					{
1553					case 0:
1554						goto error;
1555					case ':':
1556						if (env->regexp)
1557							goto complicated_normal;
1558						if (inrange == 1)
1559							ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1560						if (!(f = regclass((char*)env->cursor, (char**)&env->cursor)))
1561						{
1562							if (env->cursor == start && (c = *(env->cursor + 1)) && *(env->cursor + 2) == ':' && *(env->cursor + 3) == ']' && *(env->cursor + 4) == ']')
1563							{
1564								switch (c)
1565								{
1566								case '<':
1567									i = REX_WBEG;
1568									break;
1569								case '>':
1570									i = REX_WEND;
1571									break;
1572								default:
1573									i = 0;
1574									break;
1575								}
1576								if (i)
1577								{
1578									env->cursor += 5;
1579									drop(env->disc, e);
1580									return node(env, i, 0, 0, 0);
1581								}
1582							}
1583							env->error = REG_ECTYPE;
1584							goto error;
1585						}
1586						ce->fun = f;
1587						ce->typ = COLL_call;
1588						ce++;
1589						inrange = 0;
1590						continue;
1591					case '=':
1592						if (env->regexp)
1593							goto complicated_normal;
1594						if (inrange == 2)
1595							goto erange;
1596						if (inrange == 1)
1597							ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1598						pp = (unsigned char*)cb[inrange];
1599						rp = env->cursor + 1;
1600						if ((rw = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0)
1601							goto ecollate;
1602						wp = (char*)pp;
1603						wc = mbchar(wp);
1604						c = 0;
1605						if (ic)
1606						{
1607							if (iswupper(wc))
1608							{
1609								wc = towlower(wc);
1610								rw = mbconv((char*)pp, wc);
1611								c = 'u';
1612							}
1613							else if (iswlower(wc))
1614								c = 'l';
1615						}
1616						for (;;)
1617						{
1618							mbxfrm(key.key, (char*)pp, COLL_KEY_MAX);
1619							if (!(cc = (Cchr_t*)dtsearch(dt, &key)) && !(cc = (Cchr_t*)dtprev(dt, &key)))
1620								goto ecollate;
1621							xc = (tc = (Cchr_t*)dtprev(dt, cc)) && !strcasecmp((char*)tc->nam, (char*)cc->nam) ? tc : cc;
1622							if (c == 'l' || c == 'L' && !(c = 0))
1623								ce->typ = COLL_range_lc;
1624							else if (c == 'u' || c == 'U' && !(c = 0))
1625								ce->typ = COLL_range_uc;
1626							else
1627								ce->typ = COLL_range;
1628							strcpy((char*)ce->beg, (char*)xc->key);
1629							if (!(cc = (Cchr_t*)dtnext(dt, cc)))
1630								goto ecollate;
1631							if (!strcasecmp((char*)xc->nam, (char*)cc->nam) && (tc = (Cchr_t*)dtnext(dt, cc)))
1632								cc = tc;
1633							strcpy((char*)ce->end, (char*)cc->key);
1634							ce->max = -1;
1635							ce++;
1636							if (!c)
1637								break;
1638							if (c == 'u')
1639							{
1640								wc = towlower(wc);
1641								c = 'L';
1642							}
1643							else
1644							{
1645								wc = towupper(wc);
1646								c = 'U';
1647							}
1648							rw = mbconv((char*)pp, wc);
1649						}
1650						inrange = 0;
1651						c = *pp;
1652						continue;
1653					case '.':
1654						if (env->regexp)
1655							goto complicated_normal;
1656						pp = (unsigned char*)cb[inrange];
1657						if ((w = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0)
1658							goto ecollate;
1659						c = buf[0];
1660						break;
1661					default:
1662					complicated_normal:
1663						if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))
1664							goto error;
1665						break;
1666					}
1667				}
1668				if (inrange == 2)
1669				{
1670					ce = col(ce, ic, rp, rw, rc, pp, w, c);
1671					if (strcmp((char*)ce->beg, (char*)ce->end) > 0)
1672					{
1673						if (env->type < SRE && !(env->flags & REG_LENIENT))
1674							goto erange;
1675						(ce-1)->typ = COLL_char;
1676						strcpy((char*)ce->beg, (char*)(ce-1)->end);
1677						ce->typ = COLL_char;
1678						ce++;
1679					}
1680					inrange = env->type >= SRE || (env->flags & REG_LENIENT);
1681				}
1682				else if (inrange == 1)
1683					ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1684				else
1685					inrange = 1;
1686				rp = pp;
1687				rw = w;
1688				rc = c;
1689			}
1690			ce->typ = COLL_end;
1691			return e;
1692		}
1693	}
1694#endif
1695	if (collate)
1696		goto ecollate;
1697	if (env->flags & REG_ICASE)
1698		for (i = 0; i <= UCHAR_MAX; i++)
1699			if (settst(e->re.charclass, i))
1700			{
1701				if (isupper(i))
1702					c = tolower(i);
1703				else if (islower(i))
1704					c = toupper(i);
1705				else
1706					continue;
1707				setadd(e->re.charclass, c);
1708			}
1709	if (neg)
1710	{
1711		for (i = 0; i < elementsof(e->re.charclass->bits); i++)
1712			e->re.charclass->bits[i] ^= ~0;
1713		if (env->explicit >= 0)
1714			setclr(e->re.charclass, env->explicit);
1715	}
1716	return e;
1717 ecollate:
1718	env->error = REG_ECOLLATE;
1719	goto error;
1720 erange:
1721	env->error = REG_ERANGE;
1722 error:
1723	drop(env->disc, e);
1724	if (!env->error)
1725		env->error = REG_EBRACK;
1726	return 0;
1727}
1728
1729static Rex_t*
1730ccl(Cenv_t* env, int type)
1731{
1732	int		i;
1733	Rex_t*		e;
1734	Celt_t*		ce;
1735	regclass_t	f;
1736
1737	if (!(f = classfun(type)))
1738	{
1739		env->error = REG_BADESC;
1740		return 0;
1741	}
1742	if (!mbcoll())
1743	{
1744		if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t))))
1745			return 0;
1746		for (i = 0; i <= UCHAR_MAX; i++)
1747			if ((*f)(i))
1748				setadd(e->re.charclass, i);
1749		if (env->explicit >= 0)
1750			setclr(e->re.charclass, env->explicit);
1751	}
1752	else
1753	{
1754		if (!(e = node(env, REX_COLL_CLASS, 1, 1, 2 * sizeof(Celt_t))))
1755			return 0;
1756		ce = (Celt_t*)e->re.data;
1757		e->re.collate.invert = 0;
1758		e->re.collate.elements = ce;
1759		ce->fun = f;
1760		ce->typ = COLL_call;
1761		ce++;
1762		ce->typ = COLL_end;
1763	}
1764	return e;
1765}
1766
1767static Rex_t*
1768rep(Cenv_t* env, Rex_t* e, int number, int last)
1769{
1770	Rex_t*		f;
1771	unsigned long	m = 0;
1772	unsigned long	n = RE_DUP_INF;
1773	int		minimal = -1;
1774
1775	if (!e)
1776		return 0;
1777	switch (token(env))
1778	{
1779	case T_BANG:
1780		eat(env);
1781		if (!(f = node(env, REX_NEG, m, n, 0)))
1782		{
1783			drop(env->disc, e);
1784			return 0;
1785		}
1786		f->re.group.expr.rex = e;
1787		return f;
1788	case T_QUES:
1789		eat(env);
1790		n = 1;
1791		break;
1792	case T_STAR:
1793		eat(env);
1794		break;
1795	case T_PLUS:
1796		eat(env);
1797		m = 1;
1798		break;
1799	case T_LEFT:
1800		eat(env);
1801		m = env->token.min;
1802		n = env->token.max;
1803		break;
1804	default:
1805		return e;
1806	}
1807	if (env->token.att)
1808		minimal = 1;
1809	else if (env->type < SRE)
1810		switch (token(env))
1811		{
1812		case T_QUES:
1813			eat(env);
1814			minimal = !(env->flags & REG_MINIMAL);
1815			break;
1816		case T_STAR: /*AST*/
1817			eat(env);
1818			minimal = !!(env->flags & REG_MINIMAL);
1819			break;
1820		}
1821	switch (e->type)
1822	{
1823	case REX_DOT:
1824	case REX_CLASS:
1825	case REX_COLL_CLASS:
1826	case REX_ONECHAR:
1827		e->lo = m;
1828		e->hi = n;
1829		if (minimal >= 0)
1830			mark(e, minimal);
1831		return e;
1832#if HUH_2002_08_07
1833	case REX_BEG:
1834#endif
1835	case REX_BEG_STR:
1836	case REX_END_STR:
1837	case REX_FIN_STR:
1838	case REX_WBEG:
1839	case REX_WEND:
1840	case REX_WORD:
1841	case REX_WORD_NOT:
1842		env->error = REG_BADRPT;
1843		drop(env->disc, e);
1844		return 0;
1845	}
1846	if (m == 1 && n == 1)
1847	{
1848		if (minimal >= 0)
1849			mark(e, minimal);
1850		return e;
1851	}
1852	if (!(f = node(env, REX_REP, m, n, 0)))
1853	{
1854		drop(env->disc, e);
1855		return 0;
1856	}
1857	f->re.group.expr.rex = e;
1858	f->re.group.number = number;
1859	f->re.group.last = last;
1860	if (minimal >= 0)
1861		mark(f, minimal);
1862	if (m <= n && n)
1863	{
1864		for (; e && e->type >= REX_GROUP && e->type <= REX_GROUP_CUT; e = e->re.group.expr.rex);
1865		if (e && e->type == REX_NEG)
1866			f->type = REX_GROUP;
1867	}
1868	return f;
1869}
1870
1871static int
1872isstring(Rex_t* e)
1873{
1874	switch (e->type)
1875	{
1876	case REX_ONECHAR:
1877		return e->lo == 1 && e->hi == 1;
1878	case REX_STRING:
1879		return 1;
1880	}
1881	return 0;
1882}
1883
1884static Trie_node_t*
1885trienode(Cenv_t* env, int c)
1886{
1887	Trie_node_t*	t;
1888
1889	if (t = (Trie_node_t*)alloc(env->disc, 0, sizeof(Trie_node_t)))
1890	{
1891		memset(t, 0, sizeof(Trie_node_t));
1892		t->c = c;
1893	}
1894	return t;
1895}
1896
1897static int
1898insert(Cenv_t* env, Rex_t* f, Rex_t* g)
1899{
1900	unsigned char*	s;
1901	unsigned char*	e;
1902	Trie_node_t*	t;
1903	int		len;
1904	unsigned char	tmp[2];
1905
1906	switch (f->type)
1907	{
1908	case REX_ONECHAR:
1909		*(s = tmp) = f->re.onechar;
1910		e = s + 1;
1911		break;
1912	case REX_STRING:
1913		s = f->re.string.base;
1914		e = s + f->re.string.size;
1915		break;
1916	default:
1917		return 1;
1918	}
1919	if (!(t = g->re.trie.root[*s]) && !(t = g->re.trie.root[*s] = trienode(env, *s)))
1920		return 1;
1921	for (len = 1;;)
1922	{
1923		if (t->c == *s)
1924		{
1925			if (++s >= e)
1926				break;
1927			if (!t->son && !(t->son = trienode(env, *s)))
1928				return 1;
1929			t = t->son;
1930			len++;
1931		}
1932		else
1933		{
1934			if (!t->sib && !(t->sib = trienode(env, *s)))
1935				return 1;
1936			t = t->sib;
1937		}
1938	}
1939	if (g->re.trie.min > len)
1940		g->re.trie.min = len;
1941	if (g->re.trie.max < len)
1942		g->re.trie.max = len;
1943	t->end = 1;
1944	return 0;
1945}
1946
1947/*
1948 * trie() tries to combine nontrivial e and f into a REX_TRIE
1949 * unless 0 is returned, e and f are deleted as far as possible
1950 * also called by regcomb
1951 */
1952
1953static Rex_t*
1954trie(Cenv_t* env, Rex_t* e, Rex_t* f)
1955{
1956	Rex_t*	g;
1957
1958	if (e->next || f->next || !isstring(e) || e->flags != f->flags)
1959		return 0;
1960	if (isstring(f))
1961	{
1962		if (!(g = node(env, REX_TRIE, 0, 0, (UCHAR_MAX + 1) * sizeof(Trie_node_t*))))
1963			return 0;
1964		g->re.trie.min = INT_MAX;
1965		if (insert(env, f, g))
1966			goto nospace;
1967		drop(env->disc, f);
1968	}
1969	else if (f->type != REX_TRIE)
1970		return 0;
1971	else
1972		g = f;
1973	if (insert(env, e, g))
1974		goto nospace;
1975	drop(env->disc, e);
1976	return g;
1977 nospace:
1978	if (g != f)
1979		drop(env->disc, g);
1980	return 0;
1981}
1982
1983static Rex_t*		alt(Cenv_t*, int, int);
1984
1985static int
1986chr(register Cenv_t* env, int* escaped)
1987{
1988	unsigned char*	p;
1989	int		c;
1990
1991	*escaped = 0;
1992	if (!(c = *env->cursor))
1993		return -1;
1994	env->cursor++;
1995	if (c == '\\')
1996	{
1997		if (env->flags & REG_SHELL_ESCAPED)
1998			return c;
1999		if (!(c = *(env->cursor + 1)) || c == env->terminator)
2000		{
2001			if (env->flags & REG_LENIENT)
2002				return c ? c : '\\';
2003			env->error = REG_EESCAPE;
2004			return -1;
2005		}
2006		p = env->cursor;
2007		c = chresc((char*)env->cursor - 1, (char**)&env->cursor);
2008		*escaped = env->cursor - p;
2009	}
2010	return c;
2011}
2012
2013/*
2014 * open the perly gates
2015 */
2016
2017static Rex_t*
2018grp(Cenv_t* env, int parno)
2019{
2020	Rex_t*		e;
2021	Rex_t*		f;
2022	int		c;
2023	int		i;
2024	int		n;
2025	int		x;
2026	int		esc;
2027	int		typ;
2028	int		beg;
2029	unsigned char*	p;
2030
2031	beg = env->pattern == env->cursor - env->token.len;
2032	if (!(c = env->token.lex) && (c = *env->cursor))
2033		env->cursor++;
2034	env->token.len = 0;
2035	env->parnest++;
2036	typ = -1;
2037	switch (c)
2038	{
2039	case '-':
2040	case '+':
2041	case 'a':
2042	case 'g':
2043	case 'i':
2044	case 'l':
2045	case 'm':
2046	case 'p':
2047	case 'r':
2048	case 's':
2049	case 'x':
2050	case 'A':
2051	case 'B':
2052	case 'E':
2053	case 'F':
2054	case 'G':
2055	case 'I':
2056	case 'K':
2057	case 'L':
2058	case 'M':	/* glob(3) */
2059	case 'N':	/* glob(3) */
2060	case 'O':	/* glob(3) */
2061	case 'P':
2062	case 'R':	/* pcre */
2063	case 'S':
2064	case 'U':	/* pcre */
2065	case 'X':	/* pcre */
2066		x = REX_GROUP;
2067		i = 1;
2068		env->token.push = 1;
2069		for (;;)
2070		{
2071			switch (c)
2072			{
2073			case ')':
2074				if (!(env->flags & REG_LITERAL))
2075				{
2076					env->error = REG_BADRPT;
2077					return 0;
2078				}
2079				/*FALLTHROUGH*/
2080			case 0:
2081			case T_CLOSE:
2082				x = 0;
2083				goto done;
2084			case ':':
2085				eat(env);
2086				if (token(env) == T_CLOSE)
2087					x = 0;
2088				goto done;
2089			case '-':
2090				i = 0;
2091				break;
2092			case '+':
2093				i = 1;
2094				break;
2095			case 'a':
2096				if (i)
2097					env->flags |= (REG_LEFT|REG_RIGHT);
2098				else
2099					env->flags &= ~(REG_LEFT|REG_RIGHT);
2100				break;
2101			case 'g':
2102				if (i)
2103					env->flags &= ~REG_MINIMAL;
2104				else
2105					env->flags |= REG_MINIMAL;
2106				break;
2107			case 'i':
2108				if (i)
2109					env->flags |= REG_ICASE;
2110				else
2111					env->flags &= ~REG_ICASE;
2112				break;
2113			case 'l':
2114				if (i)
2115					env->flags |= REG_LEFT;
2116				else
2117					env->flags &= ~REG_LEFT;
2118				break;
2119			case 'm':
2120				if (i)
2121					env->flags |= REG_NEWLINE;
2122				else
2123					env->flags &= ~REG_NEWLINE;
2124				env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1;
2125				break;
2126			case 'p':
2127				if (i)
2128					env->flags &= ~REG_LENIENT;
2129				else
2130					env->flags |= REG_LENIENT;
2131				break;
2132			case 'r':
2133				if (i)
2134					env->flags |= REG_RIGHT;
2135				else
2136					env->flags &= ~REG_RIGHT;
2137				break;
2138			case 's':
2139				if (i)
2140					env->flags |= REG_SPAN;
2141				else
2142					env->flags &= ~REG_SPAN;
2143				env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1;
2144				break;
2145			case 'x':
2146				if (i)
2147					env->flags |= REG_COMMENT;
2148				else
2149					env->flags &= ~REG_COMMENT;
2150				break;
2151			case 'A':
2152				env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2153				env->flags |= REG_AUGMENTED|REG_EXTENDED;
2154				typ = ARE;
2155				break;
2156			case 'B':
2157			case 'G':
2158				env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2159				typ = BRE;
2160				break;
2161			case 'E':
2162				env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2163				env->flags |= REG_EXTENDED;
2164				typ = ERE;
2165				break;
2166			case 'F':
2167			case 'L':
2168				env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2169				env->flags |= REG_LITERAL;
2170				typ = ERE;
2171				break;
2172			case 'K':
2173				env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2174				env->flags |= REG_AUGMENTED|REG_SHELL|REG_LEFT|REG_RIGHT;
2175				typ = KRE;
2176				break;
2177			case 'M':
2178				/* used by caller to disable glob(3) GLOB_BRACE */
2179				break;
2180			case 'N':
2181				/* used by caller to disable glob(3) GLOB_NOCHECK */
2182				break;
2183			case 'O':
2184				/* used by caller to disable glob(3) GLOB_STARSTAR */
2185				break;
2186			case 'P':
2187				env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2188				env->flags |= REG_EXTENDED|REG_CLASS_ESCAPE;
2189				typ = ERE;
2190				break;
2191			case 'S':
2192				env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2193				env->flags |= REG_SHELL|REG_LEFT|REG_RIGHT;
2194				typ = SRE;
2195				break;
2196			case 'U': /* PCRE_UNGREEDY */
2197				if (i)
2198					env->flags |= REG_MINIMAL;
2199				else
2200					env->flags &= ~REG_MINIMAL;
2201				break;
2202			case 'X': /* PCRE_EXTRA */
2203				break;
2204			default:
2205				env->error = REG_BADRPT;
2206				return 0;
2207			}
2208			eat(env);
2209			c = token(env);
2210		}
2211	done:
2212		break;
2213	case ':':
2214		x = REX_GROUP;
2215		break;
2216	case '=':
2217		x = REX_GROUP_AHEAD;
2218		break;
2219	case '!':
2220		x = REX_GROUP_AHEAD_NOT;
2221		break;
2222	case '<':
2223		switch (token(env))
2224		{
2225		case '=':
2226			x = REX_GROUP_BEHIND;
2227			break;
2228		case '!':
2229		case T_BANG:
2230			x = REX_GROUP_BEHIND_NOT;
2231			break;
2232		default:
2233			env->error = REG_BADRPT;
2234			return 0;
2235		}
2236		eat(env);
2237		break;
2238	case '>':
2239		x = REX_GROUP_CUT;
2240		break;
2241	case '%':
2242	case T_PERCENT:
2243		e = node(env, REX_NEST, 0, 0, (UCHAR_MAX + 1) * sizeof(unsigned short));
2244		e->re.nest.primary = isalnum(*env->cursor) ? -1 : *env->cursor;
2245		n = 1;
2246		for (;;)
2247		{
2248			switch (i = chr(env, &esc))
2249			{
2250			case -1:
2251			case 0:
2252			invalid:
2253				env->cursor -= esc + 1;
2254				env->error = REG_EPAREN;
2255				return 0;
2256			case 'D':
2257				x = REX_NEST_delimiter;
2258				/*FALLTHROUGH*/
2259			delimiter:
2260				if ((i = chr(env, &esc)) < 0)
2261					goto invalid;
2262				if (e->re.nest.type[i] & ~x)
2263					goto invalid;
2264				e->re.nest.type[i] = x;
2265				continue;
2266			case 'E':
2267				x = REX_NEST_escape;
2268				goto delimiter;
2269			case 'L':
2270				x = REX_NEST_literal;
2271				goto quote;
2272			case 'O':
2273				switch (i = chr(env, &esc))
2274				{
2275				case 'T':
2276					e->re.nest.type[UCHAR_MAX+1] |= REX_NEST_terminator;
2277					break;
2278				default:
2279					goto invalid;
2280				}
2281				continue;
2282			case 'Q':
2283				x = REX_NEST_quote;
2284				/*FALLTHROUGH*/
2285			quote:
2286				if ((i = chr(env, &esc)) < 0)
2287					goto invalid;
2288				if (e->re.nest.type[i] & ~x)
2289					goto invalid;
2290				e->re.nest.type[i] = x|REX_NEST_open|REX_NEST_close|(i<<REX_NEST_SHIFT);
2291				continue;
2292			case 'S':
2293				x = REX_NEST_separator;
2294				goto delimiter;
2295			case 'T':
2296				x = REX_NEST_terminator;
2297				goto delimiter;
2298			case '|':
2299			case '&':
2300				if (!esc)
2301					goto invalid;
2302				goto nesting;
2303			case '(':
2304				if (!esc)
2305					n++;
2306				goto nesting;
2307			case ')':
2308				if (!esc && !--n)
2309					break;
2310				goto nesting;
2311			default:
2312			nesting:
2313				if (isalnum(i) || (e->re.nest.type[i] & (REX_NEST_close|REX_NEST_escape|REX_NEST_literal|REX_NEST_quote|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)))
2314					goto invalid;
2315				e->re.nest.type[i] = REX_NEST_open;
2316				if ((x = chr(env, &esc)) < 0 || (e->re.nest.type[x] & (REX_NEST_close|REX_NEST_escape|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)))
2317					goto invalid;
2318				if (!esc)
2319				{
2320					if (x == ')' && !--n)
2321						goto invalid;
2322					else if (x == '(')
2323						n++;
2324				}
2325				e->re.nest.type[x] |= REX_NEST_close;
2326				e->re.nest.type[i] |= x << REX_NEST_SHIFT;
2327				continue;
2328			}
2329			break;
2330		}
2331		env->parnest--;
2332		if (c == T_PERCENT)
2333			for (n = 0; n < 2; n++)
2334			{
2335				parno = ++env->parno;
2336				if (!(f = node(env, REX_GROUP, 0, 0, 0)))
2337				{
2338					drop(env->disc, e);
2339					return 0;
2340				}
2341				if (parno < elementsof(env->paren))
2342					env->paren[parno] = f;
2343				f->re.group.back = 0;
2344				f->re.group.number = parno;
2345				f->re.group.expr.rex = e;
2346				e = f;
2347			}
2348		return e;
2349	case '(':
2350		c = 0;
2351		if (isdigit(*env->cursor))
2352		{
2353			f = 0;
2354			do
2355			{
2356				if (c > (INT_MAX / 10))
2357				{
2358					env->error = REG_BADRPT;
2359					return 0;
2360				}
2361				c = c * 10 + (*env->cursor++ - '0');
2362			} while (isdigit(*env->cursor));
2363			if (*env->cursor++ != ')')
2364			{
2365				env->error = REG_BADRPT;
2366				return 0;
2367			}
2368			if (c && env->type >= SRE)
2369				c = c * 2 - 1;
2370			if (!c || c > env->parno || !env->paren[c])
2371			{
2372				if (!(env->flags & REG_LENIENT))
2373				{
2374					env->error = REG_ESUBREG;
2375					return 0;
2376				}
2377				if (c)
2378					c = -1;
2379			}
2380		}
2381		else
2382		{
2383			if (env->type < SRE && *env->cursor++ != '?')
2384			{
2385				env->error = REG_BADRPT;
2386				return 0;
2387			}
2388			if (!(f = grp(env, parno + 1)) && env->error)
2389				return 0;
2390		}
2391		if (!(e = node(env, REX_GROUP_COND, 0, 0, 0)))
2392		{
2393			drop(env->disc, f);
2394			return 0;
2395		}
2396		e->re.group.size = c;
2397		e->re.group.expr.binary.left = f;
2398		if (!(e->re.group.expr.binary.right = alt(env, parno, 1)))
2399		{
2400			drop(env->disc, e);
2401			return 0;
2402		}
2403		if (token(env) != T_CLOSE)
2404		{
2405			env->error = REG_EPAREN;
2406			return 0;
2407		}
2408		eat(env);
2409		env->parnest--;
2410		return rep(env, e, parno, parno);
2411	case '{':
2412		p = env->cursor;
2413		n = 1;
2414		while (c = *env->cursor)
2415		{
2416			if (c == '\\' && *(env->cursor + 1))
2417				env->cursor++;
2418			else if (c == '{')
2419				n++;
2420			else if (c == '}' && !--n)
2421				break;
2422			else if (c == env->delimiter || c == env->terminator)
2423				break;
2424			env->cursor++;
2425		}
2426		if (c != '}')
2427		{
2428			env->error = REG_EBRACE;
2429			return 0;
2430		}
2431		if (*++env->cursor != ')')
2432		{
2433			env->error = REG_EPAREN;
2434			return 0;
2435		}
2436		env->cursor++;
2437		env->parnest--;
2438		if (env->disc->re_version < REG_VERSION_EXEC)
2439		{
2440			env->error = REG_BADRPT;
2441			return 0;
2442		}
2443		if (!env->disc->re_execf)
2444			return 0;
2445		if (!(e = node(env, REX_EXEC, 0, 0, 0)))
2446			return 0;
2447		e->re.exec.text = (const char*)p;
2448		e->re.exec.size = env->cursor - p - 2;
2449		if (!env->disc->re_compf)
2450			e->re.exec.data = 0;
2451		else
2452			e->re.exec.data = (*env->disc->re_compf)(env->regex, e->re.exec.text, e->re.exec.size, env->disc);
2453		return e;
2454	case '0': case '1': case '2': case '3': case '4':
2455	case '5': case '6': case '7': case '8': case '9':
2456		c -= '0';
2457		while (isdigit(*env->cursor))
2458		{
2459			if (c > (INT_MAX / 10))
2460			{
2461				env->error = REG_ESUBREG;
2462				return 0;
2463			}
2464			c = c * 10 + *env->cursor++ - '0';
2465		}
2466		if (*env->cursor == ')')
2467		{
2468			env->cursor++;
2469			env->parnest--;
2470			env->token.len = 1;
2471			if (c > env->parno || !env->paren[c])
2472			{
2473				env->error = REG_ESUBREG;
2474				return 0;
2475			}
2476			env->paren[c]->re.group.back = 1;
2477			return rep(env, node(env, REX_BACK, c, 0, 0), 0, 0);
2478		}
2479		/*FALLTHROUGH*/
2480	default:
2481		env->error = REG_BADRPT;
2482		return 0;
2483	}
2484	if (x && !(e = alt(env, parno, 0)))
2485		return 0;
2486	c = token(env);
2487	env->parnest--;
2488	if (c != T_CLOSE && (!(env->flags & REG_LITERAL) || c != ')'))
2489	{
2490		env->error = REG_EPAREN;
2491		return 0;
2492	}
2493	eat(env);
2494	if (typ >= 0)
2495	{
2496		if (beg)
2497			env->pattern = env->cursor;
2498		env->type = typ;
2499	}
2500	if (!x)
2501		return 0;
2502	if (!(f = node(env, x, 0, 0, 0)))
2503	{
2504		drop(env->disc, e);
2505		return 0;
2506	}
2507	f->re.group.expr.rex = e;
2508	if (x == REX_GROUP_BEHIND || x == REX_GROUP_BEHIND_NOT)
2509	{
2510		if (stats(env, e))
2511		{
2512			drop(env->disc, f);
2513			if (!env->error)
2514				env->error = REG_ECOUNT;
2515			return 0;
2516		}
2517		f->re.group.size = env->stats.m;
2518		memset(&env->stats, 0, sizeof(env->stats));
2519	}
2520	switch (x)
2521	{
2522	case REX_GROUP:
2523	case REX_GROUP_CUT:
2524		f = rep(env, f, parno, env->parno);
2525		break;
2526	}
2527	return f;
2528}
2529
2530static Rex_t*
2531seq(Cenv_t* env)
2532{
2533	Rex_t*		e;
2534	Rex_t*		f;
2535	Token_t		tok;
2536	int		c;
2537	int		i;
2538	int		n;
2539	int		x;
2540	int		parno;
2541	int		type;
2542	regflags_t	flags;
2543	unsigned char*	s;
2544	unsigned char*	p;
2545	unsigned char*	t;
2546	unsigned char*	u;
2547	unsigned char	buf[256];
2548
2549	for (;;)
2550	{
2551		s = buf;
2552		while ((c = token(env)) < T_META && s < &buf[sizeof(buf) - env->token.len])
2553		{
2554			x = c;
2555			p = env->cursor;
2556			if (c >= 0)
2557			{
2558				n = 1;
2559				*s++ = (env->flags & REG_ICASE) ? toupper(c) : c;
2560			}
2561			else if (c == C_ESC || (env->flags & REG_ICASE))
2562			{
2563				c = (c == C_ESC) ? env->token.lex : mbchar(p);
2564				if (env->flags & REG_ICASE)
2565					c = towupper(c);
2566				if ((&buf[sizeof(buf)] - s) < MB_CUR_MAX)
2567					break;
2568				if ((n = mbconv((char*)s, c)) < 0)
2569					*s++ = c;
2570				else if (n)
2571					s += n;
2572				else
2573				{
2574					n = 1;
2575					*s++ = 0;
2576				}
2577			}
2578			else
2579			{
2580				n = env->token.len - env->token.esc;
2581				for (t = p, u = s + n; s < u; *s++ = *t++);
2582			}
2583			eat(env);
2584		}
2585		if (c == T_BAD)
2586			return 0;
2587		if (s > buf)
2588			switch (c)
2589			{
2590			case T_STAR:
2591			case T_PLUS:
2592			case T_LEFT:
2593			case T_QUES:
2594			case T_BANG:
2595				if ((s -= n) == buf)
2596					e = 0;
2597				else
2598				{
2599					i = s - buf;
2600					if (!(e = node(env, REX_STRING, 0, 0, i)))
2601						return 0;
2602					memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, i);
2603					e->re.string.size = i;
2604				}
2605				if (x >= 0)
2606				{
2607					if (!(f = node(env, REX_ONECHAR, 1, 1, 0)))
2608					{
2609						drop(env->disc, e);
2610						return 0;
2611					}
2612					f->re.onechar = (env->flags & REG_ICASE) ? toupper(x) : x;
2613				}
2614				else
2615				{
2616					if (!(f = node(env, REX_STRING, 0, 0, n)))
2617						return 0;
2618					memcpy((char*)(f->re.string.base = (unsigned char*)f->re.data), (char*)p, n);
2619					f->re.string.size = n;
2620				}
2621				if (!(f = rep(env, f, 0, 0)) || !(f = cat(env, f, seq(env))))
2622				{
2623					drop(env->disc, e);
2624					return 0;
2625				}
2626				if (e)
2627					f = cat(env, e, f);
2628				return f;
2629			default:
2630				c = s - buf;
2631				if (!(e = node(env, REX_STRING, 0, 0, c)))
2632					return 0;
2633				memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, c);
2634				e->re.string.size = c;
2635				return cat(env, e, seq(env));
2636			}
2637		else if (c > T_BACK)
2638		{
2639			eat(env);
2640			c -= T_BACK;
2641			if (c > env->parno || !env->paren[c])
2642			{
2643				env->error = REG_ESUBREG;
2644				return 0;
2645			}
2646			env->paren[c]->re.group.back = 1;
2647			e = rep(env, node(env, REX_BACK, c, 0, 0), 0, 0);
2648		}
2649		else
2650			switch (c)
2651			{
2652			case T_AND:
2653			case T_CLOSE:
2654			case T_BAR:
2655			case T_END:
2656				return node(env, REX_NULL, 0, 0, 0);
2657			case T_DOLL:
2658				eat(env);
2659				e = rep(env, node(env, REX_END, 0, 0, 0), 0, 0);
2660				break;
2661			case T_CFLX:
2662				eat(env);
2663				if ((e = node(env, REX_BEG, 0, 0, 0)) && (env->flags & REG_EXTENDED))
2664					e = rep(env, e, 0, 0);
2665				break;
2666			case T_OPEN:
2667				tok = env->token;
2668				eat(env);
2669				flags = env->flags;
2670				type = env->type;
2671				if (env->token.att)
2672					env->flags |= REG_MINIMAL;
2673				env->parnest++;
2674				if (env->type == KRE)
2675					++env->parno;
2676				parno = ++env->parno;
2677				if (!(e = alt(env, parno + 1, 0)))
2678					break;
2679				if (e->type == REX_NULL && env->type == ERE && !(env->flags & REG_NULL))
2680				{
2681					drop(env->disc, e);
2682					env->error = (*env->cursor == 0 || *env->cursor == env->delimiter || *env->cursor == env->terminator) ? REG_EPAREN : REG_ENULL;
2683					return 0;
2684				}
2685				if (token(env) != T_CLOSE)
2686				{
2687					drop(env->disc, e);
2688					env->error = REG_EPAREN;
2689					return 0;
2690				}
2691				env->parnest--;
2692				eat(env);
2693				if (!(f = node(env, REX_GROUP, 0, 0, 0)))
2694				{
2695					drop(env->disc, e);
2696					return 0;
2697				}
2698				if (parno < elementsof(env->paren))
2699					env->paren[parno] = f;
2700				f->re.group.back = 0;
2701				f->re.group.number = parno;
2702				f->re.group.expr.rex = e;
2703				if (tok.lex)
2704				{
2705					tok.push = 1;
2706					env->token = tok;
2707				}
2708				if (!(e = rep(env, f, parno, env->parno)))
2709					return 0;
2710				if (env->type == KRE)
2711				{
2712					if (!(f = node(env, REX_GROUP, 0, 0, 0)))
2713					{
2714						drop(env->disc, e);
2715						return 0;
2716					}
2717					if (--parno < elementsof(env->paren))
2718						env->paren[parno] = f;
2719					f->re.group.back = 0;
2720					f->re.group.number = parno;
2721					f->re.group.expr.rex = e;
2722					e = f;
2723				}
2724				env->flags = flags;
2725				env->type = type;
2726				break;
2727			case T_GROUP:
2728				p = env->cursor;
2729				eat(env);
2730				flags = env->flags;
2731				type = env->type;
2732				if (!(e = grp(env, env->parno + 1)))
2733				{
2734					if (env->error)
2735						return 0;
2736					if (env->literal == env->pattern && env->literal == p)
2737						env->literal = env->cursor;
2738					continue;
2739				}
2740				env->flags = flags;
2741				env->type = type;
2742				break;
2743			case T_BRA:
2744				eat(env);
2745				if (e = bra(env))
2746					e = rep(env, e, 0, 0);
2747				break;
2748			case T_ALNUM:
2749			case T_ALNUM_NOT:
2750			case T_DIGIT:
2751			case T_DIGIT_NOT:
2752			case T_SPACE:
2753			case T_SPACE_NOT:
2754				eat(env);
2755				if (e = ccl(env, c))
2756					e = rep(env, e, 0, 0);
2757				break;
2758			case T_LT:
2759				eat(env);
2760				e = rep(env, node(env, REX_WBEG, 0, 0, 0), 0, 0);
2761				break;
2762			case T_GT:
2763				eat(env);
2764				e = rep(env, node(env, REX_WEND, 0, 0, 0), 0, 0);
2765				break;
2766			case T_DOT:
2767				eat(env);
2768				e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0);
2769				break;
2770			case T_DOTSTAR:
2771				eat(env);
2772				env->token.lex = T_STAR;
2773				env->token.push = 1;
2774				e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0);
2775				break;
2776			case T_SLASHPLUS:
2777				eat(env);
2778				env->token.lex = T_PLUS;
2779				env->token.push = 1;
2780				if (e = node(env, REX_ONECHAR, 1, 1, 0))
2781				{
2782					e->re.onechar = '/';
2783					e = rep(env, e, 0, 0);
2784				}
2785				break;
2786			case T_WORD:
2787				eat(env);
2788				e = rep(env, node(env, REX_WORD, 0, 0, 0), 0, 0);
2789				break;
2790			case T_WORD_NOT:
2791				eat(env);
2792				e = rep(env, node(env, REX_WORD_NOT, 0, 0, 0), 0, 0);
2793				break;
2794			case T_BEG_STR:
2795				eat(env);
2796				e = rep(env, node(env, REX_BEG_STR, 0, 0, 0), 0, 0);
2797				break;
2798			case T_END_STR:
2799				eat(env);
2800				e = rep(env, node(env, REX_END_STR, 0, 0, 0), 0, 0);
2801				break;
2802			case T_FIN_STR:
2803				eat(env);
2804				e = rep(env, node(env, REX_FIN_STR, 0, 0, 0), 0, 0);
2805				break;
2806			default:
2807				env->error = REG_BADRPT;
2808				return 0;
2809			}
2810		if (e && *env->cursor != 0 && *env->cursor != env->delimiter && *env->cursor != env->terminator)
2811			e = cat(env, e, seq(env));
2812		return e;
2813	}
2814}
2815
2816static Rex_t*
2817con(Cenv_t* env)
2818{
2819	Rex_t*	e;
2820	Rex_t*	f;
2821	Rex_t*	g;
2822
2823	if (!(e = seq(env)) || !(env->flags & REG_AUGMENTED) || token(env) != T_AND)
2824		return e;
2825	eat(env);
2826	if (!(f = con(env)))
2827	{
2828		drop(env->disc, e);
2829		return 0;
2830	}
2831	if (!(g = node(env, REX_CONJ, 0, 0, 0)))
2832	{
2833		drop(env->disc, e);
2834		drop(env->disc, f);
2835		return 0;
2836	}
2837	g->re.group.expr.binary.left = e;
2838	g->re.group.expr.binary.right = f;
2839	return g;
2840}
2841
2842static Rex_t*
2843alt(Cenv_t* env, int number, int cond)
2844{
2845	Rex_t*	e;
2846	Rex_t*	f;
2847	Rex_t*	g;
2848
2849	if (!(e = con(env)))
2850		return 0;
2851	else if (token(env) != T_BAR)
2852	{
2853		if (!cond)
2854			return e;
2855		f = 0;
2856		if (e->type == REX_NULL)
2857			goto bad;
2858	}
2859	else
2860	{
2861		eat(env);
2862		if (!(f = alt(env, number, 0)))
2863		{
2864			drop(env->disc, e);
2865			return 0;
2866		}
2867		if ((e->type == REX_NULL || f->type == REX_NULL) && !(env->flags & REG_NULL))
2868			goto bad;
2869		if (!cond && (g = trie(env, e, f)))
2870			return g;
2871	}
2872	if (!(g = node(env, REX_ALT, 0, 0, 0)))
2873	{
2874		env->error = REG_ESPACE;
2875		goto bad;
2876	}
2877	g->re.group.number = number;
2878	g->re.group.last = env->parno;
2879	g->re.group.expr.binary.left = e;
2880	g->re.group.expr.binary.right = f;
2881	return g;
2882 bad:
2883	drop(env->disc, e);
2884	drop(env->disc, f);
2885	if (!env->error)
2886		env->error = REG_ENULL;
2887	return 0;
2888}
2889
2890/*
2891 * add v to REX_BM tables
2892 */
2893
2894static void
2895bmstr(Cenv_t* env, register Rex_t* a, unsigned char* v, int n, Bm_mask_t b)
2896{
2897	int	c;
2898	int	m;
2899	size_t	z;
2900
2901	for (m = 0; m < n; m++)
2902	{
2903		if (!(z = n - m - 1))
2904			z = HIT;
2905		c = v[m];
2906		a->re.bm.mask[m][c] |= b;
2907		if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT)
2908			a->re.bm.skip[c] = z;
2909		if (a->flags & REG_ICASE)
2910		{
2911			if (isupper(c))
2912				c = tolower(c);
2913			else if (islower(c))
2914				c = toupper(c);
2915			else
2916				continue;
2917			a->re.bm.mask[m][c] |= b;
2918			if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT)
2919				a->re.bm.skip[c] = z;
2920		}
2921	}
2922}
2923
2924/*
2925 * set up BM table from trie
2926 */
2927
2928static int
2929bmtrie(Cenv_t* env, Rex_t* a, unsigned char* v, Trie_node_t* x, int n, int m, Bm_mask_t b)
2930{
2931	do
2932	{
2933		v[m] = x->c;
2934		if (m >= (n - 1))
2935		{
2936			bmstr(env, a, v, n, b);
2937			if (!(b <<= 1))
2938			{
2939				b = 1;
2940				a->re.bm.complete = 0;
2941			}
2942			else if (x->son)
2943				a->re.bm.complete = 0;
2944		}
2945		else if (x->son)
2946			b = bmtrie(env, a, v, x->son, n, m + 1, b);
2947	} while (x = x->sib);
2948	return b;
2949}
2950
2951/*
2952 * rewrite the expression tree for some special cases
2953 * 1. it is a null expression - illegal
2954 * 2. max length fixed string found -- use BM algorithm
2955 * 3. it begins with an unanchored string - use KMP algorithm
2956 * 0 returned on success
2957 */
2958
2959static int
2960special(Cenv_t* env, regex_t* p)
2961{
2962	Rex_t*		a;
2963	Rex_t*		e;
2964	Rex_t*		t;
2965	Rex_t*		x;
2966	Rex_t*		y;
2967	unsigned char*	s;
2968	int*		f;
2969	int		n;
2970	int		m;
2971	int		k;
2972
2973	DEBUG_INIT();
2974	if (e = p->env->rex)
2975	{
2976		if ((x = env->stats.x) && x->re.string.size < 3)
2977			x = 0;
2978		if ((t = env->stats.y) && t->re.trie.min < 3)
2979			t = 0;
2980		if (x && t)
2981		{
2982			if (x->re.string.size >= t->re.trie.min)
2983				t = 0;
2984			else
2985				x = 0;
2986		}
2987		if (x || t)
2988		{
2989			Bm_mask_t**	mask;
2990			Bm_mask_t*	h;
2991			unsigned char*	v;
2992			size_t*		q;
2993			unsigned long	l;
2994			int		i;
2995			int		j;
2996
2997			if (x)
2998			{
2999				y = x;
3000				n = m = x->re.string.size;
3001				l = env->stats.l;
3002			}
3003			else
3004			{
3005				y = t;
3006				n = t->re.trie.min;
3007				m = t->re.trie.max;
3008				l = env->stats.k;
3009			}
3010			if (!(q = (size_t*)alloc(env->disc, 0, (n + 1) * sizeof(size_t))))
3011				return 1;
3012			if (!(a = node(env, REX_BM, 0, 0, n * (sizeof(Bm_mask_t*) + (UCHAR_MAX + 1) * sizeof(Bm_mask_t)) + (UCHAR_MAX + n + 2) * sizeof(size_t))))
3013			{
3014				alloc(env->disc, q, 0);
3015				return 1;
3016			}
3017			a->flags = y->flags;
3018			a->map = y->map;
3019			a->re.bm.size = n;
3020			a->re.bm.back = (y == e || y == e->re.group.expr.rex) ? (m - n) : -1;
3021			a->re.bm.left = l - 1;
3022			a->re.bm.right = env->stats.m - l - n;
3023			a->re.bm.complete = (env->stats.e || y != e && (e->type != REX_GROUP || y != e->re.group.expr.rex) || e->next || ((a->re.bm.left + a->re.bm.right) >= 0)) ? 0 : n;
3024			h = (Bm_mask_t*)&a->re.bm.mask[n];
3025			a->re.bm.skip = (size_t*)(h + n * (UCHAR_MAX + 1));
3026			a->re.bm.fail = &a->re.bm.skip[UCHAR_MAX + 1];
3027			for (m = 0; m <= UCHAR_MAX; m++)
3028				a->re.bm.skip[m] = n;
3029			a->re.bm.skip[0] = a->re.bm.skip[env->mappednewline] = (y->next && y->next->type == REX_END) ? HIT : (n + a->re.bm.left);
3030			for (i = 1; i <= n; i++)
3031				a->re.bm.fail[i] = 2 * n - i;
3032			mask = a->re.bm.mask;
3033			for (m = 0; m < n; m++)
3034			{
3035				mask[m] = h;
3036				h += UCHAR_MAX + 1;
3037			}
3038			if (x)
3039				bmstr(env, a, x->re.string.base, n, 1);
3040			else
3041			{
3042				v = (unsigned char*)q;
3043				memset(v, 0, n);
3044				m = 1;
3045				for (i = 0; i <= UCHAR_MAX; i++)
3046					if (t->re.trie.root[i])
3047						m = bmtrie(env, a, v, t->re.trie.root[i], n, 0, m);
3048			}
3049			mask--;
3050			memset(q, 0, n * sizeof(*q));
3051			for (k = (j = n) + 1; j > 0; j--, k--)
3052			{
3053				DEBUG_TEST(0x0010,(sfprintf(sfstderr, "BM#0: k=%d j=%d\n", k, j)),(0));
3054				for (q[j] = k; k <= n; k = q[k])
3055				{
3056					for (m = 0; m <= UCHAR_MAX; m++)
3057						if (mask[k][m] == mask[j][m])
3058						{
3059							DEBUG_TEST(0x0010,sfprintf(sfstderr, "CUT1: mask[%d][%c]=mask[%d][%c]\n", k, m, j, m), (0));
3060							goto cut;
3061						}
3062					DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#2: fail[%d]=%d => %d\n", k, a->re.bm.fail[k], (a->re.bm.fail[k] > n - j) ? (n - j) : a->re.bm.fail[k]), (0));
3063					if (a->re.bm.fail[k] > n - j)
3064						a->re.bm.fail[k] = n - j;
3065				}
3066			cut:	;
3067			}
3068			for (i = 1; i <= n; i++)
3069				if (a->re.bm.fail[i] > n + k - i)
3070				{
3071					DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#4: fail[%d]=%d => %d\n", i, a->re.bm.fail[i], n + k - i), (0));
3072					a->re.bm.fail[i] = n + k - i;
3073				}
3074#if _AST_REGEX_DEBUG
3075			if (DEBUG_TEST(0x0020,(1),(0)))
3076			{
3077				sfprintf(sfstderr, "STAT: complete=%d n=%d k=%d l=%d r=%d y=%d:%d e=%d:%d\n", a->re.bm.complete, n, k, a->re.bm.left, a->re.bm.right, y->type, y->next ? y->next->type : 0, e->type, e->next ? e->next->type : 0);
3078				for (m = 0; m < n; m++)
3079					for (i = 1; i <= UCHAR_MAX; i++)
3080						if (a->re.bm.mask[m][i])
3081							sfprintf(sfstderr, "MASK: [%d]['%c'] = %032..2u\n", m, i, a->re.bm.mask[m][i]);
3082				for (i = ' '; i <= UCHAR_MAX; i++)
3083					if (a->re.bm.skip[i] >= HIT)
3084						sfprintf(sfstderr, "SKIP: ['%c'] =   *\n", i);
3085					else if (a->re.bm.skip[i] > 0 && a->re.bm.skip[i] < n)
3086						sfprintf(sfstderr, "SKIP: ['%c'] = %3d\n", i, a->re.bm.skip[i]);
3087				for (j = 31; j >= 0; j--)
3088				{
3089					sfprintf(sfstderr, "      ");
3090				next:
3091					for (m = 0; m < n; m++)
3092					{
3093						for (i = 0040; i < 0177; i++)
3094							if (a->re.bm.mask[m][i] & (1 << j))
3095							{
3096								sfprintf(sfstderr, "  %c", i);
3097								break;
3098							}
3099						if (i >= 0177)
3100						{
3101							if (j > 0)
3102							{
3103								j--;
3104								goto next;
3105							}
3106							sfprintf(sfstderr, "  ?");
3107						}
3108					}
3109					sfprintf(sfstderr, "\n");
3110				}
3111				sfprintf(sfstderr, "FAIL: ");
3112				for (m = 1; m <= n; m++)
3113					sfprintf(sfstderr, "%3d", a->re.bm.fail[m]);
3114				sfprintf(sfstderr, "\n");
3115			}
3116#endif
3117			alloc(env->disc, q, 0);
3118			a->next = e;
3119			p->env->rex = a;
3120			return 0;
3121		}
3122		switch (e->type)
3123		{
3124		case REX_BEG:
3125			if (env->flags & REG_NEWLINE)
3126				return 0;
3127			break;
3128		case REX_GROUP:
3129			if (env->stats.b)
3130				return 0;
3131			e = e->re.group.expr.rex;
3132			if (e->type != REX_DOT)
3133				return 0;
3134			/*FALLTHROUGH*/
3135		case REX_DOT:
3136			if (e->lo == 0 && e->hi == RE_DUP_INF)
3137				break;
3138			return 0;
3139		case REX_NULL:
3140			if (env->flags & REG_NULL)
3141				break;
3142			env->error = REG_ENULL;
3143			return 1;
3144		case REX_STRING:
3145			if ((env->flags & (REG_LEFT|REG_LITERAL|REG_RIGHT)) || e->map)
3146				return 0;
3147			s = e->re.string.base;
3148			n = e->re.string.size;
3149			if (!(a = node(env, REX_KMP, 0, 0, n * (sizeof(int*) + 1))))
3150				return 1;
3151			a->flags = e->flags;
3152			a->map = e->map;
3153			f = a->re.string.fail;
3154			memcpy((char*)(a->re.string.base = (unsigned char*)&f[n]), (char*)s, n);
3155			s = a->re.string.base;
3156			a->re.string.size = n;
3157			f[0] = m = -1;
3158			for (k = 1; k < n; k++)
3159			{
3160				while (m >= 0 && s[m+1] != s[k])
3161					m = f[m];
3162				if (s[m+1] == s[k])
3163					m++;
3164				f[k] = m;
3165			}
3166			a->next = e->next;
3167			p->env->rex = a;
3168			e->next = 0;
3169			drop(env->disc, e);
3170			break;
3171		default:
3172			return 0;
3173		}
3174	}
3175	p->env->once = 1;
3176	return 0;
3177}
3178
3179int
3180regcomp(regex_t* p, const char* pattern, regflags_t flags)
3181{
3182	Rex_t*			e;
3183	Rex_t*			f;
3184	regdisc_t*		disc;
3185	unsigned char*		fold;
3186	int			i;
3187	Cenv_t			env;
3188
3189	if (!p)
3190		return REG_BADPAT;
3191	if (flags & REG_DISCIPLINE)
3192	{
3193		flags &= ~REG_DISCIPLINE;
3194		disc = p->re_disc;
3195	}
3196	else
3197		disc = &state.disc;
3198	if (!disc->re_errorlevel)
3199		disc->re_errorlevel = 2;
3200	p->env = 0;
3201	if (!pattern)
3202		return fatal(disc, REG_BADPAT, pattern);
3203	if (!state.initialized)
3204	{
3205		state.initialized = 1;
3206		for (i = 0; i < elementsof(state.escape); i++)
3207			state.magic[state.escape[i].key] = state.escape[i].val;
3208	}
3209	if (!(fold = (unsigned char*)LCINFO(AST_LC_CTYPE)->data))
3210	{
3211		if (!(fold = newof(0, unsigned char, UCHAR_MAX, 1)))
3212			return fatal(disc, REG_ESPACE, pattern);
3213		for (i = 0; i <= UCHAR_MAX; i++)
3214			fold[i] = toupper(i);
3215		LCINFO(AST_LC_CTYPE)->data = (void*)fold;
3216	}
3217 again:
3218	if (!(p->env = (Env_t*)alloc(disc, 0, sizeof(Env_t))))
3219		return fatal(disc, REG_ESPACE, pattern);
3220	memset(p->env, 0, sizeof(*p->env));
3221	memset(&env, 0, sizeof(env));
3222	env.regex = p;
3223	env.flags = flags;
3224	env.disc = p->env->disc = disc;
3225	if (env.flags & REG_AUGMENTED)
3226		env.flags |= REG_EXTENDED;
3227	env.mappeddot = '.';
3228	env.mappednewline = '\n';
3229	env.mappedslash = '/';
3230	if (disc->re_version >= REG_VERSION_MAP && disc->re_map)
3231	{
3232		env.map = disc->re_map;
3233		env.MAP = p->env->fold;
3234		for (i = 0; i <= UCHAR_MAX; i++)
3235		{
3236			env.MAP[i] = fold[env.map[i]];
3237			if (env.map[i] == '.')
3238				env.mappeddot = i;
3239			if (env.map[i] == '\n')
3240				env.mappednewline = i;
3241			if (env.map[i] == '/')
3242				env.mappedslash = i;
3243		}
3244	}
3245	else
3246		env.MAP = fold;
3247	env.type = (env.flags & REG_AUGMENTED) ? ARE : (env.flags & REG_EXTENDED) ? ERE : BRE;
3248	env.explicit = -1;
3249	if (env.flags & REG_SHELL)
3250	{
3251		if (env.flags & REG_SHELL_PATH)
3252			env.explicit = env.mappedslash;
3253		env.flags |= REG_LENIENT|REG_NULL;
3254		env.type = env.type == BRE ? SRE : KRE;
3255	}
3256	if ((env.flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE)
3257		env.explicit = env.mappednewline;
3258	p->env->leading = (env.flags & REG_SHELL_DOT) ? env.mappeddot : -1;
3259	env.posixkludge = !(env.flags & (REG_EXTENDED|REG_SHELL));
3260	env.regexp = !!(env.flags & REG_REGEXP);
3261	env.token.lex = 0;
3262	env.token.push = 0;
3263	if (env.flags & REG_DELIMITED)
3264	{
3265		switch (env.delimiter = *pattern++)
3266		{
3267		case 0:
3268		case '\\':
3269		case '\n':
3270		case '\r':
3271			env.error = REG_EDELIM;
3272			goto bad;
3273		}
3274		env.terminator = '\n';
3275	}
3276	env.literal = env.pattern = env.cursor = (unsigned char*)pattern;
3277	if (!(p->env->rex = alt(&env, 1, 0)))
3278		goto bad;
3279	if (env.parnest)
3280	{
3281		env.error = REG_EPAREN;
3282		goto bad;
3283	}
3284	p->env->stats.re_flags = env.flags & (REG_EXTENDED|REG_AUGMENTED|REG_SHELL);
3285	if (env.flags & REG_LEFT)
3286	{
3287		if (p->env->rex->type != REX_BEG)
3288		{
3289			if (p->env->rex->type == REX_ALT)
3290				env.flags &= ~REG_FIRST;
3291			if (!(e = node(&env, REX_BEG, 0, 0, 0)))
3292			{
3293				regfree(p);
3294				return fatal(disc, REG_ESPACE, pattern);
3295			}
3296			e->next = p->env->rex;
3297			p->env->rex = e;
3298			p->env->once = 1;
3299		}
3300		p->env->stats.re_flags |= REG_LEFT;
3301	}
3302	for (e = p->env->rex; e->next; e = e->next);
3303	p->env->done.type = REX_DONE;
3304	p->env->done.flags = e->flags;
3305	if (env.flags & REG_RIGHT)
3306	{
3307		if (e->type != REX_END)
3308		{
3309			if (p->env->rex->type == REX_ALT)
3310				env.flags &= ~REG_FIRST;
3311			if (!(f = node(&env, REX_END, 0, 0, 0)))
3312			{
3313				regfree(p);
3314				return fatal(disc, REG_ESPACE, pattern);
3315			}
3316			f->flags = e->flags;
3317			f->map = e->map;
3318			e->next = f;
3319		}
3320		p->env->stats.re_flags |= REG_RIGHT;
3321	}
3322	if (stats(&env, p->env->rex))
3323	{
3324		if (!env.error)
3325			env.error = REG_ECOUNT;
3326		goto bad;
3327	}
3328	if (env.stats.b)
3329		p->env->hard = p->env->separate = 1;
3330	else if (!(env.flags & REG_FIRST) && (env.stats.a || env.stats.c > 1 && env.stats.c != env.stats.s || env.stats.t && (env.stats.t > 1 || env.stats.a || env.stats.c)))
3331		p->env->hard = 1;
3332	if (p->env->hard || env.stats.c || env.stats.i)
3333		p->env->stats.re_min = p->env->stats.re_max = -1;
3334	else
3335	{
3336		if (!(p->env->stats.re_min = env.stats.m))
3337			p->env->stats.re_min = -1;
3338		if (!(p->env->stats.re_max = env.stats.n))
3339			p->env->stats.re_max = -1;
3340	}
3341	if (special(&env, p))
3342		goto bad;
3343	serialize(&env, p->env->rex, 1);
3344	p->re_nsub = env.stats.p;
3345	if (env.type == KRE)
3346		p->re_nsub /= 2;
3347	if (env.flags & REG_DELIMITED)
3348	{
3349		p->re_npat = env.cursor - env.pattern + 1;
3350		if (*env.cursor == env.delimiter)
3351			p->re_npat++;
3352		else if (env.flags & REG_MUSTDELIM)
3353		{
3354			env.error = REG_EDELIM;
3355			goto bad;
3356		}
3357		else
3358			env.flags &= ~REG_DELIMITED;
3359	}
3360	p->env->explicit = env.explicit;
3361	p->env->flags = env.flags & REG_COMP;
3362	p->env->min = env.stats.m;
3363	p->env->nsub = env.stats.p + env.stats.u;
3364	p->env->refs = 1;
3365	return 0;
3366 bad:
3367	regfree(p);
3368	if (!env.error)
3369		env.error = REG_ESPACE;
3370	if (env.type >= SRE && env.error != REG_ESPACE && !(flags & REG_LITERAL))
3371	{
3372		flags |= REG_LITERAL;
3373		pattern = (const char*)env.literal;
3374		goto again;
3375	}
3376	return fatal(disc, env.error, pattern);
3377}
3378
3379/*
3380 * regcomp() on sized pattern
3381 * the lazy way around adding and checking env.end
3382 */
3383
3384int
3385regncomp(regex_t* p, const char* pattern, size_t size, regflags_t flags)
3386{
3387	char*	s;
3388	int	r;
3389
3390	if (!(s = malloc(size + 1)))
3391		return fatal((flags & REG_DISCIPLINE) ? p->re_disc : &state.disc, REG_ESPACE, pattern);
3392	memcpy(s, pattern, size);
3393	s[size] = 0;
3394	r = regcomp(p, s, flags);
3395	free(s);
3396	return r;
3397}
3398
3399/*
3400 * combine two compiled regular expressions if possible,
3401 * replacing first with the combination and freeing second.
3402 * return 0 on success.
3403 * the only combinations handled are building a Trie
3404 * from String|Kmp|Trie and String|Kmp
3405 */
3406
3407int
3408regcomb(regex_t* p, regex_t* q)
3409{
3410	Rex_t*	e = p->env->rex;
3411	Rex_t*	f = q->env->rex;
3412	Rex_t*	g;
3413	Rex_t*	h;
3414	Cenv_t	env;
3415
3416	if (!e || !f)
3417		return fatal(p->env->disc, REG_BADPAT, NiL);
3418	if (p->env->separate || q->env->separate)
3419		return REG_ESUBREG;
3420	memset(&env, 0, sizeof(env));
3421	env.disc = p->env->disc;
3422	if (e->type == REX_BM)
3423	{
3424		p->env->rex = e->next;
3425		e->next = 0;
3426		drop(env.disc, e);
3427		e = p->env->rex;
3428	}
3429	if (f->type == REX_BM)
3430	{
3431		q->env->rex = f->next;
3432		f->next = 0;
3433		drop(env.disc, f);
3434		f = q->env->rex;
3435	}
3436	if (e->type == REX_BEG && f->type == REX_BEG)
3437	{
3438		p->env->flags |= REG_LEFT;
3439		p->env->rex = e->next;
3440		e->next = 0;
3441		drop(env.disc, e);
3442		e = p->env->rex;
3443		q->env->rex = f->next;
3444		f->next = 0;
3445		drop(env.disc, f);
3446		f = q->env->rex;
3447	}
3448	for (g = e; g->next; g = g->next);
3449	for (h = f; h->next; h = h->next);
3450	if (g->next && g->next->type == REX_END && h->next && h->next->type == REX_END)
3451	{
3452		p->env->flags |= REG_RIGHT;
3453		drop(env.disc, g->next);
3454		g->next = 0;
3455		drop(env.disc, h->next);
3456		h->next = 0;
3457	}
3458	if (!(g = trie(&env, f, e)))
3459		return fatal(p->env->disc, REG_BADPAT, NiL);
3460	p->env->rex = g;
3461	if (!q->env->once)
3462		p->env->once = 0;
3463	q->env->rex = 0;
3464	if (p->env->flags & REG_LEFT)
3465	{
3466		if (!(e = node(&env, REX_BEG, 0, 0, 0)))
3467		{
3468			regfree(p);
3469			return fatal(p->env->disc, REG_ESPACE, NiL);
3470		}
3471		e->next = p->env->rex;
3472		p->env->rex = e;
3473		p->env->once = 1;
3474	}
3475	if (p->env->flags & REG_RIGHT)
3476	{
3477		for (f = p->env->rex; f->next; f = f->next);
3478		if (f->type != REX_END)
3479		{
3480			if (!(e = node(&env, REX_END, 0, 0, 0)))
3481			{
3482				regfree(p);
3483				return fatal(p->env->disc, REG_ESPACE, NiL);
3484			}
3485			f->next = e;
3486		}
3487	}
3488	env.explicit = p->env->explicit;
3489	env.flags = p->env->flags;
3490	env.disc = p->env->disc;
3491	if (stats(&env, p->env->rex))
3492	{
3493		regfree(p);
3494		return fatal(p->env->disc, env.error ? env.error : REG_ECOUNT, NiL);
3495	}
3496	if (special(&env, p))
3497	{
3498		regfree(p);
3499		return fatal(p->env->disc, env.error ? env.error : REG_ESPACE, NiL);
3500	}
3501	p->env->min = g->re.trie.min;
3502	return 0;
3503}
3504
3505/*
3506 * copy a reference of p into q
3507 * p and q may then have separate regsubcomp() instantiations
3508 */
3509
3510int
3511regdup(regex_t* p, regex_t* q)
3512{
3513	if (!p || !q)
3514		return REG_BADPAT;
3515	*q = *p;
3516	p->env->refs++;
3517	q->re_sub = 0;
3518	return 0;
3519}
3520