1da2e3ebdSchin /***********************************************************************
2da2e3ebdSchin *                                                                      *
3da2e3ebdSchin *               This software is part of the ast package               *
4*b30d1939SAndy Fiddaman *          Copyright (c) 1985-2012 AT&T Intellectual Property          *
5da2e3ebdSchin *                      and is licensed under the                       *
6*b30d1939SAndy Fiddaman *                 Eclipse Public License, Version 1.0                  *
77c2fbfb3SApril Chin *                    by AT&T Intellectual Property                     *
8da2e3ebdSchin *                                                                      *
9da2e3ebdSchin *                A copy of the License is available at                 *
10*b30d1939SAndy Fiddaman *          http://www.eclipse.org/org/documents/epl-v10.html           *
11*b30d1939SAndy Fiddaman *         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12da2e3ebdSchin *                                                                      *
13da2e3ebdSchin *              Information and Software Systems Research               *
14da2e3ebdSchin *                            AT&T Research                             *
15da2e3ebdSchin *                           Florham Park NJ                            *
16da2e3ebdSchin *                                                                      *
17da2e3ebdSchin *                 Glenn Fowler <gsf@research.att.com>                  *
18da2e3ebdSchin *                  David Korn <dgk@research.att.com>                   *
19da2e3ebdSchin *                   Phong Vo <kpv@research.att.com>                    *
20da2e3ebdSchin *                                                                      *
21da2e3ebdSchin ***********************************************************************/
22da2e3ebdSchin #pragma prototyped
23da2e3ebdSchin 
24da2e3ebdSchin /*
25da2e3ebdSchin  * posix regex implementation
26da2e3ebdSchin  *
27da2e3ebdSchin  * based on Doug McIlroy's C++ implementation
28da2e3ebdSchin  * Knuth-Morris-Pratt adapted from Corman-Leiserson-Rivest
29da2e3ebdSchin  * Boyer-Moore from conversations with David Korn, Phong Vo, Andrew Hume
30da2e3ebdSchin  */
31da2e3ebdSchin 
32da2e3ebdSchin #ifndef _REGLIB_H
33da2e3ebdSchin #define _REGLIB_H
34da2e3ebdSchin 
35da2e3ebdSchin #define REG_VERSION_EXEC	20020509L
36da2e3ebdSchin #define REG_VERSION_MAP		20030916L	/* regdisc_t.re_map	*/
37da2e3ebdSchin 
38da2e3ebdSchin #define re_info		env
39da2e3ebdSchin 
40da2e3ebdSchin #define alloc		_reg_alloc
41da2e3ebdSchin #define classfun	_reg_classfun
42da2e3ebdSchin #define drop		_reg_drop
43da2e3ebdSchin #define fatal		_reg_fatal
44da2e3ebdSchin #define state		_reg_state
45da2e3ebdSchin 
46da2e3ebdSchin typedef struct regsubop_s
47da2e3ebdSchin {
48da2e3ebdSchin 	int		op;		/* REG_SUB_LOWER,REG_SUB_UPPER	*/
49da2e3ebdSchin 	int		off;		/* re_rhs or match[] offset	*/
50da2e3ebdSchin 	int		len;		/* re_rhs len or len==0 match[]	*/
51da2e3ebdSchin } regsubop_t;
52da2e3ebdSchin 
53da2e3ebdSchin #define _REG_SUB_PRIVATE_ \
54da2e3ebdSchin 	char*		re_cur;		/* re_buf cursor		*/ \
55da2e3ebdSchin 	char*		re_end;		/* re_buf end			*/ \
56da2e3ebdSchin 	regsubop_t*	re_ops;		/* rhs ops			*/ \
57da2e3ebdSchin 	char		re_rhs[1];	/* substitution rhs		*/
58da2e3ebdSchin 
59da2e3ebdSchin #include <ast.h>
60da2e3ebdSchin #include <cdt.h>
61da2e3ebdSchin #include <stk.h>
62da2e3ebdSchin 
63da2e3ebdSchin #include "regex.h"
64da2e3ebdSchin 
65da2e3ebdSchin #include <ctype.h>
66da2e3ebdSchin #include <errno.h>
67da2e3ebdSchin 
6834f9b3eeSRoland Mainz #if _BLD_DEBUG && !defined(_AST_REGEX_DEBUG)
6934f9b3eeSRoland Mainz #define _AST_REGEX_DEBUG	1
7034f9b3eeSRoland Mainz #endif
7134f9b3eeSRoland Mainz 
72da2e3ebdSchin #define MBSIZE(p)	((ast.tmp_int=mbsize(p))>0?ast.tmp_int:1)
73da2e3ebdSchin 
74da2e3ebdSchin #undef	RE_DUP_MAX			/* posix puts this in limits.h!	*/
75da2e3ebdSchin #define RE_DUP_MAX	(INT_MAX/2-1)	/* 2*RE_DUP_MAX won't overflow	*/
76da2e3ebdSchin #define RE_DUP_INF	(RE_DUP_MAX+1)	/* infinity, for *		*/
77da2e3ebdSchin #define BACK_REF_MAX	9
78da2e3ebdSchin 
79*b30d1939SAndy Fiddaman #define REG_COMP	(~REG_EXEC)
80da2e3ebdSchin #define REG_EXEC	(REG_ADVANCE|REG_INVERT|REG_NOTBOL|REG_NOTEOL|REG_STARTEND)
81da2e3ebdSchin 
82da2e3ebdSchin #define REX_NULL		0	/* null string (internal)	*/
83da2e3ebdSchin #define REX_ALT			1	/* a|b				*/
84da2e3ebdSchin #define REX_ALT_CATCH		2	/* REX_ALT catcher		*/
85da2e3ebdSchin #define REX_BACK		3	/* \1, \2, etc			*/
86da2e3ebdSchin #define REX_BEG			4	/* initial ^			*/
87da2e3ebdSchin #define REX_BEG_STR		5	/* initial ^ w/ no newline	*/
88da2e3ebdSchin #define REX_BM			6	/* Boyer-Moore			*/
89da2e3ebdSchin #define REX_CAT			7	/* catenation catcher		*/
90da2e3ebdSchin #define REX_CLASS		8	/* [...]			*/
91da2e3ebdSchin #define REX_COLL_CLASS		9	/* collation order [...]	*/
92da2e3ebdSchin #define REX_CONJ		10	/* a&b				*/
93da2e3ebdSchin #define REX_CONJ_LEFT		11	/* REX_CONJ left catcher	*/
94da2e3ebdSchin #define REX_CONJ_RIGHT		12	/* REX_CONJ right catcher	*/
95da2e3ebdSchin #define REX_DONE		13	/* completed match (internal)	*/
96da2e3ebdSchin #define REX_DOT			14	/* .				*/
97da2e3ebdSchin #define REX_END			15	/* final $			*/
98da2e3ebdSchin #define REX_END_STR		16	/* final $ before tail newline	*/
99da2e3ebdSchin #define REX_EXEC		17	/* call re.re_exec()		*/
100da2e3ebdSchin #define REX_FIN_STR		18	/* final $ w/ no newline	*/
101da2e3ebdSchin #define REX_GROUP		19	/* \(...\)			*/
102da2e3ebdSchin #define REX_GROUP_CATCH		20	/* REX_GROUP catcher		*/
103da2e3ebdSchin #define REX_GROUP_AHEAD		21	/* 0-width lookahead		*/
104da2e3ebdSchin #define REX_GROUP_AHEAD_CATCH	22	/* REX_GROUP_AHEAD catcher	*/
105da2e3ebdSchin #define REX_GROUP_AHEAD_NOT	23	/* inverted 0-width lookahead	*/
106da2e3ebdSchin #define REX_GROUP_BEHIND	24	/* 0-width lookbehind		*/
107da2e3ebdSchin #define REX_GROUP_BEHIND_CATCH	25	/* REX_GROUP_BEHIND catcher	*/
108da2e3ebdSchin #define REX_GROUP_BEHIND_NOT	26	/* inverted 0-width lookbehind	*/
109da2e3ebdSchin #define REX_GROUP_BEHIND_NOT_CATCH 27	/* REX_GROUP_BEHIND_NOT catcher	*/
110da2e3ebdSchin #define REX_GROUP_COND		28	/* conditional group		*/
111da2e3ebdSchin #define REX_GROUP_COND_CATCH	29	/* conditional group catcher	*/
112da2e3ebdSchin #define REX_GROUP_CUT		30	/* don't backtrack over this	*/
113da2e3ebdSchin #define REX_GROUP_CUT_CATCH	31	/* REX_GROUP_CUT catcher	*/
114da2e3ebdSchin #define REX_KMP			32	/* Knuth-Morris-Pratt		*/
115da2e3ebdSchin #define REX_NEG			33	/* negation			*/
116da2e3ebdSchin #define REX_NEG_CATCH		34	/* REX_NEG catcher		*/
117da2e3ebdSchin #define REX_NEST		35	/* nested match			*/
118da2e3ebdSchin #define REX_ONECHAR		36	/* a single-character literal	*/
119da2e3ebdSchin #define REX_REP			37	/* Kleene closure		*/
120da2e3ebdSchin #define REX_REP_CATCH		38	/* REX_REP catcher		*/
121da2e3ebdSchin #define REX_STRING		39	/* some chars			*/
122da2e3ebdSchin #define REX_TRIE		40	/* alternation of strings	*/
123da2e3ebdSchin #define REX_WBEG		41	/* \<				*/
124da2e3ebdSchin #define REX_WEND		42	/* \>				*/
125da2e3ebdSchin #define REX_WORD		43	/* word boundary		*/
126da2e3ebdSchin #define REX_WORD_NOT		44	/* not word boundary		*/
127da2e3ebdSchin 
128da2e3ebdSchin #define T_META		((int)UCHAR_MAX+1)
129da2e3ebdSchin #define T_STAR		(T_META+0)
130da2e3ebdSchin #define T_PLUS		(T_META+1)
131da2e3ebdSchin #define T_QUES		(T_META+2)
132da2e3ebdSchin #define T_BANG		(T_META+3)
133da2e3ebdSchin #define T_AT		(T_META+4)
134da2e3ebdSchin #define T_TILDE		(T_META+5)
135da2e3ebdSchin #define T_PERCENT	(T_META+6)
136da2e3ebdSchin #define T_LEFT		(T_META+7)
137da2e3ebdSchin #define T_OPEN		(T_META+8)
138da2e3ebdSchin #define T_CLOSE		(T_OPEN+1)
139da2e3ebdSchin #define T_RIGHT		(T_OPEN+2)
140da2e3ebdSchin #define T_CFLX		(T_OPEN+3)
141da2e3ebdSchin #define T_DOT		(T_OPEN+4)
142da2e3ebdSchin #define T_DOTSTAR	(T_OPEN+5)
143da2e3ebdSchin #define T_END		(T_OPEN+6)
144da2e3ebdSchin #define T_BAD		(T_OPEN+7)
145da2e3ebdSchin #define T_DOLL		(T_OPEN+8)
146da2e3ebdSchin #define T_BRA		(T_OPEN+9)
147da2e3ebdSchin #define T_BAR		(T_OPEN+10)
148da2e3ebdSchin #define T_AND		(T_OPEN+11)
149da2e3ebdSchin #define T_LT		(T_OPEN+12)
150da2e3ebdSchin #define T_GT		(T_OPEN+13)
151da2e3ebdSchin #define T_SLASHPLUS	(T_OPEN+14)
152da2e3ebdSchin #define T_GROUP		(T_OPEN+15)
153da2e3ebdSchin #define T_WORD		(T_OPEN+16)
154da2e3ebdSchin #define T_WORD_NOT	(T_WORD+1)
155da2e3ebdSchin #define T_BEG_STR	(T_WORD+2)
156da2e3ebdSchin #define T_END_STR	(T_WORD+3)
157da2e3ebdSchin #define T_FIN_STR	(T_WORD+4)
158da2e3ebdSchin #define T_ESCAPE	(T_WORD+5)
159da2e3ebdSchin #define T_ALNUM		(T_WORD+6)
160da2e3ebdSchin #define T_ALNUM_NOT	(T_ALNUM+1)
161da2e3ebdSchin #define T_DIGIT		(T_ALNUM+2)
162da2e3ebdSchin #define T_DIGIT_NOT	(T_ALNUM+3)
163da2e3ebdSchin #define T_SPACE		(T_ALNUM+4)
164da2e3ebdSchin #define T_SPACE_NOT	(T_ALNUM+5)
165da2e3ebdSchin #define T_BACK		(T_ALNUM+6)
166da2e3ebdSchin 
167da2e3ebdSchin #define BRE		0
168da2e3ebdSchin #define ERE		3
169da2e3ebdSchin #define ARE		6
170da2e3ebdSchin #define SRE		9
171da2e3ebdSchin #define KRE		12
172da2e3ebdSchin 
173da2e3ebdSchin #define HIT		SSIZE_MAX
174da2e3ebdSchin 
175da2e3ebdSchin #define bitclr(p,c)	((p)[((c)>>3)&037]&=(~(1<<((c)&07))))
176da2e3ebdSchin #define bitset(p,c)	((p)[((c)>>3)&037]|=(1<<((c)&07)))
177da2e3ebdSchin #define bittst(p,c)	((p)[((c)>>3)&037]&(1<<((c)&07)))
178da2e3ebdSchin 
179da2e3ebdSchin #define setadd(p,c)	bitset((p)->bits,c)
180da2e3ebdSchin #define setclr(p,c)	bitclr((p)->bits,c)
181da2e3ebdSchin #define settst(p,c)	bittst((p)->bits,c)
182da2e3ebdSchin 
183da2e3ebdSchin #if _hdr_wchar && _lib_wctype && _lib_iswctype
184da2e3ebdSchin 
185da2e3ebdSchin #include <stdio.h> /* because <wchar.h> includes it and we generate it */
186da2e3ebdSchin #include <wchar.h>
187da2e3ebdSchin #if _hdr_wctype
188da2e3ebdSchin #include <wctype.h>
189da2e3ebdSchin #endif
190da2e3ebdSchin 
191da2e3ebdSchin #if !defined(iswblank) && !_lib_iswblank
192da2e3ebdSchin #define _need_iswblank	1
193da2e3ebdSchin #define iswblank(x)	_reg_iswblank(x)
194da2e3ebdSchin extern int		_reg_iswblank(wint_t);
195da2e3ebdSchin #endif
196da2e3ebdSchin 
197da2e3ebdSchin #if !defined(towupper) && !_lib_towupper
198da2e3ebdSchin #define towupper(x)	toupper(x)
199da2e3ebdSchin #endif
200da2e3ebdSchin 
201da2e3ebdSchin #if !defined(towlower) && !_lib_towlower
202da2e3ebdSchin #define towlower(x)	tolower(x)
203da2e3ebdSchin #endif
204da2e3ebdSchin 
205da2e3ebdSchin #else
206da2e3ebdSchin 
207da2e3ebdSchin #undef	_lib_wctype
208da2e3ebdSchin 
209da2e3ebdSchin #ifndef iswalnum
210da2e3ebdSchin #define iswalnum(x)	isalnum(x)
211da2e3ebdSchin #endif
212da2e3ebdSchin #ifndef iswalpha
213da2e3ebdSchin #define iswalpha(x)	isalpha(x)
214da2e3ebdSchin #endif
215da2e3ebdSchin #ifndef iswcntrl
216da2e3ebdSchin #define iswcntrl(x)	iscntrl(x)
217da2e3ebdSchin #endif
218da2e3ebdSchin #ifndef iswdigit
219da2e3ebdSchin #define iswdigit(x)	isdigit(x)
220da2e3ebdSchin #endif
221da2e3ebdSchin #ifndef iswgraph
222da2e3ebdSchin #define iswgraph(x)	isgraph(x)
223da2e3ebdSchin #endif
224da2e3ebdSchin #ifndef iswlower
225da2e3ebdSchin #define iswlower(x)	islower(x)
226da2e3ebdSchin #endif
227da2e3ebdSchin #ifndef iswprint
228da2e3ebdSchin #define iswprint(x)	isprint(x)
229da2e3ebdSchin #endif
230da2e3ebdSchin #ifndef iswpunct
231da2e3ebdSchin #define iswpunct(x)	ispunct(x)
232da2e3ebdSchin #endif
233da2e3ebdSchin #ifndef iswspace
234da2e3ebdSchin #define iswspace(x)	isspace(x)
235da2e3ebdSchin #endif
236da2e3ebdSchin #ifndef iswupper
237da2e3ebdSchin #define iswupper(x)	isupper(x)
238da2e3ebdSchin #endif
239da2e3ebdSchin #ifndef iswxdigit
240da2e3ebdSchin #define iswxdigit(x)	isxdigit(x)
241da2e3ebdSchin #endif
242da2e3ebdSchin 
243da2e3ebdSchin #ifndef towlower
244da2e3ebdSchin #define towlower(x)	tolower(x)
245da2e3ebdSchin #endif
246da2e3ebdSchin #ifndef towupper
247da2e3ebdSchin #define towupper(x)	toupper(x)
248da2e3ebdSchin #endif
249da2e3ebdSchin 
250da2e3ebdSchin #endif
251da2e3ebdSchin 
252da2e3ebdSchin #ifndef	iswblank
253da2e3ebdSchin #define	iswblank(x)	((x)==' '||(x)=='\t')
254da2e3ebdSchin #endif
255da2e3ebdSchin 
256da2e3ebdSchin #ifndef iswgraph
257da2e3ebdSchin #define	iswgraph(x)	(iswprint(x)&&!iswblank(x))
258da2e3ebdSchin #endif
259da2e3ebdSchin 
260da2e3ebdSchin #define isword(x)	(isalnum(x)||(x)=='_')
261da2e3ebdSchin 
262da2e3ebdSchin /*
263da2e3ebdSchin  * collation element support
264da2e3ebdSchin  */
265da2e3ebdSchin 
2667c2fbfb3SApril Chin #define COLL_KEY_MAX	32
267da2e3ebdSchin 
268da2e3ebdSchin #if COLL_KEY_MAX < MB_LEN_MAX
269da2e3ebdSchin #undef	COLL_KEY_MAX
270da2e3ebdSchin #define COLL_KEY_MAX	MB_LEN_MAX
271da2e3ebdSchin #endif
272da2e3ebdSchin 
273da2e3ebdSchin typedef unsigned char Ckey_t[COLL_KEY_MAX+1];
274da2e3ebdSchin 
275da2e3ebdSchin #define COLL_end	0
276da2e3ebdSchin #define COLL_call	1
277da2e3ebdSchin #define COLL_char	2
278da2e3ebdSchin #define COLL_range	3
279da2e3ebdSchin #define COLL_range_lc	4
280da2e3ebdSchin #define COLL_range_uc	5
281da2e3ebdSchin 
282da2e3ebdSchin typedef struct Celt_s
283da2e3ebdSchin {
284da2e3ebdSchin 	short		typ;
285da2e3ebdSchin 	short		min;
286da2e3ebdSchin 	short		max;
287da2e3ebdSchin 	regclass_t	fun;
288da2e3ebdSchin 	Ckey_t		beg;
289da2e3ebdSchin 	Ckey_t		end;
290da2e3ebdSchin } Celt_t;
291da2e3ebdSchin 
292da2e3ebdSchin /*
293da2e3ebdSchin  * private stuff hanging off regex_t
294da2e3ebdSchin  */
295da2e3ebdSchin 
296da2e3ebdSchin typedef struct Stk_pos_s
297da2e3ebdSchin {
298da2e3ebdSchin 	off_t		offset;
299da2e3ebdSchin 	char*		base;
300da2e3ebdSchin } Stk_pos_t;
301da2e3ebdSchin 
302da2e3ebdSchin typedef struct Vector_s
303da2e3ebdSchin {
304da2e3ebdSchin 	Stk_t*		stk;		/* stack pointer		*/
305da2e3ebdSchin 	char*		vec;		/* the data			*/
306da2e3ebdSchin 	int		inc;		/* growth increment		*/
307da2e3ebdSchin 	int		siz;		/* element size			*/
308*b30d1939SAndy Fiddaman 	ssize_t		max;		/* max index			*/
309*b30d1939SAndy Fiddaman 	ssize_t		cur;		/* current index -- user domain	*/
310da2e3ebdSchin } Vector_t;
311da2e3ebdSchin 
312da2e3ebdSchin /*
313da2e3ebdSchin  * Rex_t subtypes
314da2e3ebdSchin  */
315da2e3ebdSchin 
316da2e3ebdSchin typedef struct Cond_s
317da2e3ebdSchin {
318da2e3ebdSchin 	unsigned char*	beg;		/* beginning of next match	*/
319da2e3ebdSchin 	struct Rex_s*	next[2];	/* 0:no 1:yes next pattern	*/
320da2e3ebdSchin 	struct Rex_s*	cont;		/* right catcher		*/
321da2e3ebdSchin 	int		yes;		/* yes condition hit		*/
322da2e3ebdSchin } Cond_t;
323da2e3ebdSchin 
324da2e3ebdSchin typedef struct Conj_left_s
325da2e3ebdSchin {
326da2e3ebdSchin 	unsigned char*	beg;		/* beginning of left match	*/
327da2e3ebdSchin 	struct Rex_s*	right;		/* right pattern		*/
328da2e3ebdSchin 	struct Rex_s*	cont;		/* right catcher		*/
329da2e3ebdSchin } Conj_left_t;
330da2e3ebdSchin 
331da2e3ebdSchin typedef struct Conj_right_s
332da2e3ebdSchin {
333da2e3ebdSchin 	unsigned char*	end;		/* end of left match		*/
334da2e3ebdSchin 	struct Rex_s*	cont;		/* ambient continuation		*/
335da2e3ebdSchin } Conj_right_t;
336da2e3ebdSchin 
337da2e3ebdSchin typedef unsigned int Bm_mask_t;
338da2e3ebdSchin 
339da2e3ebdSchin typedef struct Bm_s
340da2e3ebdSchin {
341da2e3ebdSchin 	Bm_mask_t**	mask;
342da2e3ebdSchin 	size_t*		skip;
343da2e3ebdSchin 	size_t*		fail;
344da2e3ebdSchin 	size_t		size;
345da2e3ebdSchin 	ssize_t		back;
346da2e3ebdSchin 	ssize_t		left;
347da2e3ebdSchin 	ssize_t		right;
348da2e3ebdSchin 	size_t		complete;
349da2e3ebdSchin } Bm_t;
350da2e3ebdSchin 
351da2e3ebdSchin typedef struct String_s
352da2e3ebdSchin {
353da2e3ebdSchin 	int*		fail;
354da2e3ebdSchin 	unsigned char*	base;
355da2e3ebdSchin 	size_t		size;
356da2e3ebdSchin } String_t;
357da2e3ebdSchin 
358da2e3ebdSchin typedef struct Set_s
359da2e3ebdSchin {
360da2e3ebdSchin 	unsigned char	bits[(UCHAR_MAX+1)/CHAR_BIT];
361da2e3ebdSchin } Set_t;
362da2e3ebdSchin 
363da2e3ebdSchin typedef struct Collate_s
364da2e3ebdSchin {
365da2e3ebdSchin 	int		invert;
366da2e3ebdSchin 	Celt_t*		elements;
367da2e3ebdSchin } Collate_t;
368da2e3ebdSchin 
369da2e3ebdSchin typedef struct Binary_s
370da2e3ebdSchin {
371da2e3ebdSchin 	struct Rex_s*	left;
372da2e3ebdSchin 	struct Rex_s*	right;
373da2e3ebdSchin 	int		serial;
374da2e3ebdSchin } Binary_t;
375da2e3ebdSchin 
376da2e3ebdSchin typedef struct Group_s
377da2e3ebdSchin {
378da2e3ebdSchin 	int		number;		/* group number			*/
379da2e3ebdSchin 	int		last;		/* last contained group number	*/
380da2e3ebdSchin 	int		size;		/* lookbehind size		*/
381da2e3ebdSchin 	int		back;		/* backreferenced		*/
382da2e3ebdSchin 	regflags_t	flags;		/* group flags			*/
383da2e3ebdSchin 	union
384da2e3ebdSchin 	{
385da2e3ebdSchin 	Binary_t	binary;
386da2e3ebdSchin 	struct Rex_s*	rex;
387da2e3ebdSchin 	}		expr;
388da2e3ebdSchin } Group_t;
389da2e3ebdSchin 
390da2e3ebdSchin typedef struct Exec_s
391da2e3ebdSchin {
392da2e3ebdSchin 	void*		data;
393da2e3ebdSchin 	const char*	text;
394da2e3ebdSchin 	size_t		size;
395da2e3ebdSchin } Exec_t;
396da2e3ebdSchin 
397da2e3ebdSchin #define REX_NEST_open		0x01
398da2e3ebdSchin #define REX_NEST_close		0x02
399da2e3ebdSchin #define REX_NEST_escape		0x04
400da2e3ebdSchin #define REX_NEST_quote		0x08
401da2e3ebdSchin #define REX_NEST_literal	0x10
402da2e3ebdSchin #define REX_NEST_delimiter	0x20
403da2e3ebdSchin #define REX_NEST_terminator	0x40
404da2e3ebdSchin #define REX_NEST_separator	0x80
405da2e3ebdSchin 
406da2e3ebdSchin #define REX_NEST_SHIFT		8
407da2e3ebdSchin 
408da2e3ebdSchin typedef struct Nest_s
409da2e3ebdSchin {
410da2e3ebdSchin 	int		primary;
411da2e3ebdSchin 	unsigned short	none;		/* for Nest_t.type[-1] */
412da2e3ebdSchin 	unsigned short	type[1];
413da2e3ebdSchin } Nest_t;
414da2e3ebdSchin 
415da2e3ebdSchin /*
416da2e3ebdSchin  * REX_ALT catcher, solely to get control at the end of an
417da2e3ebdSchin  * alternative to keep records for comparing matches.
418da2e3ebdSchin  */
419da2e3ebdSchin 
420da2e3ebdSchin typedef struct Alt_catch_s
421da2e3ebdSchin {
422da2e3ebdSchin 	struct Rex_s*	cont;
423da2e3ebdSchin } Alt_catch_t;
424da2e3ebdSchin 
425da2e3ebdSchin typedef struct Group_catch_s
426da2e3ebdSchin {
427da2e3ebdSchin 	struct Rex_s*	cont;
428da2e3ebdSchin 	regoff_t*	eo;
429da2e3ebdSchin } Group_catch_t;
430da2e3ebdSchin 
431da2e3ebdSchin typedef struct Behind_catch_s
432da2e3ebdSchin {
433da2e3ebdSchin 	struct Rex_s*	cont;
434da2e3ebdSchin 	unsigned char*	beg;
435da2e3ebdSchin 	unsigned char*	end;
436da2e3ebdSchin } Behind_catch_t;
437da2e3ebdSchin 
438da2e3ebdSchin /*
439da2e3ebdSchin  * REX_NEG catcher determines what string lengths can be matched,
440da2e3ebdSchin  * then Neg investigates continuations of other lengths.
441da2e3ebdSchin  * This is inefficient.  For !POSITIONS expressions, we can do better:
442da2e3ebdSchin  * since matches to rex will be enumerated in decreasing order,
443da2e3ebdSchin  * we can investigate continuations whenever a length is skipped.
444da2e3ebdSchin  */
445da2e3ebdSchin 
446da2e3ebdSchin typedef struct Neg_catch_s
447da2e3ebdSchin {
448da2e3ebdSchin 	unsigned char*	beg;
449da2e3ebdSchin 	unsigned char*	index;
450da2e3ebdSchin } Neg_catch_t;
451da2e3ebdSchin 
452da2e3ebdSchin /*
453da2e3ebdSchin  * REX_REP catcher.  One is created on the stack for
454da2e3ebdSchin  * each iteration of a complex repetition.
455da2e3ebdSchin  */
456da2e3ebdSchin 
457da2e3ebdSchin typedef struct Rep_catch_s
458da2e3ebdSchin {
459da2e3ebdSchin 	struct Rex_s*	cont;
460da2e3ebdSchin 	struct Rex_s*	ref;
461da2e3ebdSchin 	unsigned char*	beg;
462da2e3ebdSchin 	int		n;
463da2e3ebdSchin } Rep_catch_t;
464da2e3ebdSchin 
465da2e3ebdSchin /*
466da2e3ebdSchin  * data structure for an alternation of pure strings
467da2e3ebdSchin  * son points to a subtree of all strings with a common
468da2e3ebdSchin  * prefix ending in character c.  sib links alternate
469da2e3ebdSchin  * letters in the same position of a word.  end=1 if
470da2e3ebdSchin  * some word ends with c.  the order of strings is
471da2e3ebdSchin  * irrelevant, except long words must be investigated
472da2e3ebdSchin  * before short ones.
473da2e3ebdSchin  */
474da2e3ebdSchin 
475da2e3ebdSchin typedef struct Trie_node_s
476da2e3ebdSchin {
477da2e3ebdSchin 	unsigned char		c;
478da2e3ebdSchin 	unsigned char		end;
479da2e3ebdSchin 	struct Trie_node_s*	son;
480da2e3ebdSchin 	struct Trie_node_s*	sib;
481da2e3ebdSchin } Trie_node_t;
482da2e3ebdSchin 
483da2e3ebdSchin typedef struct Trie_s
484da2e3ebdSchin {
485da2e3ebdSchin 	Trie_node_t**	root;
486da2e3ebdSchin 	int		min;
487da2e3ebdSchin 	int		max;
488da2e3ebdSchin } Trie_t;
489da2e3ebdSchin 
490da2e3ebdSchin /*
491da2e3ebdSchin  * Rex_t is a node in a regular expression
492da2e3ebdSchin  */
493da2e3ebdSchin 
494da2e3ebdSchin typedef struct Rex_s
495da2e3ebdSchin {
496da2e3ebdSchin 	unsigned char	type;			/* node type		*/
497da2e3ebdSchin 	unsigned char	marked;			/* already marked	*/
498da2e3ebdSchin 	short		serial;			/* subpattern number	*/
499da2e3ebdSchin 	regflags_t	flags;			/* scoped flags		*/
500da2e3ebdSchin 	int		explicit;		/* scoped explicit match*/
501da2e3ebdSchin 	struct Rex_s*	next;			/* remaining parts	*/
502da2e3ebdSchin 	int		lo;			/* lo dup count		*/
503da2e3ebdSchin 	int		hi;			/* hi dup count		*/
504da2e3ebdSchin 	unsigned char*	map;			/* fold and/or ccode map*/
505da2e3ebdSchin 	union
506da2e3ebdSchin 	{
507da2e3ebdSchin 	Alt_catch_t	alt_catch;		/* alt catcher		*/
508da2e3ebdSchin 	Bm_t		bm;			/* bm			*/
509da2e3ebdSchin 	Behind_catch_t	behind_catch;		/* behind catcher	*/
510da2e3ebdSchin 	Set_t*		charclass;		/* char class		*/
511da2e3ebdSchin 	Collate_t	collate;		/* collation class	*/
512da2e3ebdSchin 	Cond_t		cond_catch;		/* cond catcher		*/
513da2e3ebdSchin 	Conj_left_t	conj_left;		/* conj left catcher	*/
514da2e3ebdSchin 	Conj_right_t	conj_right;		/* conj right catcher	*/
515da2e3ebdSchin 	void*		data;			/* data after Rex_t	*/
516da2e3ebdSchin 	Exec_t		exec;			/* re.re_exec() args	*/
517da2e3ebdSchin 	Group_t		group;			/* a|b or rep		*/
518da2e3ebdSchin 	Group_catch_t	group_catch;		/* group catcher	*/
519da2e3ebdSchin 	Neg_catch_t	neg_catch;		/* neg catcher		*/
520da2e3ebdSchin 	Nest_t		nest;			/* nested match		*/
521da2e3ebdSchin 	unsigned char	onechar;		/* single char		*/
522da2e3ebdSchin 	Rep_catch_t	rep_catch;		/* rep catcher		*/
523da2e3ebdSchin 	String_t	string;			/* string/kmp		*/
524da2e3ebdSchin 	Trie_t		trie;			/* trie			*/
525da2e3ebdSchin 	}		re;
526da2e3ebdSchin } Rex_t;
527da2e3ebdSchin 
528da2e3ebdSchin typedef struct reglib_s			/* library private regex_t info	*/
529da2e3ebdSchin {
530da2e3ebdSchin 	struct Rex_s*	rex;		/* compiled expression		*/
531da2e3ebdSchin 	regdisc_t*	disc;		/* REG_DISCIPLINE discipline	*/
532da2e3ebdSchin 	const regex_t*	regex;		/* from regexec			*/
533da2e3ebdSchin 	unsigned char*	beg;		/* beginning of string		*/
534da2e3ebdSchin 	unsigned char*	end;		/* end of string		*/
535da2e3ebdSchin 	Vector_t*	pos;		/* posns of certain subpatterns	*/
536da2e3ebdSchin 	Vector_t*	bestpos;	/* ditto for best match		*/
537da2e3ebdSchin 	regmatch_t*	match;		/* subexrs in current match 	*/
538da2e3ebdSchin 	regmatch_t*	best;		/* ditto in best match yet	*/
539da2e3ebdSchin 	Stk_pos_t	stk;		/* exec stack pos		*/
540da2e3ebdSchin 	size_t		min;		/* minimum match length		*/
541da2e3ebdSchin 	size_t		nsub;		/* internal re_nsub		*/
542da2e3ebdSchin 	regflags_t	flags;		/* flags from regcomp()		*/
543da2e3ebdSchin 	int		error;		/* last error			*/
544da2e3ebdSchin 	int		explicit;	/* explicit match on this char	*/
545da2e3ebdSchin 	int		leading;	/* leading match on this char	*/
546da2e3ebdSchin 	int		refs;		/* regcomp()+regdup() references*/
547da2e3ebdSchin 	Rex_t		done;		/* the last continuation	*/
548da2e3ebdSchin 	regstat_t	stats;		/* for regstat()		*/
549da2e3ebdSchin 	unsigned char	fold[UCHAR_MAX+1]; /* REG_ICASE map		*/
550da2e3ebdSchin 	unsigned char	hard;		/* hard comp			*/
551da2e3ebdSchin 	unsigned char	once;		/* if 1st parse fails, quit	*/
552da2e3ebdSchin 	unsigned char	separate;	/* cannot combine		*/
553da2e3ebdSchin 	unsigned char	stack;		/* hard comp or exec		*/
554da2e3ebdSchin 	unsigned char	sub;		/* re_sub is valid		*/
555da2e3ebdSchin 	unsigned char	test;		/* debug/test bitmask		*/
556da2e3ebdSchin } Env_t;
557da2e3ebdSchin 
558*b30d1939SAndy Fiddaman typedef struct oldregmatch_s		/* pre-20120528 regmatch_t	*/
559*b30d1939SAndy Fiddaman {
560*b30d1939SAndy Fiddaman 	int		rm_so;		/* offset of start		*/
561*b30d1939SAndy Fiddaman 	int		rm_eo;		/* offset of end		*/
562*b30d1939SAndy Fiddaman } oldregmatch_t;
563*b30d1939SAndy Fiddaman 
564da2e3ebdSchin typedef struct State_s				/* shared state		*/
565da2e3ebdSchin {
566da2e3ebdSchin 	regmatch_t	nomatch;
567da2e3ebdSchin 	struct
568da2e3ebdSchin 	{
569da2e3ebdSchin 	unsigned char	key;
570da2e3ebdSchin 	short		val[15];
571da2e3ebdSchin 	}		escape[52];
572da2e3ebdSchin 	short*		magic[UCHAR_MAX+1];
573da2e3ebdSchin 	regdisc_t	disc;
574da2e3ebdSchin 	int		fatal;
575da2e3ebdSchin 	int		initialized;
576da2e3ebdSchin 	Dt_t*		attrs;
577da2e3ebdSchin 	Dt_t*		names;
578da2e3ebdSchin 	Dtdisc_t	dtdisc;
579da2e3ebdSchin } State_t;
580da2e3ebdSchin 
581da2e3ebdSchin extern State_t		state;
582da2e3ebdSchin 
583da2e3ebdSchin extern void*		alloc(regdisc_t*, void*, size_t);
584da2e3ebdSchin extern regclass_t	classfun(int);
585da2e3ebdSchin extern void		drop(regdisc_t*, Rex_t*);
586da2e3ebdSchin extern int		fatal(regdisc_t*, int, const char*);
587da2e3ebdSchin 
588da2e3ebdSchin #endif
589