xref: /illumos-gate/usr/src/cmd/expr/compile.c (revision 7c478bd95313f5f23a4c958a745db2134aa03244)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 1995-2003 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * xcompile, xstep, xadvance - simulate compile(3g), step(3g), advance(3g)
29  *	using regcomp(3c), regexec(3c) interfaces. This is an XCU4
30  *	porting aid. switches out to libgen compile/step if collation
31  *	table not present.
32  *
33  *	Goal is to work with vi and sed/ed.
34  * 	Returns expbuf in dhl format (encoding of first two bytes).
35  * 	Note also that this is profoundly single threaded.  You
36  *	cannot call compile twice with two separate search strings
37  *	because the second call will wipe out the earlier stored string.
38  *	This must be fixed, plus a general cleanup should be performed
39  *	if this is to be integrated into libc.
40  *
41  */
42 
43 #pragma ident	"%Z%%M%	%I%	%E% SMI"
44 
45 #include <stdio.h>
46 #include <widec.h>
47 #include <sys/types.h>
48 #include <regex.h>
49 #include <locale.h>
50 #include <stdlib.h>
51 #include <locale.h>
52 #include <string.h>
53 #include <unistd.h>
54 #include <regexpr.h>
55 
56 /*
57  * pseudo compile/step/advance global variables (defined elsewhere)
58  */
59 extern int nbra;
60 extern char *locs; 		/* for stopping excess recursion */
61 extern char *loc1;  		/* 1st character which matched RE */
62 extern char *loc2; 		/* char after last char in matched RE */
63 extern char *braslist[]; 	/* start of nbra subexp  */
64 extern char *braelist[]; 	/* end of nbra subexp    */
65 extern int regerrno;
66 extern int reglength;
67 
68 int regcomp_flags;		/* cflags passed to both regcomp() calls in dhl_compile() */
69 
70 void regex_comp_free(void *a);	/* regfree() both regex_t members of *a */
71 static int dhl_step(const char *str, const char *ep);
72 static int dhl_advance(const char *str, const char *ep);
73 static int map_errnos(int);		/* Convert regcomp error to regerrno */
74 static int dhl_doit(const char *, const regex_t *, const int flags);
75 static char * dhl_compile(const char *instr, char *ep, char *endbuf);
76 
77 /*
78  * # of sub re's: NOTE: For now limit on bra list defined here
79  * but fix is to add maxbra define to regex.h
80  * One problem is that a bigger number is a performance hit since
81  * regexec() has a slow initialization loop that goes around SEPSIZE times
82  */
83 #define	SEPSIZE 20
84 static regmatch_t rm[SEPSIZE];		/* match offsets filled in by regexec() */
85 
86 /*
87  * Structure to contain dl encoded first two bytes for vi, plus hold two
88  * regex structures, one for advance and one for step.  This is the image
89  * that dhl_compile() copies into the caller's expbuf.
90  */
90 static struct regex_comp {
91 	char 	r_head[2];		/* Header for DL encoding for vi */
92 	regex_t r_stp;			/* For use by step */
93 	regex_t r_adv;			/* For use by advance (pattern anchored with ^) */
94 } reg_comp;
95 
96 /*
97  * global value for the size of a regex_comp structure; this is the number
98  * of bytes dhl_compile() requires in (or allocates for) expbuf:
98  */
99 size_t regexc_size = sizeof (reg_comp);
100 
101 
/*
 * compile: public entry point emulating compile(3g).
 * Forwards its arguments unchanged to dhl_compile() and hands back
 * whatever that routine returns.
 */
char *
compile(const char *instr, char *expbuf, char *endbuf)
{
	char *result;

	result = dhl_compile(instr, expbuf, endbuf);
	return (result);
}
107 
/*
 * step: public entry point emulating step(3g).
 * Forwards to dhl_step() and returns its result.
 */
int
step(const char *instr, const char *expbuf)
{
	int matched;

	matched = dhl_step(instr, expbuf);
	return (matched);
}
113 
/*
 * advance: public entry point emulating advance(3g).
 * Forwards to dhl_advance() and returns its result.
 */
int
advance(const char *instr, const char *expbuf)
{
	int matched;

	matched = dhl_advance(instr, expbuf);
	return (matched);
}
119 
120 
121 /*
122  * the compile and step routines here simulate the old libgen routines of
123  * compile/step Re: regexpr(3G). in order to do this, we must assume
124  * that expbuf[] consists of the following format:
125  *	1) the first two bytes consist of a special encoding - see below.
126  *	2) the next part is a regex_t used by regexec()/regcomp() for step
127  *	3) the final part is a regex_t used by regexec()/regcomp() for advance
128  *
129  * the special encoding of the first two bytes is referenced throughout
130  * vi. apparently expbuf[0] is set to:
131  *	= 0 upon initialization
132  *	= 1 if the first char of the RE is a ^
133  *	= 0 if the first char of the RE isn't a ^
134  * and expbuf[1-35+]	= bitmap of the type of RE chars in the expression.
135  * this is apparently 0 if there's no RE.
136  * Here, we use expbuf[0] in a similar fashion; and expbuf[1] is non-zero
137  * if there's at least 1 RE in the string.
138  * I say "apparently" as the code to compile()/step() is poorly written.
139  */
140 static char *
141 dhl_compile(instr, expbuf, endbuf)
142 const char *instr;		/* the regular expression		*/
143 char *expbuf;			/* where the compiled RE gets placed	*/
144 char *endbuf;			/* ending addr of expbuf		*/
145 {
146 	int rv;
147 	int alloc = 0;
148 	char adv_instr[4096];	/* PLENTY big temp buffer */
149 	char *instrp;		/* PLENTY big temp buffer */
150 
151 	if (*instr == (char) NULL) {
152 		regerrno = 41;
153 		return (NULL);
154 	}
155 
156 	/*
157 	 * Check values of expbuf and endbuf
158 	 */
159 	if (expbuf == NULL) {
160 		if ((expbuf = malloc(regexc_size)) == NULL) {
161 			regerrno = 50;
162 			return (NULL);
163 		}
164 		memset(&reg_comp, 0, regexc_size);
165 		alloc = 1;
166 		endbuf = expbuf + regexc_size;
167 	} else {		/* Check if enough memory was allocated */
168 		if (expbuf + regexc_size > endbuf) {
169 			regerrno = 50;
170 			return (NULL);
171 		}
172 		memcpy(&reg_comp, expbuf, regexc_size);
173 	}
174 
175 	/*
176 	 * Clear global flags
177 	 */
178 	nbra = 0;
179 	regerrno = 0;
180 
181 	/*
182 	 * Free any data being held for previous search strings
183 	 */
184 	regex_comp_free(&reg_comp);
185 
186 	/*
187 	 * We call regcomp twice, once to get a regex_t for use by step()
188 	 * and then again with for use by advance()
189 	 */
190 	if ((rv = regcomp(&reg_comp.r_stp, instr, regcomp_flags)) != 0) {
191 		regerrno = map_errnos(rv);	/* Convert regcomp error */
192 		goto out;
193 	}
194 	/*
195 	 * To support advance, which assumes an implicit ^ to match at start
196 	 * of line we prepend a ^ to the pattern by copying to a temp buffer
197 	 */
198 
199 	if (instr[0] == '^')
200 		instrp = (char *) instr; /* String already has leading ^ */
201 	else {
202 		adv_instr[0] = '^';
203 		strncpy(&adv_instr[1], instr, 2048);
204 		instrp = adv_instr;
205 	}
206 
207 	if ((rv = regcomp(&reg_comp.r_adv, instrp, regcomp_flags)) != 0) {
208 		regerrno = map_errnos(rv);	/* Convert regcomp error */
209 		goto out;
210 	}
211 
212 	/*
213 	 * update global variables
214 	 */
215 	nbra = (int) reg_comp.r_adv.re_nsub > 0 ?
216 	    (int) reg_comp.r_adv.re_nsub : 0;
217 	regerrno = 0;
218 
219 	/*
220 	 * Set the header flags for use by vi
221 	 */
222 	if (instr[0] == '^') 		/* if beginning of string,	*/
223 		reg_comp.r_head[0] = 1;	/* set special flag		*/
224 	else
225 		reg_comp.r_head[0] = 0;	/* clear special flag		*/
226 	/*
227 	 * note that for a single BRE, nbra will be 0 here.
228 	 * we're guaranteed that, at this point, a RE has been found.
229 	 */
230 	reg_comp.r_head[1] = 1;	/* set special flag		*/
231 	/*
232 	 * Copy our reg_comp structure to expbuf
233 	 */
234 	(void) memcpy(expbuf, (char *) &reg_comp, regexc_size);
235 
236 out:
237 	/*
238 	 * Return code from libgen regcomp with mods.  Note weird return
239 	 * value - if space is malloc'd return pointer to start of space,
240 	 * if user provided his own space, return pointer to 1+last byte
241 	 * of his space.
242 	 */
243 	if (regerrno != 0) {
244 		if (alloc)
245 			free(expbuf);
246 		return (NULL);
247 	}
248 	reglength = regexc_size;
249 
250 	if (alloc)
251 		return (expbuf);
252 	else
253 		return (expbuf + regexc_size);
254 }
255 
256 
257 /*
258  * dhl_step: step through a string until a RE match is found, or end of str
259  */
260 static int
261 dhl_step(str, ep)
262 const char *str;		/* characters to be checked for a match	*/
263 const char *ep;			/* compiled RE from dhl_compile()	*/
264 {
265 	/*
266 	 * Check if we're passed a null ep
267 	 */
268 	if (ep == NULL) {
269 		regerrno = 41;	/* No remembered search string error */
270 		return (0);
271 	}
272 	/*
273 	 * Call common routine with r_stp (step) structure
274 	 */
275 	return (dhl_doit(str, &(((struct regex_comp *) ep)->r_stp),
276 	    ((locs != NULL) ? REG_NOTBOL : 0)));
277 }
278 
279 /*
280  * dhl_advance: implement advance
281  */
282 static int
283 dhl_advance(str, ep)
284 const char *str;		/* characters to be checked for a match	*/
285 const char *ep;			/* compiled RE from dhl_compile()	*/
286 {
287 	int rv;
288 	/*
289 	 * Check if we're passed a null ep
290 	 */
291 	if (ep == NULL) {
292 		regerrno = 41;	/* No remembered search string error */
293 		return (0);
294 	}
295 	/*
296 	 * Call common routine with r_adv (advance) structure
297 	 */
298 	rv = dhl_doit(str, &(((struct regex_comp *) ep)->r_adv), 0);
299 	loc1 = NULL;		/* Clear it per the compile man page */
300 	return (rv);
301 }
302 
303 /*
304  * dhl_doit - common code for step and advance
305  */
306 static int
307 dhl_doit(str, rep, flags)
308 const char *str;		/* characters to be checked for a match	*/
309 const regex_t *rep;
310 const int flags;		/* flags to be passed to regexec directly */
311 {
312 	int rv;
313 	int i;
314 	regmatch_t *prm;	/* ptr to current regmatch_t		*/
315 
316 	/*
317 	 * Check if we're passed a null regex_t
318 	 */
319 	if (rep == NULL) {
320 		regerrno = 41;	/* No remembered search string error */
321 		return (0);
322 	}
323 
324 	regerrno = 0;
325 	prm = &rm[0];
326 
327 	if ((rv = regexec(rep, str, SEPSIZE, prm, flags)) != REG_OK) {
328 		if (rv == REG_NOMATCH)
329 			return (0);
330 		regerrno = map_errnos(rv);
331 		return (0);
332 	}
333 
334 	loc1 = (char *)str + prm->rm_so;
335 	loc2 = (char *)str + prm->rm_eo;
336 
337 	/*
338 	 * Now we need to fill up the bra lists with all of the sub re's
339 	 * Note we subtract nsub -1, and preincrement prm.
340 	 */
341 	for (i = 0; i <= rep->re_nsub; i++) {
342 		prm++;		/* XXX inc past first subexp */
343 		braslist[i] = (char *)str + prm->rm_so;
344 		braelist[i] = (char *)str + prm->rm_eo;
345 		if (i >= SEPSIZE) {
346 			regerrno = 50; 	/* regex overflow */
347 			return (0);
348 		}
349 	}
350 
351 	/*
352 	 * Inverse logic, a zero from regexec - success, is a 1
353 	 * from advance/step.
354 	 */
355 
356 	return (rv == 0);
357 }
358 
359 
360 /*
361  *	regerrno to compile/step error mapping:
362  *	This is really a big compromise.  Some errors don't map at all
363  *	like regcomp error 15 is generated by both compile() error types
364  *  	44 & 46.  So which one should we map to?
365  *	Note REG_ESUB Can't happen- 9 is no longer max num of subexpressions
366  *	To do your errors right use xregerr() to get the regcomp error
367  *	string and print that.
368  *
369  * |	regcomp/regexec		     | 	Compile/step/advance		    |
370  * +---------------------------------+--------------------------------------+
371  * 0 REG_OK	  Pattern matched	1  - Pattern matched
372  * 1 REG_NOMATCH  No match		0  - Pattern didn't match
373  * 2 REG_ECOLLATE Bad collation elmnt.	67 - Returned by compile on mbtowc err
374  * 3 REG_EESCAPE  trailing \ in patrn	45 - } expected after \.
375  * 4 REG_ENEWLINE \n before end pattrn	36 - Illegal or missing delimiter.
376  * 5 REG_ENSUB	  Over 9 \( \) pairs 	43 - Too many \(
377  * 6 REG_ESUBREG  Bad number in \[0-9]  25 - ``\digit'' out of range.
378  * 7 REG_EBRACK   [ ] imbalance		49 - [ ] imbalance.
379  * 8 REG_EPAREN   ( ) imbalance         42 - \(~\) imbalance.
380  * 9 REG_EBRACE   \{ \} imbalance       45 - } expected after \.
381  * 10 REG_ERANGE  bad range endpoint	11 - Range endpoint too large.
382  * 11 REG_ESPACE  no memory for pattern 50 - Regular expression overflow.
383  * 12 REG_BADRPT  invalid repetition	36 - Illegal or missing delimiter.
384  * 13 REG_ECTYPE  invalid char-class    67 - illegal byte sequence
385  * 14 REG_BADPAT  syntax error		50 - Regular expression overflow.
386  * 15 REG_BADBR   \{ \} contents bad	46 - First number exceeds 2nd in \{~\}
387  * 16 REG_EFATAL  internal error	50 - Regular expression overflow.
388  * 17 REG_ECHAR   bad multibyte char	67 - illegal byte sequence
389  * 18 REG_STACK   stack overflow	50 - Regular expression overflow.
390  * 19 REG_ENOSYS  function not supported 50- Regular expression overflow.
391  *
392  *	For reference here's the compile/step errno's. We don't generate
393  *	41 here - it's done earlier, nor 44 since we can't tell if from 46.
394  *
395  *	11 - Range endpoint too large.
396  *	16 - Bad number.
397  *	25 - ``\digit'' out of range.
398  *	36 - Illegal or missing delimiter.
399  *	41 - No remembered search string.
400  *	42 - \(~\) imbalance.
401  *	43 - Too many \(.
402  *	44 - More than 2 numbers given in "\{~\}"
403  *	45 - } expected after \.
404  *	46 - First number exceeds 2nd in "\{~\}"
405  *	49 - [ ] imbalance.
406  *	50 - Regular expression overflow.
407  */
408 
409 static int
410 map_errnos(int Errno)
411 {
412 	switch (Errno) {
413 	case REG_ECOLLATE:
414 		regerrno = 67;
415 		break;
416 	case REG_EESCAPE:
417 		regerrno = 45;
418 		break;
419 	case REG_ENEWLINE:
420 		regerrno = 36;
421 		break;
422 	case REG_ENSUB:
423 		regerrno = 43;
424 		break;
425 	case REG_ESUBREG:
426 		regerrno = 25;
427 		break;
428 	case REG_EBRACK:
429 		regerrno = 49;
430 		break;
431 	case REG_EPAREN:
432 		regerrno = 42;
433 		break;
434 	case REG_EBRACE:
435 		regerrno = 45;
436 		break;
437 	case REG_ERANGE:
438 		regerrno = 11;
439 		break;
440 	case REG_ESPACE:
441 		regerrno = 50;
442 		break;
443 	case REG_BADRPT:
444 		regerrno = 36;
445 		break;
446 	case REG_ECTYPE:
447 		regerrno = 67;
448 		break;
449 	case REG_BADPAT:
450 		regerrno = 50;
451 		break;
452 	case REG_BADBR:
453 		regerrno = 46;
454 		break;
455 	case REG_EFATAL:
456 		regerrno = 50;
457 		break;
458 	case REG_ECHAR:
459 		regerrno = 67;
460 		break;
461 	case REG_STACK:
462 		regerrno = 50;
463 		break;
464 	case REG_ENOSYS:
465 		regerrno = 50;
466 		break;
467 	default:
468 		regerrno = 50;
469 		break;
470 	}
471 	return (regerrno);
472 }
473 
474 /*
475  *  This is a routine to clean up the subtle substructure of the struct
476  *  regex_comp type for use by clients of this module.  Since the struct
477  *  type is private, we use a generic interface, and trust the
478  *  application to be damn sure that this operation is valid for the
479  *  named memory.
480  */
481 
482 void
483 regex_comp_free(void * a)
484 {
485 	/*
486 	 * Free any data being held for previous search strings
487 	 */
488 
489 	if (((struct regex_comp *) a) == NULL) {
490 		return;
491 	}
492 
493 	regfree(&((struct regex_comp *)a)->r_stp);
494 	regfree(&((struct regex_comp *)a)->r_adv);
495 }
496