1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 1995-2003 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28/*
29 * xcompile, xstep, xadvance - simulate compile(3g), step(3g), advance(3g)
30 *	using regcomp(3c), regexec(3c) interfaces. This is an XCU4
31 *	porting aid. switches out to libgen compile/step if collation
32 *	table not present.
33 *
34 *	Goal is to work with vi and sed/ed.
35 *	Returns expbuf in dhl format (encoding of first two bytes).
36 *	Note also that this is profoundly single threaded.  You
37 *	cannot call compile twice with two separate search strings
38 *	because the second call will wipe out the earlier stored string.
39 *	This must be fixed, plus a general cleanup should be performed
40 *	if this is to be integrated into libc.
41 *
42 */
43
44#include <stdio.h>
45#include <widec.h>
46#include <sys/types.h>
47#include <regex.h>
48#include <locale.h>
49#include <stdlib.h>
50#include <locale.h>
51#include <string.h>
52#include <unistd.h>
53#include <regexpr.h>
54
/*
 * pseudo compile/step/advance global variables
 *
 * These are defined elsewhere (declared extern here) and form the
 * traditional compile/step/advance interface that this file emulates.
 */
extern int nbra;		/* # of subexpressions; set by dhl_compile */
extern char *locs;		/* for stopping excess recursion */
extern char *loc1;		/* 1st character which matched RE */
extern char *loc2;		/* char after last char in matched RE */
extern char *braslist[];	/* start of nbra subexp  */
extern char *braelist[];	/* end of nbra subexp    */
extern int regerrno;		/* compile/step style error code, 0 if none */
extern int reglength;		/* set to regexc_size after a good compile */

int regcomp_flags;		/* interface to specify cflags for regcomp */
68
/*
 * Forward declarations.  regex_comp_free() is the only non-static
 * routine declared here; the dhl_* routines are the file-local
 * implementations behind compile(), step() and advance().
 */
void regex_comp_free(void *a);
static int dhl_step(const char *str, const char *ep);
static int dhl_advance(const char *str, const char *ep);
static int map_errnos(int);		/* Convert regcomp error */
static int dhl_doit(const char *, const regex_t *, const int flags);
static char *dhl_compile(const char *instr, char *ep, char *endbuf);
75
/*
 * # of sub re's: NOTE: For now the limit on the bra list is defined here,
 * but the fix is to add a maxbra define to regex.h.
 * One problem is that a bigger number is a performance hit since
 * regexec() has a slow initialization loop that goes around SEPSIZE times
 *
 * rm[0] receives the overall match; rm[1] onward receive the
 * subexpression matches (see dhl_doit()).
 */
#define	SEPSIZE 20
static regmatch_t rm[SEPSIZE];		/* ptr to list of RE matches */
84
/*
 * Structure to contain dl encoded first two bytes for vi, plus hold two
 * regex structures, one for advance and one for step.
 *
 * reg_comp is the single file-scope working copy: dhl_compile() builds
 * the compiled expression in it and then copies it byte-for-byte into the
 * caller's expbuf.
 */
static struct regex_comp {
	char	r_head[2];		/* Header for DL encoding for vi */
	regex_t r_stp;			/* For use by step */
	regex_t r_adv;			/* For use by advance */
} reg_comp;
94
/*
 * global value for the size of a regex_comp structure:
 * this is the number of bytes dhl_compile() requires in (or allocates
 * for) expbuf.
 */
size_t regexc_size = sizeof (reg_comp);
99
100
/*
 * compile: public entry point.  Hands its arguments, unchanged, to
 * dhl_compile() and returns whatever that routine returns.
 */
char *
compile(const char *instr, char *expbuf, char *endbuf)
{
	char *compiled;

	compiled = dhl_compile(instr, expbuf, endbuf);
	return (compiled);
}
106
/*
 * step: public entry point.  Hands its arguments, unchanged, to
 * dhl_step() and returns that routine's result.
 */
int
step(const char *instr, const char *expbuf)
{
	int matched;

	matched = dhl_step(instr, expbuf);
	return (matched);
}
112
/*
 * advance: public entry point.  Hands its arguments, unchanged, to
 * dhl_advance() and returns that routine's result.
 */
int
advance(const char *instr, const char *expbuf)
{
	int matched;

	matched = dhl_advance(instr, expbuf);
	return (matched);
}
118
119
120/*
121 * the compile and step routines here simulate the old libgen routines of
122 * compile/step Re: regexpr(3G). in order to do this, we must assume
123 * that expbuf[] consists of the following format:
124 *	1) the first two bytes consist of a special encoding - see below.
125 *	2) the next part is a regex_t used by regexec()/regcomp() for step
126 *	3) the final part is a regex_t used by regexec()/regcomp() for advance
127 *
128 * the special encoding of the first two bytes is referenced throughout
129 * vi. apparently expbuf[0] is set to:
130 *	= 0 upon initialization
131 *	= 1 if the first char of the RE is a ^
132 *	= 0 if the first char of the RE isn't a ^
133 * and expbuf[1-35+]	= bitmap of the type of RE chars in the expression.
134 * this is apparently 0 if there's no RE.
135 * Here, we use expbuf[0] in a similar fashion; and expbuf[1] is non-zero
136 * if there's at least 1 RE in the string.
137 * I say "apparently" as the code to compile()/step() is poorly written.
138 */
139static char *
140dhl_compile(const char *instr,	/* the regular expression		*/
141    char *expbuf,		/* where the compiled RE gets placed	*/
142    char *endbuf)		/* ending addr of expbuf		*/
143{
144	int rv;
145	int alloc = 0;
146	char adv_instr[4096];	/* PLENTY big temp buffer */
147	char *instrp;		/* PLENTY big temp buffer */
148
149	if (*instr == '\0') {
150		regerrno = 41;
151		return (NULL);
152	}
153
154	/*
155	 * Check values of expbuf and endbuf
156	 */
157	if (expbuf == NULL) {
158		if ((expbuf = malloc(regexc_size)) == NULL) {
159			regerrno = 50;
160			return (NULL);
161		}
162		memset(&reg_comp, 0, regexc_size);
163		alloc = 1;
164		endbuf = expbuf + regexc_size;
165	} else {		/* Check if enough memory was allocated */
166		if (expbuf + regexc_size > endbuf) {
167			regerrno = 50;
168			return (NULL);
169		}
170		memcpy(&reg_comp, expbuf, regexc_size);
171	}
172
173	/*
174	 * Clear global flags
175	 */
176	nbra = 0;
177	regerrno = 0;
178
179	/*
180	 * Free any data being held for previous search strings
181	 */
182	regex_comp_free(&reg_comp);
183
184	/*
185	 * We call regcomp twice, once to get a regex_t for use by step()
186	 * and then again with for use by advance()
187	 */
188	if ((rv = regcomp(&reg_comp.r_stp, instr, regcomp_flags)) != 0) {
189		regerrno = map_errnos(rv);	/* Convert regcomp error */
190		goto out;
191	}
192	/*
193	 * To support advance, which assumes an implicit ^ to match at start
194	 * of line we prepend a ^ to the pattern by copying to a temp buffer
195	 */
196
197	if (instr[0] == '^')
198		instrp = (char *)instr; /* String already has leading ^ */
199	else {
200		adv_instr[0] = '^';
201		strncpy(&adv_instr[1], instr, 2048);
202		instrp = adv_instr;
203	}
204
205	if ((rv = regcomp(&reg_comp.r_adv, instrp, regcomp_flags)) != 0) {
206		regerrno = map_errnos(rv);	/* Convert regcomp error */
207		goto out;
208	}
209
210	/*
211	 * update global variables
212	 */
213	nbra = (int)reg_comp.r_adv.re_nsub > 0 ?
214	    (int)reg_comp.r_adv.re_nsub : 0;
215	regerrno = 0;
216
217	/*
218	 * Set the header flags for use by vi
219	 */
220	if (instr[0] == '^')		/* if beginning of string,	*/
221		reg_comp.r_head[0] = 1;	/* set special flag		*/
222	else
223		reg_comp.r_head[0] = 0;	/* clear special flag		*/
224	/*
225	 * note that for a single BRE, nbra will be 0 here.
226	 * we're guaranteed that, at this point, a RE has been found.
227	 */
228	reg_comp.r_head[1] = 1;	/* set special flag		*/
229	/*
230	 * Copy our reg_comp structure to expbuf
231	 */
232	(void) memcpy(expbuf, (char *)&reg_comp, regexc_size);
233
234out:
235	/*
236	 * Return code from libgen regcomp with mods.  Note weird return
237	 * value - if space is malloc'd return pointer to start of space,
238	 * if user provided their own space, return pointer to 1+last byte
239	 * of that space.
240	 */
241	if (regerrno != 0) {
242		if (alloc)
243			free(expbuf);
244		return (NULL);
245	}
246	reglength = regexc_size;
247
248	if (alloc)
249		return (expbuf);
250	else
251		return (expbuf + regexc_size);
252}
253
254
255/*
256 * dhl_step: step through a string until a RE match is found, or end of str
257 */
258static int
259dhl_step(const char *str,	/* characters to be checked for a match	*/
260    const char *ep)		/* compiled RE from dhl_compile()	*/
261{
262	/*
263	 * Check if we're passed a null ep
264	 */
265	if (ep == NULL) {
266		regerrno = 41;	/* No remembered search string error */
267		return (0);
268	}
269	/*
270	 * Call common routine with r_stp (step) structure
271	 */
272	return (dhl_doit(str, &(((struct regex_comp *)ep)->r_stp),
273	    ((locs != NULL) ? REG_NOTBOL : 0)));
274}
275
276/*
277 * dhl_advance: implement advance
278 */
279static int
280dhl_advance(const char *str,	/* characters to be checked for a match	*/
281    const char *ep)		/* compiled RE from dhl_compile()	*/
282{
283	int rv;
284	/*
285	 * Check if we're passed a null ep
286	 */
287	if (ep == NULL) {
288		regerrno = 41;	/* No remembered search string error */
289		return (0);
290	}
291	/*
292	 * Call common routine with r_adv (advance) structure
293	 */
294	rv = dhl_doit(str, &(((struct regex_comp *)ep)->r_adv), 0);
295	loc1 = NULL;		/* Clear it per the compile man page */
296	return (rv);
297}
298
299/*
300 * dhl_doit - common code for step and advance
301 */
302static int
303dhl_doit(const char *str,	/* characters to be checked for a match	*/
304    const regex_t *rep,
305    const int flags)		/* flags to be passed to regexec directly */
306{
307	int rv;
308	int i;
309	regmatch_t *prm;	/* ptr to current regmatch_t		*/
310
311	/*
312	 * Check if we're passed a null regex_t
313	 */
314	if (rep == NULL) {
315		regerrno = 41;	/* No remembered search string error */
316		return (0);
317	}
318
319	regerrno = 0;
320	prm = &rm[0];
321
322	if ((rv = regexec(rep, str, SEPSIZE, prm, flags)) != REG_OK) {
323		if (rv == REG_NOMATCH)
324			return (0);
325		regerrno = map_errnos(rv);
326		return (0);
327	}
328
329	loc1 = (char *)str + prm->rm_so;
330	loc2 = (char *)str + prm->rm_eo;
331
332	/*
333	 * Now we need to fill up the bra lists with all of the sub re's
334	 * Note we subtract nsub -1, and preincrement prm.
335	 */
336	for (i = 0; i <= rep->re_nsub; i++) {
337		prm++;		/* XXX inc past first subexp */
338		braslist[i] = (char *)str + prm->rm_so;
339		braelist[i] = (char *)str + prm->rm_eo;
340		if (i >= SEPSIZE) {
341			regerrno = 50;	/* regex overflow */
342			return (0);
343		}
344	}
345
346	/*
347	 * Inverse logic, a zero from regexec - success, is a 1
348	 * from advance/step.
349	 */
350
351	return (rv == 0);
352}
353
354
355/*
356 *	regerrno to compile/step error mapping:
357 *	This is really a big compromise.  Some errors don't map at all
358 *	like regcomp error 15 is generated by both compile() error types
359 *	44 & 46.  So which one should we map to?
 *	Note REG_ENSUB can't happen - 9 is no longer max num of subexpressions
361 *	To do your errors right use xregerr() to get the regcomp error
362 *	string and print that.
363 *
364 * |    regcomp/regexec              |  Compile/step/advance                |
365 * +---------------------------------+--------------------------------------+
366 * 0 REG_OK	  Pattern matched	1  - Pattern matched
367 * 1 REG_NOMATCH  No match		0  - Pattern didn't match
368 * 2 REG_ECOLLATE Bad collation elmnt.	67 - Returned by compile on mbtowc err
369 * 3 REG_EESCAPE  trailing \ in patrn	45 - } expected after \.
370 * 4 REG_ENEWLINE \n before end pattrn	36 - Illegal or missing delimiter.
371 * 5 REG_ENSUB    Over 9 \( \) pairs	43 - Too many \(
372 * 6 REG_ESUBREG  Bad number in \[0-9]  25 - ``\digit'' out of range.
 * 7 REG_EBRACK   [ ] imbalance		49 - [ ] imbalance.
 * 8 REG_EPAREN   ( ) imbalance         42 - \(~\) imbalance.
 * 9 REG_EBRACE   \{ \} imbalance       45 - } expected after \.
376 * 10 REG_ERANGE  bad range endpoint	11 - Range endpoint too large.
377 * 11 REG_ESPACE  no memory for pattern 50 - Regular expression overflow.
378 * 12 REG_BADRPT  invalid repetition	36 - Illegal or missing delimiter.
379 * 13 REG_ECTYPE  invalid char-class    67 - illegal byte sequence
380 * 14 REG_BADPAT  syntax error		50 - Regular expression overflow.
381 * 15 REG_BADBR   \{ \} contents bad	46 - First number exceeds 2nd in \{~\}
382 * 16 REG_EFATAL  internal error	50 - Regular expression overflow.
 * 17 REG_ECHAR   bad multibyte char	67 - illegal byte sequence
384 * 18 REG_STACK   stack overflow	50 - Regular expression overflow.
385 * 19 REG_ENOSYS  function not supported 50- Regular expression overflow.
386 *
387 *	For reference here's the compile/step errno's. We don't generate
388 *	41 here - it's done earlier, nor 44 since we can't tell if from 46.
389 *
390 *	11 - Range endpoint too large.
391 *	16 - Bad number.
392 *	25 - ``\digit'' out of range.
393 *	36 - Illegal or missing delimiter.
394 *	41 - No remembered search string.
395 *	42 - \(~\) imbalance.
396 *	43 - Too many \(.
397 *	44 - More than 2 numbers given in "\{~\}"
398 *	45 - } expected after \.
399 *	46 - First number exceeds 2nd in "\{~\}"
400 *	49 - [ ] imbalance.
401 *	50 - Regular expression overflow.
402 */
403
404static int
405map_errnos(int Errno)
406{
407	switch (Errno) {
408	case REG_ECOLLATE:
409		regerrno = 67;
410		break;
411	case REG_EESCAPE:
412		regerrno = 45;
413		break;
414	case REG_ENEWLINE:
415		regerrno = 36;
416		break;
417	case REG_ENSUB:
418		regerrno = 43;
419		break;
420	case REG_ESUBREG:
421		regerrno = 25;
422		break;
423	case REG_EBRACK:
424		regerrno = 49;
425		break;
426	case REG_EPAREN:
427		regerrno = 42;
428		break;
429	case REG_EBRACE:
430		regerrno = 45;
431		break;
432	case REG_ERANGE:
433		regerrno = 11;
434		break;
435	case REG_ESPACE:
436		regerrno = 50;
437		break;
438	case REG_BADRPT:
439		regerrno = 36;
440		break;
441	case REG_ECTYPE:
442		regerrno = 67;
443		break;
444	case REG_BADPAT:
445		regerrno = 50;
446		break;
447	case REG_BADBR:
448		regerrno = 46;
449		break;
450	case REG_EFATAL:
451		regerrno = 50;
452		break;
453	case REG_ECHAR:
454		regerrno = 67;
455		break;
456	case REG_STACK:
457		regerrno = 50;
458		break;
459	case REG_ENOSYS:
460		regerrno = 50;
461		break;
462	default:
463		regerrno = 50;
464		break;
465	}
466	return (regerrno);
467}
468
469/*
470 *  This is a routine to clean up the subtle substructure of the struct
471 *  regex_comp type for use by clients of this module.  Since the struct
472 *  type is private, we use a generic interface, and trust the
473 *  application to be damn sure that this operation is valid for the
474 *  named memory.
475 */
476
477void
478regex_comp_free(void *a)
479{
480	/*
481	 * Free any data being held for previous search strings
482	 */
483
484	if (a == NULL) {
485		return;
486	}
487
488	regfree(&((struct regex_comp *)a)->r_stp);
489	regfree(&((struct regex_comp *)a)->r_adv);
490}
491