1/*
2 * Copyright (c) 1980 Regents of the University of California.
3 * All rights reserved.  The Berkeley software License Agreement
4 * specifies the terms and conditions for redistribution.
5 */
6
7#pragma ident	"%Z%%M%	%I%	%E% SMI"
8
9#include <ctype.h>
10
11typedef int	boolean;
12#define TRUE	1
13#define FALSE	0
14#define NIL	0
15
16extern boolean	l_onecase;	/* true if upper and lower equivalent */
17extern char	*l_idchars;	/* set of characters legal in identifiers
18				   in addition to letters and digits */
19
20extern char	*strchr();
21static void	expconv(void);
22
23#define isidchr(c)	\
24		(isalnum(c) || ((c) != NIL && strchr(l_idchars, (c)) != NIL))
25#define makelower(c)	(isupper((c)) ? tolower((c)) : (c))
26
27/*  STRNCMP -	like strncmp except that we convert the
28 *	 	first string to lower case before comparing
29 *		if l_onecase is set.
30 */
31
32int
33STRNCMP(char *s1, char *s2, int len)
34{
35	if (l_onecase) {
36	    do
37		if (*s2 - makelower(*s1))
38			return (*s2 - makelower(*s1));
39		else {
40			s2++;
41			s1++;
42		}
43	    while (--len);
44	} else {
45	    do
46		if (*s2 - *s1)
47			return (*s2 - *s1);
48		else {
49			s2++;
50			s1++;
51		}
52	    while (--len);
53	}
54	return(0);
55}
56
57/*	The following routine converts an irregular expression to
58 *	internal format.
59 *
60 *	Either meta symbols (\a \d or \p) or character strings or
61 *	operations ( alternation or parenthesizing ) can be
62 *	specified.  Each starts with a descriptor byte.  The descriptor
63 *	byte has STR set for strings, META set for meta symbols
64 *	and OPER set for operations.
65 *	The descriptor byte can also have the OPT bit set if the object
66 *	defined is optional.  Also ALT can be set to indicate an alternation.
67 *
68 *	For metasymbols the byte following the descriptor byte identities
69 *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
70 *	strings the byte after the descriptor is a character count for
71 *	the string:
72 *
73 *		meta symbols := descriptor
74 *				symbol
75 *
76 *		strings :=	descriptor
77 *				character count
78 *				the string
79 *
80 *		operations :=	descriptor
81 *				symbol
82 *				character count
83 */
84
85/*
86 *  handy macros for accessing parts of match blocks
87 */
88#define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
89#define MNEXT(A) (A+2)		/* character following a metasymbol block */
90
91#define OSYM(A) (*(A+1))	/* symbol in an operation block */
92#define OCNT(A) (*(A+2))	/* character count */
93#define ONEXT(A) (A+3)		/* next character after the operation */
94#define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
95
96#define SCNT(A) (*(A+1))	/* byte count of a string */
97#define SSTR(A) (A+2)		/* address of the string */
98#define SNEXT(A) (A+2+*(A+1))	/* character following the string */
99
100/*
101 *  bit flags in the descriptor
102 */
103#define OPT 1
104#define STR 2
105#define META 4
106#define ALT 8
107#define OPER 16
108
109char *ure;		/* pointer current position in unconverted exp */
110char *ccre;		/* pointer to current position in converted exp*/
111char *malloc();
112
113char *
114convexp(char *re)
115	/* re - unconverted irregular expression */
116{
117    char *cre;		/* pointer to converted regular expression */
118
119    /* allocate room for the converted expression */
120    if (re == NIL)
121	return (NIL);
122    if (*re == '\0')
123	return (NIL);
124    cre = malloc (4 * strlen(re) + 3);
125    ccre = cre;
126    ure = re;
127
128    /* start the conversion with a \a */
129    *cre = META | OPT;
130    MSYM(cre) = 'a';
131    ccre = MNEXT(cre);
132
133    /* start the conversion (its recursive) */
134    expconv ();
135    *ccre = 0;
136    return (cre);
137}
138
139static void
140expconv(void)
141{
142    char *cs;		/* pointer to current symbol in converted exp */
143    char c;		/* character being processed */
144    char *acs;		/* pinter to last alternate */
145    int temp;
146
147    /* let the conversion begin */
148    acs = NIL;
149    cs = NIL;
150    while (*ure != NIL) {
151	switch (c = *ure++) {
152
153	case '\\':
154	    switch (c = *ure++) {
155
156	    /* escaped characters are just characters */
157	    default:
158		if (cs == NIL || (*cs & STR) == 0) {
159		    cs = ccre;
160		    *cs = STR;
161		    SCNT(cs) = 1;
162		    ccre += 2;
163		} else
164		    SCNT(cs)++;
165		*ccre++ = c;
166		break;
167
168	    /* normal(?) metacharacters */
169	    case 'a':
170	    case 'd':
171	    case 'e':
172	    case 'p':
173		if (acs != NIL && acs != cs) {
174		    do {
175			temp = OCNT(acs);
176			OCNT(acs) = ccre - acs;
177			acs -= temp;
178		    } while (temp != 0);
179		    acs = NIL;
180		}
181		cs = ccre;
182		*cs = META;
183		MSYM(cs) = c;
184		ccre = MNEXT(cs);
185		break;
186	    }
187	    break;
188
189	/* just put the symbol in */
190	case '^':
191	case '$':
192	    if (acs != NIL && acs != cs) {
193		do {
194		    temp = OCNT(acs);
195		    OCNT(acs) = ccre - acs;
196		    acs -= temp;
197		} while (temp != 0);
198		acs = NIL;
199	    }
200	    cs = ccre;
201	    *cs = META;
202	    MSYM(cs) = c;
203	    ccre = MNEXT(cs);
204	    break;
205
206	/* mark the last match sequence as optional */
207	case '?':
208	    if (cs)
209	    	*cs = *cs | OPT;
210	    break;
211
212	/* recurse and define a subexpression */
213	case '(':
214	    if (acs != NIL && acs != cs) {
215		do {
216		    temp = OCNT(acs);
217		    OCNT(acs) = ccre - acs;
218		    acs -= temp;
219		} while (temp != 0);
220		acs = NIL;
221	    }
222	    cs = ccre;
223	    *cs = OPER;
224	    OSYM(cs) = '(';
225	    ccre = ONEXT(cs);
226	    expconv ();
227	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
228	    break;
229
230	/* return from a recursion */
231	case ')':
232	    if (acs != NIL) {
233		do {
234		    temp = OCNT(acs);
235		    OCNT(acs) = ccre - acs;
236		    acs -= temp;
237		} while (temp != 0);
238		acs = NIL;
239	    }
240	    cs = ccre;
241	    *cs = META;
242	    MSYM(cs) = c;
243	    ccre = MNEXT(cs);
244	    return;
245
246	/* mark the last match sequence as having an alternate */
247	/* the third byte will contain an offset to jump over the */
248	/* alternate match in case the first did not fail */
249	case '|':
250	    if (acs != NIL && acs != cs)
251		OCNT(ccre) = ccre - acs;	/* make a back pointer */
252	    else
253		OCNT(ccre) = 0;
254	    *cs |= ALT;
255	    cs = ccre;
256	    *cs = OPER;
257	    OSYM(cs) = '|';
258	    ccre = ONEXT(cs);
259	    acs = cs;	/* remember that the pointer is to be filles */
260	    break;
261
262	/* if its not a metasymbol just build a scharacter string */
263	default:
264	    if (cs == NIL || (*cs & STR) == 0) {
265		cs = ccre;
266		*cs = STR;
267		SCNT(cs) = 1;
268		ccre = SSTR(cs);
269	    } else
270		SCNT(cs)++;
271	    *ccre++ = c;
272	    break;
273	}
274    }
275    if (acs != NIL) {
276	do {
277	    temp = OCNT(acs);
278	    OCNT(acs) = ccre - acs;
279	    acs -= temp;
280	} while (temp != 0);
281	acs = NIL;
282    }
283}
284/* end of convertre */
285
286
287/*
288 *	The following routine recognises an irregular expresion
289 *	with the following special characters:
290 *
291 *		\?	-	means last match was optional
292 *		\a	-	matches any number of characters
293 *		\d	-	matches any number of spaces and tabs
294 *		\p	-	matches any number of alphanumeric
295 *				characters. The
296 *				characters matched will be copied into
297 *				the area pointed to by 'name'.
298 *		\|	-	alternation
299 *		\( \)	-	grouping used mostly for alternation and
300 *				optionality
301 *
302 *	The irregular expression must be translated to internal form
303 *	prior to calling this routine
304 *
305 *	The value returned is the pointer to the first non \a
306 *	character matched.
307 */
308
309boolean _escaped;		/* true if we are currently _escaped */
310char *Start;			/* start of string */
311
312char *
313expmatch(char *s, char *re, char *mstring)
314	/* s - string to check for a match in */
315	/* re - a converted irregular expression */
316	/* mstring - where to put whatever matches a \p */
317{
318    char *cs;		/* the current symbol */
319    char *ptr, *s1;	/* temporary pointer */
320    boolean matched;	/* a temporary boolean */
321
322    /* initial conditions */
323    if (re == NIL)
324	return (NIL);
325    cs = re;
326    matched = FALSE;
327
328    /* loop till expression string is exhausted (or at least pretty tired) */
329    while (*cs) {
330	switch (*cs & (OPER | STR | META)) {
331
332	/* try to match a string */
333	case STR:
334	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
335	    if (matched) {
336
337		/* hoorah it matches */
338		s += SCNT(cs);
339		cs = SNEXT(cs);
340	    } else if (*cs & ALT) {
341
342		/* alternation, skip to next expression */
343		cs = SNEXT(cs);
344	    } else if (*cs & OPT) {
345
346		/* the match is optional */
347		cs = SNEXT(cs);
348		matched = 1;		/* indicate a successful match */
349	    } else {
350
351		/* no match, error return */
352		return (NIL);
353	    }
354	    break;
355
356	/* an operator, do something fancy */
357	case OPER:
358	    switch (OSYM(cs)) {
359
360	    /* this is an alternation */
361	    case '|':
362		if (matched)
363
364		    /* last thing in the alternation was a match, skip ahead */
365		    cs = OPTR(cs);
366		else
367
368		    /* no match, keep trying */
369		    cs = ONEXT(cs);
370		break;
371
372	    /* this is a grouping, recurse */
373	    case '(':
374		ptr = expmatch (s, ONEXT(cs), mstring);
375		if (ptr != NIL) {
376
377		    /* the subexpression matched */
378		    matched = 1;
379		    s = ptr;
380		} else if (*cs & ALT) {
381
382		    /* alternation, skip to next expression */
383		    matched = 0;
384		} else if (*cs & OPT) {
385
386		    /* the match is optional */
387		    matched = 1;	/* indicate a successful match */
388		} else {
389
390		    /* no match, error return */
391		    return (NIL);
392		}
393		cs = OPTR(cs);
394		break;
395	    }
396	    break;
397
398	/* try to match a metasymbol */
399	case META:
400	    switch (MSYM(cs)) {
401
402	    /* try to match anything and remember what was matched */
403	    case 'p':
404		/*
405		 *  This is really the same as trying the match the
406		 *  remaining parts of the expression to any subset
407		 *  of the string.
408		 */
409		s1 = s;
410		do {
411		    ptr = expmatch (s1, MNEXT(cs), mstring);
412		    if (ptr != NIL && s1 != s) {
413
414			/* we have a match, remember the match */
415			strncpy (mstring, s, s1 - s);
416			mstring[s1 - s] = '\0';
417			return (ptr);
418		    } else if (ptr != NIL && (*cs & OPT)) {
419
420			/* it was aoptional so no match is ok */
421			return (ptr);
422		    } else if (ptr != NIL) {
423
424			/* not optional and we still matched */
425			return (NIL);
426		    }
427		    if (!isidchr(*s1))
428			return (NIL);
429		    if (*s1 == '\\')
430			_escaped = _escaped ? FALSE : TRUE;
431		    else
432			_escaped = FALSE;
433		} while (*s1++);
434		return (NIL);
435
436	    /* try to match anything */
437	    case 'a':
438		/*
439		 *  This is really the same as trying the match the
440		 *  remaining parts of the expression to any subset
441		 *  of the string.
442		 */
443		s1 = s;
444		do {
445		    ptr = expmatch (s1, MNEXT(cs), mstring);
446		    if (ptr != NIL && s1 != s) {
447
448			/* we have a match */
449			return (ptr);
450		    } else if (ptr != NIL && (*cs & OPT)) {
451
452			/* it was aoptional so no match is ok */
453			return (ptr);
454		    } else if (ptr != NIL) {
455
456			/* not optional and we still matched */
457			return (NIL);
458		    }
459		    if (*s1 == '\\')
460			_escaped = _escaped ? FALSE : TRUE;
461		    else
462			_escaped = FALSE;
463		} while (*s1++);
464		return (NIL);
465
466	    /* fail if we are currently _escaped */
467	    case 'e':
468		if (_escaped)
469		    return(NIL);
470		cs = MNEXT(cs);
471		break;
472
473	    /* match any number of tabs and spaces */
474	    case 'd':
475		ptr = s;
476		while (*s == ' ' || *s == '\t')
477		    s++;
478		if (s != ptr || s == Start) {
479
480		    /* match, be happy */
481		    matched = 1;
482		    cs = MNEXT(cs);
483		} else if (*s == '\n' || *s == '\0') {
484
485		    /* match, be happy */
486		    matched = 1;
487		    cs = MNEXT(cs);
488		} else if (*cs & ALT) {
489
490		    /* try the next part */
491		    matched = 0;
492		    cs = MNEXT(cs);
493		} else if (*cs & OPT) {
494
495		    /* doesn't matter */
496		    matched = 1;
497		    cs = MNEXT(cs);
498		} else
499
500		    /* no match, error return */
501		    return (NIL);
502		break;
503
504	    /* check for end of line */
505	    case '$':
506		if (*s == '\0' || *s == '\n') {
507
508		    /* match, be happy */
509		    s++;
510		    matched = 1;
511		    cs = MNEXT(cs);
512		} else if (*cs & ALT) {
513
514		    /* try the next part */
515		    matched = 0;
516		    cs = MNEXT(cs);
517		} else if (*cs & OPT) {
518
519		    /* doesn't matter */
520		    matched = 1;
521		    cs = MNEXT(cs);
522		} else
523
524		    /* no match, error return */
525		    return (NIL);
526		break;
527
528	    /* check for start of line */
529	    case '^':
530		if (s == Start) {
531
532		    /* match, be happy */
533		    matched = 1;
534		    cs = MNEXT(cs);
535		} else if (*cs & ALT) {
536
537		    /* try the next part */
538		    matched = 0;
539		    cs = MNEXT(cs);
540		} else if (*cs & OPT) {
541
542		    /* doesn't matter */
543		    matched = 1;
544		    cs = MNEXT(cs);
545		} else
546
547		    /* no match, error return */
548		    return (NIL);
549		break;
550
551	    /* end of a subexpression, return success */
552	    case ')':
553		return (s);
554	    }
555	    break;
556	}
557    }
558    return (s);
559}
560