xref: /illumos-gate/usr/src/cmd/sgs/lex/common/sub3.c (revision 55855f50)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * sub3.c ... ALE enhancement.
28  * Since a typical Asian language has a huge character set, it is not
29  * ideal to index an array by a character code itself, which requires
30  * as large as 2**16 entries per array.
31  * To get arround this problem, we identify a set of characters that
32  * causes the same transition on all states and call it character group.
33  * Every character in a same character group has a unique number called
34  * character group id.  A function yycgid(c) maps the character c (in process
35  * code) to the id.  This mapping is determined by analyzing all regular
36  * expressions in the lex program.
37  *
38  */
39 #include	<stdlib.h>
40 #include	<widec.h>
41 #include	<search.h>
42 #include	"ldefs.h"
43 
44 /*
45  * "lchar" stands for linearized character.  It is a variant of
46  * process code.  AT&T's 16-bit process code has a drawback in which
47  * for three three process code C, D and E where C <= D <= E,
48  * codeset(C)==codeset(E) does not mean codeset(D)==codeset(C).
49  * In other words, four codesets alternates as the magnitude
50  * of character increases.
51  * The lchar representation holds this property:
52  *   If three lchar C', D' and E' have the relationship C' < D' <  E' and
53  *   codeset(C') == codeset(E') then D' is guaranteed to belong to
54  *   the same codeset as C' and E'.
55  * lchar is implemented as 32 bit entities and the function linearize()
56  * that maps a wchar_t to lchar is defined below.  There is no
57  * reverse function for it though.
58  * The 32-bit process code by AT&T, used only for Taiwanese version at the
59  * time of wrting, has no such problem and we use it as it is.
60  */
61 
62 lchar	yycgidtbl[MAXNCG] = {
63 	0,		/* For ease of computation of the id. */
64 	'\n',		/* Newline is always special because '.' exclude it. */
65 	0x000000ff,	/* The upper limit of codeset 0. */
66 	0x20ffffff,	/* The upper limit of codeset 2. */
67 	0x40ffffff	/* The upper limit of codeset 3. */
68 /*	0x60ffffff	   The upper limit of codeset 1. */
69 	/* Above assumes the number of significant bits of wchar_t is <= 24. */
70 };
71 int	ncgidtbl = 5; /* # elements in yycgidtbl. */
72 int	ncg; /* Should set to ncgidtbl*2; this is the largest value yycgid() */
73 		/* returns plus 1. */
74 
75 static void setsymbol(int i);
76 
77 /*
78  * For given 16-bit wchar_t (See NOTE), lchar is computed as illustrated below:
79  *
80  *	wc: axxxxxxbyyyyyyy
81  *
82  * returns: 0ab0000000000000axxxxxxxbyyyyyyy
83  *
84  * linearize() doesn't do any if compiled with 32-bit wchar_t, use of
85  * which is flagged with LONG_WCHAR_T macro.
86  * NOTE:
87  * The implementation is highly depends on the process code representation.
88  * This function should be modified when 32-bit process code is used.
89  * There is no need to keep 'a' and 'b' bits in the lower half of lchar.
90  * You can actually omit these and squeeze the xxxxxx part one bit right.
91  * We don't do that here just in sake of speed.
92  */
93 lchar
linearize(wchar_t wc)94 linearize(wchar_t wc)
95 {
96 #ifdef LONG_WCHAR_T
97 	return ((lchar)wc); /* Don't do anything. */
98 #else
99 
100 	lchar	prefix;
101 	switch (wc&0x8080) {
102 	case 0x0000: prefix = 0x00000000; break;
103 	case 0x0080: prefix = 0x20000000; break;
104 	case 0x8000: prefix = 0x40000000; break;
105 	case 0x8080: prefix = 0x60000000; break;
106 	}
107 	return (prefix|wc);
108 #endif
109 }
110 
111 /* compare liniear characters pointed to by pc1 and pc2 */
112 int
cmplc(const void * arg1,const void * arg2)113 cmplc(const void *arg1, const void *arg2)
114 {
115 	lchar *pc1 = (lchar *)arg1;
116 	lchar *pc2 = (lchar *)arg2;
117 
118 	if (*pc1 > *pc2)
119 		return (1);
120 	else if (*pc1 == *pc2)
121 		return (0);
122 	else
123 		return (-1);
124 }
125 
126 void
remch(wchar_t c)127 remch(wchar_t c)
128 {
129 	lchar	lc = linearize(c);
130 	size_t	local_ncgidtbl;
131 
132 	/*
133 	 * User-friendliness consideration:
134 	 * Make sure no EUC chars are used in reg. exp.
135 	 */
136 	if (!handleeuc) {
137 		if (!isascii(c)) {
138 			if (iswprint(c))
139 				warning(
140 "Non-ASCII character '%wc' in pattern; use -w or -e lex option.", c);
141 			else warning(
142 "Non-ASCII character of value %#x in pattern; use -w or -e lex option.", c);
143 		}
144 		/* In any case, we don't need to construct ncgidtbl[]. */
145 		return;
146 	}
147 
148 	/*
149 	 * lsearch wants ncgidtbl to be size_t, but it is int. Hence,
150 	 * the use of local_ncgidtbl to satisfy the calling interface.
151 	 */
152 	local_ncgidtbl = ncgidtbl;
153 	(void) lsearch(&lc, yycgidtbl,
154 	    &local_ncgidtbl, sizeof (lchar), cmplc);
155 	ncgidtbl = (int)local_ncgidtbl;
156 }
157 
158 void
sortcgidtbl(void)159 sortcgidtbl(void)
160 {
161 	if (!handleeuc)
162 		return;
163 	qsort(yycgidtbl, ncgidtbl, sizeof (lchar), cmplc);
164 }
165 
166 /*
167  * int yycgid(wchar_t c)
168  *	Takes c and returns its character group id, determind by the
169  *	following algorithm.  The program also uses the binary search
170  *	algorithm, generalized from Knuth (6.2.1) Algorithm B.
171  *
172  *	This function computes the "character group id" based on
173  *	a table yycgidtbl of which each lchar entry is pre-sorted
174  *	in ascending sequence  The number of valid entries is given
175  *	by YYNCGIDTBL.  There is no duplicate entries in yycgidtbl.
176  *		const int YYNCGIDTBL;
177  *		lchar	yycgidtbl[YYNCGIDTBL];
178  *
179  *	yycgidtbl[0] is guaranteed to have zero.
180  *
181  *	For given c, yycgid(c) returns:
182  *		2*i	iff yycgidtbl[i] == lc
183  *		2*i+1	iff yycgidtbl[i] < lc < yycgidtbl[i+1]
184  *		YYNCGIDTBL*2-1
185  *			iff yycgidtbl[YYNCGIDTBL-1] < lc
186  *	where lc=linearize(c).
187  *
188  *	Some interesting properties.:
189  *	1.  For any c, 0 <= yycgid(c) <= 2*YYNCGIDTBL-1
190  *	2.  yycgid(c) == 0  iff  c == 0.
191  *	3.  For any wchar_t c and d, if linearize(c) < linearize(d) then
192  *	    yycgid(c) <= yycgid(d).
193  *	4.  For any wchar_t c and d, if yycgid(c) < yycgid(d) then
194  *	    linearize(c) < linearize(d).
195  */
196 #define	YYNCGIDTBL ncgidtbl
197 
198 int
yycgid(wchar_t c)199 yycgid(wchar_t c)
200 {
201 	int first = 0;
202 	int last = YYNCGIDTBL - 1;
203 	lchar lc;
204 
205 	/*
206 	 * In ASCII compat. mode, each character forms a "group" and the
207 	 * group-id is itself...
208 	 */
209 	if (!handleeuc)
210 		return (c);
211 
212 	lc = linearize(c);
213 
214 	/* An exceptional case: yycgidtbl[YYNCGIDTBL-1] < lc */
215 	if (yycgidtbl[YYNCGIDTBL - 1] < lc)
216 		return (YYNCGIDTBL*2 - 1);
217 
218 	while (last >= 0) {
219 		int i = (first+last)/2;
220 		if (lc == yycgidtbl[i])
221 			return (2*i);	/* lc exactly matches an element. */
222 		else if (yycgidtbl[i] < lc) {
223 			if (lc < yycgidtbl[i+1]) {
224 				/* lc is in between two elements */
225 				return (2*i+1);
226 			}
227 			else
228 				first = i + 1;
229 		} else
230 			last = i - 1;
231 	}
232 	error(
233 	"system error in yycgid():binary search failed for c=0x%04x\n", c);
234 	return (0);
235 }
236 
237 /*
238  * repbycgid --- replaces each character in the parsing tree by its
239  * character group id.   This, however, should be called even in
240  * the ASCII compat. mode to process DOT nodes and to call cclinter()
241  * for the DOT and CCL nodes.
242  */
243 void
repbycgid(void)244 repbycgid(void)
245 {
246 	int i, c;
247 
248 	for (i = 0; i < tptr; ++i) {
249 		c = name[i];
250 		if (!ISOPERATOR(c)) {
251 		/* If not an operator, it must be a char.  */
252 			name[i] = yycgid((wchar_t)c); /* So replace it. */
253 #ifdef DEBUG
254 			if (debug) {
255 				printf("name[%d]:'%c'->%d;\n", i, c, name[i]);
256 			}
257 #endif
258 		} else if (c == RSTR) {
259 			c = right[i];
260 			right[i] = yycgid((wchar_t)c);
261 #ifdef DEBUG
262 			if (debug) {
263 				printf(
264 				    "name[%d].right:'%c'->%d;\n",
265 				    i, c, right[i]);
266 			}
267 #endif
268 		} else if ((c == RCCL) || (c == RNCCL)) {
269 			CHR cc, *s;
270 			int j;
271 			CHR ccltoken[CCLSIZE];
272 			CHR *ccp;
273 			int m;
274 			/*
275 			 * This node represetns a character class RE [ccccc]
276 			 * s points to the string of characters that forms
277 			 * the class and/or a special prefix notation
278 			 * <RANGE>XY which corresponds to the RE X-Y,
279 			 * characters in the range of X and Y.  Here,
280 			 * X <= Y is guranteed.
281 			 * We transform these characters into a string
282 			 * of sorted character group ids.
283 			 *
284 			 * There is another mechanism of packing tables
285 			 * that is inherited from the ASCII lex.  Call of
286 			 * cclinter() is required for this packing.
287 			 * This used to be done as yylex() reads the lex
288 			 * rules but we have to do this here because the
289 			 * transition table is made to work on the char-group
290 			 * ids and the mapping cannot be determined until
291 			 * the entire file is read.
292 			 */
293 #ifdef DEBUG
294 			if (debug) {
295 				printf("name[%d]:R[N]CCL of \"", i);
296 				strpt(left[i]);
297 				printf(" -> {");
298 			}
299 #endif
300 			/* Prepare symbol[] for cclinter(). */
301 			for (j = 0; j < ncg; ++j)
302 				symbol[j] = FALSE;
303 
304 			s = (CHR *) left[i];
305 			while ((cc = *s++) != 0) {
306 				if (cc == RANGE) {
307 					int	low, high, i;
308 					/*
309 					 * Special form: <RANGE>XY
310 					 * This means the range X-Y.
311 					 * We mark all symbols[]
312 					 * elements for yycgid(X) thru
313 					 * yycgid(Y), inclusively.
314 					 */
315 					low = yycgid(*s++);
316 					high = yycgid(*s++);
317 					for (i = low; i <= high; ++i)
318 						setsymbol(i);
319 				} else {
320 					setsymbol(yycgid(cc));
321 				}
322 			}
323 
324 			/* Now make a transformed string of cgids. */
325 			s = ccptr;
326 			m = 0;
327 			for (j = 0; j < ncg; ++j)
328 				if (symbol[j]) {
329 					ccltoken[m++] = (CHR)j;
330 #ifdef DEBUG
331 					if (debug) printf("%d, ", j);
332 #endif
333 				}
334 
335 #ifdef DEBUG
336 			if (debug) printf("}\n");
337 #endif
338 			ccltoken[m] = 0;
339 			ccp = ccl;
340 			while (ccp < ccptr && scomp(ccltoken, ccp) != 0)
341 				ccp++;
342 			if (ccp < ccptr) {  /* character class found in ccl */
343 				left[i] = (int)ccp;
344 			} else { /* not in ccl, add it */
345 				left[i] = (int)ccptr;
346 				scopy(ccltoken, ccptr);
347 				ccptr += slength(ccltoken) + 1;
348 				if (ccptr > ccl + CCLSIZE)
349 					error(
350 					"Too many large character classes");
351 			}
352 			cclinter(c == RCCL);
353 		} else if (c == DOT) {
354 			if (psave == 0) { /* First DOT node. */
355 				int j, nlid;
356 				/*
357 				 * Make symbol[k]=TRUE for all k
358 				 *  except k == yycgid('\n').
359 				 */
360 				nlid = yycgid('\n');
361 				psave = ccptr;
362 				for (j = 1; j < ncg; ++j) {
363 					if (j == nlid) {
364 						symbol[j] = FALSE;
365 					} else {
366 						symbol[j] = TRUE;
367 						*ccptr++ = (CHR) j;
368 					}
369 				}
370 				*ccptr++ = 0;
371 				if (ccptr > ccl + CCLSIZE)
372 					error(
373 					"Too many large character classes");
374 			}
375 			/* Mimic mn1(RCCL,psave)... */
376 			name[i] = RCCL;
377 			left[i] = (int)psave;
378 			cclinter(1);
379 		}
380 	}
381 #ifdef DEBUG
382 	if (debug) {
383 		printf("treedump after repbycgid().\n");
384 		treedump();
385 	}
386 #endif
387 }
388 
389 static void
setsymbol(int i)390 setsymbol(int i)
391 {
392 	if (i > (int)sizeof (symbol))
393 		error("setsymbol: (SYSERR) %d out of range", i);
394 	symbol[i] = TRUE;
395 }
396