xref: /illumos-gate/usr/src/cmd/localedef/ctype.c (revision 3e6960d7)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2010,2011 Nexenta Systems, Inc.  All rights reserved.
14  * Copyright 2012 Garrett D'Amore <garrett@damore.org>
15  * Copyright 2013 DEY Storage Systems, Inc.
16  */
17 
18 /*
19  * LC_CTYPE database generation routines for localedef.
20  */
21 
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <sys/types.h>
26 #include <sys/avl.h>
27 #include <wchar.h>
28 #include <ctype.h>
29 #include <wctype.h>
30 #include <unistd.h>
31 #include "_ctype.h"
32 #include "localedef.h"
33 #include "parser.tab.h"
34 #include "runefile.h"
35 
/*
 * Tree of ctype_node_t records, one per character seen so far.  It is
 * created by init_ctype(), populated on demand by get_ctype() and walked
 * in comparator order by dump_ctype().
 */
36 static avl_tree_t	ctypes;
37 
/*
 * Character most recently classified by add_ctype() or the end of the last
 * add_ctype_range(); add_ctype_range() starts just past this value.
 */
38 static wchar_t		last_ctype;
39 
/*
 * Per-character LC_CTYPE record stored in the "ctypes" AVL tree.
 * Nodes are zero-filled on allocation (see get_ctype()), so a value of 0
 * in toupper/tolower means "no mapping specified".
 */
40 typedef struct ctype_node {
41 	wchar_t wc;		/* character code; the AVL sort key */
42 	int32_t	ctype;		/* class bits plus width bits (_CTYPE_SW*) */
43 	int32_t	toupper;	/* upper-case mapping, 0 if unset */
44 	int32_t	tolower;	/* lower-case mapping, 0 if unset */
45 	avl_node_t avl;		/* linkage used by the AVL tree */
46 } ctype_node_t;
47 
/*
 * NOTE(review): this type is not referenced anywhere in the visible part
 * of the file; widths are recorded in ctype_node_t.ctype by add_width().
 * Presumably a leftover describing a character range and its display
 * width -- confirm before removing.
 */
48 typedef struct width_node {
49 	wchar_t start;		/* presumably first character of the range */
50 	wchar_t end;		/* presumably last character of the range */
51 	int8_t width;		/* presumably the column width for the range */
52 	avl_node_t avl;		/* AVL linkage */
53 } width_node_t;
54 
55 static int
56 ctype_compare(const void *n1, const void *n2)
57 {
58 	const ctype_node_t *c1 = n1;
59 	const ctype_node_t *c2 = n2;
60 
61 	return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
62 }
63 
64 void
65 init_ctype(void)
66 {
67 	avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t),
68 	    offsetof(ctype_node_t, avl));
69 }
70 
71 
72 static void
73 add_ctype_impl(ctype_node_t *ctn)
74 {
75 	switch (last_kw) {
76 	case T_ISUPPER:
77 		ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
78 		break;
79 	case T_ISLOWER:
80 		ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
81 		break;
82 	case T_ISALPHA:
83 		ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
84 		break;
85 	case T_ISDIGIT:
86 		ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT);
87 		break;
88 	case T_ISSPACE:
89 		ctn->ctype |= _ISSPACE;
90 		break;
91 	case T_ISCNTRL:
92 		ctn->ctype |= _ISCNTRL;
93 		break;
94 	case T_ISGRAPH:
95 		ctn->ctype |= (_ISGRAPH | _ISPRINT);
96 		break;
97 	case T_ISPRINT:
98 		ctn->ctype |= _ISPRINT;
99 		break;
100 	case T_ISPUNCT:
101 		ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
102 		break;
103 	case T_ISXDIGIT:
104 		ctn->ctype |= (_ISXDIGIT | _ISPRINT);
105 		break;
106 	case T_ISBLANK:
107 		ctn->ctype |= (_ISBLANK | _ISSPACE);
108 		break;
109 	case T_ISPHONOGRAM:
110 		ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
111 		break;
112 	case T_ISIDEOGRAM:
113 		ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
114 		break;
115 	case T_ISENGLISH:
116 		ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
117 		break;
118 	case T_ISNUMBER:
119 		ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
120 		break;
121 	case T_ISSPECIAL:
122 		ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
123 		break;
124 	case T_ISALNUM:
125 		/*
126 		 * We can't do anything with this.  The character
127 		 * should already be specified as a digit or alpha.
128 		 */
129 		break;
130 	default:
131 		errf(_("not a valid character class"));
132 	}
133 }
134 
135 static ctype_node_t *
136 get_ctype(wchar_t wc)
137 {
138 	ctype_node_t	srch;
139 	ctype_node_t	*ctn;
140 	avl_index_t	where;
141 
142 	srch.wc = wc;
143 	if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) {
144 		if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
145 			errf(_("out of memory"));
146 			return (NULL);
147 		}
148 		ctn->wc = wc;
149 
150 		avl_insert(&ctypes, ctn, where);
151 	}
152 	return (ctn);
153 }
154 
155 void
156 add_ctype(int val)
157 {
158 	ctype_node_t	*ctn;
159 
160 	if ((ctn = get_ctype(val)) == NULL) {
161 		INTERR;
162 		return;
163 	}
164 	add_ctype_impl(ctn);
165 	last_ctype = ctn->wc;
166 }
167 
168 void
169 add_ctype_range(int end)
170 {
171 	ctype_node_t	*ctn;
172 	wchar_t		cur;
173 
174 	if (end < last_ctype) {
175 		errf(_("malformed character range (%u ... %u))"),
176 		    last_ctype, end);
177 		return;
178 	}
179 	for (cur = last_ctype + 1; cur <= end; cur++) {
180 		if ((ctn = get_ctype(cur)) == NULL) {
181 			INTERR;
182 			return;
183 		}
184 		add_ctype_impl(ctn);
185 	}
186 	last_ctype = end;
187 
188 }
189 
190 /*
191  * A word about widths: if the width mask is specified, then libc
192  * unconditionally honors it.  Otherwise, it assumes printable
193  * characters have width 1, and non-printable characters have width
194  * -1 (except for NUL, which is special with width 0).  Hence, we have
195  * no need to inject defaults here -- the "default" unset value of 0
196  * indicates that libc should use its own logic in wcwidth as described.
197  */
198 void
199 add_width(int wc, int width)
200 {
201 	ctype_node_t	*ctn;
202 
203 	if ((ctn = get_ctype(wc)) == NULL) {
204 		INTERR;
205 		return;
206 	}
207 	ctn->ctype &= ~(_CTYPE_SWM);
208 	switch (width) {
209 	case 0:
210 		ctn->ctype |= _CTYPE_SW0;
211 		break;
212 	case 1:
213 		ctn->ctype |= _CTYPE_SW1;
214 		break;
215 	case 2:
216 		ctn->ctype |= _CTYPE_SW2;
217 		break;
218 	case 3:
219 		ctn->ctype |= _CTYPE_SW3;
220 		break;
221 	}
222 }
223 
/*
 * Record the same width for every character from start through end
 * (inclusive).  Nothing is done when start is greater than end.
 */
void
add_width_range(int start, int end, int width)
{
	int	wc = start;

	while (wc <= end) {
		add_width(wc, width);
		wc++;
	}
}
231 
232 void
233 add_caseconv(int val, int wc)
234 {
235 	ctype_node_t	*ctn;
236 
237 	ctn = get_ctype(val);
238 	if (ctn == NULL) {
239 		INTERR;
240 		return;
241 	}
242 
243 	switch (last_kw) {
244 	case T_TOUPPER:
245 		ctn->toupper = wc;
246 		break;
247 	case T_TOLOWER:
248 		ctn->tolower = wc;
249 		break;
250 	default:
251 		INTERR;
252 		break;
253 	}
254 }
255 
256 void
257 dump_ctype(void)
258 {
259 	FILE		*f;
260 	_FileRuneLocale	rl;
261 	ctype_node_t	*ctn, *last_ct, *last_lo, *last_up;
262 	_FileRuneEntry	*ct = NULL;
263 	_FileRuneEntry	*lo = NULL;
264 	_FileRuneEntry	*up = NULL;
265 	wchar_t		wc;
266 
267 	(void) memset(&rl, 0, sizeof (rl));
268 	last_ct = NULL;
269 	last_lo = NULL;
270 	last_up = NULL;
271 
272 	if ((f = open_category()) == NULL)
273 		return;
274 
275 	(void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
276 	(void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
277 
278 	/*
279 	 * Initialize the identity map.
280 	 */
281 	for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
282 		rl.maplower[wc] = wc;
283 		rl.mapupper[wc] = wc;
284 	}
285 
286 	for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) {
287 		int conflict = 0;
288 
289 
290 		wc = ctn->wc;
291 
292 		/*
293 		 * POSIX requires certain portable characters have
294 		 * certain types.  Add them if they are missing.
295 		 */
296 		if ((wc >= 1) && (wc <= 127)) {
297 			if ((wc >= 'A') && (wc <= 'Z'))
298 				ctn->ctype |= _ISUPPER;
299 			if ((wc >= 'a') && (wc <= 'z'))
300 				ctn->ctype |= _ISLOWER;
301 			if ((wc >= '0') && (wc <= '9'))
302 				ctn->ctype |= _ISDIGIT;
303 			if (wc == ' ')
304 				ctn->ctype |= _ISPRINT;
305 			if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
306 				ctn->ctype |= _ISSPACE;
307 			if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
308 				ctn->ctype |= _ISXDIGIT;
309 			if (strchr(" \t", (char)wc))
310 				ctn->ctype |= _ISBLANK;
311 
312 			/*
313 			 * Technically these settings are only
314 			 * required for the C locale.  However, it
315 			 * turns out that because of the historical
316 			 * version of isprint(), we need them for all
317 			 * locales as well.  Note that these are not
318 			 * necessarily valid punctation characters in
319 			 * the current language, but ispunct() needs
320 			 * to return TRUE for them.
321 			 */
322 			if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
323 			    (char)wc))
324 				ctn->ctype |= _ISPUNCT;
325 		}
326 
327 		/*
328 		 * POSIX also requires that certain types imply
329 		 * others.  Add any inferred types here.
330 		 */
331 		if (ctn->ctype & (_ISUPPER |_ISLOWER))
332 			ctn->ctype |= _ISALPHA;
333 		if (ctn->ctype & _ISDIGIT)
334 			ctn->ctype |= _ISXDIGIT;
335 		if (ctn->ctype & _ISBLANK)
336 			ctn->ctype |= _ISSPACE;
337 		if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
338 			ctn->ctype |= _ISGRAPH;
339 		if (ctn->ctype & _ISGRAPH)
340 			ctn->ctype |= _ISPRINT;
341 
342 		/*
343 		 * Finally, POSIX requires that certain combinations
344 		 * are invalid.  We don't flag this as a fatal error,
345 		 * but we will warn about.
346 		 */
347 		if ((ctn->ctype & _ISALPHA) &&
348 		    (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
349 			conflict++;
350 		if ((ctn->ctype & _ISPUNCT) &
351 		    (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
352 			conflict++;
353 		if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
354 			conflict++;
355 		if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
356 			conflict++;
357 		if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
358 			conflict++;
359 
360 #ifndef NATIVE
361 		if (conflict) {
362 			warn("conflicting classes for character 0x%x (%x)",
363 			    wc, ctn->ctype);
364 		}
365 #endif
366 		/*
367 		 * Handle the lower 256 characters using the simple
368 		 * optimization.  Note that if we have not defined the
369 		 * upper/lower case, then we identity map it.
370 		 */
371 		if ((unsigned)wc < _CACHED_RUNES) {
372 			rl.runetype[wc] = ctn->ctype;
373 			if (ctn->tolower)
374 				rl.maplower[wc] = ctn->tolower;
375 			if (ctn->toupper)
376 				rl.mapupper[wc] = ctn->toupper;
377 			continue;
378 		}
379 
380 		if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) {
381 			ct[rl.runetype_ext_nranges-1].max = wc;
382 			last_ct = ctn;
383 		} else {
384 			rl.runetype_ext_nranges++;
385 			ct = realloc(ct,
386 			    sizeof (*ct) * rl.runetype_ext_nranges);
387 			ct[rl.runetype_ext_nranges - 1].min = wc;
388 			ct[rl.runetype_ext_nranges - 1].max = wc;
389 			ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
390 			last_ct = ctn;
391 		}
392 		if (ctn->tolower == 0) {
393 			last_lo = NULL;
394 		} else if ((last_lo != NULL) &&
395 		    (last_lo->tolower + 1 == ctn->tolower)) {
396 			lo[rl.maplower_ext_nranges-1].max = wc;
397 			last_lo = ctn;
398 		} else {
399 			rl.maplower_ext_nranges++;
400 			lo = realloc(lo,
401 			    sizeof (*lo) * rl.maplower_ext_nranges);
402 			lo[rl.maplower_ext_nranges - 1].min = wc;
403 			lo[rl.maplower_ext_nranges - 1].max = wc;
404 			lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
405 			last_lo = ctn;
406 		}
407 
408 		if (ctn->toupper == 0) {
409 			last_up = NULL;
410 		} else if ((last_up != NULL) &&
411 		    (last_up->toupper + 1 == ctn->toupper)) {
412 			up[rl.mapupper_ext_nranges-1].max = wc;
413 			last_up = ctn;
414 		} else {
415 			rl.mapupper_ext_nranges++;
416 			up = realloc(up,
417 			    sizeof (*up) * rl.mapupper_ext_nranges);
418 			up[rl.mapupper_ext_nranges - 1].min = wc;
419 			up[rl.mapupper_ext_nranges - 1].max = wc;
420 			up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
421 			last_up = ctn;
422 		}
423 	}
424 
425 	if ((wr_category(&rl, sizeof (rl), f) < 0) ||
426 	    (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
427 	    (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
428 	    (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
429 		return;
430 	}
431 
432 	close_category(f);
433 }
434