xref: /illumos-gate/usr/src/cmd/localedef/ctype.c (revision 6cf13876)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2017 Nexenta Systems, Inc.
14  * Copyright 2012 Garrett D'Amore <garrett@damore.org>
15  * Copyright 2013 DEY Storage Systems, Inc.
16  */
17 
18 /*
19  * LC_CTYPE database generation routines for localedef.
20  */
21 
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <sys/types.h>
26 #include <sys/avl.h>
27 #include <wchar.h>
28 #include <ctype.h>
29 #include <wctype.h>
30 #include <unistd.h>
31 #include "_ctype.h"
32 #include "localedef.h"
33 #include "parser.tab.h"
34 #include "runefile.h"
35 
36 static avl_tree_t	ctypes;
37 
38 static wchar_t		last_ctype;
39 
40 typedef struct ctype_node {
41 	wchar_t wc;
42 	int32_t	ctype;
43 	int32_t	toupper;
44 	int32_t	tolower;
45 	avl_node_t avl;
46 } ctype_node_t;
47 
48 typedef struct width_node {
49 	wchar_t start;
50 	wchar_t end;
51 	int8_t width;
52 	avl_node_t avl;
53 } width_node_t;
54 
55 static int
ctype_compare(const void * n1,const void * n2)56 ctype_compare(const void *n1, const void *n2)
57 {
58 	const ctype_node_t *c1 = n1;
59 	const ctype_node_t *c2 = n2;
60 
61 	return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
62 }
63 
64 void
init_ctype(void)65 init_ctype(void)
66 {
67 	avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t),
68 	    offsetof(ctype_node_t, avl));
69 }
70 
71 
72 static void
add_ctype_impl(ctype_node_t * ctn)73 add_ctype_impl(ctype_node_t *ctn)
74 {
75 	switch (last_kw) {
76 	case T_ISUPPER:
77 		ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
78 		break;
79 	case T_ISLOWER:
80 		ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
81 		break;
82 	case T_ISALPHA:
83 		ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
84 		break;
85 	case T_ISDIGIT:
86 		ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT);
87 		break;
88 	case T_ISSPACE:
89 		ctn->ctype |= _ISSPACE;
90 		break;
91 	case T_ISCNTRL:
92 		ctn->ctype |= _ISCNTRL;
93 		break;
94 	case T_ISGRAPH:
95 		ctn->ctype |= (_ISGRAPH | _ISPRINT);
96 		break;
97 	case T_ISPRINT:
98 		ctn->ctype |= _ISPRINT;
99 		break;
100 	case T_ISPUNCT:
101 		ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
102 		break;
103 	case T_ISXDIGIT:
104 		ctn->ctype |= (_ISXDIGIT | _ISPRINT);
105 		break;
106 	case T_ISBLANK:
107 		ctn->ctype |= (_ISBLANK | _ISSPACE);
108 		break;
109 	case T_ISPHONOGRAM:
110 		ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
111 		break;
112 	case T_ISIDEOGRAM:
113 		ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
114 		break;
115 	case T_ISENGLISH:
116 		ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
117 		break;
118 	case T_ISNUMBER:
119 		ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
120 		break;
121 	case T_ISSPECIAL:
122 		ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
123 		break;
124 	case T_ISALNUM:
125 		/*
126 		 * We can't do anything with this.  The character
127 		 * should already be specified as a digit or alpha.
128 		 */
129 		break;
130 	default:
131 		errf(_("not a valid character class"));
132 	}
133 }
134 
135 static ctype_node_t *
get_ctype(wchar_t wc)136 get_ctype(wchar_t wc)
137 {
138 	ctype_node_t	srch;
139 	ctype_node_t	*ctn;
140 	avl_index_t	where;
141 
142 	srch.wc = wc;
143 	if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) {
144 		if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
145 			errf(_("out of memory"));
146 			return (NULL);
147 		}
148 		ctn->wc = wc;
149 
150 		avl_insert(&ctypes, ctn, where);
151 	}
152 	return (ctn);
153 }
154 
155 void
add_ctype(int val)156 add_ctype(int val)
157 {
158 	ctype_node_t	*ctn;
159 
160 	if ((ctn = get_ctype(val)) == NULL) {
161 		INTERR;
162 		return;
163 	}
164 	add_ctype_impl(ctn);
165 	last_ctype = ctn->wc;
166 }
167 
168 void
add_ctype_range(wchar_t end)169 add_ctype_range(wchar_t end)
170 {
171 	ctype_node_t	*ctn;
172 	wchar_t		cur;
173 
174 	if (end < last_ctype) {
175 		errf(_("malformed character range (%u ... %u))"),
176 		    last_ctype, end);
177 		return;
178 	}
179 	for (cur = last_ctype + 1; cur <= end; cur++) {
180 		if ((ctn = get_ctype(cur)) == NULL) {
181 			INTERR;
182 			return;
183 		}
184 		add_ctype_impl(ctn);
185 	}
186 	last_ctype = end;
187 
188 }
189 
190 /*
191  * A word about widths: if the width mask is specified, then libc
192  * unconditionally honors it.  Otherwise, it assumes printable
193  * characters have width 1, and non-printable characters have width
194  * -1 (except for NULL which is special with with 0).  Hence, we have
195  * no need to inject defaults here -- the "default" unset value of 0
196  * indicates that libc should use its own logic in wcwidth as described.
197  */
198 void
add_width(int wc,int width)199 add_width(int wc, int width)
200 {
201 	ctype_node_t	*ctn;
202 
203 	if ((ctn = get_ctype(wc)) == NULL) {
204 		INTERR;
205 		return;
206 	}
207 	ctn->ctype &= ~(_CTYPE_SWM);
208 	switch (width) {
209 	case 0:
210 		ctn->ctype |= _CTYPE_SW0;
211 		break;
212 	case 1:
213 		ctn->ctype |= _CTYPE_SW1;
214 		break;
215 	case 2:
216 		ctn->ctype |= _CTYPE_SW2;
217 		break;
218 	case 3:
219 		ctn->ctype |= _CTYPE_SW3;
220 		break;
221 	}
222 }
223 
/*
 * Record the same width for every character from start through end,
 * inclusive.  Does nothing when start is greater than end.
 */
void
add_width_range(int start, int end, int width)
{
	int	wc = start;

	while (wc <= end) {
		add_width(wc, width);
		wc++;
	}
}
231 
232 void
add_caseconv(int val,int wc)233 add_caseconv(int val, int wc)
234 {
235 	ctype_node_t	*ctn;
236 
237 	ctn = get_ctype(val);
238 	if (ctn == NULL) {
239 		INTERR;
240 		return;
241 	}
242 
243 	switch (last_kw) {
244 	case T_TOUPPER:
245 		ctn->toupper = wc;
246 		break;
247 	case T_TOLOWER:
248 		ctn->tolower = wc;
249 		break;
250 	default:
251 		INTERR;
252 		break;
253 	}
254 }
255 
256 void
dump_ctype(void)257 dump_ctype(void)
258 {
259 	FILE		*f;
260 	_FileRuneLocale	rl;
261 	ctype_node_t	*ctn, *last_ct, *last_lo, *last_up;
262 	_FileRuneEntry	*ct = NULL;
263 	_FileRuneEntry	*lo = NULL;
264 	_FileRuneEntry	*up = NULL;
265 	wchar_t		wc;
266 
267 	(void) memset(&rl, 0, sizeof (rl));
268 	last_ct = NULL;
269 	last_lo = NULL;
270 	last_up = NULL;
271 
272 	if ((f = open_category()) == NULL)
273 		return;
274 
275 	(void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
276 	(void) strlcpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
277 
278 	/*
279 	 * Initialize the identity map.
280 	 */
281 	for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
282 		rl.maplower[wc] = wc;
283 		rl.mapupper[wc] = wc;
284 	}
285 
286 	for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) {
287 		int conflict = 0;
288 
289 
290 		wc = ctn->wc;
291 
292 		/*
293 		 * POSIX requires certain portable characters have
294 		 * certain types.  Add them if they are missing.
295 		 */
296 		if ((wc >= 1) && (wc <= 127)) {
297 			if ((wc >= 'A') && (wc <= 'Z'))
298 				ctn->ctype |= _ISUPPER;
299 			if ((wc >= 'a') && (wc <= 'z'))
300 				ctn->ctype |= _ISLOWER;
301 			if ((wc >= '0') && (wc <= '9'))
302 				ctn->ctype |= _ISDIGIT;
303 			if (wc == ' ')
304 				ctn->ctype |= _ISPRINT;
305 			if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
306 				ctn->ctype |= _ISSPACE;
307 			if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
308 				ctn->ctype |= _ISXDIGIT;
309 			if (strchr(" \t", (char)wc))
310 				ctn->ctype |= _ISBLANK;
311 
312 			/*
313 			 * Technically these settings are only
314 			 * required for the C locale.  However, it
315 			 * turns out that because of the historical
316 			 * version of isprint(), we need them for all
317 			 * locales as well.  Note that these are not
318 			 * necessarily valid punctation characters in
319 			 * the current language, but ispunct() needs
320 			 * to return TRUE for them.
321 			 */
322 			if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
323 			    (char)wc))
324 				ctn->ctype |= _ISPUNCT;
325 		}
326 
327 		/*
328 		 * POSIX also requires that certain types imply
329 		 * others.  Add any inferred types here.
330 		 */
331 		if (ctn->ctype & (_ISUPPER |_ISLOWER))
332 			ctn->ctype |= _ISALPHA;
333 		if (ctn->ctype & _ISDIGIT)
334 			ctn->ctype |= _ISXDIGIT;
335 		if (ctn->ctype & _ISBLANK)
336 			ctn->ctype |= _ISSPACE;
337 		if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
338 			ctn->ctype |= _ISGRAPH;
339 		if (ctn->ctype & _ISGRAPH)
340 			ctn->ctype |= _ISPRINT;
341 
342 		/*
343 		 * Finally, POSIX requires that certain combinations
344 		 * are invalid.  We don't flag this as a fatal error,
345 		 * but we will warn about.
346 		 */
347 		if ((ctn->ctype & _ISALPHA) &&
348 		    (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
349 			conflict++;
350 		if ((ctn->ctype & _ISPUNCT) &&
351 		    (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
352 			conflict++;
353 		if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
354 			conflict++;
355 		if ((ctn->ctype & _ISCNTRL) && (ctn->ctype & _ISPRINT))
356 			conflict++;
357 		if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
358 			conflict++;
359 
360 		if (conflict) {
361 			warn("conflicting classes for character 0x%x (%x)",
362 			    wc, ctn->ctype);
363 		}
364 
365 		/*
366 		 * Handle the lower 256 characters using the simple
367 		 * optimization.  Note that if we have not defined the
368 		 * upper/lower case, then we identity map it.
369 		 */
370 		if ((unsigned)wc < _CACHED_RUNES) {
371 			rl.runetype[wc] = ctn->ctype;
372 			if (ctn->tolower)
373 				rl.maplower[wc] = ctn->tolower;
374 			if (ctn->toupper)
375 				rl.mapupper[wc] = ctn->toupper;
376 			continue;
377 		}
378 
379 		if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype) &&
380 		    (last_ct->wc + 1 == wc)) {
381 			ct[rl.runetype_ext_nranges-1].max = wc;
382 		} else {
383 			rl.runetype_ext_nranges++;
384 			ct = realloc(ct,
385 			    sizeof (*ct) * rl.runetype_ext_nranges);
386 			if (ct == NULL)
387 				goto fail;
388 			ct[rl.runetype_ext_nranges - 1].min = wc;
389 			ct[rl.runetype_ext_nranges - 1].max = wc;
390 			ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
391 		}
392 		last_ct = ctn;
393 		if (ctn->tolower == 0) {
394 			last_lo = NULL;
395 		} else if ((last_lo != NULL) &&
396 		    (last_lo->tolower + 1 == ctn->tolower)) {
397 			lo[rl.maplower_ext_nranges-1].max = wc;
398 			last_lo = ctn;
399 		} else {
400 			rl.maplower_ext_nranges++;
401 			lo = realloc(lo,
402 			    sizeof (*lo) * rl.maplower_ext_nranges);
403 			if (lo == NULL)
404 				goto fail;
405 			lo[rl.maplower_ext_nranges - 1].min = wc;
406 			lo[rl.maplower_ext_nranges - 1].max = wc;
407 			lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
408 			last_lo = ctn;
409 		}
410 
411 		if (ctn->toupper == 0) {
412 			last_up = NULL;
413 		} else if ((last_up != NULL) &&
414 		    (last_up->toupper + 1 == ctn->toupper)) {
415 			up[rl.mapupper_ext_nranges-1].max = wc;
416 			last_up = ctn;
417 		} else {
418 			rl.mapupper_ext_nranges++;
419 			up = realloc(up,
420 			    sizeof (*up) * rl.mapupper_ext_nranges);
421 			if (up == NULL)
422 				goto fail;
423 			up[rl.mapupper_ext_nranges - 1].min = wc;
424 			up[rl.mapupper_ext_nranges - 1].max = wc;
425 			up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
426 			last_up = ctn;
427 		}
428 	}
429 
430 	if ((wr_category(&rl, sizeof (rl), f) == 0) &&
431 	    (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) == 0) &&
432 	    (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) == 0) &&
433 	    (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) == 0)) {
434 		close_category(f);
435 		goto out;
436 	}
437 
438 fail:
439 	delete_category(f);
440 out:
441 	free(ct);
442 	free(lo);
443 	free(up);
444 }
445