xref: /illumos-gate/usr/src/cmd/localedef/ctype.c (revision 5aec55eb)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
14  */
15 
16 /*
17  * LC_CTYPE database generation routines for localedef.
18  */
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <sys/types.h>
24 #include <sys/avl.h>
25 #include <wchar.h>
26 #include <ctype.h>
27 #include <wctype.h>
28 #include <unistd.h>
29 #include "localedef.h"
30 #include "parser.tab.h"
31 #include "runefile.h"
32 
33 static avl_tree_t	ctypes;
34 
35 static wchar_t		last_ctype;
36 
37 typedef struct ctype_node {
38 	wchar_t wc;
39 	int32_t	ctype;
40 	int32_t	toupper;
41 	int32_t	tolower;
42 	avl_node_t avl;
43 } ctype_node_t;
44 
45 static int
46 ctype_compare(const void *n1, const void *n2)
47 {
48 	const ctype_node_t *c1 = n1;
49 	const ctype_node_t *c2 = n2;
50 
51 	return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
52 }
53 
54 void
55 init_ctype(void)
56 {
57 	avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t),
58 	    offsetof(ctype_node_t, avl));
59 }
60 
61 
62 static void
63 add_ctype_impl(ctype_node_t *ctn)
64 {
65 	switch (last_kw) {
66 	case T_ISUPPER:
67 		ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
68 		break;
69 	case T_ISLOWER:
70 		ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
71 		break;
72 	case T_ISALPHA:
73 		ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
74 		break;
75 	case T_ISDIGIT:
76 		ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT);
77 		break;
78 	case T_ISSPACE:
79 		ctn->ctype |= _ISSPACE;
80 		break;
81 	case T_ISCNTRL:
82 		ctn->ctype |= _ISCNTRL;
83 		break;
84 	case T_ISGRAPH:
85 		ctn->ctype |= (_ISGRAPH | _ISPRINT);
86 		break;
87 	case T_ISPRINT:
88 		ctn->ctype |= _ISPRINT;
89 		break;
90 	case T_ISPUNCT:
91 		ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
92 		break;
93 	case T_ISXDIGIT:
94 		ctn->ctype |= (_ISXDIGIT | _ISPRINT);
95 		break;
96 	case T_ISBLANK:
97 		ctn->ctype |= (_ISBLANK | _ISSPACE);
98 		break;
99 	case T_ISPHONOGRAM:
100 		ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
101 		break;
102 	case T_ISIDEOGRAM:
103 		ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
104 		break;
105 	case T_ISENGLISH:
106 		ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
107 		break;
108 	case T_ISNUMBER:
109 		ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
110 		break;
111 	case T_ISSPECIAL:
112 		ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
113 		break;
114 	case T_ISALNUM:
115 		/*
116 		 * We can't do anything with this.  The character
117 		 * should already be specified as a digit or alpha.
118 		 */
119 		break;
120 	default:
121 		errf(_("not a valid character class"));
122 	}
123 }
124 
125 static ctype_node_t *
126 get_ctype(wchar_t wc)
127 {
128 	ctype_node_t	srch;
129 	ctype_node_t	*ctn;
130 	avl_index_t	where;
131 
132 	srch.wc = wc;
133 	if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) {
134 		if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
135 			errf(_("out of memory"));
136 			return (NULL);
137 		}
138 		ctn->wc = wc;
139 
140 		avl_insert(&ctypes, ctn, where);
141 	}
142 	return (ctn);
143 }
144 
145 void
146 add_ctype(int val)
147 {
148 	ctype_node_t	*ctn;
149 
150 	if ((ctn = get_ctype(val)) == NULL) {
151 		INTERR;
152 		return;
153 	}
154 	add_ctype_impl(ctn);
155 	last_ctype = ctn->wc;
156 }
157 
158 void
159 add_ctype_range(int end)
160 {
161 	ctype_node_t	*ctn;
162 	wchar_t		cur;
163 
164 	if (end < last_ctype) {
165 		errf(_("malformed character range (%u ... %u))"),
166 		    last_ctype, end);
167 		return;
168 	}
169 	for (cur = last_ctype + 1; cur <= end; cur++) {
170 		if ((ctn = get_ctype(cur)) == NULL) {
171 			INTERR;
172 			return;
173 		}
174 		add_ctype_impl(ctn);
175 	}
176 	last_ctype = end;
177 
178 }
179 
180 void
181 add_caseconv(int val, int wc)
182 {
183 	ctype_node_t	*ctn;
184 
185 	ctn = get_ctype(val);
186 	if (ctn == NULL) {
187 		INTERR;
188 		return;
189 	}
190 
191 	switch (last_kw) {
192 	case T_TOUPPER:
193 		ctn->toupper = wc;
194 		break;
195 	case T_TOLOWER:
196 		ctn->tolower = wc;
197 		break;
198 	default:
199 		INTERR;
200 		break;
201 	}
202 }
203 
204 void
205 dump_ctype(void)
206 {
207 	FILE		*f;
208 	_FileRuneLocale	rl;
209 	ctype_node_t	*ctn, *last_ct, *last_lo, *last_up;
210 	_FileRuneEntry	*ct = NULL;
211 	_FileRuneEntry	*lo = NULL;
212 	_FileRuneEntry	*up = NULL;
213 
214 	(void) memset(&rl, 0, sizeof (rl));
215 	last_ct = NULL;
216 	last_lo = NULL;
217 	last_up = NULL;
218 
219 	if ((f = open_category()) == NULL)
220 		return;
221 
222 	(void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
223 	(void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
224 
225 	for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) {
226 
227 		wchar_t	wc = ctn->wc;
228 		int conflict = 0;
229 
230 		/*
231 		 * POSIX requires certain portable characters have
232 		 * certain types.  Add them if they are missing.
233 		 */
234 		if ((wc >= 1) && (wc <= 127)) {
235 			if ((wc >= 'A') && (wc <= 'Z'))
236 				ctn->ctype |= _ISUPPER;
237 			if ((wc >= 'a') && (wc <= 'z'))
238 				ctn->ctype |= _ISLOWER;
239 			if ((wc >= '0') && (wc <= '9'))
240 				ctn->ctype |= _ISDIGIT;
241 			if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
242 				ctn->ctype |= _ISSPACE;
243 			if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
244 				ctn->ctype |= _ISXDIGIT;
245 			if (strchr(" \t", (char)wc))
246 				ctn->ctype |= _ISBLANK;
247 
248 			/*
249 			 * Technically these settings are only
250 			 * required for the C locale.  However, it
251 			 * turns out that because of the historical
252 			 * version of isprint(), we need them for all
253 			 * locales as well.  Note that these are not
254 			 * necessarily valid punctation characters in
255 			 * the current language, but ispunct() needs
256 			 * to return TRUE for them.
257 			 */
258 			if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
259 			    (char)wc))
260 				ctn->ctype |= _ISPUNCT;
261 		}
262 
263 		/*
264 		 * POSIX also requires that certain types imply
265 		 * others.  Add any inferred types here.
266 		 */
267 		if (ctn->ctype & (_ISUPPER |_ISLOWER))
268 			ctn->ctype |= _ISALPHA;
269 		if (ctn->ctype & _ISDIGIT)
270 			ctn->ctype |= _ISXDIGIT;
271 		if (ctn->ctype & _ISBLANK)
272 			ctn->ctype |= _ISSPACE;
273 		if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
274 			ctn->ctype |= _ISGRAPH;
275 		if (ctn->ctype & _ISGRAPH)
276 			ctn->ctype |= _ISPRINT;
277 
278 		/*
279 		 * Finally, POSIX requires that certain combinations
280 		 * are invalid.  We don't flag this as a fatal error,
281 		 * but we will warn about.
282 		 */
283 		if ((ctn->ctype & _ISALPHA) &&
284 		    (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
285 			conflict++;
286 		if ((ctn->ctype & _ISPUNCT) &
287 		    (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
288 			conflict++;
289 		if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
290 			conflict++;
291 		if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
292 			conflict++;
293 		if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
294 			conflict++;
295 
296 		if (conflict) {
297 			warn("conflicting classes for character 0x%x (%x)",
298 			    wc, ctn->ctype);
299 		}
300 		/*
301 		 * Handle the lower 256 characters using the simple
302 		 * optimization.  Note that if we have not defined the
303 		 * upper/lower case, then we identity map it.
304 		 */
305 		if (wc < _CACHED_RUNES) {
306 			rl.runetype[wc] = ctn->ctype;
307 			rl.maplower[wc] = ctn->tolower ? ctn->tolower : wc;
308 			rl.mapupper[wc] = ctn->toupper ? ctn->toupper : wc;
309 			continue;
310 		}
311 
312 		if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) {
313 			ct[rl.runetype_ext_nranges-1].max = wc;
314 			last_ct = ctn;
315 		} else {
316 			rl.runetype_ext_nranges++;
317 			ct = realloc(ct,
318 			    sizeof (*ct) * rl.runetype_ext_nranges);
319 			ct[rl.runetype_ext_nranges - 1].min = wc;
320 			ct[rl.runetype_ext_nranges - 1].max = wc;
321 			ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
322 			last_ct = ctn;
323 		}
324 		if (ctn->toupper == 0) {
325 			last_up = NULL;
326 		} else if ((last_lo != NULL) &&
327 		    (last_lo->tolower + 1 == ctn->tolower)) {
328 			lo[rl.maplower_ext_nranges-1].max = wc;
329 			last_lo = ctn;
330 		} else {
331 			rl.maplower_ext_nranges++;
332 			lo = realloc(lo,
333 			    sizeof (*lo) * rl.maplower_ext_nranges);
334 			lo[rl.maplower_ext_nranges - 1].min = wc;
335 			lo[rl.maplower_ext_nranges - 1].max = wc;
336 			lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
337 			last_lo = ctn;
338 		}
339 
340 		if (ctn->toupper == 0) {
341 			last_up = NULL;
342 		} else if ((last_up != NULL) &&
343 		    (last_up->toupper + 1 == ctn->toupper)) {
344 			up[rl.mapupper_ext_nranges-1].max = wc;
345 			last_up = ctn;
346 		} else {
347 			rl.mapupper_ext_nranges++;
348 			up = realloc(up,
349 			    sizeof (*up) * rl.mapupper_ext_nranges);
350 			up[rl.mapupper_ext_nranges - 1].min = wc;
351 			up[rl.mapupper_ext_nranges - 1].max = wc;
352 			up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
353 			last_up = ctn;
354 		}
355 	}
356 
357 	if ((wr_category(&rl, sizeof (rl), f) < 0) ||
358 	    (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
359 	    (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
360 	    (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
361 		return;
362 	}
363 
364 	close_category(f);
365 }
366