xref: /illumos-gate/usr/src/cmd/localedef/ctype.c (revision 6125cca6)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2010,2011 Nexenta Systems, Inc.  All rights reserved.
14  * Copyright 2012 Garrett D'Amore <garrett@damore.org>  All rights reserved.
15  */
16 
17 /*
18  * LC_CTYPE database generation routines for localedef.
19  */
20 
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <sys/types.h>
25 #include <sys/avl.h>
26 #include <wchar.h>
27 #include <ctype.h>
28 #include <wctype.h>
29 #include <unistd.h>
30 #include "localedef.h"
31 #include "parser.tab.h"
32 #include "runefile.h"
33 
34 static avl_tree_t	ctypes;
35 
36 static wchar_t		last_ctype;
37 
38 typedef struct ctype_node {
39 	wchar_t wc;
40 	int32_t	ctype;
41 	int32_t	toupper;
42 	int32_t	tolower;
43 	avl_node_t avl;
44 } ctype_node_t;
45 
46 static int
47 ctype_compare(const void *n1, const void *n2)
48 {
49 	const ctype_node_t *c1 = n1;
50 	const ctype_node_t *c2 = n2;
51 
52 	return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
53 }
54 
55 void
56 init_ctype(void)
57 {
58 	avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t),
59 	    offsetof(ctype_node_t, avl));
60 }
61 
62 
63 static void
64 add_ctype_impl(ctype_node_t *ctn)
65 {
66 	switch (last_kw) {
67 	case T_ISUPPER:
68 		ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
69 		break;
70 	case T_ISLOWER:
71 		ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
72 		break;
73 	case T_ISALPHA:
74 		ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
75 		break;
76 	case T_ISDIGIT:
77 		ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT);
78 		break;
79 	case T_ISSPACE:
80 		ctn->ctype |= _ISSPACE;
81 		break;
82 	case T_ISCNTRL:
83 		ctn->ctype |= _ISCNTRL;
84 		break;
85 	case T_ISGRAPH:
86 		ctn->ctype |= (_ISGRAPH | _ISPRINT);
87 		break;
88 	case T_ISPRINT:
89 		ctn->ctype |= _ISPRINT;
90 		break;
91 	case T_ISPUNCT:
92 		ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
93 		break;
94 	case T_ISXDIGIT:
95 		ctn->ctype |= (_ISXDIGIT | _ISPRINT);
96 		break;
97 	case T_ISBLANK:
98 		ctn->ctype |= (_ISBLANK | _ISSPACE);
99 		break;
100 	case T_ISPHONOGRAM:
101 		ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
102 		break;
103 	case T_ISIDEOGRAM:
104 		ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
105 		break;
106 	case T_ISENGLISH:
107 		ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
108 		break;
109 	case T_ISNUMBER:
110 		ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
111 		break;
112 	case T_ISSPECIAL:
113 		ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
114 		break;
115 	case T_ISALNUM:
116 		/*
117 		 * We can't do anything with this.  The character
118 		 * should already be specified as a digit or alpha.
119 		 */
120 		break;
121 	default:
122 		errf(_("not a valid character class"));
123 	}
124 }
125 
126 static ctype_node_t *
127 get_ctype(wchar_t wc)
128 {
129 	ctype_node_t	srch;
130 	ctype_node_t	*ctn;
131 	avl_index_t	where;
132 
133 	srch.wc = wc;
134 	if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) {
135 		if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
136 			errf(_("out of memory"));
137 			return (NULL);
138 		}
139 		ctn->wc = wc;
140 
141 		avl_insert(&ctypes, ctn, where);
142 	}
143 	return (ctn);
144 }
145 
146 void
147 add_ctype(int val)
148 {
149 	ctype_node_t	*ctn;
150 
151 	if ((ctn = get_ctype(val)) == NULL) {
152 		INTERR;
153 		return;
154 	}
155 	add_ctype_impl(ctn);
156 	last_ctype = ctn->wc;
157 }
158 
159 void
160 add_ctype_range(int end)
161 {
162 	ctype_node_t	*ctn;
163 	wchar_t		cur;
164 
165 	if (end < last_ctype) {
166 		errf(_("malformed character range (%u ... %u))"),
167 		    last_ctype, end);
168 		return;
169 	}
170 	for (cur = last_ctype + 1; cur <= end; cur++) {
171 		if ((ctn = get_ctype(cur)) == NULL) {
172 			INTERR;
173 			return;
174 		}
175 		add_ctype_impl(ctn);
176 	}
177 	last_ctype = end;
178 
179 }
180 
181 void
182 add_caseconv(int val, int wc)
183 {
184 	ctype_node_t	*ctn;
185 
186 	ctn = get_ctype(val);
187 	if (ctn == NULL) {
188 		INTERR;
189 		return;
190 	}
191 
192 	switch (last_kw) {
193 	case T_TOUPPER:
194 		ctn->toupper = wc;
195 		break;
196 	case T_TOLOWER:
197 		ctn->tolower = wc;
198 		break;
199 	default:
200 		INTERR;
201 		break;
202 	}
203 }
204 
205 void
206 dump_ctype(void)
207 {
208 	FILE		*f;
209 	_FileRuneLocale	rl;
210 	ctype_node_t	*ctn, *last_ct, *last_lo, *last_up;
211 	_FileRuneEntry	*ct = NULL;
212 	_FileRuneEntry	*lo = NULL;
213 	_FileRuneEntry	*up = NULL;
214 	wchar_t		wc;
215 
216 	(void) memset(&rl, 0, sizeof (rl));
217 	last_ct = NULL;
218 	last_lo = NULL;
219 	last_up = NULL;
220 
221 	if ((f = open_category()) == NULL)
222 		return;
223 
224 	(void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
225 	(void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
226 
227 	/*
228 	 * Initialize the identity map.
229 	 */
230 	for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
231 		rl.maplower[wc] = wc;
232 		rl.mapupper[wc] = wc;
233 	}
234 
235 	for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) {
236 		int conflict = 0;
237 
238 		wc = ctn->wc;
239 
240 		/*
241 		 * POSIX requires certain portable characters have
242 		 * certain types.  Add them if they are missing.
243 		 */
244 		if ((wc >= 1) && (wc <= 127)) {
245 			if ((wc >= 'A') && (wc <= 'Z'))
246 				ctn->ctype |= _ISUPPER;
247 			if ((wc >= 'a') && (wc <= 'z'))
248 				ctn->ctype |= _ISLOWER;
249 			if ((wc >= '0') && (wc <= '9'))
250 				ctn->ctype |= _ISDIGIT;
251 			if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
252 				ctn->ctype |= _ISSPACE;
253 			if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
254 				ctn->ctype |= _ISXDIGIT;
255 			if (strchr(" \t", (char)wc))
256 				ctn->ctype |= _ISBLANK;
257 
258 			/*
259 			 * Technically these settings are only
260 			 * required for the C locale.  However, it
261 			 * turns out that because of the historical
262 			 * version of isprint(), we need them for all
263 			 * locales as well.  Note that these are not
264 			 * necessarily valid punctation characters in
265 			 * the current language, but ispunct() needs
266 			 * to return TRUE for them.
267 			 */
268 			if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
269 			    (char)wc))
270 				ctn->ctype |= _ISPUNCT;
271 		}
272 
273 		/*
274 		 * POSIX also requires that certain types imply
275 		 * others.  Add any inferred types here.
276 		 */
277 		if (ctn->ctype & (_ISUPPER |_ISLOWER))
278 			ctn->ctype |= _ISALPHA;
279 		if (ctn->ctype & _ISDIGIT)
280 			ctn->ctype |= _ISXDIGIT;
281 		if (ctn->ctype & _ISBLANK)
282 			ctn->ctype |= _ISSPACE;
283 		if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
284 			ctn->ctype |= _ISGRAPH;
285 		if (ctn->ctype & _ISGRAPH)
286 			ctn->ctype |= _ISPRINT;
287 
288 		/*
289 		 * Finally, POSIX requires that certain combinations
290 		 * are invalid.  We don't flag this as a fatal error,
291 		 * but we will warn about.
292 		 */
293 		if ((ctn->ctype & _ISALPHA) &&
294 		    (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
295 			conflict++;
296 		if ((ctn->ctype & _ISPUNCT) &
297 		    (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
298 			conflict++;
299 		if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
300 			conflict++;
301 		if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
302 			conflict++;
303 		if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
304 			conflict++;
305 
306 		if (conflict) {
307 			warn("conflicting classes for character 0x%x (%x)",
308 			    wc, ctn->ctype);
309 		}
310 		/*
311 		 * Handle the lower 256 characters using the simple
312 		 * optimization.  Note that if we have not defined the
313 		 * upper/lower case, then we identity map it.
314 		 */
315 		if ((unsigned)wc < _CACHED_RUNES) {
316 			rl.runetype[wc] = ctn->ctype;
317 			if (ctn->tolower)
318 				rl.maplower[wc] = ctn->tolower;
319 			if (ctn->toupper)
320 				rl.mapupper[wc] = ctn->toupper;
321 			continue;
322 		}
323 
324 		if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) {
325 			ct[rl.runetype_ext_nranges-1].max = wc;
326 			last_ct = ctn;
327 		} else {
328 			rl.runetype_ext_nranges++;
329 			ct = realloc(ct,
330 			    sizeof (*ct) * rl.runetype_ext_nranges);
331 			ct[rl.runetype_ext_nranges - 1].min = wc;
332 			ct[rl.runetype_ext_nranges - 1].max = wc;
333 			ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
334 			last_ct = ctn;
335 		}
336 		if (ctn->tolower == 0) {
337 			last_lo = NULL;
338 		} else if ((last_lo != NULL) &&
339 		    (last_lo->tolower + 1 == ctn->tolower)) {
340 			lo[rl.maplower_ext_nranges-1].max = wc;
341 			last_lo = ctn;
342 		} else {
343 			rl.maplower_ext_nranges++;
344 			lo = realloc(lo,
345 			    sizeof (*lo) * rl.maplower_ext_nranges);
346 			lo[rl.maplower_ext_nranges - 1].min = wc;
347 			lo[rl.maplower_ext_nranges - 1].max = wc;
348 			lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
349 			last_lo = ctn;
350 		}
351 
352 		if (ctn->toupper == 0) {
353 			last_up = NULL;
354 		} else if ((last_up != NULL) &&
355 		    (last_up->toupper + 1 == ctn->toupper)) {
356 			up[rl.mapupper_ext_nranges-1].max = wc;
357 			last_up = ctn;
358 		} else {
359 			rl.mapupper_ext_nranges++;
360 			up = realloc(up,
361 			    sizeof (*up) * rl.mapupper_ext_nranges);
362 			up[rl.mapupper_ext_nranges - 1].min = wc;
363 			up[rl.mapupper_ext_nranges - 1].max = wc;
364 			up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
365 			last_up = ctn;
366 		}
367 	}
368 
369 	if ((wr_category(&rl, sizeof (rl), f) < 0) ||
370 	    (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
371 	    (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
372 	    (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
373 		return;
374 	}
375 
376 	close_category(f);
377 }
378