xref: /illumos-gate/usr/src/cmd/iconv/charmap.c (revision 48edc7cf)
1*48edc7cfSGordon Ross /*
2*48edc7cfSGordon Ross  * This file and its contents are supplied under the terms of the
3*48edc7cfSGordon Ross  * Common Development and Distribution License ("CDDL"), version 1.0.
4*48edc7cfSGordon Ross  * You may only use this file in accordance with the terms of version
5*48edc7cfSGordon Ross  * 1.0 of the CDDL.
6*48edc7cfSGordon Ross  *
7*48edc7cfSGordon Ross  * A full copy of the text of the CDDL should have accompanied this
8*48edc7cfSGordon Ross  * source.  A copy of the CDDL is also available via the Internet at
9*48edc7cfSGordon Ross  * http://www.illumos.org/license/CDDL.
10*48edc7cfSGordon Ross  */
11*48edc7cfSGordon Ross 
12*48edc7cfSGordon Ross /*
13*48edc7cfSGordon Ross  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
14*48edc7cfSGordon Ross  */
15*48edc7cfSGordon Ross 
16*48edc7cfSGordon Ross /*
17*48edc7cfSGordon Ross  * CHARMAP file handling for iconv.
18*48edc7cfSGordon Ross  */
19*48edc7cfSGordon Ross 
20*48edc7cfSGordon Ross #include <stdio.h>
21*48edc7cfSGordon Ross #include <stdlib.h>
22*48edc7cfSGordon Ross #include <string.h>
23*48edc7cfSGordon Ross #include <errno.h>
24*48edc7cfSGordon Ross #include <limits.h>
25*48edc7cfSGordon Ross #include <unistd.h>
26*48edc7cfSGordon Ross #include <alloca.h>
27*48edc7cfSGordon Ross #include <sys/avl.h>
28*48edc7cfSGordon Ross #include <stddef.h>
29*48edc7cfSGordon Ross #include <unistd.h>
30*48edc7cfSGordon Ross #include "charmap.h"
31*48edc7cfSGordon Ross #include "parser.tab.h"
32*48edc7cfSGordon Ross #include <assert.h>
33*48edc7cfSGordon Ross 
34*48edc7cfSGordon Ross enum cmap_pass cmap_pass;
35*48edc7cfSGordon Ross static avl_tree_t	cmap_sym;
36*48edc7cfSGordon Ross static avl_tree_t	cmap_mbs;
37*48edc7cfSGordon Ross 
38*48edc7cfSGordon Ross typedef struct charmap {
39*48edc7cfSGordon Ross 	const char *cm_name;
40*48edc7cfSGordon Ross 	struct charmap *cm_alias_of;
41*48edc7cfSGordon Ross 	avl_node_t cm_avl_sym;
42*48edc7cfSGordon Ross 	avl_node_t cm_avl_mbs;
43*48edc7cfSGordon Ross 	int cm_warned;
44*48edc7cfSGordon Ross 	int cm_frmbs_len;
45*48edc7cfSGordon Ross 	int cm_tombs_len;
46*48edc7cfSGordon Ross 	char cm_frmbs[MB_LEN_MAX + 1]; /* input */
47*48edc7cfSGordon Ross 	char cm_tombs[MB_LEN_MAX + 1]; /* output */
48*48edc7cfSGordon Ross } charmap_t;
49*48edc7cfSGordon Ross 
/* Per-pass workers behind add_charmap_impl(); defined further down. */
static void add_charmap_impl_fr(char *sym, char *mbs, int mbs_len, int nodups);
static void add_charmap_impl_to(char *sym, char *mbs, int mbs_len, int nodups);
52*48edc7cfSGordon Ross 
/*
 * Array of POSIX specific portable characters.
 *
 * Each row pairs a symbolic name with its single-byte value.  Several
 * characters appear under more than one name (e.g. "slash" and
 * "solidus").  The table is terminated by an entry whose name is NULL.
 */
static const struct {
	char	*name;
	int	ch;
} portable_chars[] = {
	{ "NUL",		'\0' },
	{ "alert",		'\a' },
	{ "backspace",		'\b' },
	{ "tab",		'\t' },
	{ "carriage-return",	'\r' },
	{ "newline",		'\n' },
	{ "vertical-tab",	'\v' },
	{ "form-feed",		'\f' },
	{ "space",		' ' },
	{ "exclamation-mark",	'!' },
	{ "quotation-mark",	'"' },
	{ "number-sign",	'#' },
	{ "dollar-sign",	'$' },
	{ "percent-sign",	'%' },
	{ "ampersand",		'&' },
	{ "apostrophe",		'\'' },
	{ "left-parenthesis",	'(' },
	/* Was '(' -- a copy/paste slip from the line above. */
	{ "right-parenthesis",	')' },
	{ "asterisk",		'*' },
	{ "plus-sign",		'+' },
	{ "comma",		',' },
	{ "hyphen-minus",	'-' },
	{ "hyphen",		'-' },
	{ "full-stop",		'.' },
	{ "period",		'.' },
	{ "slash",		'/' },
	{ "solidus",		'/' },
	{ "zero",		'0' },
	{ "one",		'1' },
	{ "two",		'2' },
	{ "three",		'3' },
	{ "four",		'4' },
	{ "five",		'5' },
	{ "six",		'6' },
	{ "seven",		'7' },
	{ "eight",		'8' },
	{ "nine",		'9' },
	{ "colon",		':' },
	{ "semicolon",		';' },
	{ "less-than-sign",	'<' },
	{ "equals-sign",	'=' },
	{ "greater-than-sign",	'>' },
	{ "question-mark",	'?' },
	{ "commercial-at",	'@' },
	{ "left-square-bracket", '[' },
	{ "backslash",		'\\' },
	{ "reverse-solidus",	'\\' },
	{ "right-square-bracket", ']' },
	{ "circumflex",		'^' },
	{ "circumflex-accent",	'^' },
	{ "low-line",		'_' },
	{ "underscore",		'_' },
	{ "grave-accent",	'`' },
	{ "left-brace",		'{' },
	{ "left-curly-bracket",	'{' },
	{ "vertical-line",	'|' },
	{ "right-brace",	'}' },
	{ "right-curly-bracket", '}' },
	{ "tilde",		'~' },
	{ "A", 'A' },
	{ "B", 'B' },
	{ "C", 'C' },
	{ "D", 'D' },
	{ "E", 'E' },
	{ "F", 'F' },
	{ "G", 'G' },
	{ "H", 'H' },
	{ "I", 'I' },
	{ "J", 'J' },
	{ "K", 'K' },
	{ "L", 'L' },
	{ "M", 'M' },
	{ "N", 'N' },
	{ "O", 'O' },
	{ "P", 'P' },
	{ "Q", 'Q' },
	{ "R", 'R' },
	{ "S", 'S' },
	{ "T", 'T' },
	{ "U", 'U' },
	{ "V", 'V' },
	{ "W", 'W' },
	{ "X", 'X' },
	{ "Y", 'Y' },
	{ "Z", 'Z' },
	{ "a", 'a' },
	{ "b", 'b' },
	{ "c", 'c' },
	{ "d", 'd' },
	{ "e", 'e' },
	{ "f", 'f' },
	{ "g", 'g' },
	{ "h", 'h' },
	{ "i", 'i' },
	{ "j", 'j' },
	{ "k", 'k' },
	{ "l", 'l' },
	{ "m", 'm' },
	{ "n", 'n' },
	{ "o", 'o' },
	{ "p", 'p' },
	{ "q", 'q' },
	{ "r", 'r' },
	{ "s", 's' },
	{ "t", 't' },
	{ "u", 'u' },
	{ "v", 'v' },
	{ "w", 'w' },
	{ "x", 'x' },
	{ "y", 'y' },
	{ "z", 'z' },
	{ NULL, 0 }
};
173*48edc7cfSGordon Ross 
174*48edc7cfSGordon Ross static int
cmap_compare_sym(const void * n1,const void * n2)175*48edc7cfSGordon Ross cmap_compare_sym(const void *n1, const void *n2)
176*48edc7cfSGordon Ross {
177*48edc7cfSGordon Ross 	const charmap_t *c1 = n1;
178*48edc7cfSGordon Ross 	const charmap_t *c2 = n2;
179*48edc7cfSGordon Ross 	int rv;
180*48edc7cfSGordon Ross 
181*48edc7cfSGordon Ross 	rv = strcmp(c1->cm_name, c2->cm_name);
182*48edc7cfSGordon Ross 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
183*48edc7cfSGordon Ross }
184*48edc7cfSGordon Ross 
185*48edc7cfSGordon Ross /*
186*48edc7cfSGordon Ross  * In order for partial match searches to work,
187*48edc7cfSGordon Ross  * we need these sorted by mbs contents.
188*48edc7cfSGordon Ross  */
189*48edc7cfSGordon Ross static int
cmap_compare_mbs(const void * n1,const void * n2)190*48edc7cfSGordon Ross cmap_compare_mbs(const void *n1, const void *n2)
191*48edc7cfSGordon Ross {
192*48edc7cfSGordon Ross 	const charmap_t *c1 = n1;
193*48edc7cfSGordon Ross 	const charmap_t *c2 = n2;
194*48edc7cfSGordon Ross 	int len, rv;
195*48edc7cfSGordon Ross 
196*48edc7cfSGordon Ross 	len = c1->cm_frmbs_len;
197*48edc7cfSGordon Ross 	if (len < c2->cm_frmbs_len)
198*48edc7cfSGordon Ross 		len = c2->cm_frmbs_len;
199*48edc7cfSGordon Ross 	rv = memcmp(c1->cm_frmbs, c2->cm_frmbs, len);
200*48edc7cfSGordon Ross 	if (rv < 0)
201*48edc7cfSGordon Ross 		return (-1);
202*48edc7cfSGordon Ross 	if (rv > 0)
203*48edc7cfSGordon Ross 		return (1);
204*48edc7cfSGordon Ross 	/* they match through length */
205*48edc7cfSGordon Ross 	if (c1->cm_frmbs_len < c2->cm_frmbs_len)
206*48edc7cfSGordon Ross 		return (-1);
207*48edc7cfSGordon Ross 	if (c2->cm_frmbs_len < c1->cm_frmbs_len)
208*48edc7cfSGordon Ross 		return (1);
209*48edc7cfSGordon Ross 	return (0);
210*48edc7cfSGordon Ross }
211*48edc7cfSGordon Ross 
212*48edc7cfSGordon Ross void
charmap_init(char * to_map,char * from_map)213*48edc7cfSGordon Ross charmap_init(char *to_map, char *from_map)
214*48edc7cfSGordon Ross {
215*48edc7cfSGordon Ross 	avl_create(&cmap_sym, cmap_compare_sym, sizeof (charmap_t),
216*48edc7cfSGordon Ross 	    offsetof(charmap_t, cm_avl_sym));
217*48edc7cfSGordon Ross 
218*48edc7cfSGordon Ross 	avl_create(&cmap_mbs, cmap_compare_mbs, sizeof (charmap_t),
219*48edc7cfSGordon Ross 	    offsetof(charmap_t, cm_avl_mbs));
220*48edc7cfSGordon Ross 
221*48edc7cfSGordon Ross 	cmap_pass = CMAP_PASS_FROM;
222*48edc7cfSGordon Ross 	reset_scanner(from_map);
223*48edc7cfSGordon Ross 	(void) yyparse();
224*48edc7cfSGordon Ross 	add_charmap_posix();
225*48edc7cfSGordon Ross 
226*48edc7cfSGordon Ross 	cmap_pass = CMAP_PASS_TO;
227*48edc7cfSGordon Ross 	reset_scanner(to_map);
228*48edc7cfSGordon Ross 	(void) yyparse();
229*48edc7cfSGordon Ross }
230*48edc7cfSGordon Ross 
231*48edc7cfSGordon Ross void
charmap_dump()232*48edc7cfSGordon Ross charmap_dump()
233*48edc7cfSGordon Ross {
234*48edc7cfSGordon Ross 	charmap_t *cm;
235*48edc7cfSGordon Ross 	int i;
236*48edc7cfSGordon Ross 
237*48edc7cfSGordon Ross 	cm = avl_first(&cmap_mbs);
238*48edc7cfSGordon Ross 	while (cm != NULL) {
239*48edc7cfSGordon Ross 		(void) printf("name=\"%s\"\n", cm->cm_name);
240*48edc7cfSGordon Ross 
241*48edc7cfSGordon Ross 		(void) printf("\timbs=\"");
242*48edc7cfSGordon Ross 		for (i = 0; i < cm->cm_frmbs_len; i++)
243*48edc7cfSGordon Ross 			(void) printf("\\x%02x", cm->cm_frmbs[i] & 0xFF);
244*48edc7cfSGordon Ross 		(void) printf("\"\n");
245*48edc7cfSGordon Ross 
246*48edc7cfSGordon Ross 		(void) printf("\tombs=\"");
247*48edc7cfSGordon Ross 		for (i = 0; i < cm->cm_tombs_len; i++)
248*48edc7cfSGordon Ross 			(void) printf("\\x%02x", cm->cm_tombs[i] & 0xFF);
249*48edc7cfSGordon Ross 		(void) printf("\"\n");
250*48edc7cfSGordon Ross 
251*48edc7cfSGordon Ross 		cm = AVL_NEXT(&cmap_mbs, cm);
252*48edc7cfSGordon Ross 	}
253*48edc7cfSGordon Ross }
254*48edc7cfSGordon Ross 
255*48edc7cfSGordon Ross /*
256*48edc7cfSGordon Ross  * We parse two charmap files:  First the "from" map, where we build
257*48edc7cfSGordon Ross  * cmap_mbs and cmap_sym which we'll later use to translate the input
258*48edc7cfSGordon Ross  * stream (mbs encodings) to symbols.  Second, we parse the "to" map,
259*48edc7cfSGordon Ross  * where we fill in the tombs members of entries in cmap_sym, (which
260*48edc7cfSGordon Ross  * must alread exist) used later to write the output encoding.
261*48edc7cfSGordon Ross  */
262*48edc7cfSGordon Ross static void
add_charmap_impl(char * sym,char * mbs,int mbs_len,int nodups)263*48edc7cfSGordon Ross add_charmap_impl(char *sym, char *mbs, int mbs_len, int nodups)
264*48edc7cfSGordon Ross {
265*48edc7cfSGordon Ross 
266*48edc7cfSGordon Ross 	/*
267*48edc7cfSGordon Ross 	 * While parsing both the "from" and "to" cmaps,
268*48edc7cfSGordon Ross 	 * require both the symbol and encoding.
269*48edc7cfSGordon Ross 	 */
270*48edc7cfSGordon Ross 	if (sym == NULL || mbs == NULL) {
271*48edc7cfSGordon Ross 		errf(_("invalid charmap entry"));
272*48edc7cfSGordon Ross 		return;
273*48edc7cfSGordon Ross 	}
274*48edc7cfSGordon Ross 
275*48edc7cfSGordon Ross 	switch (cmap_pass) {
276*48edc7cfSGordon Ross 	case CMAP_PASS_FROM:
277*48edc7cfSGordon Ross 		add_charmap_impl_fr(sym, mbs, mbs_len, nodups);
278*48edc7cfSGordon Ross 		break;
279*48edc7cfSGordon Ross 	case CMAP_PASS_TO:
280*48edc7cfSGordon Ross 		add_charmap_impl_to(sym, mbs, mbs_len, nodups);
281*48edc7cfSGordon Ross 		break;
282*48edc7cfSGordon Ross 	default:
283*48edc7cfSGordon Ross 		abort();
284*48edc7cfSGordon Ross 		break;
285*48edc7cfSGordon Ross 	}
286*48edc7cfSGordon Ross }
287*48edc7cfSGordon Ross 
288*48edc7cfSGordon Ross static void
add_charmap_impl_fr(char * sym,char * mbs,int mbs_len,int nodups)289*48edc7cfSGordon Ross add_charmap_impl_fr(char *sym, char *mbs, int mbs_len, int nodups)
290*48edc7cfSGordon Ross {
291*48edc7cfSGordon Ross 	charmap_t	*m, *n, *s;
292*48edc7cfSGordon Ross 	avl_index_t	where_sym, where_mbs;
293*48edc7cfSGordon Ross 
294*48edc7cfSGordon Ross 	if ((n = calloc(1, sizeof (*n))) == NULL) {
295*48edc7cfSGordon Ross 		errf(_("out of memory"));
296*48edc7cfSGordon Ross 		return;
297*48edc7cfSGordon Ross 	}
298*48edc7cfSGordon Ross 	n->cm_name = sym;
299*48edc7cfSGordon Ross 
300*48edc7cfSGordon Ross 	assert(0 < mbs_len && mbs_len <= MB_LEN_MAX);
301*48edc7cfSGordon Ross 	(void) memcpy(n->cm_frmbs, mbs, mbs_len);
302*48edc7cfSGordon Ross 	n->cm_frmbs_len = mbs_len;
303*48edc7cfSGordon Ross 
304*48edc7cfSGordon Ross 	m = avl_find(&cmap_mbs, n, &where_mbs);
305*48edc7cfSGordon Ross 	s = avl_find(&cmap_sym, n, &where_sym);
306*48edc7cfSGordon Ross 
307*48edc7cfSGordon Ross 	/*
308*48edc7cfSGordon Ross 	 * If we found the symbol, this is a dup.
309*48edc7cfSGordon Ross 	 */
310*48edc7cfSGordon Ross 	if (s != NULL) {
311*48edc7cfSGordon Ross 		if (nodups) {
312*48edc7cfSGordon Ross 			warn(_("%s: duplicate character symbol"), sym);
313*48edc7cfSGordon Ross 		}
314*48edc7cfSGordon Ross 		free(n);
315*48edc7cfSGordon Ross 		return;
316*48edc7cfSGordon Ross 	}
317*48edc7cfSGordon Ross 
318*48edc7cfSGordon Ross 	/*
319*48edc7cfSGordon Ross 	 * If we found the mbs, the new one is an alias,
320*48edc7cfSGordon Ross 	 * which we'll add _only_ to the symbol AVL.
321*48edc7cfSGordon Ross 	 */
322*48edc7cfSGordon Ross 	if (m != NULL) {
323*48edc7cfSGordon Ross 		/* The new one is an alias of the original. */
324*48edc7cfSGordon Ross 		n->cm_alias_of = m;
325*48edc7cfSGordon Ross 		avl_insert(&cmap_sym, n, where_sym);
326*48edc7cfSGordon Ross 		return;
327*48edc7cfSGordon Ross 	}
328*48edc7cfSGordon Ross 
329*48edc7cfSGordon Ross 	avl_insert(&cmap_sym, n, where_sym);
330*48edc7cfSGordon Ross 	avl_insert(&cmap_mbs, n, where_mbs);
331*48edc7cfSGordon Ross }
332*48edc7cfSGordon Ross 
333*48edc7cfSGordon Ross static void
add_charmap_impl_to(char * sym,char * mbs,int mbs_len,int nodups)334*48edc7cfSGordon Ross add_charmap_impl_to(char *sym, char *mbs, int mbs_len, int nodups)
335*48edc7cfSGordon Ross {
336*48edc7cfSGordon Ross 	charmap_t	srch = {0};
337*48edc7cfSGordon Ross 	charmap_t	*m;
338*48edc7cfSGordon Ross 
339*48edc7cfSGordon Ross 	assert(0 < mbs_len && mbs_len <= MB_LEN_MAX);
340*48edc7cfSGordon Ross 
341*48edc7cfSGordon Ross 	srch.cm_name = sym;
342*48edc7cfSGordon Ross 
343*48edc7cfSGordon Ross 	m = avl_find(&cmap_sym, &srch, NULL);
344*48edc7cfSGordon Ross 	if (m == NULL) {
345*48edc7cfSGordon Ross 		if (sflag == 0)
346*48edc7cfSGordon Ross 			warn(_("%s: symbol not found"), sym);
347*48edc7cfSGordon Ross 		return;
348*48edc7cfSGordon Ross 	}
349*48edc7cfSGordon Ross 	if (m->cm_alias_of != NULL) {
350*48edc7cfSGordon Ross 		m = m->cm_alias_of;
351*48edc7cfSGordon Ross 
352*48edc7cfSGordon Ross 		/* don't warn for dups with aliases */
353*48edc7cfSGordon Ross 		if (m->cm_tombs_len != 0)
354*48edc7cfSGordon Ross 			return;
355*48edc7cfSGordon Ross 	}
356*48edc7cfSGordon Ross 
357*48edc7cfSGordon Ross 	if (m->cm_tombs_len != 0) {
358*48edc7cfSGordon Ross 		if (nodups) {
359*48edc7cfSGordon Ross 			warn(_("%s: duplicate encoding for"), sym);
360*48edc7cfSGordon Ross 		}
361*48edc7cfSGordon Ross 		return;
362*48edc7cfSGordon Ross 	}
363*48edc7cfSGordon Ross 
364*48edc7cfSGordon Ross 	(void) memcpy(m->cm_tombs, mbs, mbs_len);
365*48edc7cfSGordon Ross 	m->cm_tombs_len = mbs_len;
366*48edc7cfSGordon Ross }
367*48edc7cfSGordon Ross 
/*
 * Add one charmap entry for the current pass, warning on duplicates.
 *
 * `mbs' is length-prefixed: mbs[0] holds the number of encoding bytes
 * and the bytes themselves start at mbs[1].
 */
void
add_charmap(char *sym, char *mbs)
{
	int mbs_len = mbs[0];

	assert(0 < mbs_len && mbs_len <= MB_LEN_MAX);
	add_charmap_impl(sym, mbs + 1, mbs_len, 1);
}
376*48edc7cfSGordon Ross 
377*48edc7cfSGordon Ross 
/*
 * Add a run of charmap entries "<prefixNNN> ... <prefixMMM>".
 *
 * This is called by the parser with start/end symbol strings (ssym, esym),
 * which are allocated in the scanner (T_SYMBOL) and free'd here -- on
 * every path, including the error returns.
 *
 * Both symbols must consist of the same non-empty, digit-free prefix
 * followed by decimal digits only, and the start number must not exceed
 * the end number; otherwise the range is reported as malformed.
 *
 * `mbs' is length-prefixed (mbs[0] is the byte count) and gives the
 * encoding of the first symbol.  Each following symbol gets the same
 * encoding with its last byte incremented by one.  Generated names use
 * the digit width of the start symbol, zero padded.
 */
void
add_charmap_range(char *ssym, char *esym, char *mbs)
{
	int	ls, le;
	int	si;
	int	sn, en;
	int	i;
	int	mbs_len;
	char	tmbs[MB_LEN_MAX+1];
	char	*mb_last;

	static const char *digits = "0123456789";

	/* mbs[0] is the length */
	mbs_len = *mbs++;
	assert(0 < mbs_len && mbs_len <= MB_LEN_MAX);
	(void) memcpy(tmbs, mbs, mbs_len);
	mb_last = tmbs + mbs_len - 1;

	ls = strlen(ssym);
	le = strlen(esym);

	/*
	 * The strncmp() test comes before any use of esym + si, so esym
	 * is known to be at least si characters long by then.
	 */
	if (((si = strcspn(ssym, digits)) == 0) || (si == ls) ||
	    (strncmp(ssym, esym, si) != 0) ||
	    (strspn(ssym + si, digits) != (ls - si)) ||
	    (strspn(esym + si, digits) != (le - si)) ||
	    ((sn = atoi(ssym + si)) > ((en = atoi(esym + si))))) {
		errf(_("malformed charmap range"));
		goto out;
	}

	/* Cut ssym down to the shared prefix. */
	ssym[si] = 0;
	for (i = sn; i <= en; i++) {
		char *nn = NULL;

		/*
		 * Trust the return value: the contents of nn after a
		 * failed asprintf() are not guaranteed everywhere.
		 */
		if (asprintf(&nn, "%s%0*u", ssym, ls - si,
		    (unsigned int)i) < 0 || nn == NULL) {
			errf(_("out of memory"));
			goto out;
		}

		add_charmap_impl(nn, tmbs, mbs_len, 1);
		(*mb_last)++;
	}

out:
	free(ssym);
	free(esym);
}
428*48edc7cfSGordon Ross 
/*
 * Add a single-byte entry: symbol `name' encoded as the one byte `c'.
 * Duplicates are not warned about (nodups is passed as 0).
 */
void
add_charmap_char(char *name, int c)
{
	char onebyte[MB_LEN_MAX+1];

	onebyte[0] = c;
	onebyte[1] = '\0';
	add_charmap_impl(name, onebyte, 1, 0);
}
438*48edc7cfSGordon Ross 
439*48edc7cfSGordon Ross /*
440*48edc7cfSGordon Ross  * POSIX insists that certain entries be present, even when not in the
441*48edc7cfSGordon Ross  * orginal charmap file.
442*48edc7cfSGordon Ross  */
443*48edc7cfSGordon Ross void
add_charmap_posix(void)444*48edc7cfSGordon Ross add_charmap_posix(void)
445*48edc7cfSGordon Ross {
446*48edc7cfSGordon Ross 	int	i;
447*48edc7cfSGordon Ross 
448*48edc7cfSGordon Ross 	for (i = 0; portable_chars[i].name; i++) {
449*48edc7cfSGordon Ross 		add_charmap_char(portable_chars[i].name, portable_chars[i].ch);
450*48edc7cfSGordon Ross 	}
451*48edc7cfSGordon Ross }
452*48edc7cfSGordon Ross 
453*48edc7cfSGordon Ross /*
454*48edc7cfSGordon Ross  * This is called with a buffer of (typically) MB_LEN_MAX bytes,
455*48edc7cfSGordon Ross  * which is potentially a multi-byte symbol, but often contains
456*48edc7cfSGordon Ross  * extra bytes. Find and return the longest match in the charmap.
457*48edc7cfSGordon Ross  */
458*48edc7cfSGordon Ross static charmap_t *
find_mbs(const char * mbs,int len)459*48edc7cfSGordon Ross find_mbs(const char *mbs, int len)
460*48edc7cfSGordon Ross {
461*48edc7cfSGordon Ross 	charmap_t srch = {0};
462*48edc7cfSGordon Ross 	charmap_t *cm = NULL;
463*48edc7cfSGordon Ross 
464*48edc7cfSGordon Ross 	while (len > 0) {
465*48edc7cfSGordon Ross 		(void) memcpy(srch.cm_frmbs, mbs, len);
466*48edc7cfSGordon Ross 		srch.cm_frmbs_len = len;
467*48edc7cfSGordon Ross 		cm = avl_find(&cmap_mbs, &srch, NULL);
468*48edc7cfSGordon Ross 		if (cm != NULL)
469*48edc7cfSGordon Ross 			break;
470*48edc7cfSGordon Ross 		len--;
471*48edc7cfSGordon Ross 	}
472*48edc7cfSGordon Ross 
473*48edc7cfSGordon Ross 	return (cm);
474*48edc7cfSGordon Ross }
475*48edc7cfSGordon Ross 
476*48edc7cfSGordon Ross /*
477*48edc7cfSGordon Ross  * Return true if this sequence matches the initial part
478*48edc7cfSGordon Ross  * of any sequence known in this charmap.
479*48edc7cfSGordon Ross  */
480*48edc7cfSGordon Ross static boolean_t
find_mbs_partial(const char * mbs,int len)481*48edc7cfSGordon Ross find_mbs_partial(const char *mbs, int len)
482*48edc7cfSGordon Ross {
483*48edc7cfSGordon Ross 	charmap_t srch = {0};
484*48edc7cfSGordon Ross 	charmap_t *cm;
485*48edc7cfSGordon Ross 	avl_index_t where;
486*48edc7cfSGordon Ross 
487*48edc7cfSGordon Ross 	(void) memcpy(srch.cm_frmbs, mbs, len);
488*48edc7cfSGordon Ross 	srch.cm_frmbs_len = len;
489*48edc7cfSGordon Ross 	cm = avl_find(&cmap_mbs, &srch, &where);
490*48edc7cfSGordon Ross 	if (cm != NULL) {
491*48edc7cfSGordon Ross 		/* full match - not expected, but OK */
492*48edc7cfSGordon Ross 		return (B_TRUE);
493*48edc7cfSGordon Ross 	}
494*48edc7cfSGordon Ross 	cm = avl_nearest(&cmap_mbs, where, AVL_AFTER);
495*48edc7cfSGordon Ross 	if (cm != NULL && 0 == memcmp(cm->cm_frmbs, mbs, len))
496*48edc7cfSGordon Ross 		return (B_TRUE);
497*48edc7cfSGordon Ross 
498*48edc7cfSGordon Ross 	return (B_FALSE);
499*48edc7cfSGordon Ross }
500*48edc7cfSGordon Ross 
501*48edc7cfSGordon Ross /*
502*48edc7cfSGordon Ross  * Do like iconv(3), but with charmaps.
503*48edc7cfSGordon Ross  */
504*48edc7cfSGordon Ross size_t
cm_iconv(const char ** iptr,size_t * ileft,char ** optr,size_t * oleft)505*48edc7cfSGordon Ross cm_iconv(const char **iptr, size_t *ileft, char **optr, size_t *oleft)
506*48edc7cfSGordon Ross {
507*48edc7cfSGordon Ross 	charmap_t *cm;
508*48edc7cfSGordon Ross 	int mbs_len;
509*48edc7cfSGordon Ross 
510*48edc7cfSGordon Ross 	/* Ignore state reset requests. */
511*48edc7cfSGordon Ross 	if (iptr == NULL || *iptr == NULL)
512*48edc7cfSGordon Ross 		return (0);
513*48edc7cfSGordon Ross 
514*48edc7cfSGordon Ross 	if (*oleft < MB_LEN_MAX) {
515*48edc7cfSGordon Ross 		errno = E2BIG;
516*48edc7cfSGordon Ross 		return ((size_t)-1);
517*48edc7cfSGordon Ross 	}
518*48edc7cfSGordon Ross 
519*48edc7cfSGordon Ross 	while (*ileft > 0 && *oleft >= MB_LEN_MAX) {
520*48edc7cfSGordon Ross 		mbs_len = MB_LEN_MAX;
521*48edc7cfSGordon Ross 		if (mbs_len > *ileft)
522*48edc7cfSGordon Ross 			mbs_len = *ileft;
523*48edc7cfSGordon Ross 		cm = find_mbs(*iptr, mbs_len);
524*48edc7cfSGordon Ross 		if (cm == NULL) {
525*48edc7cfSGordon Ross 			if (mbs_len < MB_LEN_MAX &&
526*48edc7cfSGordon Ross 			    find_mbs_partial(*iptr, mbs_len)) {
527*48edc7cfSGordon Ross 				/* incomplete sequence */
528*48edc7cfSGordon Ross 				errno = EINVAL;
529*48edc7cfSGordon Ross 			} else {
530*48edc7cfSGordon Ross 				errno = EILSEQ;
531*48edc7cfSGordon Ross 			}
532*48edc7cfSGordon Ross 			return ((size_t)-1);
533*48edc7cfSGordon Ross 		}
534*48edc7cfSGordon Ross 		assert(cm->cm_frmbs_len > 0);
535*48edc7cfSGordon Ross 		if (cm->cm_tombs_len == 0) {
536*48edc7cfSGordon Ross 			if (sflag == 0 && cm->cm_warned == 0) {
537*48edc7cfSGordon Ross 				cm->cm_warned = 1;
538*48edc7cfSGordon Ross 				warn(_("To-map does not encode <%s>\n"),
539*48edc7cfSGordon Ross 				    cm->cm_name);
540*48edc7cfSGordon Ross 			}
541*48edc7cfSGordon Ross 			if (cflag == 0) {
542*48edc7cfSGordon Ross 				errno = EILSEQ;
543*48edc7cfSGordon Ross 				return ((size_t)-1);
544*48edc7cfSGordon Ross 			}
545*48edc7cfSGordon Ross 			/* just skip this input seq. */
546*48edc7cfSGordon Ross 			*iptr  += cm->cm_frmbs_len;
547*48edc7cfSGordon Ross 			*ileft -= cm->cm_frmbs_len;
548*48edc7cfSGordon Ross 			continue;
549*48edc7cfSGordon Ross 		}
550*48edc7cfSGordon Ross 
551*48edc7cfSGordon Ross 		*iptr  += cm->cm_frmbs_len;
552*48edc7cfSGordon Ross 		*ileft -= cm->cm_frmbs_len;
553*48edc7cfSGordon Ross 		(void) memcpy(*optr, cm->cm_tombs, cm->cm_tombs_len);
554*48edc7cfSGordon Ross 		*optr  += cm->cm_tombs_len;
555*48edc7cfSGordon Ross 		*oleft -= cm->cm_tombs_len;
556*48edc7cfSGordon Ross 	}
557*48edc7cfSGordon Ross 
558*48edc7cfSGordon Ross 	return (0);
559*48edc7cfSGordon Ross }
560