1*48edc7cfSGordon Ross /*
2*48edc7cfSGordon Ross * This file and its contents are supplied under the terms of the
3*48edc7cfSGordon Ross * Common Development and Distribution License ("CDDL"), version 1.0.
4*48edc7cfSGordon Ross * You may only use this file in accordance with the terms of version
5*48edc7cfSGordon Ross * 1.0 of the CDDL.
6*48edc7cfSGordon Ross *
7*48edc7cfSGordon Ross * A full copy of the text of the CDDL should have accompanied this
8*48edc7cfSGordon Ross * source. A copy of the CDDL is also available via the Internet at
9*48edc7cfSGordon Ross * http://www.illumos.org/license/CDDL.
10*48edc7cfSGordon Ross */
11*48edc7cfSGordon Ross
12*48edc7cfSGordon Ross /*
13*48edc7cfSGordon Ross * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
14*48edc7cfSGordon Ross */
15*48edc7cfSGordon Ross
16*48edc7cfSGordon Ross /*
17*48edc7cfSGordon Ross * CHARMAP file handling for iconv.
18*48edc7cfSGordon Ross */
19*48edc7cfSGordon Ross
20*48edc7cfSGordon Ross #include <stdio.h>
21*48edc7cfSGordon Ross #include <stdlib.h>
22*48edc7cfSGordon Ross #include <string.h>
23*48edc7cfSGordon Ross #include <errno.h>
24*48edc7cfSGordon Ross #include <limits.h>
25*48edc7cfSGordon Ross #include <unistd.h>
26*48edc7cfSGordon Ross #include <alloca.h>
27*48edc7cfSGordon Ross #include <sys/avl.h>
28*48edc7cfSGordon Ross #include <stddef.h>
29*48edc7cfSGordon Ross #include <unistd.h>
30*48edc7cfSGordon Ross #include "charmap.h"
31*48edc7cfSGordon Ross #include "parser.tab.h"
32*48edc7cfSGordon Ross #include <assert.h>
33*48edc7cfSGordon Ross
34*48edc7cfSGordon Ross enum cmap_pass cmap_pass;
35*48edc7cfSGordon Ross static avl_tree_t cmap_sym;
36*48edc7cfSGordon Ross static avl_tree_t cmap_mbs;
37*48edc7cfSGordon Ross
38*48edc7cfSGordon Ross typedef struct charmap {
39*48edc7cfSGordon Ross const char *cm_name;
40*48edc7cfSGordon Ross struct charmap *cm_alias_of;
41*48edc7cfSGordon Ross avl_node_t cm_avl_sym;
42*48edc7cfSGordon Ross avl_node_t cm_avl_mbs;
43*48edc7cfSGordon Ross int cm_warned;
44*48edc7cfSGordon Ross int cm_frmbs_len;
45*48edc7cfSGordon Ross int cm_tombs_len;
46*48edc7cfSGordon Ross char cm_frmbs[MB_LEN_MAX + 1]; /* input */
47*48edc7cfSGordon Ross char cm_tombs[MB_LEN_MAX + 1]; /* output */
48*48edc7cfSGordon Ross } charmap_t;
49*48edc7cfSGordon Ross
/* Per-pass workers called from add_charmap_impl(). */
static void add_charmap_impl_fr(char *sym, char *mbs, int mbs_len,
    int nodups);
static void add_charmap_impl_to(char *sym, char *mbs, int mbs_len,
    int nodups);
52*48edc7cfSGordon Ross
/*
 * Array of POSIX specific portable characters.
 *
 * Each entry pairs a symbolic name with the single-byte character it
 * stands for.  Some characters are listed under more than one name.
 * The table is terminated by an entry whose name is NULL.
 */
static const struct {
	char	*name;
	int	ch;
} portable_chars[] = {
	{ "NUL",		'\0' },
	{ "alert",		'\a' },
	{ "backspace",		'\b' },
	{ "tab",		'\t' },
	{ "carriage-return",	'\r' },
	{ "newline",		'\n' },
	{ "vertical-tab",	'\v' },
	{ "form-feed",		'\f' },
	{ "space",		' ' },
	{ "exclamation-mark",	'!' },
	{ "quotation-mark",	'"' },
	{ "number-sign",	'#' },
	{ "dollar-sign",	'$' },
	{ "percent-sign",	'%' },
	{ "ampersand",		'&' },
	{ "apostrophe",		'\'' },
	{ "left-parenthesis",	'(' },
	{ "right-parenthesis",	')' },
	{ "asterisk",		'*' },
	{ "plus-sign",		'+' },
	{ "comma",		','},
	{ "hyphen-minus",	'-' },
	{ "hyphen",		'-' },
	{ "full-stop",		'.' },
	{ "period",		'.' },
	{ "slash",		'/' },
	{ "solidus",		'/' },
	{ "zero",		'0' },
	{ "one",		'1' },
	{ "two",		'2' },
	{ "three",		'3' },
	{ "four",		'4' },
	{ "five",		'5' },
	{ "six",		'6' },
	{ "seven",		'7' },
	{ "eight",		'8' },
	{ "nine",		'9' },
	{ "colon",		':' },
	{ "semicolon",		';' },
	{ "less-than-sign",	'<' },
	{ "equals-sign",	'=' },
	{ "greater-than-sign",	'>' },
	{ "question-mark",	'?' },
	{ "commercial-at",	'@' },
	{ "left-square-bracket",	'[' },
	{ "backslash",		'\\' },
	{ "reverse-solidus",	'\\' },
	{ "right-square-bracket",	']' },
	{ "circumflex",		'^' },
	{ "circumflex-accent",	'^' },
	{ "low-line",		'_' },
	{ "underscore",		'_' },
	{ "grave-accent",	'`' },
	{ "left-brace",		'{' },
	{ "left-curly-bracket",	'{' },
	{ "vertical-line",	'|' },
	{ "right-brace",	'}' },
	{ "right-curly-bracket",	'}' },
	{ "tilde",		'~' },
	{ "A",	'A' },
	{ "B",	'B' },
	{ "C",	'C' },
	{ "D",	'D' },
	{ "E",	'E' },
	{ "F",	'F' },
	{ "G",	'G' },
	{ "H",	'H' },
	{ "I",	'I' },
	{ "J",	'J' },
	{ "K",	'K' },
	{ "L",	'L' },
	{ "M",	'M' },
	{ "N",	'N' },
	{ "O",	'O' },
	{ "P",	'P' },
	{ "Q",	'Q' },
	{ "R",	'R' },
	{ "S",	'S' },
	{ "T",	'T' },
	{ "U",	'U' },
	{ "V",	'V' },
	{ "W",	'W' },
	{ "X",	'X' },
	{ "Y",	'Y' },
	{ "Z",	'Z' },
	{ "a",	'a' },
	{ "b",	'b' },
	{ "c",	'c' },
	{ "d",	'd' },
	{ "e",	'e' },
	{ "f",	'f' },
	{ "g",	'g' },
	{ "h",	'h' },
	{ "i",	'i' },
	{ "j",	'j' },
	{ "k",	'k' },
	{ "l",	'l' },
	{ "m",	'm' },
	{ "n",	'n' },
	{ "o",	'o' },
	{ "p",	'p' },
	{ "q",	'q' },
	{ "r",	'r' },
	{ "s",	's' },
	{ "t",	't' },
	{ "u",	'u' },
	{ "v",	'v' },
	{ "w",	'w' },
	{ "x",	'x' },
	{ "y",	'y' },
	{ "z",	'z' },
	{ NULL, 0 }
};
173*48edc7cfSGordon Ross
174*48edc7cfSGordon Ross static int
cmap_compare_sym(const void * n1,const void * n2)175*48edc7cfSGordon Ross cmap_compare_sym(const void *n1, const void *n2)
176*48edc7cfSGordon Ross {
177*48edc7cfSGordon Ross const charmap_t *c1 = n1;
178*48edc7cfSGordon Ross const charmap_t *c2 = n2;
179*48edc7cfSGordon Ross int rv;
180*48edc7cfSGordon Ross
181*48edc7cfSGordon Ross rv = strcmp(c1->cm_name, c2->cm_name);
182*48edc7cfSGordon Ross return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
183*48edc7cfSGordon Ross }
184*48edc7cfSGordon Ross
185*48edc7cfSGordon Ross /*
186*48edc7cfSGordon Ross * In order for partial match searches to work,
187*48edc7cfSGordon Ross * we need these sorted by mbs contents.
188*48edc7cfSGordon Ross */
189*48edc7cfSGordon Ross static int
cmap_compare_mbs(const void * n1,const void * n2)190*48edc7cfSGordon Ross cmap_compare_mbs(const void *n1, const void *n2)
191*48edc7cfSGordon Ross {
192*48edc7cfSGordon Ross const charmap_t *c1 = n1;
193*48edc7cfSGordon Ross const charmap_t *c2 = n2;
194*48edc7cfSGordon Ross int len, rv;
195*48edc7cfSGordon Ross
196*48edc7cfSGordon Ross len = c1->cm_frmbs_len;
197*48edc7cfSGordon Ross if (len < c2->cm_frmbs_len)
198*48edc7cfSGordon Ross len = c2->cm_frmbs_len;
199*48edc7cfSGordon Ross rv = memcmp(c1->cm_frmbs, c2->cm_frmbs, len);
200*48edc7cfSGordon Ross if (rv < 0)
201*48edc7cfSGordon Ross return (-1);
202*48edc7cfSGordon Ross if (rv > 0)
203*48edc7cfSGordon Ross return (1);
204*48edc7cfSGordon Ross /* they match through length */
205*48edc7cfSGordon Ross if (c1->cm_frmbs_len < c2->cm_frmbs_len)
206*48edc7cfSGordon Ross return (-1);
207*48edc7cfSGordon Ross if (c2->cm_frmbs_len < c1->cm_frmbs_len)
208*48edc7cfSGordon Ross return (1);
209*48edc7cfSGordon Ross return (0);
210*48edc7cfSGordon Ross }
211*48edc7cfSGordon Ross
212*48edc7cfSGordon Ross void
charmap_init(char * to_map,char * from_map)213*48edc7cfSGordon Ross charmap_init(char *to_map, char *from_map)
214*48edc7cfSGordon Ross {
215*48edc7cfSGordon Ross avl_create(&cmap_sym, cmap_compare_sym, sizeof (charmap_t),
216*48edc7cfSGordon Ross offsetof(charmap_t, cm_avl_sym));
217*48edc7cfSGordon Ross
218*48edc7cfSGordon Ross avl_create(&cmap_mbs, cmap_compare_mbs, sizeof (charmap_t),
219*48edc7cfSGordon Ross offsetof(charmap_t, cm_avl_mbs));
220*48edc7cfSGordon Ross
221*48edc7cfSGordon Ross cmap_pass = CMAP_PASS_FROM;
222*48edc7cfSGordon Ross reset_scanner(from_map);
223*48edc7cfSGordon Ross (void) yyparse();
224*48edc7cfSGordon Ross add_charmap_posix();
225*48edc7cfSGordon Ross
226*48edc7cfSGordon Ross cmap_pass = CMAP_PASS_TO;
227*48edc7cfSGordon Ross reset_scanner(to_map);
228*48edc7cfSGordon Ross (void) yyparse();
229*48edc7cfSGordon Ross }
230*48edc7cfSGordon Ross
231*48edc7cfSGordon Ross void
charmap_dump()232*48edc7cfSGordon Ross charmap_dump()
233*48edc7cfSGordon Ross {
234*48edc7cfSGordon Ross charmap_t *cm;
235*48edc7cfSGordon Ross int i;
236*48edc7cfSGordon Ross
237*48edc7cfSGordon Ross cm = avl_first(&cmap_mbs);
238*48edc7cfSGordon Ross while (cm != NULL) {
239*48edc7cfSGordon Ross (void) printf("name=\"%s\"\n", cm->cm_name);
240*48edc7cfSGordon Ross
241*48edc7cfSGordon Ross (void) printf("\timbs=\"");
242*48edc7cfSGordon Ross for (i = 0; i < cm->cm_frmbs_len; i++)
243*48edc7cfSGordon Ross (void) printf("\\x%02x", cm->cm_frmbs[i] & 0xFF);
244*48edc7cfSGordon Ross (void) printf("\"\n");
245*48edc7cfSGordon Ross
246*48edc7cfSGordon Ross (void) printf("\tombs=\"");
247*48edc7cfSGordon Ross for (i = 0; i < cm->cm_tombs_len; i++)
248*48edc7cfSGordon Ross (void) printf("\\x%02x", cm->cm_tombs[i] & 0xFF);
249*48edc7cfSGordon Ross (void) printf("\"\n");
250*48edc7cfSGordon Ross
251*48edc7cfSGordon Ross cm = AVL_NEXT(&cmap_mbs, cm);
252*48edc7cfSGordon Ross }
253*48edc7cfSGordon Ross }
254*48edc7cfSGordon Ross
255*48edc7cfSGordon Ross /*
256*48edc7cfSGordon Ross * We parse two charmap files: First the "from" map, where we build
257*48edc7cfSGordon Ross * cmap_mbs and cmap_sym which we'll later use to translate the input
258*48edc7cfSGordon Ross * stream (mbs encodings) to symbols. Second, we parse the "to" map,
259*48edc7cfSGordon Ross * where we fill in the tombs members of entries in cmap_sym, (which
260*48edc7cfSGordon Ross * must alread exist) used later to write the output encoding.
261*48edc7cfSGordon Ross */
262*48edc7cfSGordon Ross static void
add_charmap_impl(char * sym,char * mbs,int mbs_len,int nodups)263*48edc7cfSGordon Ross add_charmap_impl(char *sym, char *mbs, int mbs_len, int nodups)
264*48edc7cfSGordon Ross {
265*48edc7cfSGordon Ross
266*48edc7cfSGordon Ross /*
267*48edc7cfSGordon Ross * While parsing both the "from" and "to" cmaps,
268*48edc7cfSGordon Ross * require both the symbol and encoding.
269*48edc7cfSGordon Ross */
270*48edc7cfSGordon Ross if (sym == NULL || mbs == NULL) {
271*48edc7cfSGordon Ross errf(_("invalid charmap entry"));
272*48edc7cfSGordon Ross return;
273*48edc7cfSGordon Ross }
274*48edc7cfSGordon Ross
275*48edc7cfSGordon Ross switch (cmap_pass) {
276*48edc7cfSGordon Ross case CMAP_PASS_FROM:
277*48edc7cfSGordon Ross add_charmap_impl_fr(sym, mbs, mbs_len, nodups);
278*48edc7cfSGordon Ross break;
279*48edc7cfSGordon Ross case CMAP_PASS_TO:
280*48edc7cfSGordon Ross add_charmap_impl_to(sym, mbs, mbs_len, nodups);
281*48edc7cfSGordon Ross break;
282*48edc7cfSGordon Ross default:
283*48edc7cfSGordon Ross abort();
284*48edc7cfSGordon Ross break;
285*48edc7cfSGordon Ross }
286*48edc7cfSGordon Ross }
287*48edc7cfSGordon Ross
288*48edc7cfSGordon Ross static void
add_charmap_impl_fr(char * sym,char * mbs,int mbs_len,int nodups)289*48edc7cfSGordon Ross add_charmap_impl_fr(char *sym, char *mbs, int mbs_len, int nodups)
290*48edc7cfSGordon Ross {
291*48edc7cfSGordon Ross charmap_t *m, *n, *s;
292*48edc7cfSGordon Ross avl_index_t where_sym, where_mbs;
293*48edc7cfSGordon Ross
294*48edc7cfSGordon Ross if ((n = calloc(1, sizeof (*n))) == NULL) {
295*48edc7cfSGordon Ross errf(_("out of memory"));
296*48edc7cfSGordon Ross return;
297*48edc7cfSGordon Ross }
298*48edc7cfSGordon Ross n->cm_name = sym;
299*48edc7cfSGordon Ross
300*48edc7cfSGordon Ross assert(0 < mbs_len && mbs_len <= MB_LEN_MAX);
301*48edc7cfSGordon Ross (void) memcpy(n->cm_frmbs, mbs, mbs_len);
302*48edc7cfSGordon Ross n->cm_frmbs_len = mbs_len;
303*48edc7cfSGordon Ross
304*48edc7cfSGordon Ross m = avl_find(&cmap_mbs, n, &where_mbs);
305*48edc7cfSGordon Ross s = avl_find(&cmap_sym, n, &where_sym);
306*48edc7cfSGordon Ross
307*48edc7cfSGordon Ross /*
308*48edc7cfSGordon Ross * If we found the symbol, this is a dup.
309*48edc7cfSGordon Ross */
310*48edc7cfSGordon Ross if (s != NULL) {
311*48edc7cfSGordon Ross if (nodups) {
312*48edc7cfSGordon Ross warn(_("%s: duplicate character symbol"), sym);
313*48edc7cfSGordon Ross }
314*48edc7cfSGordon Ross free(n);
315*48edc7cfSGordon Ross return;
316*48edc7cfSGordon Ross }
317*48edc7cfSGordon Ross
318*48edc7cfSGordon Ross /*
319*48edc7cfSGordon Ross * If we found the mbs, the new one is an alias,
320*48edc7cfSGordon Ross * which we'll add _only_ to the symbol AVL.
321*48edc7cfSGordon Ross */
322*48edc7cfSGordon Ross if (m != NULL) {
323*48edc7cfSGordon Ross /* The new one is an alias of the original. */
324*48edc7cfSGordon Ross n->cm_alias_of = m;
325*48edc7cfSGordon Ross avl_insert(&cmap_sym, n, where_sym);
326*48edc7cfSGordon Ross return;
327*48edc7cfSGordon Ross }
328*48edc7cfSGordon Ross
329*48edc7cfSGordon Ross avl_insert(&cmap_sym, n, where_sym);
330*48edc7cfSGordon Ross avl_insert(&cmap_mbs, n, where_mbs);
331*48edc7cfSGordon Ross }
332*48edc7cfSGordon Ross
333*48edc7cfSGordon Ross static void
add_charmap_impl_to(char * sym,char * mbs,int mbs_len,int nodups)334*48edc7cfSGordon Ross add_charmap_impl_to(char *sym, char *mbs, int mbs_len, int nodups)
335*48edc7cfSGordon Ross {
336*48edc7cfSGordon Ross charmap_t srch = {0};
337*48edc7cfSGordon Ross charmap_t *m;
338*48edc7cfSGordon Ross
339*48edc7cfSGordon Ross assert(0 < mbs_len && mbs_len <= MB_LEN_MAX);
340*48edc7cfSGordon Ross
341*48edc7cfSGordon Ross srch.cm_name = sym;
342*48edc7cfSGordon Ross
343*48edc7cfSGordon Ross m = avl_find(&cmap_sym, &srch, NULL);
344*48edc7cfSGordon Ross if (m == NULL) {
345*48edc7cfSGordon Ross if (sflag == 0)
346*48edc7cfSGordon Ross warn(_("%s: symbol not found"), sym);
347*48edc7cfSGordon Ross return;
348*48edc7cfSGordon Ross }
349*48edc7cfSGordon Ross if (m->cm_alias_of != NULL) {
350*48edc7cfSGordon Ross m = m->cm_alias_of;
351*48edc7cfSGordon Ross
352*48edc7cfSGordon Ross /* don't warn for dups with aliases */
353*48edc7cfSGordon Ross if (m->cm_tombs_len != 0)
354*48edc7cfSGordon Ross return;
355*48edc7cfSGordon Ross }
356*48edc7cfSGordon Ross
357*48edc7cfSGordon Ross if (m->cm_tombs_len != 0) {
358*48edc7cfSGordon Ross if (nodups) {
359*48edc7cfSGordon Ross warn(_("%s: duplicate encoding for"), sym);
360*48edc7cfSGordon Ross }
361*48edc7cfSGordon Ross return;
362*48edc7cfSGordon Ross }
363*48edc7cfSGordon Ross
364*48edc7cfSGordon Ross (void) memcpy(m->cm_tombs, mbs, mbs_len);
365*48edc7cfSGordon Ross m->cm_tombs_len = mbs_len;
366*48edc7cfSGordon Ross }
367*48edc7cfSGordon Ross
/*
 * Add a single charmap entry.  mbs is length-prefixed: mbs[0] holds the
 * number of encoding bytes that follow it.
 */
void
add_charmap(char *sym, char *mbs)
{
	int nbytes = mbs[0];

	assert(0 < nbytes && nbytes <= MB_LEN_MAX);
	add_charmap_impl(sym, mbs + 1, nbytes, 1);
}
376*48edc7cfSGordon Ross
377*48edc7cfSGordon Ross
/*
 * Add a range of charmap entries.
 *
 * This is called by the parser with start/end symbol strings (ssym, esym),
 * which are allocated in the scanner (T_SYMBOL) and free'd here, on every
 * return path including the error ones.
 *
 * Both symbols must consist of the same non-empty, non-numeric prefix
 * followed by decimal digits, and the start number must not exceed the
 * end number.  One symbol is generated per number in the range, zero
 * padded to the width of the start symbol's digits, and passed to
 * add_charmap_impl().  mbs is length-prefixed (mbs[0] is the length);
 * the last byte of the encoding is incremented for each successive
 * symbol.
 */
void
add_charmap_range(char *ssym, char *esym, char *mbs)
{
	int ls, le;
	int si;
	int sn, en;
	int i;
	int mbs_len;
	char tmbs[MB_LEN_MAX+1];
	char *mb_last;

	static const char *digits = "0123456789";

	/* mbs[0] is the length */
	mbs_len = *mbs++;
	assert(0 < mbs_len && mbs_len <= MB_LEN_MAX);
	(void) memcpy(tmbs, mbs, mbs_len);
	mb_last = tmbs + mbs_len - 1;

	ls = strlen(ssym);
	le = strlen(esym);

	if (((si = strcspn(ssym, digits)) == 0) || (si == ls) ||
	    (strncmp(ssym, esym, si) != 0) ||
	    (strspn(ssym + si, digits) != (ls - si)) ||
	    (strspn(esym + si, digits) != (le - si)) ||
	    ((sn = atoi(ssym + si)) > ((en = atoi(esym + si))))) {
		errf(_("malformed charmap range"));
		goto out;
	}

	/* Cut ssym down to the shared prefix. */
	ssym[si] = 0;
	for (i = sn; i <= en; i++) {
		char *nn = NULL;

		/*
		 * Trust the return value: on failure the contents of nn
		 * are not defined by every implementation of asprintf().
		 */
		if (asprintf(&nn, "%s%0*u", ssym, ls - si,
		    (unsigned int)i) < 0 || nn == NULL) {
			errf(_("out of memory"));
			goto out;
		}

		add_charmap_impl(nn, tmbs, mbs_len, 1);
		(*mb_last)++;
	}

out:
	free(ssym);
	free(esym);
}
428*48edc7cfSGordon Ross
/*
 * Add an entry whose encoding is the single byte c.  Duplicates are
 * not reported (nodups is passed as 0).
 */
void
add_charmap_char(char *name, int c)
{
	char enc[MB_LEN_MAX + 1];
	char *p = enc;

	*p++ = c;
	*p = '\0';
	add_charmap_impl(name, enc, 1, 0);
}
438*48edc7cfSGordon Ross
439*48edc7cfSGordon Ross /*
440*48edc7cfSGordon Ross * POSIX insists that certain entries be present, even when not in the
441*48edc7cfSGordon Ross * orginal charmap file.
442*48edc7cfSGordon Ross */
443*48edc7cfSGordon Ross void
add_charmap_posix(void)444*48edc7cfSGordon Ross add_charmap_posix(void)
445*48edc7cfSGordon Ross {
446*48edc7cfSGordon Ross int i;
447*48edc7cfSGordon Ross
448*48edc7cfSGordon Ross for (i = 0; portable_chars[i].name; i++) {
449*48edc7cfSGordon Ross add_charmap_char(portable_chars[i].name, portable_chars[i].ch);
450*48edc7cfSGordon Ross }
451*48edc7cfSGordon Ross }
452*48edc7cfSGordon Ross
453*48edc7cfSGordon Ross /*
454*48edc7cfSGordon Ross * This is called with a buffer of (typically) MB_LEN_MAX bytes,
455*48edc7cfSGordon Ross * which is potentially a multi-byte symbol, but often contains
456*48edc7cfSGordon Ross * extra bytes. Find and return the longest match in the charmap.
457*48edc7cfSGordon Ross */
458*48edc7cfSGordon Ross static charmap_t *
find_mbs(const char * mbs,int len)459*48edc7cfSGordon Ross find_mbs(const char *mbs, int len)
460*48edc7cfSGordon Ross {
461*48edc7cfSGordon Ross charmap_t srch = {0};
462*48edc7cfSGordon Ross charmap_t *cm = NULL;
463*48edc7cfSGordon Ross
464*48edc7cfSGordon Ross while (len > 0) {
465*48edc7cfSGordon Ross (void) memcpy(srch.cm_frmbs, mbs, len);
466*48edc7cfSGordon Ross srch.cm_frmbs_len = len;
467*48edc7cfSGordon Ross cm = avl_find(&cmap_mbs, &srch, NULL);
468*48edc7cfSGordon Ross if (cm != NULL)
469*48edc7cfSGordon Ross break;
470*48edc7cfSGordon Ross len--;
471*48edc7cfSGordon Ross }
472*48edc7cfSGordon Ross
473*48edc7cfSGordon Ross return (cm);
474*48edc7cfSGordon Ross }
475*48edc7cfSGordon Ross
476*48edc7cfSGordon Ross /*
477*48edc7cfSGordon Ross * Return true if this sequence matches the initial part
478*48edc7cfSGordon Ross * of any sequence known in this charmap.
479*48edc7cfSGordon Ross */
480*48edc7cfSGordon Ross static boolean_t
find_mbs_partial(const char * mbs,int len)481*48edc7cfSGordon Ross find_mbs_partial(const char *mbs, int len)
482*48edc7cfSGordon Ross {
483*48edc7cfSGordon Ross charmap_t srch = {0};
484*48edc7cfSGordon Ross charmap_t *cm;
485*48edc7cfSGordon Ross avl_index_t where;
486*48edc7cfSGordon Ross
487*48edc7cfSGordon Ross (void) memcpy(srch.cm_frmbs, mbs, len);
488*48edc7cfSGordon Ross srch.cm_frmbs_len = len;
489*48edc7cfSGordon Ross cm = avl_find(&cmap_mbs, &srch, &where);
490*48edc7cfSGordon Ross if (cm != NULL) {
491*48edc7cfSGordon Ross /* full match - not expected, but OK */
492*48edc7cfSGordon Ross return (B_TRUE);
493*48edc7cfSGordon Ross }
494*48edc7cfSGordon Ross cm = avl_nearest(&cmap_mbs, where, AVL_AFTER);
495*48edc7cfSGordon Ross if (cm != NULL && 0 == memcmp(cm->cm_frmbs, mbs, len))
496*48edc7cfSGordon Ross return (B_TRUE);
497*48edc7cfSGordon Ross
498*48edc7cfSGordon Ross return (B_FALSE);
499*48edc7cfSGordon Ross }
500*48edc7cfSGordon Ross
501*48edc7cfSGordon Ross /*
502*48edc7cfSGordon Ross * Do like iconv(3), but with charmaps.
503*48edc7cfSGordon Ross */
504*48edc7cfSGordon Ross size_t
cm_iconv(const char ** iptr,size_t * ileft,char ** optr,size_t * oleft)505*48edc7cfSGordon Ross cm_iconv(const char **iptr, size_t *ileft, char **optr, size_t *oleft)
506*48edc7cfSGordon Ross {
507*48edc7cfSGordon Ross charmap_t *cm;
508*48edc7cfSGordon Ross int mbs_len;
509*48edc7cfSGordon Ross
510*48edc7cfSGordon Ross /* Ignore state reset requests. */
511*48edc7cfSGordon Ross if (iptr == NULL || *iptr == NULL)
512*48edc7cfSGordon Ross return (0);
513*48edc7cfSGordon Ross
514*48edc7cfSGordon Ross if (*oleft < MB_LEN_MAX) {
515*48edc7cfSGordon Ross errno = E2BIG;
516*48edc7cfSGordon Ross return ((size_t)-1);
517*48edc7cfSGordon Ross }
518*48edc7cfSGordon Ross
519*48edc7cfSGordon Ross while (*ileft > 0 && *oleft >= MB_LEN_MAX) {
520*48edc7cfSGordon Ross mbs_len = MB_LEN_MAX;
521*48edc7cfSGordon Ross if (mbs_len > *ileft)
522*48edc7cfSGordon Ross mbs_len = *ileft;
523*48edc7cfSGordon Ross cm = find_mbs(*iptr, mbs_len);
524*48edc7cfSGordon Ross if (cm == NULL) {
525*48edc7cfSGordon Ross if (mbs_len < MB_LEN_MAX &&
526*48edc7cfSGordon Ross find_mbs_partial(*iptr, mbs_len)) {
527*48edc7cfSGordon Ross /* incomplete sequence */
528*48edc7cfSGordon Ross errno = EINVAL;
529*48edc7cfSGordon Ross } else {
530*48edc7cfSGordon Ross errno = EILSEQ;
531*48edc7cfSGordon Ross }
532*48edc7cfSGordon Ross return ((size_t)-1);
533*48edc7cfSGordon Ross }
534*48edc7cfSGordon Ross assert(cm->cm_frmbs_len > 0);
535*48edc7cfSGordon Ross if (cm->cm_tombs_len == 0) {
536*48edc7cfSGordon Ross if (sflag == 0 && cm->cm_warned == 0) {
537*48edc7cfSGordon Ross cm->cm_warned = 1;
538*48edc7cfSGordon Ross warn(_("To-map does not encode <%s>\n"),
539*48edc7cfSGordon Ross cm->cm_name);
540*48edc7cfSGordon Ross }
541*48edc7cfSGordon Ross if (cflag == 0) {
542*48edc7cfSGordon Ross errno = EILSEQ;
543*48edc7cfSGordon Ross return ((size_t)-1);
544*48edc7cfSGordon Ross }
545*48edc7cfSGordon Ross /* just skip this input seq. */
546*48edc7cfSGordon Ross *iptr += cm->cm_frmbs_len;
547*48edc7cfSGordon Ross *ileft -= cm->cm_frmbs_len;
548*48edc7cfSGordon Ross continue;
549*48edc7cfSGordon Ross }
550*48edc7cfSGordon Ross
551*48edc7cfSGordon Ross *iptr += cm->cm_frmbs_len;
552*48edc7cfSGordon Ross *ileft -= cm->cm_frmbs_len;
553*48edc7cfSGordon Ross (void) memcpy(*optr, cm->cm_tombs, cm->cm_tombs_len);
554*48edc7cfSGordon Ross *optr += cm->cm_tombs_len;
555*48edc7cfSGordon Ross *oleft -= cm->cm_tombs_len;
556*48edc7cfSGordon Ross }
557*48edc7cfSGordon Ross
558*48edc7cfSGordon Ross return (0);
559*48edc7cfSGordon Ross }
560