/* * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. */ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ /* * CHARMAP file handling for iconv. */ #include #include #include #include #include #include #include #include #include #include #include "charmap.h" #include "parser.tab.h" #include enum cmap_pass cmap_pass; static avl_tree_t cmap_sym; static avl_tree_t cmap_mbs; typedef struct charmap { const char *cm_name; struct charmap *cm_alias_of; avl_node_t cm_avl_sym; avl_node_t cm_avl_mbs; int cm_warned; int cm_frmbs_len; int cm_tombs_len; char cm_frmbs[MB_LEN_MAX + 1]; /* input */ char cm_tombs[MB_LEN_MAX + 1]; /* output */ } charmap_t; static void add_charmap_impl_fr(char *sym, char *mbs, int mbs_len, int nodups); static void add_charmap_impl_to(char *sym, char *mbs, int mbs_len, int nodups); /* * Array of POSIX specific portable characters. */ static const struct { char *name; int ch; } portable_chars[] = { { "NUL", '\0' }, { "alert", '\a' }, { "backspace", '\b' }, { "tab", '\t' }, { "carriage-return", '\r' }, { "newline", '\n' }, { "vertical-tab", '\v' }, { "form-feed", '\f' }, { "space", ' ' }, { "exclamation-mark", '!' }, { "quotation-mark", '"' }, { "number-sign", '#' }, { "dollar-sign", '$' }, { "percent-sign", '%' }, { "ampersand", '&' }, { "apostrophe", '\'' }, { "left-parenthesis", '(' }, { "right-parenthesis", '(' }, { "asterisk", '*' }, { "plus-sign", '+' }, { "comma", ','}, { "hyphen-minus", '-' }, { "hyphen", '-' }, { "full-stop", '.' }, { "period", '.' }, { "slash", '/' }, { "solidus", '/' }, { "zero", '0' }, { "one", '1' }, { "two", '2' }, { "three", '3' }, { "four", '4' }, { "five", '5' }, { "six", '6' }, { "seven", '7' }, { "eight", '8' }, { "nine", '9' }, { "colon", ':' }, { "semicolon", ';' }, { "less-than-sign", '<' }, { "equals-sign", '=' }, { "greater-than-sign", '>' }, { "question-mark", '?' }, { "commercial-at", '@' }, { "left-square-bracket", '[' }, { "backslash", '\\' }, { "reverse-solidus", '\\' }, { "right-square-bracket", ']' }, { "circumflex", '^' }, { "circumflex-accent", '^' }, { "low-line", '_' }, { "underscore", '_' }, { "grave-accent", '`' }, { "left-brace", '{' }, { "left-curly-bracket", '{' }, { "vertical-line", '|' }, { "right-brace", '}' }, { "right-curly-bracket", '}' }, { "tilde", '~' }, { "A", 'A' }, { "B", 'B' }, { "C", 'C' }, { "D", 'D' }, { "E", 'E' }, { "F", 'F' }, { "G", 'G' }, { "H", 'H' }, { "I", 'I' }, { "J", 'J' }, { "K", 'K' }, { "L", 'L' }, { "M", 'M' }, { "N", 'N' }, { "O", 'O' }, { "P", 'P' }, { "Q", 'Q' }, { "R", 'R' }, { "S", 'S' }, { "T", 'T' }, { "U", 'U' }, { "V", 'V' }, { "W", 'W' }, { "X", 'X' }, { "Y", 'Y' }, { "Z", 'Z' }, { "a", 'a' }, { "b", 'b' }, { "c", 'c' }, { "d", 'd' }, { "e", 'e' }, { "f", 'f' }, { "g", 'g' }, { "h", 'h' }, { "i", 'i' }, { "j", 'j' }, { "k", 'k' }, { "l", 'l' }, { "m", 'm' }, { "n", 'n' }, { "o", 'o' }, { "p", 'p' }, { "q", 'q' }, { "r", 'r' }, { "s", 's' }, { "t", 't' }, { "u", 'u' }, { "v", 'v' }, { "w", 'w' }, { "x", 'x' }, { "y", 'y' }, { "z", 'z' }, { NULL, 0 } }; static int cmap_compare_sym(const void *n1, const void *n2) { const charmap_t *c1 = n1; const charmap_t *c2 = n2; int rv; rv = strcmp(c1->cm_name, c2->cm_name); return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); } /* * In order for partial match searches to work, * we need these sorted by mbs contents. */ static int cmap_compare_mbs(const void *n1, const void *n2) { const charmap_t *c1 = n1; const charmap_t *c2 = n2; int len, rv; len = c1->cm_frmbs_len; if (len < c2->cm_frmbs_len) len = c2->cm_frmbs_len; rv = memcmp(c1->cm_frmbs, c2->cm_frmbs, len); if (rv < 0) return (-1); if (rv > 0) return (1); /* they match through length */ if (c1->cm_frmbs_len < c2->cm_frmbs_len) return (-1); if (c2->cm_frmbs_len < c1->cm_frmbs_len) return (1); return (0); } void charmap_init(char *to_map, char *from_map) { avl_create(&cmap_sym, cmap_compare_sym, sizeof (charmap_t), offsetof(charmap_t, cm_avl_sym)); avl_create(&cmap_mbs, cmap_compare_mbs, sizeof (charmap_t), offsetof(charmap_t, cm_avl_mbs)); cmap_pass = CMAP_PASS_FROM; reset_scanner(from_map); (void) yyparse(); add_charmap_posix(); cmap_pass = CMAP_PASS_TO; reset_scanner(to_map); (void) yyparse(); } void charmap_dump() { charmap_t *cm; int i; cm = avl_first(&cmap_mbs); while (cm != NULL) { (void) printf("name=\"%s\"\n", cm->cm_name); (void) printf("\timbs=\""); for (i = 0; i < cm->cm_frmbs_len; i++) (void) printf("\\x%02x", cm->cm_frmbs[i] & 0xFF); (void) printf("\"\n"); (void) printf("\tombs=\""); for (i = 0; i < cm->cm_tombs_len; i++) (void) printf("\\x%02x", cm->cm_tombs[i] & 0xFF); (void) printf("\"\n"); cm = AVL_NEXT(&cmap_mbs, cm); } } /* * We parse two charmap files: First the "from" map, where we build * cmap_mbs and cmap_sym which we'll later use to translate the input * stream (mbs encodings) to symbols. Second, we parse the "to" map, * where we fill in the tombs members of entries in cmap_sym, (which * must alread exist) used later to write the output encoding. */ static void add_charmap_impl(char *sym, char *mbs, int mbs_len, int nodups) { /* * While parsing both the "from" and "to" cmaps, * require both the symbol and encoding. */ if (sym == NULL || mbs == NULL) { errf(_("invalid charmap entry")); return; } switch (cmap_pass) { case CMAP_PASS_FROM: add_charmap_impl_fr(sym, mbs, mbs_len, nodups); break; case CMAP_PASS_TO: add_charmap_impl_to(sym, mbs, mbs_len, nodups); break; default: abort(); break; } } static void add_charmap_impl_fr(char *sym, char *mbs, int mbs_len, int nodups) { charmap_t *m, *n, *s; avl_index_t where_sym, where_mbs; if ((n = calloc(1, sizeof (*n))) == NULL) { errf(_("out of memory")); return; } n->cm_name = sym; assert(0 < mbs_len && mbs_len <= MB_LEN_MAX); (void) memcpy(n->cm_frmbs, mbs, mbs_len); n->cm_frmbs_len = mbs_len; m = avl_find(&cmap_mbs, n, &where_mbs); s = avl_find(&cmap_sym, n, &where_sym); /* * If we found the symbol, this is a dup. */ if (s != NULL) { if (nodups) { warn(_("%s: duplicate character symbol"), sym); } free(n); return; } /* * If we found the mbs, the new one is an alias, * which we'll add _only_ to the symbol AVL. */ if (m != NULL) { /* The new one is an alias of the original. */ n->cm_alias_of = m; avl_insert(&cmap_sym, n, where_sym); return; } avl_insert(&cmap_sym, n, where_sym); avl_insert(&cmap_mbs, n, where_mbs); } static void add_charmap_impl_to(char *sym, char *mbs, int mbs_len, int nodups) { charmap_t srch = {0}; charmap_t *m; assert(0 < mbs_len && mbs_len <= MB_LEN_MAX); srch.cm_name = sym; m = avl_find(&cmap_sym, &srch, NULL); if (m == NULL) { if (sflag == 0) warn(_("%s: symbol not found"), sym); return; } if (m->cm_alias_of != NULL) { m = m->cm_alias_of; /* don't warn for dups with aliases */ if (m->cm_tombs_len != 0) return; } if (m->cm_tombs_len != 0) { if (nodups) { warn(_("%s: duplicate encoding for"), sym); } return; } (void) memcpy(m->cm_tombs, mbs, mbs_len); m->cm_tombs_len = mbs_len; } void add_charmap(char *sym, char *mbs) { /* mbs[0] is the length */ int mbs_len = *mbs++; assert(0 < mbs_len && mbs_len <= MB_LEN_MAX); add_charmap_impl(sym, mbs, mbs_len, 1); } /* * This is called by the parser with start/end symbol strings (ssym, esym), * which are allocated in the scanner (T_SYMBOL) and free'd here. */ void add_charmap_range(char *ssym, char *esym, char *mbs) { int ls, le; int si; int sn, en; int i; int mbs_len; char tmbs[MB_LEN_MAX+1]; char *mb_last; static const char *digits = "0123456789"; /* mbs[0] is the length */ mbs_len = *mbs++; assert(0 < mbs_len && mbs_len <= MB_LEN_MAX); (void) memcpy(tmbs, mbs, mbs_len); mb_last = tmbs + mbs_len - 1; ls = strlen(ssym); le = strlen(esym); if (((si = strcspn(ssym, digits)) == 0) || (si == ls) || (strncmp(ssym, esym, si) != 0) || (strspn(ssym + si, digits) != (ls - si)) || (strspn(esym + si, digits) != (le - si)) || ((sn = atoi(ssym + si)) > ((en = atoi(esym + si))))) { errf(_("malformed charmap range")); return; } ssym[si] = 0; for (i = sn; i <= en; i++) { char *nn; (void) asprintf(&nn, "%s%0*u", ssym, ls - si, i); if (nn == NULL) { errf(_("out of memory")); return; } add_charmap_impl(nn, tmbs, mbs_len, 1); (*mb_last)++; } free(ssym); free(esym); } void add_charmap_char(char *name, int c) { char mbs[MB_LEN_MAX+1]; mbs[0] = c; mbs[1] = '\0'; add_charmap_impl(name, mbs, 1, 0); } /* * POSIX insists that certain entries be present, even when not in the * orginal charmap file. */ void add_charmap_posix(void) { int i; for (i = 0; portable_chars[i].name; i++) { add_charmap_char(portable_chars[i].name, portable_chars[i].ch); } } /* * This is called with a buffer of (typically) MB_LEN_MAX bytes, * which is potentially a multi-byte symbol, but often contains * extra bytes. Find and return the longest match in the charmap. */ static charmap_t * find_mbs(const char *mbs, int len) { charmap_t srch = {0}; charmap_t *cm = NULL; while (len > 0) { (void) memcpy(srch.cm_frmbs, mbs, len); srch.cm_frmbs_len = len; cm = avl_find(&cmap_mbs, &srch, NULL); if (cm != NULL) break; len--; } return (cm); } /* * Return true if this sequence matches the initial part * of any sequence known in this charmap. */ static boolean_t find_mbs_partial(const char *mbs, int len) { charmap_t srch = {0}; charmap_t *cm; avl_index_t where; (void) memcpy(srch.cm_frmbs, mbs, len); srch.cm_frmbs_len = len; cm = avl_find(&cmap_mbs, &srch, &where); if (cm != NULL) { /* full match - not expected, but OK */ return (B_TRUE); } cm = avl_nearest(&cmap_mbs, where, AVL_AFTER); if (cm != NULL && 0 == memcmp(cm->cm_frmbs, mbs, len)) return (B_TRUE); return (B_FALSE); } /* * Do like iconv(3), but with charmaps. */ size_t cm_iconv(const char **iptr, size_t *ileft, char **optr, size_t *oleft) { charmap_t *cm; int mbs_len; /* Ignore state reset requests. */ if (iptr == NULL || *iptr == NULL) return (0); if (*oleft < MB_LEN_MAX) { errno = E2BIG; return ((size_t)-1); } while (*ileft > 0 && *oleft >= MB_LEN_MAX) { mbs_len = MB_LEN_MAX; if (mbs_len > *ileft) mbs_len = *ileft; cm = find_mbs(*iptr, mbs_len); if (cm == NULL) { if (mbs_len < MB_LEN_MAX && find_mbs_partial(*iptr, mbs_len)) { /* incomplete sequence */ errno = EINVAL; } else { errno = EILSEQ; } return ((size_t)-1); } assert(cm->cm_frmbs_len > 0); if (cm->cm_tombs_len == 0) { if (sflag == 0 && cm->cm_warned == 0) { cm->cm_warned = 1; warn(_("To-map does not encode <%s>\n"), cm->cm_name); } if (cflag == 0) { errno = EILSEQ; return ((size_t)-1); } /* just skip this input seq. */ *iptr += cm->cm_frmbs_len; *ileft -= cm->cm_frmbs_len; continue; } *iptr += cm->cm_frmbs_len; *ileft -= cm->cm_frmbs_len; (void) memcpy(*optr, cm->cm_tombs, cm->cm_tombs_len); *optr += cm->cm_tombs_len; *oleft -= cm->cm_tombs_len; } return (0); }