/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
219c9af259SGordon Ross
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
269c9af259SGordon Ross
/*
 * Unicode conversions (yet more)
 */
309c9af259SGordon Ross
319c9af259SGordon Ross #include <stdio.h>
329c9af259SGordon Ross #include <stdlib.h>
339c9af259SGordon Ross #include <string.h>
349c9af259SGordon Ross #include <errno.h>
359c9af259SGordon Ross #include <iconv.h>
369c9af259SGordon Ross #include <libintl.h>
379c9af259SGordon Ross
389c9af259SGordon Ross #include <sys/u8_textprep.h>
399c9af259SGordon Ross
409c9af259SGordon Ross #include <netsmb/smb_lib.h>
419c9af259SGordon Ross #include "charsets.h"
429c9af259SGordon Ross
439c9af259SGordon Ross
/*
 * Count the 16-bit symbols in a Unicode string, stopping at (and
 * not counting) the 16-bit zero terminator.  The storage size in
 * bytes, excluding the terminator, is twice the returned count.
 */
size_t
unicode_strlen(const uint16_t *us)
{
	const uint16_t *end;

	/* Walk forward to the terminator, then measure the distance. */
	for (end = us; *end != 0; end++)
		;

	return ((size_t)(end - us));
}
579c9af259SGordon Ross
static char *convert_ucs2xx_to_utf8(iconv_t, const uint16_t *);

/*
 * Convert (native) Unicode string to UTF-8.
 *
 * The iconv descriptor is opened on the first call and kept in a
 * function-local static for later calls.  If iconv_open() fails the
 * descriptor stays (iconv_t)-1 and is handed to
 * convert_ucs2xx_to_utf8() as is.
 *
 * Returns allocated memory (whatever convert_ucs2xx_to_utf8() returns).
 */
char *
convert_unicode_to_utf8(uint16_t *us)
{
	static iconv_t ucs2_to_utf8 = (iconv_t)-1;

	if (ucs2_to_utf8 == (iconv_t)-1) {
		/* iconv_open() arguments are (to, from) */
		ucs2_to_utf8 = iconv_open("UTF-8", "UCS-2");
	}

	return (convert_ucs2xx_to_utf8(ucs2_to_utf8, us));
}
759c9af259SGordon Ross
/*
 * Convert little-endian Unicode string to UTF-8.
 *
 * The iconv descriptor is opened on the first call and kept in a
 * function-local static for later calls.  If iconv_open() fails the
 * descriptor stays (iconv_t)-1 and is handed to
 * convert_ucs2xx_to_utf8() as is.
 *
 * Returns allocated memory (whatever convert_ucs2xx_to_utf8() returns).
 */
char *
convert_leunicode_to_utf8(unsigned short *us)
{
	static iconv_t ucs2le_to_utf8 = (iconv_t)-1;

	if (ucs2le_to_utf8 == (iconv_t)-1) {
		/* iconv_open() arguments are (to, from) */
		ucs2le_to_utf8 = iconv_open("UTF-8", "UCS-2LE");
	}

	return (convert_ucs2xx_to_utf8(ucs2le_to_utf8, us));
}
919c9af259SGordon Ross
929c9af259SGordon Ross static char *
convert_ucs2xx_to_utf8(iconv_t cd,const uint16_t * us)939c9af259SGordon Ross convert_ucs2xx_to_utf8(iconv_t cd, const uint16_t *us)
949c9af259SGordon Ross {
959c9af259SGordon Ross char *obuf, *optr;
969c9af259SGordon Ross const char *iptr;
979c9af259SGordon Ross size_t ileft, obsize, oleft, ret;
989c9af259SGordon Ross
999c9af259SGordon Ross if (cd == (iconv_t)-1) {
1009c9af259SGordon Ross smb_error(dgettext(TEXT_DOMAIN,
1019c9af259SGordon Ross "iconv_open(UTF-8/UCS-2)"), -1);
1029c9af259SGordon Ross return (NULL);
1039c9af259SGordon Ross }
1049c9af259SGordon Ross
1059c9af259SGordon Ross iptr = (const char *)us;
1069c9af259SGordon Ross ileft = unicode_strlen(us);
1079c9af259SGordon Ross ileft *= 2; /* now bytes */
1089c9af259SGordon Ross
1099c9af259SGordon Ross /* Worst-case output size is 2x input size. */
1109c9af259SGordon Ross oleft = ileft * 2;
1119c9af259SGordon Ross obsize = oleft + 2; /* room for null */
1129c9af259SGordon Ross obuf = malloc(obsize);
1139c9af259SGordon Ross if (!obuf)
1149c9af259SGordon Ross return (NULL);
1159c9af259SGordon Ross optr = obuf;
1169c9af259SGordon Ross
1179c9af259SGordon Ross ret = iconv(cd, &iptr, &ileft, &optr, &oleft);
1189c9af259SGordon Ross *optr = '\0';
1199c9af259SGordon Ross if (ret == (size_t)-1) {
1209c9af259SGordon Ross smb_error(dgettext(TEXT_DOMAIN,
1219c9af259SGordon Ross "iconv(%s) failed"), errno, obuf);
1229c9af259SGordon Ross }
1239c9af259SGordon Ross if (ileft) {
1249c9af259SGordon Ross smb_error(dgettext(TEXT_DOMAIN,
1259c9af259SGordon Ross "iconv(%s) failed"), -1, obuf);
1269c9af259SGordon Ross /*
1279c9af259SGordon Ross * XXX: What's better? return NULL?
1289c9af259SGordon Ross * The truncated string? << for now
1299c9af259SGordon Ross */
1309c9af259SGordon Ross }
1319c9af259SGordon Ross
1329c9af259SGordon Ross return (obuf);
1339c9af259SGordon Ross }
1349c9af259SGordon Ross
static uint16_t *convert_utf8_to_ucs2xx(iconv_t, const char *);

/*
 * Convert UTF-8 string to Unicode.
 *
 * The iconv descriptor is opened on the first call and kept in a
 * function-local static for later calls.  If iconv_open() fails the
 * descriptor stays (iconv_t)-1 and is handed to
 * convert_utf8_to_ucs2xx() as is.
 *
 * Returns allocated memory (whatever convert_utf8_to_ucs2xx() returns).
 */
uint16_t *
convert_utf8_to_unicode(const char *utf8_string)
{
	static iconv_t utf8_to_ucs2 = (iconv_t)-1;

	if (utf8_to_ucs2 == (iconv_t)-1) {
		/* iconv_open() arguments are (to, from) */
		utf8_to_ucs2 = iconv_open("UCS-2", "UTF-8");
	}

	return (convert_utf8_to_ucs2xx(utf8_to_ucs2, utf8_string));
}
1519c9af259SGordon Ross
/*
 * Convert UTF-8 string to little-endian Unicode.
 *
 * The iconv descriptor is opened on the first call and kept in a
 * function-local static for later calls.  If iconv_open() fails the
 * descriptor stays (iconv_t)-1 and is handed to
 * convert_utf8_to_ucs2xx() as is.
 *
 * Returns allocated memory (whatever convert_utf8_to_ucs2xx() returns).
 */
uint16_t *
convert_utf8_to_leunicode(const char *utf8_string)
{
	static iconv_t utf8_to_ucs2le = (iconv_t)-1;

	if (utf8_to_ucs2le == (iconv_t)-1) {
		/* iconv_open() arguments are (to, from) */
		utf8_to_ucs2le = iconv_open("UCS-2LE", "UTF-8");
	}

	return (convert_utf8_to_ucs2xx(utf8_to_ucs2le, utf8_string));
}
1669c9af259SGordon Ross
1679c9af259SGordon Ross static uint16_t *
convert_utf8_to_ucs2xx(iconv_t cd,const char * utf8_string)1689c9af259SGordon Ross convert_utf8_to_ucs2xx(iconv_t cd, const char *utf8_string)
1699c9af259SGordon Ross {
1709c9af259SGordon Ross uint16_t *obuf, *optr;
1719c9af259SGordon Ross const char *iptr;
1729c9af259SGordon Ross size_t ileft, obsize, oleft, ret;
1739c9af259SGordon Ross
1749c9af259SGordon Ross if (cd == (iconv_t)-1) {
1759c9af259SGordon Ross smb_error(dgettext(TEXT_DOMAIN,
1769c9af259SGordon Ross "iconv_open(UCS-2/UTF-8)"), -1);
1779c9af259SGordon Ross return (NULL);
1789c9af259SGordon Ross }
1799c9af259SGordon Ross
1809c9af259SGordon Ross iptr = utf8_string;
1819c9af259SGordon Ross ileft = strlen(iptr);
1829c9af259SGordon Ross
1839c9af259SGordon Ross /* Worst-case output size is 2x input size. */
1849c9af259SGordon Ross oleft = ileft * 2;
1859c9af259SGordon Ross obsize = oleft + 2; /* room for null */
1869c9af259SGordon Ross obuf = malloc(obsize);
1879c9af259SGordon Ross if (!obuf)
1889c9af259SGordon Ross return (NULL);
1899c9af259SGordon Ross optr = obuf;
1909c9af259SGordon Ross
1919c9af259SGordon Ross ret = iconv(cd, &iptr, &ileft, (char **)&optr, &oleft);
1929c9af259SGordon Ross *optr = '\0';
1939c9af259SGordon Ross if (ret == (size_t)-1) {
1949c9af259SGordon Ross smb_error(dgettext(TEXT_DOMAIN,
1959c9af259SGordon Ross "iconv(%s) failed"), errno, utf8_string);
1969c9af259SGordon Ross }
1979c9af259SGordon Ross if (ileft) {
1989c9af259SGordon Ross smb_error(dgettext(TEXT_DOMAIN,
1999c9af259SGordon Ross "iconv(%s) failed"), -1, utf8_string);
2009c9af259SGordon Ross /*
2019c9af259SGordon Ross * XXX: What's better? return NULL?
2029c9af259SGordon Ross * The truncated string? << for now
2039c9af259SGordon Ross */
2049c9af259SGordon Ross }
2059c9af259SGordon Ross
2069c9af259SGordon Ross return (obuf);
2079c9af259SGordon Ross }
208*613a2f6bSGordon Ross
209*613a2f6bSGordon Ross
210*613a2f6bSGordon Ross /*
211*613a2f6bSGordon Ross * A simple wrapper around u8_textprep_str() that returns the Unicode
212*613a2f6bSGordon Ross * upper-case version of some string. Returns memory from malloc.
213*613a2f6bSGordon Ross * Borrowed from idmapd.
214*613a2f6bSGordon Ross */
215*613a2f6bSGordon Ross static char *
utf8_str_to_upper_or_lower(const char * s,int upper_lower)216*613a2f6bSGordon Ross utf8_str_to_upper_or_lower(const char *s, int upper_lower)
217*613a2f6bSGordon Ross {
218*613a2f6bSGordon Ross char *res = NULL;
219*613a2f6bSGordon Ross char *outs;
220*613a2f6bSGordon Ross size_t inlen, outlen, inbleft, outbleft;
221*613a2f6bSGordon Ross int rc, err;
222*613a2f6bSGordon Ross
223*613a2f6bSGordon Ross /*
224*613a2f6bSGordon Ross * u8_textprep_str() does not allocate memory. The input and
225*613a2f6bSGordon Ross * output buffers may differ in size (though that would be more
226*613a2f6bSGordon Ross * likely when normalization is done). We have to loop over it...
227*613a2f6bSGordon Ross *
228*613a2f6bSGordon Ross * To improve the chances that we can avoid looping we add 10
229*613a2f6bSGordon Ross * bytes of output buffer room the first go around.
230*613a2f6bSGordon Ross */
231*613a2f6bSGordon Ross inlen = inbleft = strlen(s);
232*613a2f6bSGordon Ross outlen = outbleft = inlen + 10;
233*613a2f6bSGordon Ross if ((res = malloc(outlen)) == NULL)
234*613a2f6bSGordon Ross return (NULL);
235*613a2f6bSGordon Ross outs = res;
236*613a2f6bSGordon Ross
237*613a2f6bSGordon Ross while ((rc = u8_textprep_str((char *)s, &inbleft, outs,
238*613a2f6bSGordon Ross &outbleft, upper_lower, U8_UNICODE_LATEST, &err)) < 0 &&
239*613a2f6bSGordon Ross err == E2BIG) {
240*613a2f6bSGordon Ross if ((res = realloc(res, outlen + inbleft)) == NULL)
241*613a2f6bSGordon Ross return (NULL);
242*613a2f6bSGordon Ross /* adjust input/output buffer pointers */
243*613a2f6bSGordon Ross s += (inlen - inbleft);
244*613a2f6bSGordon Ross outs = res + outlen - outbleft;
245*613a2f6bSGordon Ross /* adjust outbleft and outlen */
246*613a2f6bSGordon Ross outlen += inbleft;
247*613a2f6bSGordon Ross outbleft += inbleft;
248*613a2f6bSGordon Ross }
249*613a2f6bSGordon Ross
250*613a2f6bSGordon Ross if (rc < 0) {
251*613a2f6bSGordon Ross free(res);
252*613a2f6bSGordon Ross res = NULL;
253*613a2f6bSGordon Ross return (NULL);
254*613a2f6bSGordon Ross }
255*613a2f6bSGordon Ross
256*613a2f6bSGordon Ross res[outlen - outbleft] = '\0';
257*613a2f6bSGordon Ross
258*613a2f6bSGordon Ross return (res);
259*613a2f6bSGordon Ross }
260*613a2f6bSGordon Ross
261*613a2f6bSGordon Ross char *
utf8_str_toupper(const char * s)262*613a2f6bSGordon Ross utf8_str_toupper(const char *s)
263*613a2f6bSGordon Ross {
264*613a2f6bSGordon Ross return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOUPPER));
265*613a2f6bSGordon Ross }
266*613a2f6bSGordon Ross
267*613a2f6bSGordon Ross char *
utf8_str_tolower(const char * s)268*613a2f6bSGordon Ross utf8_str_tolower(const char *s)
269*613a2f6bSGordon Ross {
270*613a2f6bSGordon Ross return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOLOWER));
271*613a2f6bSGordon Ross }
272