xref: /illumos-gate/usr/src/lib/libsmbfs/smb/utf_str.c (revision 613a2f6b)
19c9af259SGordon Ross /*
29c9af259SGordon Ross  * CDDL HEADER START
39c9af259SGordon Ross  *
49c9af259SGordon Ross  * The contents of this file are subject to the terms of the
59c9af259SGordon Ross  * Common Development and Distribution License (the "License").
69c9af259SGordon Ross  * You may not use this file except in compliance with the License.
79c9af259SGordon Ross  *
89c9af259SGordon Ross  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
99c9af259SGordon Ross  * or http://www.opensolaris.org/os/licensing.
109c9af259SGordon Ross  * See the License for the specific language governing permissions
119c9af259SGordon Ross  * and limitations under the License.
129c9af259SGordon Ross  *
139c9af259SGordon Ross  * When distributing Covered Code, include this CDDL HEADER in each
149c9af259SGordon Ross  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
159c9af259SGordon Ross  * If applicable, add the following below this CDDL HEADER, with the
169c9af259SGordon Ross  * fields enclosed by brackets "[]" replaced with your own identifying
179c9af259SGordon Ross  * information: Portions Copyright [yyyy] [name of copyright owner]
189c9af259SGordon Ross  *
199c9af259SGordon Ross  * CDDL HEADER END
209c9af259SGordon Ross  */
219c9af259SGordon Ross 
229c9af259SGordon Ross /*
23*613a2f6bSGordon Ross  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
249c9af259SGordon Ross  * Use is subject to license terms.
259c9af259SGordon Ross  */
269c9af259SGordon Ross 
279c9af259SGordon Ross /*
289c9af259SGordon Ross  * Unicode conversions (yet more)
299c9af259SGordon Ross  */
309c9af259SGordon Ross 
319c9af259SGordon Ross #include <stdio.h>
329c9af259SGordon Ross #include <stdlib.h>
339c9af259SGordon Ross #include <string.h>
349c9af259SGordon Ross #include <errno.h>
359c9af259SGordon Ross #include <iconv.h>
369c9af259SGordon Ross #include <libintl.h>
379c9af259SGordon Ross 
389c9af259SGordon Ross #include <sys/u8_textprep.h>
399c9af259SGordon Ross 
409c9af259SGordon Ross #include <netsmb/smb_lib.h>
419c9af259SGordon Ross #include "charsets.h"
429c9af259SGordon Ross 
439c9af259SGordon Ross 
/*
 * Count the 16-bit units in a Unicode string, stopping at (and not
 * counting) the 16-bit zero terminator.  Multiply the result by two
 * to get the storage size in bytes, excluding the terminator.
 *
 * A NULL pointer is treated as an empty string and yields 0.
 */
size_t
unicode_strlen(const uint16_t *us)
{
	size_t len = 0;

	if (us == NULL)
		return (0);

	while (us[len] != 0)
		len++;

	return (len);
}
579c9af259SGordon Ross 
589c9af259SGordon Ross static char *convert_ucs2xx_to_utf8(iconv_t, const uint16_t *);
599c9af259SGordon Ross 
/*
 * Convert a Unicode string in the "UCS-2" iconv encoding (no byte
 * order given in the encoding name) to UTF-8.
 *
 * The iconv descriptor is opened on the first call and kept in a
 * function-local static for later calls; if the open failed, it is
 * attempted again on the next call.  The conversion itself, including
 * the handling of a failed open, is done by convert_ucs2xx_to_utf8().
 *
 * Returns allocated memory, or NULL on failure.
 */
char *
convert_unicode_to_utf8(uint16_t *us)
{
	static iconv_t ucs2_to_utf8 = (iconv_t)-1;

	if (ucs2_to_utf8 == (iconv_t)-1) {
		/* iconv_open() takes (to, from) */
		ucs2_to_utf8 = iconv_open("UTF-8", "UCS-2");
	}

	return (convert_ucs2xx_to_utf8(ucs2_to_utf8, us));
}
759c9af259SGordon Ross 
/*
 * Convert a little-endian Unicode string ("UCS-2LE" iconv encoding)
 * to UTF-8.
 *
 * The iconv descriptor is opened on the first call and kept in a
 * function-local static for later calls; if the open failed, it is
 * attempted again on the next call.  The conversion itself, including
 * the handling of a failed open, is done by convert_ucs2xx_to_utf8().
 *
 * Returns allocated memory, or NULL on failure.
 */
char *
convert_leunicode_to_utf8(unsigned short *us)
{
	static iconv_t ucs2le_to_utf8 = (iconv_t)-1;

	if (ucs2le_to_utf8 == (iconv_t)-1) {
		/* iconv_open() takes (to, from) */
		ucs2le_to_utf8 = iconv_open("UTF-8", "UCS-2LE");
	}

	return (convert_ucs2xx_to_utf8(ucs2le_to_utf8, us));
}
919c9af259SGordon Ross 
929c9af259SGordon Ross static char *
convert_ucs2xx_to_utf8(iconv_t cd,const uint16_t * us)939c9af259SGordon Ross convert_ucs2xx_to_utf8(iconv_t cd, const uint16_t *us)
949c9af259SGordon Ross {
959c9af259SGordon Ross 	char *obuf, *optr;
969c9af259SGordon Ross 	const char *iptr;
979c9af259SGordon Ross 	size_t  ileft, obsize, oleft, ret;
989c9af259SGordon Ross 
999c9af259SGordon Ross 	if (cd == (iconv_t)-1) {
1009c9af259SGordon Ross 		smb_error(dgettext(TEXT_DOMAIN,
1019c9af259SGordon Ross 		    "iconv_open(UTF-8/UCS-2)"), -1);
1029c9af259SGordon Ross 		return (NULL);
1039c9af259SGordon Ross 	}
1049c9af259SGordon Ross 
1059c9af259SGordon Ross 	iptr = (const char *)us;
1069c9af259SGordon Ross 	ileft = unicode_strlen(us);
1079c9af259SGordon Ross 	ileft *= 2; /* now bytes */
1089c9af259SGordon Ross 
1099c9af259SGordon Ross 	/* Worst-case output size is 2x input size. */
1109c9af259SGordon Ross 	oleft = ileft * 2;
1119c9af259SGordon Ross 	obsize = oleft + 2; /* room for null */
1129c9af259SGordon Ross 	obuf = malloc(obsize);
1139c9af259SGordon Ross 	if (!obuf)
1149c9af259SGordon Ross 		return (NULL);
1159c9af259SGordon Ross 	optr = obuf;
1169c9af259SGordon Ross 
1179c9af259SGordon Ross 	ret = iconv(cd, &iptr, &ileft, &optr, &oleft);
1189c9af259SGordon Ross 	*optr = '\0';
1199c9af259SGordon Ross 	if (ret == (size_t)-1) {
1209c9af259SGordon Ross 		smb_error(dgettext(TEXT_DOMAIN,
1219c9af259SGordon Ross 		    "iconv(%s) failed"), errno, obuf);
1229c9af259SGordon Ross 	}
1239c9af259SGordon Ross 	if (ileft) {
1249c9af259SGordon Ross 		smb_error(dgettext(TEXT_DOMAIN,
1259c9af259SGordon Ross 		    "iconv(%s) failed"), -1, obuf);
1269c9af259SGordon Ross 		/*
1279c9af259SGordon Ross 		 * XXX: What's better?  return NULL?
1289c9af259SGordon Ross 		 * The truncated string? << for now
1299c9af259SGordon Ross 		 */
1309c9af259SGordon Ross 	}
1319c9af259SGordon Ross 
1329c9af259SGordon Ross 	return (obuf);
1339c9af259SGordon Ross }
1349c9af259SGordon Ross 
1359c9af259SGordon Ross static uint16_t *convert_utf8_to_ucs2xx(iconv_t, const char *);
1369c9af259SGordon Ross 
/*
 * Convert a UTF-8 string to Unicode in the "UCS-2" iconv encoding
 * (no byte order given in the encoding name).
 *
 * The iconv descriptor is opened on the first call and kept in a
 * function-local static for later calls; if the open failed, it is
 * attempted again on the next call.  The conversion itself, including
 * the handling of a failed open, is done by convert_utf8_to_ucs2xx().
 *
 * Returns allocated memory, or NULL on failure.
 */
uint16_t *
convert_utf8_to_unicode(const char *utf8_string)
{
	static iconv_t utf8_to_ucs2 = (iconv_t)-1;

	if (utf8_to_ucs2 == (iconv_t)-1) {
		/* iconv_open() takes (to, from) */
		utf8_to_ucs2 = iconv_open("UCS-2", "UTF-8");
	}

	return (convert_utf8_to_ucs2xx(utf8_to_ucs2, utf8_string));
}
1519c9af259SGordon Ross 
/*
 * Convert a UTF-8 string to little-endian Unicode ("UCS-2LE" iconv
 * encoding).
 *
 * The iconv descriptor is opened on the first call and kept in a
 * function-local static for later calls; if the open failed, it is
 * attempted again on the next call.  The conversion itself, including
 * the handling of a failed open, is done by convert_utf8_to_ucs2xx().
 *
 * Returns allocated memory, or NULL on failure.
 */
uint16_t *
convert_utf8_to_leunicode(const char *utf8_string)
{
	static iconv_t utf8_to_ucs2le = (iconv_t)-1;

	if (utf8_to_ucs2le == (iconv_t)-1) {
		/* iconv_open() takes (to, from) */
		utf8_to_ucs2le = iconv_open("UCS-2LE", "UTF-8");
	}

	return (convert_utf8_to_ucs2xx(utf8_to_ucs2le, utf8_string));
}
1669c9af259SGordon Ross 
1679c9af259SGordon Ross static uint16_t *
convert_utf8_to_ucs2xx(iconv_t cd,const char * utf8_string)1689c9af259SGordon Ross convert_utf8_to_ucs2xx(iconv_t cd, const char *utf8_string)
1699c9af259SGordon Ross {
1709c9af259SGordon Ross 	uint16_t *obuf, *optr;
1719c9af259SGordon Ross 	const char *iptr;
1729c9af259SGordon Ross 	size_t  ileft, obsize, oleft, ret;
1739c9af259SGordon Ross 
1749c9af259SGordon Ross 	if (cd == (iconv_t)-1) {
1759c9af259SGordon Ross 		smb_error(dgettext(TEXT_DOMAIN,
1769c9af259SGordon Ross 		    "iconv_open(UCS-2/UTF-8)"), -1);
1779c9af259SGordon Ross 		return (NULL);
1789c9af259SGordon Ross 	}
1799c9af259SGordon Ross 
1809c9af259SGordon Ross 	iptr = utf8_string;
1819c9af259SGordon Ross 	ileft = strlen(iptr);
1829c9af259SGordon Ross 
1839c9af259SGordon Ross 	/* Worst-case output size is 2x input size. */
1849c9af259SGordon Ross 	oleft = ileft * 2;
1859c9af259SGordon Ross 	obsize = oleft + 2; /* room for null */
1869c9af259SGordon Ross 	obuf = malloc(obsize);
1879c9af259SGordon Ross 	if (!obuf)
1889c9af259SGordon Ross 		return (NULL);
1899c9af259SGordon Ross 	optr = obuf;
1909c9af259SGordon Ross 
1919c9af259SGordon Ross 	ret = iconv(cd, &iptr, &ileft, (char **)&optr, &oleft);
1929c9af259SGordon Ross 	*optr = '\0';
1939c9af259SGordon Ross 	if (ret == (size_t)-1) {
1949c9af259SGordon Ross 		smb_error(dgettext(TEXT_DOMAIN,
1959c9af259SGordon Ross 		    "iconv(%s) failed"), errno, utf8_string);
1969c9af259SGordon Ross 	}
1979c9af259SGordon Ross 	if (ileft) {
1989c9af259SGordon Ross 		smb_error(dgettext(TEXT_DOMAIN,
1999c9af259SGordon Ross 		    "iconv(%s) failed"), -1, utf8_string);
2009c9af259SGordon Ross 		/*
2019c9af259SGordon Ross 		 * XXX: What's better?  return NULL?
2029c9af259SGordon Ross 		 * The truncated string? << for now
2039c9af259SGordon Ross 		 */
2049c9af259SGordon Ross 	}
2059c9af259SGordon Ross 
2069c9af259SGordon Ross 	return (obuf);
2079c9af259SGordon Ross }
208*613a2f6bSGordon Ross 
209*613a2f6bSGordon Ross 
210*613a2f6bSGordon Ross /*
211*613a2f6bSGordon Ross  * A simple wrapper around u8_textprep_str() that returns the Unicode
212*613a2f6bSGordon Ross  * upper-case version of some string.  Returns memory from malloc.
213*613a2f6bSGordon Ross  * Borrowed from idmapd.
214*613a2f6bSGordon Ross  */
215*613a2f6bSGordon Ross static char *
utf8_str_to_upper_or_lower(const char * s,int upper_lower)216*613a2f6bSGordon Ross utf8_str_to_upper_or_lower(const char *s, int upper_lower)
217*613a2f6bSGordon Ross {
218*613a2f6bSGordon Ross 	char *res = NULL;
219*613a2f6bSGordon Ross 	char *outs;
220*613a2f6bSGordon Ross 	size_t inlen, outlen, inbleft, outbleft;
221*613a2f6bSGordon Ross 	int rc, err;
222*613a2f6bSGordon Ross 
223*613a2f6bSGordon Ross 	/*
224*613a2f6bSGordon Ross 	 * u8_textprep_str() does not allocate memory.  The input and
225*613a2f6bSGordon Ross 	 * output buffers may differ in size (though that would be more
226*613a2f6bSGordon Ross 	 * likely when normalization is done).  We have to loop over it...
227*613a2f6bSGordon Ross 	 *
228*613a2f6bSGordon Ross 	 * To improve the chances that we can avoid looping we add 10
229*613a2f6bSGordon Ross 	 * bytes of output buffer room the first go around.
230*613a2f6bSGordon Ross 	 */
231*613a2f6bSGordon Ross 	inlen = inbleft = strlen(s);
232*613a2f6bSGordon Ross 	outlen = outbleft = inlen + 10;
233*613a2f6bSGordon Ross 	if ((res = malloc(outlen)) == NULL)
234*613a2f6bSGordon Ross 		return (NULL);
235*613a2f6bSGordon Ross 	outs = res;
236*613a2f6bSGordon Ross 
237*613a2f6bSGordon Ross 	while ((rc = u8_textprep_str((char *)s, &inbleft, outs,
238*613a2f6bSGordon Ross 	    &outbleft, upper_lower, U8_UNICODE_LATEST, &err)) < 0 &&
239*613a2f6bSGordon Ross 	    err == E2BIG) {
240*613a2f6bSGordon Ross 		if ((res = realloc(res, outlen + inbleft)) == NULL)
241*613a2f6bSGordon Ross 			return (NULL);
242*613a2f6bSGordon Ross 		/* adjust input/output buffer pointers */
243*613a2f6bSGordon Ross 		s += (inlen - inbleft);
244*613a2f6bSGordon Ross 		outs = res + outlen - outbleft;
245*613a2f6bSGordon Ross 		/* adjust outbleft and outlen */
246*613a2f6bSGordon Ross 		outlen += inbleft;
247*613a2f6bSGordon Ross 		outbleft += inbleft;
248*613a2f6bSGordon Ross 	}
249*613a2f6bSGordon Ross 
250*613a2f6bSGordon Ross 	if (rc < 0) {
251*613a2f6bSGordon Ross 		free(res);
252*613a2f6bSGordon Ross 		res = NULL;
253*613a2f6bSGordon Ross 		return (NULL);
254*613a2f6bSGordon Ross 	}
255*613a2f6bSGordon Ross 
256*613a2f6bSGordon Ross 	res[outlen - outbleft] = '\0';
257*613a2f6bSGordon Ross 
258*613a2f6bSGordon Ross 	return (res);
259*613a2f6bSGordon Ross }
260*613a2f6bSGordon Ross 
261*613a2f6bSGordon Ross char *
utf8_str_toupper(const char * s)262*613a2f6bSGordon Ross utf8_str_toupper(const char *s)
263*613a2f6bSGordon Ross {
264*613a2f6bSGordon Ross 	return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOUPPER));
265*613a2f6bSGordon Ross }
266*613a2f6bSGordon Ross 
267*613a2f6bSGordon Ross char *
utf8_str_tolower(const char * s)268*613a2f6bSGordon Ross utf8_str_tolower(const char *s)
269*613a2f6bSGordon Ross {
270*613a2f6bSGordon Ross 	return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOLOWER));
271*613a2f6bSGordon Ross }
272