/* xref: /illumos-gate/usr/src/lib/libsmbfs/smb/charsets.c (revision 55fea89d) */
/*
 * Copyright (c) 2001 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * "Portions Copyright (c) 1999 Apple Computer, Inc.  All Rights
 * Reserved.  This file contains Original Code and/or Modifications of
 * Original Code as defined in and that are subject to the Apple Public
 * Source License Version 1.0 (the 'License').  You may not use this file
 * except in compliance with the License.  Please obtain a copy of the
 * License at http://www.apple.com/publicsource and read it before using
 * this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License."
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* CSTYLED */
/*
 *      @(#)charsets.c      *
 *      (c) 2004   Apple Computer, Inc.  All Rights Reserved
 *
 *
 *      charsets.c -- Routines converting between UTF-8, 16-bit
 *			little-endian Unicode, and various Windows
 *			code pages.
 *
 *      MODIFICATION HISTORY:
 *       28-Nov-2004     Guy Harris	New today
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include <iconv.h>
#include <langinfo.h>
#include <strings.h>
#include <libintl.h>

#include <sys/isa_defs.h>
#include <netsmb/smb_lib.h>
#include <netsmb/mchain.h>

#include "charsets.h"

/*
 * On Solaris, we will need to do some rewriting to use our iconv
 * routines for the conversions.  For now, we're effectively
 * stubbing out code, leaving the details of what happens on
 * Darwin in case it's useful as a guide later.
 */

/*
 * Convert one hexadecimal digit character to its numeric value.
 *
 * Returns 0-15 for '0'-'9', 'a'-'f' and 'A'-'F'.  Any other character,
 * including '\0', yields 16, so a caller detects a non-hex digit by
 * testing for a result greater than 15.
 *
 * Explicit range comparisons are used rather than the <ctype.h>
 * classifiers: passing those a negative char (any byte >= 0x80 where
 * plain char is signed) is undefined behaviour, and their answers vary
 * with the locale, neither of which is wanted when decoding URL escapes.
 */
static unsigned
xtoi(char u)
{
	if (u >= '0' && u <= '9')
		return ((unsigned)(u - '0'));
	if (u >= 'a' && u <= 'f')
		return (10 + (unsigned)(u - 'a'));
	if (u >= 'A' && u <= 'F')
		return (10 + (unsigned)(u - 'A'));
	return (16);
}

749c9af259SGordon Ross /*
759c9af259SGordon Ross  * Removes the "%" escape sequences from a URL component.
764bff34e3Sthurlow  * See IETF RFC 2396.
774bff34e3Sthurlow  */
784bff34e3Sthurlow char *
unpercent(char * component)799c9af259SGordon Ross unpercent(char *component)
804bff34e3Sthurlow {
819c9af259SGordon Ross 	char c, *s;
829c9af259SGordon Ross 	unsigned hi, lo;
834bff34e3Sthurlow 
849c9af259SGordon Ross 	if (component == NULL)
859c9af259SGordon Ross 		return (component);
869c9af259SGordon Ross 
879c9af259SGordon Ross 	for (s = component; (c = *s) != 0; s++) {
889c9af259SGordon Ross 		if (c != '%')
899c9af259SGordon Ross 			continue;
909c9af259SGordon Ross 		if ((hi = xtoi(s[1])) > 15 || (lo = xtoi(s[2])) > 15)
919c9af259SGordon Ross 			continue; /* ignore invalid escapes */
929c9af259SGordon Ross 		s[0] = hi*16 + lo;
939c9af259SGordon Ross 		/*
949c9af259SGordon Ross 		 * This was strcpy(s + 1, s + 3);
959c9af259SGordon Ross 		 * But nowadays leftward overlapping copies are
969c9af259SGordon Ross 		 * officially undefined in C.  Ours seems to
979c9af259SGordon Ross 		 * work or not depending upon alignment.
989c9af259SGordon Ross 		 */
999c9af259SGordon Ross 		memmove(s+1, s+3, strlen(s+3) + 1);
1009c9af259SGordon Ross 	}
1019c9af259SGordon Ross 	return (component);
1024bff34e3Sthurlow }
1034bff34e3Sthurlow 
/* BEGIN CSTYLED */
#ifdef NOTPORTED
/*
 * Darwin-only reference code; compiled only when NOTPORTED is defined.
 *
 * Asks CoreFoundation for the installation encoding and region and
 * returns the kCFStringEncodingDOS* value chosen for it by the switch
 * below.  Installation encodings with no explicit case fall back to
 * kCFStringEncodingDOSLatin1.
 *
 * The effective uid is switched to eff_uid around the CoreFoundation
 * call and then to real_uid afterwards (both are defined outside this
 * function).
 */
static CFStringEncoding
get_windows_encoding_equivalent(void)
{
	CFStringEncoding encoding;
	uint32_t index, region;

	/* important! use root ID so you can read the config file! */
	seteuid(eff_uid);
	__CFStringGetInstallationEncodingAndRegion(&index, &region);
	seteuid(real_uid);

	switch (index) {
	case kCFStringEncodingMacRoman:
		if (region) /* anything nonzero is not US */
			encoding = kCFStringEncodingDOSLatin1;
		else /* US region */
			encoding = kCFStringEncodingDOSLatinUS;
		break;

	case kCFStringEncodingMacJapanese:
		encoding = kCFStringEncodingDOSJapanese;
		break;

	case kCFStringEncodingMacChineseTrad:
		encoding = kCFStringEncodingDOSChineseTrad;
		break;

	case kCFStringEncodingMacKorean:
		encoding = kCFStringEncodingDOSKorean;
		break;

	/* Arabic and Farsi share the DOS Arabic code page. */
	case kCFStringEncodingMacArabic:
	case kCFStringEncodingMacFarsi:
		encoding = kCFStringEncodingDOSArabic;
		break;

	case kCFStringEncodingMacHebrew:
		encoding = kCFStringEncodingDOSHebrew;
		break;

	case kCFStringEncodingMacGreek:
		encoding = kCFStringEncodingDOSGreek;
		break;

	/* Cyrillic and Ukrainian share the DOS Cyrillic code page. */
	case kCFStringEncodingMacCyrillic:
	case kCFStringEncodingMacUkrainian:
		encoding = kCFStringEncodingDOSCyrillic;
		break;

	case kCFStringEncodingMacThai:
		encoding = kCFStringEncodingDOSThai;
		break;

	case kCFStringEncodingMacChineseSimp:
		encoding = kCFStringEncodingDOSChineseSimplif;
		break;

	/* Central European, Croatian and Romanian all use DOS Latin 2. */
	case kCFStringEncodingMacCentralEurRoman:
	case kCFStringEncodingMacCroatian:
	case kCFStringEncodingMacRomanian:
		encoding = kCFStringEncodingDOSLatin2;
		break;

	case kCFStringEncodingMacTurkish:
		encoding = kCFStringEncodingDOSTurkish;
		break;

	case kCFStringEncodingMacIcelandic:
		encoding = kCFStringEncodingDOSIcelandic;
		break;

	default:
		encoding = kCFStringEncodingDOSLatin1;
		break;
	}

	return (encoding);
}
#endif /* NOTPORTED */

/*
 * Convert a string from the Windows code page to UTF-8.
 *
 * Returns a newly allocated string that the caller must free(), or NULL
 * if windows_string is NULL or the result cannot be allocated.
 *
 * The real conversion has not been ported: unless NOTPORTED is defined,
 * the input is simply duplicated byte for byte.  The Darwin
 * CoreFoundation implementation is kept under NOTPORTED as a guide.
 *
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 */
char *
convert_wincs_to_utf8(const char *windows_string)
{
#ifdef NOTPORTED
	CFStringRef s;
	CFIndex maxlen;
	char *result;

	if (windows_string == NULL)
		return (NULL);

	s = CFStringCreateWithCString(NULL, windows_string,
	    get_windows_encoding_equivalent());
	if (s == NULL) {
		smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" ", -1,
		    windows_string);

		/* kCFStringEncodingMacRoman should always succeed */
		s = CFStringCreateWithCString(NULL, windows_string,
		    kCFStringEncodingMacRoman);
		if (s == NULL) {
			smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" with kCFStringEncodingMacRoman - skipping",
			    -1, windows_string);
			return (NULL);
		}
	}

	maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s),
	    kCFStringEncodingUTF8) + 1;
	result = malloc(maxlen);
	if (result == NULL) {
		smb_error("Couldn't allocate buffer for UTF-8 string for \"%s\" - skipping", -1,
		    windows_string);
		CFRelease(s);
		return (NULL);
	}
	if (!CFStringGetCString(s, result, maxlen, kCFStringEncodingUTF8)) {
		smb_error("CFStringGetCString for UTF-8 failed on \"%s\" - skipping",
		    -1, windows_string);
		/* the output buffer is ours; don't leak it on failure */
		free(result);
		CFRelease(s);
		return (NULL);
	}
	CFRelease(s);
	return (result);
#else /* NOTPORTED */
	/* strdup(NULL) is undefined; report "no string" instead. */
	if (windows_string == NULL)
		return (NULL);
	return (strdup(windows_string));
#endif /* NOTPORTED */
}

/*
 * Convert a UTF-8 string to the Windows code page.
 *
 * Returns a newly allocated string that the caller must free(), or NULL
 * if utf8_string is NULL or the result cannot be allocated.
 *
 * The real conversion has not been ported: unless NOTPORTED is defined,
 * the input is simply duplicated byte for byte.  The Darwin
 * CoreFoundation implementation is kept under NOTPORTED as a guide.
 *
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 */
char *
convert_utf8_to_wincs(const char *utf8_string)
{
#ifdef NOTPORTED
	CFStringRef s;
	CFIndex maxlen;
	char *result;

	if (utf8_string == NULL)
		return (NULL);

	s = CFStringCreateWithCString(NULL, utf8_string,
	    kCFStringEncodingUTF8);
	if (s == NULL) {
		smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1,
		    utf8_string);
		return (NULL);
	}

	maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s),
	    get_windows_encoding_equivalent()) + 1;
	result = malloc(maxlen);
	if (result == NULL) {
		smb_error("Couldn't allocate buffer for Windows code page string for \"%s\" - skipping", -1,
		    utf8_string);
		CFRelease(s);
		return (NULL);
	}
	if (!CFStringGetCString(s, result, maxlen,
	    get_windows_encoding_equivalent())) {
		smb_error("CFStringGetCString for Windows code page failed on \"%s\" - skipping",
		    -1, utf8_string);
		/* the output buffer is ours; don't leak it on failure */
		free(result);
		CFRelease(s);
		return (NULL);
	}
	CFRelease(s);
	return (result);
#else /* NOTPORTED */
	/* strdup(NULL) is undefined; report "no string" instead. */
	if (utf8_string == NULL)
		return (NULL);
	return (strdup(utf8_string));
#endif /* NOTPORTED */
}
/* END CSTYLED */

/*
 * We replaced these routines for Solaris:
 *	convert_leunicode_to_utf8
 *	convert_unicode_to_utf8
 *	convert_utf8_to_leunicode
 * with new code in: utf_str.c
 */