1 /*
2  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
3  * Use is subject to license terms.
4  */
5 
6 /*
7  * util/support/utf8_conv.c
8  *
9  * Copyright 2008 by the Massachusetts Institute of Technology.
10  * All Rights Reserved.
11  *
12  * Export of this software from the United States of America may
13  *   require a specific license from the United States Government.
14  *   It is the responsibility of any person or organization contemplating
15  *   export to obtain such a license before exporting.
16  *
17  * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
18  * distribute this software and its documentation for any purpose and
19  * without fee is hereby granted, provided that the above copyright
20  * notice appear in all copies and that both that copyright notice and
21  * this permission notice appear in supporting documentation, and that
22  * the name of M.I.T. not be used in advertising or publicity pertaining
23  * to distribution of the software without specific, written prior
24  * permission.  Furthermore if you modify this software you must label
25  * your software as modified software and not distribute it in such a
26  * fashion that it might be confused with the original M.I.T. software.
27  * M.I.T. makes no representations about the suitability of
28  * this software for any purpose.  It is provided "as is" without express
29  * or implied warranty.
30  */
31 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
32  *
33  * Copyright 1998-2008 The OpenLDAP Foundation.
34  * All rights reserved.
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted only as authorized by the OpenLDAP
38  * Public License.
39  *
40  * A copy of this license is available in the file LICENSE in the
41  * top-level directory of the distribution or, alternatively, at
42  * <http://www.OpenLDAP.org/license.html>.
43  */
44 /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
45  *
46  * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
47  * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
48  * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
49  * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
50  * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
51  * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
52  * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
53  * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
54  */
55 
56 /*
57  * UTF-8 Conversion Routines
58  *
59  * These routines convert between Wide Character and UTF-8,
60  * or between MultiByte and UTF-8 encodings.
61  *
62  * Both single character and string versions of the functions are provided.
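 * The static conversion helpers return -1 if the character or string cannot
 * be converted; the exported krb5int_*s_to_*s wrappers in this file return 0
 * on success or an errno value (EINVAL, ENOMEM, ERANGE) on failure.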
64  */
65 
66 #include "k5-platform.h"
67 #include "k5-utf8.h"
68 #include "supp-int.h"
69 #include "errno.h"  /* SUNW17PACresync */
70 
71 static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
72 
73 static ssize_t
74 k5_utf8s_to_ucs2s(krb5_ucs2 *ucs2str,
75 		  const char *utf8str,
76 		  size_t count,
77 		  int little_endian)
78 {
79     size_t ucs2len = 0;
80     size_t utflen, i;
81     krb5_ucs2 ch;
82 
83     /* If input ptr is NULL or empty... */
84     if (utf8str == NULL || *utf8str == '\0') {
85 	*ucs2str = 0;
86 
87 	return 0;
88     }
89 
90     /* Examine next UTF-8 character.  */
91     while (*utf8str && ucs2len < count) {
92 	/* Get UTF-8 sequence length from 1st byte */
93 	utflen = KRB5_UTF8_CHARLEN2(utf8str, utflen);
94 
95 	if (utflen == 0 || utflen > KRB5_MAX_UTF8_LEN)
96 	    return -1;
97 
98 	/* First byte minus length tag */
99 	ch = (krb5_ucs2)(utf8str[0] & mask[utflen]);
100 
101 	for (i = 1; i < utflen; i++) {
102 	    /* Subsequent bytes must start with 10 */
103 	    if ((utf8str[i] & 0xc0) != 0x80)
104 		return -1;
105 
106 	    ch <<= 6;			/* 6 bits of data in each subsequent byte */
107 	    ch |= (krb5_ucs2)(utf8str[i] & 0x3f);
108 	}
109 
110 	if (ucs2str != NULL) {
111 #ifdef K5_BE
112 #ifndef SWAP16
113 #define SWAP16(X)	((((X) << 8) | ((X) >> 8)) & 0xFFFF)
114 #endif
115 	    if (little_endian)
116 		ucs2str[ucs2len] = SWAP16(ch);
117 	    else
118 #endif
119 		ucs2str[ucs2len] = ch;
120 	}
121 
122 	utf8str += utflen;	/* Move to next UTF-8 character */
123 	ucs2len++;		/* Count number of wide chars stored/required */
124     }
125 
126     assert(ucs2len < count);
127 
128     if (ucs2str != NULL) {
129 	/* Add null terminator if there's room in the buffer. */
130 	ucs2str[ucs2len] = 0;
131     }
132 
133     return ucs2len;
134 }
135 
136 int
137 krb5int_utf8s_to_ucs2s(const char *utf8s,
138 		       krb5_ucs2 **ucs2s,
139 		       size_t *ucs2chars)
140 {
141     ssize_t len;
142     size_t chars;
143 
144     chars = krb5int_utf8_chars(utf8s);
145     *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2));
146     if (*ucs2s == NULL) {
147 	return ENOMEM;
148     }
149 
150     len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars + 1, 0);
151     if (len < 0) {
152 	free(*ucs2s);
153 	*ucs2s = NULL;
154 	return EINVAL;
155     }
156 
157     if (ucs2chars != NULL) {
158 	*ucs2chars = chars;
159     }
160 
161     return 0;
162 }
163 
164 int
165 krb5int_utf8cs_to_ucs2s(const char *utf8s,
166 			size_t utf8slen,
167 			krb5_ucs2 **ucs2s,
168 			size_t *ucs2chars)
169 {
170     ssize_t len;
171     size_t chars;
172 
173     chars = krb5int_utf8c_chars(utf8s, utf8slen);
174     *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2));
175     if (*ucs2s == NULL) {
176 	return ENOMEM;
177     }
178 
179     len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars + 1, 0);
180     if (len < 0) {
181 	free(*ucs2s);
182 	*ucs2s = NULL;
183 	return EINVAL;
184     }
185 
186     if (ucs2chars != NULL) {
187 	*ucs2chars = chars;
188     }
189 
190     return 0;
191 }
192 
193 int
194 krb5int_utf8s_to_ucs2les(const char *utf8s,
195                          unsigned char **ucs2les,
196 			 size_t *ucs2leslen)
197 {
198     ssize_t len;
199     size_t chars;
200 
201     chars = krb5int_utf8_chars(utf8s);
202 
203     *ucs2les = (unsigned char *)malloc((chars + 1) * sizeof(krb5_ucs2));
204     if (*ucs2les == NULL) {
205 	return ENOMEM;
206     }
207 
208     len = k5_utf8s_to_ucs2s((krb5_ucs2 *)*ucs2les, utf8s, chars + 1, 1);
209     if (len < 0) {
210 	free(*ucs2les);
211 	*ucs2les = NULL;
212 	return EINVAL;
213     }
214 
215     if (ucs2leslen != NULL) {
216 	*ucs2leslen = chars * sizeof(krb5_ucs2);
217     }
218 
219     return 0;
220 }
221 
222 int
223 krb5int_utf8cs_to_ucs2les(const char *utf8s,
224 			  size_t utf8slen,
225 			  unsigned char **ucs2les,
226 			  size_t *ucs2leslen)
227 {
228     ssize_t len;
229     size_t chars;
230 
231     chars = krb5int_utf8c_chars(utf8s, utf8slen);
232 
233     *ucs2les = (unsigned char *)malloc((chars + 1) * sizeof(krb5_ucs2));
234     if (*ucs2les == NULL) {
235 	return ENOMEM;
236     }
237 
238     len = k5_utf8s_to_ucs2s((krb5_ucs2 *)*ucs2les, utf8s, chars + 1, 1);
239     if (len < 0) {
240 	free(*ucs2les);
241 	*ucs2les = NULL;
242 	return EINVAL;
243     }
244 
245     if (ucs2leslen != NULL) {
246 	*ucs2leslen = chars * sizeof(krb5_ucs2);
247     }
248 
249     return 0;
250 }
251 
252 /*-----------------------------------------------------------------------------
253    Convert a wide char string to a UTF-8 string.
254    No more than 'count' bytes will be written to the output buffer.
255    Return the # of bytes written to the output buffer, excl null terminator.
256 
257    ucs2len is -1 if the UCS-2 string is NUL terminated, otherwise it is the
258    length of the UCS-2 string in characters
259 */
260 static ssize_t
261 k5_ucs2s_to_utf8s(char *utf8str, const krb5_ucs2 *ucs2str,
262 		  size_t count, ssize_t ucs2len, int little_endian)
263 {
264     int len = 0;
265     int n;
266     char *p = utf8str;
267     krb5_ucs2 empty = 0, ch;
268 
269     if (ucs2str == NULL)	/* Treat input ptr NULL as an empty string */
270 	ucs2str = &empty;
271 
272     if (utf8str == NULL)	/* Just compute size of output, excl null */
273     {
274 	while (ucs2len == -1 ? *ucs2str : --ucs2len >= 0) {
275 	    /* Get UTF-8 size of next wide char */
276 	  ch = *ucs2str++;
277 #ifdef K5_BE
278 	    if (little_endian)
279 		ch = SWAP16(ch);
280 #endif
281 
282 	    n = krb5int_ucs2_to_utf8(ch, NULL);
283 	    if (n < 1)
284 		return -1;
285 	    if (len + n < len)
286 		return -1; /* overflow */
287 	    len += n;
288 	}
289 
290 	return len;
291     }
292 
293     /* Do the actual conversion. */
294 
295     n = 1;					/* In case of empty ucs2str */
296     while (ucs2len == -1 ? *ucs2str != 0 : --ucs2len >= 0) {
297       ch = *ucs2str++;
298 #ifdef K5_BE
299 	if (little_endian)
300 	    ch = SWAP16(ch);
301 #endif
302 
303 	n = krb5int_ucs2_to_utf8(ch, p);
304 
305 	if (n < 1)
306 	    break;
307 
308 	p += n;
309 	count -= n;			/* Space left in output buffer */
310     }
311 
312     /* If not enough room for last character, pad remainder with null
313        so that return value = original count, indicating buffer full. */
314     if (n == 0) {
315 	while (count--)
316 	    *p++ = 0;
317     }
318     /* Add a null terminator if there's room. */
319     else if (count)
320 	*p = 0;
321 
322     if (n == -1)			/* Conversion encountered invalid wide char. */
323 	return -1;
324 
325     /* Return the number of bytes written to output buffer, excl null. */
326     return (p - utf8str);
327 }
328 
329 int
330 krb5int_ucs2s_to_utf8s(const krb5_ucs2 *ucs2s,
331 		       char **utf8s,
332 		       size_t *utf8slen)
333 {
334     ssize_t len;
335 
336     len = k5_ucs2s_to_utf8s(NULL, ucs2s, 0, -1, 0);
337     if (len < 0) {
338 	return EINVAL;
339     }
340 
341     *utf8s = (char *)malloc((size_t)len + 1);
342     if (*utf8s == NULL) {
343 	return ENOMEM;
344     }
345 
346     len = k5_ucs2s_to_utf8s(*utf8s, ucs2s, (size_t)len + 1, -1, 0);
347     if (len < 0) {
348 	free(*utf8s);
349 	*utf8s = NULL;
350 	return EINVAL;
351     }
352 
353     if (utf8slen != NULL) {
354 	*utf8slen = len;
355     }
356 
357     return 0;
358 }
359 
360 int
361 krb5int_ucs2les_to_utf8s(const unsigned char *ucs2les,
362 			 char **utf8s,
363 			 size_t *utf8slen)
364 {
365     ssize_t len;
366 
367     len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0, -1, 1);
368     if (len < 0)
369 	return EINVAL;
370 
371     *utf8s = (char *)malloc((size_t)len + 1);
372     if (*utf8s == NULL) {
373 	return ENOMEM;
374     }
375 
376     len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les, (size_t)len + 1, -1, 1);
377     if (len < 0) {
378 	free(*utf8s);
379 	*utf8s = NULL;
380 	return EINVAL;
381     }
382 
383     if (utf8slen != NULL) {
384 	*utf8slen = len;
385     }
386 
387     return 0;
388 }
389 
390 int
391 krb5int_ucs2cs_to_utf8s(const krb5_ucs2 *ucs2s,
392                         size_t ucs2slen,
393                         char **utf8s,
394                         size_t *utf8slen)
395 {
396     ssize_t len;
397 
398     if (ucs2slen > SSIZE_MAX)
399 	return ERANGE;
400 
401     len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2s, 0,
402 			    (ssize_t)ucs2slen, 0);
403     if (len < 0)
404 	return EINVAL;
405 
406     *utf8s = (char *)malloc((size_t)len + 1);
407     if (*utf8s == NULL) {
408 	return ENOMEM;
409     }
410 
411     len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2s,
412 			    (size_t)len + 1, (ssize_t)ucs2slen, 0);
413     if (len < 0) {
414 	free(*utf8s);
415 	*utf8s = NULL;
416 	return EINVAL;
417     }
418 
419     if (utf8slen != NULL) {
420 	*utf8slen = len;
421     }
422 
423     return 0;
424 }
425 
426 int
427 krb5int_ucs2lecs_to_utf8s(const unsigned char *ucs2les,
428                           size_t ucs2leslen,
429                           char **utf8s,
430                           size_t *utf8slen)
431 {
432     ssize_t len;
433 
434     if (ucs2leslen > SSIZE_MAX)
435 	return ERANGE;
436 
437     len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0,
438 			    (ssize_t)ucs2leslen, 1);
439     if (len < 0)
440 	return EINVAL;
441 
442     *utf8s = (char *)malloc((size_t)len + 1);
443     if (*utf8s == NULL) {
444 	return ENOMEM;
445     }
446 
447     len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les,
448 			    (size_t)len + 1, (ssize_t)ucs2leslen, 1);
449     if (len < 0) {
450 	free(*utf8s);
451 	*utf8s = NULL;
452 	return EINVAL;
453     }
454 
455     if (utf8slen != NULL) {
456 	*utf8slen = len;
457     }
458 
459     return 0;
460 }
461 
462