1 /*
2  * util/support/utf8.c
3  *
4  * Copyright 2008 by the Massachusetts Institute of Technology.
5  * All Rights Reserved.
6  *
7  * Export of this software from the United States of America may
8  *   require a specific license from the United States Government.
9  *   It is the responsibility of any person or organization contemplating
10  *   export to obtain such a license before exporting.
11  *
12  * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
13  * distribute this software and its documentation for any purpose and
14  * without fee is hereby granted, provided that the above copyright
15  * notice appear in all copies and that both that copyright notice and
16  * this permission notice appear in supporting documentation, and that
17  * the name of M.I.T. not be used in advertising or publicity pertaining
18  * to distribution of the software without specific, written prior
19  * permission.  Furthermore if you modify this software you must label
20  * your software as modified software and not distribute it in such a
21  * fashion that it might be confused with the original M.I.T. software.
22  * M.I.T. makes no representations about the suitability of
23  * this software for any purpose.  It is provided "as is" without express
24  * or implied warranty.
25  */
26 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
27  *
28  * Copyright 1998-2008 The OpenLDAP Foundation.
29  * All rights reserved.
30  *
31  * Redistribution and use in source and binary forms, with or without
32  * modification, are permitted only as authorized by the OpenLDAP
33  * Public License.
34  *
35  * A copy of this license is available in the file LICENSE in the
36  * top-level directory of the distribution or, alternatively, at
37  * <http://www.OpenLDAP.org/license.html>.
38  */
39 /* Basic UTF-8 routines
40  *
41  * These routines are "dumb".  Though they understand UTF-8,
42  * they don't grok Unicode.  That is, they can push bits,
43  * but don't have a clue what the bits represent.  That's
44  * good enough for use with the KRB5 Client SDK.
45  *
46  * These routines are not optimized.
47  */
48 
49 #include "k5-platform.h"
50 #include "k5-utf8.h"
51 #include "supp-int.h"
52 
/*
 * Return the length in bytes of the NUL-terminated string p, not
 * counting the terminating NUL.  The bytes are not interpreted or
 * validated as UTF-8.
 */
size_t krb5int_utf8_bytes(const char *p)
{
    const char *cursor = p;

    /* Walk to the terminator; the distance covered is the byte count. */
    while (*cursor != '\0')
        cursor++;

    return (size_t)(cursor - p);
}
67 
/*
 * Count the characters in the NUL-terminated string p by stepping from
 * one character to the next with KRB5_UTF8_INCR.  Sequences are not
 * validated, and no attempt is made to be fast.
 */
size_t krb5int_utf8_chars(const char *p)
{
    size_t count = 0;

    while (*p) {
        count++;
        KRB5_UTF8_INCR(p);
    }

    return count;
}
78 
/*
 * Count the characters in the first `length` bytes of p.  The buffer
 * does not need to be NUL-terminated.
 *
 * A byte with the high bit clear is one character.  Any other byte
 * starts a character that also absorbs up to five following
 * continuation bytes (10xxxxxx).  Sequences are not validated.
 *
 * The scan is bounded by `length`: no byte at or beyond p + length is
 * ever read.  (Stepping with KRB5_UTF8_INCR is not length-aware;
 * krb5int_utf8_next examines up to five bytes after a non-ASCII byte,
 * which can lie outside the counted region.)
 */
size_t krb5int_utf8c_chars(const char *p, size_t length)
{
    const unsigned char *cur = (const unsigned char *) p;
    const unsigned char *end = cur + length;
    size_t chars = 0;

    while (cur < end) {
        unsigned char first = *cur++;
        int examined;

        chars++;

        /* ASCII: a single byte, nothing more to skip */
        if (!(first & 0x80))
            continue;

        /* skip at most five continuation bytes, staying inside the buffer */
        for (examined = 1; examined < 6 && cur < end; examined++) {
            if ((*cur & 0xc0) != 0x80)
                break;
            cur++;
        }
    }

    return chars;
}
90 
/*
 * Return the distance in bytes from p to the start of the following
 * character, as located by KRB5_UTF8_NEXT.
 */
int krb5int_utf8_offset(const char *p)
{
    const char *following = KRB5_UTF8_NEXT(p);

    return following - p;
}
96 
/*
 * Sequence length indicated by a first byte that has its high bit set.
 * The table is indexed by (byte ^ 0x80), so entry 0 corresponds to
 * byte 0x80 and entry 127 to byte 0xFF.  An entry of 0 means the byte
 * cannot begin a sequence.
 */
const char krb5int_utf8_lentab[] = {
    /* 0x80-0xBF: continuation bytes */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0-0xC1: two-octet form of a value that fits in one octet */
    0, 0,

    /* 0xC2-0xDF: two-octet sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0-0xEF: three-octet sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0-0xF7: four-octet sequences */
    4, 4, 4, 4, 4, 4, 4, 4,

    /* 0xF8-0xFB: five-octet sequences */
    5, 5, 5, 5,

    /* 0xFC-0xFD: six-octet sequences */
    6, 6,

    /* 0xFE-0xFF: never a first byte */
    0, 0
};
109 
krb5int_utf8_charlen(const char * p)110 int krb5int_utf8_charlen(const char *p)
111 {
112     if (!(*p & 0x80))
113 	return 1;
114 
115     return krb5int_utf8_lentab[*(const unsigned char *)p ^ 0x80];
116 }
117 
118 /*
119  * Make sure the UTF-8 char used the shortest possible encoding
120  * returns charlen if valid, 0 if not.
121  *
122  * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
123  * The table is slightly modified from that of the RFC.
124  *
125  * UCS-4 range (hex)      UTF-8 sequence (binary)
126  * 0000 0000-0000 007F   0.......
127  * 0000 0080-0000 07FF   110++++. 10......
128  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
129  * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
130  * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
131  * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
132  *
133  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
134  * at least one of the '+' bits must be set, otherwise the character
135  * should have been encoded in fewer octets. Note that in the two-octet
136  * case, only the first octet needs to be validated, and this is done
137  * in the krb5int_utf8_lentab[] above.
138  */
139 
/*
 * Mask of required bits in the second octet, indexed by the low five
 * bits of the first octet (krb5int_utf8_charlen2 looks up *p & 0x1f and
 * ANDs the entry with p[1], and only does so for sequences longer than
 * two octets).  A zero result means the sequence is not in shortest
 * form.
 *
 * Entries of 0x80 impose no extra requirement beyond the high bit.
 * The stricter entries are the positions where all '+' bits of the
 * first octet are zero, so a '+' bit of the second octet must be set.
 *
 * `c` is a temporary shorthand for `const char`; it is undefined again
 * right after the table.
 */
#undef c
#define c const char
c krb5int_utf8_mintab[] = {
    /* 0x00-0x07: index 0x00 is first octet 0xE0, which needs 0x20 */
    (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
    /* 0x08-0x0f: first octets 0xE8-0xEF */
    (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
    /* 0x10-0x17: index 0x10 is first octet 0xF0, which needs one of 0x30 */
    (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
    /* 0x18-0x1f: 0xF8 needs one of 0x38, 0xFC one of 0x3c; last two never pass */
    (c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 };
#undef c
149 
krb5int_utf8_charlen2(const char * p)150 int krb5int_utf8_charlen2(const char *p)
151 {
152     int i = KRB5_UTF8_CHARLEN(p);
153 
154     if (i > 2) {
155 	if (!(krb5int_utf8_mintab[*p & 0x1f] & p[1]))
156 	    i = 0;
157     }
158 
159     return i;
160 }
161 
162 /*
163  * Convert a UTF8 character to a UCS4 character.  Return 0 on success,
164  * -1 on failure.
165  */
krb5int_utf8_to_ucs4(const char * p,krb5_ucs4 * out)166 int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out)
167 {
168     const unsigned char *c = (const unsigned char *) p;
169     krb5_ucs4 ch;
170     int len, i;
171     static unsigned char mask[] = {
172 	0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
173 
174     *out = 0;
175     len = KRB5_UTF8_CHARLEN2(p, len);
176 
177     if (len == 0)
178 	return -1;
179 
180     ch = c[0] & mask[len];
181 
182     for (i = 1; i < len; i++) {
183 	if ((c[i] & 0xc0) != 0x80)
184 	    return -1;
185 
186 	ch <<= 6;
187 	ch |= c[i] & 0x3f;
188     }
189 
190     *out = ch;
191     return 0;
192 }
193 
/*
 * Decode the UTF-8 character at p into a UCS-2 value stored in *out.
 *
 * Returns 0 on success.  Returns -1, leaving *out set to 0, when
 * krb5int_utf8_to_ucs4 fails or the decoded value is above 0xFFFF.
 */
int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out)
{
    krb5_ucs4 wide;

    *out = 0;

    if (krb5int_utf8_to_ucs4(p, &wide) == -1)
        return -1;
    if (wide > 0xFFFF)
        return -1;

    *out = (krb5_ucs2) wide;
    return 0;
}
204 
205 /* conv UCS-2 to UTF-8, not used */
krb5int_ucs4_to_utf8(krb5_ucs4 c,char * buf)206 size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf)
207 {
208     size_t len = 0;
209     unsigned char *p = (unsigned char *) buf;
210 
211     /* not a valid Unicode character */
212     if (c < 0)
213 	return 0;
214 
215     /* Just return length, don't convert */
216     if (buf == NULL) {
217 	if (c < 0x80) return 1;
218 	else if (c < 0x800) return 2;
219 	else if (c < 0x10000) return 3;
220 	else if (c < 0x200000) return 4;
221 	else if (c < 0x4000000) return 5;
222 	else return 6;
223     }
224 
225     if (c < 0x80) {
226 	p[len++] = c;
227     } else if (c < 0x800) {
228 	p[len++] = 0xc0 | ( c >> 6 );
229 	p[len++] = 0x80 | ( c & 0x3f );
230     } else if (c < 0x10000) {
231 	p[len++] = 0xe0 | ( c >> 12 );
232 	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
233 	p[len++] = 0x80 | ( c & 0x3f );
234     } else if (c < 0x200000) {
235 	p[len++] = 0xf0 | ( c >> 18 );
236 	p[len++] = 0x80 | ( (c >> 12) & 0x3f );
237 	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
238 	p[len++] = 0x80 | ( c & 0x3f );
239     } else if (c < 0x4000000) {
240 	p[len++] = 0xf8 | ( c >> 24 );
241 	p[len++] = 0x80 | ( (c >> 18) & 0x3f );
242 	p[len++] = 0x80 | ( (c >> 12) & 0x3f );
243 	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
244 	p[len++] = 0x80 | ( c & 0x3f );
245     } else /* if( c < 0x80000000 ) */ {
246 	p[len++] = 0xfc | ( c >> 30 );
247 	p[len++] = 0x80 | ( (c >> 24) & 0x3f );
248 	p[len++] = 0x80 | ( (c >> 18) & 0x3f );
249 	p[len++] = 0x80 | ( (c >> 12) & 0x3f );
250 	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
251 	p[len++] = 0x80 | ( c & 0x3f );
252     }
253 
254     return len;
255 }
256 
/*
 * Encode the UCS-2 value c as UTF-8 by widening it to UCS-4 and
 * delegating to krb5int_ucs4_to_utf8; buf and the return value have
 * the meaning they have there.
 */
size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf)
{
    krb5_ucs4 wide = (krb5_ucs4) c;

    return krb5int_ucs4_to_utf8(wide, buf);
}
261 
/*
 * Number of bytes needed to encode the UCS value c as UTF-8: 0 for a
 * negative value, otherwise 1 to 6.
 *
 * The argument and the whole expansion are parenthesized so the macro
 * can be used inside larger expressions and with arguments that
 * contain operators.  The argument is evaluated more than once, so it
 * must not have side effects.
 */
#define KRB5_UCS_UTF8LEN(c)	\
    ((c) < 0 ? 0 : ((c) < 0x80 ? 1 : ((c) < 0x800 ? 2 : ((c) < 0x10000 ? 3 : \
    ((c) < 0x200000 ? 4 : ((c) < 0x4000000 ? 5 : 6))))))
265 
/*
 * Return a pointer to the start of the character following the one
 * at p.
 *
 * The length indicated by the first byte is ignored.  For a non-ASCII
 * first byte the function skips continuation bytes (10xxxxxx) instead,
 * examining at most the five bytes after p.  This allows resyncing on
 * invalid input, provided the start of the next character appears
 * within the six bytes considered; otherwise p + 6 is returned.
 */
char *krb5int_utf8_next(const char *p)
{
    const unsigned char *bytes = (const unsigned char *) p;
    int n = 1;

    if (KRB5_UTF8_ISASCII(bytes))
        return (char *) &p[1];

    /* advance past continuation bytes, never further than p + 6 */
    while (n < 6 && (bytes[n] & 0xc0) == 0x80)
        n++;

    return (char *) &p[n];
}
292 
/*
 * Return a pointer to the start of the character preceding p.
 *
 * The function steps backwards over continuation bytes (10xxxxxx),
 * examining at most the five bytes before p, and returns the address
 * of the first byte found that is not a continuation byte.  If all
 * five are continuation bytes it returns p - 6 without examining that
 * byte.  The bytes before p that are examined must be readable.
 */
char *krb5int_utf8_prev(const char *p)
{
    const unsigned char *bytes = (const unsigned char *) p;
    int back;

    for (back = 1; back <= 5; back++) {
        if ((bytes[-back] & 0xc0) != 0x80)
            break;
    }

    /* back is 6 here when every examined byte was a continuation byte */
    return (char *) (p - back);
}
315 
/*
 * Copy one UTF-8 character from src to dst and return the number of
 * bytes copied (1 to 6).  No NUL terminator is written.
 *
 * The length indicated by the first byte is ignored.  After a
 * non-ASCII first byte, following continuation bytes (10xxxxxx) are
 * copied, up to five of them.  This allows resyncing on invalid input,
 * provided the start of the next character appears within the six
 * bytes considered.
 */
int krb5int_utf8_copy(char* dst, const char *src)
{
    const unsigned char *in = (const unsigned char *) src;
    int copied = 1;

    /* the first byte is always copied */
    dst[0] = src[0];

    if (KRB5_UTF8_ISASCII(in))
        return 1;

    while (copied < 6 && (in[copied] & 0xc0) == 0x80) {
        dst[copied] = src[copied];
        copied++;
    }

    return copied;
}
346 
347 #ifndef UTF8_ALPHA_CTYPE
348 /*
349  * UTF-8 ctype routines
350  * Only deals with characters < 0x80 (ie: US-ASCII)
351  */
352 
/* Return the result of KRB5_ASCII applied to the first byte at p. */
int krb5int_utf8_isascii(const char * p)
{
    unsigned first_byte = * (const unsigned char *) p;

    return KRB5_ASCII(first_byte);
}
359 
/*
 * Return KRB5_DIGIT for the first byte at p when that byte is ASCII,
 * and 0 for any non-ASCII byte.
 */
int krb5int_utf8_isdigit(const char * p)
{
    unsigned first_byte = * (const unsigned char *) p;

    if (KRB5_ASCII(first_byte))
        return KRB5_DIGIT( first_byte );

    return 0;
}
369 
/*
 * Return KRB5_HEX for the first byte at p when that byte is ASCII,
 * and 0 for any non-ASCII byte.
 */
int krb5int_utf8_isxdigit(const char * p)
{
    unsigned first_byte = * (const unsigned char *) p;

    if (KRB5_ASCII(first_byte))
        return KRB5_HEX(first_byte);

    return 0;
}
379 
/*
 * Return 1 if the first byte at p is one of the ASCII whitespace
 * characters space, tab, newline, carriage return, vertical tab or
 * form feed; return 0 otherwise, including for any non-ASCII byte.
 */
int krb5int_utf8_isspace(const char * p)
{
    unsigned first_byte = * (const unsigned char *) p;

    if (!KRB5_ASCII(first_byte))
        return 0;

    return first_byte == ' '  || first_byte == '\t' || first_byte == '\n'
        || first_byte == '\r' || first_byte == '\v' || first_byte == '\f';
}
399 
400 /*
401  * These are not needed by the C SDK and are
402  * not "good enough" for general use.
403  */
/*
 * Return KRB5_ALPHA for the first byte at p when that byte is ASCII,
 * and 0 for any non-ASCII byte.
 */
int krb5int_utf8_isalpha(const char * p)
{
    unsigned first_byte = * (const unsigned char *) p;

    if (KRB5_ASCII(first_byte))
        return KRB5_ALPHA(first_byte);

    return 0;
}
413 
/*
 * Return KRB5_ALNUM for the first byte at p when that byte is ASCII,
 * and 0 for any non-ASCII byte.
 */
int krb5int_utf8_isalnum(const char * p)
{
    unsigned first_byte = * (const unsigned char *) p;

    if (KRB5_ASCII(first_byte))
        return KRB5_ALNUM(first_byte);

    return 0;
}
423 
#if 0
/*
 * Compiled out.  Return KRB5_LOWER for the first byte at p when that
 * byte is ASCII, and 0 for any non-ASCII byte.
 */
int krb5int_utf8_islower(const char * p)
{
    unsigned first_byte = * (const unsigned char *) p;

    if (KRB5_ASCII(first_byte))
        return KRB5_LOWER(first_byte);

    return 0;
}

/*
 * Compiled out.  Return KRB5_UPPER for the first byte at p when that
 * byte is ASCII, and 0 for any non-ASCII byte.
 */
int krb5int_utf8_isupper(const char * p)
{
    unsigned first_byte = * (const unsigned char *) p;

    if (KRB5_ASCII(first_byte))
        return KRB5_UPPER(first_byte);

    return 0;
}
#endif
445 #endif
446 
447 
448 /*
449  * UTF-8 string routines
450  */
451 
452 /* like strchr() */
krb5int_utf8_strchr(const char * str,const char * chr)453 char *krb5int_utf8_strchr(const char *str, const char *chr)
454 {
455     krb5_ucs4 chs, ch;
456 
457     if (krb5int_utf8_to_ucs4(chr, &ch) == -1)
458 	return NULL;
459     for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
460 	if (krb5int_utf8_to_ucs4(str, &chs) == 0 && chs == ch)
461 	    return (char *)str;
462     }
463 
464     return NULL;
465 }
466 
467 /* like strcspn() but returns number of bytes, not characters */
krb5int_utf8_strcspn(const char * str,const char * set)468 size_t krb5int_utf8_strcspn(const char *str, const char *set)
469 {
470     const char *cstr, *cset;
471     krb5_ucs4 chstr, chset;
472 
473     for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
474 	for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
475 	    if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
476 		&& krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
477 		return cstr - str;
478 	}
479     }
480 
481     return cstr - str;
482 }
483 
484 /* like strspn() but returns number of bytes, not characters */
krb5int_utf8_strspn(const char * str,const char * set)485 size_t krb5int_utf8_strspn(const char *str, const char *set)
486 {
487     const char *cstr, *cset;
488     krb5_ucs4 chstr, chset;
489 
490     for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
491 	for (cset = set; ; KRB5_UTF8_INCR(cset)) {
492 	    if (*cset == '\0')
493 		return cstr - str;
494 	    if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
495 		&& krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
496 		break;
497 	}
498     }
499 
500     return cstr - str;
501 }
502 
503 /* like strpbrk(), replaces strchr() as well */
krb5int_utf8_strpbrk(const char * str,const char * set)504 char *krb5int_utf8_strpbrk(const char *str, const char *set)
505 {
506     const char *cset;
507     krb5_ucs4 chstr, chset;
508 
509     for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
510 	for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
511 	    if (krb5int_utf8_to_ucs4(str, &chstr) == 0
512 		&& krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
513 		return (char *)str;
514 	}
515     }
516 
517     return NULL;
518 }
519 
/*
 * Like strtok_r() (not strtok()), but separators are UTF-8 characters.
 *
 * str  - string to tokenize on the first call, NULL on later calls to
 *        continue from the position saved in *last.  It is modified:
 *        the first byte of the separator ending each token is
 *        overwritten with NUL.
 * sep  - NUL-terminated set of separator characters.
 * last - caller-provided storage for the scan position.
 *
 * Returns a pointer to the next token, or NULL when last is NULL or no
 * token remains.  When no token remains *last is set to NULL; calling
 * again with str == NULL then keeps returning NULL rather than
 * dereferencing that NULL position.
 */
char *krb5int_utf8_strtok(char *str, const char *sep, char **last)
{
    char *begin;
    char *end;

    if (last == NULL)
        return NULL;

    begin = str ? str : *last;

    /* nothing to resume from: the previous scan already ran out */
    if (begin == NULL)
        return NULL;

    /* skip leading separators */
    begin += krb5int_utf8_strspn(begin, sep);

    if (*begin == '\0') {
        *last = NULL;
        return NULL;
    }

    end = &begin[krb5int_utf8_strcspn(begin, sep)];

    if (*end != '\0') {
        /* find where the next scan resumes before clobbering the separator */
        char *next = KRB5_UTF8_NEXT(end);
        *end = '\0';
        end = next;
    }

    *last = end;

    return begin;
}
550