1ba7b222eSGlenn Barry /*
2ba7b222eSGlenn Barry  * util/support/utf8.c
3ba7b222eSGlenn Barry  *
4ba7b222eSGlenn Barry  * Copyright 2008 by the Massachusetts Institute of Technology.
5ba7b222eSGlenn Barry  * All Rights Reserved.
6ba7b222eSGlenn Barry  *
7ba7b222eSGlenn Barry  * Export of this software from the United States of America may
8ba7b222eSGlenn Barry  *   require a specific license from the United States Government.
9ba7b222eSGlenn Barry  *   It is the responsibility of any person or organization contemplating
10ba7b222eSGlenn Barry  *   export to obtain such a license before exporting.
11*55fea89dSDan Cross  *
12ba7b222eSGlenn Barry  * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
13ba7b222eSGlenn Barry  * distribute this software and its documentation for any purpose and
14ba7b222eSGlenn Barry  * without fee is hereby granted, provided that the above copyright
15ba7b222eSGlenn Barry  * notice appear in all copies and that both that copyright notice and
16ba7b222eSGlenn Barry  * this permission notice appear in supporting documentation, and that
17ba7b222eSGlenn Barry  * the name of M.I.T. not be used in advertising or publicity pertaining
18ba7b222eSGlenn Barry  * to distribution of the software without specific, written prior
19ba7b222eSGlenn Barry  * permission.  Furthermore if you modify this software you must label
20ba7b222eSGlenn Barry  * your software as modified software and not distribute it in such a
21ba7b222eSGlenn Barry  * fashion that it might be confused with the original M.I.T. software.
22ba7b222eSGlenn Barry  * M.I.T. makes no representations about the suitability of
23ba7b222eSGlenn Barry  * this software for any purpose.  It is provided "as is" without express
24ba7b222eSGlenn Barry  * or implied warranty.
25ba7b222eSGlenn Barry  */
26ba7b222eSGlenn Barry /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
27ba7b222eSGlenn Barry  *
28ba7b222eSGlenn Barry  * Copyright 1998-2008 The OpenLDAP Foundation.
29ba7b222eSGlenn Barry  * All rights reserved.
30ba7b222eSGlenn Barry  *
31ba7b222eSGlenn Barry  * Redistribution and use in source and binary forms, with or without
32ba7b222eSGlenn Barry  * modification, are permitted only as authorized by the OpenLDAP
33ba7b222eSGlenn Barry  * Public License.
34ba7b222eSGlenn Barry  *
35ba7b222eSGlenn Barry  * A copy of this license is available in the file LICENSE in the
36ba7b222eSGlenn Barry  * top-level directory of the distribution or, alternatively, at
37ba7b222eSGlenn Barry  * <http://www.OpenLDAP.org/license.html>.
38ba7b222eSGlenn Barry  */
39ba7b222eSGlenn Barry /* Basic UTF-8 routines
40ba7b222eSGlenn Barry  *
41ba7b222eSGlenn Barry  * These routines are "dumb".  Though they understand UTF-8,
42ba7b222eSGlenn Barry  * they don't grok Unicode.  That is, they can push bits,
43ba7b222eSGlenn Barry  * but don't have a clue what the bits represent.  That's
44ba7b222eSGlenn Barry  * good enough for use with the KRB5 Client SDK.
45ba7b222eSGlenn Barry  *
46ba7b222eSGlenn Barry  * These routines are not optimized.
47ba7b222eSGlenn Barry  */
48ba7b222eSGlenn Barry 
49ba7b222eSGlenn Barry #include "k5-platform.h"
50ba7b222eSGlenn Barry #include "k5-utf8.h"
51ba7b222eSGlenn Barry #include "supp-int.h"
52ba7b222eSGlenn Barry 
/*
 * Return the number of bytes occupied by the NUL-terminated UTF-8
 * string p, NOT INCLUDING the terminating NUL.  Nothing is decoded;
 * this is a plain byte count.
 */
size_t krb5int_utf8_bytes(const char *p)
{
    const char *cursor = p;

    /* Walk forward until the terminator is reached. */
    while (*cursor != '\0')
        cursor++;

    return (size_t)(cursor - p);
}
67ba7b222eSGlenn Barry 
/*
 * Count the characters in the NUL-terminated UTF-8 string p.
 *
 * One character is counted per KRB5_UTF8_INCR step.  The sequences
 * are not validated, and nothing here has been optimized.
 */
size_t krb5int_utf8_chars(const char *p)
{
    size_t count = 0;

    while (*p) {
        count++;
        KRB5_UTF8_INCR(p);
    }

    return count;
}
78ba7b222eSGlenn Barry 
/*
 * Count the characters in the first `length` bytes of the UTF-8
 * buffer p.  The buffer does not have to be NUL-terminated.
 *
 * Characters are delimited with the same rule krb5int_utf8_next()
 * uses: an ASCII octet is one character; any other octet starts a
 * character that also swallows up to five following continuation
 * octets (10xxxxxx).  Sequences are not validated.
 *
 * The scan never reads at or beyond p + length.  A sequence that is
 * cut off by the end of the buffer still counts as one character.
 */
size_t krb5int_utf8c_chars(const char *p, size_t length)
{
    const unsigned char *u = (const unsigned char *) p;
    size_t chars = 0;
    size_t pos = 0;

    while (pos < length) {
        size_t step = 1;

        if (u[pos] & 0x80) {
            /*
             * Non-ASCII: absorb trailing continuation octets, at most
             * five of them, stopping at the end of the buffer.
             */
            while (step < 6 && step < length - pos
                   && (u[pos + step] & 0xc0) == 0x80)
                step++;
        }

        pos += step;
        chars++;
    }

    return chars;
}
90ba7b222eSGlenn Barry 
/*
 * Return the distance, in bytes, from p to the start of the next
 * character as located by KRB5_UTF8_NEXT.
 */
int krb5int_utf8_offset(const char *p)
{
    const char *following = KRB5_UTF8_NEXT(p);

    return following - p;
}
96ba7b222eSGlenn Barry 
/*
 * Sequence length indicated by a non-ASCII first octet.
 *
 * The table is indexed by (octet ^ 0x80), so entry 0 corresponds to
 * octet 0x80 and entry 127 to octet 0xFF.  A zero entry marks an octet
 * that cannot begin a character.
 */
const char krb5int_utf8_lentab[] = {
    /* 0x80 - 0x8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 - 0xAF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 - 0xBF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xCF (0xC0 and 0xC1 are rejected) */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 - 0xDF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF (0xFE and 0xFF are rejected) */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
109ba7b222eSGlenn Barry 
krb5int_utf8_charlen(const char * p)110ba7b222eSGlenn Barry int krb5int_utf8_charlen(const char *p)
111ba7b222eSGlenn Barry {
112ba7b222eSGlenn Barry     if (!(*p & 0x80))
113ba7b222eSGlenn Barry 	return 1;
114ba7b222eSGlenn Barry 
115ba7b222eSGlenn Barry     return krb5int_utf8_lentab[*(const unsigned char *)p ^ 0x80];
116ba7b222eSGlenn Barry }
117ba7b222eSGlenn Barry 
118ba7b222eSGlenn Barry /*
119ba7b222eSGlenn Barry  * Make sure the UTF-8 char used the shortest possible encoding
120*55fea89dSDan Cross  * returns charlen if valid, 0 if not.
121ba7b222eSGlenn Barry  *
122ba7b222eSGlenn Barry  * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
123ba7b222eSGlenn Barry  * The table is slightly modified from that of the RFC.
124ba7b222eSGlenn Barry  *
125ba7b222eSGlenn Barry  * UCS-4 range (hex)      UTF-8 sequence (binary)
126ba7b222eSGlenn Barry  * 0000 0000-0000 007F   0.......
127ba7b222eSGlenn Barry  * 0000 0080-0000 07FF   110++++. 10......
128ba7b222eSGlenn Barry  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
129ba7b222eSGlenn Barry  * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
130ba7b222eSGlenn Barry  * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
131ba7b222eSGlenn Barry  * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
132ba7b222eSGlenn Barry  *
133ba7b222eSGlenn Barry  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
134ba7b222eSGlenn Barry  * at least one of the '+' bits must be set, otherwise the character
135ba7b222eSGlenn Barry  * should have been encoded in fewer octets. Note that in the two-octet
136ba7b222eSGlenn Barry  * case, only the first octet needs to be validated, and this is done
137ba7b222eSGlenn Barry  * in the krb5int_utf8_lentab[] above.
138ba7b222eSGlenn Barry  */
139ba7b222eSGlenn Barry 
/*
 * Mask of required bits in the second octet, indexed by the low five
 * bits of the first octet (first & 0x1f).  A sequence longer than two
 * octets uses the shortest encoding only if its second octet has at
 * least one of the mask bits set.
 */
#undef c
#define c(v) ((const char)(v))
const char krb5int_utf8_mintab[] = {
    /* index 0 */
    c(0x20),
    /* index 1 - 15 */
    c(0x80), c(0x80), c(0x80), c(0x80), c(0x80), c(0x80), c(0x80), c(0x80),
    c(0x80), c(0x80), c(0x80), c(0x80), c(0x80), c(0x80), c(0x80),
    /* index 16 */
    c(0x30),
    /* index 17 - 23 */
    c(0x80), c(0x80), c(0x80), c(0x80), c(0x80), c(0x80), c(0x80),
    /* index 24 */
    c(0x38),
    /* index 25 - 27 */
    c(0x80), c(0x80), c(0x80),
    /* index 28 */
    c(0x3c),
    /* index 29 */
    c(0x80),
    /* index 30 - 31 */
    c(0x00), c(0x00)
};
#undef c
149ba7b222eSGlenn Barry 
krb5int_utf8_charlen2(const char * p)150ba7b222eSGlenn Barry int krb5int_utf8_charlen2(const char *p)
151ba7b222eSGlenn Barry {
152ba7b222eSGlenn Barry     int i = KRB5_UTF8_CHARLEN(p);
153ba7b222eSGlenn Barry 
154ba7b222eSGlenn Barry     if (i > 2) {
155ba7b222eSGlenn Barry 	if (!(krb5int_utf8_mintab[*p & 0x1f] & p[1]))
156ba7b222eSGlenn Barry 	    i = 0;
157ba7b222eSGlenn Barry     }
158ba7b222eSGlenn Barry 
159ba7b222eSGlenn Barry     return i;
160ba7b222eSGlenn Barry }
161ba7b222eSGlenn Barry 
162ba7b222eSGlenn Barry /*
163ba7b222eSGlenn Barry  * Convert a UTF8 character to a UCS4 character.  Return 0 on success,
164ba7b222eSGlenn Barry  * -1 on failure.
165ba7b222eSGlenn Barry  */
krb5int_utf8_to_ucs4(const char * p,krb5_ucs4 * out)166ba7b222eSGlenn Barry int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out)
167ba7b222eSGlenn Barry {
168ba7b222eSGlenn Barry     const unsigned char *c = (const unsigned char *) p;
169ba7b222eSGlenn Barry     krb5_ucs4 ch;
170ba7b222eSGlenn Barry     int len, i;
171ba7b222eSGlenn Barry     static unsigned char mask[] = {
172ba7b222eSGlenn Barry 	0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
173ba7b222eSGlenn Barry 
174ba7b222eSGlenn Barry     *out = 0;
175ba7b222eSGlenn Barry     len = KRB5_UTF8_CHARLEN2(p, len);
176ba7b222eSGlenn Barry 
177ba7b222eSGlenn Barry     if (len == 0)
178ba7b222eSGlenn Barry 	return -1;
179ba7b222eSGlenn Barry 
180ba7b222eSGlenn Barry     ch = c[0] & mask[len];
181ba7b222eSGlenn Barry 
182ba7b222eSGlenn Barry     for (i = 1; i < len; i++) {
183ba7b222eSGlenn Barry 	if ((c[i] & 0xc0) != 0x80)
184ba7b222eSGlenn Barry 	    return -1;
185ba7b222eSGlenn Barry 
186ba7b222eSGlenn Barry 	ch <<= 6;
187ba7b222eSGlenn Barry 	ch |= c[i] & 0x3f;
188ba7b222eSGlenn Barry     }
189ba7b222eSGlenn Barry 
190ba7b222eSGlenn Barry     *out = ch;
191ba7b222eSGlenn Barry     return 0;
192ba7b222eSGlenn Barry }
193ba7b222eSGlenn Barry 
/*
 * Decode the UTF-8 character at p into a UCS-2 value stored in *out.
 *
 * Returns 0 on success.  Returns -1, with *out set to 0, if the
 * character cannot be decoded or its value is greater than 0xFFFF.
 */
int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out)
{
    krb5_ucs4 wide;

    *out = 0;

    if (krb5int_utf8_to_ucs4(p, &wide) == -1)
	return -1;
    if (wide > 0xFFFF)
	return -1;

    *out = (krb5_ucs2) wide;
    return 0;
}
204ba7b222eSGlenn Barry 
205ba7b222eSGlenn Barry /* conv UCS-2 to UTF-8, not used */
krb5int_ucs4_to_utf8(krb5_ucs4 c,char * buf)206ba7b222eSGlenn Barry size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf)
207ba7b222eSGlenn Barry {
208ba7b222eSGlenn Barry     size_t len = 0;
209ba7b222eSGlenn Barry     unsigned char *p = (unsigned char *) buf;
210ba7b222eSGlenn Barry 
211ba7b222eSGlenn Barry     /* not a valid Unicode character */
212ba7b222eSGlenn Barry     if (c < 0)
213ba7b222eSGlenn Barry 	return 0;
214ba7b222eSGlenn Barry 
215ba7b222eSGlenn Barry     /* Just return length, don't convert */
216ba7b222eSGlenn Barry     if (buf == NULL) {
217ba7b222eSGlenn Barry 	if (c < 0x80) return 1;
218ba7b222eSGlenn Barry 	else if (c < 0x800) return 2;
219ba7b222eSGlenn Barry 	else if (c < 0x10000) return 3;
220ba7b222eSGlenn Barry 	else if (c < 0x200000) return 4;
221ba7b222eSGlenn Barry 	else if (c < 0x4000000) return 5;
222ba7b222eSGlenn Barry 	else return 6;
223ba7b222eSGlenn Barry     }
224ba7b222eSGlenn Barry 
225ba7b222eSGlenn Barry     if (c < 0x80) {
226ba7b222eSGlenn Barry 	p[len++] = c;
227ba7b222eSGlenn Barry     } else if (c < 0x800) {
228ba7b222eSGlenn Barry 	p[len++] = 0xc0 | ( c >> 6 );
229ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( c & 0x3f );
230ba7b222eSGlenn Barry     } else if (c < 0x10000) {
231ba7b222eSGlenn Barry 	p[len++] = 0xe0 | ( c >> 12 );
232ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
233ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( c & 0x3f );
234ba7b222eSGlenn Barry     } else if (c < 0x200000) {
235ba7b222eSGlenn Barry 	p[len++] = 0xf0 | ( c >> 18 );
236ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 12) & 0x3f );
237ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
238ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( c & 0x3f );
239ba7b222eSGlenn Barry     } else if (c < 0x4000000) {
240ba7b222eSGlenn Barry 	p[len++] = 0xf8 | ( c >> 24 );
241ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 18) & 0x3f );
242ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 12) & 0x3f );
243ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
244ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( c & 0x3f );
245ba7b222eSGlenn Barry     } else /* if( c < 0x80000000 ) */ {
246ba7b222eSGlenn Barry 	p[len++] = 0xfc | ( c >> 30 );
247ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 24) & 0x3f );
248ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 18) & 0x3f );
249ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 12) & 0x3f );
250ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
251ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( c & 0x3f );
252ba7b222eSGlenn Barry     }
253ba7b222eSGlenn Barry 
254ba7b222eSGlenn Barry     return len;
255ba7b222eSGlenn Barry }
256ba7b222eSGlenn Barry 
/*
 * Encode the UCS-2 value c as UTF-8 by widening it to UCS-4 and
 * handing it to krb5int_ucs4_to_utf8(); buf and the return value have
 * the same meaning as there.
 */
size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf)
{
    const krb5_ucs4 wide = (krb5_ucs4) c;

    return krb5int_ucs4_to_utf8(wide, buf);
}
261ba7b222eSGlenn Barry 
/*
 * Number of octets needed to encode the UCS value c as UTF-8:
 * 0 for a negative value, otherwise 1..6.
 *
 * The argument and the whole expansion are parenthesized so the macro
 * can be used inside larger expressions and with compound arguments.
 * The argument is evaluated several times; do not pass an expression
 * with side effects.
 */
#define KRB5_UCS_UTF8LEN(c)	\
    ((c) < 0 ? 0 : ((c) < 0x80 ? 1 : ((c) < 0x800 ? 2 : \
    ((c) < 0x10000 ? 3 : ((c) < 0x200000 ? 4 : \
    ((c) < 0x4000000 ? 5 : 6))))))
265ba7b222eSGlenn Barry 
/*
 * Advance to the next UTF-8 character.
 *
 * The length implied by the first octet is ignored; instead the
 * continuation markers (10xxxxxx) are used to find where the next
 * character starts.  This allows resynchronizing after invalid input,
 * provided the start of the next character appears within the six
 * octets examined.  If it does not, p + 6 is returned.
 */
char *krb5int_utf8_next(const char *p)
{
    const unsigned char *octets = (const unsigned char *) p;
    int skip = 1;

    if (KRB5_UTF8_ISASCII(octets))
	return (char *) (p + 1);

    /* Step over at most five continuation octets. */
    while (skip < 6 && (octets[skip] & 0xc0) == 0x80)
	skip++;

    return (char *) (p + skip);
}
292ba7b222eSGlenn Barry 
/*
 * Step back to the previous UTF-8 character.
 *
 * The octets before p are examined one at a time, at most five of
 * them, and the first one that is not a continuation octet (10xxxxxx)
 * is taken as the start of the previous character.  If all five are
 * continuation octets, p - 6 is returned without being examined.
 *
 * The caller must make sure the octets before p are readable.
 */
char *krb5int_utf8_prev(const char *p)
{
    const unsigned char *octets = (const unsigned char *) p;
    int back = -1;

    while (back > -6 && (octets[back] & 0xc0) == 0x80)
	back--;

    return (char *) (p + back);
}
315ba7b222eSGlenn Barry 
/*
 * Copy one UTF-8 character from src to dst and return the number of
 * octets copied.  No terminating NUL is written.
 *
 * The length implied by the first octet is ignored; the character is
 * taken to be the first octet plus the continuation octets (10xxxxxx)
 * that follow it, six octets at most.
 */
int krb5int_utf8_copy(char* dst, const char *src)
{
    const unsigned char *octets = (const unsigned char *) src;
    int n = 1;

    dst[0] = src[0];

    if (KRB5_UTF8_ISASCII(octets))
	return 1;

    /* Copy continuation octets until one is missing or six are done. */
    while (n < 6 && (octets[n] & 0xc0) == 0x80) {
	dst[n] = src[n];
	n++;
    }

    return n;
}
346ba7b222eSGlenn Barry 
347ba7b222eSGlenn Barry #ifndef UTF8_ALPHA_CTYPE
348ba7b222eSGlenn Barry /*
349ba7b222eSGlenn Barry  * UTF-8 ctype routines
350ba7b222eSGlenn Barry  * Only deals with characters < 0x80 (ie: US-ASCII)
351ba7b222eSGlenn Barry  */
352ba7b222eSGlenn Barry 
/* Report whether the octet at p passes KRB5_ASCII. */
int krb5int_utf8_isascii(const char * p)
{
    const unsigned ch = * (const unsigned char *) p;
    const int verdict = KRB5_ASCII(ch);

    return verdict;
}
359ba7b222eSGlenn Barry 
/*
 * Report whether the octet at p is an ASCII digit (KRB5_DIGIT).
 * Anything that fails KRB5_ASCII yields 0.
 */
int krb5int_utf8_isdigit(const char * p)
{
    const unsigned ch = * (const unsigned char *) p;

    if (KRB5_ASCII(ch))
	return KRB5_DIGIT( ch );

    return 0;
}
369ba7b222eSGlenn Barry 
/*
 * Report whether the octet at p is an ASCII hexadecimal digit
 * (KRB5_HEX).  Anything that fails KRB5_ASCII yields 0.
 */
int krb5int_utf8_isxdigit(const char * p)
{
    const unsigned ch = * (const unsigned char *) p;

    if (KRB5_ASCII(ch))
	return KRB5_HEX(ch);

    return 0;
}
379ba7b222eSGlenn Barry 
/*
 * Report whether the octet at p is ASCII white space: space, tab,
 * newline, carriage return, vertical tab or form feed.  Returns 1 if
 * so and 0 otherwise, including for anything that fails KRB5_ASCII.
 */
int krb5int_utf8_isspace(const char * p)
{
    const unsigned ch = * (const unsigned char *) p;

    if (!KRB5_ASCII(ch))
	return 0;

    return ch == ' ' || ch == '\t' || ch == '\n'
	|| ch == '\r' || ch == '\v' || ch == '\f';
}
399ba7b222eSGlenn Barry 
400ba7b222eSGlenn Barry /*
401ba7b222eSGlenn Barry  * These are not needed by the C SDK and are
402ba7b222eSGlenn Barry  * not "good enough" for general use.
403ba7b222eSGlenn Barry  */
/*
 * Report whether the octet at p is an ASCII letter (KRB5_ALPHA).
 * Anything that fails KRB5_ASCII yields 0.
 */
int krb5int_utf8_isalpha(const char * p)
{
    const unsigned ch = * (const unsigned char *) p;

    if (KRB5_ASCII(ch))
	return KRB5_ALPHA(ch);

    return 0;
}
413ba7b222eSGlenn Barry 
/*
 * Report whether the octet at p is an ASCII letter or digit
 * (KRB5_ALNUM).  Anything that fails KRB5_ASCII yields 0.
 */
int krb5int_utf8_isalnum(const char * p)
{
    const unsigned ch = * (const unsigned char *) p;

    if (KRB5_ASCII(ch))
	return KRB5_ALNUM(ch);

    return 0;
}
423ba7b222eSGlenn Barry 
424ba7b222eSGlenn Barry #if 0
/*
 * Report whether the octet at p is an ASCII lower-case letter
 * (KRB5_LOWER).  Anything that fails KRB5_ASCII yields 0.
 * Currently compiled out by the surrounding #if 0.
 */
int krb5int_utf8_islower(const char * p)
{
    const unsigned ch = * (const unsigned char *) p;

    if (KRB5_ASCII(ch))
	return KRB5_LOWER(ch);

    return 0;
}
434ba7b222eSGlenn Barry 
/*
 * Report whether the octet at p is an ASCII upper-case letter
 * (KRB5_UPPER).  Anything that fails KRB5_ASCII yields 0.
 * Currently compiled out by the surrounding #if 0.
 */
int krb5int_utf8_isupper(const char * p)
{
    const unsigned ch = * (const unsigned char *) p;

    if (KRB5_ASCII(ch))
	return KRB5_UPPER(ch);

    return 0;
}
444ba7b222eSGlenn Barry #endif
445ba7b222eSGlenn Barry #endif
446ba7b222eSGlenn Barry 
447ba7b222eSGlenn Barry 
448ba7b222eSGlenn Barry /*
449ba7b222eSGlenn Barry  * UTF-8 string routines
450ba7b222eSGlenn Barry  */
451ba7b222eSGlenn Barry 
452ba7b222eSGlenn Barry /* like strchr() */
krb5int_utf8_strchr(const char * str,const char * chr)453ba7b222eSGlenn Barry char *krb5int_utf8_strchr(const char *str, const char *chr)
454ba7b222eSGlenn Barry {
455ba7b222eSGlenn Barry     krb5_ucs4 chs, ch;
456ba7b222eSGlenn Barry 
457ba7b222eSGlenn Barry     if (krb5int_utf8_to_ucs4(chr, &ch) == -1)
458ba7b222eSGlenn Barry 	return NULL;
459ba7b222eSGlenn Barry     for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
460ba7b222eSGlenn Barry 	if (krb5int_utf8_to_ucs4(str, &chs) == 0 && chs == ch)
461ba7b222eSGlenn Barry 	    return (char *)str;
462ba7b222eSGlenn Barry     }
463ba7b222eSGlenn Barry 
464ba7b222eSGlenn Barry     return NULL;
465ba7b222eSGlenn Barry }
466ba7b222eSGlenn Barry 
467ba7b222eSGlenn Barry /* like strcspn() but returns number of bytes, not characters */
krb5int_utf8_strcspn(const char * str,const char * set)468ba7b222eSGlenn Barry size_t krb5int_utf8_strcspn(const char *str, const char *set)
469ba7b222eSGlenn Barry {
470ba7b222eSGlenn Barry     const char *cstr, *cset;
471ba7b222eSGlenn Barry     krb5_ucs4 chstr, chset;
472ba7b222eSGlenn Barry 
473ba7b222eSGlenn Barry     for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
474ba7b222eSGlenn Barry 	for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
475ba7b222eSGlenn Barry 	    if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
476ba7b222eSGlenn Barry 		&& krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
477ba7b222eSGlenn Barry 		return cstr - str;
478ba7b222eSGlenn Barry 	}
479ba7b222eSGlenn Barry     }
480ba7b222eSGlenn Barry 
481ba7b222eSGlenn Barry     return cstr - str;
482ba7b222eSGlenn Barry }
483ba7b222eSGlenn Barry 
484ba7b222eSGlenn Barry /* like strspn() but returns number of bytes, not characters */
krb5int_utf8_strspn(const char * str,const char * set)485ba7b222eSGlenn Barry size_t krb5int_utf8_strspn(const char *str, const char *set)
486ba7b222eSGlenn Barry {
487ba7b222eSGlenn Barry     const char *cstr, *cset;
488ba7b222eSGlenn Barry     krb5_ucs4 chstr, chset;
489ba7b222eSGlenn Barry 
490ba7b222eSGlenn Barry     for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
491ba7b222eSGlenn Barry 	for (cset = set; ; KRB5_UTF8_INCR(cset)) {
492ba7b222eSGlenn Barry 	    if (*cset == '\0')
493ba7b222eSGlenn Barry 		return cstr - str;
494ba7b222eSGlenn Barry 	    if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
495ba7b222eSGlenn Barry 		&& krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
496ba7b222eSGlenn Barry 		break;
497ba7b222eSGlenn Barry 	}
498ba7b222eSGlenn Barry     }
499ba7b222eSGlenn Barry 
500ba7b222eSGlenn Barry     return cstr - str;
501ba7b222eSGlenn Barry }
502ba7b222eSGlenn Barry 
503ba7b222eSGlenn Barry /* like strpbrk(), replaces strchr() as well */
krb5int_utf8_strpbrk(const char * str,const char * set)504ba7b222eSGlenn Barry char *krb5int_utf8_strpbrk(const char *str, const char *set)
505ba7b222eSGlenn Barry {
506ba7b222eSGlenn Barry     const char *cset;
507ba7b222eSGlenn Barry     krb5_ucs4 chstr, chset;
508ba7b222eSGlenn Barry 
509ba7b222eSGlenn Barry     for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
510ba7b222eSGlenn Barry 	for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
511ba7b222eSGlenn Barry 	    if (krb5int_utf8_to_ucs4(str, &chstr) == 0
512ba7b222eSGlenn Barry 		&& krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
513ba7b222eSGlenn Barry 		return (char *)str;
514ba7b222eSGlenn Barry 	}
515ba7b222eSGlenn Barry     }
516ba7b222eSGlenn Barry 
517ba7b222eSGlenn Barry     return NULL;
518ba7b222eSGlenn Barry }
519ba7b222eSGlenn Barry 
/*
 * Like strtok_r() (not strtok()), with separators given as UTF-8
 * characters in sep.
 *
 * Pass the string to split as str on the first call and NULL on later
 * calls; *last carries the scan position between calls.  The string
 * is modified: the first octet of the separator that ends a token is
 * overwritten with NUL.
 *
 * Returns the next token, or NULL when there are no more tokens, in
 * which case *last is set to NULL.  Also returns NULL if last is NULL,
 * or if str is NULL and *last is NULL (for instance when called again
 * after the tokens have run out).
 */
char *krb5int_utf8_strtok(char *str, const char *sep, char **last)
{
    char *begin;
    char *end;

    if (last == NULL)
	return NULL;

    begin = str ? str : *last;

    /* Nothing to resume from: do not hand NULL to the span scan. */
    if (begin == NULL)
	return NULL;

    /* Skip leading separators. */
    begin += krb5int_utf8_strspn(begin, sep);

    if (*begin == '\0') {
	*last = NULL;
	return NULL;
    }

    end = &begin[krb5int_utf8_strcspn(begin, sep)];

    if (*end != '\0') {
	/* Find the resume point before the separator is overwritten. */
	char *next = KRB5_UTF8_NEXT(end);
	*end = '\0';
	end = next;
    }

    *last = end;

    return begin;
}
550