/*
 * The contents of this file are subject to the Netscape Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/NPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code is Mozilla Communicator client code, released
 * March 31, 1998.
 *
 * The Initial Developer of the Original Code is Netscape
 * Communications Corporation. Portions created by Netscape are
 * Copyright (C) 1998-1999 Netscape Communications Corporation. All
 * Rights Reserved.
 *
 * Contributor(s):
 */
227c478bd9Sstevel@tonic-gate 
/* utf8.c - misc. utf8 "string" functions. */
247c478bd9Sstevel@tonic-gate #include "ldap-int.h"
257c478bd9Sstevel@tonic-gate 
/*
 * UTF8len is indexed by the top six bits of a byte (byte >> 2) and gives
 * the total length, in bytes, of the UTF-8 sequence such a byte starts:
 *   entries  0-31 (bytes 0x00-0x7F): 1, a single-byte character;
 *   entries 32-47 (bytes 0x80-0xBF): 0, a continuation byte, which cannot
 *                                    start a character;
 *   entries 48-63 (bytes 0xC0-0xFF): 2 through 6, a multi-byte lead byte.
 * Nothing in this file writes to the table, so it is declared const.
 */
static const char UTF8len[64]
= {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6};
317c478bd9Sstevel@tonic-gate 
int
LDAP_CALL
ldap_utf8len (const char* s)
     /* Return the number of char's (bytes) occupied by the character at *s,
        computed as the distance from s to the pointer that ldap_utf8next
        returns for it.
     */
{
    const char* following = ldap_utf8next((char*)s);

    return following - s;
}
397c478bd9Sstevel@tonic-gate 
407c478bd9Sstevel@tonic-gate char*
417c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8next(char * s)427c478bd9Sstevel@tonic-gate ldap_utf8next (char* s)
437c478bd9Sstevel@tonic-gate      /* Return a pointer to the character immediately following *s.
447c478bd9Sstevel@tonic-gate 	Handle any valid UTF-8 character, including '\0' and ASCII.
457c478bd9Sstevel@tonic-gate 	Try to handle a misaligned pointer or a malformed character.
467c478bd9Sstevel@tonic-gate      */
477c478bd9Sstevel@tonic-gate {
487c478bd9Sstevel@tonic-gate     register unsigned char* next = (unsigned char*)s;
497c478bd9Sstevel@tonic-gate     switch (UTF8len [(*next >> 2) & 0x3F]) {
507c478bd9Sstevel@tonic-gate       case 0: /* erroneous: s points to the middle of a character. */
517c478bd9Sstevel@tonic-gate       case 6: if ((*++next & 0xC0) != 0x80) break;
52*d7fdecd2SToomas Soome 	/* FALLTHROUGH */
537c478bd9Sstevel@tonic-gate       case 5: if ((*++next & 0xC0) != 0x80) break;
54*d7fdecd2SToomas Soome 	/* FALLTHROUGH */
557c478bd9Sstevel@tonic-gate       case 4: if ((*++next & 0xC0) != 0x80) break;
56*d7fdecd2SToomas Soome 	/* FALLTHROUGH */
577c478bd9Sstevel@tonic-gate       case 3: if ((*++next & 0xC0) != 0x80) break;
58*d7fdecd2SToomas Soome 	/* FALLTHROUGH */
597c478bd9Sstevel@tonic-gate       case 2: if ((*++next & 0xC0) != 0x80) break;
60*d7fdecd2SToomas Soome 	/* FALLTHROUGH */
617c478bd9Sstevel@tonic-gate       case 1: ++next;
627c478bd9Sstevel@tonic-gate     }
637c478bd9Sstevel@tonic-gate     return (char*) next;
647c478bd9Sstevel@tonic-gate }
657c478bd9Sstevel@tonic-gate 
char*
LDAP_CALL
ldap_utf8prev (char* s)
     /* Return a pointer to the character immediately preceding *s.
        Handle any valid UTF-8 character, including '\0' and ASCII.
        Try to handle a misaligned pointer or a malformed character:
        step back over continuation bytes (10xxxxxx), but never more than
        6 bytes in total.  The byte before s is always read, so s must not
        point at the very start of its buffer.
     */
{
    unsigned char* cursor = (unsigned char*)s;
    int steps;

    for (steps = 0; steps < 6; ++steps) {
        --cursor;
        if ((*cursor & 0xC0) != 0x80) {
            break;	/* found a byte that can start a character */
        }
    }
    return (char*) cursor;
}
817c478bd9Sstevel@tonic-gate 
827c478bd9Sstevel@tonic-gate int
837c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8copy(char * dst,const char * src)847c478bd9Sstevel@tonic-gate ldap_utf8copy (char* dst, const char* src)
857c478bd9Sstevel@tonic-gate      /* Copy a character from src to dst; return the number of char's copied.
867c478bd9Sstevel@tonic-gate 	Handle any valid UTF-8 character, including '\0' and ASCII.
877c478bd9Sstevel@tonic-gate 	Try to handle a misaligned pointer or a malformed character.
887c478bd9Sstevel@tonic-gate      */
897c478bd9Sstevel@tonic-gate {
907c478bd9Sstevel@tonic-gate     register const unsigned char* s = (const unsigned char*)src;
917c478bd9Sstevel@tonic-gate     switch (UTF8len [(*s >> 2) & 0x3F]) {
927c478bd9Sstevel@tonic-gate       case 0: /* erroneous: s points to the middle of a character. */
937c478bd9Sstevel@tonic-gate       case 6: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
94*d7fdecd2SToomas Soome 	/* FALLTHROUGH */
957c478bd9Sstevel@tonic-gate       case 5: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
96*d7fdecd2SToomas Soome 	/* FALLTHROUGH */
977c478bd9Sstevel@tonic-gate       case 4: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
98*d7fdecd2SToomas Soome 	/* FALLTHROUGH */
997c478bd9Sstevel@tonic-gate       case 3: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
100*d7fdecd2SToomas Soome 	/* FALLTHROUGH */
1017c478bd9Sstevel@tonic-gate       case 2: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
102*d7fdecd2SToomas Soome 	/* FALLTHROUGH */
1037c478bd9Sstevel@tonic-gate       case 1: *dst   = *s++;
1047c478bd9Sstevel@tonic-gate     }
1057c478bd9Sstevel@tonic-gate     return s - (const unsigned char*)src;
1067c478bd9Sstevel@tonic-gate }
1077c478bd9Sstevel@tonic-gate 
size_t
LDAP_CALL
ldap_utf8characters (const char* src)
     /* Return the number of UTF-8 characters in the 0-terminated array src.
        The terminating 0 is not counted.  The pointer is advanced with
        LDAP_UTF8INC, which is defined outside this file.
     */
{
    char* cursor = (char*)src;
    size_t count = 0;

    while (*cursor) {
        ++count;
        LDAP_UTF8INC(cursor);
    }
    return count;
}
1187c478bd9Sstevel@tonic-gate 
1197c478bd9Sstevel@tonic-gate unsigned long LDAP_CALL
ldap_utf8getcc(const char ** src)1207c478bd9Sstevel@tonic-gate ldap_utf8getcc( const char** src )
1217c478bd9Sstevel@tonic-gate {
1227c478bd9Sstevel@tonic-gate     register unsigned long c;
1237c478bd9Sstevel@tonic-gate     register const unsigned char* s = (const unsigned char*)*src;
1247c478bd9Sstevel@tonic-gate     switch (UTF8len [(*s >> 2) & 0x3F]) {
1257c478bd9Sstevel@tonic-gate       case 0: /* erroneous: s points to the middle of a character. */
1267c478bd9Sstevel@tonic-gate 	      c = (*s++) & 0x3F; goto more5;
1277c478bd9Sstevel@tonic-gate       case 1: c = (*s++); break;
1287c478bd9Sstevel@tonic-gate       case 2: c = (*s++) & 0x1F; goto more1;
1297c478bd9Sstevel@tonic-gate       case 3: c = (*s++) & 0x0F; goto more2;
1307c478bd9Sstevel@tonic-gate       case 4: c = (*s++) & 0x07; goto more3;
1317c478bd9Sstevel@tonic-gate       case 5: c = (*s++) & 0x03; goto more4;
1327c478bd9Sstevel@tonic-gate       case 6: c = (*s++) & 0x01; goto more5;
1337c478bd9Sstevel@tonic-gate       more5: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
1347c478bd9Sstevel@tonic-gate       more4: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
1357c478bd9Sstevel@tonic-gate       more3: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
1367c478bd9Sstevel@tonic-gate       more2: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
1377c478bd9Sstevel@tonic-gate       more1: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
1387c478bd9Sstevel@tonic-gate 	break;
1397c478bd9Sstevel@tonic-gate     }
1407c478bd9Sstevel@tonic-gate     *src = (const char*)s;
1417c478bd9Sstevel@tonic-gate     return c;
1427c478bd9Sstevel@tonic-gate }
1437c478bd9Sstevel@tonic-gate 
1447c478bd9Sstevel@tonic-gate char*
1457c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8strtok_r(char * sp,const char * brk,char ** next)1467c478bd9Sstevel@tonic-gate ldap_utf8strtok_r( char* sp, const char* brk, char** next)
1477c478bd9Sstevel@tonic-gate {
1487c478bd9Sstevel@tonic-gate     const char *bp;
1497c478bd9Sstevel@tonic-gate     unsigned long sc, bc;
1507c478bd9Sstevel@tonic-gate     char *tok;
1517c478bd9Sstevel@tonic-gate 
1527c478bd9Sstevel@tonic-gate     if (sp == NULL && (sp = *next) == NULL)
1537c478bd9Sstevel@tonic-gate       return NULL;
1547c478bd9Sstevel@tonic-gate 
1557c478bd9Sstevel@tonic-gate     /* Skip leading delimiters; roughly, sp += strspn(sp, brk) */
1567c478bd9Sstevel@tonic-gate   cont:
1577c478bd9Sstevel@tonic-gate     sc = LDAP_UTF8GETC(sp);
1587c478bd9Sstevel@tonic-gate     for (bp = brk; (bc = LDAP_UTF8GETCC(bp)) != 0;) {
1597c478bd9Sstevel@tonic-gate 	if (sc == bc)
1607c478bd9Sstevel@tonic-gate 	  goto cont;
1617c478bd9Sstevel@tonic-gate     }
1627c478bd9Sstevel@tonic-gate 
1637c478bd9Sstevel@tonic-gate     if (sc == 0) { /* no non-delimiter characters */
1647c478bd9Sstevel@tonic-gate 	*next = NULL;
1657c478bd9Sstevel@tonic-gate 	return NULL;
1667c478bd9Sstevel@tonic-gate     }
1677c478bd9Sstevel@tonic-gate     tok = LDAP_UTF8PREV(sp);
1687c478bd9Sstevel@tonic-gate 
1697c478bd9Sstevel@tonic-gate     /* Scan token; roughly, sp += strcspn(sp, brk)
1707c478bd9Sstevel@tonic-gate      * Note that brk must be 0-terminated; we stop if we see that, too.
1717c478bd9Sstevel@tonic-gate      */
1727c478bd9Sstevel@tonic-gate     while (1) {
1737c478bd9Sstevel@tonic-gate 	sc = LDAP_UTF8GETC(sp);
1747c478bd9Sstevel@tonic-gate 	bp = brk;
1757c478bd9Sstevel@tonic-gate 	do {
1767c478bd9Sstevel@tonic-gate 	    if ((bc = LDAP_UTF8GETCC(bp)) == sc) {
1777c478bd9Sstevel@tonic-gate 		if (sc == 0) {
1787c478bd9Sstevel@tonic-gate 		    *next = NULL;
1797c478bd9Sstevel@tonic-gate 		} else {
1807c478bd9Sstevel@tonic-gate 		    *next = sp;
1817c478bd9Sstevel@tonic-gate 		    *(LDAP_UTF8PREV(sp)) = 0;
1827c478bd9Sstevel@tonic-gate 		}
1837c478bd9Sstevel@tonic-gate 		return tok;
1847c478bd9Sstevel@tonic-gate 	    }
1857c478bd9Sstevel@tonic-gate 	} while (bc != 0);
1867c478bd9Sstevel@tonic-gate     }
1877c478bd9Sstevel@tonic-gate     /* NOTREACHED */
1887c478bd9Sstevel@tonic-gate }
1897c478bd9Sstevel@tonic-gate 
int
LDAP_CALL
ldap_utf8isalnum( char* s )
     /* Return 1 if the byte at *s is an ASCII letter or digit
        ('A'-'Z', 'a'-'z', '0'-'9'), else 0.  Only the first byte is
        examined; any byte with the high bit set yields 0.
     */
{
    const unsigned char ch = *(unsigned char*)s;

    if (ch & 0x80) {
        return 0;
    }
    return ('A' <= ch && ch <= 'Z') ||
           ('a' <= ch && ch <= 'z') ||
           ('0' <= ch && ch <= '9');
}
2017c478bd9Sstevel@tonic-gate 
int
LDAP_CALL
ldap_utf8isalpha( char* s )
     /* Return 1 if the byte at *s is an ASCII letter ('A'-'Z' or 'a'-'z'),
        else 0.  Only the first byte is examined; any byte with the high
        bit set yields 0.
     */
{
    const unsigned char ch = *(unsigned char*)s;

    if (ch & 0x80) {
        return 0;
    }
    return ('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z');
}
2127c478bd9Sstevel@tonic-gate 
int
LDAP_CALL
ldap_utf8isdigit( char* s )
     /* Return 1 if the byte at *s is an ASCII decimal digit ('0'-'9'),
        else 0.  Only the first byte is examined; any byte with the high
        bit set yields 0.
     */
{
    const unsigned char ch = *(unsigned char*)s;

    if (ch & 0x80) {
        return 0;
    }
    return ('0' <= ch && ch <= '9') ? 1 : 0;
}
2227c478bd9Sstevel@tonic-gate 
int
LDAP_CALL
ldap_utf8isxdigit( char* s )
     /* Return 1 if the byte at *s is an ASCII hexadecimal digit
        ('0'-'9', 'A'-'F' or 'a'-'f'), else 0.  Only the first byte is
        examined; any byte with the high bit set yields 0.
     */
{
    const unsigned char ch = *(unsigned char*)s;

    if (ch & 0x80) {
        return 0;
    }
    return ('0' <= ch && ch <= '9') ||
           ('A' <= ch && ch <= 'F') ||
           ('a' <= ch && ch <= 'f');
}
2347c478bd9Sstevel@tonic-gate 
int
LDAP_CALL
ldap_utf8isspace( char* s )
     /* Return 1 if the character at *s is one of the byte sequences this
        function treats as white space, else 0.  The sequences are selected
        by the character's length as reported by ldap_utf8len:
          1 byte : 0x09-0x0D and 0x20
          2 bytes: 0xC2 0x80
          3 bytes: 0xE2 0x80 0x80 through 0xE2 0x80 0x8A,
                   0xE3 0x80 0x80, and 0xEF 0xBB 0xBF
        Characters of any other length yield 0.
     */
{
    const unsigned char* b = (const unsigned char*)s;

    switch (ldap_utf8len(s)) {
      case 1:
        return (b[0] >= 0x09 && b[0] <= 0x0D) || b[0] == 0x20;

      case 2:
        /* NOTE(review): 0xC2 0x80 encodes U+0080, not U+00A0
           (0xC2 0xA0); confirm which code point was intended. */
        return b[0] == 0xC2 && b[1] == 0x80;

      case 3:
        if (b[0] == 0xE2) {
            return b[1] == 0x80 && b[2] >= 0x80 && b[2] <= 0x8A;
        }
        if (b[0] == 0xE3) {
            return b[1] == 0x80 && b[2] == 0x80;
        }
        if (b[0] == 0xEF) {
            return b[1] == 0xBB && b[2] == 0xBF;
        }
        return 0;

      default:
        /* length 0 or longer than 3 bytes: never white space here */
        return 0;
    }
}
278