/*
 * The contents of this file are subject to the Netscape Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/NPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code is Mozilla Communicator client code, released
 * March 31, 1998.
 *
 * The Initial Developer of the Original Code is Netscape
 * Communications Corporation. Portions created by Netscape are
 * Copyright (C) 1998-1999 Netscape Communications Corporation. All
 * Rights Reserved.
 *
 * Contributor(s):
 */
227c478bd9Sstevel@tonic-gate
/* utf8.c - misc. utf8 "string" functions. */
247c478bd9Sstevel@tonic-gate #include "ldap-int.h"
257c478bd9Sstevel@tonic-gate
/*
 * Sequence length implied by the first byte of a UTF-8 character,
 * indexed by the top six bits of that byte: (byte >> 2) & 0x3F.
 *
 *   0x00-0x7F -> 1   (ASCII, including '\0')
 *   0x80-0xBF -> 0   (continuation byte: not a valid first byte)
 *   0xC0-0xDF -> 2
 *   0xE0-0xEF -> 3
 *   0xF0-0xF7 -> 4
 *   0xF8-0xFB -> 5
 *   0xFC-0xFF -> 6
 *
 * The table is only ever read, so it is declared const.
 */
static const char UTF8len[64] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6
};
317c478bd9Sstevel@tonic-gate
327c478bd9Sstevel@tonic-gate int
337c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8len(const char * s)347c478bd9Sstevel@tonic-gate ldap_utf8len (const char* s)
357c478bd9Sstevel@tonic-gate /* Return the number of char's in the character at *s. */
367c478bd9Sstevel@tonic-gate {
377c478bd9Sstevel@tonic-gate return ldap_utf8next((char*)s) - s;
387c478bd9Sstevel@tonic-gate }
397c478bd9Sstevel@tonic-gate
407c478bd9Sstevel@tonic-gate char*
417c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8next(char * s)427c478bd9Sstevel@tonic-gate ldap_utf8next (char* s)
437c478bd9Sstevel@tonic-gate /* Return a pointer to the character immediately following *s.
447c478bd9Sstevel@tonic-gate Handle any valid UTF-8 character, including '\0' and ASCII.
457c478bd9Sstevel@tonic-gate Try to handle a misaligned pointer or a malformed character.
467c478bd9Sstevel@tonic-gate */
477c478bd9Sstevel@tonic-gate {
487c478bd9Sstevel@tonic-gate register unsigned char* next = (unsigned char*)s;
497c478bd9Sstevel@tonic-gate switch (UTF8len [(*next >> 2) & 0x3F]) {
507c478bd9Sstevel@tonic-gate case 0: /* erroneous: s points to the middle of a character. */
517c478bd9Sstevel@tonic-gate case 6: if ((*++next & 0xC0) != 0x80) break;
52*d7fdecd2SToomas Soome /* FALLTHROUGH */
537c478bd9Sstevel@tonic-gate case 5: if ((*++next & 0xC0) != 0x80) break;
54*d7fdecd2SToomas Soome /* FALLTHROUGH */
557c478bd9Sstevel@tonic-gate case 4: if ((*++next & 0xC0) != 0x80) break;
56*d7fdecd2SToomas Soome /* FALLTHROUGH */
577c478bd9Sstevel@tonic-gate case 3: if ((*++next & 0xC0) != 0x80) break;
58*d7fdecd2SToomas Soome /* FALLTHROUGH */
597c478bd9Sstevel@tonic-gate case 2: if ((*++next & 0xC0) != 0x80) break;
60*d7fdecd2SToomas Soome /* FALLTHROUGH */
617c478bd9Sstevel@tonic-gate case 1: ++next;
627c478bd9Sstevel@tonic-gate }
637c478bd9Sstevel@tonic-gate return (char*) next;
647c478bd9Sstevel@tonic-gate }
657c478bd9Sstevel@tonic-gate
/*
 * Return a pointer to the character immediately preceding *s.
 *
 * Walks backwards from s over continuation bytes (10xxxxxx) until it
 * reaches a byte that is not one, or until it has moved back six
 * bytes, whichever comes first.  The six-byte bound keeps the scan
 * finite on malformed input.
 *
 * The byte at s[-1] is always read, so s must not point at the first
 * byte of its buffer.
 *
 * Unlike computing "s - 6" up front as a stop marker, the step counter
 * used here never forms a pointer to a byte that is not also read.
 */
char *
LDAP_CALL
ldap_utf8prev(char *s)
{
    unsigned char *cursor = (unsigned char *)s;
    int step;

    for (step = 0; step < 6; ++step) {
        --cursor;
        if ((*cursor & 0xC0) != 0x80) {
            /* Found a first byte (or ASCII): this is the character start. */
            break;
        }
    }
    return (char *)cursor;
}
817c478bd9Sstevel@tonic-gate
827c478bd9Sstevel@tonic-gate int
837c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8copy(char * dst,const char * src)847c478bd9Sstevel@tonic-gate ldap_utf8copy (char* dst, const char* src)
857c478bd9Sstevel@tonic-gate /* Copy a character from src to dst; return the number of char's copied.
867c478bd9Sstevel@tonic-gate Handle any valid UTF-8 character, including '\0' and ASCII.
877c478bd9Sstevel@tonic-gate Try to handle a misaligned pointer or a malformed character.
887c478bd9Sstevel@tonic-gate */
897c478bd9Sstevel@tonic-gate {
907c478bd9Sstevel@tonic-gate register const unsigned char* s = (const unsigned char*)src;
917c478bd9Sstevel@tonic-gate switch (UTF8len [(*s >> 2) & 0x3F]) {
927c478bd9Sstevel@tonic-gate case 0: /* erroneous: s points to the middle of a character. */
937c478bd9Sstevel@tonic-gate case 6: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
94*d7fdecd2SToomas Soome /* FALLTHROUGH */
957c478bd9Sstevel@tonic-gate case 5: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
96*d7fdecd2SToomas Soome /* FALLTHROUGH */
977c478bd9Sstevel@tonic-gate case 4: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
98*d7fdecd2SToomas Soome /* FALLTHROUGH */
997c478bd9Sstevel@tonic-gate case 3: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
100*d7fdecd2SToomas Soome /* FALLTHROUGH */
1017c478bd9Sstevel@tonic-gate case 2: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
102*d7fdecd2SToomas Soome /* FALLTHROUGH */
1037c478bd9Sstevel@tonic-gate case 1: *dst = *s++;
1047c478bd9Sstevel@tonic-gate }
1057c478bd9Sstevel@tonic-gate return s - (const unsigned char*)src;
1067c478bd9Sstevel@tonic-gate }
1077c478bd9Sstevel@tonic-gate
1087c478bd9Sstevel@tonic-gate size_t
1097c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8characters(const char * src)1107c478bd9Sstevel@tonic-gate ldap_utf8characters (const char* src)
1117c478bd9Sstevel@tonic-gate /* Return the number of UTF-8 characters in the 0-terminated array s. */
1127c478bd9Sstevel@tonic-gate {
1137c478bd9Sstevel@tonic-gate register char* s = (char*)src;
1147c478bd9Sstevel@tonic-gate size_t n;
1157c478bd9Sstevel@tonic-gate for (n = 0; *s; LDAP_UTF8INC(s)) ++n;
1167c478bd9Sstevel@tonic-gate return n;
1177c478bd9Sstevel@tonic-gate }
1187c478bd9Sstevel@tonic-gate
1197c478bd9Sstevel@tonic-gate unsigned long LDAP_CALL
ldap_utf8getcc(const char ** src)1207c478bd9Sstevel@tonic-gate ldap_utf8getcc( const char** src )
1217c478bd9Sstevel@tonic-gate {
1227c478bd9Sstevel@tonic-gate register unsigned long c;
1237c478bd9Sstevel@tonic-gate register const unsigned char* s = (const unsigned char*)*src;
1247c478bd9Sstevel@tonic-gate switch (UTF8len [(*s >> 2) & 0x3F]) {
1257c478bd9Sstevel@tonic-gate case 0: /* erroneous: s points to the middle of a character. */
1267c478bd9Sstevel@tonic-gate c = (*s++) & 0x3F; goto more5;
1277c478bd9Sstevel@tonic-gate case 1: c = (*s++); break;
1287c478bd9Sstevel@tonic-gate case 2: c = (*s++) & 0x1F; goto more1;
1297c478bd9Sstevel@tonic-gate case 3: c = (*s++) & 0x0F; goto more2;
1307c478bd9Sstevel@tonic-gate case 4: c = (*s++) & 0x07; goto more3;
1317c478bd9Sstevel@tonic-gate case 5: c = (*s++) & 0x03; goto more4;
1327c478bd9Sstevel@tonic-gate case 6: c = (*s++) & 0x01; goto more5;
1337c478bd9Sstevel@tonic-gate more5: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
1347c478bd9Sstevel@tonic-gate more4: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
1357c478bd9Sstevel@tonic-gate more3: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
1367c478bd9Sstevel@tonic-gate more2: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
1377c478bd9Sstevel@tonic-gate more1: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
1387c478bd9Sstevel@tonic-gate break;
1397c478bd9Sstevel@tonic-gate }
1407c478bd9Sstevel@tonic-gate *src = (const char*)s;
1417c478bd9Sstevel@tonic-gate return c;
1427c478bd9Sstevel@tonic-gate }
1437c478bd9Sstevel@tonic-gate
1447c478bd9Sstevel@tonic-gate char*
1457c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8strtok_r(char * sp,const char * brk,char ** next)1467c478bd9Sstevel@tonic-gate ldap_utf8strtok_r( char* sp, const char* brk, char** next)
1477c478bd9Sstevel@tonic-gate {
1487c478bd9Sstevel@tonic-gate const char *bp;
1497c478bd9Sstevel@tonic-gate unsigned long sc, bc;
1507c478bd9Sstevel@tonic-gate char *tok;
1517c478bd9Sstevel@tonic-gate
1527c478bd9Sstevel@tonic-gate if (sp == NULL && (sp = *next) == NULL)
1537c478bd9Sstevel@tonic-gate return NULL;
1547c478bd9Sstevel@tonic-gate
1557c478bd9Sstevel@tonic-gate /* Skip leading delimiters; roughly, sp += strspn(sp, brk) */
1567c478bd9Sstevel@tonic-gate cont:
1577c478bd9Sstevel@tonic-gate sc = LDAP_UTF8GETC(sp);
1587c478bd9Sstevel@tonic-gate for (bp = brk; (bc = LDAP_UTF8GETCC(bp)) != 0;) {
1597c478bd9Sstevel@tonic-gate if (sc == bc)
1607c478bd9Sstevel@tonic-gate goto cont;
1617c478bd9Sstevel@tonic-gate }
1627c478bd9Sstevel@tonic-gate
1637c478bd9Sstevel@tonic-gate if (sc == 0) { /* no non-delimiter characters */
1647c478bd9Sstevel@tonic-gate *next = NULL;
1657c478bd9Sstevel@tonic-gate return NULL;
1667c478bd9Sstevel@tonic-gate }
1677c478bd9Sstevel@tonic-gate tok = LDAP_UTF8PREV(sp);
1687c478bd9Sstevel@tonic-gate
1697c478bd9Sstevel@tonic-gate /* Scan token; roughly, sp += strcspn(sp, brk)
1707c478bd9Sstevel@tonic-gate * Note that brk must be 0-terminated; we stop if we see that, too.
1717c478bd9Sstevel@tonic-gate */
1727c478bd9Sstevel@tonic-gate while (1) {
1737c478bd9Sstevel@tonic-gate sc = LDAP_UTF8GETC(sp);
1747c478bd9Sstevel@tonic-gate bp = brk;
1757c478bd9Sstevel@tonic-gate do {
1767c478bd9Sstevel@tonic-gate if ((bc = LDAP_UTF8GETCC(bp)) == sc) {
1777c478bd9Sstevel@tonic-gate if (sc == 0) {
1787c478bd9Sstevel@tonic-gate *next = NULL;
1797c478bd9Sstevel@tonic-gate } else {
1807c478bd9Sstevel@tonic-gate *next = sp;
1817c478bd9Sstevel@tonic-gate *(LDAP_UTF8PREV(sp)) = 0;
1827c478bd9Sstevel@tonic-gate }
1837c478bd9Sstevel@tonic-gate return tok;
1847c478bd9Sstevel@tonic-gate }
1857c478bd9Sstevel@tonic-gate } while (bc != 0);
1867c478bd9Sstevel@tonic-gate }
1877c478bd9Sstevel@tonic-gate /* NOTREACHED */
1887c478bd9Sstevel@tonic-gate }
1897c478bd9Sstevel@tonic-gate
/*
 * Return 1 if the byte at *s is an ASCII letter or decimal digit,
 * 0 otherwise.  Any byte with the high bit set (part of a multi-byte
 * UTF-8 character) yields 0.
 */
int
LDAP_CALL
ldap_utf8isalnum(char *s)
{
    const unsigned char ch = *(unsigned char *)s;

    if (ch & 0x80) {
        return 0;
    }
    return (ch >= 'A' && ch <= 'Z') ||
           (ch >= 'a' && ch <= 'z') ||
           (ch >= '0' && ch <= '9');
}
2017c478bd9Sstevel@tonic-gate
/*
 * Return 1 if the byte at *s is an ASCII letter (A-Z or a-z),
 * 0 otherwise.  Any byte with the high bit set yields 0.
 */
int
LDAP_CALL
ldap_utf8isalpha(char *s)
{
    const unsigned char ch = *(unsigned char *)s;
    int upper, lower;

    if (ch & 0x80) {
        return 0;
    }
    upper = (ch >= 'A' && ch <= 'Z');
    lower = (ch >= 'a' && ch <= 'z');
    return upper || lower;
}
2127c478bd9Sstevel@tonic-gate
/*
 * Return 1 if the byte at *s is an ASCII decimal digit (0-9),
 * 0 otherwise.  Any byte with the high bit set yields 0.
 */
int
LDAP_CALL
ldap_utf8isdigit(char *s)
{
    const unsigned char ch = *(unsigned char *)s;

    return !(ch & 0x80) && ch >= '0' && ch <= '9';
}
2227c478bd9Sstevel@tonic-gate
/*
 * Return 1 if the byte at *s is an ASCII hexadecimal digit
 * (0-9, A-F or a-f), 0 otherwise.  Any byte with the high bit set
 * yields 0.
 */
int
LDAP_CALL
ldap_utf8isxdigit(char *s)
{
    const unsigned char ch = *(unsigned char *)s;

    if (ch & 0x80) {
        return 0;
    }
    if (ch >= '0' && ch <= '9') {
        return 1;
    }
    return (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
}
2347c478bd9Sstevel@tonic-gate
2357c478bd9Sstevel@tonic-gate int
2367c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8isspace(char * s)2377c478bd9Sstevel@tonic-gate ldap_utf8isspace( char* s )
2387c478bd9Sstevel@tonic-gate {
2397c478bd9Sstevel@tonic-gate register unsigned char *c = (unsigned char*)s;
2407c478bd9Sstevel@tonic-gate int len = ldap_utf8len(s);
2417c478bd9Sstevel@tonic-gate
2427c478bd9Sstevel@tonic-gate if (len == 0) {
2437c478bd9Sstevel@tonic-gate return 0;
2447c478bd9Sstevel@tonic-gate } else if (len == 1) {
2457c478bd9Sstevel@tonic-gate switch (*c) {
2467c478bd9Sstevel@tonic-gate case 0x09:
2477c478bd9Sstevel@tonic-gate case 0x0A:
2487c478bd9Sstevel@tonic-gate case 0x0B:
2497c478bd9Sstevel@tonic-gate case 0x0C:
2507c478bd9Sstevel@tonic-gate case 0x0D:
2517c478bd9Sstevel@tonic-gate case 0x20:
2527c478bd9Sstevel@tonic-gate return 1;
2537c478bd9Sstevel@tonic-gate default:
2547c478bd9Sstevel@tonic-gate return 0;
2557c478bd9Sstevel@tonic-gate }
2567c478bd9Sstevel@tonic-gate } else if (len == 2) {
2577c478bd9Sstevel@tonic-gate if (*c == 0xc2) {
2587c478bd9Sstevel@tonic-gate return *(c+1) == 0x80;
2597c478bd9Sstevel@tonic-gate }
2607c478bd9Sstevel@tonic-gate } else if (len == 3) {
2617c478bd9Sstevel@tonic-gate if (*c == 0xE2) {
2627c478bd9Sstevel@tonic-gate c++;
2637c478bd9Sstevel@tonic-gate if (*c == 0x80) {
2647c478bd9Sstevel@tonic-gate c++;
2657c478bd9Sstevel@tonic-gate return (*c>=0x80 && *c<=0x8a);
2667c478bd9Sstevel@tonic-gate }
2677c478bd9Sstevel@tonic-gate } else if (*c == 0xE3) {
2687c478bd9Sstevel@tonic-gate return (*(c+1)==0x80) && (*(c+2)==0x80);
2697c478bd9Sstevel@tonic-gate } else if (*c==0xEF) {
2707c478bd9Sstevel@tonic-gate return (*(c+1)==0xBB) && (*(c+2)==0xBF);
2717c478bd9Sstevel@tonic-gate }
2727c478bd9Sstevel@tonic-gate return 0;
2737c478bd9Sstevel@tonic-gate }
2747c478bd9Sstevel@tonic-gate
2757c478bd9Sstevel@tonic-gate /* should never reach here */
2767c478bd9Sstevel@tonic-gate return 0;
2777c478bd9Sstevel@tonic-gate }
278