/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 1998-1999, 2001-2002 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifndef	COMMON_DEFS_H
#define	COMMON_DEFS_H

#include <sys/types.h>

/*
 * Classification codes for characters that cannot be converted as-is, and
 * the replacement characters emitted for non-identical character cases.
 */
#define	ICV_TYPE_NON_IDENTICAL_CHAR	(-1)
#define	ICV_TYPE_ILLEGAL_CHAR		(-2)

#define	ICV_CHAR_ASCII_REPLACEMENT	('?')
#define	ICV_CHAR_UTF8_REPLACEMENT	(0x00efbfbd)
#define	ICV_CHAR_UCS2_REPLACEMENT	(0xfffd)

/* Short alias used only to keep the table below readable; undefined after. */
#define	IL_				ICV_TYPE_ILLEGAL_CHAR

/*
 * Local boolean type.
 * NOTE(review): the enumerator names collide with the <stdbool.h> macros,
 * so <stdbool.h> must not be included ahead of this header.
 */
typedef enum { false = 0, true = 1 } boolean;

/*
 * Length, in bytes, of the UTF-8 character introduced by a given first byte,
 * or ICV_TYPE_ILLEGAL_CHAR when the byte cannot start a character
 * (0x80-0xC1 and 0xF5-0xFF).  The index is the value of the first byte.
 *
 * The element type is explicitly signed: the table stores the negative
 * ICV_TYPE_ILLEGAL_CHAR, which would read back as a large positive value
 * on platforms where plain char is unsigned.
 */
static const signed char number_of_bytes_in_utf8_char[0x100] = {
	/* 00 - 0F */
	1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
	/* 10 - 1F */
	1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
	/* 20 - 2F */
	1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
	/* 30 - 3F */
	1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
	/* 40 - 4F */
	1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
	/* 50 - 5F */
	1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
	/* 60 - 6F */
	1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
	/* 70 - 7F */
	1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,

	/* 80 - 8F */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
	/* 90 - 9F */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
	/* A0 - AF */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
	/* B0 - BF */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

	/* C0 - CF */
	IL_, IL_, 2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
	/* D0 - DF */
	2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
	/* E0 - EF */
	3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
	/* F0 - FF */
	4,   4,   4,   4,   4,   IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
};

#undef IL_

/*
 * Following is a vector of bit-masks to get used bits in the first byte of
 * a UTF-8 character.  Index is the number of bytes in the UTF-8 character
 * and the index value comes from above table.
 */
static const char masks_tbl[7] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };

/*
 * The following two vectors provide the valid minimum and maximum values
 * for the 2nd byte of a multibyte UTF-8 character, for better illegal
 * sequence checking.  The index value must be the value of the first byte
 * of the UTF-8 character.  Entries for bytes that do not start a multibyte
 * character are 0 in both vectors.
 */
static const unsigned char valid_min_2nd_byte[0x100] = {
	/* 00 - 0F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* 10 - 1F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* 20 - 2F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* 30 - 3F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* 40 - 4F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* 50 - 5F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* 60 - 6F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* 70 - 7F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* 80 - 8F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* 90 - 9F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* A0 - AF */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* B0 - BF */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* C0 - C7 (C0 and C1 never start a valid character) */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* C8 - CF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* D0 - D7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* D8 - DF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* E0 - E7 (E0 has a raised minimum of 0xa0) */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* E8 - EF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* F0 - F7 (F0 has a raised minimum of 0x90) */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
	/* F8 - FF */
	0,    0,    0,    0,    0,    0,    0,    0,
};

static const unsigned char valid_max_2nd_byte[0x100] = {
	/* 00 - 0F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* 10 - 1F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* 20 - 2F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* 30 - 3F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* 40 - 4F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* 50 - 5F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* 60 - 6F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* 70 - 7F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* 80 - 8F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* 90 - 9F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* A0 - AF */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* B0 - BF */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* C0 - C7 (C0 and C1 never start a valid character) */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* C8 - CF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* D0 - D7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* D8 - DF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* E0 - E7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* E8 - EF (ED has a lowered maximum of 0x9f) */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
	/* F0 - F7 (F4 has a lowered maximum of 0x8f) */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
	/* F8 - FF */
	0,    0,    0,    0,    0,    0,    0,    0,
};


/*
 * Following "6" and "0x3f" came from 10xx xxxx bit representation of UTF-8
 * characters' second to sixth bytes.
 */
#define	ICV_UTF8_BIT_SHIFT		6
#define	ICV_UTF8_BIT_MASK		0x3f
#define	ICV_FETCH_UTF8_BOM_SIZE		6

/*
 * Fetch sizes in bytes.  ICV_FETCH_UCS_SIZE and ICV_FETCH_UCS_SIZE_TWO
 * (twice that size) are only defined when one of the encoding selector
 * macros tested below is defined before this header is included.
 */
#define	ICV_FETCH_UCS4_SIZE		4
#if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE) || \
	defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
#define	ICV_FETCH_UCS_SIZE		2
#define	ICV_FETCH_UCS_SIZE_TWO		4
#elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
#define	ICV_FETCH_UCS_SIZE		4
#define	ICV_FETCH_UCS_SIZE_TWO		8
#endif


/*
 * UTF-8 representations of critical values
 */
#define	ICV_UTF8_REPRESENTATION_d800		(0x00eda080UL)
#define	ICV_UTF8_REPRESENTATION_dfff		(0x00edbfbfUL)
#define	ICV_UTF8_REPRESENTATION_fffe		(0x00efbfbeUL)
#define	ICV_UTF8_REPRESENTATION_ffff		(0x00efbfbfUL)
#define	ICV_UTF8_REPRESENTATION_7fffffff	(0x00fdbfbfbfbfbfULL)

/*
 * common utility to convert utf8 string to unicode
 *
 * The byte pointer is spelled "unsigned char *" so the prototype matches
 * is_valid_utf8_string() below and does not depend on the uchar_t typedef.
 * NOTE(review): assumes the platform's uchar_t is a typedef for
 * unsigned char, so the definition (not visible here) still matches.
 */
extern int convert_utf8_to_ucs4(unsigned char *, int, unsigned int *);

extern int is_valid_utf8_string(unsigned char *, int);

/* UCS-2/UCS-4/UTF-16/UTF-32 requires state management. */
typedef struct {
	boolean		bom_written;
	boolean		little_endian;
} ucs_state_t;

#endif	/* COMMON_DEFS_H */