1*16d86563SAlexander Pyhalov /*
2*16d86563SAlexander Pyhalov  * CDDL HEADER START
3*16d86563SAlexander Pyhalov  *
4*16d86563SAlexander Pyhalov  * The contents of this file are subject to the terms of the
5*16d86563SAlexander Pyhalov  * Common Development and Distribution License (the "License").
6*16d86563SAlexander Pyhalov  * You may not use this file except in compliance with the License.
7*16d86563SAlexander Pyhalov  *
8*16d86563SAlexander Pyhalov  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9*16d86563SAlexander Pyhalov  * or http://www.opensolaris.org/os/licensing.
10*16d86563SAlexander Pyhalov  * See the License for the specific language governing permissions
11*16d86563SAlexander Pyhalov  * and limitations under the License.
12*16d86563SAlexander Pyhalov  *
13*16d86563SAlexander Pyhalov  * When distributing Covered Code, include this CDDL HEADER in each
14*16d86563SAlexander Pyhalov  * file and include the License file at src/OPENSOLARIS.LICENSE.
15*16d86563SAlexander Pyhalov  * If applicable, add the following below this CDDL HEADER, with the
16*16d86563SAlexander Pyhalov  * fields enclosed by brackets "[]" replaced with your own identifying
17*16d86563SAlexander Pyhalov  * information: Portions Copyright [yyyy] [name of copyright owner]
18*16d86563SAlexander Pyhalov  *
19*16d86563SAlexander Pyhalov  * CDDL HEADER END
20*16d86563SAlexander Pyhalov  */
21*16d86563SAlexander Pyhalov /*
22*16d86563SAlexander Pyhalov  * Copyright 1998-1999, 2001-2002 Sun Microsystems, Inc.  All rights reserved.
23*16d86563SAlexander Pyhalov  * Use is subject to license terms.
24*16d86563SAlexander Pyhalov  */
25*16d86563SAlexander Pyhalov 
26*16d86563SAlexander Pyhalov #ifndef	COMMON_DEFS_H
27*16d86563SAlexander Pyhalov #define	COMMON_DEFS_H
28*16d86563SAlexander Pyhalov 
29*16d86563SAlexander Pyhalov #include <sys/types.h>
30*16d86563SAlexander Pyhalov 
/*
 * Negative markers for non-identical and illegal characters.
 * ICV_TYPE_ILLEGAL_CHAR is also the value stored in
 * number_of_bytes_in_utf8_char[] for bytes that cannot start a character.
 */
#define ICV_TYPE_NON_IDENTICAL_CHAR	(-1)
#define ICV_TYPE_ILLEGAL_CHAR		(-2)

/* Following are replacement characters for non-identical character cases. */
#define	ICV_CHAR_ASCII_REPLACEMENT	('?')
#define	ICV_CHAR_UTF8_REPLACEMENT	(0x00efbfbd)	/* U+FFFD as UTF-8 */
#define	ICV_CHAR_UCS2_REPLACEMENT	(0xfffd)	/* U+FFFD */

/* Short alias that keeps the table below readable; removed right after it. */
#define	IL_				ICV_TYPE_ILLEGAL_CHAR

/*
 * Local boolean type.  NOTE: the enumerator names collide with the
 * true/false macros of <stdbool.h> (and with the C23 keywords), so this
 * header cannot be combined with either.
 */
typedef enum { false = 0, true = 1 } boolean;

/*
 * Number of bytes in a UTF-8 character, indexed by the value of its first
 * byte.  Bytes that cannot start a character (0x80-0xC1 and 0xF5-0xFF) hold
 * ICV_TYPE_ILLEGAL_CHAR.
 *
 * The element type is explicitly "signed char": the signedness of plain
 * "char" is implementation-defined, and where it is unsigned the negative
 * ICV_TYPE_ILLEGAL_CHAR entries would read back as 254 and never compare
 * equal to ICV_TYPE_ILLEGAL_CHAR.
 */
static const signed char number_of_bytes_in_utf8_char[0x100] = {
    /*  00 .. 7F: single byte (ASCII) characters  */
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

    /*  80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,

    /*  90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,

    /*  A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,

    /*  B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,

    /*  C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
	IL_,IL_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

    /*  D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
	 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

    /*  E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
	 3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,

    /*  F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
	 4,  4,  4,  4,  4, IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,
};

#undef IL_
80*16d86563SAlexander Pyhalov 
/*
 * Bit-masks that select the used bits in the first byte of a UTF-8
 * character.  The index is the number of bytes in the character, i.e. the
 * value looked up in number_of_bytes_in_utf8_char[]; index 0 yields an
 * empty mask.
 */
static const char masks_tbl[7] = {
	0x00,	/* 0 */
	0x7f,	/* 1: 0xxx xxxx */
	0x1f,	/* 2: 110x xxxx */
	0x0f,	/* 3: 1110 xxxx */
	0x07,	/* 4: 1111 0xxx */
	0x03,	/* 5: 1111 10xx */
	0x01,	/* 6: 1111 110x */
};
87*16d86563SAlexander Pyhalov 
/*
 * valid_min_2nd_byte[] and valid_max_2nd_byte[] provide the valid minimum
 * and maximum values for the 2nd byte of a multibyte UTF-8 character, for
 * better illegal sequence checking.  The index must be the value of the
 * first byte of the UTF-8 character.
 *
 * Only first bytes 0xC2-0xF4 carry a non-zero entry.  Indexes 0x00-0xBF are
 * not listed and are therefore zero-initialized.
 */
static const unsigned char valid_min_2nd_byte[0x100] = {
    /*  C0 .. CF  (C0 and C1 have no valid 2nd byte)  */
	[0xc0] = 0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
		 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

    /*  D0 .. DF  */
	[0xd0] = 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
		 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

    /*  E0 .. EF  (E0 starts at 0xa0, which excludes overlong forms)  */
	[0xe0] = 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
		 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

    /*  F0 .. FF  (F0 starts at 0x90; F5 and above have no valid 2nd byte)  */
	[0xf0] = 0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
		 0,    0,    0,    0,    0,    0,    0,    0,
};
135*16d86563SAlexander Pyhalov 
/*
 * Largest valid value for the 2nd byte of a multibyte UTF-8 character,
 * indexed by the value of the character's first byte.
 *
 * Only first bytes 0xC2-0xF4 carry a non-zero entry.  Indexes 0x00-0xBF are
 * not listed and are therefore zero-initialized.
 */
static const unsigned char valid_max_2nd_byte[0x100] = {
    /*  C0 .. CF  (C0 and C1 have no valid 2nd byte)  */
	[0xc0] = 0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
		 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

    /*  D0 .. DF  */
	[0xd0] = 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
		 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

    /*  E0 .. EF  (ED stops at 0x9f, which excludes U+D800..U+DFFF)  */
	[0xe0] = 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
		 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,

    /*  F0 .. FF  (F4 stops at 0x8f; F5 and above have no valid 2nd byte)  */
	[0xf0] = 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
		 0,    0,    0,    0,    0,    0,    0,    0,
};
177*16d86563SAlexander Pyhalov 
178*16d86563SAlexander Pyhalov 
/*
 * The second to sixth bytes of a UTF-8 character have the bit pattern
 * 10xx xxxx: each contributes 6 bits, selected with the mask 0x3f.
 */
#define	ICV_UTF8_BIT_SHIFT		6
#define	ICV_UTF8_BIT_MASK		0x3f

#define	ICV_FETCH_UTF8_BOM_SIZE		6

/*
 * Fetch sizes for the UCS encodings (presumably in bytes -- confirm against
 * the converters that use them).  ICV_FETCH_UCS_SIZE depends on which
 * encoding macro the including converter defines, and
 * ICV_FETCH_UCS_SIZE_TWO is twice that value.  Neither is defined when
 * none of the encoding macros below is defined.
 */
#define	ICV_FETCH_UCS4_SIZE		4

#if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE) || \
	defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
#define	ICV_FETCH_UCS_SIZE		2
#define	ICV_FETCH_UCS_SIZE_TWO		4
#elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
#define	ICV_FETCH_UCS_SIZE		4
#define	ICV_FETCH_UCS_SIZE_TWO		8
#endif
197*16d86563SAlexander Pyhalov 
198*16d86563SAlexander Pyhalov 
/*
 * UTF-8 representations of critical values.  Each constant holds the bytes
 * of the encoded character packed into one integer, first byte in the most
 * significant position.
 */
#define	ICV_UTF8_REPRESENTATION_d800	 (0x00eda080UL)	/* ED A0 80 */
#define	ICV_UTF8_REPRESENTATION_dfff	 (0x00edbfbfUL)	/* ED BF BF */
#define	ICV_UTF8_REPRESENTATION_fffe	 (0x00efbfbeUL)	/* EF BF BE */
#define	ICV_UTF8_REPRESENTATION_ffff	 (0x00efbfbfUL)	/* EF BF BF */
/* FD BF BF BF BF BF: six bytes, hence the unsigned long long constant. */
#define	ICV_UTF8_REPRESENTATION_7fffffff (0x00fdbfbfbfbfbfULL)
207*16d86563SAlexander Pyhalov 
208*16d86563SAlexander Pyhalov /*
209*16d86563SAlexander Pyhalov  * common utility to convert utf8 string to unicode
210*16d86563SAlexander Pyhalov  */
211*16d86563SAlexander Pyhalov extern  int convert_utf8_to_ucs4(uchar_t *, int, unsigned int *);
212*16d86563SAlexander Pyhalov 
213*16d86563SAlexander Pyhalov extern  int is_valid_utf8_string(unsigned char *, int);
214*16d86563SAlexander Pyhalov 
215*16d86563SAlexander Pyhalov /* UCS-2/UCS-4/UTF-16/UTF-32 requires state management. */
216*16d86563SAlexander Pyhalov typedef struct {
217*16d86563SAlexander Pyhalov    boolean     bom_written;
218*16d86563SAlexander Pyhalov    boolean     little_endian;
219*16d86563SAlexander Pyhalov } ucs_state_t;
220*16d86563SAlexander Pyhalov 
221*16d86563SAlexander Pyhalov #endif	/* COMMON_DEFS_H */
222