/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifndef	COMMON_DEFS_H
#define	COMMON_DEFS_H

#define	MAGIC_NUMBER				201513

/*
 * ISO/IEC 10646-1/Unicode Byte Order Mark
 *
 * ICV_BOM_IN_LITTLE_ENDIAN depends on the width of the code unit selected
 * at compile time: the 16-bit form is used when one of the UCS-2/UTF-16
 * macros is defined, the 32-bit form otherwise.
 */
#define	ICV_BOM_IN_BIG_ENDIAN			0x00feff
#define	ICV_BOM_IN_LITTLE_ENDIAN_UCS4		0xfffe0000

#if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE) || \
	defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
#define	ICV_BOM_IN_LITTLE_ENDIAN		0x00fffe
#else
#define	ICV_BOM_IN_LITTLE_ENDIAN		0xfffe0000
#endif

/*
 * The following type macros are the possible error cases that can be
 * defined in mapping tables. Valid characters carry their byte length
 * instead, which is always a positive integer.
 */
#define	ICV_TYPE_NON_IDENTICAL_CHAR		(-1)
#define	ICV_TYPE_ILLEGAL_CHAR			(-2)

/* Replacement characters for the non-identical character cases. */
#define	ICV_CHAR_ASCII_REPLACEMENT		('?')
#define	ICV_CHAR_UTF8_REPLACEMENT		(0x00efbfbd)
#define	ICV_CHAR_UCS2_REPLACEMENT		(0xfffd)

typedef enum { false = 0, true = 1 } boolean;

/*
 * Mapping table component for conversions to UTF-8.
 * We only support characters in range of UTF-16.
 *
 * `size' is signed because it holds either a positive byte length or one
 * of the negative ICV_TYPE_* values.
 */
typedef struct {
	unsigned int	u8;
	signed char	size;
} to_utf8_table_component_t;

/* Mapping table component for conversions to a single byte codeset. */
typedef struct {
	unsigned int	u8;
	unsigned char	sb;
} to_sb_table_component_t;

/* UCS-2/UCS-4/UTF-16/UTF-32 requires state management. */
typedef struct {
	boolean		bom_written;
	boolean		little_endian;
} ucs_state_t;

/* Paired input/output states for UCS to UCS conversions. */
typedef struct {
	ucs_state_t	input;
	ucs_state_t	output;
} ucs_ucs_state_t;

/* UTF-7 requires additional state data fields. */
typedef struct {
	boolean		bom_written;
	boolean		little_endian;
	boolean		in_the_middle_of_utf7_sequence;
	unsigned int	remnant;
	signed char	remnant_count;	/* in bits */
	unsigned char	prevch;
} utf7_state_t;

/*
 * The following vector gives the number of bytes in a UTF-8 character.
 * The index is the first byte of the character. Bytes that cannot start
 * a character map to ICV_TYPE_ILLEGAL_CHAR.
 *
 * The element type is explicitly `signed char': the table stores the
 * negative ICV_TYPE_ILLEGAL_CHAR value and the signedness of plain `char'
 * is implementation-defined, so with plain `char' the illegal entries
 * would read back as 254 on platforms where `char' is unsigned.
 */
#define	IL_	ICV_TYPE_ILLEGAL_CHAR

static const signed char number_of_bytes_in_utf8_char[0x100] = {
	/* 00 - 0F */
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	/* 10 - 1F */
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	/* 20 - 2F */
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	/* 30 - 3F */
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	/* 40 - 4F */
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	/* 50 - 5F */
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	/* 60 - 6F */
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	/* 70 - 7F */
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

	/* 80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

	/* 90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

	/* A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

	/* B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

	/* C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
	IL_, IL_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

	/* D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

	/* E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,

	/* F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
	4,  4,  4,  4,  4,  IL_, IL_, IL_,
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
};

#undef IL_

/*
 * The following is a vector of bit-masks used to extract the payload bits
 * from the first byte of a UTF-8 character. The index is the number of
 * bytes in the UTF-8 character, i.e. a value taken from
 * number_of_bytes_in_utf8_char[] above.
 */
static const char masks_tbl[7] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };

/*
 * The following two vectors provide the valid minimum and maximum values
 * for the second byte of a multibyte UTF-8 character, for better illegal
 * sequence checking. The index must be the value of the first byte of the
 * UTF-8 character. Entries for bytes that do not start a multibyte
 * character are zero in both tables.
 */
static const unsigned char valid_min_2nd_byte[0x100] = {
	/* 00 - 0F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 10 - 1F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 20 - 2F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 30 - 3F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 40 - 4F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 50 - 5F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 60 - 6F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 70 - 7F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 80 - 8F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 90 - 9F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* A0 - AF */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* B0 - BF */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,

	/* C0    C1    C2    C3    C4    C5    C6    C7 */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* C8    C9    CA    CB    CC    CD    CE    CF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* D0    D1    D2    D3    D4    D5    D6    D7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* D8    D9    DA    DB    DC    DD    DE    DF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* E0    E1    E2    E3    E4    E5    E6    E7 */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* E8    E9    EA    EB    EC    ED    EE    EF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* F0    F1    F2    F3    F4    F5    F6    F7 */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,

	/* F8    F9    FA    FB    FC    FD    FE    FF */
	0,    0,    0,    0,    0,    0,    0,    0,
};

static const unsigned char valid_max_2nd_byte[0x100] = {
	/* 00 - 0F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 10 - 1F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 20 - 2F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 30 - 3F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 40 - 4F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 50 - 5F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 60 - 6F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 70 - 7F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 80 - 8F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 90 - 9F */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* A0 - AF */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* B0 - BF */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,

	/* C0    C1    C2    C3    C4    C5    C6    C7 */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* C8    C9    CA    CB    CC    CD    CE    CF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* D0    D1    D2    D3    D4    D5    D6    D7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* D8    D9    DA    DB    DC    DD    DE    DF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* E0    E1    E2    E3    E4    E5    E6    E7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* E8    E9    EA    EB    EC    ED    EE    EF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,

	/* F0    F1    F2    F3    F4    F5    F6    F7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,

	/* F8    F9    FA    FB    FC    FD    FE    FF */
	0,    0,    0,    0,    0,    0,    0,    0,
};

/*
 * The following "6" and "0x3f" come from the 10xx xxxx bit representation
 * of the second to sixth bytes of UTF-8 characters.
 */
#define	ICV_UTF8_BIT_SHIFT			6
#define	ICV_UTF8_BIT_MASK			0x3f

#define	ICV_FETCH_UTF8_BOM_SIZE			6

#define	ICV_FETCH_UCS4_SIZE			4

/*
 * Code unit fetch sizes, in bytes, for the UCS flavor selected at compile
 * time. Neither macro is defined when none of the selector macros is.
 */
#if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE) || \
	defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
#define	ICV_FETCH_UCS_SIZE			2
#define	ICV_FETCH_UCS_SIZE_TWO			4
#elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
#define	ICV_FETCH_UCS_SIZE			4
#define	ICV_FETCH_UCS_SIZE_TWO			8
#endif

/*
 * UTF-8 representations of some useful Unicode values.
 *
 * U+FFFE in UTF-8 is 0x00efbfbe and U+FFFF is 0x00efbfbf, but the _fffe
 * and _ffff macros below hold those values already masked with
 * ICV_UTF8_REPRESENTATION_ffff_mask (0x00efbfbe & 0x000fffff == 0x000fbfbe).
 * NOTE(review): presumably callers mask a candidate with the same mask
 * before comparing so that xFFFE/xFFFF of every plane match - confirm
 * against the users of these macros.
 */
#define	ICV_UTF8_REPRESENTATION_d800		(0x00eda080UL)
#define	ICV_UTF8_REPRESENTATION_dfff		(0x00edbfbfUL)
#define	ICV_UTF8_REPRESENTATION_fdd0		(0x00efb790UL)
#define	ICV_UTF8_REPRESENTATION_fdef		(0x00efb7afUL)

#define	ICV_UTF8_REPRESENTATION_fffe		(0x000fbfbeUL)
#define	ICV_UTF8_REPRESENTATION_ffff		(0x000fbfbfUL)
#define	ICV_UTF8_REPRESENTATION_ffff_mask	(0x000fffffUL)

#define	ICV_UTF8_REPRESENTATION_10fffd		(0xf48fbfbdUL)

/*
 * UTF-32 and UCS-4 representations of some useful Unicode values for
 * non-character and out of bound invalid character detection.
 */
#define	ICV_UTF32_NONCHAR_fffe			(0xfffeU)
#define	ICV_UTF32_NONCHAR_ffff			(0xffffU)
#define	ICV_UTF32_NONCHAR_mask			(0xffffU)

#define	ICV_UTF32_SURROGATE_START_d800		(0xd800U)
#define	ICV_UTF32_SURROGATE_END_dfff		(0xdfffU)

#define	ICV_UTF32_ARABIC_NONCHAR_START_fdd0	(0xfdd0U)
#define	ICV_UTF32_ARABIC_NONCHAR_END_fdef	(0xfdefU)

#define	ICV_UTF32_LAST_VALID_CHAR		(0x10fffdU)
#define	ICV_UCS4_LAST_VALID_CHAR		(0x7fffffff)

#endif	/* COMMON_DEFS_H */