xref: /illumos-gate/usr/src/common/unicode/uconv.c (revision c40a6cd7)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
2503dabef0Sis 
/*
 * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
 * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
 * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
 * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
 * the section 3C man pages.
 * Interface stability: Committed
 */
3403dabef0Sis 
3503dabef0Sis #include <sys/types.h>
364703203dSis #ifdef	_KERNEL
3703dabef0Sis #include <sys/param.h>
3803dabef0Sis #include <sys/sysmacros.h>
3903dabef0Sis #include <sys/systm.h>
4003dabef0Sis #include <sys/debug.h>
4103dabef0Sis #include <sys/kmem.h>
4203dabef0Sis #include <sys/sunddi.h>
434703203dSis #else
444703203dSis #include <sys/u8_textprep.h>
454703203dSis #endif	/* _KERNEL */
4603dabef0Sis #include <sys/byteorder.h>
4703dabef0Sis #include <sys/errno.h>
4803dabef0Sis 
4903dabef0Sis 
/*
 * The max and min values of high and low surrogate pairs of UTF-16,
 * UTF-16 bit shift value, bit mask, and starting value outside of BMP.
 *
 * Note that UCONV_U16_BIT_SHIFT is not a shift count: it is the value
 * 0x400 (1 << 10) that the conversion routines multiply by, divide by, and
 * take the remainder of when joining or splitting a surrogate pair.
 */
#define	UCONV_U16_HI_MIN	(0xd800U)
#define	UCONV_U16_HI_MAX	(0xdbffU)
#define	UCONV_U16_LO_MIN	(0xdc00U)
#define	UCONV_U16_LO_MAX	(0xdfffU)
#define	UCONV_U16_BIT_SHIFT	(0x0400U)
#define	UCONV_U16_BIT_MASK	(0x0fffffU)
#define	UCONV_U16_START		(0x010000U)

/* The maximum value of Unicode coding space and ASCII coding space. */
#define	UCONV_UNICODE_MAX	(0x10ffffU)
#define	UCONV_ASCII_MAX		(0x7fU)

/* The mask values for input and output endians. */
#define	UCONV_IN_ENDIAN_MASKS	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define	UCONV_OUT_ENDIAN_MASKS	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/*
 * Native and reversed endian macros.  "Native" is the big endian flag when
 * _BIG_ENDIAN is defined and the little endian flag otherwise; "reversed" is
 * the opposite one.
 */
#ifdef	_BIG_ENDIAN
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_BIG_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#else
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_BIG_ENDIAN
#endif	/* _BIG_ENDIAN */

/*
 * The Byte Order Mark (BOM) character in normal and reversed byte orderings.
 * UCONV_BOM_SWAPPED is U+FEFF with the two bytes of a 16-bit unit exchanged;
 * UCONV_BOM_SWAPPED_32 is U+FEFF with the four bytes of a 32-bit unit
 * reversed.
 */
#define	UCONV_BOM_NORMAL	(0xfeffU)
#define	UCONV_BOM_SWAPPED	(0xfffeU)
#define	UCONV_BOM_SWAPPED_32	(0xfffe0000U)

/*
 * UTF-32 boundaries based on UTF-8 character byte lengths: each value is the
 * largest character value that is encoded with that many UTF-8 bytes.
 */
#define	UCONV_U8_ONE_BYTE	(0x7fU)
#define	UCONV_U8_TWO_BYTES	(0x7ffU)
#define	UCONV_U8_THREE_BYTES	(0xffffU)
#define	UCONV_U8_FOUR_BYTES	(0x10ffffU)

/* The common minimum and maximum values at the UTF-8 character bytes. */
#define	UCONV_U8_BYTE_MIN	(0x80U)
#define	UCONV_U8_BYTE_MAX	(0xbfU)

/*
 * The following "6" and "0x3f" came from "10xx xxxx" bit representation of
 * UTF-8 character bytes.
 */
#define	UCONV_U8_BIT_SHIFT	6
#define	UCONV_U8_BIT_MASK	0x3f
10403dabef0Sis 
10503dabef0Sis /*
10603dabef0Sis  * The following vector shows remaining bytes in a UTF-8 character.
10703dabef0Sis  * Index will be the first byte of the character.
10803dabef0Sis  */
10903dabef0Sis static const uchar_t remaining_bytes_tbl[0x100] = {
11003dabef0Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
11103dabef0Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
11203dabef0Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
11303dabef0Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
11403dabef0Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
11503dabef0Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
11603dabef0Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
11703dabef0Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
11803dabef0Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
11903dabef0Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
12003dabef0Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
12103dabef0Sis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
12203dabef0Sis 
12303dabef0Sis /*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
12403dabef0Sis 	0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
12503dabef0Sis 
12603dabef0Sis /*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
12703dabef0Sis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
12803dabef0Sis 
12903dabef0Sis /*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
13003dabef0Sis 	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
13103dabef0Sis 
13203dabef0Sis /*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
13303dabef0Sis 	3,  3,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
13403dabef0Sis };
13503dabef0Sis 
13603dabef0Sis /*
13703dabef0Sis  * The following is a vector of bit-masks to get used bits in
13803dabef0Sis  * the first byte of a UTF-8 character.  Index is remaining bytes at above of
13903dabef0Sis  * the character.
14003dabef0Sis  */
141*15d9d0b5Syy #ifdef	_KERNEL
142*15d9d0b5Syy const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
143*15d9d0b5Syy #else
144*15d9d0b5Syy static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
145*15d9d0b5Syy #endif	/* _KERNEL */
14603dabef0Sis 
14703dabef0Sis /*
14803dabef0Sis  * The following two vectors are to provide valid minimum and
14903dabef0Sis  * maximum values for the 2'nd byte of a multibyte UTF-8 character for
15003dabef0Sis  * better illegal sequence checking. The index value must be the value of
15103dabef0Sis  * the first byte of the UTF-8 character.
15203dabef0Sis  */
15303dabef0Sis static const uchar_t valid_min_2nd_byte[0x100] = {
15403dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
15503dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
15603dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
15703dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
15803dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
15903dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
16003dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
16103dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
16203dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
16303dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
16403dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
16503dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
16603dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
16703dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
16803dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
16903dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
17003dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
17103dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
17203dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
17303dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
17403dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
17503dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
17603dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
17703dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
17803dabef0Sis 
17903dabef0Sis /*	C0    C1    C2    C3    C4    C5    C6    C7 */
18003dabef0Sis 	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
18103dabef0Sis 
18203dabef0Sis /*	C8    C9    CA    CB    CC    CD    CE    CF */
18303dabef0Sis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
18403dabef0Sis 
18503dabef0Sis /*	D0    D1    D2    D3    D4    D5    D6    D7 */
18603dabef0Sis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
18703dabef0Sis 
18803dabef0Sis /*	D8    D9    DA    DB    DC    DD    DE    DF */
18903dabef0Sis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
19003dabef0Sis 
19103dabef0Sis /*	E0    E1    E2    E3    E4    E5    E6    E7 */
19203dabef0Sis 	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
19303dabef0Sis 
19403dabef0Sis /*	E8    E9    EA    EB    EC    ED    EE    EF */
19503dabef0Sis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
19603dabef0Sis 
19703dabef0Sis /*	F0    F1    F2    F3    F4    F5    F6    F7 */
19803dabef0Sis 	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
19903dabef0Sis 
20003dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0
20103dabef0Sis };
20203dabef0Sis 
20303dabef0Sis static const uchar_t valid_max_2nd_byte[0x100] = {
20403dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
20503dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
20603dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
20703dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
20803dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
20903dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
21003dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
21103dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
21203dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
21303dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
21403dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
21503dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
21603dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
21703dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
21803dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
21903dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
22003dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
22103dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
22203dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
22303dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
22403dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
22503dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
22603dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
22703dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0,
22803dabef0Sis 
22903dabef0Sis /*	C0    C1    C2    C3    C4    C5    C6    C7 */
23003dabef0Sis 	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
23103dabef0Sis 
23203dabef0Sis /*	C8    C9    CA    CB    CC    CD    CE    CF */
23303dabef0Sis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
23403dabef0Sis 
23503dabef0Sis /*	D0    D1    D2    D3    D4    D5    D6    D7 */
23603dabef0Sis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
23703dabef0Sis 
23803dabef0Sis /*	D8    D9    DA    DB    DC    DD    DE    DF */
23903dabef0Sis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
24003dabef0Sis 
24103dabef0Sis /*	E0    E1    E2    E3    E4    E5    E6    E7 */
24203dabef0Sis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
24303dabef0Sis 
24403dabef0Sis /*	E8    E9    EA    EB    EC    ED    EE    EF */
24503dabef0Sis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
24603dabef0Sis 
24703dabef0Sis /*	F0    F1    F2    F3    F4    F5    F6    F7 */
24803dabef0Sis 	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
24903dabef0Sis 
25003dabef0Sis 	0,    0,    0,    0,    0,    0,    0,    0
25103dabef0Sis };
25203dabef0Sis 
25303dabef0Sis 
25403dabef0Sis static int
check_endian(int flag,int * in,int * out)25503dabef0Sis check_endian(int flag, int *in, int *out)
25603dabef0Sis {
25703dabef0Sis 	*in = flag & UCONV_IN_ENDIAN_MASKS;
25803dabef0Sis 
25903dabef0Sis 	/* You cannot have both. */
26003dabef0Sis 	if (*in == UCONV_IN_ENDIAN_MASKS)
26103dabef0Sis 		return (EBADF);
26203dabef0Sis 
26303dabef0Sis 	if (*in == 0)
26403dabef0Sis 		*in = UCONV_IN_NAT_ENDIAN;
26503dabef0Sis 
26603dabef0Sis 	*out = flag & UCONV_OUT_ENDIAN_MASKS;
26703dabef0Sis 
26803dabef0Sis 	/* You cannot have both. */
26903dabef0Sis 	if (*out == UCONV_OUT_ENDIAN_MASKS)
27003dabef0Sis 		return (EBADF);
27103dabef0Sis 
27203dabef0Sis 	if (*out == 0)
27303dabef0Sis 		*out = UCONV_OUT_NAT_ENDIAN;
27403dabef0Sis 
27503dabef0Sis 	return (0);
27603dabef0Sis }
27703dabef0Sis 
27803dabef0Sis static boolean_t
check_bom16(const uint16_t * u16s,size_t u16l,int * in)27903dabef0Sis check_bom16(const uint16_t *u16s, size_t u16l, int *in)
28003dabef0Sis {
28103dabef0Sis 	if (u16l > 0) {
28203dabef0Sis 		if (*u16s == UCONV_BOM_NORMAL) {
28303dabef0Sis 			*in = UCONV_IN_NAT_ENDIAN;
28403dabef0Sis 			return (B_TRUE);
28503dabef0Sis 		}
28603dabef0Sis 		if (*u16s == UCONV_BOM_SWAPPED) {
28703dabef0Sis 			*in = UCONV_IN_REV_ENDIAN;
28803dabef0Sis 			return (B_TRUE);
28903dabef0Sis 		}
29003dabef0Sis 	}
29103dabef0Sis 
29203dabef0Sis 	return (B_FALSE);
29303dabef0Sis }
29403dabef0Sis 
29503dabef0Sis static boolean_t
check_bom32(const uint32_t * u32s,size_t u32l,int * in)29603dabef0Sis check_bom32(const uint32_t *u32s, size_t u32l, int *in)
29703dabef0Sis {
29803dabef0Sis 	if (u32l > 0) {
29903dabef0Sis 		if (*u32s == UCONV_BOM_NORMAL) {
30003dabef0Sis 			*in = UCONV_IN_NAT_ENDIAN;
30103dabef0Sis 			return (B_TRUE);
30203dabef0Sis 		}
30303dabef0Sis 		if (*u32s == UCONV_BOM_SWAPPED_32) {
30403dabef0Sis 			*in = UCONV_IN_REV_ENDIAN;
30503dabef0Sis 			return (B_TRUE);
30603dabef0Sis 		}
30703dabef0Sis 	}
30803dabef0Sis 
30903dabef0Sis 	return (B_FALSE);
31003dabef0Sis }
31103dabef0Sis 
31203dabef0Sis int
uconv_u16tou32(const uint16_t * u16s,size_t * utf16len,uint32_t * u32s,size_t * utf32len,int flag)31303dabef0Sis uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
31403dabef0Sis     uint32_t *u32s, size_t *utf32len, int flag)
31503dabef0Sis {
31603dabef0Sis 	int inendian;
31703dabef0Sis 	int outendian;
31803dabef0Sis 	size_t u16l;
31903dabef0Sis 	size_t u32l;
32003dabef0Sis 	uint32_t hi;
32103dabef0Sis 	uint32_t lo;
32203dabef0Sis 	boolean_t do_not_ignore_null;
32303dabef0Sis 
32403dabef0Sis 	/*
32503dabef0Sis 	 * Do preliminary validity checks on parameters and collect info on
32603dabef0Sis 	 * endians.
32703dabef0Sis 	 */
32803dabef0Sis 	if (u16s == NULL || utf16len == NULL)
32903dabef0Sis 		return (EILSEQ);
33003dabef0Sis 
33103dabef0Sis 	if (u32s == NULL || utf32len == NULL)
33203dabef0Sis 		return (E2BIG);
33303dabef0Sis 
33403dabef0Sis 	if (check_endian(flag, &inendian, &outendian) != 0)
33503dabef0Sis 		return (EBADF);
33603dabef0Sis 
33703dabef0Sis 	/*
33803dabef0Sis 	 * Initialize input and output parameter buffer indices and
33903dabef0Sis 	 * temporary variables.
34003dabef0Sis 	 */
34103dabef0Sis 	u16l = u32l = 0;
34203dabef0Sis 	hi = 0;
34303dabef0Sis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
34403dabef0Sis 
34503dabef0Sis 	/*
34603dabef0Sis 	 * Check on the BOM at the beginning of the input buffer if required
34703dabef0Sis 	 * and if there is indeed one, process it.
34803dabef0Sis 	 */
34903dabef0Sis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
35003dabef0Sis 	    check_bom16(u16s, *utf16len, &inendian))
35103dabef0Sis 		u16l++;
35203dabef0Sis 
35303dabef0Sis 	/*
35403dabef0Sis 	 * Reset inendian and outendian so that after this point, those can be
35503dabef0Sis 	 * used as condition values.
35603dabef0Sis 	 */
35703dabef0Sis 	inendian &= UCONV_IN_NAT_ENDIAN;
35803dabef0Sis 	outendian &= UCONV_OUT_NAT_ENDIAN;
35903dabef0Sis 
36003dabef0Sis 	/*
36103dabef0Sis 	 * If there is something in the input buffer and if necessary and
36203dabef0Sis 	 * requested, save the BOM at the output buffer.
36303dabef0Sis 	 */
36403dabef0Sis 	if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
36503dabef0Sis 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
3664703203dSis 		    UCONV_BOM_SWAPPED_32;
36703dabef0Sis 
36803dabef0Sis 	/*
36903dabef0Sis 	 * Do conversion; if encounter a surrogate pair, assemble high and
37003dabef0Sis 	 * low pair values to form a UTF-32 character. If a half of a pair
37103dabef0Sis 	 * exists alone, then, either it is an illegal (EILSEQ) or
37203dabef0Sis 	 * invalid (EINVAL) value.
37303dabef0Sis 	 */
37403dabef0Sis 	for (; u16l < *utf16len; u16l++) {
37503dabef0Sis 		if (u16s[u16l] == 0 && do_not_ignore_null)
37603dabef0Sis 			break;
37703dabef0Sis 
37803dabef0Sis 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
37903dabef0Sis 
38003dabef0Sis 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
38103dabef0Sis 			if (hi)
38203dabef0Sis 				return (EILSEQ);
38303dabef0Sis 			hi = lo;
38403dabef0Sis 			continue;
38503dabef0Sis 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
38603dabef0Sis 			if (! hi)
38703dabef0Sis 				return (EILSEQ);
38803dabef0Sis 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
3894703203dSis 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
3904703203dSis 			    + UCONV_U16_START;
39103dabef0Sis 			hi = 0;
39203dabef0Sis 		} else if (hi) {
39303dabef0Sis 			return (EILSEQ);
39403dabef0Sis 		}
39503dabef0Sis 
39603dabef0Sis 		if (u32l >= *utf32len)
39703dabef0Sis 			return (E2BIG);
39803dabef0Sis 
39903dabef0Sis 		u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
40003dabef0Sis 	}
40103dabef0Sis 
40203dabef0Sis 	/*
40303dabef0Sis 	 * If high half didn't see low half, then, it's most likely the input
40403dabef0Sis 	 * parameter is incomplete.
40503dabef0Sis 	 */
40603dabef0Sis 	if (hi)
40703dabef0Sis 		return (EINVAL);
40803dabef0Sis 
40903dabef0Sis 	/*
41003dabef0Sis 	 * Save the number of consumed and saved characters. They do not
41103dabef0Sis 	 * include terminating NULL character (U+0000) at the end of
41203dabef0Sis 	 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
41303dabef0Sis 	 * the input buffer length is big enough to include the terminating
41403dabef0Sis 	 * NULL character).
41503dabef0Sis 	 */
41603dabef0Sis 	*utf16len = u16l;
41703dabef0Sis 	*utf32len = u32l;
41803dabef0Sis 
41903dabef0Sis 	return (0);
42003dabef0Sis }
42103dabef0Sis 
42203dabef0Sis int
uconv_u16tou8(const uint16_t * u16s,size_t * utf16len,uchar_t * u8s,size_t * utf8len,int flag)42303dabef0Sis uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
42403dabef0Sis     uchar_t *u8s, size_t *utf8len, int flag)
42503dabef0Sis {
42603dabef0Sis 	int inendian;
42703dabef0Sis 	int outendian;
42803dabef0Sis 	size_t u16l;
42903dabef0Sis 	size_t u8l;
43003dabef0Sis 	uint32_t hi;
43103dabef0Sis 	uint32_t lo;
43203dabef0Sis 	boolean_t do_not_ignore_null;
43303dabef0Sis 
43403dabef0Sis 	if (u16s == NULL || utf16len == NULL)
43503dabef0Sis 		return (EILSEQ);
43603dabef0Sis 
43703dabef0Sis 	if (u8s == NULL || utf8len == NULL)
43803dabef0Sis 		return (E2BIG);
43903dabef0Sis 
44003dabef0Sis 	if (check_endian(flag, &inendian, &outendian) != 0)
44103dabef0Sis 		return (EBADF);
44203dabef0Sis 
44303dabef0Sis 	u16l = u8l = 0;
44403dabef0Sis 	hi = 0;
44503dabef0Sis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
44603dabef0Sis 
44703dabef0Sis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
44803dabef0Sis 	    check_bom16(u16s, *utf16len, &inendian))
44903dabef0Sis 		u16l++;
45003dabef0Sis 
45103dabef0Sis 	inendian &= UCONV_IN_NAT_ENDIAN;
45203dabef0Sis 
45303dabef0Sis 	for (; u16l < *utf16len; u16l++) {
45403dabef0Sis 		if (u16s[u16l] == 0 && do_not_ignore_null)
45503dabef0Sis 			break;
45603dabef0Sis 
45703dabef0Sis 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
45803dabef0Sis 
45903dabef0Sis 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
46003dabef0Sis 			if (hi)
46103dabef0Sis 				return (EILSEQ);
46203dabef0Sis 			hi = lo;
46303dabef0Sis 			continue;
46403dabef0Sis 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
46503dabef0Sis 			if (! hi)
46603dabef0Sis 				return (EILSEQ);
46703dabef0Sis 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
4684703203dSis 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
4694703203dSis 			    + UCONV_U16_START;
47003dabef0Sis 			hi = 0;
47103dabef0Sis 		} else if (hi) {
47203dabef0Sis 			return (EILSEQ);
47303dabef0Sis 		}
47403dabef0Sis 
47503dabef0Sis 		/*
47603dabef0Sis 		 * Now we convert a UTF-32 character into a UTF-8 character.
47703dabef0Sis 		 * Unicode coding space is between U+0000 and U+10FFFF;
47803dabef0Sis 		 * anything bigger is an illegal character.
47903dabef0Sis 		 */
48003dabef0Sis 		if (lo <= UCONV_U8_ONE_BYTE) {
48103dabef0Sis 			if (u8l >= *utf8len)
48203dabef0Sis 				return (E2BIG);
48303dabef0Sis 			u8s[u8l++] = (uchar_t)lo;
48403dabef0Sis 		} else if (lo <= UCONV_U8_TWO_BYTES) {
48503dabef0Sis 			if ((u8l + 1) >= *utf8len)
48603dabef0Sis 				return (E2BIG);
48703dabef0Sis 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
48803dabef0Sis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
48903dabef0Sis 		} else if (lo <= UCONV_U8_THREE_BYTES) {
49003dabef0Sis 			if ((u8l + 2) >= *utf8len)
49103dabef0Sis 				return (E2BIG);
49203dabef0Sis 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
49303dabef0Sis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
49403dabef0Sis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
49503dabef0Sis 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
49603dabef0Sis 			if ((u8l + 3) >= *utf8len)
49703dabef0Sis 				return (E2BIG);
49803dabef0Sis 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
49903dabef0Sis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
50003dabef0Sis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
50103dabef0Sis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
50203dabef0Sis 		} else {
50303dabef0Sis 			return (EILSEQ);
50403dabef0Sis 		}
50503dabef0Sis 	}
50603dabef0Sis 
50703dabef0Sis 	if (hi)
50803dabef0Sis 		return (EINVAL);
50903dabef0Sis 
51003dabef0Sis 	*utf16len = u16l;
51103dabef0Sis 	*utf8len = u8l;
51203dabef0Sis 
51303dabef0Sis 	return (0);
51403dabef0Sis }
51503dabef0Sis 
51603dabef0Sis int
uconv_u32tou16(const uint32_t * u32s,size_t * utf32len,uint16_t * u16s,size_t * utf16len,int flag)51703dabef0Sis uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
51803dabef0Sis     uint16_t *u16s, size_t *utf16len, int flag)
51903dabef0Sis {
52003dabef0Sis 	int inendian;
52103dabef0Sis 	int outendian;
52203dabef0Sis 	size_t u16l;
52303dabef0Sis 	size_t u32l;
52403dabef0Sis 	uint32_t hi;
52503dabef0Sis 	uint32_t lo;
52603dabef0Sis 	boolean_t do_not_ignore_null;
52703dabef0Sis 
52803dabef0Sis 	if (u32s == NULL || utf32len == NULL)
52903dabef0Sis 		return (EILSEQ);
53003dabef0Sis 
53103dabef0Sis 	if (u16s == NULL || utf16len == NULL)
53203dabef0Sis 		return (E2BIG);
53303dabef0Sis 
53403dabef0Sis 	if (check_endian(flag, &inendian, &outendian) != 0)
53503dabef0Sis 		return (EBADF);
53603dabef0Sis 
53703dabef0Sis 	u16l = u32l = 0;
53803dabef0Sis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
53903dabef0Sis 
54003dabef0Sis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
54103dabef0Sis 	    check_bom32(u32s, *utf32len, &inendian))
54203dabef0Sis 		u32l++;
54303dabef0Sis 
54403dabef0Sis 	inendian &= UCONV_IN_NAT_ENDIAN;
54503dabef0Sis 	outendian &= UCONV_OUT_NAT_ENDIAN;
54603dabef0Sis 
54703dabef0Sis 	if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
54803dabef0Sis 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
5494703203dSis 		    UCONV_BOM_SWAPPED;
55003dabef0Sis 
55103dabef0Sis 	for (; u32l < *utf32len; u32l++) {
55203dabef0Sis 		if (u32s[u32l] == 0 && do_not_ignore_null)
55303dabef0Sis 			break;
55403dabef0Sis 
55503dabef0Sis 		hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
55603dabef0Sis 
55703dabef0Sis 		/*
55803dabef0Sis 		 * Anything bigger than the Unicode coding space, i.e.,
55903dabef0Sis 		 * Unicode scalar value bigger than U+10FFFF, is an illegal
56003dabef0Sis 		 * character.
56103dabef0Sis 		 */
56203dabef0Sis 		if (hi > UCONV_UNICODE_MAX)
56303dabef0Sis 			return (EILSEQ);
56403dabef0Sis 
56503dabef0Sis 		/*
56603dabef0Sis 		 * Anything bigger than U+FFFF must be converted into
56703dabef0Sis 		 * a surrogate pair in UTF-16.
56803dabef0Sis 		 */
56903dabef0Sis 		if (hi >= UCONV_U16_START) {
57003dabef0Sis 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
5714703203dSis 			    UCONV_U16_LO_MIN;
57203dabef0Sis 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
5734703203dSis 			    UCONV_U16_HI_MIN;
57403dabef0Sis 
57503dabef0Sis 			if ((u16l + 1) >= *utf16len)
57603dabef0Sis 				return (E2BIG);
57703dabef0Sis 
57803dabef0Sis 			if (outendian) {
57903dabef0Sis 				u16s[u16l++] = (uint16_t)hi;
58003dabef0Sis 				u16s[u16l++] = (uint16_t)lo;
58103dabef0Sis 			} else {
58203dabef0Sis 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
58303dabef0Sis 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
58403dabef0Sis 			}
58503dabef0Sis 		} else {
58603dabef0Sis 			if (u16l >= *utf16len)
58703dabef0Sis 				return (E2BIG);
58803dabef0Sis 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
5894703203dSis 			    BSWAP_16(((uint16_t)hi));
59003dabef0Sis 		}
59103dabef0Sis 	}
59203dabef0Sis 
59303dabef0Sis 	*utf16len = u16l;
59403dabef0Sis 	*utf32len = u32l;
59503dabef0Sis 
59603dabef0Sis 	return (0);
59703dabef0Sis }
59803dabef0Sis 
59903dabef0Sis int
uconv_u32tou8(const uint32_t * u32s,size_t * utf32len,uchar_t * u8s,size_t * utf8len,int flag)60003dabef0Sis uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
60103dabef0Sis     uchar_t *u8s, size_t *utf8len, int flag)
60203dabef0Sis {
60303dabef0Sis 	int inendian;
60403dabef0Sis 	int outendian;
60503dabef0Sis 	size_t u32l;
60603dabef0Sis 	size_t u8l;
60703dabef0Sis 	uint32_t lo;
60803dabef0Sis 	boolean_t do_not_ignore_null;
60903dabef0Sis 
61003dabef0Sis 	if (u32s == NULL || utf32len == NULL)
61103dabef0Sis 		return (EILSEQ);
61203dabef0Sis 
61303dabef0Sis 	if (u8s == NULL || utf8len == NULL)
61403dabef0Sis 		return (E2BIG);
61503dabef0Sis 
61603dabef0Sis 	if (check_endian(flag, &inendian, &outendian) != 0)
61703dabef0Sis 		return (EBADF);
61803dabef0Sis 
61903dabef0Sis 	u32l = u8l = 0;
62003dabef0Sis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
62103dabef0Sis 
62203dabef0Sis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
62303dabef0Sis 	    check_bom32(u32s, *utf32len, &inendian))
62403dabef0Sis 		u32l++;
62503dabef0Sis 
62603dabef0Sis 	inendian &= UCONV_IN_NAT_ENDIAN;
62703dabef0Sis 
62803dabef0Sis 	for (; u32l < *utf32len; u32l++) {
62903dabef0Sis 		if (u32s[u32l] == 0 && do_not_ignore_null)
63003dabef0Sis 			break;
63103dabef0Sis 
63203dabef0Sis 		lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
63303dabef0Sis 
63403dabef0Sis 		if (lo <= UCONV_U8_ONE_BYTE) {
63503dabef0Sis 			if (u8l >= *utf8len)
63603dabef0Sis 				return (E2BIG);
63703dabef0Sis 			u8s[u8l++] = (uchar_t)lo;
63803dabef0Sis 		} else if (lo <= UCONV_U8_TWO_BYTES) {
63903dabef0Sis 			if ((u8l + 1) >= *utf8len)
64003dabef0Sis 				return (E2BIG);
64103dabef0Sis 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
64203dabef0Sis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
64303dabef0Sis 		} else if (lo <= UCONV_U8_THREE_BYTES) {
64403dabef0Sis 			if ((u8l + 2) >= *utf8len)
64503dabef0Sis 				return (E2BIG);
64603dabef0Sis 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
64703dabef0Sis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
64803dabef0Sis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
64903dabef0Sis 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
65003dabef0Sis 			if ((u8l + 3) >= *utf8len)
65103dabef0Sis 				return (E2BIG);
65203dabef0Sis 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
65303dabef0Sis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
65403dabef0Sis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
65503dabef0Sis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
65603dabef0Sis 		} else {
65703dabef0Sis 			return (EILSEQ);
65803dabef0Sis 		}
65903dabef0Sis 	}
66003dabef0Sis 
66103dabef0Sis 	*utf32len = u32l;
66203dabef0Sis 	*utf8len = u8l;
66303dabef0Sis 
66403dabef0Sis 	return (0);
66503dabef0Sis }
66603dabef0Sis 
66703dabef0Sis int
uconv_u8tou16(const uchar_t * u8s,size_t * utf8len,uint16_t * u16s,size_t * utf16len,int flag)66803dabef0Sis uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
66903dabef0Sis     uint16_t *u16s, size_t *utf16len, int flag)
67003dabef0Sis {
67103dabef0Sis 	int inendian;
67203dabef0Sis 	int outendian;
67303dabef0Sis 	size_t u16l;
67403dabef0Sis 	size_t u8l;
67503dabef0Sis 	uint32_t hi;
67603dabef0Sis 	uint32_t lo;
67703dabef0Sis 	int remaining_bytes;
67803dabef0Sis 	int first_b;
67903dabef0Sis 	boolean_t do_not_ignore_null;
68003dabef0Sis 
68103dabef0Sis 	if (u8s == NULL || utf8len == NULL)
68203dabef0Sis 		return (EILSEQ);
68303dabef0Sis 
68403dabef0Sis 	if (u16s == NULL || utf16len == NULL)
68503dabef0Sis 		return (E2BIG);
68603dabef0Sis 
68703dabef0Sis 	if (check_endian(flag, &inendian, &outendian) != 0)
68803dabef0Sis 		return (EBADF);
68903dabef0Sis 
69003dabef0Sis 	u16l = u8l = 0;
69103dabef0Sis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
69203dabef0Sis 
69303dabef0Sis 	outendian &= UCONV_OUT_NAT_ENDIAN;
69403dabef0Sis 
69503dabef0Sis 	if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
69603dabef0Sis 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
6974703203dSis 		    UCONV_BOM_SWAPPED;
69803dabef0Sis 
69903dabef0Sis 	for (; u8l < *utf8len; ) {
70003dabef0Sis 		if (u8s[u8l] == 0 && do_not_ignore_null)
70103dabef0Sis 			break;
70203dabef0Sis 
70303dabef0Sis 		/*
70403dabef0Sis 		 * Collect a UTF-8 character and convert it to a UTF-32
70503dabef0Sis 		 * character. In doing so, we screen out illegally formed
70603dabef0Sis 		 * UTF-8 characters and treat such as illegal characters.
70703dabef0Sis 		 * The algorithm at below also screens out anything bigger
70803dabef0Sis 		 * than the U+10FFFF.
70903dabef0Sis 		 *
71003dabef0Sis 		 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
71103dabef0Sis 		 * more details on the illegal values of UTF-8 character
71203dabef0Sis 		 * bytes.
71303dabef0Sis 		 */
71403dabef0Sis 		hi = (uint32_t)u8s[u8l++];
71503dabef0Sis 
71603dabef0Sis 		if (hi > UCONV_ASCII_MAX) {
71703dabef0Sis 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
71803dabef0Sis 				return (EILSEQ);
71903dabef0Sis 
72003dabef0Sis 			first_b = hi;
721*15d9d0b5Syy 			hi = hi & u8_masks_tbl[remaining_bytes];
72203dabef0Sis 
72303dabef0Sis 			for (; remaining_bytes > 0; remaining_bytes--) {
72403dabef0Sis 				/*
72503dabef0Sis 				 * If we have no more bytes, the current
72603dabef0Sis 				 * UTF-8 character is incomplete.
72703dabef0Sis 				 */
72803dabef0Sis 				if (u8l >= *utf8len)
72903dabef0Sis 					return (EINVAL);
73003dabef0Sis 
73103dabef0Sis 				lo = (uint32_t)u8s[u8l++];
73203dabef0Sis 
73303dabef0Sis 				if (first_b) {
73403dabef0Sis 					if (lo < valid_min_2nd_byte[first_b] ||
73503dabef0Sis 					    lo > valid_max_2nd_byte[first_b])
73603dabef0Sis 						return (EILSEQ);
73703dabef0Sis 					first_b = 0;
73803dabef0Sis 				} else if (lo < UCONV_U8_BYTE_MIN ||
7394703203dSis 				    lo > UCONV_U8_BYTE_MAX) {
74003dabef0Sis 					return (EILSEQ);
74103dabef0Sis 				}
74203dabef0Sis 				hi = (hi << UCONV_U8_BIT_SHIFT) |
7434703203dSis 				    (lo & UCONV_U8_BIT_MASK);
74403dabef0Sis 			}
74503dabef0Sis 		}
74603dabef0Sis 
74703dabef0Sis 		if (hi >= UCONV_U16_START) {
74803dabef0Sis 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
7494703203dSis 			    UCONV_U16_LO_MIN;
75003dabef0Sis 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
7514703203dSis 			    UCONV_U16_HI_MIN;
75203dabef0Sis 
75303dabef0Sis 			if ((u16l + 1) >= *utf16len)
75403dabef0Sis 				return (E2BIG);
75503dabef0Sis 
75603dabef0Sis 			if (outendian) {
75703dabef0Sis 				u16s[u16l++] = (uint16_t)hi;
75803dabef0Sis 				u16s[u16l++] = (uint16_t)lo;
75903dabef0Sis 			} else {
76003dabef0Sis 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
76103dabef0Sis 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
76203dabef0Sis 			}
76303dabef0Sis 		} else {
76403dabef0Sis 			if (u16l >= *utf16len)
76503dabef0Sis 				return (E2BIG);
76603dabef0Sis 
76703dabef0Sis 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
7684703203dSis 			    BSWAP_16(((uint16_t)hi));
76903dabef0Sis 		}
77003dabef0Sis 	}
77103dabef0Sis 
77203dabef0Sis 	*utf16len = u16l;
77303dabef0Sis 	*utf8len = u8l;
77403dabef0Sis 
77503dabef0Sis 	return (0);
77603dabef0Sis }
77703dabef0Sis 
77803dabef0Sis int
uconv_u8tou32(const uchar_t * u8s,size_t * utf8len,uint32_t * u32s,size_t * utf32len,int flag)77903dabef0Sis uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
78003dabef0Sis     uint32_t *u32s, size_t *utf32len, int flag)
78103dabef0Sis {
78203dabef0Sis 	int inendian;
78303dabef0Sis 	int outendian;
78403dabef0Sis 	size_t u32l;
78503dabef0Sis 	size_t u8l;
78603dabef0Sis 	uint32_t hi;
78703dabef0Sis 	uint32_t c;
78803dabef0Sis 	int remaining_bytes;
78903dabef0Sis 	int first_b;
79003dabef0Sis 	boolean_t do_not_ignore_null;
79103dabef0Sis 
79203dabef0Sis 	if (u8s == NULL || utf8len == NULL)
79303dabef0Sis 		return (EILSEQ);
79403dabef0Sis 
79503dabef0Sis 	if (u32s == NULL || utf32len == NULL)
79603dabef0Sis 		return (E2BIG);
79703dabef0Sis 
79803dabef0Sis 	if (check_endian(flag, &inendian, &outendian) != 0)
79903dabef0Sis 		return (EBADF);
80003dabef0Sis 
80103dabef0Sis 	u32l = u8l = 0;
80203dabef0Sis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
80303dabef0Sis 
80403dabef0Sis 	outendian &= UCONV_OUT_NAT_ENDIAN;
80503dabef0Sis 
80603dabef0Sis 	if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
80703dabef0Sis 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
8084703203dSis 		    UCONV_BOM_SWAPPED_32;
80903dabef0Sis 
81003dabef0Sis 	for (; u8l < *utf8len; ) {
81103dabef0Sis 		if (u8s[u8l] == 0 && do_not_ignore_null)
81203dabef0Sis 			break;
81303dabef0Sis 
81403dabef0Sis 		hi = (uint32_t)u8s[u8l++];
81503dabef0Sis 
81603dabef0Sis 		if (hi > UCONV_ASCII_MAX) {
81703dabef0Sis 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
81803dabef0Sis 				return (EILSEQ);
81903dabef0Sis 
82003dabef0Sis 			first_b = hi;
821*15d9d0b5Syy 			hi = hi & u8_masks_tbl[remaining_bytes];
82203dabef0Sis 
82303dabef0Sis 			for (; remaining_bytes > 0; remaining_bytes--) {
82403dabef0Sis 				if (u8l >= *utf8len)
82503dabef0Sis 					return (EINVAL);
82603dabef0Sis 
82703dabef0Sis 				c = (uint32_t)u8s[u8l++];
82803dabef0Sis 
82903dabef0Sis 				if (first_b) {
83003dabef0Sis 					if (c < valid_min_2nd_byte[first_b] ||
83103dabef0Sis 					    c > valid_max_2nd_byte[first_b])
83203dabef0Sis 						return (EILSEQ);
83303dabef0Sis 					first_b = 0;
83403dabef0Sis 				} else if (c < UCONV_U8_BYTE_MIN ||
8354703203dSis 				    c > UCONV_U8_BYTE_MAX) {
83603dabef0Sis 					return (EILSEQ);
83703dabef0Sis 				}
83803dabef0Sis 				hi = (hi << UCONV_U8_BIT_SHIFT) |
8394703203dSis 				    (c & UCONV_U8_BIT_MASK);
84003dabef0Sis 			}
84103dabef0Sis 		}
84203dabef0Sis 
84303dabef0Sis 		if (u32l >= *utf32len)
84403dabef0Sis 			return (E2BIG);
84503dabef0Sis 
84603dabef0Sis 		u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
84703dabef0Sis 	}
84803dabef0Sis 
84903dabef0Sis 	*utf32len = u32l;
85003dabef0Sis 	*utf8len = u8l;
85103dabef0Sis 
85203dabef0Sis 	return (0);
85303dabef0Sis }
854