12765a47cSis /*
22765a47cSis  * CDDL HEADER START
32765a47cSis  *
42765a47cSis  * The contents of this file are subject to the terms of the
52765a47cSis  * Common Development and Distribution License (the "License").
62765a47cSis  * You may not use this file except in compliance with the License.
72765a47cSis  *
82765a47cSis  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
92765a47cSis  * or http://www.opensolaris.org/os/licensing.
102765a47cSis  * See the License for the specific language governing permissions
112765a47cSis  * and limitations under the License.
122765a47cSis  *
132765a47cSis  * When distributing Covered Code, include this CDDL HEADER in each
142765a47cSis  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
152765a47cSis  * If applicable, add the following below this CDDL HEADER, with the
162765a47cSis  * fields enclosed by brackets "[]" replaced with your own identifying
172765a47cSis  * information: Portions Copyright [yyyy] [name of copyright owner]
182765a47cSis  *
192765a47cSis  * CDDL HEADER END
202765a47cSis  */
212765a47cSis /*
2285bb5f1dSis  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
232765a47cSis  * Use is subject to license terms.
242765a47cSis  */
252765a47cSis 
26*f137b22eSDan McDonald /*
27*f137b22eSDan McDonald  * Copyright 2022 MNX Cloud, Inc.
28*f137b22eSDan McDonald  */
292765a47cSis 
302765a47cSis 
312765a47cSis /*
324703203dSis  * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
332765a47cSis  *
342765a47cSis  * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
354703203dSis  * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
364703203dSis  * the section 3C man pages.
372765a47cSis  * Interface stability: Committed.
382765a47cSis  */
392765a47cSis 
402765a47cSis #include <sys/types.h>
414703203dSis #ifdef	_KERNEL
422765a47cSis #include <sys/param.h>
432765a47cSis #include <sys/sysmacros.h>
442765a47cSis #include <sys/systm.h>
452765a47cSis #include <sys/debug.h>
462765a47cSis #include <sys/kmem.h>
472765a47cSis #include <sys/ddi.h>
482765a47cSis #include <sys/sunddi.h>
494703203dSis #else
504703203dSis #include <sys/u8_textprep.h>
514703203dSis #include <strings.h>
524703203dSis #endif	/* _KERNEL */
532765a47cSis #include <sys/byteorder.h>
542765a47cSis #include <sys/errno.h>
552765a47cSis #include <sys/u8_textprep_data.h>
562765a47cSis 
572765a47cSis 
/* Upper bound on the byte length of one UTF-8 encoded character. */
#define	U8_MB_CUR_MAX			(4)

/*
 * Upper bound on the byte length of a UTF-8 character in the range
 * U+0000 - U+FFFF, i.e., the coding space of the now deprecated UCS-2.
 */
#define	U8_MAX_BYTES_UCS2		(3)

/* Upper bound on the byte length of a Stream-Safe Text. */
#define	U8_STREAM_SAFE_TEXT_MAX		(128)

/*
 * The maximum number of characters in a combining/conjoining sequence,
 * and the actual upper bound limit of such a sequence, which is one less
 * than the maximum.
 */
#define	U8_MAX_CHARS_A_SEQ		(32)
#define	U8_UPPER_LIMIT_IN_A_SEQ		(U8_MAX_CHARS_A_SEQ - 1)

/* The combining class value of a Starter. */
#define	U8_COMBINING_CLASS_STARTER	(0)
792765a47cSis 
/*
 * Hangul related constants and macros.
 *
 * The first and the last of the Hangul syllables, the Hangul Jamo leading
 * consonants (L), the vowels (V), and the optional trailing consonants (T),
 * all expressed as Unicode scalar values.
 *
 * Note that U8_HANGUL_JAMO_T_FIRST is 0x11A7, not the actual first trailing
 * consonant U+11A8. The trailing consonant is optional, so the value is
 * pre-decremented by one; for the same reason U8_HANGUL_JAMO_T() tests with
 * a strict '>' against it.
 *
 * Each of the 19 modern leading consonants has 588 possible syllables, since
 * Hangul has 21 modern vowels and 27 modern trailing consonants plus one for
 * the "no trailing consonant" case, i.e., 21 x 28 = 588.
 *
 * U8_HANGUL_JAMO_1ST_BYTE can be used as a quick check on the first byte of
 * a character: a match does not guarantee that the character is a Hangul
 * Jamo, it only means that it is likely to be one.
 */
#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
#define	U8_HANGUL_SYL_LAST		(0xD7A3U)

#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)

#define	U8_HANGUL_V_COUNT		(21)
#define	U8_HANGUL_VT_COUNT		(588)
#define	U8_HANGUL_T_COUNT		(28)

#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)

/*
 * Store the scalar value 'b' as a three-byte UTF-8 sequence at s[i], s[j],
 * and s[k]. Only the low 16 bits of 'b' are used.
 *
 * The three stores form a single comma expression so that the macro stays
 * one statement when used as the body of an unbraced if/else or loop. The
 * trailing semicolon is retained on purpose so that existing call sites
 * keep compiling whether or not they supply their own semicolon.
 *
 * The arguments 's' and 'b' are evaluated more than once; do not pass
 * expressions with side effects.
 */
#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
	((s)[(i)] = (uchar_t)(0xE0U | (((uint32_t)(b) & 0xF000U) >> 12)), \
	(s)[(j)] = (uchar_t)(0x80U | (((uint32_t)(b) & 0x0FC0U) >> 6)), \
	(s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU)));

/* Range checks on a Unicode scalar value 'u'. */
#define	U8_HANGUL_JAMO_L(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)

#define	U8_HANGUL_JAMO_V(u) \
	((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)

/* Strict '>' because U8_HANGUL_JAMO_T_FIRST is one below U+11A8. */
#define	U8_HANGUL_JAMO_T(u) \
	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

#define	U8_HANGUL_JAMO(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

#define	U8_HANGUL_SYLLABLE(u) \
	((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)

/* True when the state is "L" and 'u' is a vowel. */
#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))

/* True when the state is "LV" and 'u' is a trailing consonant. */
#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
1402765a47cSis 
/* The types of decomposition mappings. */
#define	U8_DECOMP_BOTH			(0xF5U)
#define	U8_DECOMP_CANONICAL		(0xF6U)

/* The indicator for 16-bit table. */
#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)

/*
 * Convenience macros.
 *
 * The statement-like macros below keep their trailing semicolon so that
 * existing call sites compile whether or not they add their own. Each of
 * them expands to exactly one statement, so it is safe as the body of an
 * unbraced if or loop.
 */

/*
 * Assemble a scalar value into 'u' from the payload bits of the three bytes
 * of a 3-byte UTF-8 character: 4 bits from b1, 6 bits each from b2 and b3.
 */
#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
	((u) = (((uint32_t)(b1) & 0x0F) << 12) | \
	    (((uint32_t)(b2) & 0x3F) << 6) | \
	    ((uint32_t)(b3) & 0x3F));

/* Exchange 'a' and 'b' using 't' as the scratch variable. */
#define	U8_SIMPLE_SWAP(a, b, t) \
	((t) = (a), (a) = (b), (b) = (t));

/* ASCII-only case mapping; any other value is returned unchanged. */
#define	U8_ASCII_TOUPPER(c) \
	(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))

#define	U8_ASCII_TOLOWER(c) \
	(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))

#define	U8_ISASCII(c)			(((uchar_t)(c)) < 0x80U)

/*
 * Swap the bytes and the bookkeeping of two combining marks at indices
 * 'a' and 'b'.
 *
 * The macro assumes that the two characters are adjacent to each other and
 * that 'a' comes before 'b'. If the assumptions are not met, the macro
 * will fail.
 *
 * It operates on the following names taken from the scope of the caller:
 * u8s (the byte buffer), u8t (scratch buffer), start[] and disp[] (the
 * start offset and the byte length of each character), comb_class[],
 * the loop index k, and the scratch variable tc.
 */
#define	U8_SWAP_COMB_MARKS(a, b) \
	do { \
		for (k = 0; k < disp[(a)]; k++) \
			u8t[k] = u8s[start[(a)] + k]; \
		for (k = 0; k < disp[(b)]; k++) \
			u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
		start[(b)] = start[(a)] + disp[(b)]; \
		for (k = 0; k < disp[(a)]; k++) \
			u8s[start[(b)] + k] = u8t[k]; \
		U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
		U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); \
	} while (0);
1812765a47cSis 
/*
 * The states the normalization code can be in. The enumerators are
 * numbered consecutively starting from zero.
 */
typedef enum {
	U8_STATE_START = 0,
	U8_STATE_HANGUL_L,		/* 1 */
	U8_STATE_HANGUL_LV,		/* 2 */
	U8_STATE_HANGUL_LVT,		/* 3 */
	U8_STATE_HANGUL_V,		/* 4 */
	U8_STATE_HANGUL_T,		/* 5 */
	U8_STATE_COMBINING_MARK		/* 6 */
} u8_normalization_states_t;
1922765a47cSis 
1932765a47cSis /*
1942765a47cSis  * The three vectors at below are used to check bytes of a given UTF-8
1952765a47cSis  * character are valid and not containing any malformed byte values.
1962765a47cSis  *
1972765a47cSis  * We used to have a quite relaxed UTF-8 binary representation but then there
1982765a47cSis  * was some security related issues and so the Unicode Consortium defined
1992765a47cSis  * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
2002765a47cSis  * one more time at the Unicode 3.2. The following three tables are based on
2012765a47cSis  * that.
2022765a47cSis  */
2032765a47cSis 
2042765a47cSis #define	U8_ILLEGAL_NEXT_BYTE_COMMON(c)	((c) < 0x80 || (c) > 0xBF)
2052765a47cSis 
2062765a47cSis #define	I_				U8_ILLEGAL_CHAR
2072765a47cSis #define	O_				U8_OUT_OF_RANGE_CHAR
2082765a47cSis 
2092765a47cSis const int8_t u8_number_of_bytes[0x100] = {
2102765a47cSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2112765a47cSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2122765a47cSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2132765a47cSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2142765a47cSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2152765a47cSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2162765a47cSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2172765a47cSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2182765a47cSis 
2192765a47cSis /*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
2202765a47cSis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2212765a47cSis 
222*f137b22eSDan McDonald /*	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
2232765a47cSis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2242765a47cSis 
225*f137b22eSDan McDonald /*	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
2262765a47cSis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2272765a47cSis 
2282765a47cSis /*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
2292765a47cSis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2302765a47cSis 
2312765a47cSis /*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
2322765a47cSis 	I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
2332765a47cSis 
2342765a47cSis /*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
2352765a47cSis 	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
2362765a47cSis 
2372765a47cSis /*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
2382765a47cSis 	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
2392765a47cSis 
2402765a47cSis /*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
2412765a47cSis 	4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
2422765a47cSis };
2432765a47cSis 
2442765a47cSis #undef	I_
2452765a47cSis #undef	O_
2462765a47cSis 
/*
 * Indexed by the first byte of a multi-byte character: the smallest value
 * that is accepted as the second byte. Entries for bytes that do not start
 * a multi-byte character are zero.
 */
const uint8_t u8_valid_min_2nd_byte[0x100] = {
	0,    0,    0,    0,    0,    0,    0,    0,	/* 00 - 07 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 08 - 0F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 10 - 17 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 18 - 1F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 20 - 27 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 28 - 2F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 30 - 37 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 38 - 3F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 40 - 47 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 48 - 4F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 50 - 57 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 58 - 5F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 60 - 67 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 68 - 6F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 70 - 77 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 78 - 7F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 80 - 87 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 88 - 8F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 90 - 97 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 98 - 9F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* A0 - A7 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* A8 - AF */
	0,    0,    0,    0,    0,    0,    0,    0,	/* B0 - B7 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* B8 - BF */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* C0 - C7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* C8 - CF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* D0 - D7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* D8 - DF */
	0xA0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* E0 - E7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* E8 - EF */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,	/* F0 - F7 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* F8 - FF */
};
2882765a47cSis 
/*
 * Indexed by the first byte of a multi-byte character: the largest value
 * that is accepted as the second byte. Entries for bytes that do not start
 * a multi-byte character are zero.
 */
const uint8_t u8_valid_max_2nd_byte[0x100] = {
	0,    0,    0,    0,    0,    0,    0,    0,	/* 00 - 07 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 08 - 0F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 10 - 17 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 18 - 1F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 20 - 27 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 28 - 2F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 30 - 37 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 38 - 3F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 40 - 47 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 48 - 4F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 50 - 57 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 58 - 5F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 60 - 67 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 68 - 6F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 70 - 77 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 78 - 7F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 80 - 87 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 88 - 8F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 90 - 97 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 98 - 9F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* A0 - A7 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* A8 - AF */
	0,    0,    0,    0,    0,    0,    0,    0,	/* B0 - B7 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* B8 - BF */
	0,    0,    0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* C0 - C7 */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* C8 - CF */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* D0 - D7 */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* D8 - DF */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* E0 - E7 */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0xBF,	/* E8 - EF */
	0xBF, 0xBF, 0xBF, 0xBF, 0x8F, 0,    0,    0,	/* F0 - F7 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* F8 - FF */
};
3302765a47cSis 
3312765a47cSis 
3322765a47cSis /*
3332765a47cSis  * The u8_validate() validates on the given UTF-8 character string and
3342765a47cSis  * calculate the byte length. It is quite similar to mblen(3C) except that
3352765a47cSis  * this will validate against the list of characters if required and
3362765a47cSis  * specific to UTF-8 and Unicode.
3372765a47cSis  */
3382765a47cSis int
u8_validate(char * u8str,size_t n,char ** list,int flag,int * errnum)33985bb5f1dSis u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum)
3402765a47cSis {
3412765a47cSis 	uchar_t *ib;
3422765a47cSis 	uchar_t *ibtail;
3432765a47cSis 	uchar_t **p;
3442765a47cSis 	uchar_t *s1;
3452765a47cSis 	uchar_t *s2;
3462765a47cSis 	uchar_t f;
3472765a47cSis 	int sz;
3482765a47cSis 	size_t i;
3492765a47cSis 	int ret_val;
3502765a47cSis 	boolean_t second;
3512765a47cSis 	boolean_t no_need_to_validate_entire;
3522765a47cSis 	boolean_t check_additional;
3532765a47cSis 	boolean_t validate_ucs2_range_only;
3542765a47cSis 
3552765a47cSis 	if (! u8str)
3562765a47cSis 		return (0);
3572765a47cSis 
3582765a47cSis 	ib = (uchar_t *)u8str;
3592765a47cSis 	ibtail = ib + n;
3602765a47cSis 
3612765a47cSis 	ret_val = 0;
3622765a47cSis 
3632765a47cSis 	no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
3642765a47cSis 	check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
3652765a47cSis 	validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
3662765a47cSis 
3672765a47cSis 	while (ib < ibtail) {
3682765a47cSis 		/*
3692765a47cSis 		 * The first byte of a UTF-8 character tells how many
3702765a47cSis 		 * bytes will follow for the character. If the first byte
3712765a47cSis 		 * is an illegal byte value or out of range value, we just
3722765a47cSis 		 * return -1 with an appropriate error number.
3732765a47cSis 		 */
3742765a47cSis 		sz = u8_number_of_bytes[*ib];
3752765a47cSis 		if (sz == U8_ILLEGAL_CHAR) {
37685bb5f1dSis 			*errnum = EILSEQ;
3772765a47cSis 			return (-1);
3782765a47cSis 		}
3792765a47cSis 
3802765a47cSis 		if (sz == U8_OUT_OF_RANGE_CHAR ||
3812765a47cSis 		    (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
38285bb5f1dSis 			*errnum = ERANGE;
3832765a47cSis 			return (-1);
3842765a47cSis 		}
3852765a47cSis 
3862765a47cSis 		/*
3872765a47cSis 		 * If we don't have enough bytes to check on, that's also
3882765a47cSis 		 * an error. As you can see, we give illegal byte sequence
3892765a47cSis 		 * checking higher priority then EINVAL cases.
3902765a47cSis 		 */
3912765a47cSis 		if ((ibtail - ib) < sz) {
39285bb5f1dSis 			*errnum = EINVAL;
3932765a47cSis 			return (-1);
3942765a47cSis 		}
3952765a47cSis 
3962765a47cSis 		if (sz == 1) {
3972765a47cSis 			ib++;
3982765a47cSis 			ret_val++;
3992765a47cSis 		} else {
4002765a47cSis 			/*
4012765a47cSis 			 * Check on the multi-byte UTF-8 character. For more
4022765a47cSis 			 * details on this, see comment added for the used
4032765a47cSis 			 * data structures at the beginning of the file.
4042765a47cSis 			 */
4052765a47cSis 			f = *ib++;
4062765a47cSis 			ret_val++;
4072765a47cSis 			second = B_TRUE;
4082765a47cSis 			for (i = 1; i < sz; i++) {
4092765a47cSis 				if (second) {
4102765a47cSis 					if (*ib < u8_valid_min_2nd_byte[f] ||
4112765a47cSis 					    *ib > u8_valid_max_2nd_byte[f]) {
41285bb5f1dSis 						*errnum = EILSEQ;
4132765a47cSis 						return (-1);
4142765a47cSis 					}
4152765a47cSis 					second = B_FALSE;
4162765a47cSis 				} else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
41785bb5f1dSis 					*errnum = EILSEQ;
4182765a47cSis 					return (-1);
4192765a47cSis 				}
4202765a47cSis 				ib++;
4212765a47cSis 				ret_val++;
4222765a47cSis 			}
4232765a47cSis 		}
4242765a47cSis 
4252765a47cSis 		if (check_additional) {
4262765a47cSis 			for (p = (uchar_t **)list, i = 0; p[i]; i++) {
4272765a47cSis 				s1 = ib - sz;
4282765a47cSis 				s2 = p[i];
4292765a47cSis 				while (s1 < ib) {
4302765a47cSis 					if (*s1 != *s2 || *s2 == '\0')
4312765a47cSis 						break;
4322765a47cSis 					s1++;
4332765a47cSis 					s2++;
4342765a47cSis 				}
4352765a47cSis 
4362765a47cSis 				if (s1 >= ib && *s2 == '\0') {
43785bb5f1dSis 					*errnum = EBADF;
4382765a47cSis 					return (-1);
4392765a47cSis 				}
4402765a47cSis 			}
4412765a47cSis 		}
4422765a47cSis 
4432765a47cSis 		if (no_need_to_validate_entire)
4442765a47cSis 			break;
4452765a47cSis 	}
4462765a47cSis 
4472765a47cSis 	return (ret_val);
4482765a47cSis }
4492765a47cSis 
4502765a47cSis /*
4512765a47cSis  * The do_case_conv() looks at the mapping tables and returns found
4522765a47cSis  * bytes if any. If not found, the input bytes are returned. The function
4532765a47cSis  * always terminate the return bytes with a null character assuming that
4542765a47cSis  * there are plenty of room to do so.
4552765a47cSis  *
4562765a47cSis  * The case conversions are simple case conversions mapping a character to
4572765a47cSis  * another character as specified in the Unicode data. The byte size of
4582765a47cSis  * the mapped character could be different from that of the input character.
4592765a47cSis  *
4602765a47cSis  * The return value is the byte length of the returned character excluding
4612765a47cSis  * the terminating null byte.
4622765a47cSis  */
4632765a47cSis static size_t
do_case_conv(int uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t is_it_toupper)4642765a47cSis do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
4652765a47cSis {
4662765a47cSis 	size_t i;
4672765a47cSis 	uint16_t b1 = 0;
4682765a47cSis 	uint16_t b2 = 0;
4692765a47cSis 	uint16_t b3 = 0;
4702765a47cSis 	uint16_t b3_tbl;
4712765a47cSis 	uint16_t b3_base;
4722765a47cSis 	uint16_t b4 = 0;
4732765a47cSis 	size_t start_id;
4742765a47cSis 	size_t end_id;
4752765a47cSis 
4762765a47cSis 	/*
4772765a47cSis 	 * At this point, the only possible values for sz are 2, 3, and 4.
4782765a47cSis 	 * The u8s should point to a vector that is well beyond the size of
4792765a47cSis 	 * 5 bytes.
4802765a47cSis 	 */
4812765a47cSis 	if (sz == 2) {
4822765a47cSis 		b3 = u8s[0] = s[0];
4832765a47cSis 		b4 = u8s[1] = s[1];
4842765a47cSis 	} else if (sz == 3) {
4852765a47cSis 		b2 = u8s[0] = s[0];
4862765a47cSis 		b3 = u8s[1] = s[1];
4872765a47cSis 		b4 = u8s[2] = s[2];
4882765a47cSis 	} else if (sz == 4) {
4892765a47cSis 		b1 = u8s[0] = s[0];
4902765a47cSis 		b2 = u8s[1] = s[1];
4912765a47cSis 		b3 = u8s[2] = s[2];
4922765a47cSis 		b4 = u8s[3] = s[3];
4932765a47cSis 	} else {
4942765a47cSis 		/* This is not possible but just in case as a fallback. */
4952765a47cSis 		if (is_it_toupper)
4962765a47cSis 			*u8s = U8_ASCII_TOUPPER(*s);
4972765a47cSis 		else
4982765a47cSis 			*u8s = U8_ASCII_TOLOWER(*s);
4992765a47cSis 		u8s[1] = '\0';
5002765a47cSis 
5012765a47cSis 		return (1);
5022765a47cSis 	}
5032765a47cSis 	u8s[sz] = '\0';
5042765a47cSis 
5052765a47cSis 	/*
5062765a47cSis 	 * Let's find out if we have a corresponding character.
5072765a47cSis 	 */
5082765a47cSis 	b1 = u8_common_b1_tbl[uv][b1];
5092765a47cSis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
5102765a47cSis 		return ((size_t)sz);
5112765a47cSis 
5122765a47cSis 	b2 = u8_case_common_b2_tbl[uv][b1][b2];
5132765a47cSis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
5142765a47cSis 		return ((size_t)sz);
5152765a47cSis 
5162765a47cSis 	if (is_it_toupper) {
5172765a47cSis 		b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
5182765a47cSis 		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
5192765a47cSis 			return ((size_t)sz);
5202765a47cSis 
5212765a47cSis 		start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
5222765a47cSis 		end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
5232765a47cSis 
5242765a47cSis 		/* Either there is no match or an error at the table. */
5252765a47cSis 		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
5262765a47cSis 			return ((size_t)sz);
5272765a47cSis 
5282765a47cSis 		b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
5292765a47cSis 
5302765a47cSis 		for (i = 0; start_id < end_id; start_id++)
5312765a47cSis 			u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
5322765a47cSis 	} else {
5332765a47cSis 		b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
5342765a47cSis 		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
5352765a47cSis 			return ((size_t)sz);
5362765a47cSis 
5372765a47cSis 		start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
5382765a47cSis 		end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
5392765a47cSis 
5402765a47cSis 		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
5412765a47cSis 			return ((size_t)sz);
5422765a47cSis 
5432765a47cSis 		b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
5442765a47cSis 
5452765a47cSis 		for (i = 0; start_id < end_id; start_id++)
5462765a47cSis 			u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
5472765a47cSis 	}
5482765a47cSis 
5492765a47cSis 	/*
5502765a47cSis 	 * If i is still zero, that means there is no corresponding character.
5512765a47cSis 	 */
5522765a47cSis 	if (i == 0)
5532765a47cSis 		return ((size_t)sz);
5542765a47cSis 
5552765a47cSis 	u8s[i] = '\0';
5562765a47cSis 
5572765a47cSis 	return (i);
5582765a47cSis }
5592765a47cSis 
5602765a47cSis /*
5612765a47cSis  * The do_case_compare() function compares the two input strings, s1 and s2,
5622765a47cSis  * one character at a time doing case conversions if applicable and return
5632765a47cSis  * the comparison result as like strcmp().
5642765a47cSis  *
5652765a47cSis  * Since, in empirical sense, most of text data are 7-bit ASCII characters,
5662765a47cSis  * we treat the 7-bit ASCII characters as a special case trying to yield
5672765a47cSis  * faster processing time.
5682765a47cSis  */
5692765a47cSis static int
do_case_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,boolean_t is_it_toupper,int * errnum)5702765a47cSis do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
571*f137b22eSDan McDonald     size_t n2, boolean_t is_it_toupper, int *errnum)
5722765a47cSis {
5732765a47cSis 	int f;
5742765a47cSis 	int sz1;
5752765a47cSis 	int sz2;
5762765a47cSis 	size_t j;
5772765a47cSis 	size_t i1;
5782765a47cSis 	size_t i2;
5792765a47cSis 	uchar_t u8s1[U8_MB_CUR_MAX + 1];
5802765a47cSis 	uchar_t u8s2[U8_MB_CUR_MAX + 1];
5812765a47cSis 
5822765a47cSis 	i1 = i2 = 0;
5832765a47cSis 	while (i1 < n1 && i2 < n2) {
5842765a47cSis 		/*
5852765a47cSis 		 * Find out what would be the byte length for this UTF-8
5862765a47cSis 		 * character at string s1 and also find out if this is
5872765a47cSis 		 * an illegal start byte or not and if so, issue a proper
58885bb5f1dSis 		 * error number and yet treat this byte as a character.
5892765a47cSis 		 */
5902765a47cSis 		sz1 = u8_number_of_bytes[*s1];
5912765a47cSis 		if (sz1 < 0) {
59285bb5f1dSis 			*errnum = EILSEQ;
5932765a47cSis 			sz1 = 1;
5942765a47cSis 		}
5952765a47cSis 
5962765a47cSis 		/*
5972765a47cSis 		 * For 7-bit ASCII characters mainly, we do a quick case
5982765a47cSis 		 * conversion right at here.
5992765a47cSis 		 *
6002765a47cSis 		 * If we don't have enough bytes for this character, issue
6012765a47cSis 		 * an EINVAL error and use what are available.
6022765a47cSis 		 *
6032765a47cSis 		 * If we have enough bytes, find out if there is
6042765a47cSis 		 * a corresponding uppercase character and if so, copy over
6052765a47cSis 		 * the bytes for a comparison later. If there is no
6062765a47cSis 		 * corresponding uppercase character, then, use what we have
6072765a47cSis 		 * for the comparison.
6082765a47cSis 		 */
6092765a47cSis 		if (sz1 == 1) {
6102765a47cSis 			if (is_it_toupper)
6112765a47cSis 				u8s1[0] = U8_ASCII_TOUPPER(*s1);
6122765a47cSis 			else
6132765a47cSis 				u8s1[0] = U8_ASCII_TOLOWER(*s1);
6142765a47cSis 			s1++;
6152765a47cSis 			u8s1[1] = '\0';
6162765a47cSis 		} else if ((i1 + sz1) > n1) {
61785bb5f1dSis 			*errnum = EINVAL;
6182765a47cSis 			for (j = 0; (i1 + j) < n1; )
6192765a47cSis 				u8s1[j++] = *s1++;
6202765a47cSis 			u8s1[j] = '\0';
6212765a47cSis 		} else {
6222765a47cSis 			(void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
6232765a47cSis 			s1 += sz1;
6242765a47cSis 		}
6252765a47cSis 
6262765a47cSis 		/* Do the same for the string s2. */
6272765a47cSis 		sz2 = u8_number_of_bytes[*s2];
6282765a47cSis 		if (sz2 < 0) {
62985bb5f1dSis 			*errnum = EILSEQ;
6302765a47cSis 			sz2 = 1;
6312765a47cSis 		}
6322765a47cSis 
6332765a47cSis 		if (sz2 == 1) {
6342765a47cSis 			if (is_it_toupper)
6352765a47cSis 				u8s2[0] = U8_ASCII_TOUPPER(*s2);
6362765a47cSis 			else
6372765a47cSis 				u8s2[0] = U8_ASCII_TOLOWER(*s2);
6382765a47cSis 			s2++;
6392765a47cSis 			u8s2[1] = '\0';
6402765a47cSis 		} else if ((i2 + sz2) > n2) {
64185bb5f1dSis 			*errnum = EINVAL;
6422765a47cSis 			for (j = 0; (i2 + j) < n2; )
6432765a47cSis 				u8s2[j++] = *s2++;
6442765a47cSis 			u8s2[j] = '\0';
6452765a47cSis 		} else {
6462765a47cSis 			(void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
6472765a47cSis 			s2 += sz2;
6482765a47cSis 		}
6492765a47cSis 
6502765a47cSis 		/* Now compare the two characters. */
6512765a47cSis 		if (sz1 == 1 && sz2 == 1) {
6522765a47cSis 			if (*u8s1 > *u8s2)
6532765a47cSis 				return (1);
6542765a47cSis 			if (*u8s1 < *u8s2)
6552765a47cSis 				return (-1);
6562765a47cSis 		} else {
6572765a47cSis 			f = strcmp((const char *)u8s1, (const char *)u8s2);
6582765a47cSis 			if (f != 0)
6592765a47cSis 				return (f);
6602765a47cSis 		}
6612765a47cSis 
6622765a47cSis 		/*
6632765a47cSis 		 * They were the same. Let's move on to the next
6642765a47cSis 		 * characters then.
6652765a47cSis 		 */
6662765a47cSis 		i1 += sz1;
6672765a47cSis 		i2 += sz2;
6682765a47cSis 	}
6692765a47cSis 
6702765a47cSis 	/*
6712765a47cSis 	 * We compared until the end of either or both strings.
6722765a47cSis 	 *
6732765a47cSis 	 * If we reached to or went over the ends for the both, that means
6742765a47cSis 	 * they are the same.
6752765a47cSis 	 *
6762765a47cSis 	 * If we reached only one of the two ends, that means the other string
6772765a47cSis 	 * has something which then the fact can be used to determine
6782765a47cSis 	 * the return value.
6792765a47cSis 	 */
6802765a47cSis 	if (i1 >= n1) {
6812765a47cSis 		if (i2 >= n2)
6822765a47cSis 			return (0);
6832765a47cSis 		return (-1);
6842765a47cSis 	}
6852765a47cSis 	return (1);
6862765a47cSis }
6872765a47cSis 
6882765a47cSis /*
6892765a47cSis  * The combining_class() function checks on the given bytes and find out
6902765a47cSis  * the corresponding Unicode combining class value. The return value 0 means
6912765a47cSis  * it is a Starter. Any illegal UTF-8 character will also be treated as
6922765a47cSis  * a Starter.
6932765a47cSis  */
6942765a47cSis static uchar_t
combining_class(size_t uv,uchar_t * s,size_t sz)6952765a47cSis combining_class(size_t uv, uchar_t *s, size_t sz)
6962765a47cSis {
6972765a47cSis 	uint16_t b1 = 0;
6982765a47cSis 	uint16_t b2 = 0;
6992765a47cSis 	uint16_t b3 = 0;
7002765a47cSis 	uint16_t b4 = 0;
7012765a47cSis 
7022765a47cSis 	if (sz == 1 || sz > 4)
7032765a47cSis 		return (0);
7042765a47cSis 
7052765a47cSis 	if (sz == 2) {
7062765a47cSis 		b3 = s[0];
7072765a47cSis 		b4 = s[1];
7082765a47cSis 	} else if (sz == 3) {
7092765a47cSis 		b2 = s[0];
7102765a47cSis 		b3 = s[1];
7112765a47cSis 		b4 = s[2];
7122765a47cSis 	} else if (sz == 4) {
7132765a47cSis 		b1 = s[0];
7142765a47cSis 		b2 = s[1];
7152765a47cSis 		b3 = s[2];
7162765a47cSis 		b4 = s[3];
7172765a47cSis 	}
7182765a47cSis 
7192765a47cSis 	b1 = u8_common_b1_tbl[uv][b1];
7202765a47cSis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
7212765a47cSis 		return (0);
7222765a47cSis 
7232765a47cSis 	b2 = u8_combining_class_b2_tbl[uv][b1][b2];
7242765a47cSis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
7252765a47cSis 		return (0);
7262765a47cSis 
7272765a47cSis 	b3 = u8_combining_class_b3_tbl[uv][b2][b3];
7282765a47cSis 	if (b3 == U8_TBL_ELEMENT_NOT_DEF)
7292765a47cSis 		return (0);
7302765a47cSis 
7312765a47cSis 	return (u8_combining_class_b4_tbl[uv][b3][b4]);
7322765a47cSis }
7332765a47cSis 
7342765a47cSis /*
7352765a47cSis  * The do_decomp() function finds out a matching decomposition if any
7362765a47cSis  * and return. If there is no match, the input bytes are copied and returned.
7372765a47cSis  * The function also checks if there is a Hangul, decomposes it if necessary
7382765a47cSis  * and returns.
7392765a47cSis  *
7402765a47cSis  * To save time, a single byte 7-bit ASCII character should be handled by
7412765a47cSis  * the caller.
7422765a47cSis  *
7432765a47cSis  * The function returns the number of bytes returned sans always terminating
7442765a47cSis  * the null byte. It will also return a state that will tell if there was
7452765a47cSis  * a Hangul character decomposed which then will be used by the caller.
7462765a47cSis  */
7472765a47cSis static size_t
do_decomp(size_t uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t canonical_decomposition,u8_normalization_states_t * state)7482765a47cSis do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
749*f137b22eSDan McDonald     boolean_t canonical_decomposition, u8_normalization_states_t *state)
7502765a47cSis {
7512765a47cSis 	uint16_t b1 = 0;
7522765a47cSis 	uint16_t b2 = 0;
7532765a47cSis 	uint16_t b3 = 0;
7542765a47cSis 	uint16_t b3_tbl;
7552765a47cSis 	uint16_t b3_base;
7562765a47cSis 	uint16_t b4 = 0;
7572765a47cSis 	size_t start_id;
7582765a47cSis 	size_t end_id;
7592765a47cSis 	size_t i;
7602765a47cSis 	uint32_t u1;
7612765a47cSis 
7622765a47cSis 	if (sz == 2) {
7632765a47cSis 		b3 = u8s[0] = s[0];
7642765a47cSis 		b4 = u8s[1] = s[1];
7652765a47cSis 		u8s[2] = '\0';
7662765a47cSis 	} else if (sz == 3) {
7672765a47cSis 		/* Convert it to a Unicode scalar value. */
7682765a47cSis 		U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
7692765a47cSis 
7702765a47cSis 		/*
7712765a47cSis 		 * If this is a Hangul syllable, we decompose it into
7722765a47cSis 		 * a leading consonant, a vowel, and an optional trailing
7732765a47cSis 		 * consonant and then return.
7742765a47cSis 		 */
7752765a47cSis 		if (U8_HANGUL_SYLLABLE(u1)) {
7762765a47cSis 			u1 -= U8_HANGUL_SYL_FIRST;
7772765a47cSis 
7782765a47cSis 			b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
7792765a47cSis 			b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
7802765a47cSis 			    / U8_HANGUL_T_COUNT;
7812765a47cSis 			b3 = u1 % U8_HANGUL_T_COUNT;
7822765a47cSis 
7832765a47cSis 			U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
7842765a47cSis 			U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
7852765a47cSis 			if (b3) {
7862765a47cSis 				b3 += U8_HANGUL_JAMO_T_FIRST;
7872765a47cSis 				U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
7882765a47cSis 
7892765a47cSis 				u8s[9] = '\0';
7902765a47cSis 				*state = U8_STATE_HANGUL_LVT;
7912765a47cSis 				return (9);
7922765a47cSis 			}
7932765a47cSis 
7942765a47cSis 			u8s[6] = '\0';
7952765a47cSis 			*state = U8_STATE_HANGUL_LV;
7962765a47cSis 			return (6);
7972765a47cSis 		}
7982765a47cSis 
7992765a47cSis 		b2 = u8s[0] = s[0];
8002765a47cSis 		b3 = u8s[1] = s[1];
8012765a47cSis 		b4 = u8s[2] = s[2];
8022765a47cSis 		u8s[3] = '\0';
8032765a47cSis 
8042765a47cSis 		/*
8052765a47cSis 		 * If this is a Hangul Jamo, we know there is nothing
8062765a47cSis 		 * further that we can decompose.
8072765a47cSis 		 */
8082765a47cSis 		if (U8_HANGUL_JAMO_L(u1)) {
8092765a47cSis 			*state = U8_STATE_HANGUL_L;
8102765a47cSis 			return (3);
8112765a47cSis 		}
8122765a47cSis 
8132765a47cSis 		if (U8_HANGUL_JAMO_V(u1)) {
8142765a47cSis 			if (*state == U8_STATE_HANGUL_L)
8152765a47cSis 				*state = U8_STATE_HANGUL_LV;
8162765a47cSis 			else
8172765a47cSis 				*state = U8_STATE_HANGUL_V;
8182765a47cSis 			return (3);
8192765a47cSis 		}
8202765a47cSis 
8212765a47cSis 		if (U8_HANGUL_JAMO_T(u1)) {
8222765a47cSis 			if (*state == U8_STATE_HANGUL_LV)
8232765a47cSis 				*state = U8_STATE_HANGUL_LVT;
8242765a47cSis 			else
8252765a47cSis 				*state = U8_STATE_HANGUL_T;
8262765a47cSis 			return (3);
8272765a47cSis 		}
8282765a47cSis 	} else if (sz == 4) {
8292765a47cSis 		b1 = u8s[0] = s[0];
8302765a47cSis 		b2 = u8s[1] = s[1];
8312765a47cSis 		b3 = u8s[2] = s[2];
8322765a47cSis 		b4 = u8s[3] = s[3];
8332765a47cSis 		u8s[4] = '\0';
8342765a47cSis 	} else {
8352765a47cSis 		/*
8362765a47cSis 		 * This is a fallback and should not happen if the function
8372765a47cSis 		 * was called properly.
8382765a47cSis 		 */
8392765a47cSis 		u8s[0] = s[0];
8402765a47cSis 		u8s[1] = '\0';
8412765a47cSis 		*state = U8_STATE_START;
8422765a47cSis 		return (1);
8432765a47cSis 	}
8442765a47cSis 
8452765a47cSis 	/*
8462765a47cSis 	 * At this point, this rountine does not know what it would get.
8472765a47cSis 	 * The caller should sort it out if the state isn't a Hangul one.
8482765a47cSis 	 */
8492765a47cSis 	*state = U8_STATE_START;
8502765a47cSis 
8512765a47cSis 	/* Try to find matching decomposition mapping byte sequence. */
8522765a47cSis 	b1 = u8_common_b1_tbl[uv][b1];
8532765a47cSis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
8542765a47cSis 		return ((size_t)sz);
8552765a47cSis 
8562765a47cSis 	b2 = u8_decomp_b2_tbl[uv][b1][b2];
8572765a47cSis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
8582765a47cSis 		return ((size_t)sz);
8592765a47cSis 
8602765a47cSis 	b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
8612765a47cSis 	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
8622765a47cSis 		return ((size_t)sz);
8632765a47cSis 
8642765a47cSis 	/*
8652765a47cSis 	 * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
8662765a47cSis 	 * which is 0x8000, this means we couldn't fit the mappings into
8672765a47cSis 	 * the cardinality of a unsigned byte.
8682765a47cSis 	 */
8692765a47cSis 	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
8702765a47cSis 		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
8712765a47cSis 		start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
8722765a47cSis 		end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
8732765a47cSis 	} else {
8742765a47cSis 		start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
8752765a47cSis 		end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
8762765a47cSis 	}
8772765a47cSis 
8782765a47cSis 	/* This also means there wasn't any matching decomposition. */
8792765a47cSis 	if (start_id >= end_id)
8802765a47cSis 		return ((size_t)sz);
8812765a47cSis 
8822765a47cSis 	/*
8832765a47cSis 	 * The final table for decomposition mappings has three types of
8842765a47cSis 	 * byte sequences depending on whether a mapping is for compatibility
8852765a47cSis 	 * decomposition, canonical decomposition, or both like the following:
8862765a47cSis 	 *
8872765a47cSis 	 * (1) Compatibility decomposition mappings:
8882765a47cSis 	 *
8892765a47cSis 	 *	+---+---+-...-+---+
8902765a47cSis 	 *	| B0| B1| ... | Bm|
8912765a47cSis 	 *	+---+---+-...-+---+
8922765a47cSis 	 *
8932765a47cSis 	 *	The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH).
8942765a47cSis 	 *
8952765a47cSis 	 * (2) Canonical decomposition mappings:
8962765a47cSis 	 *
8972765a47cSis 	 *	+---+---+---+-...-+---+
8982765a47cSis 	 *	| T | b0| b1| ... | bn|
8992765a47cSis 	 *	+---+---+---+-...-+---+
9002765a47cSis 	 *
9012765a47cSis 	 *	where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
9022765a47cSis 	 *
9032765a47cSis 	 * (3) Both mappings:
9042765a47cSis 	 *
9052765a47cSis 	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
9062765a47cSis 	 *	| T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
9072765a47cSis 	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
9082765a47cSis 	 *
9092765a47cSis 	 *	where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
9102765a47cSis 	 *	byte, b0 to bn are canonical mapping bytes and B0 to Bm are
9112765a47cSis 	 *	compatibility mapping bytes.
9122765a47cSis 	 *
9132765a47cSis 	 * Note that compatibility decomposition means doing recursive
9142765a47cSis 	 * decompositions using both compatibility decomposition mappings and
9152765a47cSis 	 * canonical decomposition mappings. On the other hand, canonical
9162765a47cSis 	 * decomposition means doing recursive decompositions using only
9172765a47cSis 	 * canonical decomposition mappings. Since the table we have has gone
9182765a47cSis 	 * through the recursions already, we do not need to do so during
9192765a47cSis 	 * runtime, i.e., the table has been completely flattened out
9202765a47cSis 	 * already.
9212765a47cSis 	 */
9222765a47cSis 
9232765a47cSis 	b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
9242765a47cSis 
9252765a47cSis 	/* Get the type, T, of the byte sequence. */
9262765a47cSis 	b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
9272765a47cSis 
9282765a47cSis 	/*
9292765a47cSis 	 * If necessary, adjust start_id, end_id, or both. Note that if
9302765a47cSis 	 * this is compatibility decomposition mapping, there is no
9312765a47cSis 	 * adjustment.
9322765a47cSis 	 */
9332765a47cSis 	if (canonical_decomposition) {
9342765a47cSis 		/* Is the mapping only for compatibility decomposition? */
9352765a47cSis 		if (b1 < U8_DECOMP_BOTH)
9362765a47cSis 			return ((size_t)sz);
9372765a47cSis 
9382765a47cSis 		start_id++;
9392765a47cSis 
9402765a47cSis 		if (b1 == U8_DECOMP_BOTH) {
9412765a47cSis 			end_id = start_id +
9422765a47cSis 			    u8_decomp_final_tbl[uv][b3_base + start_id];
9432765a47cSis 			start_id++;
9442765a47cSis 		}
9452765a47cSis 	} else {
9462765a47cSis 		/*
9472765a47cSis 		 * Unless this is a compatibility decomposition mapping,
9482765a47cSis 		 * we adjust the start_id.
9492765a47cSis 		 */
9502765a47cSis 		if (b1 == U8_DECOMP_BOTH) {
9512765a47cSis 			start_id++;
9522765a47cSis 			start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
9532765a47cSis 		} else if (b1 == U8_DECOMP_CANONICAL) {
9542765a47cSis 			start_id++;
9552765a47cSis 		}
9562765a47cSis 	}
9572765a47cSis 
9582765a47cSis 	for (i = 0; start_id < end_id; start_id++)
9592765a47cSis 		u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
9602765a47cSis 	u8s[i] = '\0';
9612765a47cSis 
9622765a47cSis 	return (i);
9632765a47cSis }
9642765a47cSis 
9652765a47cSis /*
9662765a47cSis  * The find_composition_start() function uses the character bytes given and
9672765a47cSis  * find out the matching composition mappings if any and return the address
9682765a47cSis  * to the composition mappings as explained in the do_composition().
9692765a47cSis  */
9702765a47cSis static uchar_t *
find_composition_start(size_t uv,uchar_t * s,size_t sz)9712765a47cSis find_composition_start(size_t uv, uchar_t *s, size_t sz)
9722765a47cSis {
9732765a47cSis 	uint16_t b1 = 0;
9742765a47cSis 	uint16_t b2 = 0;
9752765a47cSis 	uint16_t b3 = 0;
9762765a47cSis 	uint16_t b3_tbl;
9772765a47cSis 	uint16_t b3_base;
9782765a47cSis 	uint16_t b4 = 0;
9792765a47cSis 	size_t start_id;
9802765a47cSis 	size_t end_id;
9812765a47cSis 
9822765a47cSis 	if (sz == 1) {
9832765a47cSis 		b4 = s[0];
9842765a47cSis 	} else if (sz == 2) {
9852765a47cSis 		b3 = s[0];
9862765a47cSis 		b4 = s[1];
9872765a47cSis 	} else if (sz == 3) {
9882765a47cSis 		b2 = s[0];
9892765a47cSis 		b3 = s[1];
9902765a47cSis 		b4 = s[2];
9912765a47cSis 	} else if (sz == 4) {
9922765a47cSis 		b1 = s[0];
9932765a47cSis 		b2 = s[1];
9942765a47cSis 		b3 = s[2];
9952765a47cSis 		b4 = s[3];
9962765a47cSis 	} else {
9972765a47cSis 		/*
9982765a47cSis 		 * This is a fallback and should not happen if the function
9992765a47cSis 		 * was called properly.
10002765a47cSis 		 */
10012765a47cSis 		return (NULL);
10022765a47cSis 	}
10032765a47cSis 
10042765a47cSis 	b1 = u8_composition_b1_tbl[uv][b1];
10052765a47cSis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
10062765a47cSis 		return (NULL);
10072765a47cSis 
10082765a47cSis 	b2 = u8_composition_b2_tbl[uv][b1][b2];
10092765a47cSis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
10102765a47cSis 		return (NULL);
10112765a47cSis 
10122765a47cSis 	b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
10132765a47cSis 	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
10142765a47cSis 		return (NULL);
10152765a47cSis 
10162765a47cSis 	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
10172765a47cSis 		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
10182765a47cSis 		start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
10192765a47cSis 		end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
10202765a47cSis 	} else {
10212765a47cSis 		start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
10222765a47cSis 		end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
10232765a47cSis 	}
10242765a47cSis 
10252765a47cSis 	if (start_id >= end_id)
10262765a47cSis 		return (NULL);
10272765a47cSis 
10282765a47cSis 	b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
10292765a47cSis 
10302765a47cSis 	return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
10312765a47cSis }
10322765a47cSis 
10332765a47cSis /*
10342765a47cSis  * The blocked() function checks on the combining class values of previous
10352765a47cSis  * characters in this sequence and return whether it is blocked or not.
10362765a47cSis  */
10372765a47cSis static boolean_t
blocked(uchar_t * comb_class,size_t last)10382765a47cSis blocked(uchar_t *comb_class, size_t last)
10392765a47cSis {
10402765a47cSis 	uchar_t my_comb_class;
10412765a47cSis 	size_t i;
10422765a47cSis 
10432765a47cSis 	my_comb_class = comb_class[last];
10442765a47cSis 	for (i = 1; i < last; i++)
10452765a47cSis 		if (comb_class[i] >= my_comb_class ||
10462765a47cSis 		    comb_class[i] == U8_COMBINING_CLASS_STARTER)
10472765a47cSis 			return (B_TRUE);
10482765a47cSis 
10492765a47cSis 	return (B_FALSE);
10502765a47cSis }
10512765a47cSis 
10522765a47cSis /*
10532765a47cSis  * The do_composition() reads the character string pointed by 's' and
10542765a47cSis  * do necessary canonical composition and then copy over the result back to
10552765a47cSis  * the 's'.
10562765a47cSis  *
10572765a47cSis  * The input argument 's' cannot contain more than 32 characters.
10582765a47cSis  */
10592765a47cSis static size_t
do_composition(size_t uv,uchar_t * s,uchar_t * comb_class,uchar_t * start,uchar_t * disp,size_t last,uchar_t ** os,uchar_t * oslast)10602765a47cSis do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
1061*f137b22eSDan McDonald     uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
10622765a47cSis {
10632765a47cSis 	uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
10642765a47cSis 	uchar_t tc[U8_MB_CUR_MAX];
10652765a47cSis 	uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
10662765a47cSis 	size_t saved_marks_count;
10672765a47cSis 	uchar_t *p;
10682765a47cSis 	uchar_t *saved_p;
10692765a47cSis 	uchar_t *q;
10702765a47cSis 	size_t i;
10712765a47cSis 	size_t saved_i;
10722765a47cSis 	size_t j;
10732765a47cSis 	size_t k;
10742765a47cSis 	size_t l;
10752765a47cSis 	size_t C;
10762765a47cSis 	size_t saved_l;
10772765a47cSis 	size_t size;
10782765a47cSis 	uint32_t u1;
10792765a47cSis 	uint32_t u2;
10802765a47cSis 	boolean_t match_not_found = B_TRUE;
10812765a47cSis 
10822765a47cSis 	/*
10832765a47cSis 	 * This should never happen unless the callers are doing some strange
10842765a47cSis 	 * and unexpected things.
10852765a47cSis 	 *
10862765a47cSis 	 * The "last" is the index pointing to the last character not last + 1.
10872765a47cSis 	 */
10882765a47cSis 	if (last >= U8_MAX_CHARS_A_SEQ)
10892765a47cSis 		last = U8_UPPER_LIMIT_IN_A_SEQ;
10902765a47cSis 
10912765a47cSis 	for (i = l = 0; i <= last; i++) {
10922765a47cSis 		/*
10932765a47cSis 		 * The last or any non-Starters at the beginning, we don't
10942765a47cSis 		 * have any chance to do composition and so we just copy them
10952765a47cSis 		 * to the temporary buffer.
10962765a47cSis 		 */
10972765a47cSis 		if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
10982765a47cSis SAVE_THE_CHAR:
10992765a47cSis 			p = s + start[i];
11002765a47cSis 			size = disp[i];
11012765a47cSis 			for (k = 0; k < size; k++)
11022765a47cSis 				t[l++] = *p++;
11032765a47cSis 			continue;
11042765a47cSis 		}
11052765a47cSis 
11062765a47cSis 		/*
11072765a47cSis 		 * If this could be a start of Hangul Jamos, then, we try to
11082765a47cSis 		 * conjoin them.
11092765a47cSis 		 */
11102765a47cSis 		if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
11112765a47cSis 			U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
11122765a47cSis 			    s[start[i] + 1], s[start[i] + 2]);
11132765a47cSis 			U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
11142765a47cSis 			    s[start[i] + 4], s[start[i] + 5]);
11152765a47cSis 
11162765a47cSis 			if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
11172765a47cSis 				u1 -= U8_HANGUL_JAMO_L_FIRST;
11182765a47cSis 				u2 -= U8_HANGUL_JAMO_V_FIRST;
11192765a47cSis 				u1 = U8_HANGUL_SYL_FIRST +
11202765a47cSis 				    (u1 * U8_HANGUL_V_COUNT + u2) *
11212765a47cSis 				    U8_HANGUL_T_COUNT;
11222765a47cSis 
11232765a47cSis 				i += 2;
11242765a47cSis 				if (i <= last) {
11252765a47cSis 					U8_PUT_3BYTES_INTO_UTF32(u2,
11262765a47cSis 					    s[start[i]], s[start[i] + 1],
11272765a47cSis 					    s[start[i] + 2]);
11282765a47cSis 
11292765a47cSis 					if (U8_HANGUL_JAMO_T(u2)) {
11302765a47cSis 						u1 += u2 -
11312765a47cSis 						    U8_HANGUL_JAMO_T_FIRST;
11322765a47cSis 						i++;
11332765a47cSis 					}
11342765a47cSis 				}
11352765a47cSis 
11362765a47cSis 				U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
11372765a47cSis 				i--;
11382765a47cSis 				l += 3;
11392765a47cSis 				continue;
11402765a47cSis 			}
11412765a47cSis 		}
11422765a47cSis 
11432765a47cSis 		/*
11442765a47cSis 		 * Let's then find out if this Starter has composition
11452765a47cSis 		 * mapping.
11462765a47cSis 		 */
11472765a47cSis 		p = find_composition_start(uv, s + start[i], disp[i]);
11482765a47cSis 		if (p == NULL)
11492765a47cSis 			goto SAVE_THE_CHAR;
11502765a47cSis 
11512765a47cSis 		/*
11522765a47cSis 		 * We have a Starter with composition mapping and the next
11532765a47cSis 		 * character is a non-Starter. Let's try to find out if
11542765a47cSis 		 * we can do composition.
11552765a47cSis 		 */
11562765a47cSis 
11572765a47cSis 		saved_p = p;
11582765a47cSis 		saved_i = i;
11592765a47cSis 		saved_l = l;
11602765a47cSis 		saved_marks_count = 0;
11612765a47cSis 
11622765a47cSis TRY_THE_NEXT_MARK:
11632765a47cSis 		q = s + start[++i];
11642765a47cSis 		size = disp[i];
11652765a47cSis 
11662765a47cSis 		/*
11672765a47cSis 		 * The next for() loop compares the non-Starter pointed by
11682765a47cSis 		 * 'q' with the possible (joinable) characters pointed by 'p'.
11692765a47cSis 		 *
11702765a47cSis 		 * The composition final table entry pointed by the 'p'
11712765a47cSis 		 * looks like the following:
11722765a47cSis 		 *
11732765a47cSis 		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
11742765a47cSis 		 * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F |
11752765a47cSis 		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
11762765a47cSis 		 *
11772765a47cSis 		 * where C is the count byte indicating the number of
11782765a47cSis 		 * mapping pairs where each pair would be look like
11792765a47cSis 		 * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
11802765a47cSis 		 * character of a canonical decomposition and the B0-Bm are
11812765a47cSis 		 * the bytes of a matching composite character. The F is
11822765a47cSis 		 * a filler byte after each character as the separator.
11832765a47cSis 		 */
11842765a47cSis 
11852765a47cSis 		match_not_found = B_TRUE;
11862765a47cSis 
11872765a47cSis 		for (C = *p++; C > 0; C--) {
11882765a47cSis 			for (k = 0; k < size; p++, k++)
11892765a47cSis 				if (*p != q[k])
11902765a47cSis 					break;
11912765a47cSis 
11922765a47cSis 			/* Have we found it? */
11932765a47cSis 			if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
11942765a47cSis 				match_not_found = B_FALSE;
11952765a47cSis 
11962765a47cSis 				l = saved_l;
11972765a47cSis 
11982765a47cSis 				while (*++p != U8_TBL_ELEMENT_FILLER)
11992765a47cSis 					t[l++] = *p;
12002765a47cSis 
12012765a47cSis 				break;
12022765a47cSis 			}
12032765a47cSis 
12042765a47cSis 			/* We didn't find; skip to the next pair. */
12052765a47cSis 			if (*p != U8_TBL_ELEMENT_FILLER)
12062765a47cSis 				while (*++p != U8_TBL_ELEMENT_FILLER)
12072765a47cSis 					;
12082765a47cSis 			while (*++p != U8_TBL_ELEMENT_FILLER)
12092765a47cSis 				;
12102765a47cSis 			p++;
12112765a47cSis 		}
12122765a47cSis 
12132765a47cSis 		/*
12142765a47cSis 		 * If there was no match, we will need to save the combining
12152765a47cSis 		 * mark for later appending. After that, if the next one
12162765a47cSis 		 * is a non-Starter and not blocked, then, we try once
12172765a47cSis 		 * again to do composition with the next non-Starter.
12182765a47cSis 		 *
12192765a47cSis 		 * If there was no match and this was a Starter, then,
12202765a47cSis 		 * this is a new start.
12212765a47cSis 		 *
12222765a47cSis 		 * If there was a match and a composition done and we have
12232765a47cSis 		 * more to check on, then, we retrieve a new composition final
12242765a47cSis 		 * table entry for the composite and then try to do the
12252765a47cSis 		 * composition again.
12262765a47cSis 		 */
12272765a47cSis 
12282765a47cSis 		if (match_not_found) {
12292765a47cSis 			if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
12302765a47cSis 				i--;
12312765a47cSis 				goto SAVE_THE_CHAR;
12322765a47cSis 			}
12332765a47cSis 
12342765a47cSis 			saved_marks[saved_marks_count++] = i;
12352765a47cSis 		}
12362765a47cSis 
12372765a47cSis 		if (saved_l == l) {
12382765a47cSis 			while (i < last) {
12392765a47cSis 				if (blocked(comb_class, i + 1))
12402765a47cSis 					saved_marks[saved_marks_count++] = ++i;
12412765a47cSis 				else
12422765a47cSis 					break;
12432765a47cSis 			}
12442765a47cSis 			if (i < last) {
12452765a47cSis 				p = saved_p;
12462765a47cSis 				goto TRY_THE_NEXT_MARK;
12472765a47cSis 			}
12482765a47cSis 		} else if (i < last) {
12492765a47cSis 			p = find_composition_start(uv, t + saved_l,
12502765a47cSis 			    l - saved_l);
12512765a47cSis 			if (p != NULL) {
12522765a47cSis 				saved_p = p;
12532765a47cSis 				goto TRY_THE_NEXT_MARK;
12542765a47cSis 			}
12552765a47cSis 		}
12562765a47cSis 
12572765a47cSis 		/*
12582765a47cSis 		 * There is no more composition possible.
12592765a47cSis 		 *
12602765a47cSis 		 * If there was no composition what so ever then we copy
12612765a47cSis 		 * over the original Starter and then append any non-Starters
12622765a47cSis 		 * remaining at the target string sequentially after that.
12632765a47cSis 		 */
12642765a47cSis 
12652765a47cSis 		if (saved_l == l) {
12662765a47cSis 			p = s + start[saved_i];
12672765a47cSis 			size = disp[saved_i];
12682765a47cSis 			for (j = 0; j < size; j++)
12692765a47cSis 				t[l++] = *p++;
12702765a47cSis 		}
12712765a47cSis 
12722765a47cSis 		for (k = 0; k < saved_marks_count; k++) {
12732765a47cSis 			p = s + start[saved_marks[k]];
12742765a47cSis 			size = disp[saved_marks[k]];
12752765a47cSis 			for (j = 0; j < size; j++)
12762765a47cSis 				t[l++] = *p++;
12772765a47cSis 		}
12782765a47cSis 	}
12792765a47cSis 
12802765a47cSis 	/*
12812765a47cSis 	 * If the last character is a Starter and if we have a character
12822765a47cSis 	 * (possibly another Starter) that can be turned into a composite,
12832765a47cSis 	 * we do so and we do so until there is no more of composition
12842765a47cSis 	 * possible.
12852765a47cSis 	 */
12862765a47cSis 	if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
12872765a47cSis 		p = *os;
12882765a47cSis 		saved_l = l - disp[last];
12892765a47cSis 
12902765a47cSis 		while (p < oslast) {
1291*f137b22eSDan McDonald 			int8_t number_of_bytes = u8_number_of_bytes[*p];
1292*f137b22eSDan McDonald 
1293*f137b22eSDan McDonald 			if (number_of_bytes <= 1)
1294*f137b22eSDan McDonald 				break;
1295*f137b22eSDan McDonald 			size = number_of_bytes;
1296*f137b22eSDan McDonald 			if ((p + size) > oslast)
12972765a47cSis 				break;
12982765a47cSis 
12992765a47cSis 			saved_p = p;
13002765a47cSis 
13012765a47cSis 			for (i = 0; i < size; i++)
13022765a47cSis 				tc[i] = *p++;
13032765a47cSis 
13042765a47cSis 			q = find_composition_start(uv, t + saved_l,
13052765a47cSis 			    l - saved_l);
13062765a47cSis 			if (q == NULL) {
13072765a47cSis 				p = saved_p;
13082765a47cSis 				break;
13092765a47cSis 			}
13102765a47cSis 
13112765a47cSis 			match_not_found = B_TRUE;
13122765a47cSis 
13132765a47cSis 			for (C = *q++; C > 0; C--) {
13142765a47cSis 				for (k = 0; k < size; q++, k++)
13152765a47cSis 					if (*q != tc[k])
13162765a47cSis 						break;
13172765a47cSis 
13182765a47cSis 				if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
13192765a47cSis 					match_not_found = B_FALSE;
13202765a47cSis 
13212765a47cSis 					l = saved_l;
13222765a47cSis 
13232765a47cSis 					while (*++q != U8_TBL_ELEMENT_FILLER) {
13242765a47cSis 						/*
13252765a47cSis 						 * This is practically
13262765a47cSis 						 * impossible but we don't
13272765a47cSis 						 * want to take any chances.
13282765a47cSis 						 */
13292765a47cSis 						if (l >=
13302765a47cSis 						    U8_STREAM_SAFE_TEXT_MAX) {
13312765a47cSis 							p = saved_p;
13322765a47cSis 							goto SAFE_RETURN;
13332765a47cSis 						}
13342765a47cSis 						t[l++] = *q;
13352765a47cSis 					}
13362765a47cSis 
13372765a47cSis 					break;
13382765a47cSis 				}
13392765a47cSis 
13402765a47cSis 				if (*q != U8_TBL_ELEMENT_FILLER)
13412765a47cSis 					while (*++q != U8_TBL_ELEMENT_FILLER)
13422765a47cSis 						;
13432765a47cSis 				while (*++q != U8_TBL_ELEMENT_FILLER)
13442765a47cSis 					;
13452765a47cSis 				q++;
13462765a47cSis 			}
13472765a47cSis 
13482765a47cSis 			if (match_not_found) {
13492765a47cSis 				p = saved_p;
13502765a47cSis 				break;
13512765a47cSis 			}
13522765a47cSis 		}
13532765a47cSis SAFE_RETURN:
13542765a47cSis 		*os = p;
13552765a47cSis 	}
13562765a47cSis 
13572765a47cSis 	/*
13582765a47cSis 	 * Now we copy over the temporary string to the target string.
13592765a47cSis 	 * Since composition always reduces the number of characters or
13602765a47cSis 	 * the number of characters stay, we don't need to worry about
13612765a47cSis 	 * the buffer overflow here.
13622765a47cSis 	 */
13632765a47cSis 	for (i = 0; i < l; i++)
13642765a47cSis 		s[i] = t[i];
13652765a47cSis 	s[l] = '\0';
13662765a47cSis 
13672765a47cSis 	return (l);
13682765a47cSis }
13692765a47cSis 
/*
 * The collect_a_seq() function examines the given string s, collects
 * a sequence of characters at u8s, and returns the sequence. While it
 * collects a sequence, it also applies case conversion, canonical or
 * compatibility decomposition, canonical composition, or some or all of
 * them, in that order.
 *
 * The collected sequence cannot be bigger than 32 characters: if it would
 * have more than 31 characters, the sequence is terminated with
 * a U+034F COMBINING GRAPHEME JOINER (CGJ) character and thereby turned into
 * a Stream-Safe Text. The collected sequence is always terminated with
 * a null byte. The return value is the byte length of the sequence, which
 * may be 0, and does not include the terminating null byte.
 */
13852765a47cSis static size_t
collect_a_seq(size_t uv,uchar_t * u8s,uchar_t ** source,uchar_t * slast,boolean_t is_it_toupper,boolean_t is_it_tolower,boolean_t canonical_decomposition,boolean_t compatibility_decomposition,boolean_t canonical_composition,int * errnum,u8_normalization_states_t * state)13862765a47cSis collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
1387*f137b22eSDan McDonald     boolean_t is_it_toupper,
1388*f137b22eSDan McDonald     boolean_t is_it_tolower,
1389*f137b22eSDan McDonald     boolean_t canonical_decomposition,
1390*f137b22eSDan McDonald     boolean_t compatibility_decomposition,
1391*f137b22eSDan McDonald     boolean_t canonical_composition,
1392*f137b22eSDan McDonald     int *errnum, u8_normalization_states_t *state)
13932765a47cSis {
13942765a47cSis 	uchar_t *s;
13952765a47cSis 	int sz;
13962765a47cSis 	int saved_sz;
13972765a47cSis 	size_t i;
13982765a47cSis 	size_t j;
13992765a47cSis 	size_t k;
14002765a47cSis 	size_t l;
14012765a47cSis 	uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
14022765a47cSis 	uchar_t disp[U8_MAX_CHARS_A_SEQ];
14032765a47cSis 	uchar_t start[U8_MAX_CHARS_A_SEQ];
14042765a47cSis 	uchar_t u8t[U8_MB_CUR_MAX];
14052765a47cSis 	uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
14062765a47cSis 	uchar_t tc;
14072765a47cSis 	size_t last;
14082765a47cSis 	size_t saved_last;
14092765a47cSis 	uint32_t u1;
14102765a47cSis 
14112765a47cSis 	/*
14122765a47cSis 	 * Save the source string pointer which we will return a changed
14132765a47cSis 	 * pointer if we do processing.
14142765a47cSis 	 */
14152765a47cSis 	s = *source;
14162765a47cSis 
14172765a47cSis 	/*
14182765a47cSis 	 * The following is a fallback for just in case callers are not
14192765a47cSis 	 * checking the string boundaries before the calling.
14202765a47cSis 	 */
14212765a47cSis 	if (s >= slast) {
14222765a47cSis 		u8s[0] = '\0';
14232765a47cSis 
14242765a47cSis 		return (0);
14252765a47cSis 	}
14262765a47cSis 
14272765a47cSis 	/*
14282765a47cSis 	 * As the first thing, let's collect a character and do case
14292765a47cSis 	 * conversion if necessary.
14302765a47cSis 	 */
14312765a47cSis 
14322765a47cSis 	sz = u8_number_of_bytes[*s];
14332765a47cSis 
14342765a47cSis 	if (sz < 0) {
143585bb5f1dSis 		*errnum = EILSEQ;
14362765a47cSis 
14372765a47cSis 		u8s[0] = *s++;
14382765a47cSis 		u8s[1] = '\0';
14392765a47cSis 
14402765a47cSis 		*source = s;
14412765a47cSis 
14422765a47cSis 		return (1);
14432765a47cSis 	}
14442765a47cSis 
14452765a47cSis 	if (sz == 1) {
14462765a47cSis 		if (is_it_toupper)
14472765a47cSis 			u8s[0] = U8_ASCII_TOUPPER(*s);
14482765a47cSis 		else if (is_it_tolower)
14492765a47cSis 			u8s[0] = U8_ASCII_TOLOWER(*s);
14502765a47cSis 		else
14512765a47cSis 			u8s[0] = *s;
14522765a47cSis 		s++;
14532765a47cSis 		u8s[1] = '\0';
14542765a47cSis 	} else if ((s + sz) > slast) {
145585bb5f1dSis 		*errnum = EINVAL;
14562765a47cSis 
14572765a47cSis 		for (i = 0; s < slast; )
14582765a47cSis 			u8s[i++] = *s++;
14592765a47cSis 		u8s[i] = '\0';
14602765a47cSis 
14612765a47cSis 		*source = s;
14622765a47cSis 
14632765a47cSis 		return (i);
14642765a47cSis 	} else {
14652765a47cSis 		if (is_it_toupper || is_it_tolower) {
14662765a47cSis 			i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
14672765a47cSis 			s += sz;
14682765a47cSis 			sz = i;
14692765a47cSis 		} else {
14702765a47cSis 			for (i = 0; i < sz; )
14712765a47cSis 				u8s[i++] = *s++;
14722765a47cSis 			u8s[i] = '\0';
14732765a47cSis 		}
14742765a47cSis 	}
14752765a47cSis 
14762765a47cSis 	/*
14772765a47cSis 	 * And then canonical/compatibility decomposition followed by
14782765a47cSis 	 * an optional canonical composition. Please be noted that
14792765a47cSis 	 * canonical composition is done only when a decomposition is
14802765a47cSis 	 * done.
14812765a47cSis 	 */
14822765a47cSis 	if (canonical_decomposition || compatibility_decomposition) {
14832765a47cSis 		if (sz == 1) {
14842765a47cSis 			*state = U8_STATE_START;
14852765a47cSis 
14862765a47cSis 			saved_sz = 1;
14872765a47cSis 
14882765a47cSis 			comb_class[0] = 0;
14892765a47cSis 			start[0] = 0;
14902765a47cSis 			disp[0] = 1;
14912765a47cSis 
14922765a47cSis 			last = 1;
14932765a47cSis 		} else {
14942765a47cSis 			saved_sz = do_decomp(uv, u8s, u8s, sz,
14952765a47cSis 			    canonical_decomposition, state);
14962765a47cSis 
14972765a47cSis 			last = 0;
14982765a47cSis 
14992765a47cSis 			for (i = 0; i < saved_sz; ) {
15002765a47cSis 				sz = u8_number_of_bytes[u8s[i]];
15012765a47cSis 
15022765a47cSis 				comb_class[last] = combining_class(uv,
15032765a47cSis 				    u8s + i, sz);
15042765a47cSis 				start[last] = i;
15052765a47cSis 				disp[last] = sz;
15062765a47cSis 
15072765a47cSis 				last++;
15082765a47cSis 				i += sz;
15092765a47cSis 			}
15102765a47cSis 
15112765a47cSis 			/*
15122765a47cSis 			 * Decomposition yields various Hangul related
15132765a47cSis 			 * states but not on combining marks. We need to
15142765a47cSis 			 * find out at here by checking on the last
15152765a47cSis 			 * character.
15162765a47cSis 			 */
15172765a47cSis 			if (*state == U8_STATE_START) {
15182765a47cSis 				if (comb_class[last - 1])
15192765a47cSis 					*state = U8_STATE_COMBINING_MARK;
15202765a47cSis 			}
15212765a47cSis 		}
15222765a47cSis 
15232765a47cSis 		saved_last = last;
15242765a47cSis 
15252765a47cSis 		while (s < slast) {
15262765a47cSis 			sz = u8_number_of_bytes[*s];
15272765a47cSis 
15282765a47cSis 			/*
15292765a47cSis 			 * If this is an illegal character, an incomplete
15302765a47cSis 			 * character, or an 7-bit ASCII Starter character,
15312765a47cSis 			 * then we have collected a sequence; break and let
15322765a47cSis 			 * the next call deal with the two cases.
15332765a47cSis 			 *
15342765a47cSis 			 * Note that this is okay only if you are using this
15352765a47cSis 			 * function with a fixed length string, not on
15362765a47cSis 			 * a buffer with multiple calls of one chunk at a time.
15372765a47cSis 			 */
15382765a47cSis 			if (sz <= 1) {
15392765a47cSis 				break;
15402765a47cSis 			} else if ((s + sz) > slast) {
15412765a47cSis 				break;
15422765a47cSis 			} else {
15432765a47cSis 				/*
15442765a47cSis 				 * If the previous character was a Hangul Jamo
15452765a47cSis 				 * and this character is a Hangul Jamo that
15462765a47cSis 				 * can be conjoined, we collect the Jamo.
15472765a47cSis 				 */
15482765a47cSis 				if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
15492765a47cSis 					U8_PUT_3BYTES_INTO_UTF32(u1,
15502765a47cSis 					    *s, *(s + 1), *(s + 2));
15512765a47cSis 
15522765a47cSis 					if (U8_HANGUL_COMPOSABLE_L_V(*state,
15532765a47cSis 					    u1)) {
15542765a47cSis 						i = 0;
15552765a47cSis 						*state = U8_STATE_HANGUL_LV;
15562765a47cSis 						goto COLLECT_A_HANGUL;
15572765a47cSis 					}
15582765a47cSis 
15592765a47cSis 					if (U8_HANGUL_COMPOSABLE_LV_T(*state,
15602765a47cSis 					    u1)) {
15612765a47cSis 						i = 0;
15622765a47cSis 						*state = U8_STATE_HANGUL_LVT;
15632765a47cSis 						goto COLLECT_A_HANGUL;
15642765a47cSis 					}
15652765a47cSis 				}
15662765a47cSis 
15672765a47cSis 				/*
15682765a47cSis 				 * Regardless of whatever it was, if this is
15692765a47cSis 				 * a Starter, we don't collect the character
15702765a47cSis 				 * since that's a new start and we will deal
15712765a47cSis 				 * with it at the next time.
15722765a47cSis 				 */
15732765a47cSis 				i = combining_class(uv, s, sz);
15742765a47cSis 				if (i == U8_COMBINING_CLASS_STARTER)
15752765a47cSis 					break;
15762765a47cSis 
15772765a47cSis 				/*
15782765a47cSis 				 * We know the current character is a combining
15792765a47cSis 				 * mark. If the previous character wasn't
15802765a47cSis 				 * a Starter (not Hangul) or a combining mark,
15812765a47cSis 				 * then, we don't collect this combining mark.
15822765a47cSis 				 */
15832765a47cSis 				if (*state != U8_STATE_START &&
15842765a47cSis 				    *state != U8_STATE_COMBINING_MARK)
15852765a47cSis 					break;
15862765a47cSis 
15872765a47cSis 				*state = U8_STATE_COMBINING_MARK;
15882765a47cSis COLLECT_A_HANGUL:
15892765a47cSis 				/*
15902765a47cSis 				 * If we collected a Starter and combining
15912765a47cSis 				 * marks up to 30, i.e., total 31 characters,
15922765a47cSis 				 * then, we terminate this degenerately long
15932765a47cSis 				 * combining sequence with a U+034F COMBINING
15942765a47cSis 				 * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
15952765a47cSis 				 * UTF-8 and turn this into a Stream-Safe
15962765a47cSis 				 * Text. This will be extremely rare but
15972765a47cSis 				 * possible.
15982765a47cSis 				 *
15992765a47cSis 				 * The following will also guarantee that
16002765a47cSis 				 * we are not writing more than 32 characters
16012765a47cSis 				 * plus a NULL at u8s[].
16022765a47cSis 				 */
16032765a47cSis 				if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
16042765a47cSis TURN_STREAM_SAFE:
16052765a47cSis 					*state = U8_STATE_START;
16062765a47cSis 					comb_class[last] = 0;
16072765a47cSis 					start[last] = saved_sz;
16082765a47cSis 					disp[last] = 2;
16092765a47cSis 					last++;
16102765a47cSis 
16112765a47cSis 					u8s[saved_sz++] = 0xCD;
16122765a47cSis 					u8s[saved_sz++] = 0x8F;
16132765a47cSis 
16142765a47cSis 					break;
16152765a47cSis 				}
16162765a47cSis 
16172765a47cSis 				/*
16182765a47cSis 				 * Some combining marks also do decompose into
16192765a47cSis 				 * another combining mark or marks.
16202765a47cSis 				 */
16212765a47cSis 				if (*state == U8_STATE_COMBINING_MARK) {
16222765a47cSis 					k = last;
16232765a47cSis 					l = sz;
16242765a47cSis 					i = do_decomp(uv, uts, s, sz,
16252765a47cSis 					    canonical_decomposition, state);
16262765a47cSis 					for (j = 0; j < i; ) {
16272765a47cSis 						sz = u8_number_of_bytes[uts[j]];
16282765a47cSis 
16292765a47cSis 						comb_class[last] =
16302765a47cSis 						    combining_class(uv,
16312765a47cSis 						    uts + j, sz);
16322765a47cSis 						start[last] = saved_sz + j;
16332765a47cSis 						disp[last] = sz;
16342765a47cSis 
16352765a47cSis 						last++;
16362765a47cSis 						if (last >=
16372765a47cSis 						    U8_UPPER_LIMIT_IN_A_SEQ) {
16382765a47cSis 							last = k;
16392765a47cSis 							goto TURN_STREAM_SAFE;
16402765a47cSis 						}
16412765a47cSis 						j += sz;
16422765a47cSis 					}
16432765a47cSis 
16442765a47cSis 					*state = U8_STATE_COMBINING_MARK;
16452765a47cSis 					sz = i;
16462765a47cSis 					s += l;
16472765a47cSis 
16482765a47cSis 					for (i = 0; i < sz; i++)
16492765a47cSis 						u8s[saved_sz++] = uts[i];
16502765a47cSis 				} else {
16512765a47cSis 					comb_class[last] = i;
16522765a47cSis 					start[last] = saved_sz;
16532765a47cSis 					disp[last] = sz;
16542765a47cSis 					last++;
16552765a47cSis 
16562765a47cSis 					for (i = 0; i < sz; i++)
16572765a47cSis 						u8s[saved_sz++] = *s++;
16582765a47cSis 				}
16592765a47cSis 
16602765a47cSis 				/*
16612765a47cSis 				 * If this is U+0345 COMBINING GREEK
16622765a47cSis 				 * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
16632765a47cSis 				 * iota subscript, and need to be converted to
16642765a47cSis 				 * uppercase letter, convert it to U+0399 GREEK
16652765a47cSis 				 * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
16662765a47cSis 				 * i.e., convert to capital adscript form as
16672765a47cSis 				 * specified in the Unicode standard.
16682765a47cSis 				 *
16692765a47cSis 				 * This is the only special case of (ambiguous)
16702765a47cSis 				 * case conversion at combining marks and
16712765a47cSis 				 * probably the standard will never have
16722765a47cSis 				 * anything similar like this in future.
16732765a47cSis 				 */
16742765a47cSis 				if (is_it_toupper && sz >= 2 &&
16752765a47cSis 				    u8s[saved_sz - 2] == 0xCD &&
16762765a47cSis 				    u8s[saved_sz - 1] == 0x85) {
16772765a47cSis 					u8s[saved_sz - 2] = 0xCE;
16782765a47cSis 					u8s[saved_sz - 1] = 0x99;
16792765a47cSis 				}
16802765a47cSis 			}
16812765a47cSis 		}
16822765a47cSis 
16832765a47cSis 		/*
16842765a47cSis 		 * Let's try to ensure a canonical ordering for the collected
16852765a47cSis 		 * combining marks. We do this only if we have collected
16862765a47cSis 		 * at least one more non-Starter. (The decomposition mapping
16872765a47cSis 		 * data tables have fully (and recursively) expanded and
16882765a47cSis 		 * canonically ordered decompositions.)
16892765a47cSis 		 *
16902765a47cSis 		 * The U8_SWAP_COMB_MARKS() convenience macro has some
16912765a47cSis 		 * assumptions and we are meeting the assumptions.
16922765a47cSis 		 */
16932765a47cSis 		last--;
16942765a47cSis 		if (last >= saved_last) {
16952765a47cSis 			for (i = 0; i < last; i++)
16962765a47cSis 				for (j = last; j > i; j--)
16972765a47cSis 					if (comb_class[j] &&
16982765a47cSis 					    comb_class[j - 1] > comb_class[j]) {
16992765a47cSis 						U8_SWAP_COMB_MARKS(j - 1, j);
17002765a47cSis 					}
17012765a47cSis 		}
17022765a47cSis 
17032765a47cSis 		*source = s;
17042765a47cSis 
17052765a47cSis 		if (! canonical_composition) {
17062765a47cSis 			u8s[saved_sz] = '\0';
17072765a47cSis 			return (saved_sz);
17082765a47cSis 		}
17092765a47cSis 
17102765a47cSis 		/*
17112765a47cSis 		 * Now do the canonical composition. Note that we do this
17122765a47cSis 		 * only after a canonical or compatibility decomposition to
17132765a47cSis 		 * finish up NFC or NFKC.
17142765a47cSis 		 */
17152765a47cSis 		sz = do_composition(uv, u8s, comb_class, start, disp, last,
17162765a47cSis 		    &s, slast);
17172765a47cSis 	}
17182765a47cSis 
17192765a47cSis 	*source = s;
17202765a47cSis 
17212765a47cSis 	return ((size_t)sz);
17222765a47cSis }
17232765a47cSis 
17242765a47cSis /*
17252765a47cSis  * The do_norm_compare() function does string comparion based on Unicode
17262765a47cSis  * simple case mappings and Unicode Normalization definitions.
17272765a47cSis  *
17282765a47cSis  * It does so by collecting a sequence of character at a time and comparing
17292765a47cSis  * the collected sequences from the strings.
17302765a47cSis  *
17312765a47cSis  * The meanings on the return values are the same as the usual strcmp().
17322765a47cSis  */
17332765a47cSis static int
do_norm_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,int flag,int * errnum)17342765a47cSis do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
1735*f137b22eSDan McDonald     int flag, int *errnum)
17362765a47cSis {
17372765a47cSis 	int result;
17382765a47cSis 	size_t sz1;
17392765a47cSis 	size_t sz2;
17402765a47cSis 	uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
17412765a47cSis 	uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
17422765a47cSis 	uchar_t *s1last;
17432765a47cSis 	uchar_t *s2last;
17442765a47cSis 	boolean_t is_it_toupper;
17452765a47cSis 	boolean_t is_it_tolower;
17462765a47cSis 	boolean_t canonical_decomposition;
17472765a47cSis 	boolean_t compatibility_decomposition;
17482765a47cSis 	boolean_t canonical_composition;
17492765a47cSis 	u8_normalization_states_t state;
17502765a47cSis 
17512765a47cSis 	s1last = s1 + n1;
17522765a47cSis 	s2last = s2 + n2;
17532765a47cSis 
17542765a47cSis 	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
17552765a47cSis 	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
17562765a47cSis 	canonical_decomposition = flag & U8_CANON_DECOMP;
17572765a47cSis 	compatibility_decomposition = flag & U8_COMPAT_DECOMP;
17582765a47cSis 	canonical_composition = flag & U8_CANON_COMP;
17592765a47cSis 
17602765a47cSis 	while (s1 < s1last && s2 < s2last) {
17612765a47cSis 		/*
17622765a47cSis 		 * If the current character is a 7-bit ASCII and the last
17632765a47cSis 		 * character, or, if the current character and the next
17642765a47cSis 		 * character are both some 7-bit ASCII characters then
17652765a47cSis 		 * we treat the current character as a sequence.
17662765a47cSis 		 *
17672765a47cSis 		 * In any other cases, we need to call collect_a_seq().
17682765a47cSis 		 */
17692765a47cSis 
17702765a47cSis 		if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
17712765a47cSis 		    ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
17722765a47cSis 			if (is_it_toupper)
17732765a47cSis 				u8s1[0] = U8_ASCII_TOUPPER(*s1);
17742765a47cSis 			else if (is_it_tolower)
17752765a47cSis 				u8s1[0] = U8_ASCII_TOLOWER(*s1);
17762765a47cSis 			else
17772765a47cSis 				u8s1[0] = *s1;
17782765a47cSis 			u8s1[1] = '\0';
17792765a47cSis 			sz1 = 1;
17802765a47cSis 			s1++;
17812765a47cSis 		} else {
17822765a47cSis 			state = U8_STATE_START;
17832765a47cSis 			sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
17842765a47cSis 			    is_it_toupper, is_it_tolower,
17852765a47cSis 			    canonical_decomposition,
17862765a47cSis 			    compatibility_decomposition,
178785bb5f1dSis 			    canonical_composition, errnum, &state);
17882765a47cSis 		}
17892765a47cSis 
17902765a47cSis 		if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
17912765a47cSis 		    ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
17922765a47cSis 			if (is_it_toupper)
17932765a47cSis 				u8s2[0] = U8_ASCII_TOUPPER(*s2);
17942765a47cSis 			else if (is_it_tolower)
17952765a47cSis 				u8s2[0] = U8_ASCII_TOLOWER(*s2);
17962765a47cSis 			else
17972765a47cSis 				u8s2[0] = *s2;
17982765a47cSis 			u8s2[1] = '\0';
17992765a47cSis 			sz2 = 1;
18002765a47cSis 			s2++;
18012765a47cSis 		} else {
18022765a47cSis 			state = U8_STATE_START;
18032765a47cSis 			sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
18042765a47cSis 			    is_it_toupper, is_it_tolower,
18052765a47cSis 			    canonical_decomposition,
18062765a47cSis 			    compatibility_decomposition,
180785bb5f1dSis 			    canonical_composition, errnum, &state);
18082765a47cSis 		}
18092765a47cSis 
18102765a47cSis 		/*
18112765a47cSis 		 * Now compare the two characters. If they are the same,
18122765a47cSis 		 * we move on to the next character sequences.
18132765a47cSis 		 */
18142765a47cSis 		if (sz1 == 1 && sz2 == 1) {
18152765a47cSis 			if (*u8s1 > *u8s2)
18162765a47cSis 				return (1);
18172765a47cSis 			if (*u8s1 < *u8s2)
18182765a47cSis 				return (-1);
18192765a47cSis 		} else {
18202765a47cSis 			result = strcmp((const char *)u8s1, (const char *)u8s2);
18212765a47cSis 			if (result != 0)
18222765a47cSis 				return (result);
18232765a47cSis 		}
18242765a47cSis 	}
18252765a47cSis 
18262765a47cSis 	/*
18272765a47cSis 	 * We compared until the end of either or both strings.
18282765a47cSis 	 *
18292765a47cSis 	 * If we reached to or went over the ends for the both, that means
18302765a47cSis 	 * they are the same.
18312765a47cSis 	 *
18322765a47cSis 	 * If we reached only one end, that means the other string has
18332765a47cSis 	 * something which then can be used to determine the return value.
18342765a47cSis 	 */
18352765a47cSis 	if (s1 >= s1last) {
18362765a47cSis 		if (s2 >= s2last)
18372765a47cSis 			return (0);
18382765a47cSis 		return (-1);
18392765a47cSis 	}
18402765a47cSis 	return (1);
18412765a47cSis }
18422765a47cSis 
18432765a47cSis /*
18442765a47cSis  * The u8_strcmp() function compares two UTF-8 strings quite similar to
18452765a47cSis  * the strcmp(). For the comparison, however, Unicode Normalization specific
18462765a47cSis  * equivalency and Unicode simple case conversion mappings based equivalency
18472765a47cSis  * can be requested and checked against.
18482765a47cSis  */
18492765a47cSis int
u8_strcmp(const char * s1,const char * s2,size_t n,int flag,size_t uv,int * errnum)18502765a47cSis u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
1851*f137b22eSDan McDonald     int *errnum)
18522765a47cSis {
18532765a47cSis 	int f;
18542765a47cSis 	size_t n1;
18552765a47cSis 	size_t n2;
18562765a47cSis 
185785bb5f1dSis 	*errnum = 0;
18582765a47cSis 
18592765a47cSis 	/*
18602765a47cSis 	 * Check on the requested Unicode version, case conversion, and
18612765a47cSis 	 * normalization flag values.
18622765a47cSis 	 */
18632765a47cSis 
18642765a47cSis 	if (uv > U8_UNICODE_LATEST) {
186585bb5f1dSis 		*errnum = ERANGE;
18662765a47cSis 		uv = U8_UNICODE_LATEST;
18672765a47cSis 	}
18682765a47cSis 
18692765a47cSis 	if (flag == 0) {
18702765a47cSis 		flag = U8_STRCMP_CS;
18712765a47cSis 	} else {
18722765a47cSis 		f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
18732765a47cSis 		    U8_STRCMP_CI_LOWER);
18742765a47cSis 		if (f == 0) {
18752765a47cSis 			flag |= U8_STRCMP_CS;
18762765a47cSis 		} else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
18772765a47cSis 		    f != U8_STRCMP_CI_LOWER) {
187885bb5f1dSis 			*errnum = EBADF;
18792765a47cSis 			flag = U8_STRCMP_CS;
18802765a47cSis 		}
18812765a47cSis 
18822765a47cSis 		f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
18832765a47cSis 		if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
18842765a47cSis 		    f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
188585bb5f1dSis 			*errnum = EBADF;
18862765a47cSis 			flag = U8_STRCMP_CS;
18872765a47cSis 		}
18882765a47cSis 	}
18892765a47cSis 
18902765a47cSis 	if (flag == U8_STRCMP_CS) {
18912765a47cSis 		return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
18922765a47cSis 	}
18932765a47cSis 
18942765a47cSis 	n1 = strlen(s1);
18952765a47cSis 	n2 = strlen(s2);
18962765a47cSis 	if (n != 0) {
18972765a47cSis 		if (n < n1)
18982765a47cSis 			n1 = n;
18992765a47cSis 		if (n < n2)
19002765a47cSis 			n2 = n;
19012765a47cSis 	}
19022765a47cSis 
19032765a47cSis 	/*
19042765a47cSis 	 * Simple case conversion can be done much faster and so we do
19052765a47cSis 	 * them separately here.
19062765a47cSis 	 */
19072765a47cSis 	if (flag == U8_STRCMP_CI_UPPER) {
19082765a47cSis 		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
190985bb5f1dSis 		    n1, n2, B_TRUE, errnum));
19102765a47cSis 	} else if (flag == U8_STRCMP_CI_LOWER) {
19112765a47cSis 		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
191285bb5f1dSis 		    n1, n2, B_FALSE, errnum));
19132765a47cSis 	}
19142765a47cSis 
19152765a47cSis 	return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
191685bb5f1dSis 	    flag, errnum));
19172765a47cSis }
19182765a47cSis 
19192765a47cSis size_t
u8_textprep_str(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,size_t unicode_version,int * errnum)19202765a47cSis u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
1921*f137b22eSDan McDonald     int flag, size_t unicode_version, int *errnum)
19222765a47cSis {
19232765a47cSis 	int f;
19242765a47cSis 	int sz;
19252765a47cSis 	uchar_t *ib;
19262765a47cSis 	uchar_t *ibtail;
19272765a47cSis 	uchar_t *ob;
19282765a47cSis 	uchar_t *obtail;
19292765a47cSis 	boolean_t do_not_ignore_null;
19302765a47cSis 	boolean_t do_not_ignore_invalid;
19312765a47cSis 	boolean_t is_it_toupper;
19322765a47cSis 	boolean_t is_it_tolower;
19332765a47cSis 	boolean_t canonical_decomposition;
19342765a47cSis 	boolean_t compatibility_decomposition;
19352765a47cSis 	boolean_t canonical_composition;
19362765a47cSis 	size_t ret_val;
19372765a47cSis 	size_t i;
19382765a47cSis 	size_t j;
19392765a47cSis 	uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
19402765a47cSis 	u8_normalization_states_t state;
19412765a47cSis 
19422765a47cSis 	if (unicode_version > U8_UNICODE_LATEST) {
194385bb5f1dSis 		*errnum = ERANGE;
19442765a47cSis 		return ((size_t)-1);
19452765a47cSis 	}
19462765a47cSis 
19472765a47cSis 	f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
19482765a47cSis 	if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
194985bb5f1dSis 		*errnum = EBADF;
19502765a47cSis 		return ((size_t)-1);
19512765a47cSis 	}
19522765a47cSis 
19532765a47cSis 	f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
19542765a47cSis 	if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
19552765a47cSis 	    f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
195685bb5f1dSis 		*errnum = EBADF;
19572765a47cSis 		return ((size_t)-1);
19582765a47cSis 	}
19592765a47cSis 
19602765a47cSis 	if (inarray == NULL || *inlen == 0)
19612765a47cSis 		return (0);
19622765a47cSis 
19632765a47cSis 	if (outarray == NULL) {
196485bb5f1dSis 		*errnum = E2BIG;
19652765a47cSis 		return ((size_t)-1);
19662765a47cSis 	}
19672765a47cSis 
19682765a47cSis 	ib = (uchar_t *)inarray;
19692765a47cSis 	ob = (uchar_t *)outarray;
19702765a47cSis 	ibtail = ib + *inlen;
19712765a47cSis 	obtail = ob + *outlen;
19722765a47cSis 
19732765a47cSis 	do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
19742765a47cSis 	do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
19752765a47cSis 	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
19762765a47cSis 	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
19772765a47cSis 
19782765a47cSis 	ret_val = 0;
19792765a47cSis 
19802765a47cSis 	/*
19812765a47cSis 	 * If we don't have a normalization flag set, we do the simple case
19822765a47cSis 	 * conversion based text preparation separately below. Text
19832765a47cSis 	 * preparation involving Normalization will be done in the false task
19842765a47cSis 	 * block, again, separately since it will take much more time and
19852765a47cSis 	 * resource than doing simple case conversions.
19862765a47cSis 	 */
19872765a47cSis 	if (f == 0) {
19882765a47cSis 		while (ib < ibtail) {
19892765a47cSis 			if (*ib == '\0' && do_not_ignore_null)
19902765a47cSis 				break;
19912765a47cSis 
19922765a47cSis 			sz = u8_number_of_bytes[*ib];
19932765a47cSis 
19942765a47cSis 			if (sz < 0) {
19952765a47cSis 				if (do_not_ignore_invalid) {
199685bb5f1dSis 					*errnum = EILSEQ;
19972765a47cSis 					ret_val = (size_t)-1;
19982765a47cSis 					break;
19992765a47cSis 				}
20002765a47cSis 
20012765a47cSis 				sz = 1;
20022765a47cSis 				ret_val++;
20032765a47cSis 			}
20042765a47cSis 
20052765a47cSis 			if (sz == 1) {
20062765a47cSis 				if (ob >= obtail) {
200785bb5f1dSis 					*errnum = E2BIG;
20082765a47cSis 					ret_val = (size_t)-1;
20092765a47cSis 					break;
20102765a47cSis 				}
20112765a47cSis 
20122765a47cSis 				if (is_it_toupper)
20132765a47cSis 					*ob = U8_ASCII_TOUPPER(*ib);
20142765a47cSis 				else if (is_it_tolower)
20152765a47cSis 					*ob = U8_ASCII_TOLOWER(*ib);
20162765a47cSis 				else
20172765a47cSis 					*ob = *ib;
20182765a47cSis 				ib++;
20192765a47cSis 				ob++;
20202765a47cSis 			} else if ((ib + sz) > ibtail) {
20212765a47cSis 				if (do_not_ignore_invalid) {
202285bb5f1dSis 					*errnum = EINVAL;
20232765a47cSis 					ret_val = (size_t)-1;
20242765a47cSis 					break;
20252765a47cSis 				}
20262765a47cSis 
20272765a47cSis 				if ((obtail - ob) < (ibtail - ib)) {
202885bb5f1dSis 					*errnum = E2BIG;
20292765a47cSis 					ret_val = (size_t)-1;
20302765a47cSis 					break;
20312765a47cSis 				}
20322765a47cSis 
20332765a47cSis 				/*
20342765a47cSis 				 * We treat the remaining incomplete character
20352765a47cSis 				 * bytes as a character.
20362765a47cSis 				 */
20372765a47cSis 				ret_val++;
20382765a47cSis 
20392765a47cSis 				while (ib < ibtail)
20402765a47cSis 					*ob++ = *ib++;
20412765a47cSis 			} else {
20422765a47cSis 				if (is_it_toupper || is_it_tolower) {
20432765a47cSis 					i = do_case_conv(unicode_version, u8s,
20442765a47cSis 					    ib, sz, is_it_toupper);
20452765a47cSis 
20462765a47cSis 					if ((obtail - ob) < i) {
204785bb5f1dSis 						*errnum = E2BIG;
20482765a47cSis 						ret_val = (size_t)-1;
20492765a47cSis 						break;
20502765a47cSis 					}
20512765a47cSis 
20522765a47cSis 					ib += sz;
20532765a47cSis 
20542765a47cSis 					for (sz = 0; sz < i; sz++)
20552765a47cSis 						*ob++ = u8s[sz];
20562765a47cSis 				} else {
20572765a47cSis 					if ((obtail - ob) < sz) {
205885bb5f1dSis 						*errnum = E2BIG;
20592765a47cSis 						ret_val = (size_t)-1;
20602765a47cSis 						break;
20612765a47cSis 					}
20622765a47cSis 
20632765a47cSis 					for (i = 0; i < sz; i++)
20642765a47cSis 						*ob++ = *ib++;
20652765a47cSis 				}
20662765a47cSis 			}
20672765a47cSis 		}
20682765a47cSis 	} else {
20692765a47cSis 		canonical_decomposition = flag & U8_CANON_DECOMP;
20702765a47cSis 		compatibility_decomposition = flag & U8_COMPAT_DECOMP;
20712765a47cSis 		canonical_composition = flag & U8_CANON_COMP;
20722765a47cSis 
20732765a47cSis 		while (ib < ibtail) {
20742765a47cSis 			if (*ib == '\0' && do_not_ignore_null)
20752765a47cSis 				break;
20762765a47cSis 
20772765a47cSis 			/*
20782765a47cSis 			 * If the current character is a 7-bit ASCII
20792765a47cSis 			 * character and it is the last character, or,
20802765a47cSis 			 * if the current character is a 7-bit ASCII
20812765a47cSis 			 * character and the next character is also a 7-bit
20822765a47cSis 			 * ASCII character, then, we copy over this
20832765a47cSis 			 * character without going through collect_a_seq().
20842765a47cSis 			 *
20852765a47cSis 			 * In any other cases, we need to look further with
20862765a47cSis 			 * the collect_a_seq() function.
20872765a47cSis 			 */
20882765a47cSis 			if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
20892765a47cSis 			    ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
20902765a47cSis 				if (ob >= obtail) {
209185bb5f1dSis 					*errnum = E2BIG;
20922765a47cSis 					ret_val = (size_t)-1;
20932765a47cSis 					break;
20942765a47cSis 				}
20952765a47cSis 
20962765a47cSis 				if (is_it_toupper)
20972765a47cSis 					*ob = U8_ASCII_TOUPPER(*ib);
20982765a47cSis 				else if (is_it_tolower)
20992765a47cSis 					*ob = U8_ASCII_TOLOWER(*ib);
21002765a47cSis 				else
21012765a47cSis 					*ob = *ib;
21022765a47cSis 				ib++;
21032765a47cSis 				ob++;
21042765a47cSis 			} else {
210585bb5f1dSis 				*errnum = 0;
21062765a47cSis 				state = U8_STATE_START;
21072765a47cSis 
21082765a47cSis 				j = collect_a_seq(unicode_version, u8s,
21092765a47cSis 				    &ib, ibtail,
21102765a47cSis 				    is_it_toupper,
21112765a47cSis 				    is_it_tolower,
21122765a47cSis 				    canonical_decomposition,
21132765a47cSis 				    compatibility_decomposition,
21142765a47cSis 				    canonical_composition,
211585bb5f1dSis 				    errnum, &state);
21162765a47cSis 
211785bb5f1dSis 				if (*errnum && do_not_ignore_invalid) {
21182765a47cSis 					ret_val = (size_t)-1;
21192765a47cSis 					break;
21202765a47cSis 				}
21212765a47cSis 
21222765a47cSis 				if ((obtail - ob) < j) {
212385bb5f1dSis 					*errnum = E2BIG;
21242765a47cSis 					ret_val = (size_t)-1;
21252765a47cSis 					break;
21262765a47cSis 				}
21272765a47cSis 
21282765a47cSis 				for (i = 0; i < j; i++)
21292765a47cSis 					*ob++ = u8s[i];
21302765a47cSis 			}
21312765a47cSis 		}
21322765a47cSis 	}
21332765a47cSis 
21342765a47cSis 	*inlen = ibtail - ib;
21352765a47cSis 	*outlen = obtail - ob;
21362765a47cSis 
21372765a47cSis 	return (ret_val);
21382765a47cSis }
2139