1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/sysmacros.h>
29 #include <sys/systm.h>
30 #include <sys/debug.h>
31 #include <sys/kmem.h>
32 #include <sys/sunddi.h>
33 #include <sys/byteorder.h>
34 #include <sys/errno.h>
35 #include <sys/u8_textprep.h>
36 #include <sys/kiconv.h>
37 #include <sys/kiconv_cck_common.h>
38 
39 /*
40  * Common kiconv_open method for UTF-8 -> CCK conversion.
41  */
42 void *
kiconv_open_to_cck()43 kiconv_open_to_cck()
44 {
45 	kiconv_state_t st;
46 
47 	st = (kiconv_state_t)kmem_alloc(sizeof (kiconv_state_data_t), KM_SLEEP);
48 
49 	st->bom_processed = 0;
50 
51 	return ((void *)st);
52 }
53 
54 /*
55  * Common kiconv_close method for UTF-8 -> CCK conversion.
56  */
57 int
kiconv_close_to_cck(void * kcd)58 kiconv_close_to_cck(void *kcd)
59 {
60 	if (! kcd || kcd == (void *)-1)
61 		return (EBADF);
62 
63 	kmem_free(kcd, sizeof (kiconv_state_data_t));
64 
65 	return (0);
66 }
67 
/*
 * Common routine to convert UTF-8 sequence to CCK legal character sequence.
 *
 * kcd is the conversion descriptor; NULL or (void *)-1 is rejected with
 * *errno = EBADF and a return value of (size_t)-1.  A NULL inbuf or a NULL
 * *inbuf is a state reset request: bom_processed in the descriptor is
 * cleared and 0 is returned without touching the other arguments.
 *
 * Otherwise up to *inbytesleft bytes are read from *inbuf and converted
 * output is stored at *outbuf, which has room for *outbytesleft bytes.
 * Single-byte characters are copied unchanged.  The bytes of each validated
 * multi-byte character are packed into a uint32_t (first byte in the most
 * significant position) and handed to ptr_utf8tocck, which is expected to
 * store the converted bytes at ob and return how many it stored; a negative
 * return is reported here as E2BIG.
 *
 * On the way out *inbuf, *inbytesleft, *outbuf and *outbytesleft are updated
 * to describe the unconsumed input and the unused output space.  The return
 * value is ret_val: it starts at 0, is passed by address to ptr_utf8tocck
 * (presumably so it can count non-identical conversions -- confirm against
 * the callbacks), and is (size_t)-1 when an error was recorded in *errno.
 *
 * NOTE(review): KICONV_CHECK_UTF8_BOM and KICONV_SET_ERRNO_AND_BREAK are
 * defined outside this file.  From their use here they appear to reference
 * the locals kcd, errno and ret_val by name, and the latter appears to end
 * with a "break" out of the while loop; verify before renaming locals or
 * nesting these macros inside another loop or switch.
 */
size_t
kiconv_utf8_to_cck(void *kcd, char **inbuf, size_t *inbytesleft,
    char **outbuf, size_t *outbytesleft, int *errno,
    kiconv_utf8tocck_t ptr_utf8tocck)
{
	uchar_t		*ib;
	uchar_t		*ob;
	uchar_t		*ibtail;
	uchar_t		*obtail;
	uchar_t		*oldib;
	size_t		ret_val;
	size_t		i;		/* temp variable in for loop */
	uint32_t	u8;
	int8_t		sz;

	/* Check on the kiconv code conversion descriptor. */
	if (! kcd || kcd == (void *)-1) {
		*errno = EBADF;
		return ((size_t)-1);
	}

	/* If this is a state reset request, process and return. */
	if (! inbuf || !(*inbuf)) {
		((kiconv_state_t)kcd)->bom_processed = 0;
		return (0);
	}

	ret_val = 0;
	ib = (uchar_t *)*inbuf;
	ob = (uchar_t *)*outbuf;
	ibtail = ib + *inbytesleft;
	obtail = ob + *outbytesleft;

	/*
	 * NOTE(review): presumably skips a leading UTF-8 byte order mark and
	 * records that fact in the descriptor -- confirm in the macro.
	 */
	KICONV_CHECK_UTF8_BOM(ib, ibtail);

	while (ib < ibtail) {
		/* Expected length of this character, keyed by its first byte. */
		sz = u8_number_of_bytes[*ib];

		/*
		 * If it is a 7-bit ASCII character, we don't need to
		 * process further and we just copy the character over.
		 *
		 * If not, we connect the character bytes up to four bytes,
		 * validate the bytes, and binary search for the corresponding
		 * table. If we find it from the mapping table, we put that
		 * into the output buffer; otherwise, we put a replacement
		 * character instead as a non-identical conversion.
		 */
		if (sz == 1) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/*
		 * Issue EILSEQ error if the first byte is an
		 * invalid UTF-8 character leading byte.
		 */
		if (sz <= 0) {
			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
		}

		/*
		 * Issue EINVAL error if input buffer has an incomplete
		 * character at the end of the buffer.
		 */
		if (ibtail - ib < sz) {
			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
		}

		/*
		 * We collect UTF-8 character bytes and also check if this
		 * is a valid UTF-8 character without any bogus bytes based
		 * on the latest UTF-8 binary representation.
		 *
		 * oldib remembers the start of the character so that ib can
		 * be rewound if the character turns out to be unusable.
		 */
		oldib = ib;
		u8 = *ib++;

		/* The second byte gets a check that depends on the first. */
		if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib, u8))
			goto ILLEGAL_CHAR_PROCESS;
		u8 = (u8 << 8) | *ib++;

		/* Any further bytes must be in the range 0x80 - 0xbf. */
		for (i = 2; i < sz; i++) {
			if (*ib < 0x80 || *ib > 0xbf) {
				/*
				 * Also the target of the second-byte check
				 * above: report EILSEQ with the input
				 * pointer rewound to the offending character.
				 */
ILLEGAL_CHAR_PROCESS:
				*errno = EILSEQ;
				ret_val = (size_t)-1;
				ib = oldib;
				goto ILLEGAL_CHAR_ERR;
			}

			u8 = (u8 << 8) | *ib++;
		}

		/* Now we have a valid UTF-8 character. */
		sz = ptr_utf8tocck(u8, &ib, ibtail, ob, obtail, &ret_val);
		if (sz < 0) {
			/* Leave the unconverted character in the input. */
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ob += sz;
	}

	/* Normal loop exit and the error paths all report progress here. */
ILLEGAL_CHAR_ERR:
	*inbuf = (char *)ib;
	*inbytesleft = ibtail - ib;
	*outbuf = (char *)ob;
	*outbytesleft = obtail - ob;

	return (ret_val);
}
186 
/*
 * Common routine to convert a UTF-8 string of *inlen bytes at ib into the
 * buffer of *outlen bytes at ob in one call, with no conversion descriptor.
 *
 * flag is tested for two bits:
 *   KICONV_IGNORE_NULL     - when clear, conversion stops at the first
 *                            '\0' byte in the input; when set, '\0' is
 *                            converted like any other single-byte character.
 *   KICONV_REPLACE_INVALID - when set, an invalid or incomplete character
 *                            is replaced by KICONV_ASCII_REPLACEMENT_CHAR
 *                            and counted in the return value instead of
 *                            ending the conversion with an error.
 *
 * ptr_utf8tocck receives each validated multi-byte character packed into a
 * uint32_t and is expected to store the converted bytes at ob and return
 * how many it stored; a negative return is reported here as E2BIG.
 *
 * ib and ob are passed by value, so progress is reported only through
 * *inlen (input bytes not consumed) and *outlen (output bytes not used).
 * The return value is ret_val: it starts at 0, is incremented for every
 * replacement made here, is passed by address to ptr_utf8tocck, and is
 * (size_t)-1 when an error was recorded in *errno.
 *
 * NOTE(review): KICONV_CHECK_UTF8_BOM_WITHOUT_STATE,
 * KICONV_SET_ERRNO_AND_BREAK and KICONV_SET_ERRNO_WITH_FLAG are defined
 * outside this file.  From their use here they appear to reference the
 * locals errno, ret_val, flag and ib and the label REPLACE_INVALID by name,
 * and to "break" out of the while loop; verify before renaming anything.
 */
size_t
kiconvstr_utf8_to_cck(uchar_t *ib, size_t *inlen, uchar_t *ob, size_t *outlen,
    int flag, int *errno, kiconv_utf8tocck_t ptr_utf8tocck)
{
	uchar_t		*ibtail;
	uchar_t		*obtail;
	uchar_t		*oldib;
	size_t		ret_val;
	size_t		i;		/* temp variable in for loop */
	uint32_t	u8;
	int8_t		sz;
	boolean_t	do_not_ignore_null;

	ret_val = 0;
	ibtail = ib + *inlen;
	obtail = ob + *outlen;
	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);

	/*
	 * NOTE(review): presumably skips a leading UTF-8 byte order mark --
	 * confirm in the macro.
	 */
	KICONV_CHECK_UTF8_BOM_WITHOUT_STATE(ib, ibtail);

	while (ib < ibtail) {
		/* Unless told otherwise, '\0' terminates the input. */
		if (*ib == '\0' && do_not_ignore_null)
			break;

		/* Expected length of this character, keyed by its first byte. */
		sz = u8_number_of_bytes[*ib];

		/* Single-byte characters are copied over unchanged. */
		if (sz == 1) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/*
		 * Remember where the character starts; every path that gives
		 * up on it rewinds or repositions ib relative to this point.
		 */
		oldib = ib;

		/*
		 * Invalid leading byte.  NOTE(review): the macro presumably
		 * skips one byte and jumps to REPLACE_INVALID when
		 * KICONV_REPLACE_INVALID is set, and reports EILSEQ otherwise.
		 */
		if (sz <= 0) {
			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
		}

		/*
		 * Incomplete character at the end of the input: either
		 * swallow the remaining bytes and emit one replacement, or
		 * report EINVAL.
		 */
		if (ibtail - ib < sz) {
			if (flag & KICONV_REPLACE_INVALID) {
				ib = ibtail;
				goto REPLACE_INVALID;
			}

			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
		}

		u8 = *ib++;

		/* The second byte gets a check that depends on the first. */
		if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib, u8))
			goto ILLEGAL_CHAR_PROCESS;
		u8 = (u8 << 8) | *ib++;

		/* Any further bytes must be in the range 0x80 - 0xbf. */
		for (i = 2; i < sz; i++) {
			if (*ib < 0x80 || *ib > 0xbf) {
				/*
				 * Also the target of the second-byte check
				 * above.  In replacement mode all sz bytes of
				 * the character are skipped; otherwise EILSEQ
				 * is reported with ib back at its start.
				 */
ILLEGAL_CHAR_PROCESS:
				if (flag & KICONV_REPLACE_INVALID) {
					ib = oldib + sz;
					goto REPLACE_INVALID;
				}

				*errno = EILSEQ;
				ret_val = (size_t)-1;
				ib = oldib;
				goto ILLEGAL_CHAR_ERR;
			}

			u8 = (u8 << 8) | *ib++;
		}

		/* Now we get a valid character encoded in UTF-8. */
		sz = ptr_utf8tocck(u8, &ib, ibtail, ob, obtail, &ret_val);
		if (sz < 0) {
			/* Leave the unconverted character in the input. */
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ob += sz;
		continue;

		/*
		 * Reached only by goto: emit one replacement character for
		 * the bytes already skipped and count it.  If there is no
		 * room, rewind to the start of the bad character first.
		 */
REPLACE_INVALID:
		if (ob >= obtail) {
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		*ob++ = KICONV_ASCII_REPLACEMENT_CHAR;
		ret_val++;
	}

	/* Normal loop exit and the error paths all report progress here. */
ILLEGAL_CHAR_ERR:
	*inlen = ibtail - ib;
	*outlen = obtail - ob;

	return (ret_val);
}
286 
287 /*
288  * Search key in tbl[0] <= tbl[1] <= ... <= tbl[n-1].  Return 0 if not found.
289  * tbl[0] is a special element for non-identical conversion.
290  */
291 size_t
kiconv_binsearch(uint32_t key,void * tbl,size_t nitems)292 kiconv_binsearch(uint32_t key, void *tbl, size_t nitems)
293 {
294 	size_t low, high, mid;
295 	kiconv_table_t *table;
296 
297 	low = 1;
298 	high = nitems - 1;
299 	table = (kiconv_table_t *)tbl;
300 
301 	while (low <= high) {
302 		mid = (low + high) / 2;
303 
304 		if (key < table[mid].key)
305 			high = mid - 1;
306 		else if (key > table[mid].key)
307 			low = mid + 1;
308 		else
309 			return (mid);
310 	}
311 
312 	return (0);
313 }
314