1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/sysmacros.h>
29 #include <sys/systm.h>
30 #include <sys/debug.h>
31 #include <sys/kmem.h>
32 #include <sys/sunddi.h>
33 #include <sys/byteorder.h>
34 #include <sys/errno.h>
35 #include <sys/modctl.h>
36 #include <sys/kiconv.h>
37 #include <sys/u8_textprep.h>
38 #include <sys/kiconv_cck_common.h>
39 #include <sys/kiconv_sc.h>
40 #include <sys/kiconv_gb18030_utf8.h>
41 #include <sys/kiconv_gb2312_utf8.h>
42 #include <sys/kiconv_utf8_gb18030.h>
43 #include <sys/kiconv_utf8_gb2312.h>
44 
/*
 * Forward declarations of the per-character conversion helpers that are
 * defined further down in this file.  Each one returns the number of bytes
 * it stored at ob, or -1 when the output does not fit before obtail.
 */
static int8_t gb2312_to_utf8(uchar_t byte1, uchar_t byte2, uchar_t *ob,
	uchar_t *obtail, size_t *ret_val);
static int8_t gbk_to_utf8(uint32_t gbk_val, uchar_t *ob, uchar_t *obtail,
	size_t *ret_val, boolean_t isgbk4);
static int8_t utf8_to_gb2312(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
	uchar_t *ob, uchar_t *obtail, size_t *ret);
static int8_t utf8_to_gbk(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
	uchar_t *ob, uchar_t *obtail, size_t *ret);
static int8_t utf8_to_gb18030(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
	uchar_t *ob, uchar_t *obtail, size_t *ret);
55 
/*
 * Magic ids returned (cast to a pointer) by the open_fr_*() routines below.
 * close_fr_sc() rejects any descriptor whose numeric value is greater than
 * KICONV_SC_MAX_MAGIC_ID.
 */
#define	KICONV_SC_GB18030		(0x01)
#define	KICONV_SC_GBK			(0x02)
#define	KICONV_SC_EUCCN			(0x03)
#define	KICONV_SC_MAX_MAGIC_ID		(0x03)
60 
61 static void *
open_fr_gb18030()62 open_fr_gb18030()
63 {
64 	return ((void *)KICONV_SC_GB18030);
65 }
66 
67 static void *
open_fr_gbk()68 open_fr_gbk()
69 {
70 	return ((void *)KICONV_SC_GBK);
71 }
72 
73 static void *
open_fr_euccn()74 open_fr_euccn()
75 {
76 	return ((void *)KICONV_SC_EUCCN);
77 }
78 
79 static int
close_fr_sc(void * s)80 close_fr_sc(void *s)
81 {
82 	if ((uintptr_t)s > KICONV_SC_MAX_MAGIC_ID)
83 		return (EBADF);
84 
85 	return (0);
86 }
87 
88 /*
89  * Encoding convertor from UTF-8 to GB18030.
90  */
91 size_t
kiconv_to_gb18030(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)92 kiconv_to_gb18030(void *kcd, char **inbuf, size_t *inbytesleft,
93     char **outbuf, size_t *outbytesleft, int *errno)
94 {
95 
96 	return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
97 	    outbytesleft, errno, utf8_to_gb18030);
98 }
99 
100 /*
101  * String based encoding convertor from UTF-8 to GB18030.
102  */
103 size_t
kiconvstr_to_gb18030(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)104 kiconvstr_to_gb18030(char *inarray, size_t *inlen, char *outarray,
105     size_t *outlen, int flag, int *errno)
106 {
107 	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
108 	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_gb18030);
109 }
110 
/*
 * Encoding convertor from GB18030 to UTF-8.
 *
 * Converts the bytes in [*inbuf, *inbuf + *inbytesleft) and stores the UTF-8
 * result at *outbuf, which has room for *outbytesleft bytes.  Before a
 * normal return all four in/out arguments are updated to describe what is
 * left unconsumed and unused.
 *
 * Return: the count kept in ret_val, which gbk_to_utf8() increments for
 * every non-identical conversion.  A NULL or (void *)-1 descriptor yields
 * (size_t)-1 with *errno set to EBADF.  A NULL inbuf or *inbuf is treated
 * as a state reset request and yields 0.
 *
 * NOTE(review): KICONV_SET_ERRNO_AND_BREAK is defined outside this file;
 * presumably it stores the error in *errno, sets ret_val to (size_t)-1 and
 * breaks out of the loop -- confirm in <sys/kiconv_cck_common.h>.
 */
size_t
kiconv_fr_gb18030(void *kcd, char **inbuf, size_t *inbytesleft,
    char **outbuf, size_t *outbytesleft, int *errno)
{
	uchar_t		*ib;
	uchar_t		*ob;
	uchar_t		*ibtail;
	uchar_t		*obtail;
	size_t		ret_val;
	int8_t		sz;
	uint32_t	gb_val;
	boolean_t	isgbk4;

	/* Check on the kiconv code conversion descriptor. */
	if (kcd == NULL || kcd == (void *)-1) {
		*errno = EBADF;
		return ((size_t)-1);
	}

	/* If this is a state reset request, process and return. */
	if (inbuf == NULL || *inbuf == NULL) {
		return (0);
	}

	ret_val = 0;
	ib = (uchar_t *)*inbuf;
	ob = (uchar_t *)*outbuf;
	ibtail = ib + *inbytesleft;
	obtail = ob + *outbytesleft;

	while (ib < ibtail) {
		/* ASCII bytes are copied through unchanged. */
		if (KICONV_IS_ASCII(*ib)) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/*
		 * Issue EILSEQ error if the first byte is not a
		 * valid GB18030 leading byte.
		 */
		if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) {
			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
		}

		/*
		 * The second byte, when present, decides whether this is a
		 * four-byte or a two-byte sequence.
		 */
		isgbk4 = (ibtail - ib < 2) ? B_FALSE :
		    KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1));

		if (isgbk4) {
			/* Incomplete four-byte character at end of input. */
			if (ibtail - ib < 4) {
				KICONV_SET_ERRNO_AND_BREAK(EINVAL);
			}

			if (! (KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1)) &&
			    KICONV_SC_IS_GB18030_3rd_BYTE(*(ib + 2)) &&
			    KICONV_SC_IS_GB18030_4th_BYTE(*(ib + 3)))) {
				KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
			}

			/* Pack the four bytes, first byte most significant. */
			gb_val = (uint32_t)(*ib) << 24 |
			    (uint32_t)(*(ib + 1)) << 16 |
			    (uint32_t)(*(ib + 2)) << 8 | *(ib + 3);
		} else {
			/* Incomplete two-byte character at end of input. */
			if (ibtail - ib < 2) {
				KICONV_SET_ERRNO_AND_BREAK(EINVAL);
			}

			if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) {
				KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
			}

			gb_val = (uint32_t)(*ib) << 8 | *(ib + 1);
		}

		/* A negative size means the output buffer is too small. */
		sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, isgbk4);
		if (sz < 0) {
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ib += isgbk4 ? 4 : 2;
		ob += sz;
	}

	/* Report how far the conversion got in both buffers. */
	*inbuf = (char *)ib;
	*inbytesleft = ibtail - ib;
	*outbuf = (char *)ob;
	*outbytesleft = obtail - ob;

	return (ret_val);
}
207 
/*
 * String based encoding convertor from GB18030 to UTF-8.
 *
 * Converts up to *inlen bytes of inarray into outarray, which has room for
 * *outlen bytes.  On return *inlen and *outlen hold the number of bytes
 * left unconsumed and unused.
 *
 * flag: unless KICONV_IGNORE_NULL is set, conversion stops at the first NUL
 * byte.  With KICONV_REPLACE_INVALID set, an incomplete character at the
 * end of the input consumes the rest of the input and is replaced by the
 * KICONV_UTF8_REPLACEMENT_CHAR bytes, counted in the return value.
 *
 * Return: the count kept in ret_val (replacements made here plus the
 * non-identical conversions counted by gbk_to_utf8()).
 *
 * NOTE(review): KICONV_SET_ERRNO_AND_BREAK and KICONV_SET_ERRNO_WITH_FLAG
 * are defined outside this file.  The REPLACE_INVALID label and the oldib
 * bookkeeping suggest the latter advances ib by its first argument and
 * jumps to REPLACE_INVALID when KICONV_REPLACE_INVALID is set, and otherwise
 * records the error and breaks -- confirm in <sys/kiconv_cck_common.h>.
 */
size_t
kiconvstr_fr_gb18030(char *inarray, size_t *inlen, char *outarray,
    size_t *outlen, int flag, int *errno)
{
	uchar_t		*ib;
	uchar_t		*ob;
	uchar_t		*ibtail;
	uchar_t		*obtail;
	uchar_t		*oldib;
	size_t		ret_val;
	int8_t		sz;
	uint32_t	gb_val;
	boolean_t	isgbk4;
	boolean_t	do_not_ignore_null;

	ret_val = 0;
	ib = (uchar_t *)inarray;
	ob = (uchar_t *)outarray;
	ibtail = ib + *inlen;
	obtail = ob + *outlen;
	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);

	while (ib < ibtail) {
		/* A NUL byte terminates the input unless told otherwise. */
		if (*ib == '\0' && do_not_ignore_null)
			break;

		/* ASCII bytes are copied through unchanged. */
		if (KICONV_IS_ASCII(*ib)) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/* Remember the start so E2BIG can rewind a replacement. */
		oldib = ib;

		if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) {
			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
		}

		/*
		 * The second byte, when present, decides whether this is a
		 * four-byte or a two-byte sequence.
		 */
		isgbk4 = (ibtail - ib < 2) ? B_FALSE :
		    KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1));

		if (isgbk4) {
			/* Incomplete four-byte character at end of input. */
			if (ibtail - ib < 4) {
				if (flag & KICONV_REPLACE_INVALID) {
					ib = ibtail;
					goto REPLACE_INVALID;
				}

				KICONV_SET_ERRNO_AND_BREAK(EINVAL);
			}

			if (! (KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1)) &&
			    KICONV_SC_IS_GB18030_3rd_BYTE(*(ib + 2)) &&
			    KICONV_SC_IS_GB18030_4th_BYTE(*(ib + 3)))) {
				KICONV_SET_ERRNO_WITH_FLAG(4, EILSEQ);
			}

			/* Pack the four bytes, first byte most significant. */
			gb_val = (uint32_t)(*ib) << 24 |
			    (uint32_t)(*(ib + 1)) << 16 |
			    (uint32_t)(*(ib + 2)) << 8 | *(ib + 3);
		} else {
			/* Incomplete two-byte character at end of input. */
			if (ibtail - ib < 2) {
				if (flag & KICONV_REPLACE_INVALID) {
					ib = ibtail;
					goto REPLACE_INVALID;
				}

				KICONV_SET_ERRNO_AND_BREAK(EINVAL);
			}

			if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) {
				KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
			}

			gb_val = (uint32_t)(*ib) << 8 | *(ib + 1);
		}

		/* A negative size means the output buffer is too small. */
		sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, isgbk4);
		if (sz < 0) {
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ib += isgbk4 ? 4 : 2;
		ob += sz;
		continue;

REPLACE_INVALID:
		/*
		 * Only reached by goto.  If the replacement character does
		 * not fit, rewind the input to the start of the offending
		 * sequence before reporting E2BIG.
		 */
		if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
		ret_val++;
	}

	/* Report how much of each buffer is left. */
	*inlen = ibtail - ib;
	*outlen = obtail - ob;

	return (ret_val);
}
317 
318 /*
319  * Encoding convertor from UTF-8 to GBK.
320  */
321 size_t
kiconv_to_gbk(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)322 kiconv_to_gbk(void *kcd, char **inbuf, size_t *inbytesleft,
323     char **outbuf, size_t *outbytesleft, int *errno)
324 {
325 
326 	return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
327 	    outbytesleft, errno, utf8_to_gbk);
328 }
329 
330 /*
331  * String based encoding convertor from UTF-8 to GBK.
332  */
333 size_t
kiconvstr_to_gbk(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)334 kiconvstr_to_gbk(char *inarray, size_t *inlen, char *outarray,
335     size_t *outlen, int flag, int *errno)
336 {
337 	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
338 	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_gbk);
339 }
340 
/*
 * Encoding convertor from GBK to UTF-8.
 *
 * Converts the bytes in [*inbuf, *inbuf + *inbytesleft) and stores the UTF-8
 * result at *outbuf, which has room for *outbytesleft bytes.  Before a
 * normal return all four in/out arguments are updated to describe what is
 * left unconsumed and unused.
 *
 * Return: the count kept in ret_val, which gbk_to_utf8() increments for
 * every non-identical conversion.  A NULL or (void *)-1 descriptor yields
 * (size_t)-1 with *errno set to EBADF.  A NULL inbuf or *inbuf is treated
 * as a state reset request and yields 0.
 *
 * NOTE(review): KICONV_SET_ERRNO_AND_BREAK is defined outside this file;
 * presumably it stores the error in *errno, sets ret_val to (size_t)-1 and
 * breaks out of the loop -- confirm in <sys/kiconv_cck_common.h>.
 */
size_t
kiconv_fr_gbk(void *kcd, char **inbuf, size_t *inbytesleft,
    char **outbuf, size_t *outbytesleft, int *errno)
{
	uchar_t		*ib;
	uchar_t		*ob;
	uchar_t		*ibtail;
	uchar_t		*obtail;
	size_t		ret_val;
	int8_t		sz;
	uint32_t	gb_val;

	/* Check on the kiconv code conversion descriptor. */
	if (kcd == NULL || kcd == (void *)-1) {
		*errno = EBADF;
		return ((size_t)-1);
	}

	/* If this is a state reset request, process and return. */
	if (inbuf == NULL || *inbuf == NULL) {
		return (0);
	}

	ret_val = 0;
	ib = (uchar_t *)*inbuf;
	ob = (uchar_t *)*outbuf;
	ibtail = ib + *inbytesleft;
	obtail = ob + *outbytesleft;

	while (ib < ibtail) {
		/* ASCII bytes are copied through unchanged. */
		if (KICONV_IS_ASCII(*ib)) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/*
		 * Issue EILSEQ error if the first byte is not a
		 * valid GBK leading byte.
		 */
		if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) {
			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
		}

		/*
		 * Issue EINVAL error if input buffer has an incomplete
		 * character at the end of the buffer.
		 */
		if (ibtail - ib < 2) {
			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
		}

		/*
		 * Issue EILSEQ error if the remaining byte is not
		 * a valid GBK byte.
		 */
		if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) {
			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
		}

		/* Now we have a valid GBK character. */
		gb_val = (uint32_t)(*ib) << 8 | *(ib + 1);
		sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, B_FALSE);

		/* A negative size means the output buffer is too small. */
		if (sz < 0) {
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ib += 2;
		ob += sz;
	}

	/* Report how far the conversion got in both buffers. */
	*inbuf = (char *)ib;
	*inbytesleft = ibtail - ib;
	*outbuf = (char *)ob;
	*outbytesleft = obtail - ob;

	return (ret_val);
}
426 
/*
 * String based encoding convertor from GBK to UTF-8.
 *
 * Converts up to *inlen bytes of inarray into outarray, which has room for
 * *outlen bytes.  On return *inlen and *outlen hold the number of bytes
 * left unconsumed and unused.
 *
 * flag: unless KICONV_IGNORE_NULL is set, conversion stops at the first NUL
 * byte.
 *
 * Return: the count kept in ret_val (replacement characters written at
 * REPLACE_INVALID plus the non-identical conversions counted by
 * gbk_to_utf8()).
 *
 * NOTE(review): KICONV_SET_ERRNO_AND_BREAK and KICONV_SET_ERRNO_WITH_FLAG
 * are defined outside this file.  REPLACE_INVALID is not the target of any
 * goto visible here, so the jump presumably lives in
 * KICONV_SET_ERRNO_WITH_FLAG: when KICONV_REPLACE_INVALID is set in flag it
 * would advance ib by its first argument and jump to the label, otherwise
 * record the error and break -- confirm in <sys/kiconv_cck_common.h>.
 */
size_t
kiconvstr_fr_gbk(char *inarray, size_t *inlen, char *outarray,
    size_t *outlen, int flag, int *errno)
{
	uchar_t		*ib;
	uchar_t		*ob;
	uchar_t		*ibtail;
	uchar_t		*obtail;
	uchar_t		*oldib;
	size_t		ret_val;
	int8_t		sz;
	uint32_t	gb_val;
	boolean_t	do_not_ignore_null;

	ret_val = 0;
	ib = (uchar_t *)inarray;
	ob = (uchar_t *)outarray;
	ibtail = ib + *inlen;
	obtail = ob + *outlen;
	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);

	while (ib < ibtail) {
		/* A NUL byte terminates the input unless told otherwise. */
		if (*ib == '\0' && do_not_ignore_null)
			break;

		/* ASCII bytes are copied through unchanged. */
		if (KICONV_IS_ASCII(*ib)) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/* Remember the start so E2BIG can rewind a replacement. */
		oldib = ib;

		/* Not a valid GBK leading byte. */
		if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) {
			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
		}

		/* Incomplete character at the end of the input. */
		if (ibtail - ib < 2) {
			KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL);
		}

		/* Not a valid GBK trailing byte. */
		if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) {
			KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
		}

		/* Pack the two bytes, first byte most significant. */
		gb_val = (uint32_t)(*ib << 8) | *(ib + 1);
		sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, B_FALSE);

		/* A negative size means the output buffer is too small. */
		if (sz < 0) {
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ib += 2;
		ob += sz;
		continue;

REPLACE_INVALID:
		/*
		 * If the replacement character does not fit, rewind the
		 * input to the start of the offending sequence before
		 * reporting E2BIG.
		 */
		if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
		ret_val++;
	}

	/* Report how much of each buffer is left. */
	*inlen = ibtail - ib;
	*outlen = obtail - ob;

	return (ret_val);
}
506 
507 /*
508  * Encoding convertor from UTF-8 to EUC-CN.
509  */
510 size_t
kiconv_to_euccn(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)511 kiconv_to_euccn(void *kcd, char **inbuf, size_t *inbytesleft,
512     char **outbuf, size_t *outbytesleft, int *errno)
513 {
514 	return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
515 	    outbytesleft, errno, utf8_to_gb2312);
516 }
517 
518 /*
519  * String based encoding convertor from UTF-8 to EUC-CN.
520  */
521 size_t
kiconvstr_to_euccn(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)522 kiconvstr_to_euccn(char *inarray, size_t *inlen, char *outarray,
523     size_t *outlen, int flag, int *errno)
524 {
525 	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
526 	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_gb2312);
527 }
528 
/*
 * Encoding convertor from EUC-CN to UTF-8.
 *
 * Converts the bytes in [*inbuf, *inbuf + *inbytesleft) and stores the UTF-8
 * result at *outbuf, which has room for *outbytesleft bytes.  Before a
 * normal return all four in/out arguments are updated to describe what is
 * left unconsumed and unused.
 *
 * Return: the count kept in ret_val, which gb2312_to_utf8() increments
 * whenever it emits the UTF-8 replacement character.  A NULL or (void *)-1
 * descriptor yields (size_t)-1 with *errno set to EBADF.  A NULL inbuf or
 * *inbuf is treated as a state reset request and yields 0.
 *
 * NOTE(review): KICONV_SET_ERRNO_AND_BREAK is defined outside this file;
 * presumably it stores the error in *errno, sets ret_val to (size_t)-1 and
 * breaks out of the loop -- confirm in <sys/kiconv_cck_common.h>.
 */
size_t
kiconv_fr_euccn(void *kcd, char **inbuf, size_t *inbytesleft,
    char **outbuf, size_t *outbytesleft, int *errno)
{
	uchar_t		*ib;
	uchar_t		*ob;
	uchar_t		*ibtail;
	uchar_t		*obtail;
	size_t		ret_val;
	int8_t		sz;

	/* Check on the kiconv code conversion descriptor. */
	if (kcd == NULL || kcd == (void *)-1) {
		*errno = EBADF;
		return ((size_t)-1);
	}

	/* If this is a state reset request, process and return. */
	if (inbuf == NULL || *inbuf == NULL) {
		return (0);
	}

	ret_val = 0;
	ib = (uchar_t *)*inbuf;
	ob = (uchar_t *)*outbuf;
	ibtail = ib + *inbytesleft;
	obtail = ob + *outbytesleft;

	while (ib < ibtail) {
		/* ASCII bytes are copied through unchanged. */
		if (KICONV_IS_ASCII(*ib)) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/*
		 * Issue EILSEQ error if the first byte is not a
		 * valid GB2312 leading byte.
		 */
		if (! KICONV_SC_IS_GB2312_BYTE(*ib)) {
			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
		}

		/*
		 * Issue EINVAL error if input buffer has an incomplete
		 * character at the end of the buffer.
		 */
		if (ibtail - ib < 2) {
			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
		}

		/*
		 * Issue EILSEQ error if the remaining byte is not
		 * a valid GB2312 byte.
		 */
		if (! KICONV_SC_IS_GB2312_BYTE(*(ib + 1))) {
			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
		}

		/* Now we have a valid GB2312 character */
		sz = gb2312_to_utf8(*ib, *(ib + 1), ob, obtail, &ret_val);
		/* A negative size means the output buffer is too small. */
		if (sz < 0) {
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ib += 2;
		ob += sz;
	}

	/* Report how far the conversion got in both buffers. */
	*inbuf = (char *)ib;
	*inbytesleft = ibtail - ib;
	*outbuf = (char *)ob;
	*outbytesleft = obtail - ob;

	return (ret_val);
}
611 
/*
 * String based encoding convertor from EUC-CN to UTF-8.
 *
 * Converts up to *inlen bytes of inarray into outarray, which has room for
 * *outlen bytes.  On return *inlen and *outlen hold the number of bytes
 * left unconsumed and unused.
 *
 * flag: unless KICONV_IGNORE_NULL is set, conversion stops at the first NUL
 * byte.
 *
 * Return: the count kept in ret_val (replacement characters written at
 * REPLACE_INVALID plus those counted by gb2312_to_utf8()).
 *
 * NOTE(review): KICONV_SET_ERRNO_AND_BREAK and KICONV_SET_ERRNO_WITH_FLAG
 * are defined outside this file.  REPLACE_INVALID is not the target of any
 * goto visible here, so the jump presumably lives in
 * KICONV_SET_ERRNO_WITH_FLAG: when KICONV_REPLACE_INVALID is set in flag it
 * would advance ib by its first argument and jump to the label, otherwise
 * record the error and break -- confirm in <sys/kiconv_cck_common.h>.
 */
size_t
kiconvstr_fr_euccn(char *inarray, size_t *inlen, char *outarray,
    size_t *outlen, int flag, int *errno)
{
	uchar_t		*ib;
	uchar_t		*ob;
	uchar_t		*ibtail;
	uchar_t		*obtail;
	uchar_t		*oldib;
	size_t		ret_val;
	int8_t		sz;
	boolean_t	do_not_ignore_null;

	ret_val = 0;
	ib = (uchar_t *)inarray;
	ob = (uchar_t *)outarray;
	ibtail = ib + *inlen;
	obtail = ob + *outlen;
	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);

	while (ib < ibtail) {
		/* A NUL byte terminates the input unless told otherwise. */
		if (*ib == '\0' && do_not_ignore_null)
			break;

		/* ASCII bytes are copied through unchanged. */
		if (KICONV_IS_ASCII(*ib)) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/* Remember the start so E2BIG can rewind a replacement. */
		oldib = ib;

		/* Not a valid GB2312 leading byte. */
		if (! KICONV_SC_IS_GB2312_BYTE(*ib)) {
			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
		}

		/* Incomplete character at the end of the input. */
		if (ibtail - ib < 2) {
			KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL);
		}

		/* Not a valid GB2312 trailing byte. */
		if (! KICONV_SC_IS_GB2312_BYTE(*(ib + 1))) {
			KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
		}

		/* A negative size means the output buffer is too small. */
		sz = gb2312_to_utf8(*ib, *(ib + 1), ob, obtail, &ret_val);
		if (sz < 0) {
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ib += 2;
		ob += sz;
		continue;

REPLACE_INVALID:
		/*
		 * If the replacement character does not fit, rewind the
		 * input to the start of the offending sequence before
		 * reporting E2BIG.
		 */
		if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
		ret_val++;
	}

	/* Report how much of each buffer is left. */
	*inlen = ibtail - ib;
	*outlen = obtail - ob;

	return (ret_val);
}
688 
689 /*
690  * Convert single GB2312 character to UTF-8.
691  * Return: > 0  - Converted successfully
692  *         = -1 - E2BIG
693  */
694 static int8_t
gb2312_to_utf8(uchar_t b1,uchar_t b2,uchar_t * ob,uchar_t * obtail,size_t * ret_val)695 gb2312_to_utf8(uchar_t b1, uchar_t b2, uchar_t *ob, uchar_t *obtail,
696     size_t *ret_val)
697 {
698 	size_t	index;
699 	int8_t	sz;
700 	uchar_t	*u8;
701 
702 	/* index = (b1 - KICONV_EUC_START) * 94 + b2 - KICONV_EUC_START; */
703 	index = b1 * 94 + b2 - 0x3BBF;
704 
705 	if (index >= KICONV_GB2312_UTF8_MAX)
706 		index = KICONV_GB2312_UTF8_MAX - 1;	/* Map to 0xEFBFBD */
707 
708 	u8 = kiconv_gb2312_utf8[index];
709 	sz = u8_number_of_bytes[u8[0]];
710 
711 	if (obtail - ob < sz) {
712 		*ret_val = (size_t)-1;
713 		return (-1);
714 	}
715 
716 	for (index = 0; index < sz; index++)
717 		*ob++ = u8[index];
718 
719 	/*
720 	 * As kiconv_gb2312_utf8 contain muliple KICONV_UTF8_REPLACEMENT_CHAR
721 	 * elements, so need to ckeck more.
722 	 */
723 	if (sz == KICONV_UTF8_REPLACEMENT_CHAR_LEN &&
724 	    u8[0] == KICONV_UTF8_REPLACEMENT_CHAR1 &&
725 	    u8[1] == KICONV_UTF8_REPLACEMENT_CHAR2 &&
726 	    u8[2] == KICONV_UTF8_REPLACEMENT_CHAR3)
727 		(*ret_val)++;
728 
729 	return (sz);
730 }
731 
732 /*
733  * Convert single GB18030 or GBK character to UTF-8.
734  * Return: > 0  - Converted successfully
735  *         = -1 - E2BIG
736  */
737 static int8_t
gbk_to_utf8(uint32_t gbk_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val,boolean_t isgbk4)738 gbk_to_utf8(uint32_t gbk_val, uchar_t *ob, uchar_t *obtail, size_t *ret_val,
739     boolean_t isgbk4)
740 {
741 	size_t	index;
742 	int8_t	sz;
743 	uchar_t	u8array[4];
744 	uchar_t	*u8;
745 
746 	if (isgbk4) {
747 		if (gbk_val >= KICONV_SC_PLANE1_GB18030_START) {
748 			uint32_t	u32;
749 
750 			/*
751 			 * u32 = ((gbk_val >> 24) - 0x90) * 12600 +
752 			 *   (((gbk_val & 0xFF0000) >> 16) - 0x30) * 1260 +
753 			 *   (((gbk_val & 0xFF00) >> 8) - 0x81) * 10 +
754 			 *   (gbk_val & 0xFF - 0x30)+
755 			 *   KICONV_SC_PLANE1_UCS4_START;
756 			 */
757 			u32 = (gbk_val >> 24) * 12600 +
758 			    ((gbk_val & 0xFF0000) >> 16) * 1260 +
759 			    ((gbk_val & 0xFF00) >> 8) * 10 +
760 			    (gbk_val & 0xFF) - 0x1BA0FA;
761 			u8array[0] = (uchar_t)(0xF0 | ((u32 & 0x1C0000) >> 18));
762 			u8array[1] = (uchar_t)(0x80 | ((u32 & 0x03F000) >> 12));
763 			u8array[2] = (uchar_t)(0x80 | ((u32 & 0x000FC0) >> 6));
764 			u8array[3] = (uchar_t)(0x80 | (u32 & 0x00003F));
765 			u8 = u8array;
766 			index = 1;
767 		} else {
768 			index = kiconv_binsearch(gbk_val,
769 			    kiconv_gbk4_utf8, KICONV_GBK4_UTF8_MAX);
770 			u8 = kiconv_gbk4_utf8[index].u8;
771 		}
772 	} else {
773 		index = kiconv_binsearch(gbk_val,
774 		    kiconv_gbk_utf8, KICONV_GBK_UTF8_MAX);
775 		u8 = kiconv_gbk_utf8[index].u8;
776 	}
777 
778 	sz = u8_number_of_bytes[u8[0]];
779 	if (obtail - ob < sz) {
780 		*ret_val = (size_t)-1;
781 		return (-1);
782 	}
783 
784 	if (index == 0)
785 		(*ret_val)++;	/* Non-identical conversion */
786 
787 	for (index = 0; index < sz; index++)
788 		*ob++ = u8[index];
789 
790 	return (sz);
791 }
792 
793 /*
794  * Convert single UTF-8 character to GB18030.
795  * Return: > 0  - Converted successfully
796  *         = -1 - E2BIG
797  */
798 /* ARGSUSED */
799 static int8_t
utf8_to_gb18030(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret)800 utf8_to_gb18030(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
801     uchar_t *ob, uchar_t *obtail, size_t *ret)
802 {
803 	size_t		index;
804 	int8_t		gbklen;
805 	uint32_t	gbkcode;
806 
807 	if (utf8 >= KICONV_SC_PLANE1_UTF8_START) {
808 		/* Four bytes GB18030 [0x90308130, 0xe339fe39] handling. */
809 		uint32_t	u32;
810 
811 		u32 = (((utf8 & 0x07000000) >> 6) | ((utf8 & 0x3F0000) >> 4) |
812 		    ((utf8 & 0x3F00) >> 2) | (utf8 & 0x3F)) -
813 		    KICONV_SC_PLANE1_UCS4_START;
814 		gbkcode = ((u32 / 12600 + 0x90) << 24) |
815 		    (((u32 % 12600) / 1260 + 0x30) << 16) |
816 		    (((u32 % 1260) / 10 + 0x81) << 8) | (u32 % 10 + 0x30);
817 		gbklen = 4;
818 		index = 1;
819 	} else {
820 		index = kiconv_binsearch(utf8, kiconv_utf8_gb18030,
821 		    KICONV_UTF8_GB18030_MAX);
822 		gbkcode = kiconv_utf8_gb18030[index].value;
823 		KICONV_SC_GET_GB_LEN(gbkcode, gbklen);
824 	}
825 
826 	if (obtail - ob < gbklen) {
827 		*ret = (size_t)-1;
828 		return (-1);
829 	}
830 
831 	if (index == 0)
832 		(*ret)++;		/* Non-identical conversion */
833 
834 	if (gbklen == 2) {
835 		*ob++ = (uchar_t)(gbkcode >> 8);
836 	} else if (gbklen == 4) {
837 		*ob++ = (uchar_t)(gbkcode >> 24);
838 		*ob++ = (uchar_t)(gbkcode >> 16);
839 		*ob++ = (uchar_t)(gbkcode >> 8);
840 	}
841 	*ob = (uchar_t)(gbkcode & 0xFF);
842 
843 	return (gbklen);
844 }
845 
846 /*
847  * Convert single UTF-8 character to GBK.
848  * Return: > 0  - Converted successfully
849  *         = -1 - E2BIG
850  */
851 /* ARGSUSED */
852 static int8_t
utf8_to_gbk(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret)853 utf8_to_gbk(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
854     uchar_t *ob, uchar_t *obtail, size_t *ret)
855 {
856 	size_t		index;
857 	int8_t		gbklen;
858 	uint32_t	gbkcode;
859 
860 	index = kiconv_binsearch(utf8, kiconv_utf8_gb18030,
861 	    KICONV_UTF8_GB18030_MAX);
862 	gbkcode = kiconv_utf8_gb18030[index].value;
863 	KICONV_SC_GET_GB_LEN(gbkcode, gbklen);
864 
865 	/* GBK and GB18030 share the same table, so check the length. */
866 	if (gbklen == 4) {
867 		index = 0;
868 		gbkcode = kiconv_utf8_gb18030[index].value;
869 		gbklen = 1;
870 	}
871 
872 	if (obtail - ob < gbklen) {
873 		*ret = (size_t)-1;
874 		return (-1);
875 	}
876 
877 	if (index == 0)
878 		(*ret)++;		/* Non-identical conversion */
879 
880 	if (gbklen > 1)
881 		*ob++ = (uchar_t)(gbkcode >> 8);
882 	*ob = (uchar_t)(gbkcode & 0xFF);
883 
884 	return (gbklen);
885 }
886 
887 /*
888  * Convert single UTF-8 character to GB2312.
889  * Return: > 0  - Converted successfully
890  *         = -1 - E2BIG
891  */
892 /* ARGSUSED */
893 static int8_t
utf8_to_gb2312(uint32_t utf8,uchar_t ** inbuf,uchar_t * intail,uchar_t * ob,uchar_t * obtail,size_t * ret)894 utf8_to_gb2312(uint32_t utf8, uchar_t **inbuf, uchar_t *intail,
895     uchar_t *ob, uchar_t *obtail, size_t *ret)
896 {
897 	size_t		index;
898 	int8_t		gblen;
899 	uint32_t	gbcode;
900 
901 	index = kiconv_binsearch(utf8, kiconv_utf8_gb2312,
902 	    KICONV_UTF8_GB2312_MAX);
903 	gbcode = kiconv_utf8_gb2312[index].value;
904 	gblen = (gbcode <= 0xFF) ? 1 : 2;
905 
906 	if (obtail - ob < gblen) {
907 		*ret = (size_t)-1;
908 		return (-1);
909 	}
910 
911 	if (index == 0)
912 		(*ret)++;
913 
914 	if (gblen > 1)
915 		*ob++ = (uchar_t)(gbcode >> 8);
916 	*ob = (uchar_t)(gbcode & 0xFF);
917 
918 	return (gblen);
919 }
920 
/*
 * Conversion table of this module, one entry per conversion direction.
 * Each entry lists two code set names followed by the open, buffer based
 * convert, close and string based convert routines.  Judging by the
 * routines attached to each entry, the first name is the target code set
 * and the second one the source code set.
 */
static kiconv_ops_t kiconv_sc_ops_tbl[] = {
	{
		"gb18030", "utf-8", kiconv_open_to_cck, kiconv_to_gb18030,
		kiconv_close_to_cck, kiconvstr_to_gb18030
	},
	{
		"utf-8", "gb18030", open_fr_gb18030, kiconv_fr_gb18030,
		close_fr_sc, kiconvstr_fr_gb18030
	},
	{
		"gbk", "utf-8", kiconv_open_to_cck, kiconv_to_gbk,
		kiconv_close_to_cck, kiconvstr_to_gbk
	},
	{
		"utf-8", "gbk", open_fr_gbk, kiconv_fr_gbk,
		close_fr_sc, kiconvstr_fr_gbk
	},
	{
		"euccn", "utf-8", kiconv_open_to_cck, kiconv_to_euccn,
		kiconv_close_to_cck, kiconvstr_to_euccn
	},
	{
		"utf-8", "euccn", open_fr_euccn, kiconv_fr_euccn,
		close_fr_sc, kiconvstr_fr_euccn
	},
};
947 
/*
 * Module description referenced by modlkiconv_sc below: the module name,
 * the number of entries in kiconv_sc_ops_tbl and the table itself.  The
 * remaining members are left zero/NULL; their meaning is given by the
 * kiconv_module_info_t declaration, which is not part of this file.
 */
static kiconv_module_info_t kiconv_sc_info = {
	"kiconv_sc",		/* module name */
	sizeof (kiconv_sc_ops_tbl) / sizeof (kiconv_sc_ops_tbl[0]),
	kiconv_sc_ops_tbl,
	0,
	NULL,
	NULL,
	0
};
957 
/*
 * kiconv module linkage: ties the mod_kiconvops operations and the
 * description string to kiconv_sc_info.
 */
static struct modlkiconv modlkiconv_sc = {
	&mod_kiconvops,
	"kiconv Simplified Chinese module 1.0",
	&kiconv_sc_info
};
963 
/*
 * Module linkage handed to mod_install(), mod_remove() and mod_info() by
 * _init(), _fini() and _info(); modlkiconv_sc is its only linkage entry.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modlkiconv_sc,
	NULL
};
969 
970 int
_init(void)971 _init(void)
972 {
973 	int err;
974 
975 	err = mod_install(&modlinkage);
976 	if (err)
977 		cmn_err(CE_WARN, "kiconv_sc: failed to load kernel module");
978 
979 	return (err);
980 }
981 
982 int
_fini(void)983 _fini(void)
984 {
985 	int err;
986 
987 	/*
988 	 * If this module is being used, then, we cannot remove the module.
989 	 * The following checking will catch pretty much all usual cases.
990 	 *
991 	 * Any remaining will be catached by the kiconv_unregister_module()
992 	 * during mod_remove() at below.
993 	 */
994 	if (kiconv_module_ref_count(KICONV_MODULE_ID_SC))
995 		return (EBUSY);
996 
997 	err = mod_remove(&modlinkage);
998 	if (err)
999 		cmn_err(CE_WARN, "kiconv_sc: failed to remove kernel module");
1000 
1001 	return (err);
1002 }
1003 
1004 int
_info(struct modinfo * modinfop)1005 _info(struct modinfo *modinfop)
1006 {
1007 	return (mod_info(&modlinkage, modinfop));
1008 }
1009