1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright(c) 1998 Sun Microsystems, Inc.
23  * All right reserved.
24  */
25 
#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <sys/types.h>
#include <public_struc.h>
#include <unicode_gb2312.h>
#include <unicode_cns11643_CN.h>
#ifdef DEBUG
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
#include "common_defs.h"
38 
/* Control bytes written to the output stream. */
#define	SI	0x0f	/* shift-in: written before ASCII when leaving double-byte mode */
#define	SO	0x0e	/* shift-out: written before the first double-byte character */
#define SS2 0x4e	/* not referenced in this file */
#define SS3 0x4f	/* not referenced in this file */
#define	ESC	0x1b	/* first byte of the "ESC $ ) x" designation sequence */
#define	MSB	0x80	/* high bit; clear for an ASCII input byte */
#define MSB_OFF 0x7f	/* not referenced in this file */

/*
 * Two-byte code emitted (under the 'A' designation) in place of a
 * character that cannot be mapped.
 */
#define	NON_ID_CHAR1	0x21
#define NON_ID_CHAR2	0x75
49 
/*
 * Per-descriptor conversion state; allocated by _icv_open() and released
 * by _icv_close().
 */
typedef struct _icv_state {
	short	_ustate;	/* UTF-8 input state, one of enum _USTATE */
	short	_istate;	/* output shift state: IN after SI, OUT after SO */
	short	_gstate;	/* designated set (enum _GSTATE); _icv_open() stores -1 */
	char	_keepc[6];	/* input bytes of the character being collected; only [0]..[2] are written */
	int		_errno;		/* error code recorded by the converter; non-zero ends the conversion loop */
} _iconv_st;
57 
/*
 * UTF-8 input states:
 *	U0	between characters
 *	U1	2-byte sequence, waiting for its 2nd byte
 *	U2, U3	3-byte sequence, waiting for its 2nd / 3rd byte
 *	U4	sequence complete, character ready to be written
 *	U5-U7	4-byte sequence, waiting for its 2nd / 3rd / 4th byte
 */
enum	_USTATE	{ U0, U1, U2, U3, U4, U5, U6, U7 };
/* Output shift state: IN = single-byte (after SI), OUT = double-byte (after SO). */
enum	_ISTATE	{ IN, OUT };
/*
 * Currently designated double-byte set: G0 follows "ESC $ ) A"; G1 and G2
 * follow "ESC $ ) G" and "ESC $ ) H" respectively.
 */
enum	_GSTATE	{ G0, G1, G2 };

/* Look up key in table[0..tab_len-1]; returns the matching index or -1. */
int binary_search(unsigned long key, table_t *table, int tab_len);
63 
64 /*
65  *	Open; called from iconv_open()
66  */
_icv_open()67 void * _icv_open() {
68 	_iconv_st * st;
69 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
70 		errno = ENOMEM;
71 		return (void *)-1;
72 	}
73 
74 	st->_ustate = U0;
75 	st->_istate = IN;
76 	st->_gstate = -1;
77 	st->_errno = 0;
78 
79 	return (void *)st;
80 }
81 
82 /*
83  *	Close; called from iconv_close()
84  */
85 
_icv_close(_iconv_st * st)86 void _icv_close(_iconv_st *st) {
87 	if (st == NULL)
88 		errno = EBADF;
89 	else
90 		free(st);
91 }
92 
93 /*
94  *	Actual conversion; called from iconv()
95  */
96 
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)97 size_t _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
98 					char **outbuf, size_t *outbytesleft) {
99 	char c1 = '\0', c2 = '\0';
100 	int n = 0;
101 	unsigned long key;
102 	unsigned long gbk;
103 	int index;
104 	short new_state;
105 
106 #ifdef DEBUG
107 	fprintf(stderr, "in length is %d\toutlength is %d\n",
108 			*inbytesleft, *outbytesleft);
109 #endif
110 	if (st == NULL) {
111 		errno = EBADF;
112 		return ((size_t)-1);
113 	}
114 
115 	if (inbuf == NULL || *inbuf == NULL) {	/* Reset request. */
116 		st->_ustate = U0;
117 		st->_istate = IN;
118 		st->_gstate = G0;
119 		st->_errno = 0;
120 		return ((size_t)0);
121 	}
122 
123 	errno = 0;
124 	while (*inbytesleft > 0 && *outbytesleft > 0) {
125 
126 	        uchar_t  first_byte;
127 
128 		switch (st->_ustate) {
129 			case U0:
130 				if ((**inbuf & MSB) == 0) {	/* ASCII */
131 					if (st->_istate == OUT) {
132 						if (*outbytesleft < 2) {
133 #ifdef DEBUG
134 							fprintf(stderr, "11111 outbytesleft is %d\n", *outbytesleft);
135 #endif
136 							errno = E2BIG;
137 							return (size_t) -1;
138 						}
139 						st->_istate = IN;
140 						**outbuf = SI;
141 						(*outbuf)++;
142 						(*outbytesleft)--;
143 					}
144 					if (*outbytesleft < 1) {
145 #ifdef DEBUG
146 						fprintf(stderr, "22222 outbytesleft is %d\n", *outbytesleft);
147 #endif
148 						errno = E2BIG;
149 						return (size_t) -1;
150 					}
151 					**outbuf = **inbuf;
152 					(*outbuf)++;
153 					(*outbytesleft)--;
154 				} else {	/* Chinese charactor */
155 					if ((**inbuf & 0xe0) == 0xc0) {	/* 2-byte unicode 0xc2..0xdf */
156 
157 					   /* invalid sequence if the first char is either 0xc0 or 0xc1 */
158 					   if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
159 					        st->_errno = errno = EILSEQ;
160 					   else {
161 						st->_ustate = U1;
162 						st->_keepc[0] = **inbuf;
163 					   }
164 					} else if ((**inbuf & 0xf0) == 0xe0) {	/* 3-bytes unicode */
165 						st->_ustate = U2;
166 						st->_keepc[0] = **inbuf;
167 					} else {
168 
169 					   /* four bytes of UTF-8 sequences */
170 					   if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
171 						st->_errno = errno = EILSEQ;
172 					   else
173 					     {
174 						st->_ustate = U5;
175 						st->_keepc[0] = **inbuf;
176 					     }
177 #ifdef DEBUG
178 						fprintf(stderr, "state = %d, keepc is %x\n", st->_ustate, st->_keepc[0]);
179 #endif
180 					}
181 				}
182 				break;
183 
184 			case U1:	/* 2-byte unicode */
185 				if ((**inbuf & 0xc0) == 0x80) {	/* 2nd byte is 1xxxxxxx */
186 					st->_ustate = U4;
187 					st->_keepc[1] = **inbuf;
188 					c1 = (st->_keepc[0] & 0x1c)>>2;
189 					c2 = ((st->_keepc[0] & 0x03) << 6) | \
190 							(st->_keepc[1] & 0x3f);
191 					continue;
192 				} else {
193 					st->_errno = errno = EILSEQ;
194 #ifdef DEBUG
195 					fprintf(stderr, "state = %d, keepc is %x\n", st->_ustate, st->_keepc[0]);
196 #endif
197 				}
198 				break;
199 
200 			case U2:	/* 3-byte unicode - 2nd byte */
201 		                first_byte = st->_keepc[0];
202 
203 		                /* if the first byte is 0xed, it is illegal sequence if the second
204 				 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
205 				 */
206 		                if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
207 				    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
208 		                        st->_errno = errno = EILSEQ;
209 		                else {
210 					st->_ustate = U3;
211 					st->_keepc[1] = **inbuf;
212 				}
213 				break;
214 
215 			case U3:	/* 3-byte unicode - 3th byte */
216 				if ((**inbuf & 0xc0) == 0x80) {
217 					st->_ustate = U4;
218 					st->_keepc[2] = **inbuf;
219 					c1 = ((st->_keepc[0] & 0x0f) << 4) | \
220 							((st->_keepc[1] & 0x3c) >> 2);
221 					c2 = ((st->_keepc[1] & 0x03) << 6) | \
222 							(st->_keepc[2] & 0x3f);
223 					continue;
224 				} else {
225 					st->_errno = errno = EILSEQ;
226 #ifdef DEBUG
227 					fprintf(stderr, "state = %d, keepc is %x\n", st->_ustate, st->_keepc[0]);
228 #endif
229 				}
230 				break;
231 
232 			case U4:	/* Generate iso2022 sequence */
233 				key = ((c1 & 0xff) << 8) | (c2 & 0xff);
234 
235 		                /* 0xFFFE and 0xFFFF should not be allowed */
236 		                if ( key == 0xFFFE || key == 0xFFFF ) {
237 				        st->_errno = errno = EILSEQ;
238 				        break;
239 				}
240 
241 				if ((index = binary_search(key, unicode_gb_tab, UNICODEMAX)) != -1) {	/* GB code set */
242 					gbk = unicode_gb_tab[index].value;
243 					if (st->_gstate != G0) {
244 						if (*outbytesleft < 7) {
245 #ifdef DEBUG
246 							fprintf(stderr, "33333 outbytesleft is %d\n", *outbytesleft);
247 #endif
248 							errno = E2BIG;
249 							return ((size_t)-1);
250 						}
251 						st->_istate = OUT;
252 						st->_gstate = G0;
253 						**outbuf = ESC;
254 						*(*outbuf + 1) = '$';
255 						*(*outbuf + 2) = ')';
256 						*(*outbuf + 3) = 'A';
257 						*(*outbuf + 4) = SO;
258 						*(*outbuf + 5) = (gbk & 0xff00) >> 8;
259 						*(*outbuf + 6) = gbk & 0xff;
260 						n = 7;
261 					} else if (st->_istate == IN) {
262 						if (*outbytesleft < 3) {
263 #ifdef DEBUG
264 							fprintf(stderr, "44444outbytesleft is %d\n", *outbytesleft);
265 #endif
266 							errno = E2BIG;
267 							return ((size_t) -1);
268 						}
269 						st->_istate = OUT;
270 						**(outbuf) = SO;
271 						*(*outbuf + 1) = (gbk & 0xff00) >> 8;
272 						*(*outbuf + 2) = gbk & 0xff;
273 						n = 3;
274 					} else {
275 					        if ( *outbytesleft < 2 ) {
276 						   errno = E2BIG;
277 						   return ((size_t)-1);
278 					        }
279 
280 						**outbuf = (gbk & 0xff00) >> 8;
281 						*(*outbuf + 1) = gbk & 0xff;
282 						n = 2;
283 					}
284 				} else if ((index = binary_search(key, utf_cns_tab, MAX_UTF_NUM)) != -1) {
285 					gbk = utf_cns_tab[index].value;
286 					new_state = ((gbk >> 16 ) & 0xff) - 0x20;
287 					if (new_state == G2 || new_state == G1) {
288 						if (st->_gstate != new_state) {
289 							if (*outbytesleft < 7) {
290 #ifdef DEBUG
291 								fprintf(stderr, "55555 outbytesleft is %d\n", *outbytesleft);
292 #endif
293 								errno = E2BIG;
294 								return (size_t) -1;
295 							}
296 							**outbuf = ESC;
297 							*(*outbuf + 1) = '$';
298 							*(*outbuf + 2) = ')';
299 							*(*outbuf + 3) = 'G' + new_state - 1;
300 							st->_istate = OUT;
301 							st->_gstate = new_state;
302 							*(*outbuf + 4) = SO;
303 							*(*outbuf + 5) = (gbk & 0xff00) >> 8;
304 							*(*outbuf + 6) = gbk & 0xff;
305 							n = 7;
306 						} else if (st->_istate == IN) {
307 							if (*outbytesleft < 3) {
308 #ifdef DEBUG
309 								fprintf(stderr, "66666 outbytesleft is %d\n", *outbytesleft);
310 #endif
311 								errno = E2BIG;
312 								return (size_t) -1;
313 							}
314 							st->_istate = OUT;
315 							**outbuf = SO;
316 							*(*outbuf + 1) = (gbk & 0xff00) >> 8;
317 							*(*outbuf + 2) = gbk & 0xff;
318 							n = 3;
319 						} else {
320 							if (*outbytesleft < 2) {
321 #ifdef DEBUG
322 								fprintf(stderr, "77777 outbytesleft is %d\n", *outbytesleft);
323 #endif
324 								errno = E2BIG;
325 								return (size_t) -1;
326 							}
327 							**outbuf = (gbk & 0xff00) >> 8;
328 							*(*outbuf + 1) = gbk & 0xff;
329 							n = 2;
330 						}
331 					} else if (new_state > G2) {
332 						if (st->_gstate != G0) {
333 							if (*outbytesleft < 7) {
334 #ifdef DEBUG
335 								fprintf(stderr, " 888888 outbytesleft is %d\n", *outbytesleft);
336 #endif
337 								errno = E2BIG;
338 								return (size_t) -1;
339 							}
340 							st->_gstate = G0;
341 							st->_istate = OUT;
342 							**outbuf = ESC;
343 							*(*outbuf + 1) = '$';
344 							*(*outbuf + 2) = ')';
345 							*(*outbuf + 3) = 'A';
346 							*(*outbuf + 4) = SO;
347 							*(*outbuf + 5) = NON_ID_CHAR1;
348 							*(*outbuf + 6) = NON_ID_CHAR2;
349 							n = 7;
350 						} else if (st->_istate == IN) {
351 							if (*outbytesleft < 3) {
352 #ifdef DEBUG
353 								fprintf(stderr, "99999 outbytesleft is %d\n", *outbytesleft);
354 #endif
355 								errno = E2BIG;
356 								return (size_t) -1;
357 							}
358 							st->_gstate = G0;
359 							st->_istate = OUT;
360 							**outbuf = SO;
361 							*(*outbuf + 1) = NON_ID_CHAR1;
362 							*(*outbuf + 2) = NON_ID_CHAR2;
363 							n = 3;
364 						} else {
365 							if (*outbytesleft < 2) {
366 #ifdef DEBUG
367 								fprintf(stderr, "aaaaaaoutbytesleft is %d\n", *outbytesleft);
368 #endif
369 								errno = E2BIG;
370 								return (size_t) -1;
371 							}
372 							**outbuf = NON_ID_CHAR1;
373 							*(*outbuf + 1) = NON_ID_CHAR2;
374 							n = 2;
375 						}
376 					}
377 				} else {	/* Non-GB & Non-Big5 */
378 					if (st->_gstate != G0) {
379 						if (*outbytesleft < 7) {
380 							errno = E2BIG;
381 							return (size_t) -1;
382 						}
383 						st->_gstate = G0;
384 						st->_istate = OUT;
385 						**outbuf = ESC;
386 						*(*outbuf + 1) = '$';
387 						*(*outbuf + 2) = ')';
388 						*(*outbuf + 3) = 'A';
389 						*(*outbuf + 4) = SO;
390 						*(*outbuf + 5) = NON_ID_CHAR1;
391 						*(*outbuf + 6) = NON_ID_CHAR2;
392 						n = 7;
393 					} else if (st->_istate == IN) {
394 						if(*outbytesleft < 3) {
395 							errno = E2BIG;
396 							return (size_t) -1;
397 						}
398 						st->_istate = OUT;
399 						st->_gstate = G0;
400 						**outbuf = SO;
401 						*(*outbuf + 1) = NON_ID_CHAR1;
402 						*(*outbuf + 2) = NON_ID_CHAR2;
403 						n = 3;
404 					} else {
405 					        /* add sanity check to avoid segment error */
406 						if (*outbytesleft < 2) {
407 							errno = E2BIG;
408 							return (size_t) -1;
409 						}
410 						**outbuf = NON_ID_CHAR1;
411 						*(*outbuf + 1) = NON_ID_CHAR2;
412 						n = 2;
413 					}
414 				}
415 /*
416 					n = gen_undef(st, *outbuf, *outbytesleft);
417 					fprintf(stderr, "gen_undef return %d\n", n );
418 				}
419  */
420 				if (n > 0) {
421 					(*outbuf) += n;
422 					(*outbytesleft) -= n;
423 				} else {
424 #ifdef DEBUG
425 					fprintf(stderr, "bbbbb outbytesleft is %d\n", *outbytesleft);
426 #endif
427 					errno = E2BIG;
428 					return ((size_t)-1);
429 				}
430 				st->_ustate = U0;
431 				break;
432 
433 		        case U5:
434 		                first_byte = st->_keepc[0];
435 
436 		                /* if the first byte is 0xf0, it is illegal sequence if
437 				 * the second one is between 0x80 and 0x8f
438 				 * for Four-Byte UTF: U+10000..U+10FFFF
439 				 */
440 		                if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
441 				    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
442 		                    st->_errno = errno = EILSEQ;
443 		                else {
444 				   st->_ustate = U6;
445 				   st->_keepc[1] = **inbuf;
446 				}
447 		                break;
448 		        case U6:
449 		                if ((**inbuf & 0xc0) == 0x80) /* 0x80..0xbf */
450 		                  {
451 				     st->_ustate = U7;
452 				     st->_keepc[2] = **inbuf;
453 				  }
454 		                else
455 		                     st->_errno = errno = EILSEQ;
456 		                break;
457 		        case U7:
458 		                if ((**inbuf & 0xc0) == 0x80) /* 0x80..0xbf */
459 		                  {  /* skip it to simplify */
460 				     st->_ustate = U0;
461 				  }
462 		                else
463 		                     st->_errno = errno = EILSEQ;
464 		                break;
465 			default:
466 				st->_errno = errno = EILSEQ;
467 #ifdef DEBUG
468 				fprintf(stderr, "WHY HERE\n");
469 #endif
470 				st->_ustate = U0;	/* reset state */
471 				break;
472 		}	/* end of switc */
473 		if (st->_errno)
474 			break;
475 		(*inbuf)++;
476 		(*inbytesleft)--;
477 	}
478 
479         if (errno)
480 		return ((size_t)-1);
481 
482         if (*inbytesleft == 0 && st->_ustate != U0)
483          {
484 	    errno = EINVAL;
485 	    return ((size_t) -1);
486          }
487 
488 	if (*inbytesleft > 0 && *outbytesleft == 0) {
489 #ifdef DEBUG
490 		fprintf(stderr, "cccccc outbytesleft is %d\n", *outbytesleft);
491 #endif
492 		errno = E2BIG;
493 		return ((size_t)-1);
494 	}
495 	return ((size_t)(*inbytesleft));
496 }
497 
498 /*
499  *	gen_undef(); Called when a char non-gb and non-big5 found.
500  */
gen_undef(_iconv_st * st,char * outbuf,int bytes)501 int gen_undef(_iconv_st * st, char * outbuf, int bytes) {
502 	if (st->_gstate != G0) {
503 		if (bytes < 7) {
504 #ifdef DEBUG
505 			fprintf(stderr, "in gen outbytesleft is %d\n", bytes);
506 #endif
507 			errno = st->_errno = E2BIG;
508 			return -1;
509 		}
510 		st->_gstate = G0;
511 		st->_istate = OUT;
512 		*outbuf = ESC;
513 		*(outbuf + 1) = '$';
514 		*(outbuf + 2) = ')';
515 		*(outbuf + 3) = 'A';
516 		*(outbuf + 4) = SO;
517 		*(outbuf + 5) = NON_ID_CHAR1;
518 		*(outbuf + 6) = NON_ID_CHAR2;
519 		return 7;
520 	}
521 	if (st->_istate == IN) {
522 		if (bytes < 3) {
523 #ifdef DEBUG
524 			fprintf(stderr, "in gen outbytesleft is %d\n", bytes);
525 #endif
526 			errno = st->_errno = E2BIG;
527 			return -1;
528 		}
529 		st->_istate = OUT;
530 		*outbuf = SO;
531 		*(outbuf + 1) = NON_ID_CHAR1;
532 		*(outbuf + 2) = NON_ID_CHAR2;
533 		return 3;
534 	}
535 	if (bytes < 2) {
536 #ifdef DEBUG
537 		fprintf(stderr, "in gen outbytesleft is %d\n", bytes);
538 #endif
539 		errno = st->_errno = E2BIG;
540 		return -1;
541 	}
542 	*outbuf = NON_ID_CHAR1;
543 	*(outbuf + 1) = NON_ID_CHAR2;
544 	return 2;
545 }
546 
547 /*
548  *	binary_search();
549  */
binary_search(unsigned long key,table_t * table,int tab_len)550 int binary_search(unsigned long key, table_t *table, int tab_len) {
551 	int i, low, high;
552 
553 	for (low = 0, high = tab_len-1; low < high; ) {
554 		if (table[low].key == key)
555 			return low;
556 		if (table[high].key == key)
557 			return high;
558 		i = (low + high) >> 1;
559 		if (table[i].key == key)
560 			return i;
561 		if (table[i].key < key)
562 			low = i + 1;
563 		else
564 			high = i - 1;
565 	}
566 	return -1;
567 }
568 
569 #ifdef DEBUG
main(int argc,char ** argv)570 main(int argc, char ** argv) {
571 	_iconv_st	* st;
572 	int fd;
573 	char * in_str;
574 	char * out_str;
575 	char * tmp_in;
576 	char * tmp_out;
577 	unsigned int in_len;
578 	unsigned int out_len;
579 
580 	struct stat s;
581 
582 	if (argc < 2) {
583 		fprintf(stderr, "Usage: %s input\n", argv[0]);
584 		exit(-1);
585 	}
586 
587 	if (stat(argv[1], &s) == -1) {
588 		perror("stat");
589 		exit(-1);
590 	}
591 
592 	if ((fd = open(argv[1], O_RDONLY)) == -1) {
593 		perror("open");
594 		exit(-1);
595 	}
596 
597 	tmp_in = in_str = (char *) malloc(1024);
598 	tmp_out = out_str = (char *) malloc(1024);
599 	if (!in_str || !out_str) {
600 		perror("malloc");
601 		exit(-3);
602 		free(in_str);
603 		free(out_str);
604 	}
605 	in_len = s.st_size;
606 	out_len = s.st_size << 2;
607 	st = _icv_open();
608 	if (st == (_iconv_st *) -1) {
609 		perror("_icv_open");
610 		free(in_str);
611 		free(out_str);
612 		exit(-3);
613 	}
614 
615 	while (1) {
616 	in_len = 1024;
617 	out_len = 1024;
618 	in_str = tmp_in;
619 	out_str = tmp_out;
620 
621 	if (!read(fd, in_str, in_len))
622 		exit(0);
623 
624 	if (_icv_iconv(st, &in_str, &in_len, &out_str, &out_len) == -1) {
625 		perror("icv_iconv");
626 		fprintf(stderr, "\ninbytesleft = %d\n", in_len);
627 		exit(-2);
628 	}
629 	fprintf(stderr, "Result is in len %d, out len %d\n", in_len,
630 	out_len);
631 	if (write(1, tmp_out, 4096 - out_len) == -1) {
632 		perror("write");
633 	}
634 	}	/* end of while */
635 
636 	free(tmp_in);
637 	free(tmp_out);
638 	close(fd);
639 	_icv_close(st);
640 }
641 #endif
642