1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright(c) 1998 Sun Microsystems, Inc.
23 * All right reserved.
24 */
25
#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <sys/types.h>
#include <public_struc.h>
#include <unicode_gb2312.h>
#include <unicode_cns11643_CN.h>
#ifdef DEBUG
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#endif
#include "common_defs.h"
38
/*
 * Control bytes and masks used while producing the ISO-2022 output.
 */
#define	SI		0x0f	/* shift-in: written before ASCII follows */
#define	SO		0x0e	/* shift-out: written before 2-byte chars */
#define	SS2		0x4e	/* not referenced by the conversion code below */
#define	SS3		0x4f	/* not referenced by the conversion code below */
#define	ESC		0x1b	/* first byte of a designation sequence */
#define	MSB		0x80	/* set in every non-ASCII input byte */
#define	MSB_OFF		0x7f	/* not referenced by the conversion code below */

/*
 * Two-byte code written in place of a character that has no usable mapping.
 */
#define	NON_ID_CHAR1	0x21
#define	NON_ID_CHAR2	0x75
49
/*
 * Conversion descriptor state: allocated by _icv_open(), updated by
 * _icv_iconv() and released by _icv_close().
 */
typedef struct _icv_state {
	short	_ustate;	/* UTF-8 input state, one of enum _USTATE */
	short	_istate;	/* output shift state, one of enum _ISTATE */
	short	_gstate;	/* designated set (enum _GSTATE); -1 = none */
	char	_keepc[6];	/* input bytes of the character in progress */
	int	_errno;		/* error code recorded during conversion */
} _iconv_st;
57
/*
 * UTF-8 input states.
 */
enum _USTATE {
	U0,	/* between characters */
	U1,	/* lead byte of a 2-byte sequence seen */
	U2,	/* lead byte of a 3-byte sequence seen */
	U3,	/* first two bytes of a 3-byte sequence seen */
	U4,	/* sequence complete; output is generated in this state */
	U5,	/* lead byte of a 4-byte sequence seen */
	U6,	/* first two bytes of a 4-byte sequence seen */
	U7	/* first three bytes of a 4-byte sequence seen */
};

/*
 * Output shift state: IN once SI has been written, OUT once SO has.
 */
enum _ISTATE {
	IN,
	OUT
};

/*
 * Character set currently designated in the output: G0 is selected with
 * final byte 'A', G1 with 'G' and G2 with 'H'.
 */
enum _GSTATE {
	G0,
	G1,
	G2
};
61
62 int binary_search(unsigned long key, table_t *table, int tab_len);
63
64 /*
65 * Open; called from iconv_open()
66 */
_icv_open()67 void * _icv_open() {
68 _iconv_st * st;
69 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
70 errno = ENOMEM;
71 return (void *)-1;
72 }
73
74 st->_ustate = U0;
75 st->_istate = IN;
76 st->_gstate = -1;
77 st->_errno = 0;
78
79 return (void *)st;
80 }
81
82 /*
83 * Close; called from iconv_close()
84 */
85
/*
 * Release a conversion descriptor obtained from _icv_open().
 * A NULL descriptor is reported by setting errno to EBADF.
 */
void
_icv_close(_iconv_st *st)
{
	if (st != NULL) {
		free(st);
		return;
	}

	errno = EBADF;
}
92
93 /*
94 * Actual conversion; called from iconv()
95 */
96
/*
 * Write one two-byte character to `out`, preceded by whatever control
 * bytes the current descriptor state requires:
 *
 *   - when the designated set differs from `gstate`:
 *	ESC $ ) <final> SO b1 b2			(7 bytes)
 *   - otherwise, when the output is shifted in:
 *	SO b1 b2					(3 bytes)
 *   - otherwise:
 *	b1 b2						(2 bytes)
 *
 * Returns the number of bytes written.  When `out_left` is too small the
 * function returns -1 and changes neither the output nor the state.
 */
static int
icv_put_dbcs(_iconv_st *st, char *out, size_t out_left, short gstate,
    char final, char b1, char b2)
{
	char *p = out;
	size_t need;

	if (st->_gstate != gstate)
		need = 7;
	else if (st->_istate == IN)
		need = 3;
	else
		need = 2;

	if (out_left < need)
		return (-1);

	if (need == 7) {
		*p++ = ESC;
		*p++ = '$';
		*p++ = ')';
		*p++ = final;
		st->_gstate = gstate;
	}
	if (need != 2) {
		*p++ = SO;
		st->_istate = OUT;
	}
	*p++ = b1;
	*p++ = b2;

	return ((int)need);
}

/*
 * Convert UTF-8 input to ISO-2022 output.
 *
 * st		descriptor from _icv_open()
 * inbuf	in/out: next input byte; NULL or pointing at NULL requests
 *		a reset of the descriptor
 * inbytesleft	in/out: input bytes remaining
 * outbuf	in/out: next free output byte
 * outbytesleft	in/out: output bytes remaining
 *
 * ASCII is copied through (preceded by SI when the output is shifted out).
 * Two- and three-byte sequences are looked up in unicode_gb_tab and then
 * in utf_cns_tab; a character found in neither, or mapped to a CNS entry
 * other than G1/G2, is written as NON_ID_CHAR1 NON_ID_CHAR2.  Four-byte
 * sequences are validated and dropped without producing output.
 *
 * Returns 0 on success, or (size_t)-1 with errno set to:
 *	EBADF	st is NULL
 *	EILSEQ	invalid input; the partial character is discarded and
 *		*inbuf is left on the offending byte
 *	E2BIG	not enough output space; state is kept, so the call can be
 *		repeated with a larger buffer and the same *inbuf
 *	EINVAL	input ended inside a character; state is kept, so the call
 *		can be repeated with the following input
 */
size_t
_icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
    char **outbuf, size_t *outbytesleft)
{
	unsigned long key;
	unsigned long gbk;
	int index;
	int n;
	short new_state;

	if (st == NULL) {
		errno = EBADF;
		return ((size_t)-1);
	}

	if (inbuf == NULL || *inbuf == NULL) {	/* Reset request. */
		/*
		 * Same state as a fresh descriptor: _gstate is -1 (not G0)
		 * so that the next two-byte character re-emits its
		 * designation.  No SI byte is written to outbuf here.
		 */
		st->_ustate = U0;
		st->_istate = IN;
		st->_gstate = -1;
		st->_errno = 0;
		return ((size_t)0);
	}

#ifdef DEBUG
	fprintf(stderr, "in length is %lu\toutlength is %lu\n",
	    (unsigned long)*inbytesleft, (unsigned long)*outbytesleft);
#endif

	/*
	 * An error recorded by a previous call must not leak into this one,
	 * otherwise the loop below would stop after every byte.
	 */
	errno = 0;
	st->_errno = 0;

	while (*inbytesleft > 0 && *outbytesleft > 0) {

		uchar_t first_byte;

		switch (st->_ustate) {
		case U0:
			if ((**inbuf & MSB) == 0) {	/* ASCII */
				if (st->_istate == OUT) {
					/* room for SI plus the character */
					if (*outbytesleft < 2) {
#ifdef DEBUG
						fprintf(stderr,
						    "11111 outbytesleft is %lu\n",
						    (unsigned long)*outbytesleft);
#endif
						errno = E2BIG;
						return ((size_t)-1);
					}
					st->_istate = IN;
					**outbuf = SI;
					(*outbuf)++;
					(*outbytesleft)--;
				}
				**outbuf = **inbuf;
				(*outbuf)++;
				(*outbytesleft)--;
			} else if ((**inbuf & 0xe0) == 0xc0) {
				/*
				 * 2-byte sequence, 0xc2..0xdf; the lookup
				 * table rejects 0xc0 and 0xc1.
				 */
				if (number_of_bytes_in_utf8_char[
				    ((uchar_t)**inbuf)] ==
				    ICV_TYPE_ILLEGAL_CHAR) {
					st->_errno = errno = EILSEQ;
				} else {
					st->_ustate = U1;
					st->_keepc[0] = **inbuf;
				}
			} else if ((**inbuf & 0xf0) == 0xe0) {
				/* 3-byte sequence */
				st->_ustate = U2;
				st->_keepc[0] = **inbuf;
			} else {
				/* 4-byte sequence, or an illegal lead byte */
				if (number_of_bytes_in_utf8_char[
				    ((uchar_t)**inbuf)] ==
				    ICV_TYPE_ILLEGAL_CHAR) {
					st->_errno = errno = EILSEQ;
				} else {
					st->_ustate = U5;
					st->_keepc[0] = **inbuf;
				}
#ifdef DEBUG
				fprintf(stderr, "state = %d, keepc is %x\n",
				    st->_ustate, st->_keepc[0]);
#endif
			}
			break;

		case U1:	/* 2-byte sequence - 2nd byte */
			if ((**inbuf & 0xc0) == 0x80) {
				st->_ustate = U4;
				st->_keepc[1] = **inbuf;
				/* generate output before consuming the byte */
				continue;
			}
			st->_errno = errno = EILSEQ;
#ifdef DEBUG
			fprintf(stderr, "state = %d, keepc is %x\n",
			    st->_ustate, st->_keepc[0]);
#endif
			break;

		case U2:	/* 3-byte sequence - 2nd byte */
			first_byte = st->_keepc[0];

			/*
			 * The per-lead-byte range tables reject ill-formed
			 * second bytes (e.g. 0xa0..0xbf after 0xed, the
			 * surrogate range).
			 */
			if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
			    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte]) {
				st->_errno = errno = EILSEQ;
			} else {
				st->_ustate = U3;
				st->_keepc[1] = **inbuf;
			}
			break;

		case U3:	/* 3-byte sequence - 3rd byte */
			if ((**inbuf & 0xc0) == 0x80) {
				st->_ustate = U4;
				st->_keepc[2] = **inbuf;
				/* generate output before consuming the byte */
				continue;
			}
			st->_errno = errno = EILSEQ;
#ifdef DEBUG
			fprintf(stderr, "state = %d, keepc is %x\n",
			    st->_ustate, st->_keepc[0]);
#endif
			break;

		case U4:	/* Generate iso2022 sequence */
			/*
			 * The code point is rebuilt from the saved bytes
			 * rather than carried in locals, so that a call
			 * resumed in this state after E2BIG still has it.
			 */
			if ((st->_keepc[0] & 0xe0) == 0xc0) {
				key = ((unsigned long)(st->_keepc[0] & 0x1f) << 6) |
				    (unsigned long)(st->_keepc[1] & 0x3f);
			} else {
				key = ((unsigned long)(st->_keepc[0] & 0x0f) << 12) |
				    ((unsigned long)(st->_keepc[1] & 0x3f) << 6) |
				    (unsigned long)(st->_keepc[2] & 0x3f);
			}

			/* 0xFFFE and 0xFFFF should not be allowed */
			if (key == 0xFFFE || key == 0xFFFF) {
				st->_errno = errno = EILSEQ;
				break;
			}

			if ((index = binary_search(key, unicode_gb_tab,
			    UNICODEMAX)) != -1) {		/* GB code set */
				gbk = unicode_gb_tab[index].value;
				n = icv_put_dbcs(st, *outbuf, *outbytesleft,
				    G0, 'A', (char)((gbk >> 8) & 0xff),
				    (char)(gbk & 0xff));
			} else if ((index = binary_search(key, utf_cns_tab,
			    MAX_UTF_NUM)) != -1) {
				gbk = utf_cns_tab[index].value;
				new_state = ((gbk >> 16) & 0xff) - 0x20;
				if (new_state == G1 || new_state == G2) {
					n = icv_put_dbcs(st, *outbuf,
					    *outbytesleft, new_state,
					    (char)('G' + new_state - 1),
					    (char)((gbk >> 8) & 0xff),
					    (char)(gbk & 0xff));
				} else {
					/* any other value: not representable */
					n = icv_put_dbcs(st, *outbuf,
					    *outbytesleft, G0, 'A',
					    NON_ID_CHAR1, NON_ID_CHAR2);
				}
			} else {		/* Non-GB & Non-Big5 */
				n = icv_put_dbcs(st, *outbuf, *outbytesleft,
				    G0, 'A', NON_ID_CHAR1, NON_ID_CHAR2);
			}

			if (n < 0) {
				/*
				 * State stays U4 and *inbuf stays on the last
				 * byte of the character, so a later call can
				 * resume here.
				 */
#ifdef DEBUG
				fprintf(stderr, "outbytesleft is %lu\n",
				    (unsigned long)*outbytesleft);
#endif
				errno = E2BIG;
				return ((size_t)-1);
			}
			(*outbuf) += n;
			(*outbytesleft) -= n;
			st->_ustate = U0;
			break;

		case U5:	/* 4-byte sequence - 2nd byte */
			first_byte = st->_keepc[0];

			/*
			 * The range tables reject second bytes outside
			 * U+10000..U+10FFFF (e.g. 0x80..0x8f after 0xf0).
			 */
			if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
			    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte]) {
				st->_errno = errno = EILSEQ;
			} else {
				st->_ustate = U6;
				st->_keepc[1] = **inbuf;
			}
			break;

		case U6:	/* 4-byte sequence - 3rd byte */
			if ((**inbuf & 0xc0) == 0x80) {	/* 0x80..0xbf */
				st->_ustate = U7;
				st->_keepc[2] = **inbuf;
			} else {
				st->_errno = errno = EILSEQ;
			}
			break;

		case U7:	/* 4-byte sequence - 4th byte */
			if ((**inbuf & 0xc0) == 0x80) {	/* 0x80..0xbf */
				/* skip it to simplify: nothing is written */
				st->_ustate = U0;
			} else {
				st->_errno = errno = EILSEQ;
			}
			break;

		default:
			st->_errno = errno = EILSEQ;
#ifdef DEBUG
			fprintf(stderr, "WHY HERE\n");
#endif
			break;
		}	/* end of switch */

		if (st->_errno) {
			/*
			 * Discard the partial character so that the caller
			 * can continue from a clean state.
			 */
			st->_ustate = U0;
			break;
		}
		(*inbuf)++;
		(*inbytesleft)--;
	}

	if (st->_errno) {
		errno = st->_errno;
		return ((size_t)-1);
	}

	if (*inbytesleft == 0 && st->_ustate != U0) {
		/* input ended in the middle of a character */
		errno = EINVAL;
		return ((size_t)-1);
	}

	if (*inbytesleft > 0 && *outbytesleft == 0) {
#ifdef DEBUG
		fprintf(stderr, "cccccc outbytesleft is %lu\n",
		    (unsigned long)*outbytesleft);
#endif
		errno = E2BIG;
		return ((size_t)-1);
	}

	/* all input consumed */
	return ((size_t)0);
}
497
498 /*
499 * gen_undef(); Called when a char non-gb and non-big5 found.
500 */
/*
 * Write the "unidentified character" code NON_ID_CHAR1 NON_ID_CHAR2 to
 * outbuf, preceded by the control bytes the current state requires:
 * the ESC $ ) A designation plus SO when G0 is not the designated set
 * (7 bytes in total), SO alone when the output is shifted in (3 bytes),
 * or nothing (2 bytes).
 *
 * bytes is the space available in outbuf.  Returns the number of bytes
 * written, or -1 with errno and st->_errno set to E2BIG when the space
 * is insufficient; in that case nothing is written and the state is
 * unchanged.
 */
int
gen_undef(_iconv_st *st, char *outbuf, int bytes)
{
	char *p = outbuf;
	int need;

	if (st->_gstate != G0)
		need = 7;
	else if (st->_istate == IN)
		need = 3;
	else
		need = 2;

	if (bytes < need) {
#ifdef DEBUG
		fprintf(stderr, "in gen outbytesleft is %d\n", bytes);
#endif
		errno = st->_errno = E2BIG;
		return (-1);
	}

	if (need == 7) {
		st->_gstate = G0;
		*p++ = ESC;
		*p++ = '$';
		*p++ = ')';
		*p++ = 'A';
	}
	if (need != 2) {
		st->_istate = OUT;
		*p++ = SO;
	}
	*p++ = NON_ID_CHAR1;
	*p++ = NON_ID_CHAR2;

	return (need);
}
546
547 /*
548 * binary_search();
549 */
/*
 * Look up `key` in `table`, an array of `tab_len` entries.
 * Assumes the entries are sorted by ascending key - TODO confirm against
 * the table definitions.
 *
 * Returns the index of a matching entry, or -1 when there is none
 * (including when tab_len is 0 or negative).
 */
int
binary_search(unsigned long key, table_t *table, int tab_len)
{
	int low = 0;
	int high = tab_len - 1;
	int mid;

	/*
	 * "<=" rather than "<": the range [low, high] is inclusive, so a
	 * range holding a single entry (e.g. tab_len == 1) must still be
	 * examined.
	 */
	while (low <= high) {
		/* a hit at either end of the range returns immediately */
		if (table[low].key == key)
			return (low);
		if (table[high].key == key)
			return (high);

		/* written this way so that low + high cannot overflow */
		mid = low + ((high - low) >> 1);
		if (table[mid].key == key)
			return (mid);

		if (table[mid].key < key)
			low = mid + 1;
		else
			high = mid - 1;
	}

	return (-1);
}
568
569 #ifdef DEBUG
main(int argc,char ** argv)570 main(int argc, char ** argv) {
571 _iconv_st * st;
572 int fd;
573 char * in_str;
574 char * out_str;
575 char * tmp_in;
576 char * tmp_out;
577 unsigned int in_len;
578 unsigned int out_len;
579
580 struct stat s;
581
582 if (argc < 2) {
583 fprintf(stderr, "Usage: %s input\n", argv[0]);
584 exit(-1);
585 }
586
587 if (stat(argv[1], &s) == -1) {
588 perror("stat");
589 exit(-1);
590 }
591
592 if ((fd = open(argv[1], O_RDONLY)) == -1) {
593 perror("open");
594 exit(-1);
595 }
596
597 tmp_in = in_str = (char *) malloc(1024);
598 tmp_out = out_str = (char *) malloc(1024);
599 if (!in_str || !out_str) {
600 perror("malloc");
601 exit(-3);
602 free(in_str);
603 free(out_str);
604 }
605 in_len = s.st_size;
606 out_len = s.st_size << 2;
607 st = _icv_open();
608 if (st == (_iconv_st *) -1) {
609 perror("_icv_open");
610 free(in_str);
611 free(out_str);
612 exit(-3);
613 }
614
615 while (1) {
616 in_len = 1024;
617 out_len = 1024;
618 in_str = tmp_in;
619 out_str = tmp_out;
620
621 if (!read(fd, in_str, in_len))
622 exit(0);
623
624 if (_icv_iconv(st, &in_str, &in_len, &out_str, &out_len) == -1) {
625 perror("icv_iconv");
626 fprintf(stderr, "\ninbytesleft = %d\n", in_len);
627 exit(-2);
628 }
629 fprintf(stderr, "Result is in len %d, out len %d\n", in_len,
630 out_len);
631 if (write(1, tmp_out, 4096 - out_len) == -1) {
632 perror("write");
633 }
634 } /* end of while */
635
636 free(tmp_in);
637 free(tmp_out);
638 close(fd);
639 _icv_close(st);
640 }
641 #endif
642