1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright(c) 1998 Sun Microsystems, Inc.
23  */
24 
25 #include <stdio.h>
26 #include <errno.h>
27 #include <stdlib.h>
28 #include <sys/types.h>
29 #include <unicode_gb2312.h>
30 #include "common_defs.h"
31 
/* Control characters written to the output stream. */
#define SI	0x0f	/* shift in: following bytes are single-byte ASCII */
#define SO	0x0e	/* shift out: following bytes are two-byte characters */
#define ESC	0x1b	/* first byte of the ESC $ ) A designation sequence */

/* High bit of a byte; clear for ASCII input. */
#define MSB	0x80

/* Written (twice) in place of a character that cannot be converted. */
#define NON_ID_CHAR '?'
38 
/*
 * Per-descriptor conversion state, kept across calls so that a multi-byte
 * UTF-8 character may be split over several input buffers.
 */
typedef struct _icv_state {
	short	_ustate;	/* UTF-8 decoder position (enum _USTATE) */
	short	_istate;	/* output shift state (enum _ISTATE) */
	short	_gstate;	/* designation state (enum _GSTATE) */
	char	_cbuf[3];	/* bytes of the pending UTF-8 character */
} _iconv_st;
45 
/* Position inside the UTF-8 character currently being decoded. */
enum	_USTATE	{
	U0,	/* between characters */
	U1,	/* two-byte sequence: expecting the 2nd byte */
	U2,	/* three-byte sequence: expecting the 2nd byte */
	U3,	/* three-byte sequence: expecting the 3rd byte */
	U4,	/* four-byte sequence: expecting the 2nd byte */
	U5,	/* four-byte sequence: expecting the 3rd byte */
	U6	/* four-byte sequence: expecting the 4th byte */
};

/* Output shift state: IN after SI (ASCII), OUT after SO (two-byte). */
enum	_ISTATE	{
	IN,
	OUT
};

/* G0 until the ESC $ ) A designation has been written, G1 afterwards. */
enum	_GSTATE	{
	G0,
	G1
};
49 
/*
 * Converts the 16-bit Unicode value (in_byte1 << 8 | in_byte2) to two output
 * bytes stored at buf.  Returns 2 on success, 0 when buflen is less than 2,
 * and -1 for the values 0xFFFE and 0xFFFF.
 */
int unicode_to_iso(char in_byte1, char in_byte2, char *buf, int buflen);
51 
52 /*
53  * Open; called from iconv_open()
54  */
55 void *
_icv_open()56 _icv_open()
57 {
58 	_iconv_st *st;
59 
60 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
61 		errno = ENOMEM;
62 		return ((void *) -1);
63 	}
64 
65 	st->_ustate = U0;
66 	st->_istate = IN;
67 	st->_gstate = G0;
68 
69 	return ((void *)st);
70 }
71 
72 
73 /*
74  * Close; called from iconv_close()
75  */
76 void
_icv_close(_iconv_st * st)77 _icv_close(_iconv_st *st)
78 {
79 	if (st == NULL)
80 		errno = EBADF;
81 	else
82 		free(st);
83 }
84 
85 
86 /*
87  * Actual conversion; called from iconv()
88  */
89 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)90 _icv_iconv(_iconv_st *st, char **inbuf, size_t*inbytesleft,
91 			char **outbuf, size_t*outbytesleft)
92 {
93 	char	c1, c2;
94 	int	n;
95 
96 	if (st == NULL) {
97 		errno = EBADF;
98 		return ((size_t)-1);
99 	}
100 
101 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
102 		st->_ustate = U0;
103 		st->_istate = IN;
104 		st->_gstate = G0;
105 		return ((size_t)0);
106 	}
107 
108 	errno = 0;
109 
110 	while (*inbytesleft > 0 && *outbytesleft > 0) {
111 
112 	    uchar_t  first_byte;
113 
114 	    switch (st->_ustate) {
115 	    case U0:
116 		if ((**inbuf & MSB) == 0) {	/* ASCII */
117 		    if (st->_istate == OUT) {
118 			st->_istate = IN;
119 			**outbuf = SI;
120 			(*outbuf)++, (*outbytesleft)--;
121 			if (*outbytesleft <= 0) {
122 			    errno = E2BIG;
123 			    return ((size_t)-1);
124 			}
125 		    }
126 		    **outbuf = **inbuf;
127 		    (*outbuf)++, (*outbytesleft)--;
128 	        } else {
129 		    if ((**inbuf & 0xe0) == 0xc0) { /* 0xc2..0xdf */
130 
131 		        /* invalid sequence if the first char is either 0xc0 or 0xc1 */
132 		        if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
133 			 {
134 			    errno = EILSEQ;
135 		            break;
136 			 }
137 		        else
138 			 {
139 			     st->_ustate = U1;
140 			     st->_cbuf[0] = **inbuf;
141 			 }
142 		    } else if ((**inbuf & 0xf0) == 0xe0) { /* 0xe0..0xef */
143 			st->_ustate = U2;
144 			st->_cbuf[0] = **inbuf;
145 		    } else {
146 		        /* four bytes of UTF-8 sequences */
147 		        if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
148 			  {
149 			    errno = EILSEQ;
150 		            break;
151 		           }
152 		        else {
153 			   st->_ustate = U4;
154 			   st->_cbuf[0] = **inbuf;
155 			}
156 		    }
157 		    if (st->_istate == IN) {
158 			if (st->_gstate == G0) {
159 			    if (*outbytesleft < 4) {
160 				errno = E2BIG;
161 				return ((size_t)-1);
162 			    }
163 			    st->_gstate = G1;
164 			    **outbuf = ESC;
165 			    *(*outbuf+1) = '$';
166 			    *(*outbuf+2) = ')';
167 			    *(*outbuf+3) = 'A';
168 			    (*outbuf) += 4, (*outbytesleft) -= 4;
169 			    if (*outbytesleft <= 0) {
170 				errno = E2BIG;
171 				return ((size_t)-1);
172 			    }
173 			}
174 			st->_istate = OUT;
175 			**outbuf = SO;
176 			(*outbuf)++, (*outbytesleft)--;
177 		    }
178 		}
179 		break;
180 	    case U1:
181 		if ((**inbuf & 0xc0) == MSB) {	/* two-byte UTF */
182 		    c1 = (st->_cbuf[0]&0x1c)>>2;
183 		    c2 = ((st->_cbuf[0]&0x03)<<6) | ((**inbuf)&0x3f);
184 		    n = unicode_to_iso(c1, c2, *outbuf, *outbytesleft);
185 		    if (n > 0) {
186 			(*outbuf) += n, (*outbytesleft) -= n;
187 		    } else {
188 			errno = E2BIG;
189 			return ((size_t)-1);
190 		    }
191 		    st->_ustate = U0;
192 		} else {
193 		    errno = EILSEQ;
194 		}
195 		break;
196 	    case U2:
197 
198 	        first_byte = st->_cbuf[0];
199 
200 	        /* if the first byte is 0xed, it is illegal sequence if the second
201 		 * one is one between 0xa0 and 0xbf because surrogate section is ill-formed
202 		 */
203 	        if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
204 		    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
205 		     errno = EILSEQ;
206 	        else {
207 		     st->_ustate = U3;
208 		     st->_cbuf[1] = **inbuf;
209 		}
210 		break;
211 	    case U3:
212 		if ((**inbuf & 0xc0) == MSB) {	/* three-byte UTF */
213 		    c1 = ((st->_cbuf[0]&0x0f)<<4) | ((st->_cbuf[1]&0x3c)>>2);
214 		    c2 = ((st->_cbuf[1]&0x03)<<6) | ((**inbuf)&0x3f);
215 		    n = unicode_to_iso(c1, c2, *outbuf, *outbytesleft);
216 		    if (n > 0) {
217 			(*outbuf) += n, (*outbytesleft) -= n;
218 		    } else if ( n == -1 ) {
219 		        errno = EILSEQ; /* unicode is either 0xfffe or 0xffff */
220 		    } else {
221 			errno = E2BIG;
222 			return ((size_t)-1);
223 		    }
224 		    st->_ustate = U0;
225 		} else {
226 		    errno = EILSEQ;
227 		}
228 		break;
229 	     case U4:
230 	        first_byte = st->_cbuf[0];
231 
232 	        /* if the first byte is 0xf0, it is illegal sequence if
233 		 * the second one is between 0x80 and 0x8f
234 		 * for Four-Byte UTF: U+10000..U+10FFFF
235 		 */
236 	        if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
237 		    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
238 		     errno = EILSEQ;
239 		else {
240 		     st->_ustate = U5;
241 		     st->_cbuf[1] = **inbuf;
242 		}
243 	        break;
244 	     case U5:
245 		if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
246 		  {
247 		     st->_ustate = U6;
248 		     st->_cbuf[2] = **inbuf;
249 	          }
250 		else
251 		     errno = EILSEQ;
252 	        break;
253 	     case U6:
254 	        if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
255 		   {
256 		      /* replace with double NON_ID_CHARs */
257 		      if ( *outbytesleft < 2 )
258 			 errno = E2BIG;
259 		      else
260 			{
261 			   **outbuf = NON_ID_CHAR;
262 			   *(*outbuf+1) = NON_ID_CHAR;
263 			   (*outbytesleft) -= 2;
264 
265 			   st->_ustate = U0;
266 			}
267 	           }
268 		else
269 		     errno = EILSEQ;
270 	        break;
271 	    }
272 
273 	    if (errno)
274 		return ((size_t)-1);
275 
276 	    (*inbuf)++; (*inbytesleft)--;
277 	}
278 
279 	if (*inbytesleft == 0 && st->_ustate != U0) {
280 	        errno = EINVAL;
281 	        return ((size_t) -1);
282 	}
283 
284 	if (*inbytesleft > 0 && *outbytesleft == 0) {
285 		errno = E2BIG;
286 		return ((size_t)-1);
287 	}
288 	return ((size_t)(*inbytesleft));
289 }
290 
291 
unicode_to_iso(in_byte1,in_byte2,buf,buflen)292 int unicode_to_iso(in_byte1, in_byte2, buf, buflen)
293 char	in_byte1, in_byte2;
294 char	*buf;
295 int	buflen;
296 {
297 	int	gb, unicode;
298 	int	i, l, h;
299 
300 	if (buflen < 2)
301 		return 0;
302 	unicode = ((in_byte1 & 0xff) << 8) + (in_byte2 & 0xff);
303 
304         /* 0xfffe and 0xffff should not be allowed */
305         if ( unicode == 0xFFFE || unicode == 0xFFFF ) return -1;
306 
307 	for (l = 0, h = UNICODEMAX; l < h; ) {
308 		if (unicode_gb_tab[l].key == unicode) {
309 			i = l;
310 			break;
311 		}
312 		if (unicode_gb_tab[h].key == unicode) {
313 			i = h;
314 			break;
315 		}
316 		i = (l + h) / 2;
317 		if (unicode_gb_tab[i].key == unicode)
318 			break;
319 		if (unicode_gb_tab[i].key < unicode)
320 			l = i + 1;
321 		else	h = i - 1;
322 	}
323 	if (unicode == unicode_gb_tab[i].key) {
324 		gb = unicode_gb_tab[i].value;
325 		*buf = (gb & 0xff00) >> 8;
326 		*(buf+1) = gb & 0xff;
327 	} else {
328 		*buf = *(buf+1) = NON_ID_CHAR;
329 	}
330 	return 2;
331 }
332