1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1994 by Sun Microsystems, Inc.
23 */
24
25
26 #include <stdlib.h>
27 #include <errno.h>
28 #include "hangulcode.h"
29 #include "ktable.h"
30 #include "utf_johap92.h"
31 #include "common_defs.h"
32
/* Bit mask selecting the most-significant bit of a byte. */
#define MSB 0x80

/*
 * States of the UTF-8 scanner in _icv_iconv().  The state names what the
 * scanner expects to see next.
 */
enum _USTATE {
	U0 = 0,	/* first byte of a new character (ASCII or UTF-8 lead byte) */
	U1,	/* 2nd byte of a 2-byte sequence */
	U2,	/* 2nd byte of a 3-byte sequence */
	U3,	/* 3rd byte of a 3-byte sequence */
	U4,	/* 2nd byte of a 4-byte sequence */
	U5,	/* 3rd byte of a 4-byte sequence */
	U6,	/* 4th byte of a 4-byte sequence */
	UX	/* a complete sequence is buffered and ready to be converted */
};

typedef enum _USTATE USTATE;
35
36 typedef struct _icv_state {
37 unsigned char _buffer[6];
38 USTATE _ustate;
39 unsigned short _count;
40 int _errno;
41 } _iconv_st;
42
43 /**** _ I C V _ O P E N ****/
44
_icv_open()45 void* _icv_open()
46 {
47 _iconv_st *st;
48 if((st = (_iconv_st *) malloc(sizeof(_iconv_st))) == NULL){
49 errno = ENOMEM;
50 return ((void *) -1);
51 }
52 st->_ustate = U0;
53 st->_errno = 0;
54 st->_count = 0;
55 /*
56 RESET_CONV_DESC();
57 */
58 return ((void *) st);
59 } /* end of int _icv_open(). */
60
61
62 /**** _ I C V _ C L O S E ****/
63
/*
 * _icv_close() -- release a conversion descriptor.
 *
 * A NULL descriptor is reported by setting errno to EBADF; nothing is freed
 * in that case.
 */
void _icv_close(_iconv_st* st)
{
	if (st == NULL) {
		errno = EBADF;
		return;
	}

	free(st);
}  /* end of void _icv_close(_iconv_st*). */
71
72
73 /**** _ I C V _ I C O N V ****/
74
/*
 * The lookup routine is defined further down in this file; declare it here
 * so that the call in _icv_iconv() has a prototype in scope.
 */
unsigned short _utf8_to_jahap92(unsigned long utf_code);

/*
 * _icv_iconv() -- convert UTF-8 input to Johap92 output.
 *
 *   st          conversion descriptor obtained from _icv_open()
 *   inbuf       in/out: address of the next input byte
 *   inbufleft   in/out: number of input bytes remaining
 *   outbuf      in/out: address of the next free output byte
 *   outbufleft  in/out: number of free output bytes remaining
 *
 * A NULL inbuf or *inbuf resets the descriptor to its initial state and
 * returns 0.
 *
 * ASCII bytes are copied through.  Complete 2- and 3-byte sequences are
 * looked up with _utf8_to_jahap92(); a sequence with no mapping is written
 * as two NON_IDENTICAL bytes and adds 2 to the return value.  4-byte
 * sequences are always written as two NON_ID_CHAR bytes.
 * NOTE(review): 4-byte replacements are not added to the return value,
 * unlike the 2/3-byte case -- kept as in the original; confirm intent.
 *
 * A sequence that is still incomplete when the input runs out is kept in
 * the descriptor and continued by the next call; no error is raised.
 *
 * Returns the accumulated non-identical count on success.  On failure
 * returns (size_t)-1 with errno (and st->_errno) set to:
 *   EBADF   st is NULL
 *   E2BIG   the output buffer cannot hold the next character
 *   EILSEQ  the input is not well-formed UTF-8
 *
 * The four in/out arguments are updated on failure as well, so the caller
 * can see how far the conversion got:
 *   - on E2BIG the byte that would complete the character is not consumed
 *     and the scanner state is kept, so the call can be resumed from the
 *     returned *inbuf with more output space;
 *   - on EILSEQ *inbuf is left on the offending byte and any partially
 *     gathered sequence is discarded.
 */
size_t _icv_iconv(_iconv_st* st, char** inbuf, size_t* inbufleft,
		char** outbuf, size_t* outbufleft)
{
	size_t		ret_val = 0;
	unsigned char*	ib;
	unsigned char*	ob;
	unsigned char*	ibtail;
	unsigned char*	obtail;

	hcode_type	utf8_code, johap92_code;

	if (st == NULL) {
		errno = EBADF;
		return ((size_t) -1);
	}

	if (!inbuf || !(*inbuf)) {
		/* Reset request: forget any partially gathered sequence. */
		st->_ustate = U0;
		st->_count = 0;
		st->_errno = 0;
		return ((size_t) 0);
	}

	st->_errno = 0;
	errno = 0;	/* kept from the original: errno is cleared on entry */

	ib = (unsigned char*)*inbuf;
	ob = (unsigned char*)*outbuf;
	ibtail = ib + *inbufleft;
	obtail = ob + *outbufleft;

	while (ib < ibtail) {
		unsigned char	c = *ib;	/* byte under examination */
		unsigned char	first_byte;
		int		err = 0;	/* error raised by this step */

		switch (st->_ustate) {
		case U0:	/* beginning of a new character */
			if ((c & MSB) == 0) {
				/* MSB is off, so ASCII: copy through. */
				if (ob >= obtail) {
					err = E2BIG;
					break;
				}
				*ob++ = c;
				ib++;
				break;
			}

			if ((c & 0xe0) == 0xc0) {
				/*
				 * 2-byte lead (0xc0 ~ 0xdf); the table is
				 * consulted to reject the illegal ones.
				 */
				if (number_of_bytes_in_utf8_char[c] ==
				    ICV_TYPE_ILLEGAL_CHAR) {
					err = EILSEQ;
					break;
				}
				st->_ustate = U1;
			} else if ((c & 0xf0) == 0xe0) {
				/* 3-byte lead (0xe0 ~ 0xef). */
				st->_ustate = U2;
			} else {
				/*
				 * Everything else with the MSB set is treated
				 * as a 4-byte lead unless the table marks it
				 * illegal.
				 */
				if (number_of_bytes_in_utf8_char[c] ==
				    ICV_TYPE_ILLEGAL_CHAR) {
					err = EILSEQ;
					break;
				}
				st->_ustate = U4;
			}
			st->_buffer[0] = c;
			st->_count = 1;
			ib++;
			break;

		case U1:	/* 2nd (last) byte of a 2-byte sequence */
			if ((c & 0xc0) != MSB) {
				err = EILSEQ;
				break;
			}
			st->_buffer[1] = c;
			st->_count = 2;
			/*
			 * The byte is deliberately not consumed here: UX
			 * consumes it once the character has been written.
			 */
			st->_ustate = UX;
			break;

		case U2:	/* 2nd byte of a 3-byte sequence */
			first_byte = st->_buffer[0];
			/*
			 * Must be a continuation byte and must lie in the
			 * range allowed for this lead byte (this is what
			 * rejects, e.g., the surrogate range after 0xed).
			 */
			if ((c & 0xc0) != MSB ||
			    c < valid_min_2nd_byte[first_byte] ||
			    c > valid_max_2nd_byte[first_byte]) {
				err = EILSEQ;
				break;
			}
			st->_buffer[1] = c;
			st->_count = 2;
			st->_ustate = U3;
			ib++;
			break;

		case U3:	/* 3rd (last) byte of a 3-byte sequence */
			if ((c & 0xc0) != MSB) {
				err = EILSEQ;
				break;
			}
			st->_buffer[2] = c;
			st->_count = 3;
			/* Not consumed yet; see the note in case U1. */
			st->_ustate = UX;
			break;

		case U4:	/* 2nd byte of a 4-byte sequence */
			first_byte = st->_buffer[0];
			if ((c & 0xc0) != MSB ||
			    c < valid_min_2nd_byte[first_byte] ||
			    c > valid_max_2nd_byte[first_byte]) {
				err = EILSEQ;
				break;
			}
			st->_buffer[1] = c;
			st->_count = 2;
			st->_ustate = U5;
			ib++;
			break;

		case U5:	/* 3rd byte of a 4-byte sequence */
			if ((c & 0xc0) != MSB) {
				err = EILSEQ;
				break;
			}
			st->_buffer[2] = c;
			st->_count = 3;
			st->_ustate = U6;
			ib++;
			break;

		case U6:	/* 4th (last) byte of a 4-byte sequence */
			if ((c & 0xc0) != MSB) {
				err = EILSEQ;
				break;
			}
			if ((obtail - ob) < 2) {
				/* Stay in U6 and keep the byte for a retry. */
				err = E2BIG;
				break;
			}
			/* 4-byte characters are always replaced. */
			*ob++ = NON_ID_CHAR;
			*ob++ = NON_ID_CHAR;
			st->_ustate = U0;
			st->_count = 0;	/* otherwise the next character miscounts */
			ib++;
			break;

		case UX:
			/*
			 * A complete 2- or 3-byte sequence is in st->_buffer
			 * and *ib is its last byte, still unconsumed.
			 * Check for room first so that nothing is written or
			 * consumed when the character does not fit.
			 */
			if ((obtail - ob) < 2) {
				err = E2BIG;
				break;
			}

			/* Pack the sequence into the low-order bytes. */
			utf8_code.code = 0;
			switch (st->_count) {
			case 2:	/* 2-byte utf-8 code */
				utf8_code.byte.byte3 = st->_buffer[0];
				utf8_code.byte.byte4 = st->_buffer[1];
				break;
			case 3:	/* 3-byte utf-8 code */
				utf8_code.byte.byte2 = st->_buffer[0];
				utf8_code.byte.byte3 = st->_buffer[1];
				utf8_code.byte.byte4 = st->_buffer[2];
				break;
			default:
				/* Unexpected count: look up 0. */
				break;
			}

			johap92_code.code = _utf8_to_jahap92(utf8_code.code);

			if (johap92_code.code != 0) {
				/* Found a mapping: emit the 2-byte code. */
				*ob++ = johap92_code.byte.byte3;
				*ob++ = johap92_code.byte.byte4;
			} else {
				/* No mapping: emit the replacement pair. */
				*ob++ = NON_IDENTICAL;
				*ob++ = NON_IDENTICAL;
				ret_val += 2;
			}
			st->_ustate = U0;
			st->_count = 0;
			ib++;
			break;

		default:	/* not supposed to happen; kept for integrity */
			err = EILSEQ;
			break;
		}

		if (err != 0) {
			st->_errno = err;
#ifdef DEBUG
			fprintf(stderr, "st->_errno=%d\tst->_ustate=%d\n", st->_errno, st->_ustate);
#endif /* DEBUG */
			if (err == EILSEQ) {
				/* Drop the malformed sequence. */
				st->_ustate = U0;
				st->_count = 0;
			}
			errno = err;
			break;
		}
	}

	/* Report progress to the caller on success and on failure alike. */
	*inbuf = (char*)ib;
	*inbufleft = ibtail - ib;
	*outbuf = (char*)ob;
	*outbufleft = obtail - ob;

	if (st->_errno)
		return ((size_t) -1);

	return (ret_val);
}  /* end of size_t _icv_iconv(_iconv_st*, char**, size_t*, char**, size_t*). */
315
316
317
318
319
320
321
322
323
_utf8_to_jahap92(unsigned long utf_code)324 unsigned short _utf8_to_jahap92(unsigned long utf_code)
325 {
326 int low, mid, high;
327 low = 0, high = MAX_U2J92_NUM;
328 while(low < high){
329 mid = (low + high)/2;
330 if(utf8_to_johap92_tbl[mid].utf8 = utf_code){
331 break;
332 } else if(utf8_to_johap92_tbl[mid].utf8 > utf_code){
333 high = mid - 1;
334 } else if(utf8_to_johap92_tbl[mid].utf8 < utf_code){
335 low = mid + 1;
336 }
337 }
338 }
339