/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1994 by Sun Microsystems, Inc.
 */
24*16d86563SAlexander Pyhalov
25*16d86563SAlexander Pyhalov
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include "hangulcode.h"
#include "ktable.h"
#include "utf_johap92.h"
#include "common_defs.h"
32*16d86563SAlexander Pyhalov
#define	MSB	0x80	/* mask for most-significant-bit */

/*
 * Decoder states used by _icv_iconv():
 *   U0      - at the start of a new character
 *   U1      - expecting the 2nd byte of a 2-byte UTF-8 sequence
 *   U2, U3  - expecting the 2nd / 3rd byte of a 3-byte sequence
 *   U4..U6  - expecting the 2nd / 3rd / 4th byte of a 4-byte sequence
 *   UX      - a complete 2- or 3-byte sequence is buffered and is
 *             ready to be converted
 */
typedef enum _USTATE {U0 = 0, U1, U2, U3, U4, U5, U6, UX} USTATE;

/*
 * Per-conversion state allocated by _icv_open() and released by
 * _icv_close().
 */
typedef struct _icv_state {
	unsigned char	_buffer[6];	/* bytes of the sequence being collected */
	USTATE		_ustate;	/* current decoder state */
	unsigned short	_count;		/* number of bytes held in _buffer */
	int		_errno;		/* error recorded by the current call */
} _iconv_st;
42*16d86563SAlexander Pyhalov
/****  _ I C V _ O P E N  ****/
44*16d86563SAlexander Pyhalov
_icv_open()45*16d86563SAlexander Pyhalov void* _icv_open()
46*16d86563SAlexander Pyhalov {
47*16d86563SAlexander Pyhalov _iconv_st *st;
48*16d86563SAlexander Pyhalov if((st = (_iconv_st *) malloc(sizeof(_iconv_st))) == NULL){
49*16d86563SAlexander Pyhalov errno = ENOMEM;
50*16d86563SAlexander Pyhalov return ((void *) -1);
51*16d86563SAlexander Pyhalov }
52*16d86563SAlexander Pyhalov st->_ustate = U0;
53*16d86563SAlexander Pyhalov st->_errno = 0;
54*16d86563SAlexander Pyhalov st->_count = 0;
55*16d86563SAlexander Pyhalov /*
56*16d86563SAlexander Pyhalov RESET_CONV_DESC();
57*16d86563SAlexander Pyhalov */
58*16d86563SAlexander Pyhalov return ((void *) st);
59*16d86563SAlexander Pyhalov } /* end of int _icv_open(). */
60*16d86563SAlexander Pyhalov
61*16d86563SAlexander Pyhalov
/****  _ I C V _ C L O S E  ****/
63*16d86563SAlexander Pyhalov
/*
 * Release the conversion state obtained from _icv_open().
 *
 * A NULL state is reported by setting errno to EBADF; nothing is freed
 * in that case.
 */
void
_icv_close(_iconv_st *st)
{
	if (st == NULL) {
		errno = EBADF;
		return;
	}

	free(st);
}  /* end of void _icv_close(_iconv_st *). */
71*16d86563SAlexander Pyhalov
72*16d86563SAlexander Pyhalov
/****  _ I C V _ I C O N V  ****/
74*16d86563SAlexander Pyhalov
/*
 * Defined further down in this file; declared here so that the call in
 * _icv_iconv() has a prototype in scope.
 */
unsigned short _utf8_to_jahap92(unsigned long utf_code);

/*
 * Abandon the partially collected UTF-8 sequence after an ill-formed byte.
 *
 * Resets the decoder to U0 with nothing buffered, records EILSEQ in both
 * st->_errno and errno, and returns the input position the caller should
 * report: the offending byte backed up over as many of the already
 * buffered bytes as were consumed during this call (bytes consumed by an
 * earlier call cannot be handed back).
 */
static unsigned char *
icv_reject_sequence(_iconv_st *st, unsigned char *ib, unsigned char *ibhead)
{
	size_t held = st->_count;
	size_t seen = (size_t)(ib - ibhead);

	st->_ustate = U0;
	st->_count = 0;
	st->_errno = errno = EILSEQ;

	return (ib - (held < seen ? held : seen));
}

/*
 * Convert UTF-8 input to the two-byte code produced by _utf8_to_jahap92().
 *
 * st          conversion state from _icv_open()
 * inbuf       in/out: current input position; a NULL inbuf or NULL *inbuf
 *             resets the state and returns 0
 * inbufleft   in/out: number of input bytes remaining
 * outbuf      in/out: current output position
 * outbufleft  in/out: number of output bytes remaining
 *
 * ASCII bytes are copied through unchanged.  A 2- or 3-byte sequence is
 * looked up with _utf8_to_jahap92(); when the lookup yields 0, two
 * NON_ID_CHAR bytes are written instead and the return value grows by 2.
 * Every 4-byte sequence is replaced by two NON_ID_CHAR bytes.
 *
 * The in/out arguments are updated on every return, including failures.
 * A sequence that is still incomplete when the input runs out stays
 * buffered in *st and is continued by the next call.
 *
 * Returns the accumulated substitution count, or (size_t)-1 with errno
 * set to:
 *   EBADF   st is NULL
 *   EILSEQ  ill-formed UTF-8; the decoder is reset and *inbuf is left at
 *           the start of the bad sequence as far as this call saw it
 *   E2BIG   no room in the output; the character being converted is left
 *           pending so the call can be repeated with more space
 */
size_t
_icv_iconv(_iconv_st *st, char **inbuf, size_t *inbufleft,
    char **outbuf, size_t *outbufleft)
{
	size_t		ret_val = 0;
	unsigned char	*ib;
	unsigned char	*ob;
	unsigned char	*ibhead;
	unsigned char	*ibtail;
	unsigned char	*obtail;
	hcode_type	utf8_code, johap92_code;

	if (st == NULL) {
		errno = EBADF;
		return ((size_t)-1);
	}

	if (!inbuf || !(*inbuf)) {
		/* Reset request: forget any partially collected sequence. */
		st->_ustate = U0;
		st->_count = 0;
		st->_errno = 0;
		return ((size_t)0);
	}

	st->_errno = 0;

	ib = (unsigned char *)*inbuf;
	ob = (unsigned char *)*outbuf;
	ibhead = ib;
	ibtail = ib + *inbufleft;
	obtail = ob + *outbufleft;

	while (ib < ibtail) {
		unsigned char first_byte;

		switch (st->_ustate) {
		case U0:	/* beginning of a new character */
			if ((*ib & MSB) == 0) {
				/* MSB is off, so ASCII: copy it through */
				if (ob >= obtail) {
					st->_errno = errno = E2BIG;
					break;
				}
				*ob++ = *ib++;
				break;
			}

			if ((*ib & 0xe0) == 0xc0) {
				/* 0xc0 ~ 0xdf, but 0xc0 ~ 0xc1 are illegal */
				if (number_of_bytes_in_utf8_char[*ib] ==
				    ICV_TYPE_ILLEGAL_CHAR) {
					ib = icv_reject_sequence(st, ib,
					    ibhead);
					break;
				}
				st->_ustate = U1;
			} else if ((*ib & 0xf0) == 0xe0) {
				/* 0xe0 ~ 0xef: 3-byte sequence */
				st->_ustate = U2;
			} else {
				/*
				 * Everything else is handled as a 4-byte
				 * lead; the table screens out the bytes
				 * that cannot start a sequence.
				 */
				if (number_of_bytes_in_utf8_char[*ib] ==
				    ICV_TYPE_ILLEGAL_CHAR) {
					ib = icv_reject_sequence(st, ib,
					    ibhead);
					break;
				}
				st->_ustate = U4;
			}
			st->_buffer[0] = *ib++;
			st->_count = 1;
			break;

		case U1:	/* 2nd (last) byte of a 2-byte sequence */
			if ((*ib & 0xc0) != MSB) {
				ib = icv_reject_sequence(st, ib, ibhead);
				break;
			}
			st->_buffer[1] = *ib;
			st->_count = 2;
			st->_ustate = UX;
			/* UX converts and only then consumes this byte. */
			continue;

		case U2:	/* 2nd byte of a 3-byte sequence */
			first_byte = st->_buffer[0];
			/*
			 * Besides being a continuation byte it must lie in
			 * the range allowed after this lead byte (this is
			 * what rejects e.g. the surrogate section).
			 */
			if ((*ib & 0xc0) != MSB ||
			    *ib < valid_min_2nd_byte[first_byte] ||
			    *ib > valid_max_2nd_byte[first_byte]) {
				ib = icv_reject_sequence(st, ib, ibhead);
				break;
			}
			st->_buffer[1] = *ib++;
			st->_count = 2;
			st->_ustate = U3;
			break;

		case U3:	/* 3rd (last) byte of a 3-byte sequence */
			if ((*ib & 0xc0) != MSB) {
				ib = icv_reject_sequence(st, ib, ibhead);
				break;
			}
			st->_buffer[2] = *ib;
			st->_count = 3;
			st->_ustate = UX;
			/* UX converts and only then consumes this byte. */
			continue;

		case U4:	/* 2nd byte of a 4-byte sequence */
			first_byte = st->_buffer[0];
			if ((*ib & 0xc0) != MSB ||
			    *ib < valid_min_2nd_byte[first_byte] ||
			    *ib > valid_max_2nd_byte[first_byte]) {
				ib = icv_reject_sequence(st, ib, ibhead);
				break;
			}
			st->_buffer[1] = *ib++;
			st->_count = 2;
			st->_ustate = U5;
			break;

		case U5:	/* 3rd byte of a 4-byte sequence */
			if ((*ib & 0xc0) != MSB) {
				ib = icv_reject_sequence(st, ib, ibhead);
				break;
			}
			st->_buffer[2] = *ib++;
			st->_count = 3;
			st->_ustate = U6;
			break;

		case U6:	/* 4th (last) byte of a 4-byte sequence */
			if ((*ib & 0xc0) != MSB) {
				ib = icv_reject_sequence(st, ib, ibhead);
				break;
			}
			if ((obtail - ob) < 2) {
				/* Stay in U6 with this byte unconsumed. */
				st->_errno = errno = E2BIG;
				break;
			}
			/* 4-byte sequences are never looked up. */
			*ob++ = NON_ID_CHAR;
			*ob++ = NON_ID_CHAR;
			st->_ustate = U0;
			st->_count = 0;
			ib++;
			break;

		case UX:
			/*
			 * A complete 2- or 3-byte sequence sits in
			 * st->_buffer; its last byte is still at *ib.
			 */
			if ((obtail - ob) < 2) {
				/* Stay in UX so a retry resumes right here. */
				st->_errno = errno = E2BIG;
				break;
			}

			/* Pack the sequence right-aligned into the key. */
			utf8_code.code = 0;
			switch (st->_count) {
			case 2:		/* 2-byte utf-8 code */
				utf8_code.byte.byte3 = st->_buffer[0];
				utf8_code.byte.byte4 = st->_buffer[1];
				break;
			case 3:		/* 3-byte utf-8 code */
				utf8_code.byte.byte2 = st->_buffer[0];
				utf8_code.byte.byte3 = st->_buffer[1];
				utf8_code.byte.byte4 = st->_buffer[2];
				break;
			default:
				break;
			}

			johap92_code.code = _utf8_to_jahap92(utf8_code.code);

			if (johap92_code.code != 0) {
				*ob++ = johap92_code.byte.byte3;
				*ob++ = johap92_code.byte.byte4;
			} else {
				/* No mapping: emit the substitute pair. */
				*ob++ = NON_ID_CHAR;
				*ob++ = NON_ID_CHAR;
				ret_val += 2;
			}
			st->_ustate = U0;
			st->_count = 0;
			ib++;
			break;

		default:	/* You are not supposed to get here... */
			st->_errno = errno = EILSEQ;
			st->_ustate = U0;
			st->_count = 0;
			break;
		}

		if (st->_errno) {
#ifdef DEBUG
			fprintf(stderr, "st->_errno=%d\tst->_ustate=%d\n", st->_errno, st->_ustate);
#endif /* DEBUG */
			break;
		}
	}

	/* Report progress to the caller even when the call fails. */
	*inbuf = (char *)ib;
	*inbufleft = ibtail - ib;
	*outbuf = (char *)ob;
	*outbufleft = obtail - ob;

	if (st->_errno)
		return ((size_t)-1);

	return (ret_val);
}  /* end of size_t _icv_iconv(_iconv_st *, char **, size_t *, char **, size_t *). */
315*16d86563SAlexander Pyhalov
316*16d86563SAlexander Pyhalov
317*16d86563SAlexander Pyhalov
318*16d86563SAlexander Pyhalov
319*16d86563SAlexander Pyhalov
320*16d86563SAlexander Pyhalov
321*16d86563SAlexander Pyhalov
322*16d86563SAlexander Pyhalov
323*16d86563SAlexander Pyhalov
_utf8_to_jahap92(unsigned long utf_code)324*16d86563SAlexander Pyhalov unsigned short _utf8_to_jahap92(unsigned long utf_code)
325*16d86563SAlexander Pyhalov {
326*16d86563SAlexander Pyhalov int low, mid, high;
327*16d86563SAlexander Pyhalov low = 0, high = MAX_U2J92_NUM;
328*16d86563SAlexander Pyhalov while(low < high){
329*16d86563SAlexander Pyhalov mid = (low + high)/2;
330*16d86563SAlexander Pyhalov if(utf8_to_johap92_tbl[mid].utf8 = utf_code){
331*16d86563SAlexander Pyhalov break;
332*16d86563SAlexander Pyhalov } else if(utf8_to_johap92_tbl[mid].utf8 > utf_code){
333*16d86563SAlexander Pyhalov high = mid - 1;
334*16d86563SAlexander Pyhalov } else if(utf8_to_johap92_tbl[mid].utf8 < utf_code){
335*16d86563SAlexander Pyhalov low = mid + 1;
336*16d86563SAlexander Pyhalov }
337*16d86563SAlexander Pyhalov }
338*16d86563SAlexander Pyhalov }
339