xref: /illumos-gate/usr/src/cmd/msgfmt/gnu_lex.c (revision 7c478bd9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2001, 2002 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include "gnu_msgfmt.h"
30 #include "gnu_lex.h"
31 #include "y.tab.h"
32 
/* Line number currently being scanned in the PO file; counting starts at 1. */
int	cur_line = 1;

/*
 * Pushback store used by po_ungetc() and po_getc().  backlen is the
 * number of valid bytes held in backbuf; zero means nothing is pushed back.
 */
static char	backbuf[MB_LEN_MAX];
static int	backlen;
37 
38 /*
39  * get_mb() returns one multibyte character.
40  *
41  * This function uses the iconv() function to find out one
42  * multibyte character from a sequence of bytes in the file stream.
43  * The conversion from the codeset specified in the PO file to UTF-8
44  * is performed.  The funcition reads another byte and calls iconv(),
45  * until iconv() successfully returns as a valid UTF-8 character has
46  * been converted or returns EILSEQ.  If iconv() successfully returned,
47  * the function returns the read bytes as one character.  Otherwise,
48  * returns error.  The string converted to UTF-8 in outbuf won't be
49  * used at all.
50  */
51 static size_t
52 get_mb(unsigned char *tmpbuf, unsigned char fc)
53 {
54 	int	c;
55 	char	outbuf[8];			/* max size of a UTF-8 char */
56 	const char	*inptr;
57 	char	*outptr;
58 	size_t	insize = 0, inlen, outlen, ret;
59 
60 	tmpbuf[insize++] = fc;		/* size of tmpbuf is MB_LEN_MAX+1 */
61 
62 	if (cd == (iconv_t)-1) {
63 		/* no conversion */
64 		tmpbuf[insize] = '\0';
65 		return (insize);
66 	}
67 
68 	for (; ; ) {
69 		inptr = (const char *)tmpbuf;
70 		outptr = &outbuf[0];
71 		inlen = insize;
72 		outlen = sizeof (outbuf);
73 
74 		errno = 0;
75 		ret = iconv(cd, &inptr, &inlen, &outptr, &outlen);
76 		if (ret == (size_t)-1) {
77 			/* iconv failed */
78 			switch (errno) {
79 			case EILSEQ:
80 				/* invalid character found */
81 				error(gettext(ERR_INVALID_CHAR),
82 					cur_line, cur_po);
83 				/* NOTREACHED */
84 			case EINVAL:
85 				/* not enough input */
86 				if (insize == MB_LEN_MAX) {
87 					/* invalid character found */
88 					error(gettext(ERR_INVALID_CHAR),
89 						cur_line, cur_po);
90 					/* NOTREACHED */
91 				}
92 				c = getc(fp);
93 				if (c == EOF) {
94 					error(gettext(ERR_UNEXP_EOF),
95 						cur_line, cur_po);
96 					/* NOTREACHED */
97 				}
98 				tmpbuf[insize++] = (unsigned char)c;
99 
100 				/* initialize the conversion */
101 				outptr = &outbuf[0];
102 				outlen = sizeof (outbuf);
103 				(void) iconv(cd, NULL, NULL, &outptr, &outlen);
104 
105 				continue;
106 				/* NOTREACHED */
107 			default:
108 				/* should never happen */
109 				error(ERR_INTERNAL,
110 					cur_line, cur_po);
111 				/* NOTREACHED */
112 			}
113 			/* NOTREACHED */
114 		}
115 		tmpbuf[insize] = '\0';
116 		return (insize);
117 		/* NOTRECHED */
118 	}
119 }
120 
121 static void
122 po_uninput(int c)
123 {
124 	(void) ungetc(c, fp);
125 	if (c == '\n')
126 		cur_line--;
127 }
128 
129 static void
130 po_ungetc(struct ch *pch)
131 {
132 	if (backlen) {
133 		error(gettext(ERR_INTERNAL), cur_line, cur_po);
134 		/* NOTREACHED */
135 	}
136 	if (!pch->eof) {
137 		backlen = pch->len;
138 		(void) memcpy(backbuf, pch->buf, backlen);
139 	}
140 }
141 
142 static struct ch *
143 po_getc(void)
144 {
145 	static struct ch	och;
146 	int	c;
147 
148 	if (backlen) {
149 		och.len = backlen;
150 		(void) memcpy(och.buf, backbuf, backlen);
151 		backlen = 0;
152 		return (&och);
153 	}
154 
155 	for (; ; ) {
156 		c = getc(fp);
157 		if (c == EOF) {
158 			if (ferror(fp)) {
159 				/* error happend */
160 				error(gettext(ERR_READ_FAILED), cur_po);
161 				/* NOTREACHED */
162 			}
163 			och.len = 0;
164 			och.eof = 1;
165 			return (&och);
166 		}
167 		if (c == '\\') {
168 			c = getc(fp);
169 			if (c == '\n') {
170 				/* this newline should be escaped */
171 				cur_line++;
172 				continue;
173 			} else {
174 				po_uninput(c);
175 				och.len = 1;
176 				och.eof = 0;
177 				och.buf[0] = '\\';
178 				return (&och);
179 			}
180 			/* NOTREACHED */
181 		}
182 		if (c == '\n') {
183 			cur_line++;
184 			och.len = 1;
185 			och.eof = 0;
186 			och.buf[0] = '\n';
187 			return (&och);
188 		}
189 		if (isascii((unsigned char)c)) {
190 			/* single byte ascii */
191 			och.len = 1;
192 			och.eof = 0;
193 			och.buf[0] = (unsigned char)c;
194 			return (&och);
195 		}
196 
197 		och.len = get_mb(&och.buf[0], (unsigned char)c);
198 		och.eof = 0;
199 		return (&och);
200 	}
201 	/* NOTREACHED */
202 }
203 
/*
 * extend_buf() grows the buffer *buf by add bytes.
 * *size is updated to the new size and *buf to the pointer returned
 * by Xrealloc().
 */
static void
extend_buf(char **buf, size_t *size, size_t add)
{
	size_t	grown = *size + add;

	*buf = (char *)Xrealloc(*buf, grown);
	*size = grown;
}
213 
214 static struct ch	*
215 expand_es(void)
216 {
217 	int	c, n, loop;
218 	static struct ch	och;
219 	struct ch	*pch;
220 
221 	pch = po_getc();
222 	if (pch->eof) {
223 		error(gettext(ERR_UNEXP_EOF),
224 			cur_line, cur_po);
225 		/* NOTREACHED */
226 	}
227 	if (pch->len > 1) {
228 		/* not a valid escape sequence */
229 		return (pch);
230 	}
231 
232 	och.len = 1;
233 	och.eof = 0;
234 	switch (pch->buf[0]) {
235 	case '"':
236 	case '\\':
237 		och.buf[0] = pch->buf[0];
238 		break;
239 	case 'b':
240 		och.buf[0] = '\b';
241 		break;
242 	case 'f':
243 		och.buf[0] = '\f';
244 		break;
245 	case 'n':
246 		och.buf[0] = '\n';
247 		break;
248 	case 'r':
249 		och.buf[0] = '\r';
250 		break;
251 	case 't':
252 		och.buf[0] = '\t';
253 		break;
254 	case 'v':
255 		och.buf[0] = '\v';
256 		break;
257 	case 'a':
258 		och.buf[0] = '\a';
259 		break;
260 	case '0':
261 	case '1':
262 	case '2':
263 	case '3':
264 	case '4':
265 	case '5':
266 	case '6':
267 	case '7':
268 		/* octal */
269 		c = pch->buf[0];
270 		for (n = 0, loop = 0; ; ) {
271 			n = n * 8 + c - '0';
272 			loop++;
273 			if (loop >= 3)
274 				break;
275 			pch = po_getc();
276 			if (pch->eof) {
277 				error(gettext(ERR_UNEXP_EOF),
278 					cur_line, cur_po);
279 				/* NOTREACHED */
280 			}
281 			if ((pch->len > 1) || (pch->buf[0] < '0') ||
282 				(pch->buf[0] > '7'))
283 				break;
284 			c = pch->buf[0];
285 		}
286 		po_ungetc(pch);
287 		och.buf[0] = (unsigned char)n;
288 		break;
289 	case 'x':
290 		/* hex */
291 		pch = po_getc();
292 		if (pch->eof) {
293 			error(gettext(ERR_UNEXP_EOF),
294 				cur_line, cur_po);
295 			/* NOTREACHED */
296 		}
297 		if (pch->len > 1) {
298 			po_ungetc(pch);
299 			och.buf[0] = 'x';
300 			break;
301 		}
302 		c = pch->buf[0];
303 		if (!isxdigit((unsigned char)c)) {
304 			po_ungetc(pch);
305 			och.buf[0] = 'x';
306 			break;
307 		}
308 		if (isdigit((unsigned char)c)) {
309 			n = c - '0';
310 		} else if (isupper((unsigned char)c)) {
311 			n = c - 'A' + 10;
312 		} else {
313 			n = c - 'a' + 10;
314 		}
315 
316 		pch = po_getc();
317 		if (pch->eof) {
318 			error(gettext(ERR_UNEXP_EOF),
319 				cur_line, cur_po);
320 			/* NOTREACHED */
321 		}
322 		if (pch->len > 1) {
323 			po_ungetc(pch);
324 			och.buf[0] = (unsigned char)n;
325 			break;
326 		}
327 		c = pch->buf[0];
328 		if (!isxdigit((unsigned char)c)) {
329 			po_ungetc(pch);
330 			och.buf[0] = (unsigned char)n;
331 			break;
332 		}
333 		n *= 16;
334 		if (isdigit((unsigned char)c)) {
335 			n += c - '0';
336 		} else if (isupper((unsigned char)c)) {
337 			n += c - 'A' + 10;
338 		} else {
339 			n += c - 'a' + 10;
340 		}
341 		och.buf[0] = (unsigned char)n;
342 		break;
343 
344 	default:
345 		och.buf[0] = pch->buf[0];
346 		break;
347 	}
348 	return (&och);
349 }
350 
351 int
352 yylex(void)
353 {
354 	unsigned int	uc;
355 	struct ch	*pch;
356 	char	*buf;
357 	size_t	buf_size, buf_pos;
358 
359 	for (; ; ) {
360 		pch = po_getc();
361 
362 		if (pch->eof) {
363 			/* EOF */
364 			return (0);
365 		}
366 
367 		if (pch->len > 1) {
368 			/* multi byte */
369 			yylval.c.len = pch->len;
370 			(void) memcpy(yylval.c.buf, pch->buf, pch->len);
371 			return (CHR);
372 		}
373 		/* single byte */
374 		switch (pch->buf[0]) {
375 		case ' ':
376 		case '\t':
377 		case '\n':
378 			break;
379 
380 		case '#':
381 			/* comment start */
382 			buf_size = CBUFSIZE;
383 			buf = (char *)Xmalloc(buf_size);
384 			buf_pos = 0;
385 			pch = po_getc();
386 			while (!pch->eof &&
387 				((pch->len != 1) || (pch->buf[0] != '\n'))) {
388 				if (buf_pos + pch->len + 1 > buf_size)
389 					extend_buf(&buf, &buf_size, CBUFSIZE);
390 				(void) memcpy(buf + buf_pos,
391 					pch->buf, pch->len);
392 				buf_pos += pch->len;
393 				pch = po_getc();
394 			}
395 			buf[buf_pos] = '\0';
396 			yylval.str = buf;
397 			return (COMMENT);
398 			/* NOTREACHED */
399 
400 		case '[':
401 		case ']':
402 			return (pch->buf[0]);
403 			/* NOTREACHED */
404 
405 		case '"':
406 			buf_size = MBUFSIZE;
407 			buf = (char *)Xmalloc(buf_size);
408 			buf_pos = 0;
409 			for (; ; ) {
410 				pch = po_getc();
411 
412 				if (pch->eof) {
413 					/* EOF */
414 					error(gettext(ERR_UNEXP_EOF),
415 						cur_line, cur_po);
416 					/* NOTREACHED */
417 				}
418 
419 				if (pch->len == 1) {
420 					uc = pch->buf[0];
421 
422 					if (uc == '\n') {
423 						error(gettext(ERR_UNEXP_EOL),
424 							cur_line, cur_po);
425 						/* NOTREACHED */
426 					}
427 					if (uc == '"')
428 						break;
429 					if (uc == '\\')
430 						pch = expand_es();
431 				}
432 				if (buf_pos + pch->len + 1 > buf_size)
433 					extend_buf(&buf, &buf_size,
434 						MBUFSIZE);
435 				(void) memcpy(buf + buf_pos,
436 					pch->buf, pch->len);
437 				buf_pos += pch->len;
438 			}
439 
440 			buf[buf_pos] = '\0';
441 			yylval.str = buf;
442 			return (STR);
443 			/* NOTREACHED */
444 
445 		default:
446 			uc = pch->buf[0];
447 
448 			if (isalpha(uc) || (uc == '_')) {
449 				buf_size = KBUFSIZE;
450 				buf = (char *)Xmalloc(buf_size);
451 				buf_pos = 0;
452 				buf[buf_pos++] = (char)uc;
453 				pch = po_getc();
454 				while (!pch->eof &&
455 					(pch->len == 1) &&
456 					(isalpha(uc = pch->buf[0]) ||
457 					isdigit(uc) || (uc == '_'))) {
458 					if (buf_pos + 1 + 1 > buf_size)
459 						extend_buf(&buf, &buf_size,
460 							KBUFSIZE);
461 					buf[buf_pos++] = (char)uc;
462 					pch = po_getc();
463 				}
464 				/* push back the last char */
465 				po_ungetc(pch);
466 				buf[buf_pos] = '\0';
467 				yylval.str = buf;
468 				if (buf_pos > MAX_KW_LEN) {
469 					/* kbuf is longer than any keywords */
470 					return (SYMBOL);
471 				}
472 				yylval.num = cur_line;
473 				if (strcmp(buf, KW_DOMAIN) == 0) {
474 					free(buf);
475 					return (DOMAIN);
476 				} else if (strcmp(buf, KW_MSGID) == 0) {
477 					free(buf);
478 					return (MSGID);
479 				} else if (strcmp(buf, KW_MSGID_PLURAL) == 0) {
480 					free(buf);
481 					return (MSGID_PLURAL);
482 				} else if (strcmp(buf, KW_MSGSTR) == 0) {
483 					free(buf);
484 					return (MSGSTR);
485 				} else {
486 					free(buf);
487 					return (SYMBOL);
488 				}
489 				/* NOTREACHED */
490 			}
491 			if (isdigit(uc)) {
492 				buf_size = NBUFSIZE;
493 				buf = (char *)Xmalloc(buf_size);
494 				buf_pos = 0;
495 				buf[buf_pos++] = (char)uc;
496 				pch = po_getc();
497 				while (!pch->eof &&
498 					(pch->len == 1) &&
499 					isdigit(uc = pch->buf[0])) {
500 					if (buf_pos + 1 + 1 > buf_size)
501 						extend_buf(&buf, &buf_size,
502 							NBUFSIZE);
503 					buf[buf_pos++] = (char)uc;
504 					pch = po_getc();
505 				}
506 				/* push back the last char */
507 				po_ungetc(pch);
508 				buf[buf_pos] = '\0';
509 				yylval.num = atoi(buf);
510 				free(buf);
511 				return (NUM);
512 			}
513 			/* just a char */
514 			yylval.c.len = 1;
515 			yylval.c.buf[0] = uc;
516 			return (CHR);
517 			/* NOTREACHED */
518 		}
519 	}
520 }
521