1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
14 */
15
16 /*
17 * This file contains the "scanner", which tokenizes charmap files
18 * for iconv for processing by the higher level grammar processor.
19 */
20
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <ctype.h>
24 #include <limits.h>
25 #include <string.h>
26 #include <widec.h>
27 #include <sys/types.h>
28 #include <assert.h>
29 #include "charmap.h"
30 #include "parser.tab.h"
31
/*
 * Scanner settings and counters shared with the rest of the program.
 * com_char and esc_char are the comment and escape characters the lexer
 * recognizes; mb_cur_min and mb_cur_max bound the multibyte character
 * length; lineno is the line reported in diagnostics; warnings counts
 * calls to warn().
 */
int com_char = '#';
int esc_char = '\\';
int mb_cur_min = 1;
int mb_cur_max = MB_LEN_MAX;
int lineno = 1;
int warnings = 0;

/*
 * Line number of the next character to be read.  scanc() copies this into
 * lineno on every read, so it must start at 1 just like lineno does;
 * otherwise the first line would be reported as line 0 whenever scanning
 * begins without a prior reset_scanner() call.
 */
static int nextline = 1;
/*
 * Stream currently being scanned.  reset_scanner() points it at a named
 * file or back at stdin.
 * NOTE(review): a static initializer of stdin requires stdin to be a
 * constant expression on the target platform -- confirm for new ports.
 */
static FILE *input = stdin;
/* Name printed as the prefix of diagnostics by yyerror/errf/warn. */
static const char *filename = "<stdin>";
/* Non-zero while yylex() is between an opening and a closing '"'. */
static int instring = 0;
/* Non-zero when the previously read character was the escape character. */
static int escaped = 0;
43
/*
 * Token space ... grows on demand.
 *
 * add_tok() appends to this buffer and enlarges it in 64-byte steps.
 */
static char *token = NULL;	/* NUL-terminated text of the current token */
static int tokidx;		/* characters stored in token so far */
static int toksz = 0;		/* bytes currently allocated for token */
static int hadtok = 0;		/* set by yylex() once a line yielded a token */
51
/*
 * The last keyword seen.  This is useful to trigger the special lexer rules
 * for "copy" and also collating symbols and elements.
 */
int last_kw = 0;
/*
 * Top-level category currently open.  consume_token() sets it when a
 * keyword listed in categories[] is seen and puts it back to T_END when
 * the END keyword is seen.
 */
static int category = T_END;
58
/*
 * Associates a parser token id with the literal word that selects it.
 * keywords[] is searched by consume_token() for every bare word; the
 * entry with a NULL name terminates the table.
 */
static struct token {
	int id;
	const char *name;
} keywords[] = {
	{ T_COM_CHAR,	"comment_char" },
	{ T_ESC_CHAR,	"escape_char" },
	{ T_END,	"END" },

	/*
	 * These are keywords used in the charmap file.  Note that
	 * Solaris originally used angle brackets to wrap some of them,
	 * but we removed that to simplify our parser.  The first of these
	 * items are "global items."
	 */
	{ T_CHARMAP,		"CHARMAP" },
	{ T_WIDTH,		"WIDTH" },
	{ T_WIDTH_DEFAULT,	"WIDTH_DEFAULT" },

	{ -1, NULL },
};
79
/*
 * These special words are only used in a charmap file, enclosed in <>.
 *
 * get_symbol() consults this table only while category is T_END; a match
 * is returned as the keyword's token id instead of T_SYMBOL.  The entry
 * with a NULL name terminates the table.
 */
static struct token symwords[] = {
	{ T_COM_CHAR,	"comment_char" },
	{ T_ESC_CHAR,	"escape_char" },
	{ T_CODE_SET,	"code_set_name" },
	{ T_MB_CUR_MAX,	"mb_cur_max" },
	{ T_MB_CUR_MIN,	"mb_cur_min" },
	{ -1, NULL },
};
91
/*
 * Zero-terminated list of keyword ids that open a top-level category.
 * consume_token() copies a matching keyword id into category.
 */
static int categories[] = {
	T_CHARMAP,
	0
};
96
97 void
reset_scanner(const char * fname)98 reset_scanner(const char *fname)
99 {
100 if (fname == NULL) {
101 filename = "<stdin>";
102 input = stdin;
103 } else {
104 if (input != stdin)
105 (void) fclose(input);
106 if ((input = fopen(fname, "r")) == NULL) {
107 perror(fname);
108 exit(1);
109 }
110 filename = fname;
111 }
112 com_char = '#';
113 esc_char = '\\';
114 instring = 0;
115 escaped = 0;
116 lineno = 1;
117 nextline = 1;
118 tokidx = 0;
119 last_kw = 0;
120 category = T_END;
121 }
122
/*
 * hex(x): numeric value (0-15) of the hexadecimal digit character x.
 * The caller must already know that x satisfies isxdigit().
 *
 * isodigit(x): non-zero when x is an octal digit character ('0'-'7').
 *
 * Both macros evaluate their argument more than once, so it must not have
 * side effects.  Every use of the argument is parenthesized so that an
 * expression argument (for example a conditional expression) is handled
 * as a single operand.
 */
#define	hex(x)	\
	(isdigit(x) ? ((x) - '0') : \
	((islower(x) ? ((x) - 'a') : ((x) - 'A')) + 10))
#define	isodigit(x)	(((x) >= '0') && ((x) <= '7'))
126
127 static int
scanc(void)128 scanc(void)
129 {
130 int c;
131
132 c = getc(input);
133 lineno = nextline;
134 if (c == '\n') {
135 nextline++;
136 }
137 return (c);
138 }
139
140 static void
unscanc(int c)141 unscanc(int c)
142 {
143 if (c == '\n') {
144 nextline--;
145 }
146 if (ungetc(c, input) < 0) {
147 yyerror(_("ungetc failed"));
148 }
149 }
150
/*
 * Read exactly two hexadecimal digits from the input and combine them
 * into one value, first digit in the high nibble.
 *
 * Returns the value; a non-hex character is reported through yyerror()
 * and yields 0.
 */
static int
scan_hex_byte(void)
{
	int value = 0;
	int i;

	for (i = 0; i < 2; i++) {
		int digit = scanc();

		if (!isxdigit(digit)) {
			yyerror(_("malformed hex digit"));
			return (0);
		}
		value = (value << 4) | hex(digit);
	}
	return (value);
}
170
/*
 * Read a decimal constant of two or three digits from the input.
 * The first two digits are mandatory; a third is consumed only when it
 * is a digit, otherwise it is pushed back.
 *
 * Returns the value; a missing mandatory digit is reported through
 * yyerror() and yields 0.
 */
static int
scan_dec_byte(void)
{
	int value = 0;
	int ch;
	int i;

	for (i = 0; i < 2; i++) {
		ch = scanc();
		if (!isdigit(ch)) {
			yyerror(_("malformed decimal digit"));
			return (0);
		}
		value = (value * 10) + (ch - '0');
	}

	/* optional third digit */
	ch = scanc();
	if (isdigit(ch)) {
		value = (value * 10) + (ch - '0');
	} else {
		unscanc(ch);
	}
	return (value);
}
199
/*
 * Read an octal constant of two or three digits from the input.
 * The first two digits are mandatory; a third is consumed only when it
 * is an octal digit, otherwise it is pushed back.
 *
 * Returns the value; a missing mandatory digit is reported through
 * yyerror() and yields 0.
 */
static int
scan_oct_byte(void)
{
	int value = 0;
	int ch;
	int i;

	for (i = 0; i < 2; i++) {
		ch = scanc();
		if (!isodigit(ch)) {
			yyerror(_("malformed octal digit"));
			return (0);
		}
		value = (value * 8) + (ch - '0');
	}

	/* optional third digit */
	ch = scanc();
	if (isodigit(ch)) {
		value = (value * 8) + (ch - '0');
	} else {
		unscanc(ch);
	}
	return (value);
}
230
231 void
add_tok(int c)232 add_tok(int c)
233 {
234 if ((tokidx + 1) >= toksz) {
235 toksz += 64;
236 if ((token = realloc(token, toksz)) == NULL) {
237 yyerror(_("out of memory"));
238 tokidx = 0;
239 toksz = 0;
240 return;
241 }
242 }
243
244 token[tokidx++] = (char)c;
245 token[tokidx] = 0;
246 }
247
248 static int
get_byte(void)249 get_byte(void)
250 {
251 int c;
252
253 if ((c = scanc()) != esc_char) {
254 unscanc(c);
255 return (EOF);
256 }
257 c = scanc();
258
259 switch (c) {
260 case 'd':
261 case 'D':
262 return (scan_dec_byte());
263 case 'x':
264 case 'X':
265 return (scan_hex_byte());
266 case '0':
267 case '1':
268 case '2':
269 case '3':
270 case '4':
271 case '5':
272 case '6':
273 case '7':
274 /* put the character back so we can get it */
275 unscanc(c);
276 return (scan_oct_byte());
277 default:
278 unscanc(c);
279 unscanc(esc_char);
280 return (EOF);
281 }
282 }
283
/*
 * Translate the letter following an escape character into the control
 * character it stands for (n, r, t, f, v, b, a).  Any other value is
 * returned unchanged.
 */
int
get_escaped(int c)
{
	static const struct {
		int letter;
		int value;
	} escapes[] = {
		{ 'n', '\n' },
		{ 'r', '\r' },
		{ 't', '\t' },
		{ 'f', '\f' },
		{ 'v', '\v' },
		{ 'b', '\b' },
		{ 'a', '\a' },
	};
	unsigned int i;

	for (i = 0; i < sizeof (escapes) / sizeof (escapes[0]); i++) {
		if (escapes[i].letter == c)
			return (escapes[i].value);
	}
	return (c);
}
306
307 int
get_wide(void)308 get_wide(void)
309 {
310 /* NB: yylval.mbs[0] is the length */
311 char *mbs = &yylval.mbs[1];
312 int mbi = 0;
313 int c;
314
315 mbs[mbi] = 0;
316 if (mb_cur_max > MB_LEN_MAX) {
317 yyerror(_("max multibyte character size too big"));
318 return (T_NULL);
319 }
320 for (;;) {
321 if ((c = get_byte()) == EOF)
322 break;
323 if (mbi == mb_cur_max) {
324 unscanc(c);
325 yyerror(_("length > mb_cur_max"));
326 return (T_NULL);
327 }
328 mbs[mbi++] = c;
329 mbs[mbi] = 0;
330 }
331
332 /* result in yylval.mbs */
333 mbs[-1] = mbi;
334 return (T_CHAR);
335 }
336
337 int
get_symbol(void)338 get_symbol(void)
339 {
340 int c;
341
342 while ((c = scanc()) != EOF) {
343 if (escaped) {
344 escaped = 0;
345 if (c == '\n')
346 continue;
347 add_tok(get_escaped(c));
348 continue;
349 }
350 if (c == esc_char) {
351 escaped = 1;
352 continue;
353 }
354 if (c == '\n') { /* well that's strange! */
355 yyerror(_("unterminated symbolic name"));
356 continue;
357 }
358 if (c == '>') { /* end of symbol */
359
360 /*
361 * This restarts the token from the beginning
362 * the next time we scan a character. (This
363 * token is complete.)
364 */
365
366 if (token == NULL) {
367 yyerror(_("missing symbolic name"));
368 return (T_NULL);
369 }
370 tokidx = 0;
371
372 /*
373 * A few symbols are handled as keywords outside
374 * of the normal categories.
375 */
376 if (category == T_END) {
377 int i;
378 for (i = 0; symwords[i].name != 0; i++) {
379 if (strcmp(token, symwords[i].name) ==
380 0) {
381 last_kw = symwords[i].id;
382 return (last_kw);
383 }
384 }
385 }
386 /* its an undefined symbol */
387 yylval.token = strdup(token);
388 if (yylval.token == NULL) {
389 perror("malloc");
390 exit(1);
391 }
392 token = NULL;
393 toksz = 0;
394 tokidx = 0;
395 return (T_SYMBOL);
396 }
397 add_tok(c);
398 }
399
400 yyerror(_("unterminated symbolic name"));
401 return (EOF);
402 }
403
404
405 static int
consume_token(void)406 consume_token(void)
407 {
408 int len = tokidx;
409 int i;
410
411 tokidx = 0;
412 if (token == NULL)
413 return (T_NULL);
414
415 /*
416 * this one is special, because we don't want it to alter the
417 * last_kw field.
418 */
419 if (strcmp(token, "...") == 0) {
420 return (T_ELLIPSIS);
421 }
422
423 /* search for reserved words first */
424 for (i = 0; keywords[i].name; i++) {
425 int j;
426 if (strcmp(keywords[i].name, token) != 0) {
427 continue;
428 }
429
430 last_kw = keywords[i].id;
431
432 /* clear the top level category if we're done with it */
433 if (last_kw == T_END) {
434 category = T_END;
435 }
436
437 /* set the top level category if we're changing */
438 for (j = 0; categories[j]; j++) {
439 if (categories[j] != last_kw)
440 continue;
441 category = last_kw;
442 }
443
444 return (keywords[i].id);
445 }
446
447 /* maybe its a numeric constant? */
448 if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) {
449 char *eptr;
450 yylval.num = strtol(token, &eptr, 10);
451 if (*eptr != 0)
452 yyerror(_("malformed number"));
453 return (T_NUMBER);
454 }
455
456 /*
457 * A single lone character is treated as a character literal.
458 * To avoid duplication of effort, we stick in the charmap.
459 */
460 if (len == 1) {
461 yylval.mbs[0] = 1; /* length */
462 yylval.mbs[1] = token[0];
463 yylval.mbs[2] = '\0';
464 return (T_CHAR);
465 }
466
467 /* anything else is treated as a symbolic name */
468 yylval.token = strdup(token);
469 token = NULL;
470 toksz = 0;
471 tokidx = 0;
472 return (T_NAME);
473 }
474
/*
 * Discard input up to and including the next newline.  Reaching the end
 * of the input first is reported through errf().
 */
void
scan_to_eol(void)
{
	int ch;

	for (;;) {
		ch = scanc();
		if (ch == '\n')
			break;
		if (ch == EOF) {
			/* end of file without newline! */
			errf(_("missing newline"));
			return;
		}
	}
	assert(ch == '\n');
}
488
/*
 * Lexer entry point called by the parser.  Reads characters with scanc()
 * and returns the id of the next token, or EOF once the input is
 * exhausted.  Token values are passed back through yylval.
 *
 * Three pieces of file-level state drive the loop:
 *   instring - inside a double-quoted string, every character is its own
 *              T_CHAR token (apart from '<', '>' and the closing quote).
 *   escaped  - the previous character was the escape character.
 *   hadtok   - the current line has produced at least one token; lines
 *              that produced none do not generate a T_NL.
 *
 * NOTE(review): the numeric-escape test string "xXd01234567" has no 'D',
 * although get_byte() accepts both 'd' and 'D' -- confirm whether that
 * difference is intended.
 * NOTE(review): strchr() also matches the string terminator, so a NUL
 * byte in the input satisfies both strchr() tests below -- confirm that
 * NUL bytes cannot occur in the input.
 */
int
yylex(void)
{
	int		c;

	while ((c = scanc()) != EOF) {

		/* special handling for quoted string */
		if (instring) {
			if (escaped) {
				escaped = 0;

				/* if newline, just eat and forget it */
				if (c == '\n')
					continue;

				/*
				 * Numeric escape: put both characters back
				 * so get_wide() can read the whole sequence.
				 */
				if (strchr("xXd01234567", c)) {
					unscanc(c);
					unscanc(esc_char);
					return (get_wide());
				}
				yylval.mbs[0] = 1; /* length */
				yylval.mbs[1] = get_escaped(c);
				yylval.mbs[2] = '\0';
				return (T_CHAR);
			}
			if (c == esc_char) {
				escaped = 1;
				continue;
			}
			switch (c) {
			case '<':
				return (get_symbol());
			case '>':
				/* oops! should generate syntax error  */
				return (T_GT);
			case '"':
				instring = 0;
				return (T_QUOTE);
			default:
				yylval.mbs[0] = 1; /* length */
				yylval.mbs[1] = c;
				yylval.mbs[2] = '\0';
				return (T_CHAR);
			}
		}

		/* escaped characters first */
		if (escaped) {
			escaped = 0;
			if (c == '\n') {
				/* eat the newline */
				continue;
			}
			hadtok = 1;
			if (tokidx) {
				/* an escape mid-token is nonsense */
				return (T_NULL);
			}

			/* numeric escapes are treated as wide characters */
			if (strchr("xXd01234567", c)) {
				unscanc(c);
				unscanc(esc_char);
				return (get_wide());
			}

			add_tok(get_escaped(c));
			continue;
		}

		/* if it is the escape character itself note it */
		if (c == esc_char) {
			escaped = 1;
			continue;
		}

		/* remove from the comment char to end of line */
		if (c == com_char) {
			while (c != '\n') {
				if ((c = scanc()) == EOF) {
					/* end of file without newline! */
					return (EOF);
				}
			}
			assert(c == '\n');
			if (!hadtok) {
				/*
				 * If there were no tokens on this line,
				 * then just pretend it didn't exist at all.
				 */
				continue;
			}
			hadtok = 0;
			return (T_NL);
		}

		if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
			/*
			 * These are all token delimiters.  If there
			 * is a token already in progress, we need to
			 * process it.  The delimiter is pushed back and
			 * handled on the next call.
			 */
			unscanc(c);
			return (consume_token());
		}

		switch (c) {
		case '\n':
			if (!hadtok) {
				/*
				 * If the line was completely devoid of tokens,
				 * then just ignore it.
				 */
				continue;
			}
			/* we're starting a new line, reset the token state */
			hadtok = 0;
			return (T_NL);
		case ',':
			hadtok = 1;
			return (T_COMMA);
		case ';':
			hadtok = 1;
			return (T_SEMI);
		case '(':
			hadtok = 1;
			return (T_LPAREN);
		case ')':
			hadtok = 1;
			return (T_RPAREN);
		case '>':
			hadtok = 1;
			return (T_GT);
		case '<':
			/* symbol start! */
			hadtok = 1;
			return (get_symbol());
		case ' ':
		case '\t':
			/* whitespace, just ignore it */
			continue;
		case '"':
			hadtok = 1;
			instring = 1;
			return (T_QUOTE);
		default:
			/* ordinary character: accumulate it into the token */
			hadtok = 1;
			add_tok(c);
			continue;
		}
	}
	return (EOF);
}
643
644 void
yyerror(const char * msg)645 yyerror(const char *msg)
646 {
647 (void) fprintf(stderr, _("%s: %d: error: %s\n"),
648 filename, lineno, msg);
649 exit(1);
650 }
651
652 void
errf(const char * fmt,...)653 errf(const char *fmt, ...)
654 {
655 char *msg;
656
657 va_list va;
658 va_start(va, fmt);
659 (void) vasprintf(&msg, fmt, va);
660 va_end(va);
661
662 (void) fprintf(stderr, _("%s: %d: error: %s\n"),
663 filename, lineno, msg);
664 free(msg);
665 exit(1);
666 }
667
668 void
warn(const char * fmt,...)669 warn(const char *fmt, ...)
670 {
671 char *msg;
672
673 va_list va;
674 va_start(va, fmt);
675 (void) vasprintf(&msg, fmt, va);
676 va_end(va);
677
678 (void) fprintf(stderr, _("%s: %d: warning: %s\n"),
679 filename, lineno, msg);
680 free(msg);
681 warnings++;
682 }
683