1*48edc7cfSGordon Ross /*
2*48edc7cfSGordon Ross * This file and its contents are supplied under the terms of the
3*48edc7cfSGordon Ross * Common Development and Distribution License ("CDDL"), version 1.0.
4*48edc7cfSGordon Ross * You may only use this file in accordance with the terms of version
5*48edc7cfSGordon Ross * 1.0 of the CDDL.
6*48edc7cfSGordon Ross *
7*48edc7cfSGordon Ross * A full copy of the text of the CDDL should have accompanied this
8*48edc7cfSGordon Ross * source. A copy of the CDDL is also available via the Internet at
9*48edc7cfSGordon Ross * http://www.illumos.org/license/CDDL.
10*48edc7cfSGordon Ross */
11*48edc7cfSGordon Ross
12*48edc7cfSGordon Ross /*
13*48edc7cfSGordon Ross * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
14*48edc7cfSGordon Ross */
15*48edc7cfSGordon Ross
16*48edc7cfSGordon Ross /*
17*48edc7cfSGordon Ross * This file contains the "scanner", which tokenizes charmap files
18*48edc7cfSGordon Ross * for iconv for processing by the higher level grammar processor.
19*48edc7cfSGordon Ross */
20*48edc7cfSGordon Ross
21*48edc7cfSGordon Ross #include <stdio.h>
22*48edc7cfSGordon Ross #include <stdlib.h>
23*48edc7cfSGordon Ross #include <ctype.h>
24*48edc7cfSGordon Ross #include <limits.h>
25*48edc7cfSGordon Ross #include <string.h>
26*48edc7cfSGordon Ross #include <widec.h>
27*48edc7cfSGordon Ross #include <sys/types.h>
28*48edc7cfSGordon Ross #include <assert.h>
29*48edc7cfSGordon Ross #include "charmap.h"
30*48edc7cfSGordon Ross #include "parser.tab.h"
31*48edc7cfSGordon Ross
32*48edc7cfSGordon Ross int com_char = '#';
33*48edc7cfSGordon Ross int esc_char = '\\';
34*48edc7cfSGordon Ross int mb_cur_min = 1;
35*48edc7cfSGordon Ross int mb_cur_max = MB_LEN_MAX;
36*48edc7cfSGordon Ross int lineno = 1;
37*48edc7cfSGordon Ross int warnings = 0;
38*48edc7cfSGordon Ross static int nextline;
39*48edc7cfSGordon Ross static FILE *input = stdin;
40*48edc7cfSGordon Ross static const char *filename = "<stdin>";
41*48edc7cfSGordon Ross static int instring = 0;
42*48edc7cfSGordon Ross static int escaped = 0;
43*48edc7cfSGordon Ross
/*
 * Buffer in which the current token is accumulated.  add_tok() enlarges
 * it whenever another character would not fit.
 */
static char *token = NULL;	/* NUL-terminated token text */
static int tokidx;		/* number of characters stored so far */
static int toksz = 0;		/* bytes currently allocated for token */
static int hadtok = 0;		/* nonzero once this line produced a token */
51*48edc7cfSGordon Ross
52*48edc7cfSGordon Ross /*
53*48edc7cfSGordon Ross * The last keyword seen. This is useful to trigger the special lexer rules
54*48edc7cfSGordon Ross * for "copy" and also collating symbols and elements.
55*48edc7cfSGordon Ross */
56*48edc7cfSGordon Ross int last_kw = 0;
57*48edc7cfSGordon Ross static int category = T_END;
58*48edc7cfSGordon Ross
59*48edc7cfSGordon Ross static struct token {
60*48edc7cfSGordon Ross int id;
61*48edc7cfSGordon Ross const char *name;
62*48edc7cfSGordon Ross } keywords[] = {
63*48edc7cfSGordon Ross { T_COM_CHAR, "comment_char" },
64*48edc7cfSGordon Ross { T_ESC_CHAR, "escape_char" },
65*48edc7cfSGordon Ross { T_END, "END" },
66*48edc7cfSGordon Ross
67*48edc7cfSGordon Ross /*
68*48edc7cfSGordon Ross * These are keywords used in the charmap file. Note that
69*48edc7cfSGordon Ross * Solaris orginally used angle brackets to wrap some of them,
70*48edc7cfSGordon Ross * but we removed that to simplify our parser. The first of these
71*48edc7cfSGordon Ross * items are "global items."
72*48edc7cfSGordon Ross */
73*48edc7cfSGordon Ross { T_CHARMAP, "CHARMAP" },
74*48edc7cfSGordon Ross { T_WIDTH, "WIDTH" },
75*48edc7cfSGordon Ross { T_WIDTH_DEFAULT, "WIDTH_DEFAULT" },
76*48edc7cfSGordon Ross
77*48edc7cfSGordon Ross { -1, NULL },
78*48edc7cfSGordon Ross };
79*48edc7cfSGordon Ross
80*48edc7cfSGordon Ross /*
81*48edc7cfSGordon Ross * These special words are only used in a charmap file, enclosed in <>.
82*48edc7cfSGordon Ross */
83*48edc7cfSGordon Ross static struct token symwords[] = {
84*48edc7cfSGordon Ross { T_COM_CHAR, "comment_char" },
85*48edc7cfSGordon Ross { T_ESC_CHAR, "escape_char" },
86*48edc7cfSGordon Ross { T_CODE_SET, "code_set_name" },
87*48edc7cfSGordon Ross { T_MB_CUR_MAX, "mb_cur_max" },
88*48edc7cfSGordon Ross { T_MB_CUR_MIN, "mb_cur_min" },
89*48edc7cfSGordon Ross { -1, NULL },
90*48edc7cfSGordon Ross };
91*48edc7cfSGordon Ross
92*48edc7cfSGordon Ross static int categories[] = {
93*48edc7cfSGordon Ross T_CHARMAP,
94*48edc7cfSGordon Ross 0
95*48edc7cfSGordon Ross };
96*48edc7cfSGordon Ross
97*48edc7cfSGordon Ross void
reset_scanner(const char * fname)98*48edc7cfSGordon Ross reset_scanner(const char *fname)
99*48edc7cfSGordon Ross {
100*48edc7cfSGordon Ross if (fname == NULL) {
101*48edc7cfSGordon Ross filename = "<stdin>";
102*48edc7cfSGordon Ross input = stdin;
103*48edc7cfSGordon Ross } else {
104*48edc7cfSGordon Ross if (input != stdin)
105*48edc7cfSGordon Ross (void) fclose(input);
106*48edc7cfSGordon Ross if ((input = fopen(fname, "r")) == NULL) {
107*48edc7cfSGordon Ross perror(fname);
108*48edc7cfSGordon Ross exit(1);
109*48edc7cfSGordon Ross }
110*48edc7cfSGordon Ross filename = fname;
111*48edc7cfSGordon Ross }
112*48edc7cfSGordon Ross com_char = '#';
113*48edc7cfSGordon Ross esc_char = '\\';
114*48edc7cfSGordon Ross instring = 0;
115*48edc7cfSGordon Ross escaped = 0;
116*48edc7cfSGordon Ross lineno = 1;
117*48edc7cfSGordon Ross nextline = 1;
118*48edc7cfSGordon Ross tokidx = 0;
119*48edc7cfSGordon Ross last_kw = 0;
120*48edc7cfSGordon Ross category = T_END;
121*48edc7cfSGordon Ross }
122*48edc7cfSGordon Ross
/*
 * hex(x) yields the numeric value (0-15) of the hexadecimal digit
 * character x.  The caller must already have validated x with
 * isxdigit().  isodigit(x) is nonzero when x is an octal digit
 * character ('0' through '7').
 *
 * Both macros evaluate their argument more than once, so the argument
 * must be free of side effects.  Every use of the argument is
 * parenthesized so that an expression argument expands correctly.
 */
#define	hex(x)	\
	(isdigit(x) ? ((x) - '0') : \
	((islower(x) ? ((x) - 'a') : ((x) - 'A')) + 10))
#define	isodigit(x)	(((x) >= '0') && ((x) <= '7'))
126*48edc7cfSGordon Ross
127*48edc7cfSGordon Ross static int
scanc(void)128*48edc7cfSGordon Ross scanc(void)
129*48edc7cfSGordon Ross {
130*48edc7cfSGordon Ross int c;
131*48edc7cfSGordon Ross
132*48edc7cfSGordon Ross c = getc(input);
133*48edc7cfSGordon Ross lineno = nextline;
134*48edc7cfSGordon Ross if (c == '\n') {
135*48edc7cfSGordon Ross nextline++;
136*48edc7cfSGordon Ross }
137*48edc7cfSGordon Ross return (c);
138*48edc7cfSGordon Ross }
139*48edc7cfSGordon Ross
140*48edc7cfSGordon Ross static void
unscanc(int c)141*48edc7cfSGordon Ross unscanc(int c)
142*48edc7cfSGordon Ross {
143*48edc7cfSGordon Ross if (c == '\n') {
144*48edc7cfSGordon Ross nextline--;
145*48edc7cfSGordon Ross }
146*48edc7cfSGordon Ross if (ungetc(c, input) < 0) {
147*48edc7cfSGordon Ross yyerror(_("ungetc failed"));
148*48edc7cfSGordon Ross }
149*48edc7cfSGordon Ross }
150*48edc7cfSGordon Ross
/*
 * Read exactly two hexadecimal digits from the input and return the
 * byte value they encode.  A non-hex character is reported through
 * yyerror(); 0 is returned in that case.
 */
static int
scan_hex_byte(void)
{
	int value = 0;
	int digit;
	int i;

	for (i = 0; i < 2; i++) {
		digit = scanc();
		if (!isxdigit(digit)) {
			yyerror(_("malformed hex digit"));
			return (0);
		}
		value = (value << 4) | hex(digit);
	}

	return (value);
}
170*48edc7cfSGordon Ross
/*
 * Read two or three decimal digits from the input and return the value
 * they encode.  The first two digits are mandatory; a non-digit there
 * is reported through yyerror() and 0 is returned.  A third character
 * that is not a digit is pushed back for the next read.
 */
static int
scan_dec_byte(void)
{
	int value = 0;
	int digit;
	int i;

	for (i = 0; i < 2; i++) {
		digit = scanc();
		if (!isdigit(digit)) {
			yyerror(_("malformed decimal digit"));
			return (0);
		}
		value = (value * 10) + (digit - '0');
	}

	/* optional third digit */
	digit = scanc();
	if (isdigit(digit))
		value = (value * 10) + (digit - '0');
	else
		unscanc(digit);

	return (value);
}
199*48edc7cfSGordon Ross
/*
 * Read two or three octal digits from the input and return the value
 * they encode.  The first two digits are mandatory; a non-octal
 * character there is reported through yyerror() and 0 is returned.
 * A third character that is not an octal digit is pushed back for the
 * next read.
 */
static int
scan_oct_byte(void)
{
	int value = 0;
	int digit;
	int i;

	for (i = 0; i < 2; i++) {
		digit = scanc();
		if (!isodigit(digit)) {
			yyerror(_("malformed octal digit"));
			return (0);
		}
		value = (value * 8) + (digit - '0');
	}

	/* optional third digit */
	digit = scanc();
	if (isodigit(digit))
		value = (value * 8) + (digit - '0');
	else
		unscanc(digit);

	return (value);
}
230*48edc7cfSGordon Ross
231*48edc7cfSGordon Ross void
add_tok(int c)232*48edc7cfSGordon Ross add_tok(int c)
233*48edc7cfSGordon Ross {
234*48edc7cfSGordon Ross if ((tokidx + 1) >= toksz) {
235*48edc7cfSGordon Ross toksz += 64;
236*48edc7cfSGordon Ross if ((token = realloc(token, toksz)) == NULL) {
237*48edc7cfSGordon Ross yyerror(_("out of memory"));
238*48edc7cfSGordon Ross tokidx = 0;
239*48edc7cfSGordon Ross toksz = 0;
240*48edc7cfSGordon Ross return;
241*48edc7cfSGordon Ross }
242*48edc7cfSGordon Ross }
243*48edc7cfSGordon Ross
244*48edc7cfSGordon Ross token[tokidx++] = (char)c;
245*48edc7cfSGordon Ross token[tokidx] = 0;
246*48edc7cfSGordon Ross }
247*48edc7cfSGordon Ross
248*48edc7cfSGordon Ross static int
get_byte(void)249*48edc7cfSGordon Ross get_byte(void)
250*48edc7cfSGordon Ross {
251*48edc7cfSGordon Ross int c;
252*48edc7cfSGordon Ross
253*48edc7cfSGordon Ross if ((c = scanc()) != esc_char) {
254*48edc7cfSGordon Ross unscanc(c);
255*48edc7cfSGordon Ross return (EOF);
256*48edc7cfSGordon Ross }
257*48edc7cfSGordon Ross c = scanc();
258*48edc7cfSGordon Ross
259*48edc7cfSGordon Ross switch (c) {
260*48edc7cfSGordon Ross case 'd':
261*48edc7cfSGordon Ross case 'D':
262*48edc7cfSGordon Ross return (scan_dec_byte());
263*48edc7cfSGordon Ross case 'x':
264*48edc7cfSGordon Ross case 'X':
265*48edc7cfSGordon Ross return (scan_hex_byte());
266*48edc7cfSGordon Ross case '0':
267*48edc7cfSGordon Ross case '1':
268*48edc7cfSGordon Ross case '2':
269*48edc7cfSGordon Ross case '3':
270*48edc7cfSGordon Ross case '4':
271*48edc7cfSGordon Ross case '5':
272*48edc7cfSGordon Ross case '6':
273*48edc7cfSGordon Ross case '7':
274*48edc7cfSGordon Ross /* put the character back so we can get it */
275*48edc7cfSGordon Ross unscanc(c);
276*48edc7cfSGordon Ross return (scan_oct_byte());
277*48edc7cfSGordon Ross default:
278*48edc7cfSGordon Ross unscanc(c);
279*48edc7cfSGordon Ross unscanc(esc_char);
280*48edc7cfSGordon Ross return (EOF);
281*48edc7cfSGordon Ross }
282*48edc7cfSGordon Ross }
283*48edc7cfSGordon Ross
/*
 * Translate the character that followed the escape character into the
 * character it stands for: n, r, t, f, v, b and a map to the usual C
 * control characters.  Any other character is returned unchanged.
 */
int
get_escaped(int c)
{
	static const struct {
		int	name;
		int	value;
	} escapes[] = {
		{ 'n', '\n' },
		{ 'r', '\r' },
		{ 't', '\t' },
		{ 'f', '\f' },
		{ 'v', '\v' },
		{ 'b', '\b' },
		{ 'a', '\a' },
	};
	unsigned int i;

	for (i = 0; i < sizeof (escapes) / sizeof (escapes[0]); i++) {
		if (escapes[i].name == c)
			return (escapes[i].value);
	}
	return (c);
}
306*48edc7cfSGordon Ross
307*48edc7cfSGordon Ross int
get_wide(void)308*48edc7cfSGordon Ross get_wide(void)
309*48edc7cfSGordon Ross {
310*48edc7cfSGordon Ross /* NB: yylval.mbs[0] is the length */
311*48edc7cfSGordon Ross char *mbs = &yylval.mbs[1];
312*48edc7cfSGordon Ross int mbi = 0;
313*48edc7cfSGordon Ross int c;
314*48edc7cfSGordon Ross
315*48edc7cfSGordon Ross mbs[mbi] = 0;
316*48edc7cfSGordon Ross if (mb_cur_max > MB_LEN_MAX) {
317*48edc7cfSGordon Ross yyerror(_("max multibyte character size too big"));
318*48edc7cfSGordon Ross return (T_NULL);
319*48edc7cfSGordon Ross }
320*48edc7cfSGordon Ross for (;;) {
321*48edc7cfSGordon Ross if ((c = get_byte()) == EOF)
322*48edc7cfSGordon Ross break;
323*48edc7cfSGordon Ross if (mbi == mb_cur_max) {
324*48edc7cfSGordon Ross unscanc(c);
325*48edc7cfSGordon Ross yyerror(_("length > mb_cur_max"));
326*48edc7cfSGordon Ross return (T_NULL);
327*48edc7cfSGordon Ross }
328*48edc7cfSGordon Ross mbs[mbi++] = c;
329*48edc7cfSGordon Ross mbs[mbi] = 0;
330*48edc7cfSGordon Ross }
331*48edc7cfSGordon Ross
332*48edc7cfSGordon Ross /* result in yylval.mbs */
333*48edc7cfSGordon Ross mbs[-1] = mbi;
334*48edc7cfSGordon Ross return (T_CHAR);
335*48edc7cfSGordon Ross }
336*48edc7cfSGordon Ross
337*48edc7cfSGordon Ross int
get_symbol(void)338*48edc7cfSGordon Ross get_symbol(void)
339*48edc7cfSGordon Ross {
340*48edc7cfSGordon Ross int c;
341*48edc7cfSGordon Ross
342*48edc7cfSGordon Ross while ((c = scanc()) != EOF) {
343*48edc7cfSGordon Ross if (escaped) {
344*48edc7cfSGordon Ross escaped = 0;
345*48edc7cfSGordon Ross if (c == '\n')
346*48edc7cfSGordon Ross continue;
347*48edc7cfSGordon Ross add_tok(get_escaped(c));
348*48edc7cfSGordon Ross continue;
349*48edc7cfSGordon Ross }
350*48edc7cfSGordon Ross if (c == esc_char) {
351*48edc7cfSGordon Ross escaped = 1;
352*48edc7cfSGordon Ross continue;
353*48edc7cfSGordon Ross }
354*48edc7cfSGordon Ross if (c == '\n') { /* well that's strange! */
355*48edc7cfSGordon Ross yyerror(_("unterminated symbolic name"));
356*48edc7cfSGordon Ross continue;
357*48edc7cfSGordon Ross }
358*48edc7cfSGordon Ross if (c == '>') { /* end of symbol */
359*48edc7cfSGordon Ross
360*48edc7cfSGordon Ross /*
361*48edc7cfSGordon Ross * This restarts the token from the beginning
362*48edc7cfSGordon Ross * the next time we scan a character. (This
363*48edc7cfSGordon Ross * token is complete.)
364*48edc7cfSGordon Ross */
365*48edc7cfSGordon Ross
366*48edc7cfSGordon Ross if (token == NULL) {
367*48edc7cfSGordon Ross yyerror(_("missing symbolic name"));
368*48edc7cfSGordon Ross return (T_NULL);
369*48edc7cfSGordon Ross }
370*48edc7cfSGordon Ross tokidx = 0;
371*48edc7cfSGordon Ross
372*48edc7cfSGordon Ross /*
373*48edc7cfSGordon Ross * A few symbols are handled as keywords outside
374*48edc7cfSGordon Ross * of the normal categories.
375*48edc7cfSGordon Ross */
376*48edc7cfSGordon Ross if (category == T_END) {
377*48edc7cfSGordon Ross int i;
378*48edc7cfSGordon Ross for (i = 0; symwords[i].name != 0; i++) {
379*48edc7cfSGordon Ross if (strcmp(token, symwords[i].name) ==
380*48edc7cfSGordon Ross 0) {
381*48edc7cfSGordon Ross last_kw = symwords[i].id;
382*48edc7cfSGordon Ross return (last_kw);
383*48edc7cfSGordon Ross }
384*48edc7cfSGordon Ross }
385*48edc7cfSGordon Ross }
386*48edc7cfSGordon Ross /* its an undefined symbol */
387*48edc7cfSGordon Ross yylval.token = strdup(token);
388*48edc7cfSGordon Ross if (yylval.token == NULL) {
389*48edc7cfSGordon Ross perror("malloc");
390*48edc7cfSGordon Ross exit(1);
391*48edc7cfSGordon Ross }
392*48edc7cfSGordon Ross token = NULL;
393*48edc7cfSGordon Ross toksz = 0;
394*48edc7cfSGordon Ross tokidx = 0;
395*48edc7cfSGordon Ross return (T_SYMBOL);
396*48edc7cfSGordon Ross }
397*48edc7cfSGordon Ross add_tok(c);
398*48edc7cfSGordon Ross }
399*48edc7cfSGordon Ross
400*48edc7cfSGordon Ross yyerror(_("unterminated symbolic name"));
401*48edc7cfSGordon Ross return (EOF);
402*48edc7cfSGordon Ross }
403*48edc7cfSGordon Ross
404*48edc7cfSGordon Ross
405*48edc7cfSGordon Ross static int
consume_token(void)406*48edc7cfSGordon Ross consume_token(void)
407*48edc7cfSGordon Ross {
408*48edc7cfSGordon Ross int len = tokidx;
409*48edc7cfSGordon Ross int i;
410*48edc7cfSGordon Ross
411*48edc7cfSGordon Ross tokidx = 0;
412*48edc7cfSGordon Ross if (token == NULL)
413*48edc7cfSGordon Ross return (T_NULL);
414*48edc7cfSGordon Ross
415*48edc7cfSGordon Ross /*
416*48edc7cfSGordon Ross * this one is special, because we don't want it to alter the
417*48edc7cfSGordon Ross * last_kw field.
418*48edc7cfSGordon Ross */
419*48edc7cfSGordon Ross if (strcmp(token, "...") == 0) {
420*48edc7cfSGordon Ross return (T_ELLIPSIS);
421*48edc7cfSGordon Ross }
422*48edc7cfSGordon Ross
423*48edc7cfSGordon Ross /* search for reserved words first */
424*48edc7cfSGordon Ross for (i = 0; keywords[i].name; i++) {
425*48edc7cfSGordon Ross int j;
426*48edc7cfSGordon Ross if (strcmp(keywords[i].name, token) != 0) {
427*48edc7cfSGordon Ross continue;
428*48edc7cfSGordon Ross }
429*48edc7cfSGordon Ross
430*48edc7cfSGordon Ross last_kw = keywords[i].id;
431*48edc7cfSGordon Ross
432*48edc7cfSGordon Ross /* clear the top level category if we're done with it */
433*48edc7cfSGordon Ross if (last_kw == T_END) {
434*48edc7cfSGordon Ross category = T_END;
435*48edc7cfSGordon Ross }
436*48edc7cfSGordon Ross
437*48edc7cfSGordon Ross /* set the top level category if we're changing */
438*48edc7cfSGordon Ross for (j = 0; categories[j]; j++) {
439*48edc7cfSGordon Ross if (categories[j] != last_kw)
440*48edc7cfSGordon Ross continue;
441*48edc7cfSGordon Ross category = last_kw;
442*48edc7cfSGordon Ross }
443*48edc7cfSGordon Ross
444*48edc7cfSGordon Ross return (keywords[i].id);
445*48edc7cfSGordon Ross }
446*48edc7cfSGordon Ross
447*48edc7cfSGordon Ross /* maybe its a numeric constant? */
448*48edc7cfSGordon Ross if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) {
449*48edc7cfSGordon Ross char *eptr;
450*48edc7cfSGordon Ross yylval.num = strtol(token, &eptr, 10);
451*48edc7cfSGordon Ross if (*eptr != 0)
452*48edc7cfSGordon Ross yyerror(_("malformed number"));
453*48edc7cfSGordon Ross return (T_NUMBER);
454*48edc7cfSGordon Ross }
455*48edc7cfSGordon Ross
456*48edc7cfSGordon Ross /*
457*48edc7cfSGordon Ross * A single lone character is treated as a character literal.
458*48edc7cfSGordon Ross * To avoid duplication of effort, we stick in the charmap.
459*48edc7cfSGordon Ross */
460*48edc7cfSGordon Ross if (len == 1) {
461*48edc7cfSGordon Ross yylval.mbs[0] = 1; /* length */
462*48edc7cfSGordon Ross yylval.mbs[1] = token[0];
463*48edc7cfSGordon Ross yylval.mbs[2] = '\0';
464*48edc7cfSGordon Ross return (T_CHAR);
465*48edc7cfSGordon Ross }
466*48edc7cfSGordon Ross
467*48edc7cfSGordon Ross /* anything else is treated as a symbolic name */
468*48edc7cfSGordon Ross yylval.token = strdup(token);
469*48edc7cfSGordon Ross token = NULL;
470*48edc7cfSGordon Ross toksz = 0;
471*48edc7cfSGordon Ross tokidx = 0;
472*48edc7cfSGordon Ross return (T_NAME);
473*48edc7cfSGordon Ross }
474*48edc7cfSGordon Ross
/*
 * Discard input up to and including the next newline.  Reaching end of
 * input first is reported through errf() as a missing newline.
 */
void
scan_to_eol(void)
{
	int ch;

	for (;;) {
		ch = scanc();
		if (ch == '\n')
			break;
		if (ch == EOF) {
			/* end of file without newline! */
			errf(_("missing newline"));
			return;
		}
	}
	assert(ch == '\n');
}
488*48edc7cfSGordon Ross
489*48edc7cfSGordon Ross int
yylex(void)490*48edc7cfSGordon Ross yylex(void)
491*48edc7cfSGordon Ross {
492*48edc7cfSGordon Ross int c;
493*48edc7cfSGordon Ross
494*48edc7cfSGordon Ross while ((c = scanc()) != EOF) {
495*48edc7cfSGordon Ross
496*48edc7cfSGordon Ross /* special handling for quoted string */
497*48edc7cfSGordon Ross if (instring) {
498*48edc7cfSGordon Ross if (escaped) {
499*48edc7cfSGordon Ross escaped = 0;
500*48edc7cfSGordon Ross
501*48edc7cfSGordon Ross /* if newline, just eat and forget it */
502*48edc7cfSGordon Ross if (c == '\n')
503*48edc7cfSGordon Ross continue;
504*48edc7cfSGordon Ross
505*48edc7cfSGordon Ross if (strchr("xXd01234567", c)) {
506*48edc7cfSGordon Ross unscanc(c);
507*48edc7cfSGordon Ross unscanc(esc_char);
508*48edc7cfSGordon Ross return (get_wide());
509*48edc7cfSGordon Ross }
510*48edc7cfSGordon Ross yylval.mbs[0] = 1; /* length */
511*48edc7cfSGordon Ross yylval.mbs[1] = get_escaped(c);
512*48edc7cfSGordon Ross yylval.mbs[2] = '\0';
513*48edc7cfSGordon Ross return (T_CHAR);
514*48edc7cfSGordon Ross }
515*48edc7cfSGordon Ross if (c == esc_char) {
516*48edc7cfSGordon Ross escaped = 1;
517*48edc7cfSGordon Ross continue;
518*48edc7cfSGordon Ross }
519*48edc7cfSGordon Ross switch (c) {
520*48edc7cfSGordon Ross case '<':
521*48edc7cfSGordon Ross return (get_symbol());
522*48edc7cfSGordon Ross case '>':
523*48edc7cfSGordon Ross /* oops! should generate syntax error */
524*48edc7cfSGordon Ross return (T_GT);
525*48edc7cfSGordon Ross case '"':
526*48edc7cfSGordon Ross instring = 0;
527*48edc7cfSGordon Ross return (T_QUOTE);
528*48edc7cfSGordon Ross default:
529*48edc7cfSGordon Ross yylval.mbs[0] = 1; /* length */
530*48edc7cfSGordon Ross yylval.mbs[1] = c;
531*48edc7cfSGordon Ross yylval.mbs[2] = '\0';
532*48edc7cfSGordon Ross return (T_CHAR);
533*48edc7cfSGordon Ross }
534*48edc7cfSGordon Ross }
535*48edc7cfSGordon Ross
536*48edc7cfSGordon Ross /* escaped characters first */
537*48edc7cfSGordon Ross if (escaped) {
538*48edc7cfSGordon Ross escaped = 0;
539*48edc7cfSGordon Ross if (c == '\n') {
540*48edc7cfSGordon Ross /* eat the newline */
541*48edc7cfSGordon Ross continue;
542*48edc7cfSGordon Ross }
543*48edc7cfSGordon Ross hadtok = 1;
544*48edc7cfSGordon Ross if (tokidx) {
545*48edc7cfSGordon Ross /* an escape mid-token is nonsense */
546*48edc7cfSGordon Ross return (T_NULL);
547*48edc7cfSGordon Ross }
548*48edc7cfSGordon Ross
549*48edc7cfSGordon Ross /* numeric escapes are treated as wide characters */
550*48edc7cfSGordon Ross if (strchr("xXd01234567", c)) {
551*48edc7cfSGordon Ross unscanc(c);
552*48edc7cfSGordon Ross unscanc(esc_char);
553*48edc7cfSGordon Ross return (get_wide());
554*48edc7cfSGordon Ross }
555*48edc7cfSGordon Ross
556*48edc7cfSGordon Ross add_tok(get_escaped(c));
557*48edc7cfSGordon Ross continue;
558*48edc7cfSGordon Ross }
559*48edc7cfSGordon Ross
560*48edc7cfSGordon Ross /* if it is the escape charter itself note it */
561*48edc7cfSGordon Ross if (c == esc_char) {
562*48edc7cfSGordon Ross escaped = 1;
563*48edc7cfSGordon Ross continue;
564*48edc7cfSGordon Ross }
565*48edc7cfSGordon Ross
566*48edc7cfSGordon Ross /* remove from the comment char to end of line */
567*48edc7cfSGordon Ross if (c == com_char) {
568*48edc7cfSGordon Ross while (c != '\n') {
569*48edc7cfSGordon Ross if ((c = scanc()) == EOF) {
570*48edc7cfSGordon Ross /* end of file without newline! */
571*48edc7cfSGordon Ross return (EOF);
572*48edc7cfSGordon Ross }
573*48edc7cfSGordon Ross }
574*48edc7cfSGordon Ross assert(c == '\n');
575*48edc7cfSGordon Ross if (!hadtok) {
576*48edc7cfSGordon Ross /*
577*48edc7cfSGordon Ross * If there were no tokens on this line,
578*48edc7cfSGordon Ross * then just pretend it didn't exist at all.
579*48edc7cfSGordon Ross */
580*48edc7cfSGordon Ross continue;
581*48edc7cfSGordon Ross }
582*48edc7cfSGordon Ross hadtok = 0;
583*48edc7cfSGordon Ross return (T_NL);
584*48edc7cfSGordon Ross }
585*48edc7cfSGordon Ross
586*48edc7cfSGordon Ross if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
587*48edc7cfSGordon Ross /*
588*48edc7cfSGordon Ross * These are all token delimiters. If there
589*48edc7cfSGordon Ross * is a token already in progress, we need to
590*48edc7cfSGordon Ross * process it.
591*48edc7cfSGordon Ross */
592*48edc7cfSGordon Ross unscanc(c);
593*48edc7cfSGordon Ross return (consume_token());
594*48edc7cfSGordon Ross }
595*48edc7cfSGordon Ross
596*48edc7cfSGordon Ross switch (c) {
597*48edc7cfSGordon Ross case '\n':
598*48edc7cfSGordon Ross if (!hadtok) {
599*48edc7cfSGordon Ross /*
600*48edc7cfSGordon Ross * If the line was completely devoid of tokens,
601*48edc7cfSGordon Ross * then just ignore it.
602*48edc7cfSGordon Ross */
603*48edc7cfSGordon Ross continue;
604*48edc7cfSGordon Ross }
605*48edc7cfSGordon Ross /* we're starting a new line, reset the token state */
606*48edc7cfSGordon Ross hadtok = 0;
607*48edc7cfSGordon Ross return (T_NL);
608*48edc7cfSGordon Ross case ',':
609*48edc7cfSGordon Ross hadtok = 1;
610*48edc7cfSGordon Ross return (T_COMMA);
611*48edc7cfSGordon Ross case ';':
612*48edc7cfSGordon Ross hadtok = 1;
613*48edc7cfSGordon Ross return (T_SEMI);
614*48edc7cfSGordon Ross case '(':
615*48edc7cfSGordon Ross hadtok = 1;
616*48edc7cfSGordon Ross return (T_LPAREN);
617*48edc7cfSGordon Ross case ')':
618*48edc7cfSGordon Ross hadtok = 1;
619*48edc7cfSGordon Ross return (T_RPAREN);
620*48edc7cfSGordon Ross case '>':
621*48edc7cfSGordon Ross hadtok = 1;
622*48edc7cfSGordon Ross return (T_GT);
623*48edc7cfSGordon Ross case '<':
624*48edc7cfSGordon Ross /* symbol start! */
625*48edc7cfSGordon Ross hadtok = 1;
626*48edc7cfSGordon Ross return (get_symbol());
627*48edc7cfSGordon Ross case ' ':
628*48edc7cfSGordon Ross case '\t':
629*48edc7cfSGordon Ross /* whitespace, just ignore it */
630*48edc7cfSGordon Ross continue;
631*48edc7cfSGordon Ross case '"':
632*48edc7cfSGordon Ross hadtok = 1;
633*48edc7cfSGordon Ross instring = 1;
634*48edc7cfSGordon Ross return (T_QUOTE);
635*48edc7cfSGordon Ross default:
636*48edc7cfSGordon Ross hadtok = 1;
637*48edc7cfSGordon Ross add_tok(c);
638*48edc7cfSGordon Ross continue;
639*48edc7cfSGordon Ross }
640*48edc7cfSGordon Ross }
641*48edc7cfSGordon Ross return (EOF);
642*48edc7cfSGordon Ross }
643*48edc7cfSGordon Ross
644*48edc7cfSGordon Ross void
yyerror(const char * msg)645*48edc7cfSGordon Ross yyerror(const char *msg)
646*48edc7cfSGordon Ross {
647*48edc7cfSGordon Ross (void) fprintf(stderr, _("%s: %d: error: %s\n"),
648*48edc7cfSGordon Ross filename, lineno, msg);
649*48edc7cfSGordon Ross exit(1);
650*48edc7cfSGordon Ross }
651*48edc7cfSGordon Ross
652*48edc7cfSGordon Ross void
errf(const char * fmt,...)653*48edc7cfSGordon Ross errf(const char *fmt, ...)
654*48edc7cfSGordon Ross {
655*48edc7cfSGordon Ross char *msg;
656*48edc7cfSGordon Ross
657*48edc7cfSGordon Ross va_list va;
658*48edc7cfSGordon Ross va_start(va, fmt);
659*48edc7cfSGordon Ross (void) vasprintf(&msg, fmt, va);
660*48edc7cfSGordon Ross va_end(va);
661*48edc7cfSGordon Ross
662*48edc7cfSGordon Ross (void) fprintf(stderr, _("%s: %d: error: %s\n"),
663*48edc7cfSGordon Ross filename, lineno, msg);
664*48edc7cfSGordon Ross free(msg);
665*48edc7cfSGordon Ross exit(1);
666*48edc7cfSGordon Ross }
667*48edc7cfSGordon Ross
668*48edc7cfSGordon Ross void
warn(const char * fmt,...)669*48edc7cfSGordon Ross warn(const char *fmt, ...)
670*48edc7cfSGordon Ross {
671*48edc7cfSGordon Ross char *msg;
672*48edc7cfSGordon Ross
673*48edc7cfSGordon Ross va_list va;
674*48edc7cfSGordon Ross va_start(va, fmt);
675*48edc7cfSGordon Ross (void) vasprintf(&msg, fmt, va);
676*48edc7cfSGordon Ross va_end(va);
677*48edc7cfSGordon Ross
678*48edc7cfSGordon Ross (void) fprintf(stderr, _("%s: %d: warning: %s\n"),
679*48edc7cfSGordon Ross filename, lineno, msg);
680*48edc7cfSGordon Ross free(msg);
681*48edc7cfSGordon Ross warnings++;
682*48edc7cfSGordon Ross }
683