xref: /illumos-gate/usr/src/tools/smatch/src/tokenize.c (revision 6523a3aa)
1 /*
2  * This is a really stupid C tokenizer. It doesn't do any include
3  * files or anything complex at all. That's the preprocessor.
4  *
5  * Copyright (C) 2003 Transmeta Corp.
6  *               2003 Linus Torvalds
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a copy
9  * of this software and associated documentation files (the "Software"), to deal
10  * in the Software without restriction, including without limitation the rights
11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12  * copies of the Software, and to permit persons to whom the Software is
13  * furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice shall be included in
16  * all copies or substantial portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24  * THE SOFTWARE.
25  */
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <stdarg.h>
29 #include <stddef.h>
30 #include <string.h>
31 #include <ctype.h>
32 #include <unistd.h>
33 #include <stdint.h>
34 
35 #include "lib.h"
36 #include "allocate.h"
37 #include "token.h"
38 #include "symbol.h"
39 
#define EOF (-1)

/* Number of entries of input_streams[] handed out so far by init_stream(). */
int input_stream_nr = 0;
/* Table of stream descriptors; grown with realloc() in init_stream(). */
struct stream *input_streams;
/* Capacity of input_streams[], counted in entries. */
static int input_streams_allocated;
/* Tab width used by nextchar_slow() when advancing the column over '\t'. */
unsigned int tabstop = 8;
/* When non-zero, stream_pos() reports the fixed line number 123456. */
int no_lineno = 0;

#define BUFSIZE (8192)

/* Tokenizer state for one input stream. */
typedef struct {
	int fd;				/* descriptor handed to read(); negative: never refill */
	int offset;			/* index of the next unread byte in buffer */
	int size;			/* number of valid bytes in buffer */
	int pos;			/* column counter copied into token positions */
	int line;			/* line counter copied into token positions */
	int nr;				/* stream number copied into token positions */
	int newline;			/* flag copied into token positions */
	int whitespace;			/* flag copied into token positions */
	struct token **tokenlist;	/* link where add_token() stores the next token */
	struct token *token;		/* token currently being built */
	unsigned char *buffer;		/* input bytes */
} stream_t;
58 
stream_name(int stream)59 const char *stream_name(int stream)
60 {
61 	if (stream < 0 || stream > input_stream_nr)
62 		return "<bad stream>";
63 	return input_streams[stream].name;
64 }
65 
/*
 * Build a struct position describing where @stream currently is.
 * The line number is replaced by the constant 123456 when no_lineno is set.
 */
static struct position stream_pos(stream_t *stream)
{
	struct position where;

	where.type = 0;
	where.noexpand = 0;
	where.stream = stream->nr;
	where.line = no_lineno ? 123456 : stream->line;
	where.pos = stream->pos;
	where.newline = stream->newline;
	where.whitespace = stream->whitespace;
	return where;
}
82 
show_special(int val)83 const char *show_special(int val)
84 {
85 	static char buffer[4];
86 
87 	buffer[0] = val;
88 	buffer[1] = 0;
89 	if (val >= SPECIAL_BASE)
90 		strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
91 	return buffer;
92 }
93 
show_ident(const struct ident * ident)94 const char *show_ident(const struct ident *ident)
95 {
96 	static char buff[4][256];
97 	static int n;
98 	char *buffer;
99 
100 	if (!ident)
101 		return "<noident>";
102 	buffer = buff[3 & ++n];
103 	sprintf(buffer, "%.*s", ident->len, ident->name);
104 	return buffer;
105 }
106 
/*
 * Append the source-level spelling of character @c at @ptr and return the
 * new end pointer.
 *
 * Printable characters are copied, preceded by a backslash when they equal
 * @escape or are a backslash themselves.  Newline and tab become \n and \t.
 * Anything else becomes an octal escape, padded to three digits when the
 * following character @next is a digit so the two cannot run together.
 * The octal forms are written with sprintf() and are NUL-terminated; the
 * other forms are not.
 */
static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	int width;

	if (isprint(c)) {
		if (c == escape || c == '\\')
			*ptr++ = '\\';
		*ptr++ = c;
		return ptr;
	}

	*ptr++ = '\\';
	if (c == '\n') {
		*ptr++ = 'n';
		return ptr;
	}
	if (c == '\t') {
		*ptr++ = 't';
		return ptr;
	}

	/* A minimum width of 1 prints exactly what plain "%o" would. */
	width = isdigit(next) ? 3 : 1;
	return ptr + sprintf(ptr, "%0*o", width, c);
}
129 
show_string(const struct string * string)130 const char *show_string(const struct string *string)
131 {
132 	static char buffer[4 * MAX_STRING + 3];
133 	char *ptr;
134 	int i;
135 
136 	if (!string || !string->length)
137 		return "<bad_string>";
138 	ptr = buffer;
139 	*ptr++ = '"';
140 	for (i = 0; i < string->length-1; i++) {
141 		const char *p = string->data + i;
142 		ptr = charstr(ptr, p[0], '"', p[1]);
143 	}
144 	*ptr++ = '"';
145 	*ptr = '\0';
146 	return buffer;
147 }
148 
show_char(const char * s,size_t len,char prefix,char delim)149 static const char *show_char(const char *s, size_t len, char prefix, char delim)
150 {
151 	static char buffer[MAX_STRING + 4];
152 	char *p = buffer;
153 	if (prefix)
154 		*p++ = prefix;
155 	*p++ = delim;
156 	memcpy(p, s, len);
157 	p += len;
158 	*p++ = delim;
159 	*p++ = '\0';
160 	return buffer;
161 }
162 
quote_char(const char * s,size_t len,char prefix,char delim)163 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
164 {
165 	static char buffer[2*MAX_STRING + 6];
166 	size_t i;
167 	char *p = buffer;
168 	if (prefix)
169 		*p++ = prefix;
170 	if (delim == '"')
171 		*p++ = '\\';
172 	*p++ = delim;
173 	for (i = 0; i < len; i++) {
174 		if (s[i] == '"' || s[i] == '\\')
175 			*p++ = '\\';
176 		*p++ = s[i];
177 	}
178 	if (delim == '"')
179 		*p++ = '\\';
180 	*p++ = delim;
181 	*p++ = '\0';
182 	return buffer;
183 }
184 
show_token(const struct token * token)185 const char *show_token(const struct token *token)
186 {
187 	static char buffer[256];
188 
189 	if (!token)
190 		return "<no token>";
191 	switch (token_type(token)) {
192 	case TOKEN_ERROR:
193 		return "syntax error";
194 
195 	case TOKEN_EOF:
196 		return "end-of-input";
197 
198 	case TOKEN_IDENT:
199 		return show_ident(token->ident);
200 
201 	case TOKEN_NUMBER:
202 		return token->number;
203 
204 	case TOKEN_SPECIAL:
205 		return show_special(token->special);
206 
207 	case TOKEN_CHAR:
208 		return show_char(token->string->data,
209 			token->string->length - 1, 0, '\'');
210 	case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
211 		return show_char(token->embedded,
212 			token_type(token) - TOKEN_CHAR, 0, '\'');
213 	case TOKEN_WIDE_CHAR:
214 		return show_char(token->string->data,
215 			token->string->length - 1, 'L', '\'');
216 	case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
217 		return show_char(token->embedded,
218 			token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
219 	case TOKEN_STRING:
220 		return show_char(token->string->data,
221 			token->string->length - 1, 0, '"');
222 	case TOKEN_WIDE_STRING:
223 		return show_char(token->string->data,
224 			token->string->length - 1, 'L', '"');
225 
226 	case TOKEN_STREAMBEGIN:
227 		sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
228 		return buffer;
229 
230 	case TOKEN_STREAMEND:
231 		sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
232 		return buffer;
233 
234 	case TOKEN_UNTAINT:
235 		sprintf(buffer, "<untaint>");
236 		return buffer;
237 
238 	case TOKEN_ARG_COUNT:
239 		sprintf(buffer, "<argcnt>");
240 		return buffer;
241 
242 	default:
243 		sprintf(buffer, "unhandled token type '%d' ", token_type(token));
244 		return buffer;
245 	}
246 }
247 
quote_token(const struct token * token)248 const char *quote_token(const struct token *token)
249 {
250 	static char buffer[256];
251 
252 	switch (token_type(token)) {
253 	case TOKEN_ERROR:
254 		return "syntax error";
255 
256 	case TOKEN_IDENT:
257 		return show_ident(token->ident);
258 
259 	case TOKEN_NUMBER:
260 		return token->number;
261 
262 	case TOKEN_SPECIAL:
263 		return show_special(token->special);
264 
265 	case TOKEN_CHAR:
266 		return quote_char(token->string->data,
267 			token->string->length - 1, 0, '\'');
268 	case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
269 		return quote_char(token->embedded,
270 			token_type(token) - TOKEN_CHAR, 0, '\'');
271 	case TOKEN_WIDE_CHAR:
272 		return quote_char(token->string->data,
273 			token->string->length - 1, 'L', '\'');
274 	case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
275 		return quote_char(token->embedded,
276 			token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
277 	case TOKEN_STRING:
278 		return quote_char(token->string->data,
279 			token->string->length - 1, 0, '"');
280 	case TOKEN_WIDE_STRING:
281 		return quote_char(token->string->data,
282 			token->string->length - 1, 'L', '"');
283 	default:
284 		sprintf(buffer, "unhandled token type '%d' ", token_type(token));
285 		return buffer;
286 	}
287 }
288 
#define HASHED_INPUT_BITS (6)
#define HASHED_INPUT (1 << HASHED_INPUT_BITS)
#define HASH_PRIME 0x9e370001UL

/* One chain head per bucket; -1 marks an empty bucket. */
static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };

/*
 * Hash the file name @name and return a pointer to its bucket in
 * input_stream_hashes[].  The top HASHED_INPUT_BITS bits of the 32-bit
 * hash select the bucket.
 */
int *hash_stream(const char *name)
{
	const unsigned char *s = (const unsigned char *)name;
	uint32_t h = 0;

	for (; *s; s++) {
		unsigned char c = *s;

		h = (h + (c << 4) + (c >> 4)) * 11;
	}

	h *= HASH_PRIME;
	return &input_stream_hashes[h >> (32 - HASHED_INPUT_BITS)];
}
307 
init_stream(const char * name,int fd,const char ** next_path)308 int init_stream(const char *name, int fd, const char **next_path)
309 {
310 	int stream = input_stream_nr, *hash;
311 	struct stream *current;
312 
313 	if (stream >= input_streams_allocated) {
314 		int newalloc = stream * 4 / 3 + 10;
315 		input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
316 		if (!input_streams)
317 			die("Unable to allocate more streams space");
318 		input_streams_allocated = newalloc;
319 	}
320 	current = input_streams + stream;
321 	memset(current, 0, sizeof(*current));
322 	current->name = name;
323 	current->fd = fd;
324 	current->next_path = next_path;
325 	current->path = NULL;
326 	current->constant = CONSTANT_FILE_MAYBE;
327 	input_stream_nr = stream+1;
328 	hash = hash_stream(name);
329 	current->next_stream = *hash;
330 	*hash = stream;
331 	return stream;
332 }
333 
/* Allocate a token stamped with @stream's current position. */
static struct token * alloc_token(stream_t *stream)
{
	struct token *fresh = __alloc_token(0);

	fresh->pos = stream_pos(stream);
	return fresh;
}
340 
/*
 *  Argh...  That was surprisingly messy - handling '\r' complicates the
 *  things a _lot_.
 *
 *  Slow-path character fetch.  Refills the buffer from stream->fd when it
 *  runs dry, turns "\r\n" and a lone "\r" into a single '\n', removes
 *  backslash-newline pairs, and updates stream->pos, stream->line and
 *  stream->newline.  Returns the next character, or EOF.
 */
static int nextchar_slow(stream_t *stream)
{
	int offset = stream->offset;
	int size = stream->size;
	int c;
	int spliced = 0, had_cr, had_backslash;

restart:
	/* Initial entry, and re-entry after a backslash-newline was removed. */
	had_cr = had_backslash = 0;

repeat:
	if (offset >= size) {
		/* Buffer used up; a negative fd means there is nothing to refill from. */
		if (stream->fd < 0)
			goto got_eof;
		size = read(stream->fd, stream->buffer, BUFSIZE);
		/* A zero or negative result from read() ends the stream. */
		if (size <= 0)
			goto got_eof;
		stream->size = size;
		stream->offset = offset = 0;
	}

	c = stream->buffer[offset++];
	/* The previous byte was '\r': decide whether this one belongs to it. */
	if (had_cr)
		goto check_lf;

	if (c == '\r') {
		had_cr = 1;
		goto repeat;
	}

norm:
	if (!had_backslash) {
		switch (c) {
		case '\t':
			/* Advance the column to the next multiple of tabstop. */
			stream->pos += tabstop - stream->pos % tabstop;
			break;
		case '\n':
			stream->line++;
			stream->pos = 0;
			stream->newline = 1;
			break;
		case '\\':
			/* Hold the backslash until the following byte is known. */
			had_backslash = 1;
			stream->pos++;
			goto repeat;
		default:
			stream->pos++;
		}
	} else {
		if (c == '\n') {
			/* Backslash-newline: drop both and continue on the next line. */
			stream->line++;
			stream->pos = 0;
			spliced = 1;
			goto restart;
		}
		/* Ordinary backslash: un-read the byte after it and return '\\'. */
		offset--;
		c = '\\';
	}
out:
	stream->offset = offset;

	return c;

check_lf:
	/* "\r\n" consumes both bytes; a lone "\r" gives back the byte after it. */
	if (c != '\n')
		offset--;
	c = '\n';
	goto norm;

got_eof:
	/* A held-back backslash is still delivered before EOF is reported. */
	if (had_backslash) {
		c = '\\';
		goto out;
	}
	if (stream->pos)
		warning(stream_pos(stream), "no newline at end of file");
	else if (spliced)
		warning(stream_pos(stream), "backslash-newline at end of file");
	return EOF;
}
425 
426 /*
427  *  We want that as light as possible while covering all normal cases.
428  *  Slow path (including the logics with line-splicing and EOF sanity
429  *  checks) is in nextchar_slow().
430  */
nextchar(stream_t * stream)431 static inline int nextchar(stream_t *stream)
432 {
433 	int offset = stream->offset;
434 
435 	if (offset < stream->size) {
436 		int c = stream->buffer[offset++];
437 		static const char special[256] = {
438 			['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
439 		};
440 		if (!special[c]) {
441 			stream->offset = offset;
442 			stream->pos++;
443 			return c;
444 		}
445 	}
446 	return nextchar_slow(stream);
447 }
448 
449 struct token eof_token_entry;
450 
mark_eof(stream_t * stream)451 static struct token *mark_eof(stream_t *stream)
452 {
453 	struct token *end;
454 
455 	end = alloc_token(stream);
456 	eof_token_entry.pos = end->pos;
457 	token_type(end) = TOKEN_STREAMEND;
458 	end->pos.newline = 1;
459 
460 	eof_token_entry.next = &eof_token_entry;
461 	eof_token_entry.pos.newline = 1;
462 
463 	end->next =  &eof_token_entry;
464 	*stream->tokenlist = end;
465 	stream->tokenlist = NULL;
466 	return end;
467 }
468 
/*
 * Append the token under construction to @stream's token list and make
 * its next pointer the new tail link.
 */
static void add_token(stream_t *stream)
{
	struct token *done = stream->token;
	struct token **tail = stream->tokenlist;

	stream->token = NULL;
	done->next = NULL;
	*tail = done;
	stream->tokenlist = &done->next;
}
478 
/*
 * Abandon the token under construction, folding its newline and whitespace
 * flags back into @stream so the next token inherits them.
 */
static void drop_token(stream_t *stream)
{
	const struct token *dropped = stream->token;

	stream->newline |= dropped->pos.newline;
	stream->whitespace |= dropped->pos.whitespace;
	stream->token = NULL;
}
485 
/* Character class bits stored in cclass[]. */
enum {
	Letter      = 1 << 0,
	Digit       = 1 << 1,
	Hex         = 1 << 2,
	Exp         = 1 << 3,
	Dot         = 1 << 4,
	ValidSecond = 1 << 5,
	Quote       = 1 << 6,
};

/*
 * Class bits per character, indexed by (character value + 1) so that
 * EOF (-1) lands on entry 0, which has no bits set.
 */
static const char cclass[257] = {
	/* digits */
	['0' + 1 ... '9' + 1] = Digit | Hex,

	/* upper-case letters */
	['A' + 1 ... 'D' + 1] = Letter | Hex,
	['E' + 1] = Letter | Hex | Exp,	/* E<exp> */
	['F' + 1] = Letter | Hex,
	['G' + 1 ... 'O' + 1] = Letter,
	['P' + 1] = Letter | Exp,	/* P<exp> */
	['Q' + 1 ... 'Z' + 1] = Letter,

	/* lower-case letters */
	['a' + 1 ... 'd' + 1] = Letter | Hex,
	['e' + 1] = Letter | Hex | Exp,	/* e<exp> */
	['f' + 1] = Letter | Hex,
	['g' + 1 ... 'o' + 1] = Letter,
	['p' + 1] = Letter | Exp,	/* p<exp> */
	['q' + 1 ... 'z' + 1] = Letter,
	['_' + 1] = Letter,

	/* characters that may be the second half of a two-character special */
	['.' + 1] = Dot | ValidSecond,
	['=' + 1] = ValidSecond,
	['+' + 1] = ValidSecond,
	['-' + 1] = ValidSecond,
	['>' + 1] = ValidSecond,
	['<' + 1] = ValidSecond,
	['&' + 1] = ValidSecond,
	['|' + 1] = ValidSecond,
	['#' + 1] = ValidSecond,

	/* literal delimiters */
	['\'' + 1] = Quote,
	['"' + 1] = Quote,
};
523 
524 /*
525  * pp-number:
526  *	digit
527  *	. digit
528  *	pp-number digit
529  *	pp-number identifier-nodigit
530  *	pp-number e sign
531  *	pp-number E sign
532  *	pp-number p sign
533  *	pp-number P sign
534  *	pp-number .
535  */
get_one_number(int c,int next,stream_t * stream)536 static int get_one_number(int c, int next, stream_t *stream)
537 {
538 	struct token *token;
539 	static char buffer[4095];
540 	char *p = buffer, *buffer_end = buffer + sizeof (buffer);
541 
542 	*p++ = c;
543 	for (;;) {
544 		long class =  cclass[next + 1];
545 		if (!(class & (Dot | Digit | Letter)))
546 			break;
547 		if (p != buffer_end)
548 			*p++ = next;
549 		next = nextchar(stream);
550 		if (class & Exp) {
551 			if (next == '-' || next == '+') {
552 				if (p != buffer_end)
553 					*p++ = next;
554 				next = nextchar(stream);
555 			}
556 		}
557 	}
558 
559 	if (p == buffer_end) {
560 		sparse_error(stream_pos(stream), "number token exceeds %td characters",
561 		      buffer_end - buffer);
562 		// Pretend we saw just "1".
563 		buffer[0] = '1';
564 		p = buffer + 1;
565 	}
566 
567 	*p++ = 0;
568 	token = stream->token;
569 	token_type(token) = TOKEN_NUMBER;
570 	token->number = xmemdup(buffer, p - buffer);
571 	add_token(stream);
572 
573 	return next;
574 }
575 
/*
 * Read the body of a character constant or string literal whose first
 * character is @next, up to the closing delimiter.
 *
 * @type selects the flavour; types below TOKEN_STRING are delimited by a
 * single quote, the others by a double quote.  Character constants of one
 * to four bytes are stored inline in the token and encoded in its type;
 * everything else gets an allocated string.  Returns the character after
 * the literal, or EOF when the input ended inside it.
 */
static int eat_string(int next, stream_t *stream, enum token_type type)
{
	static char buffer[MAX_STRING];
	struct token *token = stream->token;
	struct string *string;
	const char delim = type < TOKEN_STRING ? '\'' : '"';
	int len = 0;
	int escape = 0;
	int want_hex = 0;

	while (escape || next != delim) {
		/* Keep counting past the buffer so the overflow can be reported. */
		if (len < MAX_STRING)
			buffer[len] = next;
		len++;

		if (next == '\n') {
			warning(stream_pos(stream),
				"missing terminating %c character", delim);
			/* assume delimiter is lost */
			break;
		}
		if (next == EOF) {
			warning(stream_pos(stream),
				"End of file in middle of string");
			return next;
		}

		if (escape) {
			/* Character right after a backslash. */
			escape = 0;
			want_hex = next == 'x';
		} else {
			if (want_hex && !(cclass[next + 1] & Hex))
				warning(stream_pos(stream),
					"\\x used with no following hex digits");
			want_hex = 0;
			escape = next == '\\';
		}

		next = nextchar(stream);
	}

	if (want_hex)
		warning(stream_pos(stream),
			"\\x used with no following hex digits");
	if (len > MAX_STRING) {
		warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
		len = MAX_STRING;
	}

	if (delim == '\'' && len <= 4) {
		if (len == 0) {
			sparse_error(stream_pos(stream),
				"empty character constant");
			return nextchar(stream);
		}
		/* Short constant: length goes into the type, bytes go inline. */
		token_type(token) = type + len;
		memset(buffer + len, '\0', 4 - len);
		memcpy(token->embedded, buffer, 4);
	} else {
		token_type(token) = type;
		string = __alloc_string(len+1);
		memcpy(string->data, buffer, len);
		string->data[len] = '\0';
		string->length = len+1;
		token->string = string;
	}

	/* Pass it on.. */
	add_token(stream);
	return nextchar(stream);
}
642 
/*
 * Discard the current token and the rest of the line.
 * Returns the first character of the next line, or EOF.
 */
static int drop_stream_eoln(stream_t *stream)
{
	int ch;

	drop_token(stream);
	do {
		ch = nextchar(stream);
		if (ch == EOF)
			return EOF;
	} while (ch != '\n');

	return nextchar(stream);
}
655 
/*
 * Discard the current token and everything up to and including the next
 * star-slash pair.  The stream's newline flag is restored afterwards, so
 * newlines inside the comment do not count.  Returns the character after
 * the comment, or EOF (with a warning) when the comment is unterminated.
 */
static int drop_stream_comment(stream_t *stream)
{
	int saved_newline;
	int prev, cur;

	drop_token(stream);
	saved_newline = stream->newline;

	cur = nextchar(stream);
	do {
		prev = cur;
		if (prev == EOF) {
			warning(stream_pos(stream), "End of file in the middle of a comment");
			return prev;
		}
		cur = nextchar(stream);
	} while (prev != '*' || cur != '/');

	stream->newline = saved_newline;
	return nextchar(stream);
}
677 
/*
 * Spellings of the special token values at or above SPECIAL_BASE;
 * show_special() indexes this table with (value - SPECIAL_BASE).
 */
unsigned char combinations[][4] = COMBINATION_STRINGS;

/* NOTE(review): presumably the number of entries in combinations[] - confirm against token.h. */
#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
681 
/*
 * hash function for two-character punctuators - all give unique values
 *
 * Maps the character pair (c0, c1) to a slot in 0..31.  Each argument is
 * parenthesized so that expression arguments hash correctly; note that
 * both arguments are evaluated twice, so they must be free of side effects.
 */
#define special_hash(c0, c1) \
	((((c0)*8+(c1)*2)+(((c0)*8+(c1)*2)>>5))&31)

/*
 * note that we won't get false positives - special_hash(0,0) is 0 and
 * entry 0 is filled (by +=), so all the missing ones are OK.
 *
 * hash_results[slot] holds the exact character pair that owns the slot;
 * a lookup must compare both characters before trusting the slot.
 */
static unsigned char hash_results[32][2] = {
#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
	RES('+', '='), /* 00 */
	RES('/', '='), /* 01 */
	RES('^', '='), /* 05 */
	RES('&', '&'), /* 07 */
	RES('#', '#'), /* 08 */
	RES('<', '<'), /* 0a */
	RES('<', '='), /* 0c */
	RES('!', '='), /* 0e */
	RES('%', '='), /* 0f */
	RES('-', '-'), /* 10 */
	RES('-', '='), /* 11 */
	RES('-', '>'), /* 13 */
	RES('=', '='), /* 15 */
	RES('&', '='), /* 17 */
	RES('*', '='), /* 18 */
	RES('.', '.'), /* 1a */
	RES('+', '+'), /* 1b */
	RES('|', '='), /* 1c */
	RES('>', '='), /* 1d */
	RES('|', '|'), /* 1e */
	RES('>', '>')  /* 1f */
#undef RES
};
/*
 * Special token value for each two-character punctuator, stored at the
 * slot special_hash() gives for that pair.  get_one_special() reads an
 * entry only after hash_results[] has confirmed both characters.
 */
static int code[32] = {
#define CODE(c0, c1, value) [special_hash(c0, c1)] = value
	CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
	CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
	CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
	CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
	CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
	CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
	CODE('<', '=', SPECIAL_LTE), /* 0c */
	CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
	CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
	CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
	CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
	CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
	CODE('=', '=', SPECIAL_EQUAL), /* 15 */
	CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
	CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
	CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
	CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
	CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
	CODE('>', '=', SPECIAL_GTE), /* 1d */
	CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
	CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
#undef CODE
};
739 
/*
 * Tokenize input that starts with the non-alphanumeric character @c.
 *
 * Numbers starting with a dot, string and character literals and both
 * comment forms are handed to their own routines.  Otherwise @c, possibly
 * merged with the following one or two characters into a multi-character
 * special, is added as a TOKEN_SPECIAL.  Returns the first character that
 * was not consumed.
 */
static int get_one_special(int c, stream_t *stream)
{
	struct token *token;
	int next = nextchar(stream);
	int value = c;

	/*
	 * Check for numbers, strings, character constants, and comments
	 */
	if (c == '.') {
		if (next >= '0' && next <= '9')
			return get_one_number(c, next, stream);
	} else if (c == '"') {
		return eat_string(next, stream, TOKEN_STRING);
	} else if (c == '\'') {
		return eat_string(next, stream, TOKEN_CHAR);
	} else if (c == '/') {
		if (next == '/')
			return drop_stream_eoln(stream);
		if (next == '*')
			return drop_stream_comment(stream);
	}

	/*
	 * Check for combinations
	 */
	if (cclass[next + 1] & ValidSecond) {
		const int slot = special_hash(c, next);

		if (hash_results[slot][0] == c && hash_results[slot][1] == next) {
			value = code[slot];
			next = nextchar(stream);
			/*
			 * The string below holds the third character that
			 * extends the specials starting at SPECIAL_LEFTSHIFT;
			 * the three-character value sits 3 above the
			 * two-character one.
			 */
			if (value >= SPECIAL_LEFTSHIFT &&
			    next == "==."[value - SPECIAL_LEFTSHIFT]) {
				value += 3;
				next = nextchar(stream);
			}
		}
	}

	/* Pass it on.. */
	token = stream->token;
	token_type(token) = TOKEN_SPECIAL;
	token->special = value;
	add_token(stream);
	return next;
}
790 
/* Identifier hash table geometry: 1 << 13 buckets, with the matching index mask. */
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: seed with the first character, fold in each
 * following character as hash * 11 + c, then reduce to a bucket index by
 * adding the bits above IDENT_HASH_BITS back in and masking.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Bucket heads; the identifiers of a bucket are chained through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/*
 * ident_hit and ident_miss are printed by show_identifier_stats();
 * idents counts the identifiers allocated by create_hashed_ident().
 */
static int ident_hit, ident_miss, idents;
801 
show_identifier_stats(void)802 void show_identifier_stats(void)
803 {
804 	int i;
805 	int distribution[100];
806 
807 	fprintf(stderr, "identifiers: %d hits, %d misses\n",
808 		ident_hit, ident_miss);
809 
810 	for (i = 0; i < 100; i++)
811 		distribution[i] = 0;
812 
813 	for (i = 0; i < IDENT_HASH_SIZE; i++) {
814 		struct ident * ident = hash_table[i];
815 		int count = 0;
816 
817 		while (ident) {
818 			count++;
819 			ident = ident->next;
820 		}
821 		if (count > 99)
822 			count = 99;
823 		distribution[count]++;
824 	}
825 
826 	for (i = 0; i < 100; i++) {
827 		if (distribution[i])
828 			fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
829 	}
830 }
831 
alloc_ident(const char * name,int len)832 struct ident *alloc_ident(const char *name, int len)
833 {
834 	struct ident *ident = __alloc_ident(len);
835 	ident->symbols = NULL;
836 	ident->len = len;
837 	ident->tainted = 0;
838 	memcpy(ident->name, name, len);
839 	return ident;
840 }
841 
insert_hash(struct ident * ident,unsigned long hash)842 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
843 {
844 	ident->next = hash_table[hash];
845 	hash_table[hash] = ident;
846 	ident_miss++;
847 	return ident;
848 }
849 
create_hashed_ident(const char * name,int len,unsigned long hash)850 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
851 {
852 	struct ident *ident;
853 	struct ident **p;
854 
855 	p = &hash_table[hash];
856 	while ((ident = *p) != NULL) {
857 		if (ident->len == (unsigned char) len) {
858 			if (strncmp(name, ident->name, len) != 0)
859 				goto next;
860 
861 			ident_hit++;
862 			return ident;
863 		}
864 next:
865 		//misses++;
866 		p = &ident->next;
867 	}
868 	ident = alloc_ident(name, len);
869 	*p = ident;
870 	ident->next = NULL;
871 	ident_miss++;
872 	idents++;
873 	return ident;
874 }
875 
/*
 * Compute the hash table bucket for the @len-byte name at @name, using the
 * same ident_hash_init/add/end sequence get_one_identifier() applies while
 * scanning.
 *
 * A @len of zero or less hashes as the empty seed instead of reading any
 * bytes of @name; the former pre-decrement loop never terminated properly
 * in that case and walked past the end of the name.
 */
static unsigned long hash_name(const char *name, int len)
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long hash;
	int i;

	if (len <= 0)
		return ident_hash_end(ident_hash_init(0));

	hash = ident_hash_init(p[0]);
	for (i = 1; i < len; i++)
		hash = ident_hash_add(hash, p[i]);
	return ident_hash_end(hash);
}
888 
hash_ident(struct ident * ident)889 struct ident *hash_ident(struct ident *ident)
890 {
891 	return insert_hash(ident, hash_name(ident->name, ident->len));
892 }
893 
/* Find or create the identifier for the NUL-terminated string @name. */
struct ident *built_in_ident(const char *name)
{
	const int len = strlen(name);
	const unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
899 
built_in_token(int stream,struct ident * ident)900 struct token *built_in_token(int stream, struct ident *ident)
901 {
902 	struct token *token;
903 
904 	token = __alloc_token(0);
905 	token->pos.stream = stream;
906 	token_type(token) = TOKEN_IDENT;
907 	token->ident = ident;
908 	return token;
909 }
910 
/*
 * Tokenize an identifier whose first character is @c, hashing it as it is
 * read.  A lone 'L' immediately followed by a quote is handed to
 * eat_string() as a wide character or string literal instead.
 * Collection stops once the 256-byte scratch buffer is full.
 * Returns the first character that was not consumed.
 */
static int get_one_identifier(int c, stream_t *stream)
{
	struct token *token;
	struct ident *ident;
	unsigned long hash = ident_hash_init(c);
	char buf[256];
	int len = 1;
	int next;

	buf[0] = c;
	next = nextchar(stream);
	while ((cclass[next + 1] & (Letter | Digit)) && len < sizeof(buf)) {
		hash = ident_hash_add(hash, next);
		buf[len++] = next;
		next = nextchar(stream);
	}

	if ((cclass[next + 1] & Quote) && len == 1 && buf[0] == 'L') {
		const enum token_type wide =
			(next == '\'') ? TOKEN_WIDE_CHAR : TOKEN_WIDE_STRING;

		return eat_string(nextchar(stream), stream, wide);
	}

	hash = ident_hash_end(hash);
	ident = create_hashed_ident(buf, len, hash);

	/* Pass it on.. */
	token = stream->token;
	token_type(token) = TOKEN_IDENT;
	token->ident = ident;
	add_token(stream);
	return next;
}
952 
/*
 * Dispatch on the class of the first character @c: digits start a number,
 * letters an identifier, everything else goes to get_one_special().
 * Returns the first character that was not consumed.
 */
static int get_one_token(int c, stream_t *stream)
{
	const int kind = cclass[c + 1];

	if (kind & Digit)
		return get_one_number(c, nextchar(stream), stream);
	if (kind & Letter)
		return get_one_identifier(c, stream);
	return get_one_special(c, stream);
}
962 
/*
 * Initialize @stream for stream number @idx, reading from @fd through
 * @buf, which already holds @buf_size valid bytes.  Returns a freshly
 * allocated TOKEN_STREAMBEGIN token; later tokens are linked after it.
 */
static struct token *setup_stream(stream_t *stream, int idx, int fd,
	unsigned char *buf, unsigned int buf_size)
{
	struct token *head;

	/* Input source. */
	stream->fd = fd;
	stream->buffer = buf;
	stream->size = buf_size;
	stream->offset = 0;

	/* Position bookkeeping; must be in place before alloc_token() reads it. */
	stream->nr = idx;
	stream->line = 1;
	stream->pos = 0;
	stream->newline = 1;
	stream->whitespace = 0;

	stream->token = NULL;

	head = alloc_token(stream);
	token_type(head) = TOKEN_STREAMBEGIN;
	stream->tokenlist = &head->next;
	return head;
}
985 
/*
 * Main loop: turn everything left in @stream into tokens.  Whitespace only
 * sets the whitespace flag; any other character starts a new token.
 * Returns the stream-end token produced by mark_eof().
 */
static struct token *tokenize_stream(stream_t *stream)
{
	int c;

	for (c = nextchar(stream); c != EOF; ) {
		if (isspace(c)) {
			stream->whitespace = 1;
			c = nextchar(stream);
			continue;
		}

		/* The token captures the flags before they are cleared. */
		stream->token = alloc_token(stream);
		stream->newline = 0;
		stream->whitespace = 0;
		c = get_one_token(c, stream);
	}
	return mark_eof(stream);
}
1003 
tokenize_buffer(void * buffer,unsigned long size,struct token ** endtoken)1004 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1005 {
1006 	stream_t stream;
1007 	struct token *begin;
1008 
1009 	begin = setup_stream(&stream, 0, -1, buffer, size);
1010 	*endtoken = tokenize_stream(&stream);
1011 	return begin;
1012 }
1013 
tokenize(const char * name,int fd,struct token * endtoken,const char ** next_path)1014 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
1015 {
1016 	struct token *begin, *end;
1017 	stream_t stream;
1018 	unsigned char buffer[BUFSIZE];
1019 	int idx;
1020 
1021 	idx = init_stream(name, fd, next_path);
1022 	if (idx < 0) {
1023 		// info(endtoken->pos, "File %s is const", name);
1024 		return endtoken;
1025 	}
1026 
1027 	begin = setup_stream(&stream, idx, fd, buffer, 0);
1028 	end = tokenize_stream(&stream);
1029 	if (endtoken)
1030 		end->next = endtoken;
1031 	return begin;
1032 }
1033