1/*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the preprocessor.
4 *
5 * Copyright (C) 2003 Transmeta Corp.
6 *               2003 Linus Torvalds
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
25 */
26#include <stdio.h>
27#include <stdlib.h>
28#include <stdarg.h>
29#include <stddef.h>
30#include <string.h>
31#include <ctype.h>
32#include <unistd.h>
33#include <stdint.h>
34
35#include "lib.h"
36#include "allocate.h"
37#include "token.h"
38#include "symbol.h"
39
40#define EOF (-1)
41
42int input_stream_nr = 0;
43struct stream *input_streams;
44static int input_streams_allocated;
45unsigned int tabstop = 8;
46int no_lineno = 0;
47
48#define BUFSIZE (8192)
49
50typedef struct {
51	int fd, offset, size;
52	int pos, line, nr;
53	int newline, whitespace;
54	struct token **tokenlist;
55	struct token *token;
56	unsigned char *buffer;
57} stream_t;
58
59const char *stream_name(int stream)
60{
61	if (stream < 0 || stream > input_stream_nr)
62		return "<bad stream>";
63	return input_streams[stream].name;
64}
65
66static struct position stream_pos(stream_t *stream)
67{
68	struct position pos;
69	pos.type = 0;
70	pos.stream = stream->nr;
71	pos.newline = stream->newline;
72	pos.whitespace = stream->whitespace;
73	pos.pos = stream->pos;
74
75	pos.line = stream->line;
76	if (no_lineno)
77		pos.line = 123456;
78
79	pos.noexpand = 0;
80	return pos;
81}
82
83const char *show_special(int val)
84{
85	static char buffer[4];
86
87	buffer[0] = val;
88	buffer[1] = 0;
89	if (val >= SPECIAL_BASE)
90		strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
91	return buffer;
92}
93
94const char *show_ident(const struct ident *ident)
95{
96	static char buff[4][256];
97	static int n;
98	char *buffer;
99
100	if (!ident)
101		return "<noident>";
102	buffer = buff[3 & ++n];
103	sprintf(buffer, "%.*s", ident->len, ident->name);
104	return buffer;
105}
106
107static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
108{
109	if (isprint(c)) {
110		if (c == escape || c == '\\')
111			*ptr++ = '\\';
112		*ptr++ = c;
113		return ptr;
114	}
115	*ptr++ = '\\';
116	switch (c) {
117	case '\n':
118		*ptr++ = 'n';
119		return ptr;
120	case '\t':
121		*ptr++ = 't';
122		return ptr;
123	}
124	if (!isdigit(next))
125		return ptr + sprintf(ptr, "%o", c);
126
127	return ptr + sprintf(ptr, "%03o", c);
128}
129
130const char *show_string(const struct string *string)
131{
132	static char buffer[4 * MAX_STRING + 3];
133	char *ptr;
134	int i;
135
136	if (!string || !string->length)
137		return "<bad_string>";
138	ptr = buffer;
139	*ptr++ = '"';
140	for (i = 0; i < string->length-1; i++) {
141		const char *p = string->data + i;
142		ptr = charstr(ptr, p[0], '"', p[1]);
143	}
144	*ptr++ = '"';
145	*ptr = '\0';
146	return buffer;
147}
148
149static const char *show_char(const char *s, size_t len, char prefix, char delim)
150{
151	static char buffer[MAX_STRING + 4];
152	char *p = buffer;
153	if (prefix)
154		*p++ = prefix;
155	*p++ = delim;
156	memcpy(p, s, len);
157	p += len;
158	*p++ = delim;
159	*p++ = '\0';
160	return buffer;
161}
162
163static const char *quote_char(const char *s, size_t len, char prefix, char delim)
164{
165	static char buffer[2*MAX_STRING + 6];
166	size_t i;
167	char *p = buffer;
168	if (prefix)
169		*p++ = prefix;
170	if (delim == '"')
171		*p++ = '\\';
172	*p++ = delim;
173	for (i = 0; i < len; i++) {
174		if (s[i] == '"' || s[i] == '\\')
175			*p++ = '\\';
176		*p++ = s[i];
177	}
178	if (delim == '"')
179		*p++ = '\\';
180	*p++ = delim;
181	*p++ = '\0';
182	return buffer;
183}
184
185const char *show_token(const struct token *token)
186{
187	static char buffer[256];
188
189	if (!token)
190		return "<no token>";
191	switch (token_type(token)) {
192	case TOKEN_ERROR:
193		return "syntax error";
194
195	case TOKEN_EOF:
196		return "end-of-input";
197
198	case TOKEN_IDENT:
199		return show_ident(token->ident);
200
201	case TOKEN_NUMBER:
202		return token->number;
203
204	case TOKEN_SPECIAL:
205		return show_special(token->special);
206
207	case TOKEN_CHAR:
208		return show_char(token->string->data,
209			token->string->length - 1, 0, '\'');
210	case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
211		return show_char(token->embedded,
212			token_type(token) - TOKEN_CHAR, 0, '\'');
213	case TOKEN_WIDE_CHAR:
214		return show_char(token->string->data,
215			token->string->length - 1, 'L', '\'');
216	case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
217		return show_char(token->embedded,
218			token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
219	case TOKEN_STRING:
220		return show_char(token->string->data,
221			token->string->length - 1, 0, '"');
222	case TOKEN_WIDE_STRING:
223		return show_char(token->string->data,
224			token->string->length - 1, 'L', '"');
225
226	case TOKEN_STREAMBEGIN:
227		sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
228		return buffer;
229
230	case TOKEN_STREAMEND:
231		sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
232		return buffer;
233
234	case TOKEN_UNTAINT:
235		sprintf(buffer, "<untaint>");
236		return buffer;
237
238	case TOKEN_ARG_COUNT:
239		sprintf(buffer, "<argcnt>");
240		return buffer;
241
242	default:
243		sprintf(buffer, "unhandled token type '%d' ", token_type(token));
244		return buffer;
245	}
246}
247
248const char *quote_token(const struct token *token)
249{
250	static char buffer[256];
251
252	switch (token_type(token)) {
253	case TOKEN_ERROR:
254		return "syntax error";
255
256	case TOKEN_IDENT:
257		return show_ident(token->ident);
258
259	case TOKEN_NUMBER:
260		return token->number;
261
262	case TOKEN_SPECIAL:
263		return show_special(token->special);
264
265	case TOKEN_CHAR:
266		return quote_char(token->string->data,
267			token->string->length - 1, 0, '\'');
268	case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
269		return quote_char(token->embedded,
270			token_type(token) - TOKEN_CHAR, 0, '\'');
271	case TOKEN_WIDE_CHAR:
272		return quote_char(token->string->data,
273			token->string->length - 1, 'L', '\'');
274	case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
275		return quote_char(token->embedded,
276			token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
277	case TOKEN_STRING:
278		return quote_char(token->string->data,
279			token->string->length - 1, 0, '"');
280	case TOKEN_WIDE_STRING:
281		return quote_char(token->string->data,
282			token->string->length - 1, 'L', '"');
283	default:
284		sprintf(buffer, "unhandled token type '%d' ", token_type(token));
285		return buffer;
286	}
287}
288
289#define HASHED_INPUT_BITS (6)
290#define HASHED_INPUT (1 << HASHED_INPUT_BITS)
291#define HASH_PRIME 0x9e370001UL
292
293static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };
294
295int *hash_stream(const char *name)
296{
297	uint32_t hash = 0;
298	unsigned char c;
299
300	while ((c = *name++) != 0)
301		hash = (hash + (c << 4) + (c >> 4)) * 11;
302
303	hash *= HASH_PRIME;
304	hash >>= 32 - HASHED_INPUT_BITS;
305	return input_stream_hashes + hash;
306}
307
308int init_stream(const char *name, int fd, const char **next_path)
309{
310	int stream = input_stream_nr, *hash;
311	struct stream *current;
312
313	if (stream >= input_streams_allocated) {
314		int newalloc = stream * 4 / 3 + 10;
315		input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
316		if (!input_streams)
317			die("Unable to allocate more streams space");
318		input_streams_allocated = newalloc;
319	}
320	current = input_streams + stream;
321	memset(current, 0, sizeof(*current));
322	current->name = name;
323	current->fd = fd;
324	current->next_path = next_path;
325	current->path = NULL;
326	current->constant = CONSTANT_FILE_MAYBE;
327	input_stream_nr = stream+1;
328	hash = hash_stream(name);
329	current->next_stream = *hash;
330	*hash = stream;
331	return stream;
332}
333
334static struct token * alloc_token(stream_t *stream)
335{
336	struct token *token = __alloc_token(0);
337	token->pos = stream_pos(stream);
338	return token;
339}
340
341/*
342 *  Argh...  That was surprisingly messy - handling '\r' complicates the
343 *  things a _lot_.
344 */
345static int nextchar_slow(stream_t *stream)
346{
347	int offset = stream->offset;
348	int size = stream->size;
349	int c;
350	int spliced = 0, had_cr, had_backslash;
351
352restart:
353	had_cr = had_backslash = 0;
354
355repeat:
356	if (offset >= size) {
357		if (stream->fd < 0)
358			goto got_eof;
359		size = read(stream->fd, stream->buffer, BUFSIZE);
360		if (size <= 0)
361			goto got_eof;
362		stream->size = size;
363		stream->offset = offset = 0;
364	}
365
366	c = stream->buffer[offset++];
367	if (had_cr)
368		goto check_lf;
369
370	if (c == '\r') {
371		had_cr = 1;
372		goto repeat;
373	}
374
375norm:
376	if (!had_backslash) {
377		switch (c) {
378		case '\t':
379			stream->pos += tabstop - stream->pos % tabstop;
380			break;
381		case '\n':
382			stream->line++;
383			stream->pos = 0;
384			stream->newline = 1;
385			break;
386		case '\\':
387			had_backslash = 1;
388			stream->pos++;
389			goto repeat;
390		default:
391			stream->pos++;
392		}
393	} else {
394		if (c == '\n') {
395			stream->line++;
396			stream->pos = 0;
397			spliced = 1;
398			goto restart;
399		}
400		offset--;
401		c = '\\';
402	}
403out:
404	stream->offset = offset;
405
406	return c;
407
408check_lf:
409	if (c != '\n')
410		offset--;
411	c = '\n';
412	goto norm;
413
414got_eof:
415	if (had_backslash) {
416		c = '\\';
417		goto out;
418	}
419	if (stream->pos)
420		warning(stream_pos(stream), "no newline at end of file");
421	else if (spliced)
422		warning(stream_pos(stream), "backslash-newline at end of file");
423	return EOF;
424}
425
426/*
427 *  We want that as light as possible while covering all normal cases.
428 *  Slow path (including the logics with line-splicing and EOF sanity
429 *  checks) is in nextchar_slow().
430 */
431static inline int nextchar(stream_t *stream)
432{
433	int offset = stream->offset;
434
435	if (offset < stream->size) {
436		int c = stream->buffer[offset++];
437		static const char special[256] = {
438			['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
439		};
440		if (!special[c]) {
441			stream->offset = offset;
442			stream->pos++;
443			return c;
444		}
445	}
446	return nextchar_slow(stream);
447}
448
449struct token eof_token_entry;
450
451static struct token *mark_eof(stream_t *stream)
452{
453	struct token *end;
454
455	end = alloc_token(stream);
456	eof_token_entry.pos = end->pos;
457	token_type(end) = TOKEN_STREAMEND;
458	end->pos.newline = 1;
459
460	eof_token_entry.next = &eof_token_entry;
461	eof_token_entry.pos.newline = 1;
462
463	end->next =  &eof_token_entry;
464	*stream->tokenlist = end;
465	stream->tokenlist = NULL;
466	return end;
467}
468
469static void add_token(stream_t *stream)
470{
471	struct token *token = stream->token;
472
473	stream->token = NULL;
474	token->next = NULL;
475	*stream->tokenlist = token;
476	stream->tokenlist = &token->next;
477}
478
479static void drop_token(stream_t *stream)
480{
481	stream->newline |= stream->token->pos.newline;
482	stream->whitespace |= stream->token->pos.whitespace;
483	stream->token = NULL;
484}
485
486enum {
487	Letter = 1,
488	Digit = 2,
489	Hex = 4,
490	Exp = 8,
491	Dot = 16,
492	ValidSecond = 32,
493	Quote = 64,
494};
495
496static const char cclass[257] = {
497	['0' + 1 ... '9' + 1] = Digit | Hex,
498	['A' + 1 ... 'D' + 1] = Letter | Hex,
499	['E' + 1] = Letter | Hex | Exp,	/* E<exp> */
500	['F' + 1] = Letter | Hex,
501	['G' + 1 ... 'O' + 1] = Letter,
502	['P' + 1] = Letter | Exp,	/* P<exp> */
503	['Q' + 1 ... 'Z' + 1] = Letter,
504	['a' + 1 ... 'd' + 1] = Letter | Hex,
505	['e' + 1] = Letter | Hex | Exp,	/* e<exp> */
506	['f' + 1] = Letter | Hex,
507	['g' + 1 ... 'o' + 1] = Letter,
508	['p' + 1] = Letter | Exp,	/* p<exp> */
509	['q' + 1 ... 'z' + 1] = Letter,
510	['_' + 1] = Letter,
511	['.' + 1] = Dot | ValidSecond,
512	['=' + 1] = ValidSecond,
513	['+' + 1] = ValidSecond,
514	['-' + 1] = ValidSecond,
515	['>' + 1] = ValidSecond,
516	['<' + 1] = ValidSecond,
517	['&' + 1] = ValidSecond,
518	['|' + 1] = ValidSecond,
519	['#' + 1] = ValidSecond,
520	['\'' + 1] = Quote,
521	['"' + 1] = Quote,
522};
523
524/*
525 * pp-number:
526 *	digit
527 *	. digit
528 *	pp-number digit
529 *	pp-number identifier-nodigit
530 *	pp-number e sign
531 *	pp-number E sign
532 *	pp-number p sign
533 *	pp-number P sign
534 *	pp-number .
535 */
536static int get_one_number(int c, int next, stream_t *stream)
537{
538	struct token *token;
539	static char buffer[4095];
540	char *p = buffer, *buffer_end = buffer + sizeof (buffer);
541
542	*p++ = c;
543	for (;;) {
544		long class =  cclass[next + 1];
545		if (!(class & (Dot | Digit | Letter)))
546			break;
547		if (p != buffer_end)
548			*p++ = next;
549		next = nextchar(stream);
550		if (class & Exp) {
551			if (next == '-' || next == '+') {
552				if (p != buffer_end)
553					*p++ = next;
554				next = nextchar(stream);
555			}
556		}
557	}
558
559	if (p == buffer_end) {
560		sparse_error(stream_pos(stream), "number token exceeds %td characters",
561		      buffer_end - buffer);
562		// Pretend we saw just "1".
563		buffer[0] = '1';
564		p = buffer + 1;
565	}
566
567	*p++ = 0;
568	token = stream->token;
569	token_type(token) = TOKEN_NUMBER;
570	token->number = xmemdup(buffer, p - buffer);
571	add_token(stream);
572
573	return next;
574}
575
576static int eat_string(int next, stream_t *stream, enum token_type type)
577{
578	static char buffer[MAX_STRING];
579	struct string *string;
580	struct token *token = stream->token;
581	int len = 0;
582	int escape;
583	int want_hex = 0;
584	char delim = type < TOKEN_STRING ? '\'' : '"';
585
586	for (escape = 0; escape || next != delim; next = nextchar(stream)) {
587		if (len < MAX_STRING)
588			buffer[len] = next;
589		len++;
590		if (next == '\n') {
591			warning(stream_pos(stream),
592				"missing terminating %c character", delim);
593			/* assume delimiter is lost */
594			break;
595		}
596		if (next == EOF) {
597			warning(stream_pos(stream),
598				"End of file in middle of string");
599			return next;
600		}
601		if (!escape) {
602			if (want_hex && !(cclass[next + 1] & Hex))
603				warning(stream_pos(stream),
604					"\\x used with no following hex digits");
605			want_hex = 0;
606			escape = next == '\\';
607		} else {
608			escape = 0;
609			want_hex = next == 'x';
610		}
611	}
612	if (want_hex)
613		warning(stream_pos(stream),
614			"\\x used with no following hex digits");
615	if (len > MAX_STRING) {
616		warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
617		len = MAX_STRING;
618	}
619	if (delim == '\'' && len <= 4) {
620		if (len == 0) {
621			sparse_error(stream_pos(stream),
622				"empty character constant");
623			return nextchar(stream);
624		}
625		token_type(token) = type + len;
626		memset(buffer + len, '\0', 4 - len);
627		memcpy(token->embedded, buffer, 4);
628	} else {
629		token_type(token) = type;
630		string = __alloc_string(len+1);
631		memcpy(string->data, buffer, len);
632		string->data[len] = '\0';
633		string->length = len+1;
634		token->string = string;
635	}
636
637	/* Pass it on.. */
638	token = stream->token;
639	add_token(stream);
640	return nextchar(stream);
641}
642
643static int drop_stream_eoln(stream_t *stream)
644{
645	drop_token(stream);
646	for (;;) {
647		switch (nextchar(stream)) {
648		case EOF:
649			return EOF;
650		case '\n':
651			return nextchar(stream);
652		}
653	}
654}
655
656static int drop_stream_comment(stream_t *stream)
657{
658	int newline;
659	int next;
660	drop_token(stream);
661	newline = stream->newline;
662
663	next = nextchar(stream);
664	for (;;) {
665		int curr = next;
666		if (curr == EOF) {
667			warning(stream_pos(stream), "End of file in the middle of a comment");
668			return curr;
669		}
670		next = nextchar(stream);
671		if (curr == '*' && next == '/')
672			break;
673	}
674	stream->newline = newline;
675	return nextchar(stream);
676}
677
678unsigned char combinations[][4] = COMBINATION_STRINGS;
679
680#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
681
682/* hash function for two-character punctuators - all give unique values */
683#define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
684
685/*
686 * note that we won't get false positives - special_hash(0,0) is 0 and
687 * entry 0 is filled (by +=), so all the missing ones are OK.
688 */
689static unsigned char hash_results[32][2] = {
690#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
691	RES('+', '='), /* 00 */
692	RES('/', '='), /* 01 */
693	RES('^', '='), /* 05 */
694	RES('&', '&'), /* 07 */
695	RES('#', '#'), /* 08 */
696	RES('<', '<'), /* 0a */
697	RES('<', '='), /* 0c */
698	RES('!', '='), /* 0e */
699	RES('%', '='), /* 0f */
700	RES('-', '-'), /* 10 */
701	RES('-', '='), /* 11 */
702	RES('-', '>'), /* 13 */
703	RES('=', '='), /* 15 */
704	RES('&', '='), /* 17 */
705	RES('*', '='), /* 18 */
706	RES('.', '.'), /* 1a */
707	RES('+', '+'), /* 1b */
708	RES('|', '='), /* 1c */
709	RES('>', '='), /* 1d */
710	RES('|', '|'), /* 1e */
711	RES('>', '>')  /* 1f */
712#undef RES
713};
714static int code[32] = {
715#define CODE(c0, c1, value) [special_hash(c0, c1)] = value
716	CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
717	CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
718	CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
719	CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
720	CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
721	CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
722	CODE('<', '=', SPECIAL_LTE), /* 0c */
723	CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
724	CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
725	CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
726	CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
727	CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
728	CODE('=', '=', SPECIAL_EQUAL), /* 15 */
729	CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
730	CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
731	CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
732	CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
733	CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
734	CODE('>', '=', SPECIAL_GTE), /* 1d */
735	CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
736	CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
737#undef CODE
738};
739
740static int get_one_special(int c, stream_t *stream)
741{
742	struct token *token;
743	int next, value, i;
744
745	next = nextchar(stream);
746
747	/*
748	 * Check for numbers, strings, character constants, and comments
749	 */
750	switch (c) {
751	case '.':
752		if (next >= '0' && next <= '9')
753			return get_one_number(c, next, stream);
754		break;
755	case '"':
756		return eat_string(next, stream, TOKEN_STRING);
757	case '\'':
758		return eat_string(next, stream, TOKEN_CHAR);
759	case '/':
760		if (next == '/')
761			return drop_stream_eoln(stream);
762		if (next == '*')
763			return drop_stream_comment(stream);
764	}
765
766	/*
767	 * Check for combinations
768	 */
769	value = c;
770	if (cclass[next + 1] & ValidSecond) {
771		i = special_hash(c, next);
772		if (hash_results[i][0] == c && hash_results[i][1] == next) {
773			value = code[i];
774			next = nextchar(stream);
775			if (value >= SPECIAL_LEFTSHIFT &&
776			    next == "==."[value - SPECIAL_LEFTSHIFT]) {
777				value += 3;
778				next = nextchar(stream);
779			}
780		}
781	}
782
783	/* Pass it on.. */
784	token = stream->token;
785	token_type(token) = TOKEN_SPECIAL;
786	token->special = value;
787	add_token(stream);
788	return next;
789}
790
791#define IDENT_HASH_BITS (13)
792#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
793#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
794
795#define ident_hash_init(c)		(c)
796#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
797#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
798
799static struct ident *hash_table[IDENT_HASH_SIZE];
800static int ident_hit, ident_miss, idents;
801
802void show_identifier_stats(void)
803{
804	int i;
805	int distribution[100];
806
807	fprintf(stderr, "identifiers: %d hits, %d misses\n",
808		ident_hit, ident_miss);
809
810	for (i = 0; i < 100; i++)
811		distribution[i] = 0;
812
813	for (i = 0; i < IDENT_HASH_SIZE; i++) {
814		struct ident * ident = hash_table[i];
815		int count = 0;
816
817		while (ident) {
818			count++;
819			ident = ident->next;
820		}
821		if (count > 99)
822			count = 99;
823		distribution[count]++;
824	}
825
826	for (i = 0; i < 100; i++) {
827		if (distribution[i])
828			fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
829	}
830}
831
832struct ident *alloc_ident(const char *name, int len)
833{
834	struct ident *ident = __alloc_ident(len);
835	ident->symbols = NULL;
836	ident->len = len;
837	ident->tainted = 0;
838	memcpy(ident->name, name, len);
839	return ident;
840}
841
842static struct ident * insert_hash(struct ident *ident, unsigned long hash)
843{
844	ident->next = hash_table[hash];
845	hash_table[hash] = ident;
846	ident_miss++;
847	return ident;
848}
849
850static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
851{
852	struct ident *ident;
853	struct ident **p;
854
855	p = &hash_table[hash];
856	while ((ident = *p) != NULL) {
857		if (ident->len == (unsigned char) len) {
858			if (strncmp(name, ident->name, len) != 0)
859				goto next;
860
861			ident_hit++;
862			return ident;
863		}
864next:
865		//misses++;
866		p = &ident->next;
867	}
868	ident = alloc_ident(name, len);
869	*p = ident;
870	ident->next = NULL;
871	ident_miss++;
872	idents++;
873	return ident;
874}
875
876static unsigned long hash_name(const char *name, int len)
877{
878	unsigned long hash;
879	const unsigned char *p = (const unsigned char *)name;
880
881	hash = ident_hash_init(*p++);
882	while (--len) {
883		unsigned int i = *p++;
884		hash = ident_hash_add(hash, i);
885	}
886	return ident_hash_end(hash);
887}
888
889struct ident *hash_ident(struct ident *ident)
890{
891	return insert_hash(ident, hash_name(ident->name, ident->len));
892}
893
894struct ident *built_in_ident(const char *name)
895{
896	int len = strlen(name);
897	return create_hashed_ident(name, len, hash_name(name, len));
898}
899
900struct token *built_in_token(int stream, struct ident *ident)
901{
902	struct token *token;
903
904	token = __alloc_token(0);
905	token->pos.stream = stream;
906	token_type(token) = TOKEN_IDENT;
907	token->ident = ident;
908	return token;
909}
910
911static int get_one_identifier(int c, stream_t *stream)
912{
913	struct token *token;
914	struct ident *ident;
915	unsigned long hash;
916	char buf[256];
917	int len = 1;
918	int next;
919
920	hash = ident_hash_init(c);
921	buf[0] = c;
922	for (;;) {
923		next = nextchar(stream);
924		if (!(cclass[next + 1] & (Letter | Digit)))
925			break;
926		if (len >= sizeof(buf))
927			break;
928		hash = ident_hash_add(hash, next);
929		buf[len] = next;
930		len++;
931	};
932	if (cclass[next + 1] & Quote) {
933		if (len == 1 && buf[0] == 'L') {
934			if (next == '\'')
935				return eat_string(nextchar(stream), stream,
936							TOKEN_WIDE_CHAR);
937			else
938				return eat_string(nextchar(stream), stream,
939							TOKEN_WIDE_STRING);
940		}
941	}
942	hash = ident_hash_end(hash);
943	ident = create_hashed_ident(buf, len, hash);
944
945	/* Pass it on.. */
946	token = stream->token;
947	token_type(token) = TOKEN_IDENT;
948	token->ident = ident;
949	add_token(stream);
950	return next;
951}
952
953static int get_one_token(int c, stream_t *stream)
954{
955	long class = cclass[c + 1];
956	if (class & Digit)
957		return get_one_number(c, nextchar(stream), stream);
958	if (class & Letter)
959		return get_one_identifier(c, stream);
960	return get_one_special(c, stream);
961}
962
963static struct token *setup_stream(stream_t *stream, int idx, int fd,
964	unsigned char *buf, unsigned int buf_size)
965{
966	struct token *begin;
967
968	stream->nr = idx;
969	stream->line = 1;
970	stream->newline = 1;
971	stream->whitespace = 0;
972	stream->pos = 0;
973
974	stream->token = NULL;
975	stream->fd = fd;
976	stream->offset = 0;
977	stream->size = buf_size;
978	stream->buffer = buf;
979
980	begin = alloc_token(stream);
981	token_type(begin) = TOKEN_STREAMBEGIN;
982	stream->tokenlist = &begin->next;
983	return begin;
984}
985
986static struct token *tokenize_stream(stream_t *stream)
987{
988	int c = nextchar(stream);
989	while (c != EOF) {
990		if (!isspace(c)) {
991			struct token *token = alloc_token(stream);
992			stream->token = token;
993			stream->newline = 0;
994			stream->whitespace = 0;
995			c = get_one_token(c, stream);
996			continue;
997		}
998		stream->whitespace = 1;
999		c = nextchar(stream);
1000	}
1001	return mark_eof(stream);
1002}
1003
1004struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1005{
1006	stream_t stream;
1007	struct token *begin;
1008
1009	begin = setup_stream(&stream, 0, -1, buffer, size);
1010	*endtoken = tokenize_stream(&stream);
1011	return begin;
1012}
1013
1014struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
1015{
1016	struct token *begin, *end;
1017	stream_t stream;
1018	unsigned char buffer[BUFSIZE];
1019	int idx;
1020
1021	idx = init_stream(name, fd, next_path);
1022	if (idx < 0) {
1023		// info(endtoken->pos, "File %s is const", name);
1024		return endtoken;
1025	}
1026
1027	begin = setup_stream(&stream, idx, fd, buffer, 0);
1028	end = tokenize_stream(&stream);
1029	if (endtoken)
1030		end->next = endtoken;
1031	return begin;
1032}
1033