1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019, Joyent, Inc.
14  * Copyright 2021 Jason King
15  */
16 
17 #include <errno.h>
18 #include <libcustr.h>
19 #include <limits.h>
20 #include <string.h>
21 #include <sys/ctype.h>	/* We want the C locale ISXXX() versions */
22 #include <sys/debug.h>
23 #include <stdio.h>
24 #include <sys/sysmacros.h>
25 
26 #include "strview.h"
27 #include "demangle_int.h"
28 
29 /*
30  * Unfortunately, there is currently no official specification for the rust
31  * name mangling.  This is an attempt to document the understanding of the
32  * mangling used here.  It is based off examination of
33  *     https://docs.rs/rustc-demangle/0.1.13/rustc_demangle/
34  *
35  * A mangled rust name is:
36  *     <prefix> <name>
37  *
38  * <prefix>	::=	_Z
39  *			__Z
40  *
41  * <name>	::= N <name-segment>+ [<hash>] E
42  *
43  * <name-segment> ::= <len> <name-chars>{len}
44  *
 * <len>	::= [1-9][0-9]*
46  *
47  * <name-chars>	::=	<[A-Za-z]> <[A-Za-z0-9]>*
48  *			<separator>
49  *			<special>
50  *
51  * <separator>	::=	'..'	# '::'
52  *
 * <special>	::=	$SP$	# '@'
54  *			$BP$	# '*'
55  *			$RF$	# '&'
56  *			$LT$	# '<'
57  *			$GT$	# '>'
58  *			$LP$	# '('
59  *			$RP$	# ')'
60  *			$C$	# ','
61  *			$u7e$	# '~'
62  *			$u20$	# ' '
63  *			$u27$	# '\''
64  *			$u3d$	# '='
65  *			$u5b$	# '['
66  *			$u5d$	# ']'
67  *			$u7b$	# '{'
68  *			$u7d$	# '}'
69  *			$u3b$	# ';'
70  *			$u2b$	# '+'
71  *			$u22$	# '"'
72  *
 * <hash>	::= <len> h <hex-digits>+
 *
 * <hex-digits>	::= <[0-9a-f]>
76  */
77 
/*
 * State for a single rust_demangle() call; passed to every rustdem_*()
 * helper and installed as the custr allocator's callback argument.
 */
typedef struct rustdem_state {
	const char	*rds_str;	/* mangled input as given by caller */
	custr_t		*rds_demangled;	/* demangled output built so far */
	sysdem_ops_t	*rds_ops;	/* ops handed to zalloc()/xfree() */
	int		rds_error;	/* errno value on failure; 0 if none */
} rustdem_state_t;
84 
/*
 * Table of the '$...$' escape sequences that may appear inside a name
 * segment, along with the single character each one is replaced by.
 * rustdem_parse_special() walks this table in order and uses the first
 * entry whose sequence matches.  Note that "$SP$" yields '@'; a space is
 * produced by "$u20$".
 */
static const struct rust_charmap {
	const char	*ruc_seq;	/* escape sequence in mangled name */
	char		ruc_ch;		/* character emitted in its place */
} rust_charmap[] = {
	{ "$SP$", '@' },
	{ "$BP$", '*' },
	{ "$RF$", '&' },
	{ "$LT$", '<' },
	{ "$GT$", '>' },
	{ "$LP$", '(' },
	{ "$RP$", ')' },
	{ "$C$", ',' },
	{ "$u7e$", '~' },
	{ "$u20$", ' ' },
	{ "$u27$", '\'' },
	{ "$u3d$", '=' },
	{ "$u5b$", '[' },
	{ "$u5d$", ']' },
	{ "$u7b$", '{' },
	{ "$u7d$", '}' },
	{ "$u3b$", ';' },
	{ "$u2b$", '+' },
	{ "$u22$", '"' }
};
/* Number of entries in rust_charmap[] */
static const size_t rust_charmap_sz = ARRAY_SIZE(rust_charmap);
110 
/* custr allocator callbacks; both forward to the caller's sysdem_ops_t */
static void *rustdem_alloc(custr_alloc_t *, size_t);
static void rustdem_free(custr_alloc_t *, void *, size_t);

/* Output and input-validation helpers */
static boolean_t rustdem_append_c(rustdem_state_t *, char);
static boolean_t rustdem_all_ascii(const strview_t *);

/*
 * Parsing routines.  Each returns B_TRUE on success and B_FALSE on failure;
 * a non-zero rds_error in the state causes them to fail.
 */
static boolean_t rustdem_parse_prefix(rustdem_state_t *, strview_t *);
static boolean_t rustdem_parse_name(rustdem_state_t *, strview_t *);
static boolean_t rustdem_parse_hash(rustdem_state_t *, strview_t *);
static boolean_t rustdem_parse_num(rustdem_state_t *, strview_t *, uint64_t *);
static boolean_t rustdem_parse_special(rustdem_state_t *, strview_t *);
static boolean_t rustdem_add_sep(rustdem_state_t *);
123 
124 char *
125 rust_demangle(const char *s, size_t slen, sysdem_ops_t *ops)
126 {
127 	rustdem_state_t st = {
128 		.rds_str = s,
129 		.rds_ops = ops,
130 	};
131 	custr_alloc_ops_t custr_ops = {
132 		.custr_ao_alloc = rustdem_alloc,
133 		.custr_ao_free = rustdem_free
134 	};
135 	custr_alloc_t custr_alloc = {
136 		.cua_version = CUSTR_VERSION
137 	};
138 	strview_t sv;
139 	int ret;
140 
141 	if (custr_alloc_init(&custr_alloc, &custr_ops) != 0)
142 		return (NULL);
143 	custr_alloc.cua_arg = &st;
144 
145 	sv_init_str(&sv, s, s + slen);
146 
147 	if (sv_remaining(&sv) < 1 || sv_peek(&sv, -1) != 'E') {
148 		DEMDEBUG("ERROR: string is either too small or does not end "
149 		    "with 'E'");
150 		errno = EINVAL;
151 		return (NULL);
152 	}
153 
154 	if (!rustdem_parse_prefix(&st, &sv)) {
155 		DEMDEBUG("ERROR: could not parse prefix");
156 		errno = EINVAL;
157 		return (NULL);
158 	}
159 	DEMDEBUG("parsed prefix; remaining='%.*s'", SV_PRINT(&sv));
160 
161 	if (!rustdem_all_ascii(&sv)) {
162 		/* rustdem_all_ascii() provides debug output */
163 		errno = EINVAL;
164 		return (NULL);
165 	}
166 
167 	if ((ret = custr_xalloc(&st.rds_demangled, &custr_alloc)) != 0)
168 		return (NULL);
169 
170 	if (!rustdem_parse_name(&st, &sv)) {
171 		if (st.rds_error == 0)
172 			st.rds_error = EINVAL;
173 		goto fail;
174 	}
175 
176 	if (sv_remaining(&sv) > 0) {
177 		DEMDEBUG("ERROR: unexpected trailing characters after "
178 		    "terminating 'E': '%.*s'", SV_PRINT(&sv));
179 		st.rds_error = EINVAL;
180 		goto fail;
181 	}
182 
183 	char *res = xstrdup(ops, custr_cstr(st.rds_demangled));
184 	if (res == NULL) {
185 		st.rds_error = errno;
186 		goto fail;
187 	}
188 
189 	custr_free(st.rds_demangled);
190 	DEMDEBUG("result = '%s'", res);
191 	return (res);
192 
193 fail:
194 	custr_free(st.rds_demangled);
195 	errno = st.rds_error;
196 	return (NULL);
197 }
198 
199 static boolean_t
200 rustdem_parse_prefix(rustdem_state_t *st, strview_t *svp)
201 {
202 	strview_t pfx;
203 
204 	sv_init_sv(&pfx, svp);
205 
206 	DEMDEBUG("checking for '_Z' or '__Z' in '%.*s'", SV_PRINT(&pfx));
207 
208 	if (st->rds_error != 0)
209 		return (B_FALSE);
210 
211 	if (!sv_consume_if_c(&pfx, '_'))
212 		return (B_FALSE);
213 
214 	(void) sv_consume_if_c(&pfx, '_');
215 
216 	if (!sv_consume_if_c(&pfx, 'Z'))
217 		return (B_FALSE);
218 
219 	/* Update svp with new position */
220 	sv_init_sv(svp, &pfx);
221 	return (B_TRUE);
222 }
223 
224 static boolean_t
225 rustdem_parse_name_segment(rustdem_state_t *st, strview_t *svp, boolean_t first)
226 {
227 	strview_t sv;
228 	strview_t name;
229 	uint64_t len;
230 	size_t rem;
231 	boolean_t last = B_FALSE;
232 
233 	if (st->rds_error != 0 || sv_remaining(svp) == 0)
234 		return (B_FALSE);
235 
236 	sv_init_sv(&sv, svp);
237 
238 	if (!rustdem_parse_num(st, &sv, &len)) {
239 		DEMDEBUG("ERROR: no leading length");
240 		st->rds_error = EINVAL;
241 		return (B_FALSE);
242 	}
243 
244 	rem = sv_remaining(&sv);
245 
246 	if (rem < len) {
247 		st->rds_error = EINVAL;
248 		return (B_FALSE);
249 	}
250 
251 	/* Is this the last segment before the terminating E? */
252 	if (rem == len + 1) {
253 		VERIFY3U(sv_peek(&sv, -1), ==, 'E');
254 		last = B_TRUE;
255 	}
256 
257 	if (!first && !rustdem_add_sep(st))
258 		return (B_FALSE);
259 
260 	/* Reduce length of seg to the length we parsed */
261 	(void) sv_init_sv_range(&name, &sv, len);
262 
263 	DEMDEBUG("%s: segment='%.*s'", __func__, SV_PRINT(&name));
264 
265 	/*
266 	 * A rust hash starts with 'h', and is the last component of a name
267 	 * before the terminating 'E'. It is however not always present
268 	 * in every mangled symbol, and a last segment that starts with 'h'
269 	 * could be confused for it, so failing to parse it just means
270 	 * we don't have a trailing hash.
271 	 */
272 	if (sv_peek(&name, 0) == 'h' && last) {
273 		if (rustdem_parse_hash(st, &name))
274 			goto done;
275 
276 		/*
277 		 * However any error other than 'not a hash' (e.g. ENOMEM)
278 		 * means we should fail.
279 		 */
280 		if (st->rds_error != 0)
281 			goto done;
282 	}
283 
284 	while (sv_remaining(&name) > 0) {
285 		switch (sv_peek(&name, 0)) {
286 		case '$':
287 			if (rustdem_parse_special(st, &name))
288 				continue;
289 			break;
290 		case '_':
291 			if (sv_peek(&name, 1) == '$') {
292 				/*
293 				 * Only consume/ignore '_'.  Leave
294 				 * $ for next round.
295 				 */
296 				sv_consume_n(&name, 1);
297 				continue;
298 			}
299 			break;
300 		case '.':
301 			/* Convert '..' to '::' */
302 			if (sv_peek(&name, 1) != '.')
303 				break;
304 
305 			if (!rustdem_add_sep(st))
306 				return (B_FALSE);
307 
308 			sv_consume_n(&name, 2);
309 			continue;
310 		default:
311 			break;
312 		}
313 
314 		if (custr_appendc(st->rds_demangled,
315 		    sv_consume_c(&name)) != 0) {
316 			st->rds_error = ENOMEM;
317 			return (B_FALSE);
318 		}
319 	}
320 
321 done:
322 	sv_consume_n(&sv, len);
323 	VERIFY3P(svp->sv_first, <=, sv.sv_first);
324 	DEMDEBUG("%s: consumed '%.*s'", __func__,
325 	    (int)(sv.sv_first - svp->sv_first), svp->sv_first);
326 	sv_init_sv(svp, &sv);
327 	return (B_TRUE);
328 }
329 
330 /*
331  * Parse N (<num><name>{num})+[<num>h<hex digits>]E
332  */
333 static boolean_t
334 rustdem_parse_name(rustdem_state_t *st, strview_t *svp)
335 {
336 	strview_t name;
337 	boolean_t first = B_TRUE;
338 
339 	if (st->rds_error != 0)
340 		return (B_FALSE);
341 
342 	sv_init_sv(&name, svp);
343 
344 	DEMDEBUG("%s: name = '%.*s'", __func__, SV_PRINT(&name));
345 
346 	if (sv_remaining(&name) == 0) {
347 		DEMDEBUG("%s: empty name", __func__);
348 		return (B_FALSE);
349 	}
350 
351 	if (!sv_consume_if_c(&name, 'N')) {
352 		DEMDEBUG("%s: does not start with 'N'", __func__);
353 		return (B_FALSE);
354 	}
355 
356 	while (sv_remaining(&name) > 0 && sv_peek(&name, 0) != 'E') {
357 		if (!rustdem_parse_name_segment(st, &name, first))
358 			return (B_FALSE);
359 		first = B_FALSE;
360 	}
361 	VERIFY(sv_consume_if_c(&name, 'E'));
362 
363 	VERIFY3P(svp->sv_first, <=, name.sv_first);
364 	DEMDEBUG("%s: consumed '%.*s'", __func__,
365 	    (int)(name.sv_first - svp->sv_first), svp->sv_first);
366 
367 	sv_init_sv(svp, &name);
368 	return (B_TRUE);
369 }
370 
371 static boolean_t
372 rustdem_parse_hash(rustdem_state_t *st, strview_t *svp)
373 {
374 	strview_t sv;
375 
376 	sv_init_sv(&sv, svp);
377 
378 	VERIFY(sv_consume_if_c(&sv, 'h'));
379 	if (!rustdem_append_c(st, 'h'))
380 		return (B_FALSE);
381 
382 	while (sv_remaining(&sv) > 0) {
383 		char c = sv_consume_c(&sv);
384 
385 		switch (c) {
386 		/*
387 		 * The upper-case hex digits (A-F) are excluded as valid
388 		 * hash values for several reasons:
389 		 *
390 		 * 1. It would result in two different possible names for
391 		 * the same function, leading to ambiguity in linking (among
392 		 * other things).
393 		 *
394 		 * 2. It would cause potential ambiguity in parsing -- is a
395 		 * trailing 'E' part of the hash, or the terminating character
396 		 * in the mangled name?
397 		 *
398 		 * 3. No examples were able to be found in the wild where
399 		 * uppercase digits are used, and other rust demanglers all
400 		 * seem to assume the hash must contain lower-case hex digits.
401 		 */
402 		case '0': case '1': case '2': case '3':
403 		case '4': case '5': case '6': case '7':
404 		case '8': case '9': case 'a': case 'b':
405 		case 'c': case 'd': case 'e': case 'f':
406 			if (!rustdem_append_c(st, c))
407 				return (B_FALSE);
408 			break;
409 		default:
410 			return (B_FALSE);
411 		}
412 	}
413 
414 	sv_init_sv(svp, &sv);
415 	return (B_TRUE);
416 }
417 
418 /*
419  * We have to pick an arbitrary limit here; 999,999,999 fits comfortably
420  * within an int32_t, so let's go with that, as it seems unlikely we'd
421  * ever see a larger value in context.
422  */
423 #define	MAX_DIGITS 9
424 
425 static boolean_t
426 rustdem_parse_num(rustdem_state_t *restrict st, strview_t *restrict svp,
427     uint64_t *restrict valp)
428 {
429 	strview_t snum;
430 	uint64_t v = 0;
431 	size_t ndigits = 0;
432 	char c;
433 
434 	if (st->rds_error != 0)
435 		return (B_FALSE);
436 
437 	sv_init_sv(&snum, svp);
438 
439 	DEMDEBUG("%s: str='%.*s'", __func__, SV_PRINT(&snum));
440 
441 	c = sv_peek(&snum, 0);
442 	if (!ISDIGIT(c)) {
443 		DEMDEBUG("%s: ERROR no digits in str\n", __func__);
444 		st->rds_error = EINVAL;
445 		return (B_FALSE);
446 	}
447 
448 	/*
449 	 * Since there is currently no official specification on rust name
450 	 * mangling, only that it has been stated that rust follows what
451 	 * C++ mangling does.  In the Itanium C++ ABI (what practically
452 	 * every non-Windows C++ implementation uses these days), it
453 	 * explicitly disallows leading 0s in numeric values (except for
454 	 * substition and template indexes, which aren't relevant here).
455 	 * We enforce the same restriction -- if a rust implementation allowed
456 	 * leading zeros in numbers (basically segment lengths) it'd
457 	 * cause all sorts of ambiguity problems with names that likely lead
458 	 * to much bigger problems with linking and such, so this seems
459 	 * reasonable.
460 	 */
461 	if (c == '0') {
462 		DEMDEBUG("%s: ERROR number starts with leading 0\n", __func__);
463 		st->rds_error = EINVAL;
464 		return (B_FALSE);
465 	}
466 
467 	while (sv_remaining(&snum) > 0 && ndigits <= MAX_DIGITS) {
468 		c = sv_consume_c(&snum);
469 
470 		if (!ISDIGIT(c))
471 			break;
472 
473 		v *= 10;
474 		v += c - '0';
475 		ndigits++;
476 	}
477 
478 	if (ndigits > MAX_DIGITS) {
479 		DEMDEBUG("%s: value %llu is too large\n", __func__, v);
480 		st->rds_error = ERANGE;
481 		return (B_FALSE);
482 	}
483 
484 	DEMDEBUG("%s: num=%llu", __func__, v);
485 
486 	*valp = v;
487 	sv_consume_n(svp, ndigits);
488 	return (B_TRUE);
489 }
490 
491 static boolean_t
492 rustdem_parse_special(rustdem_state_t *restrict st, strview_t *restrict svp)
493 {
494 	if (st->rds_error != 0)
495 		return (B_FALSE);
496 
497 	if (sv_peek(svp, 0) != '$')
498 		return (B_FALSE);
499 
500 	for (size_t i = 0; i < rust_charmap_sz; i++) {
501 		if (sv_consume_if(svp, rust_charmap[i].ruc_seq)) {
502 			if (!rustdem_append_c(st, rust_charmap[i].ruc_ch))
503 				return (B_FALSE);
504 			return (B_TRUE);
505 		}
506 	}
507 	return (B_FALSE);
508 }
509 
510 static boolean_t
511 rustdem_add_sep(rustdem_state_t *st)
512 {
513 	if (st->rds_error != 0)
514 		return (B_FALSE);
515 
516 	if (!rustdem_append_c(st, ':') ||
517 	    !rustdem_append_c(st, ':'))
518 		return (B_FALSE);
519 
520 	return (B_TRUE);
521 }
522 
523 static boolean_t
524 rustdem_append_c(rustdem_state_t *st, char c)
525 {
526 	if (st->rds_error != 0)
527 		return (B_FALSE);
528 
529 	if (custr_appendc(st->rds_demangled, c) == 0)
530 		return (B_TRUE);
531 
532 	st->rds_error = errno;
533 	return (B_FALSE);
534 }
535 
536 static boolean_t
537 rustdem_all_ascii(const strview_t *svp)
538 {
539 	strview_t p;
540 
541 	sv_init_sv(&p, svp);
542 
543 	while (sv_remaining(&p) > 0) {
544 		char c = sv_consume_c(&p);
545 
546 		/*
547 		 * #including <sys/ctype.h> conflicts with <ctype.h>.  Since
548 		 * we want the C locale macros (ISDIGIT, etc), it also means
549 		 * we can't use isascii(3C).
550 		 */
551 		if ((c & 0x80) != 0) {
552 			DEMDEBUG("%s: found non-ascii character 0x%02hhx at "
553 			    "offset %tu", __func__, c,
554 			    (ptrdiff_t)(p.sv_first - svp->sv_first));
555 			return (B_FALSE);
556 		}
557 	}
558 	return (B_TRUE);
559 }
560 
561 static void *
562 rustdem_alloc(custr_alloc_t *cao, size_t len)
563 {
564 	rustdem_state_t *st = cao->cua_arg;
565 	return (zalloc(st->rds_ops, len));
566 }
567 
568 static void
569 rustdem_free(custr_alloc_t *cao, void *p, size_t len)
570 {
571 	rustdem_state_t *st = cao->cua_arg;
572 	xfree(st->rds_ops, p, len);
573 }
574