1*1cd08393SJason King /*
2*1cd08393SJason King  * This file and its contents are supplied under the terms of the
3*1cd08393SJason King  * Common Development and Distribution License ("CDDL"), version 1.0.
4*1cd08393SJason King  * You may only use this file in accordance with the terms of version
5*1cd08393SJason King  * 1.0 of the CDDL.
6*1cd08393SJason King  *
7*1cd08393SJason King  * A full copy of the text of the CDDL should have accompanied this
8*1cd08393SJason King  * source.  A copy of the CDDL is also available via the Internet at
9*1cd08393SJason King  * http://www.illumos.org/license/CDDL.
10*1cd08393SJason King  */
11*1cd08393SJason King 
12*1cd08393SJason King /*
13*1cd08393SJason King  * Copyright 2019 Joyent, Inc.
14*1cd08393SJason King  * Copyright 2021 Jason King
15*1cd08393SJason King  */
16*1cd08393SJason King 
17*1cd08393SJason King #include <errno.h>
18*1cd08393SJason King #include <libcustr.h>
19*1cd08393SJason King #include <limits.h>
20*1cd08393SJason King #include <string.h>
21*1cd08393SJason King #include <stdio.h>
22*1cd08393SJason King 
23*1cd08393SJason King #include "rust.h"
24*1cd08393SJason King 
25*1cd08393SJason King /*
26*1cd08393SJason King  * Unfortunately, there is currently no official specification for the legacy
27*1cd08393SJason King  * rust name mangling.  This is an attempt to document the understanding of the
28*1cd08393SJason King  * mangling used here.  It is based off examination of
29*1cd08393SJason King  *     https://docs.rs/rustc-demangle/0.1.13/rustc_demangle/
30*1cd08393SJason King  *
31*1cd08393SJason King  * A mangled rust name is:
32*1cd08393SJason King  *     <prefix> <name>
33*1cd08393SJason King  *
34*1cd08393SJason King  * <prefix>	::=	_Z
35*1cd08393SJason King  *			__Z
36*1cd08393SJason King  *
37*1cd08393SJason King  * <name>	::= N <name-segment>+ [<hash>] E
38*1cd08393SJason King  *
39*1cd08393SJason King  * <name-segment> ::= <len> <name-chars>{len}
40*1cd08393SJason King  *
 * <len>	::= [1-9][0-9]*
42*1cd08393SJason King  *
43*1cd08393SJason King  * <name-chars>	::=	<[A-Za-z]> <[A-Za-z0-9]>*
44*1cd08393SJason King  *			<separator>
45*1cd08393SJason King  *			<special>
46*1cd08393SJason King  *
47*1cd08393SJason King  * <separator>	::=	'..'	# '::'
48*1cd08393SJason King  *
49*1cd08393SJason King  * <special>	::=	$SP$	# '@'
50*1cd08393SJason King  *			$BP$	# '*'
51*1cd08393SJason King  *			$RF$	# '&'
52*1cd08393SJason King  *			$LT$	# '<'
53*1cd08393SJason King  *			$GT$	# '>'
54*1cd08393SJason King  *			$LP$	# '('
55*1cd08393SJason King  *			$RP$	# ')'
 *			$C$	# ','
 *			$uXXXX$	# the Unicode code point given by the lower-case
 *				# hex digits XXXX, e.g. $u20$ is ' '
57*1cd08393SJason King  *
 * <hash>	::= <len> h <hex-digit>+
 *
 * <hex-digit>	::= <[0-9a-f]>
61*1cd08393SJason King  */
62*1cd08393SJason King 
63*1cd08393SJason King static const struct rust_charmap {
64*1cd08393SJason King 	const char	*ruc_seq;
65*1cd08393SJason King 	char		ruc_ch;
66*1cd08393SJason King } rust_charmap[] = {
67*1cd08393SJason King 	{ "$SP$", '@' },
68*1cd08393SJason King 	{ "$BP$", '*' },
69*1cd08393SJason King 	{ "$RF$", '&' },
70*1cd08393SJason King 	{ "$LT$", '<' },
71*1cd08393SJason King 	{ "$GT$", '>' },
72*1cd08393SJason King 	{ "$LP$", '(' },
73*1cd08393SJason King 	{ "$RP$", ')' },
74*1cd08393SJason King 	{ "$C$", ',' },
75*1cd08393SJason King };
76*1cd08393SJason King static const size_t rust_charmap_sz = ARRAY_SIZE(rust_charmap);
77*1cd08393SJason King 
78*1cd08393SJason King static boolean_t rustleg_valid_sym(const strview_t *);
79*1cd08393SJason King static boolean_t rustleg_parse_name(rust_state_t *, strview_t *);
80*1cd08393SJason King static boolean_t rustleg_parse_hash(rust_state_t *, strview_t *);
81*1cd08393SJason King static boolean_t rustleg_parse_special(rust_state_t *, strview_t *);
82*1cd08393SJason King static boolean_t rustleg_add_sep(rust_state_t *);
83*1cd08393SJason King 
84*1cd08393SJason King boolean_t
rust_demangle_legacy(rust_state_t * restrict st,strview_t * restrict sv)85*1cd08393SJason King rust_demangle_legacy(rust_state_t *restrict st, strview_t *restrict sv)
86*1cd08393SJason King {
87*1cd08393SJason King 
88*1cd08393SJason King 	/* Make sure the whole thing contains valid characters */
89*1cd08393SJason King 	if (!rustleg_valid_sym(sv)) {
90*1cd08393SJason King 		st->rs_error = EINVAL;
91*1cd08393SJason King 		return (B_FALSE);
92*1cd08393SJason King 	}
93*1cd08393SJason King 
94*1cd08393SJason King 	if (sv_peek(sv, -1) != 'E') {
95*1cd08393SJason King 		DEMDEBUG("ERROR: string does not end with 'E'");
96*1cd08393SJason King 		st->rs_error = EINVAL;
97*1cd08393SJason King 		return (B_FALSE);
98*1cd08393SJason King 	}
99*1cd08393SJason King 
100*1cd08393SJason King 	if (!rustleg_parse_name(st, sv))
101*1cd08393SJason King 		return (B_FALSE);
102*1cd08393SJason King 
103*1cd08393SJason King 	if (sv_remaining(sv) != 0) {
104*1cd08393SJason King 		DEMDEBUG("ERROR: trailing characters in name");
105*1cd08393SJason King 		st->rs_error = EINVAL;
106*1cd08393SJason King 		return (B_FALSE);
107*1cd08393SJason King 	}
108*1cd08393SJason King 
109*1cd08393SJason King 	return (B_TRUE);
110*1cd08393SJason King }
111*1cd08393SJason King 
112*1cd08393SJason King static boolean_t
rustleg_parse_name_segment(rust_state_t * st,strview_t * svp,boolean_t first)113*1cd08393SJason King rustleg_parse_name_segment(rust_state_t *st, strview_t *svp, boolean_t first)
114*1cd08393SJason King {
115*1cd08393SJason King 	strview_t orig;
116*1cd08393SJason King 	strview_t name;
117*1cd08393SJason King 	uint64_t len;
118*1cd08393SJason King 	size_t rem;
119*1cd08393SJason King 	boolean_t last = B_FALSE;
120*1cd08393SJason King 
121*1cd08393SJason King 	if (HAS_ERROR(st) || sv_remaining(svp) == 0)
122*1cd08393SJason King 		return (B_FALSE);
123*1cd08393SJason King 
124*1cd08393SJason King 	sv_init_sv(&orig, svp);
125*1cd08393SJason King 
126*1cd08393SJason King 	if (!rust_parse_base10(st, svp, &len)) {
127*1cd08393SJason King 		DEMDEBUG("ERROR: no leading length");
128*1cd08393SJason King 		st->rs_error = EINVAL;
129*1cd08393SJason King 		return (B_FALSE);
130*1cd08393SJason King 	}
131*1cd08393SJason King 
132*1cd08393SJason King 	rem = sv_remaining(svp);
133*1cd08393SJason King 
134*1cd08393SJason King 	if (rem < len) {
135*1cd08393SJason King 		DEMDEBUG("ERROR: segment length (%" PRIu64 ") > remaining "
136*1cd08393SJason King 		    "bytes in string (%zu)", len, rem);
137*1cd08393SJason King 		st->rs_error = EINVAL;
138*1cd08393SJason King 		return (B_FALSE);
139*1cd08393SJason King 	}
140*1cd08393SJason King 
141*1cd08393SJason King 	/* Is this the last segment before the terminating E? */
142*1cd08393SJason King 	if (rem == len + 1) {
143*1cd08393SJason King 		VERIFY3U(sv_peek(svp, -1), ==, 'E');
144*1cd08393SJason King 		last = B_TRUE;
145*1cd08393SJason King 	}
146*1cd08393SJason King 
147*1cd08393SJason King 	if (!first && !rustleg_add_sep(st))
148*1cd08393SJason King 		return (B_FALSE);
149*1cd08393SJason King 
150*1cd08393SJason King 	/* Reduce length of seg to the length we parsed */
151*1cd08393SJason King 	(void) sv_init_sv_range(&name, svp, len);
152*1cd08393SJason King 
153*1cd08393SJason King 	DEMDEBUG("%s: segment='%.*s'", __func__, SV_PRINT(&name));
154*1cd08393SJason King 
155*1cd08393SJason King 	/*
156*1cd08393SJason King 	 * A rust hash starts with 'h', and is the last component of a name
157*1cd08393SJason King 	 * before the terminating 'E'. It is however not always present
158*1cd08393SJason King 	 * in every mangled symbol, and a last segment that starts with 'h'
159*1cd08393SJason King 	 * could be confused for it, so failing to part it just means
160*1cd08393SJason King 	 * we don't have a trailing hash.
161*1cd08393SJason King 	 */
162*1cd08393SJason King 	if (sv_peek(&name, 0) == 'h' && last) {
163*1cd08393SJason King 		if (rustleg_parse_hash(st, &name))
164*1cd08393SJason King 			goto done;
165*1cd08393SJason King 
166*1cd08393SJason King 		/*
167*1cd08393SJason King 		 * However any error other than 'not a hash' (e.g. ENOMEM)
168*1cd08393SJason King 		 * means we should fail.
169*1cd08393SJason King 		 */
170*1cd08393SJason King 		if (st->rs_error != 0)
171*1cd08393SJason King 			goto done;
172*1cd08393SJason King 	}
173*1cd08393SJason King 
174*1cd08393SJason King 	/* A '_' followed by $ is ignored at the start of a name segment */
175*1cd08393SJason King 	if (sv_peek(&name, 0) == '_' && sv_peek(&name, 1) == '$')
176*1cd08393SJason King 		(void) sv_consume_n(&name, 1);
177*1cd08393SJason King 
178*1cd08393SJason King 	while (sv_remaining(&name) > 0) {
179*1cd08393SJason King 		switch (sv_peek(&name, 0)) {
180*1cd08393SJason King 		case '$':
181*1cd08393SJason King 			if (rustleg_parse_special(st, &name))
182*1cd08393SJason King 				continue;
183*1cd08393SJason King 			break;
184*1cd08393SJason King 		case '.':
185*1cd08393SJason King 			/* Convert '..' to '::' */
186*1cd08393SJason King 			if (sv_peek(&name, 1) != '.')
187*1cd08393SJason King 				break;
188*1cd08393SJason King 
189*1cd08393SJason King 			if (!rustleg_add_sep(st))
190*1cd08393SJason King 				return (B_FALSE);
191*1cd08393SJason King 
192*1cd08393SJason King 			sv_consume_n(&name, 2);
193*1cd08393SJason King 			continue;
194*1cd08393SJason King 		default:
195*1cd08393SJason King 			break;
196*1cd08393SJason King 		}
197*1cd08393SJason King 
198*1cd08393SJason King 		if (!rust_appendc(st, sv_consume_c(&name))) {
199*1cd08393SJason King 			SET_ERROR(st);
200*1cd08393SJason King 			return (B_FALSE);
201*1cd08393SJason King 		}
202*1cd08393SJason King 	}
203*1cd08393SJason King 
204*1cd08393SJason King done:
205*1cd08393SJason King 	sv_consume_n(svp, len);
206*1cd08393SJason King 
207*1cd08393SJason King 	VERIFY3P(orig.sv_first, <=, svp->sv_first);
208*1cd08393SJason King 	DEMDEBUG("%s: consumed '%.*s'", __func__,
209*1cd08393SJason King 	    (int)(uintptr_t)(svp->sv_first - orig.sv_first), orig.sv_first);
210*1cd08393SJason King 	return (B_TRUE);
211*1cd08393SJason King }
212*1cd08393SJason King 
213*1cd08393SJason King /*
214*1cd08393SJason King  * Parse N (<num><name>{num})+ [<num>h<hex digits]E
215*1cd08393SJason King  */
216*1cd08393SJason King static boolean_t
rustleg_parse_name(rust_state_t * st,strview_t * svp)217*1cd08393SJason King rustleg_parse_name(rust_state_t *st, strview_t *svp)
218*1cd08393SJason King {
219*1cd08393SJason King 	strview_t name;
220*1cd08393SJason King 	boolean_t first = B_TRUE;
221*1cd08393SJason King 
222*1cd08393SJason King 	sv_init_sv(&name, svp);
223*1cd08393SJason King 
224*1cd08393SJason King 	if (HAS_ERROR(st))
225*1cd08393SJason King 		return (B_FALSE);
226*1cd08393SJason King 
227*1cd08393SJason King 	DEMDEBUG("%s: name = '%.*s'", __func__, SV_PRINT(&name));
228*1cd08393SJason King 
229*1cd08393SJason King 	if (sv_remaining(svp) == 0) {
230*1cd08393SJason King 		DEMDEBUG("%s: empty name", __func__);
231*1cd08393SJason King 		return (B_FALSE);
232*1cd08393SJason King 	}
233*1cd08393SJason King 
234*1cd08393SJason King 	if (!sv_consume_if_c(svp, 'N')) {
235*1cd08393SJason King 		DEMDEBUG("%s: does not start with 'N'", __func__);
236*1cd08393SJason King 		return (B_FALSE);
237*1cd08393SJason King 	}
238*1cd08393SJason King 
239*1cd08393SJason King 	while (sv_remaining(svp) > 0 && sv_peek(svp, 0) != 'E') {
240*1cd08393SJason King 		if (!rustleg_parse_name_segment(st, svp, first))
241*1cd08393SJason King 			return (B_FALSE);
242*1cd08393SJason King 		first = B_FALSE;
243*1cd08393SJason King 	}
244*1cd08393SJason King 
245*1cd08393SJason King 	if (!sv_consume_if_c(svp, 'E')) {
246*1cd08393SJason King 		DEMDEBUG("%s: ERROR no terminating 'E'", __func__);
247*1cd08393SJason King 		return (B_FALSE);
248*1cd08393SJason King 	}
249*1cd08393SJason King 
250*1cd08393SJason King 	VERIFY3P(name.sv_first, <=, svp->sv_first);
251*1cd08393SJason King 	DEMDEBUG("%s: consumed '%.*s'", __func__,
252*1cd08393SJason King 	    (int)(uintptr_t)(svp->sv_first - name.sv_first), name.sv_first);
253*1cd08393SJason King 
254*1cd08393SJason King 	return (B_TRUE);
255*1cd08393SJason King }
256*1cd08393SJason King 
257*1cd08393SJason King static boolean_t
rustleg_parse_hash(rust_state_t * st,strview_t * svp)258*1cd08393SJason King rustleg_parse_hash(rust_state_t *st, strview_t *svp)
259*1cd08393SJason King {
260*1cd08393SJason King 	if (HAS_ERROR(st))
261*1cd08393SJason King 		return (B_FALSE);
262*1cd08393SJason King 
263*1cd08393SJason King 	VERIFY(sv_consume_if_c(svp, 'h'));
264*1cd08393SJason King 	if (!rust_appendc(st, 'h'))
265*1cd08393SJason King 		return (B_FALSE);
266*1cd08393SJason King 
267*1cd08393SJason King 	while (sv_remaining(svp) > 0) {
268*1cd08393SJason King 		char c = sv_consume_c(svp);
269*1cd08393SJason King 
270*1cd08393SJason King 		switch (c) {
271*1cd08393SJason King 		/*
272*1cd08393SJason King 		 * The upper-case hex digits (A-F) are excluded as valid
273*1cd08393SJason King 		 * hash values for several reasons:
274*1cd08393SJason King 		 *
275*1cd08393SJason King 		 * 1. It would result in two different possible names for
276*1cd08393SJason King 		 * the same function, leading to ambiguity in linking (among
277*1cd08393SJason King 		 * other things).
278*1cd08393SJason King 		 *
279*1cd08393SJason King 		 * 2. It would cause potential ambiguity in parsing -- is a
280*1cd08393SJason King 		 * trailing 'E' part of the hash, or the terminating character
281*1cd08393SJason King 		 * in the mangled name?
282*1cd08393SJason King 		 *
283*1cd08393SJason King 		 * 3. No examples were able to be found in the wild where
284*1cd08393SJason King 		 * uppercase digits are used, and other rust demanglers all
285*1cd08393SJason King 		 * seem to assume the hash must contain lower-case hex digits.
286*1cd08393SJason King 		 */
287*1cd08393SJason King 		case '0': case '1': case '2': case '3':
288*1cd08393SJason King 		case '4': case '5': case '6': case '7':
289*1cd08393SJason King 		case '8': case '9': case 'a': case 'b':
290*1cd08393SJason King 		case 'c': case 'd': case 'e': case 'f':
291*1cd08393SJason King 			if (!rust_appendc(st, c))
292*1cd08393SJason King 				return (B_FALSE);
293*1cd08393SJason King 			break;
294*1cd08393SJason King 		default:
295*1cd08393SJason King 			return (B_FALSE);
296*1cd08393SJason King 		}
297*1cd08393SJason King 	}
298*1cd08393SJason King 
299*1cd08393SJason King 	return (B_TRUE);
300*1cd08393SJason King }
301*1cd08393SJason King 
302*1cd08393SJason King static boolean_t
rustleg_parse_special(rust_state_t * restrict st,strview_t * restrict svp)303*1cd08393SJason King rustleg_parse_special(rust_state_t *restrict st, strview_t *restrict svp)
304*1cd08393SJason King {
305*1cd08393SJason King 	if (HAS_ERROR(st))
306*1cd08393SJason King 		return (B_FALSE);
307*1cd08393SJason King 
308*1cd08393SJason King 	if (sv_peek(svp, 0) != '$')
309*1cd08393SJason King 		return (B_FALSE);
310*1cd08393SJason King 
311*1cd08393SJason King 	for (size_t i = 0; i < rust_charmap_sz; i++) {
312*1cd08393SJason King 		if (sv_consume_if(svp, rust_charmap[i].ruc_seq)) {
313*1cd08393SJason King 			if (!rust_appendc(st, rust_charmap[i].ruc_ch))
314*1cd08393SJason King 				return (B_FALSE);
315*1cd08393SJason King 			return (B_TRUE);
316*1cd08393SJason King 		}
317*1cd08393SJason King 	}
318*1cd08393SJason King 
319*1cd08393SJason King 	/* Handle $uXXXX$ */
320*1cd08393SJason King 
321*1cd08393SJason King 	strview_t sv;
322*1cd08393SJason King 	uint32_t val = 0;
323*1cd08393SJason King 	uint_t ndigits = 0;
324*1cd08393SJason King 
325*1cd08393SJason King 	sv_init_sv(&sv, svp);
326*1cd08393SJason King 
327*1cd08393SJason King 	/* We peeked at this earlier, so it should still be there */
328*1cd08393SJason King 	VERIFY(sv_consume_if_c(&sv, '$'));
329*1cd08393SJason King 
330*1cd08393SJason King 	if (!sv_consume_if_c(&sv, 'u'))
331*1cd08393SJason King 		return (B_FALSE);
332*1cd08393SJason King 
333*1cd08393SJason King 	while (sv_remaining(&sv) > 0) {
334*1cd08393SJason King 		uint32_t cval = 0;
335*1cd08393SJason King 		char c;
336*1cd08393SJason King 
337*1cd08393SJason King 		if (ndigits == 4)
338*1cd08393SJason King 			return (B_FALSE);
339*1cd08393SJason King 
340*1cd08393SJason King 		c = sv_consume_c(&sv);
341*1cd08393SJason King 		if (c >= '0' && c <= '9')
342*1cd08393SJason King 			cval = c - '0';
343*1cd08393SJason King 		else if (c >= 'a' && c <= 'f')
344*1cd08393SJason King 			cval = c - 'a' + 10;
345*1cd08393SJason King 		else if (c == '$')
346*1cd08393SJason King 			break;
347*1cd08393SJason King 		else
348*1cd08393SJason King 			return (B_FALSE);
349*1cd08393SJason King 
350*1cd08393SJason King 		val <<= 4;
351*1cd08393SJason King 		val |= cval;
352*1cd08393SJason King 		ndigits++;
353*1cd08393SJason King 	}
354*1cd08393SJason King 
355*1cd08393SJason King 	if (!rust_append_utf8_c(st, val))
356*1cd08393SJason King 		return (B_FALSE);
357*1cd08393SJason King 
358*1cd08393SJason King 	sv_consume_n(svp, ndigits + 3);
359*1cd08393SJason King 	return (B_TRUE);
360*1cd08393SJason King }
361*1cd08393SJason King 
362*1cd08393SJason King static boolean_t
rustleg_add_sep(rust_state_t * st)363*1cd08393SJason King rustleg_add_sep(rust_state_t *st)
364*1cd08393SJason King {
365*1cd08393SJason King 	if (HAS_ERROR(st))
366*1cd08393SJason King 		return (B_FALSE);
367*1cd08393SJason King 
368*1cd08393SJason King 	return (rust_append(st, "::"));
369*1cd08393SJason King }
370*1cd08393SJason King 
371*1cd08393SJason King static boolean_t
rustleg_valid_sym(const strview_t * sv)372*1cd08393SJason King rustleg_valid_sym(const strview_t *sv)
373*1cd08393SJason King {
374*1cd08393SJason King 	size_t i;
375*1cd08393SJason King 
376*1cd08393SJason King 	for (i = 0; i < sv->sv_rem; i++) {
377*1cd08393SJason King 		char c = sv->sv_first[i];
378*1cd08393SJason King 
379*1cd08393SJason King 		if ((c & 0x80) == 0)
380*1cd08393SJason King 			continue;
381*1cd08393SJason King 		DEMDEBUG("%s: ERROR found 8-bit character '%c' in '%.*s' "
382*1cd08393SJason King 		    "at index %zu", __func__, c, SV_PRINT(sv), i);
383*1cd08393SJason King 		return (B_FALSE);
384*1cd08393SJason King 	}
385*1cd08393SJason King 	return (B_TRUE);
386*1cd08393SJason King }
387