1*1cd08393SJason King /*
2*1cd08393SJason King * This file and its contents are supplied under the terms of the
3*1cd08393SJason King * Common Development and Distribution License ("CDDL"), version 1.0.
4*1cd08393SJason King * You may only use this file in accordance with the terms of version
5*1cd08393SJason King * 1.0 of the CDDL.
6*1cd08393SJason King *
7*1cd08393SJason King * A full copy of the text of the CDDL should have accompanied this
8*1cd08393SJason King * source. A copy of the CDDL is also available via the Internet at
9*1cd08393SJason King * http://www.illumos.org/license/CDDL.
10*1cd08393SJason King */
11*1cd08393SJason King
12*1cd08393SJason King /*
13*1cd08393SJason King * Copyright 2019 Joyent, Inc.
14*1cd08393SJason King * Copyright 2021 Jason King
15*1cd08393SJason King */
16*1cd08393SJason King
17*1cd08393SJason King #include <errno.h>
18*1cd08393SJason King #include <libcustr.h>
19*1cd08393SJason King #include <limits.h>
20*1cd08393SJason King #include <string.h>
21*1cd08393SJason King #include <stdio.h>
22*1cd08393SJason King
23*1cd08393SJason King #include "rust.h"
24*1cd08393SJason King
25*1cd08393SJason King /*
26*1cd08393SJason King * Unfortunately, there is currently no official specification for the legacy
27*1cd08393SJason King * rust name mangling. This is an attempt to document the understanding of the
28*1cd08393SJason King * mangling used here. It is based off examination of
29*1cd08393SJason King * https://docs.rs/rustc-demangle/0.1.13/rustc_demangle/
30*1cd08393SJason King *
31*1cd08393SJason King * A mangled rust name is:
32*1cd08393SJason King * <prefix> <name>
33*1cd08393SJason King *
34*1cd08393SJason King * <prefix> ::= _Z
35*1cd08393SJason King * __Z
36*1cd08393SJason King *
37*1cd08393SJason King * <name> ::= N <name-segment>+ [<hash>] E
38*1cd08393SJason King *
39*1cd08393SJason King * <name-segment> ::= <len> <name-chars>{len}
40*1cd08393SJason King *
 * <len> ::= [1-9][0-9]*
42*1cd08393SJason King *
43*1cd08393SJason King * <name-chars> ::= <[A-Za-z]> <[A-Za-z0-9]>*
44*1cd08393SJason King * <separator>
45*1cd08393SJason King * <special>
46*1cd08393SJason King *
47*1cd08393SJason King * <separator> ::= '..' # '::'
48*1cd08393SJason King *
49*1cd08393SJason King * <special> ::= $SP$ # '@'
50*1cd08393SJason King * $BP$ # '*'
51*1cd08393SJason King * $RF$ # '&'
52*1cd08393SJason King * $LT$ # '<'
53*1cd08393SJason King * $GT$ # '>'
54*1cd08393SJason King * $LP$ # '('
55*1cd08393SJason King * $RP$ # ')'
56*1cd08393SJason King * $C$ # ','
57*1cd08393SJason King *
58*1cd08393SJason King * <hash> := <len> h <hex-digits>+
59*1cd08393SJason King *
60*1cd08393SJason King * <hex-digits> := <[0-9a-f]>
61*1cd08393SJason King */
62*1cd08393SJason King
/*
 * Fixed '$..$' escape sequences that may appear inside a legacy mangled
 * name segment, each paired with the single character it stands for.
 */
static const struct rust_charmap {
	const char	*ruc_seq;	/* escape sequence as mangled */
	char		ruc_ch;		/* character emitted in its place */
} rust_charmap[] = {
	{ .ruc_seq = "$SP$",	.ruc_ch = '@' },
	{ .ruc_seq = "$BP$",	.ruc_ch = '*' },
	{ .ruc_seq = "$RF$",	.ruc_ch = '&' },
	{ .ruc_seq = "$LT$",	.ruc_ch = '<' },
	{ .ruc_seq = "$GT$",	.ruc_ch = '>' },
	{ .ruc_seq = "$LP$",	.ruc_ch = '(' },
	{ .ruc_seq = "$RP$",	.ruc_ch = ')' },
	{ .ruc_seq = "$C$",	.ruc_ch = ',' },
};

/* Number of entries in rust_charmap[] */
static const size_t rust_charmap_sz =
    sizeof (rust_charmap) / sizeof (rust_charmap[0]);
77*1cd08393SJason King
78*1cd08393SJason King static boolean_t rustleg_valid_sym(const strview_t *);
79*1cd08393SJason King static boolean_t rustleg_parse_name(rust_state_t *, strview_t *);
80*1cd08393SJason King static boolean_t rustleg_parse_hash(rust_state_t *, strview_t *);
81*1cd08393SJason King static boolean_t rustleg_parse_special(rust_state_t *, strview_t *);
82*1cd08393SJason King static boolean_t rustleg_add_sep(rust_state_t *);
83*1cd08393SJason King
84*1cd08393SJason King boolean_t
rust_demangle_legacy(rust_state_t * restrict st,strview_t * restrict sv)85*1cd08393SJason King rust_demangle_legacy(rust_state_t *restrict st, strview_t *restrict sv)
86*1cd08393SJason King {
87*1cd08393SJason King
88*1cd08393SJason King /* Make sure the whole thing contains valid characters */
89*1cd08393SJason King if (!rustleg_valid_sym(sv)) {
90*1cd08393SJason King st->rs_error = EINVAL;
91*1cd08393SJason King return (B_FALSE);
92*1cd08393SJason King }
93*1cd08393SJason King
94*1cd08393SJason King if (sv_peek(sv, -1) != 'E') {
95*1cd08393SJason King DEMDEBUG("ERROR: string does not end with 'E'");
96*1cd08393SJason King st->rs_error = EINVAL;
97*1cd08393SJason King return (B_FALSE);
98*1cd08393SJason King }
99*1cd08393SJason King
100*1cd08393SJason King if (!rustleg_parse_name(st, sv))
101*1cd08393SJason King return (B_FALSE);
102*1cd08393SJason King
103*1cd08393SJason King if (sv_remaining(sv) != 0) {
104*1cd08393SJason King DEMDEBUG("ERROR: trailing characters in name");
105*1cd08393SJason King st->rs_error = EINVAL;
106*1cd08393SJason King return (B_FALSE);
107*1cd08393SJason King }
108*1cd08393SJason King
109*1cd08393SJason King return (B_TRUE);
110*1cd08393SJason King }
111*1cd08393SJason King
112*1cd08393SJason King static boolean_t
rustleg_parse_name_segment(rust_state_t * st,strview_t * svp,boolean_t first)113*1cd08393SJason King rustleg_parse_name_segment(rust_state_t *st, strview_t *svp, boolean_t first)
114*1cd08393SJason King {
115*1cd08393SJason King strview_t orig;
116*1cd08393SJason King strview_t name;
117*1cd08393SJason King uint64_t len;
118*1cd08393SJason King size_t rem;
119*1cd08393SJason King boolean_t last = B_FALSE;
120*1cd08393SJason King
121*1cd08393SJason King if (HAS_ERROR(st) || sv_remaining(svp) == 0)
122*1cd08393SJason King return (B_FALSE);
123*1cd08393SJason King
124*1cd08393SJason King sv_init_sv(&orig, svp);
125*1cd08393SJason King
126*1cd08393SJason King if (!rust_parse_base10(st, svp, &len)) {
127*1cd08393SJason King DEMDEBUG("ERROR: no leading length");
128*1cd08393SJason King st->rs_error = EINVAL;
129*1cd08393SJason King return (B_FALSE);
130*1cd08393SJason King }
131*1cd08393SJason King
132*1cd08393SJason King rem = sv_remaining(svp);
133*1cd08393SJason King
134*1cd08393SJason King if (rem < len) {
135*1cd08393SJason King DEMDEBUG("ERROR: segment length (%" PRIu64 ") > remaining "
136*1cd08393SJason King "bytes in string (%zu)", len, rem);
137*1cd08393SJason King st->rs_error = EINVAL;
138*1cd08393SJason King return (B_FALSE);
139*1cd08393SJason King }
140*1cd08393SJason King
141*1cd08393SJason King /* Is this the last segment before the terminating E? */
142*1cd08393SJason King if (rem == len + 1) {
143*1cd08393SJason King VERIFY3U(sv_peek(svp, -1), ==, 'E');
144*1cd08393SJason King last = B_TRUE;
145*1cd08393SJason King }
146*1cd08393SJason King
147*1cd08393SJason King if (!first && !rustleg_add_sep(st))
148*1cd08393SJason King return (B_FALSE);
149*1cd08393SJason King
150*1cd08393SJason King /* Reduce length of seg to the length we parsed */
151*1cd08393SJason King (void) sv_init_sv_range(&name, svp, len);
152*1cd08393SJason King
153*1cd08393SJason King DEMDEBUG("%s: segment='%.*s'", __func__, SV_PRINT(&name));
154*1cd08393SJason King
155*1cd08393SJason King /*
156*1cd08393SJason King * A rust hash starts with 'h', and is the last component of a name
157*1cd08393SJason King * before the terminating 'E'. It is however not always present
158*1cd08393SJason King * in every mangled symbol, and a last segment that starts with 'h'
159*1cd08393SJason King * could be confused for it, so failing to part it just means
160*1cd08393SJason King * we don't have a trailing hash.
161*1cd08393SJason King */
162*1cd08393SJason King if (sv_peek(&name, 0) == 'h' && last) {
163*1cd08393SJason King if (rustleg_parse_hash(st, &name))
164*1cd08393SJason King goto done;
165*1cd08393SJason King
166*1cd08393SJason King /*
167*1cd08393SJason King * However any error other than 'not a hash' (e.g. ENOMEM)
168*1cd08393SJason King * means we should fail.
169*1cd08393SJason King */
170*1cd08393SJason King if (st->rs_error != 0)
171*1cd08393SJason King goto done;
172*1cd08393SJason King }
173*1cd08393SJason King
174*1cd08393SJason King /* A '_' followed by $ is ignored at the start of a name segment */
175*1cd08393SJason King if (sv_peek(&name, 0) == '_' && sv_peek(&name, 1) == '$')
176*1cd08393SJason King (void) sv_consume_n(&name, 1);
177*1cd08393SJason King
178*1cd08393SJason King while (sv_remaining(&name) > 0) {
179*1cd08393SJason King switch (sv_peek(&name, 0)) {
180*1cd08393SJason King case '$':
181*1cd08393SJason King if (rustleg_parse_special(st, &name))
182*1cd08393SJason King continue;
183*1cd08393SJason King break;
184*1cd08393SJason King case '.':
185*1cd08393SJason King /* Convert '..' to '::' */
186*1cd08393SJason King if (sv_peek(&name, 1) != '.')
187*1cd08393SJason King break;
188*1cd08393SJason King
189*1cd08393SJason King if (!rustleg_add_sep(st))
190*1cd08393SJason King return (B_FALSE);
191*1cd08393SJason King
192*1cd08393SJason King sv_consume_n(&name, 2);
193*1cd08393SJason King continue;
194*1cd08393SJason King default:
195*1cd08393SJason King break;
196*1cd08393SJason King }
197*1cd08393SJason King
198*1cd08393SJason King if (!rust_appendc(st, sv_consume_c(&name))) {
199*1cd08393SJason King SET_ERROR(st);
200*1cd08393SJason King return (B_FALSE);
201*1cd08393SJason King }
202*1cd08393SJason King }
203*1cd08393SJason King
204*1cd08393SJason King done:
205*1cd08393SJason King sv_consume_n(svp, len);
206*1cd08393SJason King
207*1cd08393SJason King VERIFY3P(orig.sv_first, <=, svp->sv_first);
208*1cd08393SJason King DEMDEBUG("%s: consumed '%.*s'", __func__,
209*1cd08393SJason King (int)(uintptr_t)(svp->sv_first - orig.sv_first), orig.sv_first);
210*1cd08393SJason King return (B_TRUE);
211*1cd08393SJason King }
212*1cd08393SJason King
213*1cd08393SJason King /*
214*1cd08393SJason King * Parse N (<num><name>{num})+ [<num>h<hex digits]E
215*1cd08393SJason King */
216*1cd08393SJason King static boolean_t
rustleg_parse_name(rust_state_t * st,strview_t * svp)217*1cd08393SJason King rustleg_parse_name(rust_state_t *st, strview_t *svp)
218*1cd08393SJason King {
219*1cd08393SJason King strview_t name;
220*1cd08393SJason King boolean_t first = B_TRUE;
221*1cd08393SJason King
222*1cd08393SJason King sv_init_sv(&name, svp);
223*1cd08393SJason King
224*1cd08393SJason King if (HAS_ERROR(st))
225*1cd08393SJason King return (B_FALSE);
226*1cd08393SJason King
227*1cd08393SJason King DEMDEBUG("%s: name = '%.*s'", __func__, SV_PRINT(&name));
228*1cd08393SJason King
229*1cd08393SJason King if (sv_remaining(svp) == 0) {
230*1cd08393SJason King DEMDEBUG("%s: empty name", __func__);
231*1cd08393SJason King return (B_FALSE);
232*1cd08393SJason King }
233*1cd08393SJason King
234*1cd08393SJason King if (!sv_consume_if_c(svp, 'N')) {
235*1cd08393SJason King DEMDEBUG("%s: does not start with 'N'", __func__);
236*1cd08393SJason King return (B_FALSE);
237*1cd08393SJason King }
238*1cd08393SJason King
239*1cd08393SJason King while (sv_remaining(svp) > 0 && sv_peek(svp, 0) != 'E') {
240*1cd08393SJason King if (!rustleg_parse_name_segment(st, svp, first))
241*1cd08393SJason King return (B_FALSE);
242*1cd08393SJason King first = B_FALSE;
243*1cd08393SJason King }
244*1cd08393SJason King
245*1cd08393SJason King if (!sv_consume_if_c(svp, 'E')) {
246*1cd08393SJason King DEMDEBUG("%s: ERROR no terminating 'E'", __func__);
247*1cd08393SJason King return (B_FALSE);
248*1cd08393SJason King }
249*1cd08393SJason King
250*1cd08393SJason King VERIFY3P(name.sv_first, <=, svp->sv_first);
251*1cd08393SJason King DEMDEBUG("%s: consumed '%.*s'", __func__,
252*1cd08393SJason King (int)(uintptr_t)(svp->sv_first - name.sv_first), name.sv_first);
253*1cd08393SJason King
254*1cd08393SJason King return (B_TRUE);
255*1cd08393SJason King }
256*1cd08393SJason King
257*1cd08393SJason King static boolean_t
rustleg_parse_hash(rust_state_t * st,strview_t * svp)258*1cd08393SJason King rustleg_parse_hash(rust_state_t *st, strview_t *svp)
259*1cd08393SJason King {
260*1cd08393SJason King if (HAS_ERROR(st))
261*1cd08393SJason King return (B_FALSE);
262*1cd08393SJason King
263*1cd08393SJason King VERIFY(sv_consume_if_c(svp, 'h'));
264*1cd08393SJason King if (!rust_appendc(st, 'h'))
265*1cd08393SJason King return (B_FALSE);
266*1cd08393SJason King
267*1cd08393SJason King while (sv_remaining(svp) > 0) {
268*1cd08393SJason King char c = sv_consume_c(svp);
269*1cd08393SJason King
270*1cd08393SJason King switch (c) {
271*1cd08393SJason King /*
272*1cd08393SJason King * The upper-case hex digits (A-F) are excluded as valid
273*1cd08393SJason King * hash values for several reasons:
274*1cd08393SJason King *
275*1cd08393SJason King * 1. It would result in two different possible names for
276*1cd08393SJason King * the same function, leading to ambiguity in linking (among
277*1cd08393SJason King * other things).
278*1cd08393SJason King *
279*1cd08393SJason King * 2. It would cause potential ambiguity in parsing -- is a
280*1cd08393SJason King * trailing 'E' part of the hash, or the terminating character
281*1cd08393SJason King * in the mangled name?
282*1cd08393SJason King *
283*1cd08393SJason King * 3. No examples were able to be found in the wild where
284*1cd08393SJason King * uppercase digits are used, and other rust demanglers all
285*1cd08393SJason King * seem to assume the hash must contain lower-case hex digits.
286*1cd08393SJason King */
287*1cd08393SJason King case '0': case '1': case '2': case '3':
288*1cd08393SJason King case '4': case '5': case '6': case '7':
289*1cd08393SJason King case '8': case '9': case 'a': case 'b':
290*1cd08393SJason King case 'c': case 'd': case 'e': case 'f':
291*1cd08393SJason King if (!rust_appendc(st, c))
292*1cd08393SJason King return (B_FALSE);
293*1cd08393SJason King break;
294*1cd08393SJason King default:
295*1cd08393SJason King return (B_FALSE);
296*1cd08393SJason King }
297*1cd08393SJason King }
298*1cd08393SJason King
299*1cd08393SJason King return (B_TRUE);
300*1cd08393SJason King }
301*1cd08393SJason King
302*1cd08393SJason King static boolean_t
rustleg_parse_special(rust_state_t * restrict st,strview_t * restrict svp)303*1cd08393SJason King rustleg_parse_special(rust_state_t *restrict st, strview_t *restrict svp)
304*1cd08393SJason King {
305*1cd08393SJason King if (HAS_ERROR(st))
306*1cd08393SJason King return (B_FALSE);
307*1cd08393SJason King
308*1cd08393SJason King if (sv_peek(svp, 0) != '$')
309*1cd08393SJason King return (B_FALSE);
310*1cd08393SJason King
311*1cd08393SJason King for (size_t i = 0; i < rust_charmap_sz; i++) {
312*1cd08393SJason King if (sv_consume_if(svp, rust_charmap[i].ruc_seq)) {
313*1cd08393SJason King if (!rust_appendc(st, rust_charmap[i].ruc_ch))
314*1cd08393SJason King return (B_FALSE);
315*1cd08393SJason King return (B_TRUE);
316*1cd08393SJason King }
317*1cd08393SJason King }
318*1cd08393SJason King
319*1cd08393SJason King /* Handle $uXXXX$ */
320*1cd08393SJason King
321*1cd08393SJason King strview_t sv;
322*1cd08393SJason King uint32_t val = 0;
323*1cd08393SJason King uint_t ndigits = 0;
324*1cd08393SJason King
325*1cd08393SJason King sv_init_sv(&sv, svp);
326*1cd08393SJason King
327*1cd08393SJason King /* We peeked at this earlier, so it should still be there */
328*1cd08393SJason King VERIFY(sv_consume_if_c(&sv, '$'));
329*1cd08393SJason King
330*1cd08393SJason King if (!sv_consume_if_c(&sv, 'u'))
331*1cd08393SJason King return (B_FALSE);
332*1cd08393SJason King
333*1cd08393SJason King while (sv_remaining(&sv) > 0) {
334*1cd08393SJason King uint32_t cval = 0;
335*1cd08393SJason King char c;
336*1cd08393SJason King
337*1cd08393SJason King if (ndigits == 4)
338*1cd08393SJason King return (B_FALSE);
339*1cd08393SJason King
340*1cd08393SJason King c = sv_consume_c(&sv);
341*1cd08393SJason King if (c >= '0' && c <= '9')
342*1cd08393SJason King cval = c - '0';
343*1cd08393SJason King else if (c >= 'a' && c <= 'f')
344*1cd08393SJason King cval = c - 'a' + 10;
345*1cd08393SJason King else if (c == '$')
346*1cd08393SJason King break;
347*1cd08393SJason King else
348*1cd08393SJason King return (B_FALSE);
349*1cd08393SJason King
350*1cd08393SJason King val <<= 4;
351*1cd08393SJason King val |= cval;
352*1cd08393SJason King ndigits++;
353*1cd08393SJason King }
354*1cd08393SJason King
355*1cd08393SJason King if (!rust_append_utf8_c(st, val))
356*1cd08393SJason King return (B_FALSE);
357*1cd08393SJason King
358*1cd08393SJason King sv_consume_n(svp, ndigits + 3);
359*1cd08393SJason King return (B_TRUE);
360*1cd08393SJason King }
361*1cd08393SJason King
362*1cd08393SJason King static boolean_t
rustleg_add_sep(rust_state_t * st)363*1cd08393SJason King rustleg_add_sep(rust_state_t *st)
364*1cd08393SJason King {
365*1cd08393SJason King if (HAS_ERROR(st))
366*1cd08393SJason King return (B_FALSE);
367*1cd08393SJason King
368*1cd08393SJason King return (rust_append(st, "::"));
369*1cd08393SJason King }
370*1cd08393SJason King
371*1cd08393SJason King static boolean_t
rustleg_valid_sym(const strview_t * sv)372*1cd08393SJason King rustleg_valid_sym(const strview_t *sv)
373*1cd08393SJason King {
374*1cd08393SJason King size_t i;
375*1cd08393SJason King
376*1cd08393SJason King for (i = 0; i < sv->sv_rem; i++) {
377*1cd08393SJason King char c = sv->sv_first[i];
378*1cd08393SJason King
379*1cd08393SJason King if ((c & 0x80) == 0)
380*1cd08393SJason King continue;
381*1cd08393SJason King DEMDEBUG("%s: ERROR found 8-bit character '%c' in '%.*s' "
382*1cd08393SJason King "at index %zu", __func__, c, SV_PRINT(sv), i);
383*1cd08393SJason King return (B_FALSE);
384*1cd08393SJason King }
385*1cd08393SJason King return (B_TRUE);
386*1cd08393SJason King }
387