1*1cd08393SJason King /*
2*1cd08393SJason King * This file and its contents are supplied under the terms of the
3*1cd08393SJason King * Common Development and Distribution License ("CDDL"), version 1.0.
4*1cd08393SJason King * You may only use this file in accordance with the terms of version
5*1cd08393SJason King * 1.0 of the CDDL.
6*1cd08393SJason King *
7*1cd08393SJason King * A full copy of the text of the CDDL should have accompanied this
8*1cd08393SJason King * source. A copy of the CDDL is also available via the Internet at
9*1cd08393SJason King * http://www.illumos.org/license/CDDL.
10*1cd08393SJason King */
11*1cd08393SJason King
12*1cd08393SJason King /*
13*1cd08393SJason King * Copyright 2019 Joyent, Inc.
14*1cd08393SJason King * Copyright 2021 Jason King
15*1cd08393SJason King */
16*1cd08393SJason King
17*1cd08393SJason King #include <inttypes.h>
18*1cd08393SJason King #include <libcustr.h>
19*1cd08393SJason King #include <limits.h>
20*1cd08393SJason King #include <string.h>
21*1cd08393SJason King #include <sys/byteorder.h>
22*1cd08393SJason King #include "rust.h"
23*1cd08393SJason King #include "strview.h"
24*1cd08393SJason King
25*1cd08393SJason King /*
26*1cd08393SJason King * The rust v0 encoding (rust RFC 2603) uses a slightly modified
27*1cd08393SJason King * version of punycode to encode characters that are not ASCII.
28*1cd08393SJason King * The big difference is that '_' is used to separate the ASCII codepoints
29*1cd08393SJason King * from the non-ASCII code points instead of '-'.
30*1cd08393SJason King *
31*1cd08393SJason King * The decoding is taken almost directly from (IETF) RFC 3492
32*1cd08393SJason King */
33*1cd08393SJason King
34*1cd08393SJason King #define BASE 36
35*1cd08393SJason King #define TMIN 1
36*1cd08393SJason King #define TMAX 26
37*1cd08393SJason King #define SKEW 38
38*1cd08393SJason King #define DAMP 700
39*1cd08393SJason King #define INITIAL_BIAS 72
40*1cd08393SJason King #define INITIAL_N 0x80
41*1cd08393SJason King #define DELIMITER '_'
42*1cd08393SJason King
43*1cd08393SJason King static inline uint32_t char_val(char);
44*1cd08393SJason King
45*1cd08393SJason King static size_t
rustv0_puny_adapt(size_t delta,size_t npoints,boolean_t first)46*1cd08393SJason King rustv0_puny_adapt(size_t delta, size_t npoints, boolean_t first)
47*1cd08393SJason King {
48*1cd08393SJason King size_t k = 0;
49*1cd08393SJason King
50*1cd08393SJason King delta = first ? delta / DAMP : delta / 2;
51*1cd08393SJason King delta += delta / npoints;
52*1cd08393SJason King while (delta > ((BASE - TMIN) * TMAX) / 2) {
53*1cd08393SJason King delta /= (BASE - TMIN);
54*1cd08393SJason King k += BASE;
55*1cd08393SJason King }
56*1cd08393SJason King
57*1cd08393SJason King return (k + (((BASE - TMIN + 1) * delta) / (delta + SKEW)));
58*1cd08393SJason King }
59*1cd08393SJason King
60*1cd08393SJason King boolean_t
rustv0_puny_decode(rust_state_t * restrict st,strview_t * restrict src,boolean_t repl_underscore)61*1cd08393SJason King rustv0_puny_decode(rust_state_t *restrict st, strview_t *restrict src,
62*1cd08393SJason King boolean_t repl_underscore)
63*1cd08393SJason King {
64*1cd08393SJason King uint32_t *buf;
65*1cd08393SJason King size_t bufalloc; /* in units of uint32_t */
66*1cd08393SJason King size_t buflen;
67*1cd08393SJason King size_t nbasic;
68*1cd08393SJason King size_t i, old_i, k, w;
69*1cd08393SJason King size_t n = INITIAL_N;
70*1cd08393SJason King size_t bias = INITIAL_BIAS;
71*1cd08393SJason King size_t delim_idx = 0;
72*1cd08393SJason King boolean_t ret = B_FALSE;
73*1cd08393SJason King char c;
74*1cd08393SJason King
75*1cd08393SJason King DEMDEBUG("%s: str='%.*s'", __func__, SV_PRINT(src));
76*1cd08393SJason King
77*1cd08393SJason King /*
78*1cd08393SJason King * The decoded string should never contain more codepoints than
79*1cd08393SJason King * the original string, so creating a temporary buffer large
80*1cd08393SJason King * enought to hold sv_remaining(src) uint32_t's should be
81*1cd08393SJason King * large enough.
82*1cd08393SJason King *
83*1cd08393SJason King * This also serves as a size check -- xcalloc will fail if the
84*1cd08393SJason King * resulting size of the buf (sizeof (uint32_t) * bufalloc) >=
85*1cd08393SJason King * SIZE_MAX. If xcalloc succeeds, we therefore know that that
86*1cd08393SJason King * buflen cannot overflow.
87*1cd08393SJason King */
88*1cd08393SJason King buflen = 0;
89*1cd08393SJason King bufalloc = sv_remaining(src) + 1;
90*1cd08393SJason King buf = xcalloc(st->rs_ops, bufalloc, sizeof (uint32_t));
91*1cd08393SJason King if (buf == NULL) {
92*1cd08393SJason King SET_ERROR(st);
93*1cd08393SJason King return (B_FALSE);
94*1cd08393SJason King }
95*1cd08393SJason King
96*1cd08393SJason King /*
97*1cd08393SJason King * Find the position of the last delimiter (if any).
98*1cd08393SJason King * IETF RFC 3492 3.1 states that the delimiter is present if and only
99*1cd08393SJason King * if there are a non-zero number of basic (ASCII) code points. Since
100*1cd08393SJason King * the delimiter itself is a basic code point, the last one present
101*1cd08393SJason King * in the original string is the actual delimiter between the basic
102*1cd08393SJason King * and non-basic code points. Earlier occurences of the delimiter
103*1cd08393SJason King * are treated as normal basic code points. For plain punycode, an
104*1cd08393SJason King * all ASCII string encoded with punycode would terminate with a
105*1cd08393SJason King * final delimiter, and a name with all non-basic code points would
106*1cd08393SJason King * not have a delimiter at all. With the rust v0 encoding, punycode
107*1cd08393SJason King * encoded identifiers have a 'u' prefix prior to the identifier
108*1cd08393SJason King * length (['u'] <decimal-number> <bytes>), so we should never
109*1cd08393SJason King * encounter an all ASCII name that's encoded with punycode (we error
110*1cd08393SJason King * on this). For an all non-basic codepoint identifier, no delimiter
111*1cd08393SJason King * will be present, and we treat that the same as the delimiter being
112*1cd08393SJason King * in the first position of the string, and consume it (if present)
113*1cd08393SJason King * when we transition from copying the basic code points (which there
114*1cd08393SJason King * will be none in this situation) to non-basic code points.
115*1cd08393SJason King */
116*1cd08393SJason King for (i = 0; i < src->sv_rem; i++) {
117*1cd08393SJason King if (src->sv_first[i] == DELIMITER) {
118*1cd08393SJason King delim_idx = i;
119*1cd08393SJason King }
120*1cd08393SJason King }
121*1cd08393SJason King VERIFY3U(delim_idx, <, bufalloc);
122*1cd08393SJason King
123*1cd08393SJason King if (delim_idx + 1 == sv_remaining(src)) {
124*1cd08393SJason King DEMDEBUG("%s: encountered an all-ASCII name encoded with "
125*1cd08393SJason King "punycode", __func__);
126*1cd08393SJason King goto done;
127*1cd08393SJason King }
128*1cd08393SJason King
129*1cd08393SJason King /* Copy all the basic characters up to the delimiter into buf */
130*1cd08393SJason King for (nbasic = 0; nbasic < delim_idx; nbasic++) {
131*1cd08393SJason King c = sv_consume_c(src);
132*1cd08393SJason King
133*1cd08393SJason King /* The rust prefix check should guarantee this */
134*1cd08393SJason King VERIFY3U(c, <, 0x80);
135*1cd08393SJason King
136*1cd08393SJason King /*
137*1cd08393SJason King * Normal rust identifiers do not contain '-' in them.
138*1cd08393SJason King * However ABI identifiers could contain a dash. Those
139*1cd08393SJason King * are translated to _, and we need to replace accordingly
140*1cd08393SJason King * when asked.
141*1cd08393SJason King */
142*1cd08393SJason King if (repl_underscore && c == '_')
143*1cd08393SJason King c = '-';
144*1cd08393SJason King
145*1cd08393SJason King buf[nbasic] = c;
146*1cd08393SJason King buflen++;
147*1cd08393SJason King }
148*1cd08393SJason King DEMDEBUG("%s: %" PRIu32 " ASCII codepoints copied", __func__, nbasic);
149*1cd08393SJason King
150*1cd08393SJason King /*
151*1cd08393SJason King * Consume delimiter between basic and non-basic code points if present.
152*1cd08393SJason King * See above for explanation why it may not be present.
153*1cd08393SJason King */
154*1cd08393SJason King (void) sv_consume_if_c(src, DELIMITER);
155*1cd08393SJason King
156*1cd08393SJason King DEMDEBUG("%s: non-ASCII codepoints to decode: %.*s", __func__,
157*1cd08393SJason King SV_PRINT(src));
158*1cd08393SJason King
159*1cd08393SJason King for (i = 0; sv_remaining(src) > 0; i++) {
160*1cd08393SJason King VERIFY3U(i, <=, buflen);
161*1cd08393SJason King
162*1cd08393SJason King /*
163*1cd08393SJason King * Guarantee we have enough space to insert another codepoint.
164*1cd08393SJason King * Our buffer sizing above should prevent this from ever
165*1cd08393SJason King * tripping, but check this out of paranoia.
166*1cd08393SJason King */
167*1cd08393SJason King VERIFY3U(buflen, <, bufalloc - 1);
168*1cd08393SJason King
169*1cd08393SJason King /* decode the next codepoint */
170*1cd08393SJason King for (old_i = i, k = BASE, w = 1; ; k += BASE) {
171*1cd08393SJason King size_t t;
172*1cd08393SJason King uint32_t digit;
173*1cd08393SJason King
174*1cd08393SJason King if (sv_remaining(src) == 0)
175*1cd08393SJason King goto done;
176*1cd08393SJason King
177*1cd08393SJason King digit = char_val(sv_consume_c(src));
178*1cd08393SJason King if (digit >= BASE)
179*1cd08393SJason King goto done;
180*1cd08393SJason King
181*1cd08393SJason King i = i + digit * w;
182*1cd08393SJason King
183*1cd08393SJason King if (k <= bias)
184*1cd08393SJason King t = TMIN;
185*1cd08393SJason King else if (k >= bias + TMAX)
186*1cd08393SJason King t = TMAX;
187*1cd08393SJason King else
188*1cd08393SJason King t = k - bias;
189*1cd08393SJason King
190*1cd08393SJason King if (digit < t)
191*1cd08393SJason King break;
192*1cd08393SJason King
193*1cd08393SJason King w = w * (BASE - t);
194*1cd08393SJason King }
195*1cd08393SJason King buflen++;
196*1cd08393SJason King
197*1cd08393SJason King bias = rustv0_puny_adapt(i - old_i, buflen,
198*1cd08393SJason King (old_i == 0) ? B_TRUE : B_FALSE);
199*1cd08393SJason King n = n + i / buflen;
200*1cd08393SJason King i = i % buflen;
201*1cd08393SJason King
202*1cd08393SJason King DEMDEBUG("%s: insert \\u%04" PRIx32 " at index %zu (len = %zu)",
203*1cd08393SJason King __func__, n, i, buflen);
204*1cd08393SJason King
205*1cd08393SJason King /*
206*1cd08393SJason King * At the start of this while loop, we guaranteed
207*1cd08393SJason King * buflen < bufalloc - 1. Therefore we know there is room
208*1cd08393SJason King * to move over the contents of buf at i to make room
209*1cd08393SJason King * for the codepoint. We also just guaranteed that i
210*1cd08393SJason King * is in the range [0, buflen), so this should always be
211*1cd08393SJason King * safe.
212*1cd08393SJason King */
213*1cd08393SJason King (void) memmove(buf + i + 1, buf + i,
214*1cd08393SJason King (buflen - i) * sizeof (uint32_t));
215*1cd08393SJason King
216*1cd08393SJason King #if _LP64
217*1cd08393SJason King /*
218*1cd08393SJason King * This is always false for ILP32 and smatch will also complain,
219*1cd08393SJason King * so we just omit it for ILP32.
220*1cd08393SJason King */
221*1cd08393SJason King if (n > UINT32_MAX) {
222*1cd08393SJason King DEMDEBUG("%s: ERROR: utf8 value is out of range",
223*1cd08393SJason King __func__);
224*1cd08393SJason King goto done;
225*1cd08393SJason King }
226*1cd08393SJason King #endif
227*1cd08393SJason King
228*1cd08393SJason King buf[i] = (uint32_t)n;
229*1cd08393SJason King }
230*1cd08393SJason King
231*1cd08393SJason King DEMDEBUG("%s: inserted %zu non-basic code points", __func__,
232*1cd08393SJason King buflen - nbasic);
233*1cd08393SJason King
234*1cd08393SJason King for (i = 0; i < buflen; i++) {
235*1cd08393SJason King if (!rust_append_utf8_c(st, buf[i]))
236*1cd08393SJason King goto done;
237*1cd08393SJason King }
238*1cd08393SJason King ret = B_TRUE;
239*1cd08393SJason King
240*1cd08393SJason King done:
241*1cd08393SJason King xfree(st->rs_ops, buf, bufalloc * sizeof (uint32_t));
242*1cd08393SJason King return (ret);
243*1cd08393SJason King }
244*1cd08393SJason King
245*1cd08393SJason King /*
246*1cd08393SJason King * Convert [0-9][a-z] to a value [0..35]. Rust's punycode encoding always
247*1cd08393SJason King * uses lowercase, so we treat uppercase (and any other characters) as
248*1cd08393SJason King * invalid, and return BASE (36) to indicate a bad value.
249*1cd08393SJason King */
250*1cd08393SJason King static inline uint32_t
char_val(char c)251*1cd08393SJason King char_val(char c)
252*1cd08393SJason King {
253*1cd08393SJason King uint32_t v = c;
254*1cd08393SJason King
255*1cd08393SJason King if (ISLOWER(c)) {
256*1cd08393SJason King return (c - 'a');
257*1cd08393SJason King } else if (ISDIGIT(c)) {
258*1cd08393SJason King return (c - '0' + 26);
259*1cd08393SJason King } else {
260*1cd08393SJason King DEMDEBUG("%s: ERROR: invalid character 0x%02x encountered",
261*1cd08393SJason King __func__, v);
262*1cd08393SJason King return (BASE);
263*1cd08393SJason King }
264*1cd08393SJason King }
265