1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2019 Joyent, Inc. All rights reserved.
23 */
24
25 /*
26 * Don't Panic! If you find the blocks of assembly that follow confusing and
27 * you're questioning why they exist, please go read section 8 of the umem.c big
28 * theory statement. Next familiarize yourself with the malloc and free
29 * implementations in libumem's malloc.c.
30 *
31 * What follows is the amd64 implementation of the thread caching automatic
32 * assembly generation. The amd64 calling conventions are documented in the
33 * 64-bit System V ABI. For our purposes what matters is that our first argument
34 * will come in rdi. Our functions have to preserve rbp, rbx, and r12->r15. We
35 * are free to do whatever we want with rax, rcx, rdx, rsi, rdi, and r8->r11.
36 *
37 * For both our implementation of malloc and free we only use the registers we
38 * don't have to preserve.
39 *
40 * Malloc register usage:
41 * o. rdi: Original size to malloc. This never changes and is preserved.
 * o. rsi: Adjusted malloc size for the hidden malloc_data_t(s).
43 * o. rcx: Pointer to the tmem_t in the ulwp_t.
44 * o. rdx: Pointer to the tmem_t array of roots
45 * o. r8: Size of the cache
46 * o. r9: Scratch register
47 *
48 * Free register usage:
49 * o. rdi: Original buffer to free. This never changes and is preserved.
50 * o. rax: The actual buffer, adjusted for the hidden malloc_data_t(s).
51 * o. rcx: Pointer to the tmem_t in the ulwp_t.
52 * o. rdx: Pointer to the tmem_t array of roots
53 * o. r8: Size of the cache
54 * o. r9: Scratch register
55 *
56 * Once we determine what cache we are using, we increment %rdx to the
57 * appropriate offset and set %r8 with the size of the cache. This means that
58 * when we break out to the normal buffer allocation point %rdx contains the
59 * head of the linked list and %r8 is the amount that we have to adjust the
60 * thread's cached amount by.
61 *
 * Each block of assembly has pseudocode that describes its purpose.
63 */
64
65 /*
66 * umem_base must be first.
67 */
68 #include "umem_base.h"
69
70 #include <inttypes.h>
71 #include <strings.h>
72 #include <umem_impl.h>
73 #include <atomic.h>
74 #include <sys/mman.h>
75 #include <errno.h>
76
77
78 #include <stdio.h>
79
/* Advertise that this platform can generate ptcumem malloc/free. */
const int umem_genasm_supported = 1;
/*
 * Addresses and reserved byte counts of the trampoline stubs (declared in
 * malloc.c) that we overwrite with generated code, plus the addresses of
 * the original malloc/free implementations we tail-jump to on the slow
 * path.
 */
static uintptr_t umem_genasm_mptr = (uintptr_t)&_malloc;
static size_t umem_genasm_msize = 576;	/* bytes reserved for ptcmalloc */
static uintptr_t umem_genasm_fptr = (uintptr_t)&_free;
static size_t umem_genasm_fsize = 576;	/* bytes reserved for ptcfree */
static uintptr_t umem_genasm_omptr = (uintptr_t)umem_malloc;
static uintptr_t umem_genasm_ofptr = (uintptr_t)umem_malloc_free;

/* Maximum caches addressable with a 32-bit root-array byte offset. */
#define	UMEM_GENASM_MAX64	(UINT32_MAX / sizeof (uintptr_t))
/* rel32 displacement from a 4-byte operand at src to dest. */
#define	PTC_JMPADDR(dest, src)	(dest - (src + 4))
#define	PTC_ROOT_SIZE	sizeof (uintptr_t)
/* 5-byte nop (0f 1f 44 00 00), stored little-endian in a uint64_t. */
#define	MULTINOP	0x0000441f0f
92
93 /*
94 * void *ptcmalloc(size_t orig_size);
95 *
96 * size_t size = orig_size + 8;
97 * if (size > UMEM_SECOND_ALIGN)
98 * size += 8;
99 *
100 * if (size < orig_size)
101 * goto tomalloc; ! This is overflow
102 *
103 * if (size > cache_max)
104 * goto tomalloc
105 *
106 * tmem_t *t = (uintptr_t)curthread() + umem_thr_offset;
107 * void **roots = t->tm_roots;
108 */
/*
 * Byte offsets within malinit[] of the 32-bit fields patched at
 * generation time.
 */
#define	PTC_MALINIT_JOUT	0x13	/* rel32: size overflowed, bail out */
#define	PTC_MALINIT_MCS	0x1a	/* imm32: largest cache size */
#define	PTC_MALINIT_JOV	0x20	/* rel32: size > cache max, bail out */
#define	PTC_MALINIT_SOFF	0x30	/* imm32: tmem offset from curthread */
static const uint8_t malinit[] = {
	/* Add room for one tag; two if the buffer needs the second align. */
	0x48, 0x8d, 0x77, 0x08,		/* leaq 0x8(%rdi),%rsi */
	0x48, 0x83, 0xfe, 0x10,		/* cmpq $0x10, %rsi */
	0x76, 0x04,			/* jbe +0x4 */
	0x48, 0x8d, 0x77, 0x10,		/* leaq 0x10(%rdi),%rsi */
	0x48, 0x39, 0xfe,		/* cmpq %rdi,%rsi */
	0x0f, 0x82, 0x00, 0x00, 0x00, 0x00, /* jb +errout (overflow) */
	0x48, 0x81, 0xfe,
	0x00, 0x00, 0x00, 0x00,		/* cmpq sizeof ($CACHE), %rsi */
	0x0f, 0x87, 0x00, 0x00, 0x00, 0x00, /* ja +errout */
	/* Locate this thread's tmem_t and its array of roots. */
	0x64, 0x48, 0x8b, 0x0c, 0x25,
	0x00, 0x00, 0x00, 0x00,		/* movq %fs:0x0,%rcx */
	0x48, 0x81, 0xc1,
	0x00, 0x00, 0x00, 0x00,		/* addq $SOFF, %rcx */
	0x48, 0x8d, 0x51, 0x08,		/* leaq 0x8(%rcx),%rdx */
};
129
130 /*
131 * void ptcfree(void *buf);
132 *
133 * if (buf == NULL)
134 * return;
135 *
136 * malloc_data_t *tag = buf;
137 * tag--;
138 * int size = tag->malloc_size;
139 * int tagval = UMEM_MALLOC_DECODE(tag->malloc_tag, size);
140 * if (tagval == MALLOC_SECOND_MAGIC) {
141 * tag--;
142 * } else if (tagval != MALLOC_MAGIC) {
143 * goto tofree;
144 * }
145 *
146 * if (size > cache_max)
147 * goto tofree;
148 *
149 * tmem_t *t = (uintptr_t)curthread() + umem_thr_offset;
150 * void **roots = t->tm_roots;
151 */
/*
 * Byte offsets within freeinit[] of the 32-bit fields patched at
 * generation time.
 */
#define	PTC_FRINI_JDONE	0x05	/* rel32: NULL buf, jump straight to ret */
#define	PTC_FRINI_JFREE	0x25	/* rel32: unrecognized tag, to real free */
#define	PTC_FRINI_MCS	0x30	/* imm32: largest cache size */
#define	PTC_FRINI_JOV	0x36	/* rel32: too big for any cache */
#define	PTC_FRINI_SOFF	0x46	/* imm32: tmem offset from curthread */
static const uint8_t freeinit[] = {
	0x48, 0x85, 0xff,		/* testq %rdi,%rdi */
	0x0f, 0x84, 0x00, 0x00, 0x00, 0x00, /* je $JDONE (done) */
	/* Load the size and tag words and decode the tag value. */
	0x8b, 0x77, 0xf8,		/* movl -0x8(%rdi),%esi */
	0x8b, 0x47, 0xfc,		/* movl -0x4(%rdi),%eax */
	0x01, 0xf0,			/* addl %esi,%eax */
	0x3d, 0x00, 0x70, 0xba, 0x16,	/* cmpl $MALLOC_2_MAGIC, %eax */
	0x75, 0x06,			/* jne +0x6 (checkover) */
	0x48, 0x8d, 0x47, 0xf0,		/* leaq -0x10(%rdi),%rax */
	0xeb, 0x0f,			/* jmp +0xf (freebuf) */
	/* checkover: */
	0x3d, 0x00, 0xc0, 0x10, 0x3a,	/* cmpl $MALLOC_MAGIC, %eax */
	0x0f, 0x85, 0x00, 0x00, 0x00, 0x00, /* jne +JFREE (goto tofree) */
	0x48, 0x8d, 0x47, 0xf8,		/* leaq -0x8(%rdi),%rax */
	0x48, 0x81, 0xfe,
	0x00, 0x00, 0x00, 0x00,		/* cmpq sizeof ($CACHE), %rsi */
	0x0f, 0x87, 0x00, 0x00, 0x00, 0x00, /* ja +JOV (goto tofree) */
	/* Locate this thread's tmem_t and its array of roots. */
	0x64, 0x48, 0x8b, 0x0c, 0x25,
	0x00, 0x00, 0x00, 0x00,		/* movq %fs:0x0,%rcx */
	0x48, 0x81, 0xc1,
	0x00, 0x00, 0x00, 0x00,		/* addq $SOFF, %rcx */
	0x48, 0x8d, 0x51, 0x08,		/* leaq 0x8(%rcx),%rdx */
};
179
180 /*
181 * if (size <= $CACHE_SIZE) {
182 * csize = $CACHE_SIZE;
183 * } else ... ! goto next cache
184 */
/* Patched fields: both cache-size immediates and the rel32 to allocbuf. */
#define	PTC_INICACHE_CMP	0x03	/* imm32: this cache's size */
#define	PTC_INICACHE_SIZE	0x0c	/* imm32: csize loaded into %r8 */
#define	PTC_INICACHE_JMP	0x11	/* rel32: jump to allocbuf */
static const uint8_t inicache[] = {
	0x48, 0x81, 0xfe,
	0x00, 0x00, 0x00, 0x00,		/* cmpq sizeof ($CACHE), %rsi */
	0x77, 0x0c,			/* ja +0xc (next cache) */
	0x49, 0xc7, 0xc0,
	0x00, 0x00, 0x00, 0x00,		/* movq sizeof ($CACHE), %r8 */
	0xe9, 0x00, 0x00, 0x00, 0x00,	/* jmp $JMP (allocbuf) */
};
196
197 /*
198 * if (size <= $CACHE_SIZE) {
199 * csize = $CACHE_SIZE;
200 * roots += $CACHE_NUM;
201 * } else ... ! goto next cache
202 */
/*
 * Patched fields: the two cache-size immediates, the root-array byte
 * offset, and the rel32 to allocbuf.
 */
#define	PTC_GENCACHE_CMP	0x03	/* imm32: this cache's size */
#define	PTC_GENCACHE_SIZE	0x0c	/* imm32: csize loaded into %r8 */
#define	PTC_GENCACHE_NUM	0x13	/* imm32: byte offset of this root */
#define	PTC_GENCACHE_JMP	0x18	/* rel32: jump to allocbuf */
static const uint8_t gencache[] = {
	0x48, 0x81, 0xfe,
	0x00, 0x00, 0x00, 0x00,		/* cmpq sizeof ($CACHE), %rsi */
	0x77, 0x14,			/* ja +0x14 (next cache) */
	0x49, 0xc7, 0xc0,
	0x00, 0x00, 0x00, 0x00,		/* movq sizeof ($CACHE), %r8 */
	0x48, 0x81, 0xc2,
	0x00, 0x00, 0x00, 0x00,		/* addq $8*ii, %rdx */
	0xe9, 0x00, 0x00, 0x00, 0x00	/* jmp +$JMP (allocbuf) */
};
217
218 /*
219 * else if (size <= $CACHE_SIZE) {
220 * csize = $CACHE_SIZE;
221 * roots += $CACHE_NUM;
222 * } else {
223 * goto tofunc; ! goto tomalloc if ptcmalloc.
224 * } ! goto tofree if ptcfree.
225 */
/*
 * The last cache check falls through to the allocation code on success, so
 * only the failure branch is patched (an 8-bit displacement this time).
 */
#define	PTC_FINCACHE_CMP	0x03	/* imm32: this cache's size */
#define	PTC_FINCACHE_JMP	0x08	/* rel8: jump to the error label */
#define	PTC_FINCACHE_SIZE	0x0c	/* imm32: csize loaded into %r8 */
#define	PTC_FINCACHE_NUM	0x13	/* imm32: byte offset of this root */
static const uint8_t fincache[] = {
	0x48, 0x81, 0xfe,
	0x00, 0x00, 0x00, 0x00,		/* cmpq sizeof ($CACHE), %rsi */
	0x77, 0x00,			/* ja +JMP (to real malloc/free) */
	0x49, 0xc7, 0xc0,
	0x00, 0x00, 0x00, 0x00,		/* movq sizeof ($CACHE), %r8 */
	0x48, 0x81, 0xc2,
	0x00, 0x00, 0x00, 0x00,		/* addq $8*ii, %rdx */
};
240
241 /*
242 * if (*root == NULL)
243 * goto tomalloc;
244 *
245 * malloc_data_t *ret = *root;
246 * *root = *(void **)ret;
247 * t->tm_size += csize;
248 * ret->malloc_size = size;
249 *
250 * if (size > UMEM_SECOND_ALIGN) {
251 * ret->malloc_data = UMEM_MALLOC_ENCODE(MALLOC_SECOND_MAGIC, size);
252 * ret += 2;
253 * } else {
 * ret->malloc_data = UMEM_MALLOC_ENCODE(MALLOC_MAGIC, size);
255 * ret += 1;
256 * }
257 *
258 * return ((void *)ret);
259 * tomalloc:
260 * return (malloc(orig_size));
261 */
/* Byte offsets of the labels and of the patched slow-path jump. */
#define	PTC_MALFINI_ALLABEL	0x00	/* allocbuf: take from the cache */
#define	PTC_MALFINI_JMLABEL	0x40	/* errout: tail-jump to real malloc */
#define	PTC_MALFINI_JMADDR	0x41	/* rel32 operand of that jump */
static const uint8_t malfini[] = {
	/* allocbuf: unlink the head buffer from the selected root. */
	0x48, 0x8b, 0x02,		/* movq (%rdx),%rax */
	0x48, 0x85, 0xc0,		/* testq %rax,%rax */
	0x74, 0x38,			/* je +0x38 (errout) */
	0x4c, 0x8b, 0x08,		/* movq (%rax),%r9 */
	0x4c, 0x89, 0x0a,		/* movq %r9,(%rdx) */
	/* The buffer leaves the thread cache: tm_size -= csize. */
	0x4c, 0x29, 0x01,		/* subq %r8,(%rcx) */
	0x48, 0x83, 0xfe, 0x10,		/* cmpq $0x10,%rsi */
	0x76, 0x15,			/* jbe +0x15 */
	/* Two tags: write size and encoded tag, return buf + 0x10. */
	0x41, 0xb9, 0x00, 0x70, 0xba, 0x16, /* movl $MALLOC_2_MAGIC, %r9d */
	0x89, 0x70, 0x08,		/* movl %esi,0x8(%rax) */
	0x41, 0x29, 0xf1,		/* subl %esi, %r9d */
	0x44, 0x89, 0x48, 0x0c,		/* movl %r9d, 0xc(%rax) */
	0x48, 0x83, 0xc0, 0x10,		/* addq $0x10, %rax */
	0xc3,				/* ret */
	/* One tag: write size and encoded tag, return buf + 0x8. */
	0x41, 0xb9, 0x00, 0xc0, 0x10, 0x3a, /* movl $MALLOC_MAGIC, %r9d */
	0x89, 0x30,			/* movl %esi,(%rax) */
	0x41, 0x29, 0xf1,		/* subl %esi,%r9d */
	0x44, 0x89, 0x48, 0x04,		/* movl %r9d,0x4(%rax) */
	0x48, 0x83, 0xc0, 0x08,		/* addq $0x8,%rax */
	0xc3,				/* ret */
	/* errout: */
	0xe9, 0x00, 0x00, 0x00, 0x00	/* jmp $MALLOC */
};
288
289 /*
290 * if (t->tm_size + csize > umem_ptc_size)
291 * goto tofree;
292 *
293 * t->tm_size += csize
294 * *(void **)tag = *root;
295 * *root = tag;
296 * return;
297 * tofree:
298 * free(buf);
299 * return;
300 */
/* Byte offsets of the labels, the patched limit, and the slow-path jump. */
#define	PTC_FRFINI_RBUFLABEL	0x00	/* freebuf: return buf to cache */
#define	PTC_FRFINI_CACHEMAX	0x09	/* imm32: umem_ptc_size limit */
#define	PTC_FRFINI_DONELABEL	0x1b	/* the ret for the cached path */
#define	PTC_FRFINI_JFLABEL	0x1c	/* tofree: tail-jump to real free */
#define	PTC_FRFINI_JFADDR	0x1d	/* rel32 operand of that jump */
static const uint8_t freefini[] = {
	/* freebuf: would caching this buffer exceed the per-thread max? */
	0x4c, 0x8b, 0x09,		/* movq (%rcx),%r9 */
	0x4d, 0x01, 0xc1,		/* addq %r8, %r9 */
	0x49, 0x81, 0xf9,
	0x00, 0x00, 0x00, 0x00,		/* cmpq $THR_CACHE_MAX, %r9 */
	0x77, 0x0d,			/* ja +0xd (tofree) */
	/* Push the buffer onto the root list and grow tm_size. */
	0x4c, 0x01, 0x01,		/* addq %r8,(%rcx) */
	0x4c, 0x8b, 0x0a,		/* movq (%rdx),%r9 */
	0x4c, 0x89, 0x08,		/* movq %r9,(%rax) */
	0x48, 0x89, 0x02,		/* movq %rax,(%rdx) */
	0xc3,				/* ret */
	/* tofree: */
	0xe9, 0x00, 0x00, 0x00, 0x00	/* jmp free */
};
319
320 /*
321 * Construct the initial part of malloc. off contains the offset from curthread
322 * to the root of the tmem structure. ep is the address of the label to error
323 * and jump to free. csize is the size of the largest umem_cache in ptcumem.
324 */
325 static int
genasm_malinit(uint8_t * bp,uint32_t off,uint32_t ep,uint32_t csize)326 genasm_malinit(uint8_t *bp, uint32_t off, uint32_t ep, uint32_t csize)
327 {
328 uint32_t addr;
329
330 bcopy(malinit, bp, sizeof (malinit));
331 addr = PTC_JMPADDR(ep, PTC_MALINIT_JOUT);
332 bcopy(&addr, bp + PTC_MALINIT_JOUT, sizeof (addr));
333 bcopy(&csize, bp + PTC_MALINIT_MCS, sizeof (csize));
334 addr = PTC_JMPADDR(ep, PTC_MALINIT_JOV);
335 bcopy(&addr, bp + PTC_MALINIT_JOV, sizeof (addr));
336 bcopy(&off, bp + PTC_MALINIT_SOFF, sizeof (off));
337
338 return (sizeof (malinit));
339 }
340
341 static int
genasm_frinit(uint8_t * bp,uint32_t off,uint32_t dp,uint32_t ep,uint32_t mcs)342 genasm_frinit(uint8_t *bp, uint32_t off, uint32_t dp, uint32_t ep, uint32_t mcs)
343 {
344 uint32_t addr;
345
346 bcopy(freeinit, bp, sizeof (freeinit));
347 addr = PTC_JMPADDR(dp, PTC_FRINI_JDONE);
348 bcopy(&addr, bp + PTC_FRINI_JDONE, sizeof (addr));
349 addr = PTC_JMPADDR(ep, PTC_FRINI_JFREE);
350 bcopy(&addr, bp + PTC_FRINI_JFREE, sizeof (addr));
351 bcopy(&mcs, bp + PTC_FRINI_MCS, sizeof (mcs));
352 addr = PTC_JMPADDR(ep, PTC_FRINI_JOV);
353 bcopy(&addr, bp + PTC_FRINI_JOV, sizeof (addr));
354 bcopy(&off, bp + PTC_FRINI_SOFF, sizeof (off));
355 return (sizeof (freeinit));
356 }
357
358
359 /*
360 * Create the initial cache entry of the specified size. The value of ap tells
361 * us what the address of the label to try and allocate a buffer. This value is
362 * an offset from the current base to that value.
363 */
364 static int
genasm_firstcache(uint8_t * bp,uint32_t csize,uint32_t ap)365 genasm_firstcache(uint8_t *bp, uint32_t csize, uint32_t ap)
366 {
367 uint32_t addr;
368
369 bcopy(inicache, bp, sizeof (inicache));
370 bcopy(&csize, bp + PTC_INICACHE_CMP, sizeof (csize));
371 bcopy(&csize, bp + PTC_INICACHE_SIZE, sizeof (csize));
372 addr = PTC_JMPADDR(ap, PTC_INICACHE_JMP);
373 ASSERT(addr != 0);
374 bcopy(&addr, bp + PTC_INICACHE_JMP, sizeof (addr));
375
376 return (sizeof (inicache));
377 }
378
379 static int
genasm_gencache(uint8_t * bp,int num,uint32_t csize,uint32_t ap)380 genasm_gencache(uint8_t *bp, int num, uint32_t csize, uint32_t ap)
381 {
382 uint32_t addr;
383 uint32_t coff;
384
385 ASSERT(UINT32_MAX / PTC_ROOT_SIZE > num);
386 ASSERT(num != 0);
387 bcopy(gencache, bp, sizeof (gencache));
388 bcopy(&csize, bp + PTC_GENCACHE_CMP, sizeof (csize));
389 bcopy(&csize, bp + PTC_GENCACHE_SIZE, sizeof (csize));
390 coff = num * PTC_ROOT_SIZE;
391 bcopy(&coff, bp + PTC_GENCACHE_NUM, sizeof (coff));
392 addr = PTC_JMPADDR(ap, PTC_GENCACHE_JMP);
393 bcopy(&addr, bp + PTC_GENCACHE_JMP, sizeof (addr));
394
395 return (sizeof (gencache));
396 }
397
398 static int
genasm_lastcache(uint8_t * bp,int num,uint32_t csize,uint32_t ep)399 genasm_lastcache(uint8_t *bp, int num, uint32_t csize, uint32_t ep)
400 {
401 uint8_t eap;
402 uint32_t coff;
403
404 ASSERT(ep <= 0xff && ep > 7);
405 ASSERT(UINT32_MAX / PTC_ROOT_SIZE > num);
406 bcopy(fincache, bp, sizeof (fincache));
407 bcopy(&csize, bp + PTC_FINCACHE_CMP, sizeof (csize));
408 bcopy(&csize, bp + PTC_FINCACHE_SIZE, sizeof (csize));
409 coff = num * PTC_ROOT_SIZE;
410 bcopy(&coff, bp + PTC_FINCACHE_NUM, sizeof (coff));
411 eap = ep - PTC_FINCACHE_JMP - 1;
412 bcopy(&eap, bp + PTC_FINCACHE_JMP, sizeof (eap));
413
414 return (sizeof (fincache));
415 }
416
417 static int
genasm_malfini(uint8_t * bp,uintptr_t mptr)418 genasm_malfini(uint8_t *bp, uintptr_t mptr)
419 {
420 uint32_t addr;
421
422 bcopy(malfini, bp, sizeof (malfini));
423 addr = PTC_JMPADDR(mptr, ((uintptr_t)bp + PTC_MALFINI_JMADDR));
424 bcopy(&addr, bp + PTC_MALFINI_JMADDR, sizeof (addr));
425
426 return (sizeof (malfini));
427 }
428
429 static int
genasm_frfini(uint8_t * bp,uint32_t maxthr,uintptr_t fptr)430 genasm_frfini(uint8_t *bp, uint32_t maxthr, uintptr_t fptr)
431 {
432 uint32_t addr;
433
434 bcopy(freefini, bp, sizeof (freefini));
435 bcopy(&maxthr, bp + PTC_FRFINI_CACHEMAX, sizeof (maxthr));
436 addr = PTC_JMPADDR(fptr, ((uintptr_t)bp + PTC_FRFINI_JFADDR));
437 bcopy(&addr, bp + PTC_FRFINI_JFADDR, sizeof (addr));
438
439 return (sizeof (freefini));
440 }
441
442 /*
443 * The malloc inline assembly is constructed as follows:
444 *
445 * o Malloc prologue assembly
446 * o Generic first-cache check
447 * o n Generic cache checks (where n = _tmem_get_entries() - 2)
448 * o Generic last-cache check
449 * o Malloc epilogue assembly
450 *
451 * Generally there are at least three caches. When there is only one cache we
452 * only use the generic last-cache. In the case where there are two caches, we
453 * just leave out the middle ones.
454 */
/*
 * Generate the ptcmalloc implementation into base (len bytes available):
 * the prologue, one size-check block per cache, and the epilogue that
 * either unlinks a cached buffer or tail-jumps to the real malloc.
 * Returns 0 on success, 1 if the reserved stub area is too small.
 */
static int
genasm_malloc(void *base, size_t len, int nents, int *umem_alloc_sizes)
{
	int ii, off;
	uint8_t *bp;
	size_t total;
	uint32_t allocoff, erroff;

	/* Total bytes of code we will emit for nents caches. */
	total = sizeof (malinit) + sizeof (malfini) + sizeof (fincache);

	if (nents >= 2)
		total += sizeof (inicache) + sizeof (gencache) * (nents - 2);

	if (total > len)
		return (1);

	/*
	 * Offsets of the epilogue's allocation and error labels, kept
	 * relative to the current write position: each time we emit a
	 * block we subtract its size so the next block's jumps resolve.
	 */
	erroff = total - sizeof (malfini) + PTC_MALFINI_JMLABEL;
	allocoff = total - sizeof (malfini) + PTC_MALFINI_ALLABEL;

	bp = base;

	/* The prologue is patched with the largest cache size. */
	off = genasm_malinit(bp, umem_tmem_off, erroff,
	    umem_alloc_sizes[nents-1]);
	bp += off;
	allocoff -= off;
	erroff -= off;

	if (nents > 1) {
		off = genasm_firstcache(bp, umem_alloc_sizes[0], allocoff);
		bp += off;
		allocoff -= off;
		erroff -= off;
	}

	for (ii = 1; ii < nents - 1; ii++) {
		off = genasm_gencache(bp, ii, umem_alloc_sizes[ii], allocoff);
		bp += off;
		allocoff -= off;
		erroff -= off;
	}

	/* The last check falls through to the epilogue on a hit. */
	bp += genasm_lastcache(bp, nents - 1, umem_alloc_sizes[nents - 1],
	    erroff);
	bp += genasm_malfini(bp, umem_genasm_omptr);
	ASSERT(((uintptr_t)bp - total) == (uintptr_t)base);

	return (0);
}
503
/*
 * Generate the ptcfree implementation into base (len bytes available),
 * mirroring genasm_malloc(): prologue, per-cache size checks, and the
 * epilogue that either returns the buffer to the thread cache or
 * tail-jumps to the real free.  Returns 0 on success, 1 if the reserved
 * stub area is too small.
 */
static int
genasm_free(void *base, size_t len, int nents, int *umem_alloc_sizes)
{
	uint8_t *bp;
	int ii, off;
	size_t total;
	uint32_t rbufoff, retoff, erroff;

	/* Assume that nents has already been audited for us */
	total = sizeof (freeinit) + sizeof (freefini) + sizeof (fincache);
	if (nents >= 2)
		total += sizeof (inicache) + sizeof (gencache) * (nents - 2);

	if (total > len)
		return (1);

	/*
	 * Offsets of the epilogue's error, return-buffer, and done labels,
	 * kept relative to the current write position as blocks are
	 * emitted (see genasm_malloc()).
	 */
	erroff = total - (sizeof (freefini) - PTC_FRFINI_JFLABEL);
	rbufoff = total - (sizeof (freefini) - PTC_FRFINI_RBUFLABEL);
	retoff = total - (sizeof (freefini) - PTC_FRFINI_DONELABEL);

	bp = base;

	/* The prologue is patched with the largest cache size. */
	off = genasm_frinit(bp, umem_tmem_off, retoff, erroff,
	    umem_alloc_sizes[nents - 1]);
	bp += off;
	erroff -= off;
	rbufoff -= off;

	if (nents > 1) {
		off = genasm_firstcache(bp, umem_alloc_sizes[0], rbufoff);
		bp += off;
		erroff -= off;
		rbufoff -= off;
	}

	for (ii = 1; ii < nents - 1; ii++) {
		off = genasm_gencache(bp, ii, umem_alloc_sizes[ii], rbufoff);
		bp += off;
		rbufoff -= off;
		erroff -= off;
	}

	/* The last check falls through to the epilogue on a hit. */
	bp += genasm_lastcache(bp, nents - 1, umem_alloc_sizes[nents - 1],
	    erroff);
	bp += genasm_frfini(bp, umem_ptc_size, umem_genasm_ofptr);
	ASSERT(((uintptr_t)bp - total) == (uintptr_t)base);

	return (0);
}
553
554 boolean_t
umem_genasm(int * cp,umem_cache_t ** caches,int nc)555 umem_genasm(int *cp, umem_cache_t **caches, int nc)
556 {
557 int nents, i;
558 uint8_t *mptr;
559 uint8_t *fptr;
560 uint64_t v, *vptr;
561 size_t mplen, fplen;
562 uintptr_t mpbase, fpbase;
563 boolean_t ret = B_FALSE;
564
565 mptr = (void *)((uintptr_t)umem_genasm_mptr + 5);
566 fptr = (void *)((uintptr_t)umem_genasm_fptr + 5);
567 if (umem_genasm_mptr == 0 || umem_genasm_msize == 0 ||
568 umem_genasm_fptr == 0 || umem_genasm_fsize == 0) {
569 return (B_FALSE);
570 }
571
572 mplen = P2ROUNDUP(umem_genasm_msize, pagesize);
573 mpbase = P2ALIGN((uintptr_t)umem_genasm_mptr, pagesize);
574 fplen = P2ROUNDUP(umem_genasm_fsize, pagesize);
575 fpbase = P2ALIGN((uintptr_t)umem_genasm_mptr, pagesize);
576
577 /*
578 * If the values straddle a page boundary, then we might need to
579 * actually remap two pages.
580 */
581 if (P2ALIGN(umem_genasm_msize + (uintptr_t)umem_genasm_mptr,
582 pagesize) != mpbase) {
583 mplen += pagesize;
584 }
585
586 if (P2ALIGN(umem_genasm_fsize + (uintptr_t)umem_genasm_fptr,
587 pagesize) != fpbase) {
588 fplen += pagesize;
589 }
590
591 if (mprotect((void *)mpbase, mplen, PROT_READ | PROT_WRITE |
592 PROT_EXEC) != 0) {
593 return (B_FALSE);
594 }
595
596 if (mprotect((void *)fpbase, fplen, PROT_READ | PROT_WRITE |
597 PROT_EXEC) != 0) {
598 if (mprotect((void *)mpbase, mplen, PROT_READ | PROT_EXEC) !=
599 0) {
600 umem_panic("genasm failed to restore memory "
601 "protection: %d", errno);
602 }
603 return (B_FALSE);
604 }
605
606 /*
607 * The total number of caches that we can service is the minimum of:
608 * o the amount supported by libc
609 * o the total number of umem caches
610 * o we use a single byte addl, so it's MAX_UINT32 / sizeof (uintptr_t)
611 * For 64-bit, this is MAX_UINT32 >> 3, a lot.
612 */
613 nents = _tmem_get_nentries();
614
615 if (UMEM_GENASM_MAX64 < nents)
616 nents = UMEM_GENASM_MAX64;
617
618 if (nc < nents)
619 nents = nc;
620
621 /*
622 * If the number of per-thread caches has been set to zero or the
623 * per-thread cache size has been set to zero, don't bother trying to
624 * write any assembly and just use the default malloc and free. When we
625 * return, indicate that there is no PTC support.
626 */
627 if (nents == 0 || umem_ptc_size == 0) {
628 goto out;
629 }
630
631 /* Take into account the jump */
632 if (genasm_malloc(mptr, umem_genasm_msize, nents, cp) != 0) {
633 goto out;
634 }
635
636 if (genasm_free(fptr, umem_genasm_fsize, nents, cp) != 0) {
637 goto out;
638 }
639
640 /* nop out the jump with a multibyte jump */
641 vptr = (void *)umem_genasm_mptr;
642 v = MULTINOP;
643 v |= *vptr & (0xffffffULL << 40);
644 (void) atomic_swap_64(vptr, v);
645 vptr = (void *)umem_genasm_fptr;
646 v = MULTINOP;
647 v |= *vptr & (0xffffffULL << 40);
648 (void) atomic_swap_64(vptr, v);
649
650 for (i = 0; i < nents; i++)
651 caches[i]->cache_flags |= UMF_PTC;
652
653 ret = B_TRUE;
654 out:
655 if (mprotect((void *)mpbase, mplen, PROT_READ | PROT_EXEC) != 0) {
656 umem_panic("genasm failed to restore memory protection: %d",
657 errno);
658 }
659
660 if (mprotect((void *)fpbase, fplen, PROT_READ | PROT_EXEC) != 0) {
661 umem_panic("genasm failed to restore memory protection: %d",
662 errno);
663 }
664
665 return (ret);
666 }
667