/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2019 Joyent, Inc. All rights reserved. */ /* * Don't Panic! If you find the blocks of assembly that follow confusing and * you're questioning why they exist, please go read section 8 of the umem.c big * theory statement. Next familiarize yourself with the malloc and free * implementations in libumem's malloc.c. * * What follows is the i386 implementation of the thread caching automatic * assembly generation. With i386 a function only has three registers it's * allowed to change without restoring them: eax, ecx, and edx. All others have * to be preserved. Since the set of registers we have available is so small, we * have to make use of esi, ebx, and edi and save their original values to the * stack. * * Malloc register usage: * o. esi: Size of the malloc (passed into us and modified) * o. edi: Size of the cache * o. eax: Buffer to return * o. ebx: Scratch space and temporary values * o. ecx: Pointer to the tmem_t in the ulwp_t. * o. edx: Pointer to the tmem_t array of roots * * Free register usage: * o. esi: Size of the malloc (passed into us and modified) * o. edi: Size of the cache * o. eax: Buffer to free * o. ebx: Scratch space and temporary values * o. 
ecx: Pointer to the tmem_t in the ulwp_t. * o. edx: Pointer to the tmem_t array of roots * * Once we determine what cache we are using, we increment %edx to the * appropriate offset and set %edi with the size of the cache. This means that * when we break out to the normal buffer allocation point %edx contains the * head of the linked list and %edi is the amount that we have to adjust the * total amount cached by the thread. * * Each block of assembly has psuedocode that describes its purpose. */ /* * umem_base must be first. */ #include "umem_base.h" #include #include #include #include #include #include const int umem_genasm_supported = 1; static uintptr_t umem_genasm_mptr = (uintptr_t)&_malloc; static size_t umem_genasm_msize = 512; static uintptr_t umem_genasm_fptr = (uintptr_t)&_free; static size_t umem_genasm_fsize = 512; static uintptr_t umem_genasm_omptr = (uintptr_t)umem_malloc; static uintptr_t umem_genasm_ofptr = (uintptr_t)umem_malloc_free; /* * The maximum number of caches we can support. We use a single byte addl so * this is 255 (UINT8_MAX) / sizeof (uintptr_t). In this case 63 */ #define UMEM_GENASM_MAX32 63 #define PTC_JMPADDR(dest, src) (dest - (src + 4)) #define PTC_ROOT_SIZE sizeof (uintptr_t) #define MULTINOP 0x0000441f0f /* * void *ptcmalloc(size_t orig_size); * * size_t size = orig_size + 8; * * if (size < orig_size) * goto tomalloc; ! 
This is overflow
 *
 * if (size > cache_size)
 *	goto tomalloc;
 *
 * tmem_t *t = (uintptr_t)curthread() + umem_thr_offset;
 * void **roots = t->tm_roots;
 */
/*
 * Byte offsets into malinit[] of the 32-bit fields that genasm_malinit()
 * patches: the overflow jump, the maximum cache size, the too-large jump and
 * the tmem offset.
 */
#define	PTC_MALINIT_JOUT	0x0e
#define	PTC_MALINIT_MCS		0x14
#define	PTC_MALINIT_JOV		0x1a
#define	PTC_MALINIT_SOFF	0x27
static const uint8_t malinit[] = {
	0x55,					/* pushl %ebp */
	0x89, 0xe5,				/* movl %esp, %ebp */
	0x57,					/* pushl %edi */
	0x56,					/* pushl %esi */
	0x53,					/* pushl %ebx */
	0x8b, 0x75, 0x08,			/* movl 0x8(%ebp), %esi */
	0x83, 0xc6, 0x08,			/* addl $0x8,%esi */
	0x0f, 0x82, 0x00, 0x00, 0x00, 0x00,	/* jc +$JMP (errout) */
	0x81, 0xfe, 0x00, 0x00, 0x00, 0x00,	/* cmpl $max cache size, %esi */
	0x0f, 0x87, 0x00, 0x00, 0x00, 0x00,	/* ja +$JMP (errout) */
	0x65, 0x8b, 0x0d, 0x00, 0x00, 0x00, 0x00, /* movl %gs:0x0,%ecx */
	0x81, 0xc1, 0x00, 0x00, 0x00, 0x00,	/* addl $OFF, %ecx */
	0x8d, 0x51, 0x04			/* leal 0x4(%ecx), %edx */
};

/*
 * void ptcfree(void *buf);
 *
 * if (buf == NULL)
 *	return;
 *
 * malloc_data_t *tag = buf;
 * tag--;
 * int size = tag->malloc_size;
 * int tagval = UMEM_MALLOC_DECODE(tag->malloc_tag, size);
 *
 * if (tagval != MALLOC_MAGIC)
 *	goto tofree;
 *
 * if (size > cache_max)
 *	goto tofree;
 *
 * tmem_t *t = (uintptr_t)curthread() + umem_thr_offset;
 * void **roots = t->tm_roots;
 */
/*
 * Byte offsets into freeinit[] of the 32-bit fields that genasm_frinit()
 * patches: the NULL-buffer jump, the bad-tag jump, the maximum cache size, the
 * too-large jump and the tmem offset.
 */
#define	PTC_FRINI_JDONE		0x0d
#define	PTC_FRINI_JFREE		0x23
#define	PTC_FRINI_MCS		0x29
#define	PTC_FRINI_JOV		0x2f
#define	PTC_FRINI_SOFF		0x3c
static const uint8_t freeinit[] = {
	0x55,					/* pushl %ebp */
	0x89, 0xe5,				/* movl %esp, %ebp */
	0x57,					/* pushl %edi */
	0x56,					/* pushl %esi */
	0x53,					/* pushl %ebx */
	0x8b, 0x45, 0x08,			/* movl 0x8(%ebp), %eax */
	0x85, 0xc0,				/* testl %eax, %eax */
	0x0f, 0x84, 0x00, 0x00, 0x00, 0x00,	/* je $JDONE (done) */
	0x83, 0xe8, 0x08,			/* subl $0x8,%eax */
	0x8b, 0x30,				/* movl (%eax),%esi */
	0x8b, 0x50, 0x04,			/* movl 0x4(%eax),%edx */
	0x01, 0xf2,				/* addl %esi,%edx */
	0x81, 0xfa, 0x00, 0xc0, 0x10, 0x3a,	/* cmpl MAGIC32, %edx */
	0x0f, 0x85, 0x00, 0x00, 0x00, 0x00,	/* jne +JFREE (realfree) */
	0x81, 0xfe, 0x00, 0x00, 0x00, 0x00,	/* cmpl $max cache size, %esi */
	0x0f, 0x87, 0x00, 0x00, 0x00, 0x00,	/* ja +$JMP (realfree) */
	0x65, 0x8b, 0x0d, 0x00, 0x0, 0x00, 0x00, /* movl %gs:0x0,%ecx */
	0x81, 0xc1, 0x00, 0x00, 0x00, 0x00,	/* addl $0xOFF, %ecx */
	0x8d, 0x51, 0x04			/* leal 0x4(%ecx),%edx */
};

/*
 * if (size <= $CACHE_SIZE) {
 *	csize = $CACHE_SIZE;
 * } else ...				! goto next cache
 */
/*
 * Byte offsets into inicache[] of the fields that genasm_firstcache() patches:
 * the compared size, the size loaded into %edi and the jump to the buffer
 * handling label.
 */
#define	PTC_INICACHE_CMP	0x02
#define	PTC_INICACHE_SIZE	0x09
#define	PTC_INICACHE_JMP	0x0e
static const uint8_t inicache[] = {
	0x81, 0xfe, 0xff, 0x00, 0x00, 0x00,	/* cmpl sizeof ($C0), %esi */
	0x77, 0x0a,				/* ja +0xa */
	0xbf, 0xff, 0x00, 0x00, 0x00,		/* movl sizeof ($C0), %edi */
	0xe9, 0x00, 0x00, 0x00, 0x00		/* jmp +$JMP (allocbuf) */
};

/*
 * if (size <= $CACHE_SIZE) {
 *	csize = $CACHE_SIZE;
 *	roots += $CACHE_NUM;
 * } else ...				! goto next cache
 */
/*
 * Byte offsets into gencache[] of the fields that genasm_gencache() patches:
 * the compared size, the one byte root offset, the size loaded into %edi and
 * the jump to the buffer handling label.
 */
#define	PTC_GENCACHE_CMP	0x02
#define	PTC_GENCACHE_NUM	0x0a
#define	PTC_GENCACHE_SIZE	0x0c
#define	PTC_GENCACHE_JMP	0x11
static const uint8_t gencache[] = {
	0x81, 0xfe, 0x00, 0x00, 0x00, 0x00,	/* cmpl sizeof ($CACHE), %esi */
	0x77, 0x0d,				/* ja +0xd (next cache) */
	0x83, 0xc2, 0x00,			/* addl $4*$ii, %edx */
	0xbf, 0x00, 0x00, 0x00, 0x00,		/* movl sizeof ($CACHE), %edi */
	0xe9, 0x00, 0x00, 0x00, 0x00		/* jmp +$JMP (allocbuf) */
};

/*
 * else if (size <= $CACHE_SIZE) {
 *	csize = $CACHE_SIZE;
 *	roots += $CACHE_NUM;
 * } else {
 *	goto tofunc;			! goto tomalloc if ptcmalloc.
 * }					! goto tofree if ptcfree.
*/
/*
 * Byte offsets into fincache[] of the fields that genasm_lastcache() patches:
 * the compared size, the one byte (rel8) jump displacement, the one byte root
 * offset and the size loaded into %edi.
 */
#define	PTC_FINCACHE_CMP	0x02
#define	PTC_FINCACHE_JMP	0x07
#define	PTC_FINCACHE_NUM	0x0a
#define	PTC_FINCACHE_SIZE	0x0c
static const uint8_t fincache[] = {
	0x81, 0xfe, 0xff, 0x00, 0x00, 0x00,	/* cmpl sizeof ($CLAST), %esi */
	0x77, 0x00,				/* ja +$JMP (to errout) */
	0x83, 0xc2, 0x00,			/* addl $4*($NCACHES-1), %edx */
	0xbf, 0x00, 0x00, 0x00, 0x00,		/* movl sizeof ($CLAST), %edi */
};

/*
 * if (*root == NULL)
 *	goto tomalloc;
 *
 * malloc_data_t *ret = *root;
 * *root = *(void **)ret;
 * t->tm_size -= csize;
 * ret->malloc_size = size;
 *
 * ret->malloc_data = UMEM_MALLOC_ENCODE(MALLOC_SECOND_MAGIC, size);
 * ret++;
 *
 * return ((void *)ret);
 * tomalloc:
 *	return (malloc(orig_size));
 */
/*
 * Byte offsets into malfini[]: the allocbuf label, the errout label, and the
 * rel32 displacement of the trailing jmp that genasm_malfini() patches.
 */
#define	PTC_MALFINI_ALLABEL	0x00
#define	PTC_MALFINI_JMLABEL	0x20
#define	PTC_MALFINI_JMADDR	0x25
static const uint8_t malfini[] = {
	/* allocbuf: */
	0x8b, 0x02,			/* movl (%edx), %eax */
	0x85, 0xc0,			/* testl %eax, %eax */
	0x74, 0x1a,			/* je +0x1a (errout) */
	0x8b, 0x18,			/* movl (%eax), %ebx */
	0x89, 0x1a,			/* movl %ebx, (%edx) */
	0x29, 0x39,			/* subl %edi, (%ecx) */
	0x89, 0x30,			/* movl %esi, (%eax) */
	0xba, 0x00, 0xc0, 0x10, 0x3a,	/* movl $0x3a10c000,%edx */
	0x29, 0xf2,			/* subl %esi, %edx */
	0x89, 0x50, 0x04,		/* movl %edx, 0x4(%eax) */
	0x83, 0xc0, 0x08,		/* addl $0x8, %eax */
	0x5b,				/* popl %ebx */
	0x5e,				/* popl %esi */
	0x5f,				/* popl %edi */
	0xc9,				/* leave */
	0xc3,				/* ret */
	/* errout: */
	0x5b,				/* popl %ebx */
	0x5e,				/* popl %esi */
	0x5f,				/* popl %edi */
	0xc9,				/* leave */
	0xe9, 0x00, 0x00, 0x00, 0x00	/* jmp $malloc */
};

/*
 * if (t->tm_size + csize >= umem_ptc_size)
 *	goto tofree;
 *
 * t->tm_size += csize
 * *(void **)tag = *root;
 * *root = tag;
 * return;
 * tofree:
 *	free(buf);
 *	return;
 */
/*
 * Byte offsets into freefini[]: the freebuf label, the 32-bit cache limit
 * immediate, the done label, the realfree label, and the rel32 displacement of
 * the trailing jmp that genasm_frfini() patches.
 */
#define	PTC_FRFINI_RBUFLABEL	0x00
#define	PTC_FRFINI_CACHEMAX	0x06
#define	PTC_FRFINI_DONELABEL	0x14
#define	PTC_FRFINI_JFLABEL	0x19
#define	PTC_FRFINI_JFADDR	0x1e
static const uint8_t freefini[] = {
	/* freebuf: */
	0x8b, 0x19,				/* movl (%ecx),%ebx */
	0x01, 0xfb,				/* addl %edi,%ebx */
	0x81, 0xfb, 0x00, 0x00, 0x00, 0x00,	/* cmpl maxsize, %ebx */
	0x73, 0x0d,				/* jae +0xd (realfree) */
	0x01, 0x39,				/* addl %edi,(%ecx) */
	0x8b, 0x3a,				/* movl (%edx),%edi */
	0x89, 0x38,				/* movl %edi,(%eax) */
	0x89, 0x02,				/* movl %eax,(%edx) */
	/* done: */
	0x5b,					/* popl %ebx */
	0x5e,					/* popl %esi */
	0x5f,					/* popl %edi */
	0xc9,					/* leave */
	0xc3,					/* ret */
	/* realfree: */
	0x5b,					/* popl %ebx */
	0x5e,					/* popl %esi */
	0x5f,					/* popl %edi */
	0xc9,					/* leave */
	0xe9, 0x00, 0x00, 0x00, 0x00		/* jmp free */
};

/*
 * Construct the initial part of malloc at bp. off is the offset from curthread
 * to the root of the tmem structure. ep is the offset, relative to bp, of the
 * label that bails out to the ordinary malloc. csize is the size of the largest
 * umem_cache in ptcumem. Returns the number of bytes written.
 */
static int
genasm_malinit(uint8_t *bp, uint32_t off, uint32_t ep, uint32_t csize)
{
	uint32_t addr;

	bcopy(malinit, bp, sizeof (malinit));
	/* Both conditional jumps (overflow and too large) go to ep. */
	addr = PTC_JMPADDR(ep, PTC_MALINIT_JOUT);
	bcopy(&addr, bp + PTC_MALINIT_JOUT, sizeof (addr));
	bcopy(&csize, bp + PTC_MALINIT_MCS, sizeof (csize));
	addr = PTC_JMPADDR(ep, PTC_MALINIT_JOV);
	bcopy(&addr, bp + PTC_MALINIT_JOV, sizeof (addr));
	bcopy(&off, bp + PTC_MALINIT_SOFF, sizeof (off));

	return (sizeof (malinit));
}

/*
 * Construct the initial part of free at bp. off is the offset from curthread
 * to the root of the tmem structure. dp is the offset, relative to bp, of the
 * label that simply returns; ep is the offset of the label that bails out to
 * the ordinary free. mc is the size compared against the buffer's size before
 * the cache is considered. Returns the number of bytes written.
 */
static int
genasm_frinit(uint8_t *bp, uint32_t off, uint32_t dp, uint32_t ep, uint32_t mc)
{
	uint32_t addr;

	bcopy(freeinit, bp, sizeof (freeinit));
	/* A NULL buffer jumps to dp. */
	addr = PTC_JMPADDR(dp, PTC_FRINI_JDONE);
	bcopy(&addr, bp + PTC_FRINI_JDONE, sizeof (addr));
	/* A bad tag or a size above mc jumps to ep. */
	addr = PTC_JMPADDR(ep, PTC_FRINI_JFREE);
	bcopy(&addr, bp + PTC_FRINI_JFREE, sizeof (addr));
	bcopy(&mc, bp + PTC_FRINI_MCS, sizeof (mc));
	addr = PTC_JMPADDR(ep, PTC_FRINI_JOV);
	bcopy(&addr, bp + PTC_FRINI_JOV, sizeof (addr));
	bcopy(&off, bp + PTC_FRINI_SOFF, sizeof (off));

	return (sizeof (freeinit));
}

/*
 * Create the initial cache entry of the specified size. The value of ap tells
 * us what the address of the label to try and allocate a buffer. This value is
 * an offset from the current base to that value.
*/ static int genasm_firstcache(uint8_t *bp, uint32_t csize, uint32_t ap) { uint32_t addr; bcopy(inicache, bp, sizeof (inicache)); bcopy(&csize, bp + PTC_INICACHE_CMP, sizeof (csize)); bcopy(&csize, bp + PTC_INICACHE_SIZE, sizeof (csize)); addr = PTC_JMPADDR(ap, PTC_INICACHE_JMP); ASSERT(addr != 0); bcopy(&addr, bp + PTC_INICACHE_JMP, sizeof (addr)); return (sizeof (inicache)); } static int genasm_gencache(uint8_t *bp, int num, uint32_t csize, uint32_t ap) { uint32_t addr; uint8_t coff; ASSERT(256 / PTC_ROOT_SIZE > num); ASSERT(num != 0); bcopy(gencache, bp, sizeof (gencache)); bcopy(&csize, bp + PTC_GENCACHE_CMP, sizeof (csize)); bcopy(&csize, bp + PTC_GENCACHE_SIZE, sizeof (csize)); coff = num * PTC_ROOT_SIZE; bcopy(&coff, bp + PTC_GENCACHE_NUM, sizeof (coff)); addr = PTC_JMPADDR(ap, PTC_GENCACHE_JMP); bcopy(&addr, bp + PTC_GENCACHE_JMP, sizeof (addr)); return (sizeof (gencache)); } static int genasm_lastcache(uint8_t *bp, int num, uint32_t csize, uint32_t ep) { uint8_t addr; ASSERT(ep <= 0xff && ep > 7); ASSERT(256 / PTC_ROOT_SIZE > num); bcopy(fincache, bp, sizeof (fincache)); bcopy(&csize, bp + PTC_FINCACHE_CMP, sizeof (csize)); bcopy(&csize, bp + PTC_FINCACHE_SIZE, sizeof (csize)); addr = num * PTC_ROOT_SIZE; bcopy(&addr, bp + PTC_FINCACHE_NUM, sizeof (addr)); addr = ep - PTC_FINCACHE_JMP - 1; bcopy(&addr, bp + PTC_FINCACHE_JMP, sizeof (addr)); return (sizeof (fincache)); } static int genasm_malfini(uint8_t *bp, uintptr_t mptr) { uint32_t addr; bcopy(malfini, bp, sizeof (malfini)); addr = PTC_JMPADDR(mptr, ((uintptr_t)bp + PTC_MALFINI_JMADDR)); bcopy(&addr, bp + PTC_MALFINI_JMADDR, sizeof (addr)); return (sizeof (malfini)); } static int genasm_frfini(uint8_t *bp, uint32_t maxthr, uintptr_t fptr) { uint32_t addr; bcopy(freefini, bp, sizeof (freefini)); bcopy(&maxthr, bp + PTC_FRFINI_CACHEMAX, sizeof (maxthr)); addr = PTC_JMPADDR(fptr, ((uintptr_t)bp + PTC_FRFINI_JFADDR)); bcopy(&addr, bp + PTC_FRFINI_JFADDR, sizeof (addr)); return (sizeof (freefini)); } /* * 
The malloc inline assembly is constructed as follows: * * o Malloc prologue assembly * o Generic first-cache check * o n Generic cache checks (where n = _tmem_get_entries() - 2) * o Generic last-cache check * o Malloc epilogue assembly * * Generally there are at least three caches. When there is only one cache we * only use the generic last-cache. In the case where there are two caches, we * just leave out the middle ones. */ static int genasm_malloc(void *base, size_t len, int nents, int *umem_alloc_sizes) { int ii, off; uint8_t *bp; size_t total; uint32_t allocoff, erroff; total = sizeof (malinit) + sizeof (malfini) + sizeof (fincache); if (nents >= 2) total += sizeof (inicache) + sizeof (gencache) * (nents - 2); if (total > len) return (1); erroff = total - sizeof (malfini) + PTC_MALFINI_JMLABEL; allocoff = total - sizeof (malfini) + PTC_MALFINI_ALLABEL; bp = base; off = genasm_malinit(bp, umem_tmem_off, erroff, umem_alloc_sizes[nents-1]); bp += off; allocoff -= off; erroff -= off; if (nents > 1) { off = genasm_firstcache(bp, umem_alloc_sizes[0], allocoff); bp += off; allocoff -= off; erroff -= off; } for (ii = 1; ii < nents - 1; ii++) { off = genasm_gencache(bp, ii, umem_alloc_sizes[ii], allocoff); bp += off; allocoff -= off; erroff -= off; } bp += genasm_lastcache(bp, nents - 1, umem_alloc_sizes[nents - 1], erroff); bp += genasm_malfini(bp, umem_genasm_omptr); ASSERT(((uintptr_t)bp - total) == (uintptr_t)base); return (0); } static int genasm_free(void *base, size_t len, int nents, int *umem_alloc_sizes) { uint8_t *bp; int ii, off; size_t total; uint32_t rbufoff, retoff, erroff; /* Assume that nents has already been audited for us */ total = sizeof (freeinit) + sizeof (freefini) + sizeof (fincache); if (nents >= 2) total += sizeof (inicache) + sizeof (gencache) * (nents - 2); if (total > len) return (1); erroff = total - (sizeof (freefini) - PTC_FRFINI_JFLABEL); rbufoff = total - (sizeof (freefini) - PTC_FRFINI_RBUFLABEL); retoff = total - (sizeof (freefini) - 
PTC_FRFINI_DONELABEL); bp = base; off = genasm_frinit(bp, umem_tmem_off, retoff, erroff, umem_alloc_sizes[nents - 1]); bp += off; erroff -= off; rbufoff -= off; if (nents > 1) { off = genasm_firstcache(bp, umem_alloc_sizes[0], rbufoff); bp += off; erroff -= off; rbufoff -= off; } for (ii = 1; ii < nents - 1; ii++) { off = genasm_gencache(bp, ii, umem_alloc_sizes[ii], rbufoff); bp += off; rbufoff -= off; erroff -= off; } bp += genasm_lastcache(bp, nents - 1, umem_alloc_sizes[nents - 1], erroff); bp += genasm_frfini(bp, umem_ptc_size, umem_genasm_ofptr); ASSERT(((uintptr_t)bp - total) == (uintptr_t)base); return (0); } boolean_t umem_genasm(int *alloc_sizes, umem_cache_t **caches, int ncaches) { int nents, i; uint8_t *mptr; uint8_t *fptr; uint64_t v, *vptr; size_t mplen, fplen; uintptr_t mpbase, fpbase; boolean_t ret = B_FALSE; mptr = (void *)((uintptr_t)umem_genasm_mptr + 5); fptr = (void *)((uintptr_t)umem_genasm_fptr + 5); if (umem_genasm_mptr == 0 || umem_genasm_msize == 0 || umem_genasm_fptr == 0 || umem_genasm_fsize == 0) { return (B_FALSE); } mplen = P2ROUNDUP(umem_genasm_msize, pagesize); mpbase = P2ALIGN((uintptr_t)umem_genasm_mptr, pagesize); fplen = P2ROUNDUP(umem_genasm_fsize, pagesize); fpbase = P2ALIGN((uintptr_t)umem_genasm_mptr, pagesize); /* * If the values straddle a page boundary, then we might need to * actually remap two pages. 
*/ if (P2ALIGN(umem_genasm_msize + (uintptr_t)umem_genasm_mptr, pagesize) != mpbase) { mplen += pagesize; } if (P2ALIGN(umem_genasm_fsize + (uintptr_t)umem_genasm_fptr, pagesize) != fpbase) { fplen += pagesize; } if (mprotect((void *)mpbase, mplen, PROT_READ | PROT_WRITE | PROT_EXEC) != 0) { return (B_FALSE); } if (mprotect((void *)fpbase, fplen, PROT_READ | PROT_WRITE | PROT_EXEC) != 0) { if (mprotect((void *)mpbase, mplen, PROT_READ | PROT_EXEC) != 0) { umem_panic("genasm failed to restore memory " "protection: %d", errno); } return (B_FALSE); } /* * The total number of caches that we can service is the minimum of: * o the amount supported by libc * o the total number of umem caches * o we use a single byte addl, so it's 255 / sizeof (uintptr_t). For * 32-bit, this is 63. */ nents = _tmem_get_nentries(); if (UMEM_GENASM_MAX32 < nents) nents = UMEM_GENASM_MAX32; if (ncaches < nents) nents = ncaches; /* * If the number of per-thread caches has been set to zero or the * per-thread cache size has been set to zero, don't bother trying to * write any assembly and just use the default malloc and free. When we * return, indicate that there is no PTC support. 
*/ if (nents == 0 || umem_ptc_size == 0) { goto out; } /* Take into account the jump */ if (genasm_malloc(mptr, umem_genasm_msize, nents, alloc_sizes) != 0) { goto out; } if (genasm_free(fptr, umem_genasm_fsize, nents, alloc_sizes) != 0) { goto out; } /* nop out the jump with a multibyte jump */ vptr = (void *)umem_genasm_mptr; v = MULTINOP; v |= *vptr & (0xffffffULL << 40); (void) atomic_swap_64(vptr, v); vptr = (void *)umem_genasm_fptr; v = MULTINOP; v |= *vptr & (0xffffffULL << 40); (void) atomic_swap_64(vptr, v); for (i = 0; i < nents; i++) caches[i]->cache_flags |= UMF_PTC; ret = B_TRUE; out: if (mprotect((void *)mpbase, mplen, PROT_READ | PROT_EXEC) != 0) { umem_panic("genasm failed to restore memory protection: %d", errno); } if (mprotect((void *)fpbase, fplen, PROT_READ | PROT_EXEC) != 0) { umem_panic("genasm failed to restore memory protection: %d", errno); } return (ret); }