/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
 * Copyright 2018, Joyent, Inc.
 * Copyright 2020 Oxide Computer Company
 */

/*
 * Kernel memory allocator, as described in the following two papers and a
 * statement about the consolidator:
 *
 * Jeff Bonwick,
 * The Slab Allocator: An Object-Caching Kernel Memory Allocator.
 * Proceedings of the Summer 1994 Usenix Conference.
 * Available as /shared/sac/PSARC/1994/028/materials/kmem.pdf.
 *
 * Jeff Bonwick and Jonathan Adams,
 * Magazines and vmem: Extending the Slab Allocator to Many CPUs and
 * Arbitrary Resources.
 * Proceedings of the 2001 Usenix Conference.
 * Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf.
 *
 * kmem Slab Consolidator Big Theory Statement:
 *
 * 1. Motivation
 *
 * As stated in Bonwick94, slabs provide the following advantages over other
 * allocation structures in terms of memory fragmentation:
 *
 *  - Internal fragmentation (per-buffer wasted space) is minimal.
 *  - Severe external fragmentation (unused buffers on the free list) is
 *    unlikely.
 *
 * Segregating objects by size eliminates one source of external fragmentation,
 * and according to Bonwick:
 *
 *   The other reason that slabs reduce external fragmentation is that all
 *   objects in a slab are of the same type, so they have the same lifetime
 *   distribution. The resulting segregation of short-lived and long-lived
 *   objects at slab granularity reduces the likelihood of an entire page being
 *   held hostage due to a single long-lived allocation [Barrett93, Hanson90].
 *
 * While unlikely, severe external fragmentation remains possible. Clients that
 * allocate both short- and long-lived objects from the same cache cannot
 * anticipate the distribution of long-lived objects within the allocator's
 * slab implementation. Even a small percentage of long-lived objects
 * distributed randomly across many slabs can lead to a worst case scenario
 * where the client frees the majority of its objects and the system gets back
 * almost none of the slabs. Despite the client doing what it reasonably can to
 * help the system reclaim memory, the allocator cannot shake free enough slabs
 * because of lonely allocations stubbornly hanging on. Although the allocator
 * is in a position to diagnose the fragmentation, there is nothing that the
 * allocator by itself can do about it. It only takes a single allocated object
 * to prevent an entire slab from being reclaimed, and any object handed out by
 * kmem_cache_alloc() is by definition in the client's control. Conversely,
 * although the client is in a position to move a long-lived object, it has no
 * way of knowing if the object is causing fragmentation, and if so, where to
 * move it. A solution necessarily requires further cooperation between the
 * allocator and the client.
 *
 * 2. Move Callback
 *
 * The kmem slab consolidator therefore adds a move callback to the
 * allocator/client interface, improving worst-case external fragmentation in
 * kmem caches that supply a function to move objects from one memory location
 * to another. In a situation of low memory, kmem attempts to consolidate all
 * of a cache's slabs at once; otherwise it works slowly to bring external
 * fragmentation within the 1/8 limit guaranteed for internal fragmentation,
 * thereby helping to avoid a low memory situation in the future.
 *
 * The callback has the following signature:
 *
 *   kmem_cbrc_t move(void *old, void *new, size_t size, void *user_arg)
 *
 * It supplies the kmem client with two addresses: the allocated object that
 * kmem wants to move and a buffer selected by kmem for the client to use as
 * the copy destination. The callback is kmem's way of saying "Please get off
 * of this buffer and use this one instead." kmem knows where it wants to move
 * the object in order to best reduce fragmentation.
 * All the client needs to know about the second argument (void *new) is that
 * it is an allocated, constructed object ready to take the contents of the
 * old object. When the move function is called, the system is likely to be
 * low on memory, and the new object spares the client from having to worry
 * about allocating memory for the requested move. The third argument supplies
 * the size of the object, in case a single move function handles multiple
 * caches whose objects differ only in size (such as zio_buf_512, zio_buf_1024,
 * etc). Finally, the same optional user argument passed to the constructor,
 * destructor, and reclaim functions is also passed to the move callback.
 *
 * 2.1 Setting the Move Callback
 *
 * The client sets the move callback after creating the cache and before
 * allocating from it:
 *
 *	object_cache = kmem_cache_create(...);
 *	kmem_cache_set_move(object_cache, object_move);
 *
 * 2.2 Move Callback Return Values
 *
 * Only the client knows about its own data and when is a good time to move it.
 * The client is cooperating with kmem to return unused memory to the system,
 * and kmem respectfully accepts this help at the client's convenience. When
 * asked to move an object, the client can respond with any of the following:
 *
 *	typedef enum kmem_cbrc {
 *		KMEM_CBRC_YES,
 *		KMEM_CBRC_NO,
 *		KMEM_CBRC_LATER,
 *		KMEM_CBRC_DONT_NEED,
 *		KMEM_CBRC_DONT_KNOW
 *	} kmem_cbrc_t;
 *
 * The client must not explicitly kmem_cache_free() either of the objects
 * passed to the callback, since kmem wants to free them directly to the slab
 * layer (bypassing the per-CPU magazine layer). The response tells kmem which
 * of the objects to free:
 *
 *       YES: (Did it) The client moved the object, so kmem frees the old one.
 *        NO: (Never) The client refused, so kmem frees the new object (the
 *            unused copy destination). kmem also marks the slab of the old
 *            object so as not to bother the client with further callbacks for
 *            that object as long as the slab remains on the partial slab list.
 *            (The system won't be getting the slab back as long as the
 *            immovable object holds it hostage, so there's no point in moving
 *            any of its objects.)
 *     LATER: The client is using the object and cannot move it now, so kmem
 *            frees the new object (the unused copy destination). kmem still
 *            attempts to move other objects off the slab, since it expects to
 *            succeed in clearing the slab in a later callback. The client
 *            should use LATER instead of NO if the object is likely to become
 *            movable very soon.
 * DONT_NEED: The client no longer needs the object, so kmem frees the old
 *            along with the new object (the unused copy destination). This
 *            response is the client's opportunity to be a model citizen and
 *            give back as much as it can.
 * DONT_KNOW: The client does not know about the object because
 *            a) the client has just allocated the object and not yet put it
 *               wherever it expects to find known objects
 *            b) the client has removed the object from wherever it expects to
 *               find known objects and is about to free it, or
 *            c) the client has freed the object.
 *            In all these cases (a, b, and c) kmem frees the new object (the
 *            unused copy destination). In the first case, the object is in
 *            use and the correct action is that for LATER; in the latter two
 *            cases, we know that the object is either freed or about to be
 *            freed, in which case it is either already in a magazine or about
 *            to be in one. In these cases, we know that the object will either
 *            be reallocated and reused, or it will end up in a full magazine
 *            that will be reaped (thereby liberating the slab). Because it
 *            is prohibitively expensive to differentiate these cases, and
 *            because the defrag code is executed when we're low on memory
 *            (thereby biasing the system to reclaim full magazines) we treat
 *            all DONT_KNOW cases as LATER and rely on cache reaping to
 *            generally clean up full magazines. While we take the same action
 *            for these cases, we maintain their semantic distinction: if
 *            defragmentation is not occurring, it is useful to know if this
 *            is due to objects in use (LATER) or objects in an unknown state
 *            of transition (DONT_KNOW).
 *
 * 2.3 Object States
 *
 * Neither kmem nor the client can be assumed to know the object's whereabouts
 * at the time of the callback. An object belonging to a kmem cache may be in
 * any of the following states:
 *
 * 1. Uninitialized on the slab
 * 2. Allocated from the slab but not constructed (still uninitialized)
 * 3. Allocated from the slab, constructed, but not yet ready for business
 *    (not in a valid state for the move callback)
 * 4. In use (valid and known to the client)
 * 5. About to be freed (no longer in a valid state for the move callback)
 * 6. Freed to a magazine (still constructed)
 * 7. Allocated from a magazine, not yet ready for business (not in a valid
 *    state for the move callback), and about to return to state #4
 * 8. Deconstructed on a magazine that is about to be freed
 * 9. Freed to the slab
 *
 * Since the move callback may be called at any time while the object is in any
 * of the above states (except state #1), the client needs a safe way to
 * determine whether or not it knows about the object. Specifically, the client
 * needs to know whether or not the object is in state #4, the only state in
 * which a move is valid. If the object is in any other state, the client
 * should immediately return KMEM_CBRC_DONT_KNOW, since it is unsafe to access
 * any of the object's fields.
 *
 * Note that although an object may be in state #4 when kmem initiates the move
 * request, the object may no longer be in that state by the time kmem actually
 * calls the move function. Not only does the client free objects
 * asynchronously, kmem itself puts move requests on a queue where they are
 * pending until kmem processes them from another context. Also, objects freed
 * to a magazine appear allocated from the point of view of the slab layer, so
 * kmem may even initiate requests for objects in a state other than state #4.
 *
 * 2.3.1 Magazine Layer
 *
 * An important insight revealed by the states listed above is that the
 * magazine layer is populated only by kmem_cache_free(). Magazines of
 * constructed objects are never populated directly from the slab layer (which
 * contains raw, unconstructed objects). Whenever an allocation request cannot
 * be satisfied from the magazine layer, the magazines are bypassed and the
 * request is satisfied from the slab layer (creating a new slab if necessary).
 * kmem calls the object constructor only when allocating from the slab layer,
 * and only in response to kmem_cache_alloc() or to prepare the destination
 * buffer passed in the move callback. kmem does not preconstruct objects in
 * anticipation of kmem_cache_alloc().
 *
 * 2.3.2 Object Constructor and Destructor
 *
 * If the client supplies a destructor, it must be valid to call the destructor
 * on a newly created object (immediately after the constructor).
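 *
 * As a minimal sketch of what that requirement means (using the hypothetical
 * object_t and its o_link_node list node from the examples that follow), a
 * constructor/destructor pair might only establish and check state that is
 * valid immediately after construction:
 *
 *	static int
 *	object_construct(void *buf, void *arg, int kmflags)
 *	{
 *		object_t *op = buf;
 *
 *		list_link_init(&op->o_link_node);
 *		return (0);
 *	}
 *
 *	static void
 *	object_destruct(void *buf, void *arg)
 *	{
 *		object_t *op = buf;
 *
 *		// Must hold even if the object was constructed but never used.
 *		ASSERT(!list_link_active(&op->o_link_node));
 *	}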
 *
 * 2.4 Recognizing Known Objects
 *
 * There is a simple test to determine safely whether or not the client knows
 * about a given object in the move callback. It relies on the fact that kmem
 * guarantees that the object of the move callback has only been touched by the
 * client itself or else by kmem. kmem does this by ensuring that none of the
 * cache's slabs are freed to the virtual memory (VM) subsystem while a move
 * callback is pending. When the last object on a slab is freed, if there is a
 * pending move, kmem puts the slab on a per-cache dead list and defers freeing
 * slabs on that list until all pending callbacks are completed. That way,
 * clients can be certain that the object of a move callback is in one of the
 * states listed above, making it possible to distinguish known objects (in
 * state #4) using the two low order bits of any pointer member (with the
 * exception of 'char *' or 'short *' which may not be 4-byte aligned on some
 * platforms).
 *
 * The test works as long as the client always transitions objects from state
 * #4 (known, in use) to state #5 (about to be freed, invalid) by setting the
 * low order bit of the client-designated pointer member. Since kmem only
 * writes invalid memory patterns, such as 0xbaddcafe to uninitialized memory
 * and 0xdeadbeef to freed memory, any scribbling on the object done by kmem is
 * guaranteed to set at least one of the two low order bits. Therefore, given
 * an object with a back pointer to a 'container_t *o_container', the client
 * can test
 *
 *	container_t *container = object->o_container;
 *	if ((uintptr_t)container & 0x3) {
 *		return (KMEM_CBRC_DONT_KNOW);
 *	}
 *
 * Typically, an object will have a pointer to some structure with a list or
 * hash where objects from the cache are kept while in use. Assuming that the
 * client has some way of knowing that the container structure is valid and
 * will not go away during the move, and assuming that the structure includes a
 * lock to protect whatever collection is used, then the client would continue
 * as follows:
 *
 *	// Ensure that the container structure does not go away.
 *	if (container_hold(container) == 0) {
 *		return (KMEM_CBRC_DONT_KNOW);
 *	}
 *	mutex_enter(&container->c_objects_lock);
 *	if (container != object->o_container) {
 *		mutex_exit(&container->c_objects_lock);
 *		container_rele(container);
 *		return (KMEM_CBRC_DONT_KNOW);
 *	}
 *
 * At this point the client knows that the object cannot be freed as long as
 * c_objects_lock is held. Note that after acquiring the lock, the client must
 * recheck the o_container pointer in case the object was removed just before
 * acquiring the lock.
 *
 * When the client is about to free an object, it must first remove that object
 * from the list, hash, or other structure where it is kept.
 * At that time, to mark the object so it can be distinguished from the
 * remaining, known objects, the client sets the designated low order bit:
 *
 *	mutex_enter(&container->c_objects_lock);
 *	object->o_container = (void *)((uintptr_t)object->o_container | 0x1);
 *	list_remove(&container->c_objects, object);
 *	mutex_exit(&container->c_objects_lock);
 *
 * In the common case, the object is freed to the magazine layer, where it may
 * be reused on a subsequent allocation without the overhead of calling the
 * constructor. While in the magazine it appears allocated from the point of
 * view of the slab layer, making it a candidate for the move callback. Most
 * objects unrecognized by the client in the move callback fall into this
 * category and are cheaply distinguished from known objects by the test
 * described earlier. Because searching magazines is prohibitively expensive
 * for kmem, clients that do not mark freed objects (and therefore return
 * KMEM_CBRC_DONT_KNOW for large numbers of objects) may find defragmentation
 * efficacy reduced.
 *
 * Invalidating the designated pointer member before freeing the object marks
 * the object to be avoided in the callback, and conversely, assigning a valid
 * value to the designated pointer member after allocating the object makes the
 * object fair game for the callback:
 *
 *	... allocate object ...
 *	... set any initial state not set by the constructor ...
 *
 *	mutex_enter(&container->c_objects_lock);
 *	list_insert_tail(&container->c_objects, object);
 *	membar_producer();
 *	object->o_container = container;
 *	mutex_exit(&container->c_objects_lock);
 *
 * Note that everything else must be valid before setting o_container makes the
 * object fair game for the move callback. The membar_producer() call ensures
 * that all the object's state is written to memory before setting the pointer
 * that transitions the object from state #3 or #7 (allocated, constructed, not
 * yet in use) to state #4 (in use, valid). That's important because the move
 * function has to check the validity of the pointer before it can safely
 * acquire the lock protecting the collection where it expects to find known
 * objects.
 *
 * This method of distinguishing known objects observes the usual symmetry:
 * invalidating the designated pointer is the first thing the client does
 * before freeing the object, and setting the designated pointer is the last
 * thing the client does after allocating the object. Of course, the client is
 * not required to use this method.
 * Fundamentally, how the client recognizes known objects is completely up to
 * the client, but this method is recommended as an efficient and safe way to
 * take advantage of the guarantees made by kmem. If the entire object is
 * arbitrary data without any markable bits from a suitable pointer member,
 * then the client must find some other method, such as searching a hash table
 * of known objects.
 *
 * 2.5 Preventing Objects From Moving
 *
 * Besides a way to distinguish known objects, the other thing that the client
 * needs is a strategy to ensure that an object will not move while the client
 * is actively using it. The details of satisfying this requirement tend to be
 * highly cache-specific. It might seem that the same rules that let a client
 * remove an object safely should also decide when an object can be moved
 * safely. However, any object state that makes a removal attempt invalid is
 * likely to be long-lasting for objects that the client does not expect to
 * remove. kmem knows nothing about the object state and is equally likely
 * (from the client's point of view) to request a move for any object in the
 * cache, whether prepared for removal or not. Even a low percentage of objects
 * stuck in place by unremovability will defeat the consolidator if the stuck
 * objects are the same long-lived allocations likely to hold slabs hostage.
 * Fundamentally, the consolidator is not aimed at common cases. Severe
 * external fragmentation is a worst case scenario manifested as sparsely
 * allocated slabs, by definition a low percentage of the cache's objects. When
 * deciding what makes an object movable, keep in mind the goal of the
 * consolidator: to bring worst-case external fragmentation within the limits
 * guaranteed for internal fragmentation. Removability is a poor criterion if
 * it is likely to exclude more than an insignificant percentage of objects for
 * long periods of time.
 *
 * A tricky general solution exists, and it has the advantage of letting you
 * move any object at almost any moment, practically eliminating the likelihood
 * that an object can hold a slab hostage. However, if there is a
 * cache-specific way to ensure that an object is not actively in use in the
 * vast majority of cases, a simpler solution that leverages this
 * cache-specific knowledge is preferred.
 *
 * 2.5.1 Cache-Specific Solution
 *
 * As an example of a cache-specific solution, the ZFS znode cache takes
 * advantage of the fact that the vast majority of znodes are only being
 * referenced from the DNLC. (A typical case might be a few hundred in active
 * use and a hundred thousand in the DNLC.)
 * In the move callback, after the ZFS client has established that it
 * recognizes the znode and can access its fields safely (using the method
 * described earlier), it then tests whether the znode is referenced by
 * anything other than the DNLC. If so, it assumes that the znode may be in
 * active use and is unsafe to move, so it drops its locks and returns
 * KMEM_CBRC_LATER. The advantage of this strategy is that everywhere else
 * znodes are used, no change is needed to protect against the possibility of
 * the znode moving. The disadvantage is that it remains possible for an
 * application to hold a znode slab hostage with an open file descriptor.
 * However, this case ought to be rare and the consolidator has a way to deal
 * with it: If the client responds KMEM_CBRC_LATER repeatedly for the same
 * object, kmem eventually stops believing it and treats the slab as if the
 * client had responded KMEM_CBRC_NO. Having marked the hostage slab, kmem can
 * then focus on getting it off of the partial slab list by allocating rather
 * than freeing all of its objects. (Either way of getting a slab off the
 * free list reduces fragmentation.)
 *
 * 2.5.2 General Solution
 *
 * The general solution, on the other hand, requires an explicit hold
 * everywhere the object is used to prevent it from moving. To keep the client
 * locking strategy as uncomplicated as possible, kmem guarantees the
 * simplifying assumption that move callbacks are sequential, even across
 * multiple caches. Internally, a global queue processed by a single thread
 * supports all caches implementing the callback function. No matter how many
 * caches supply a move function, the consolidator never moves more than one
 * object at a time, so the client does not have to worry about tricky lock
 * ordering involving several related objects from different kmem caches.
 *
 * The general solution implements the explicit hold as a read-write lock,
 * which allows multiple readers to access an object from the cache
 * simultaneously while a single writer is excluded from moving it. A single
 * rwlock for the entire cache would lock out all threads from using any of the
 * cache's objects even though only a single object is being moved, so to
 * reduce contention, the client can fan out the single rwlock into an array of
 * rwlocks hashed by the object address, making it probable that moving one
 * object will not prevent other threads from using a different object. The
 * rwlock cannot be a member of the object itself, because the possibility of
 * the object moving makes it unsafe to access any of the object's fields until
 * the lock is acquired.
 *
 * Assuming a small, fixed number of locks, it's possible that multiple objects
 * will hash to the same lock.
 * A thread that needs to use multiple objects in the same function may acquire
 * the same lock multiple times. Since rwlocks are reentrant for readers, and
 * since there is never more than a single writer at a time (assuming that the
 * client acquires the lock as a writer only when moving an object inside the
 * callback), there would seem to be no problem. However, a client locking
 * multiple objects in the same function must handle one case of potential
 * deadlock: Assume that thread A needs to prevent both object 1 and object 2
 * from moving, and thread B, the callback, meanwhile tries to move object 3.
 * It's possible, if objects 1, 2, and 3 all hash to the same lock, that thread
 * A will acquire the lock for object 1 as a reader before thread B sets the
 * lock's write-wanted bit, preventing thread A from reacquiring the lock for
 * object 2 as a reader. Unable to make forward progress, thread A will never
 * release the lock for object 1, resulting in deadlock.
 *
 * There are two ways of avoiding the deadlock just described. The first is to
 * use rw_tryenter() rather than rw_enter() in the callback function when
 * attempting to acquire the lock as a writer. If tryenter discovers that the
 * same object (or another object hashed to the same lock) is already in use,
 * it aborts the callback and returns KMEM_CBRC_LATER. The second way is to use
 * rprwlock_t (declared in common/fs/zfs/sys/rprwlock.h) instead of rwlock_t,
 * since it allows a thread to acquire the lock as a reader in spite of a
 * waiting writer. This second approach insists on moving the object now, no
 * matter how many readers the move function must wait for in order to do so,
 * and could delay the completion of the callback indefinitely (blocking
 * callbacks to other clients). In practice, a less insistent callback using
 * rw_tryenter() returns KMEM_CBRC_LATER infrequently enough that there seems
 * little reason to use anything else.
 *
 * Avoiding deadlock is not the only problem that an implementation using an
 * explicit hold needs to solve. Locking the object in the first place (to
 * prevent it from moving) remains a problem, since the object could move
 * between the time you obtain a pointer to the object and the time you acquire
 * the rwlock hashed to that pointer value. Therefore the client needs to
 * recheck the value of the pointer after acquiring the lock, drop the lock if
 * the value has changed, and try again. This requires a level of indirection:
 * something that points to the object rather than the object itself, that the
 * client can access safely while attempting to acquire the lock. (The object
 * itself cannot be referenced safely because it can move at any time.)
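 *
 * The examples below refer to an OBJECT_RWLOCK() macro that maps an object
 * address to one of the hashed locks. As a minimal sketch (the lock count and
 * the hash shift are hypothetical client choices, not part of the kmem
 * interface), such a macro might simply index a fixed array of rwlocks:
 *
 *	#define	OBJECT_LOCK_COUNT	64	// hypothetical; power of two
 *	static krwlock_t object_rwlock_hash[OBJECT_LOCK_COUNT];
 *
 *	#define	OBJECT_RWLOCK(op)					\
 *	    (&object_rwlock_hash[((uintptr_t)(op) >> 6) &		\
 *	    (OBJECT_LOCK_COUNT - 1)])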
 *
 * The following lock-acquisition function takes whatever is safe to reference
 * (arg), follows its pointer to the object (using function f), and tries as
 * often as necessary to acquire the hashed lock and verify that the object
 * still has not moved:
 *
 *	object_t *
 *	object_hold(object_f f, void *arg)
 *	{
 *		object_t *op;
 *
 *		op = f(arg);
 *		if (op == NULL) {
 *			return (NULL);
 *		}
 *
 *		rw_enter(OBJECT_RWLOCK(op), RW_READER);
 *		while (op != f(arg)) {
 *			rw_exit(OBJECT_RWLOCK(op));
 *			op = f(arg);
 *			if (op == NULL) {
 *				break;
 *			}
 *			rw_enter(OBJECT_RWLOCK(op), RW_READER);
 *		}
 *
 *		return (op);
 *	}
 *
 * The OBJECT_RWLOCK macro hashes the object address to obtain the rwlock. The
 * lock reacquisition loop, while necessary, almost never executes. The
 * function pointer f (used to obtain the object pointer from arg) has the
 * following type definition:
 *
 *	typedef object_t *(*object_f)(void *arg);
 *
 * An object_f implementation is likely to be as simple as accessing a
 * structure member:
 *
 *	object_t *
 *	s_object(void *arg)
 *	{
 *		something_t *sp = arg;
 *		return (sp->s_object);
 *	}
 *
 * The flexibility of a function pointer allows the path to the object to be
 * arbitrarily complex and also supports the notion that depending on where you
 * are using the object, you may need to get it from someplace different.
 *
 * The function that releases the explicit hold is simpler because it does not
 * have to worry about the object moving:
 *
 *	void
 *	object_rele(object_t *op)
 *	{
 *		rw_exit(OBJECT_RWLOCK(op));
 *	}
 *
 * The caller is spared these details so that obtaining and releasing an
 * explicit hold feels like a simple mutex_enter()/mutex_exit() pair. The
 * caller of object_hold() only needs to know that the returned object pointer
 * is valid if not NULL and that the object will not move until released.
 *
 * Although object_hold() prevents an object from moving, it does not prevent
 * it from being freed. The caller must take measures before calling
 * object_hold() (afterwards is too late) to ensure that the held object cannot
 * be freed. The caller must do so without accessing the unsafe object
 * reference, so any lock or reference count used to ensure the continued
 * existence of the object must live outside the object itself.
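 *
 * For illustration, a hypothetical caller that has already ensured (by some
 * lock or reference count outside the object, as just described) that the
 * object cannot be freed might bracket its use of the object like this, using
 * the same something_t from the s_object() example above:
 *
 *	void
 *	something_process(something_t *sp)
 *	{
 *		object_t *op;
 *
 *		if ((op = object_hold(s_object, sp)) == NULL) {
 *			return;
 *		}
 *		// ... use *op; it cannot move until object_rele() ...
 *		object_rele(op);
 *	}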
 *
 * Obtaining a new object is a special case where an explicit hold is
 * impossible for the caller. Any function that returns a newly allocated
 * object (either as a return value, or as an in-out parameter) must return it
 * already held; after the caller gets it is too late, since the object cannot
 * be safely accessed without the level of indirection described earlier. The
 * following object_alloc() example uses the same code shown earlier to
 * transition a new object into the state of being recognized (by the client)
 * as a known object. The function must acquire the hold (rw_enter) before that
 * state transition makes the object movable:
 *
 *	static object_t *
 *	object_alloc(container_t *container)
 *	{
 *		object_t *object = kmem_cache_alloc(object_cache, 0);
 *		... set any initial state not set by the constructor ...
 *		rw_enter(OBJECT_RWLOCK(object), RW_READER);
 *		mutex_enter(&container->c_objects_lock);
 *		list_insert_tail(&container->c_objects, object);
 *		membar_producer();
 *		object->o_container = container;
 *		mutex_exit(&container->c_objects_lock);
 *		return (object);
 *	}
 *
 * Functions that implicitly acquire an object hold (any function that calls
 * object_alloc() to supply an object for the caller) need to be carefully
 * noted so that the matching object_rele() is not neglected. Otherwise, leaked
 * holds prevent all objects hashed to the affected rwlocks from ever being
 * moved.
 *
 * The pointer to a held object can be hashed to the holding rwlock even after
 * the object has been freed. Although it is possible to release the hold
 * after freeing the object, you may decide to release the hold implicitly in
 * whatever function frees the object, so as to release the hold as soon as
 * possible, and for the sake of symmetry with the function that implicitly
 * acquires the hold when it allocates the object. Here, object_free() releases
 * the hold acquired by object_alloc(). Its implicit object_rele() forms a
 * matching pair with object_hold():
 *
 *	void
 *	object_free(object_t *object)
 *	{
 *		container_t *container;
 *
 *		ASSERT(object_held(object));
 *		container = object->o_container;
 *		mutex_enter(&container->c_objects_lock);
 *		object->o_container =
 *		    (void *)((uintptr_t)object->o_container | 0x1);
 *		list_remove(&container->c_objects, object);
 *		mutex_exit(&container->c_objects_lock);
 *		object_rele(object);
 *		kmem_cache_free(object_cache, object);
 *	}
 *
 * Note that object_free() cannot safely accept an object pointer as an
 * argument unless the object is already held.
 * Any function that calls object_free() needs to be carefully noted since it
 * similarly forms a matching pair with object_hold().
 *
 * To complete the picture, the following callback function implements the
 * general solution by moving objects only if they are currently unheld:
 *
 *	static kmem_cbrc_t
 *	object_move(void *buf, void *newbuf, size_t size, void *arg)
 *	{
 *		object_t *op = buf, *np = newbuf;
 *		container_t *container;
 *
 *		container = op->o_container;
 *		if ((uintptr_t)container & 0x3) {
 *			return (KMEM_CBRC_DONT_KNOW);
 *		}
 *
 *		// Ensure that the container structure does not go away.
 *		if (container_hold(container) == 0) {
 *			return (KMEM_CBRC_DONT_KNOW);
 *		}
 *
 *		mutex_enter(&container->c_objects_lock);
 *		if (container != op->o_container) {
 *			mutex_exit(&container->c_objects_lock);
 *			container_rele(container);
 *			return (KMEM_CBRC_DONT_KNOW);
 *		}
 *
 *		if (rw_tryenter(OBJECT_RWLOCK(op), RW_WRITER) == 0) {
 *			mutex_exit(&container->c_objects_lock);
 *			container_rele(container);
 *			return (KMEM_CBRC_LATER);
 *		}
 *
 *		object_move_impl(op, np);	// critical section
 *		rw_exit(OBJECT_RWLOCK(op));
 *
 *		op->o_container = (void *)((uintptr_t)op->o_container | 0x1);
 *		list_link_replace(&op->o_link_node, &np->o_link_node);
 *		mutex_exit(&container->c_objects_lock);
 *		container_rele(container);
 *		return (KMEM_CBRC_YES);
 *	}
 *
 * Note that object_move() must invalidate the designated o_container pointer
 * of the old object in the same way that object_free() does, since kmem will
 * free the object in response to the KMEM_CBRC_YES return value.
 *
 * The lock order in object_move() differs from object_alloc(), which locks
 * OBJECT_RWLOCK first and &container->c_objects_lock second, but as long as
 * the callback uses rw_tryenter() (preventing the deadlock described earlier),
 * it's not a problem. Holding the lock on the object list in the example above
 * through the entire callback not only prevents the object from going away, it
 * also allows you to lock the list elsewhere and know that none of its
 * elements will move during iteration.
 *
 * Adding an explicit hold everywhere an object from the cache is used is
 * tricky and involves much more change to client code than a cache-specific
 * solution that leverages existing state to decide whether or not an object is
 * movable.
 * However, this approach has the advantage that no object remains immovable
 * for any significant length of time, making it extremely unlikely that
 * long-lived allocations can continue holding slabs hostage; and it works for
 * any cache.
 *
 * 3. Consolidator Implementation
 *
 * Once the client supplies a move function that a) recognizes known objects
 * and b) avoids moving objects that are actively in use, the remaining work is
 * up to the consolidator to decide which objects to move and when to issue
 * callbacks.
 *
 * The consolidator relies on the fact that a cache's slabs are ordered by
 * usage. Each slab has a fixed number of objects. Depending on the slab's
 * "color" (the offset of the first object from the beginning of the slab;
 * offsets are staggered to mitigate false sharing of cache lines) it is either
 * the maximum number of objects per slab determined at cache creation time or
 * else the number closest to the maximum that fits within the space remaining
 * after the initial offset. A completely allocated slab may contribute some
 * internal fragmentation (per-slab overhead) but no external fragmentation, so
 * it is of no interest to the consolidator. At the other extreme, slabs whose
 * objects have all been freed to the slab are released to the virtual memory
 * (VM) subsystem (objects freed to magazines are still allocated as far as the
 * slab is concerned). External fragmentation exists when there are slabs
 * somewhere between these extremes. A partial slab has at least one but not
 * all of its objects allocated. The more partial slabs, and the fewer
 * allocated objects on each of them, the higher the fragmentation. Hence the
 * consolidator's overall strategy is to reduce the number of partial slabs by
 * moving allocated objects from the least allocated slabs to the most
 * allocated slabs.
 *
 * Partial slabs are kept in an AVL tree ordered by usage. Completely allocated
 * slabs are kept separately in an unordered list. Since the majority of slabs
 * tend to be completely allocated (a typical unfragmented cache may have
 * thousands of complete slabs and only a single partial slab), separating
 * complete slabs improves the efficiency of partial slab ordering, since the
 * complete slabs do not affect the depth or balance of the AVL tree. This
 * ordered sequence of partial slabs acts as a "free list" supplying objects
 * for allocation requests.
 *
 * Objects are always allocated from the first partial slab in the free list,
 * where the allocation is most likely to eliminate a partial slab (by
 * completely allocating it). Conversely, when a single object from a
 * completely allocated slab is freed to the slab, that slab is added to the
 * front of the free list.
 * Since most free list activity involves highly allocated slabs coming and
 * going at the front of the list, slabs tend naturally toward the ideal order:
 * highly allocated at the front, sparsely allocated at the back. Slabs with
 * few allocated objects are likely to become completely free if they keep a
 * safe distance away from the front of the free list. Slab misorders interfere
 * with the natural tendency of slabs to become completely free or completely
 * allocated. For example, a slab with a single allocated object needs only a
 * single free to escape the cache; its natural desire is frustrated when it
 * finds itself at the front of the list where a second allocation happens just
 * before the free could have released it. Another slab with all but one object
 * allocated might have supplied the buffer instead, so that both (as opposed
 * to neither) of the slabs would have been taken off the free list.
 *
 * Although slabs tend naturally toward the ideal order, misorders allowed by a
 * simple list implementation defeat the consolidator's strategy of merging
 * least- and most-allocated slabs. Without an AVL tree to guarantee order,
 * kmem needs another way to fix misorders to optimize its callback strategy.
 * One approach is to periodically scan a limited number of slabs, advancing a
 * marker to hold the current scan position, and to move extreme misorders to
 * the front or back of the free list and to the front or back of the current
 * scan range. By making consecutive scan ranges overlap by one slab, the least
 * allocated slab in the current range can be carried along from the end of one
 * scan to the start of the next.
 *
 * Maintaining partial slabs in an AVL tree relieves kmem of this additional
 * task, however. Since most of the cache's activity is in the magazine layer,
 * and allocations from the slab layer represent only a startup cost, the
 * overhead of maintaining a balanced tree is not a significant concern
 * compared to the opportunity of reducing complexity by eliminating the
 * partial slab scanner just described. The overhead of an AVL tree is
 * minimized by maintaining only partial slabs in the tree and keeping
 * completely allocated slabs separately in a list. To avoid increasing the
 * size of the slab structure the AVL linkage pointers are reused for the
 * slab's list linkage, since the slab will always be either partial or
 * complete, never stored both ways at the same time. To further minimize the
 * overhead of the AVL tree the compare function that orders partial slabs by
 * usage divides the range of allocated object counts into bins such that
 * counts within the same bin are considered equal.
 * Binning partial slabs makes it less likely that allocating or freeing a
 * single object will change the slab's order, requiring a tree reinsertion (an
 * avl_remove() followed by an avl_add(), both potentially requiring some
 * rebalancing of the tree). Allocation counts closest to completely free and
 * completely allocated are left unbinned (finely sorted) to better support the
 * consolidator's strategy of merging slabs at either extreme.
 *
 * 3.1 Assessing Fragmentation and Selecting Candidate Slabs
 *
 * The consolidator piggybacks on the kmem maintenance thread and is called on
 * the same interval as kmem_cache_update(), once per cache every fifteen
 * seconds. kmem maintains a running count of unallocated objects in the slab
 * layer (cache_bufslab). The consolidator checks whether that number exceeds
 * 12.5% (1/8) of the total objects in the cache (cache_buftotal), and whether
 * there is a significant number of slabs in the cache (arbitrarily a minimum
 * 101 total slabs). Unused objects that have fallen out of the magazine
 * layer's working set are included in the assessment, and magazines in the
 * depot are reaped if those objects would lift cache_bufslab above the
 * fragmentation threshold. Once the consolidator decides that a cache is
 * fragmented, it looks for a candidate slab to reclaim, starting at the end of
 * the partial slab free list and scanning backwards. At first the consolidator
 * is choosy: only a slab with fewer than 12.5% (1/8) of its objects allocated
 * qualifies (or else a single allocated object, regardless of percentage). If
 * there is difficulty finding a candidate slab, kmem raises the allocation
 * threshold incrementally, up to a maximum 87.5% (7/8), so that eventually the
 * consolidator will reduce external fragmentation (unused objects on the free
 * list) below 12.5% (1/8), even in the worst case of every slab in the cache
 * being almost 7/8 allocated. The threshold can also be lowered incrementally
 * when candidate slabs are easy to find, and the threshold is reset to the
 * minimum 1/8 as soon as the cache is no longer fragmented.
 *
 * 3.2 Generating Callbacks
 *
 * Once an eligible slab is chosen, a callback is generated for every allocated
 * object on the slab, in the hope that the client will move everything off the
 * slab and make it reclaimable. Objects selected as move destinations are
 * chosen from slabs at the front of the free list. Assuming slabs in the ideal
 * order (most allocated at the front, least allocated at the back) and a
 * cooperative client, the consolidator will succeed in removing slabs from
 * both ends of the free list, completely allocating on the one hand and
 * completely freeing on the other.
 * Objects selected as move destinations are allocated in the kmem maintenance
 * thread where move requests are enqueued. A separate callback thread removes
 * pending callbacks from the queue and calls the client. The separate thread
 * ensures that client code (the move function) does not interfere with
 * internal kmem maintenance tasks. A map of pending callbacks keyed by object
 * address (the object to be moved) is checked to ensure that duplicate
 * callbacks are not generated for the same object. Allocating the move
 * destination (the object to move to) prevents subsequent callbacks from
 * selecting the same destination as an earlier pending callback.
 *
 * Move requests can also be generated by kmem_cache_reap() when the system is
 * desperate for memory and by kmem_cache_move_notify(), called by the client
 * to notify kmem that a move refused earlier with KMEM_CBRC_LATER is now
 * possible. The map of pending callbacks is protected by the same lock that
 * protects the slab layer.
 *
 * When the system is desperate for memory, kmem does not bother to determine
 * whether or not the cache exceeds the fragmentation threshold, but tries to
 * consolidate as many slabs as possible. Normally, the consolidator chews
 * slowly, one sparsely allocated slab at a time during each maintenance
 * interval that the cache is fragmented. When desperate, the consolidator
 * starts at the last partial slab and enqueues callbacks for every allocated
 * object on every partial slab, working backwards until it reaches the first
 * partial slab. The first partial slab, meanwhile, advances in pace with the
 * consolidator as allocations to supply move destinations for the enqueued
 * callbacks use up the highly allocated slabs at the front of the free list.
 * Ideally, the overgrown free list collapses like an accordion, starting at
 * both ends and ending at the center with a single partial slab.
 *
 * 3.3 Client Responses
 *
 * When the client returns KMEM_CBRC_NO in response to the move callback, kmem
 * marks the slab that supplied the stuck object non-reclaimable and moves it
 * to the front of the free list. The slab remains marked as long as it remains
 * on the free list, and it appears more allocated to the partial slab compare
 * function than any unmarked slab, no matter how many of its objects are
 * allocated. Since even one immovable object ties up the entire slab, the goal
 * is to completely allocate any slab that cannot be completely freed. kmem
 * does not bother generating callbacks to move objects from a marked slab
 * unless the system is desperate.
 *
 * When the client responds KMEM_CBRC_LATER, kmem increments a count for the
 * slab. If the client responds LATER too many times, kmem disbelieves and
 * treats the response as a NO.
The count is cleared when the slab is taken off 810b5fca8f8Stomee * the partial slab list or when the client moves one of the slab's objects. 811b5fca8f8Stomee * 812b5fca8f8Stomee * 4. Observability 813b5fca8f8Stomee * 814b5fca8f8Stomee * A kmem cache's external fragmentation is best observed with 'mdb -k' using 815b5fca8f8Stomee * the ::kmem_slabs dcmd. For a complete description of the command, enter 816b5fca8f8Stomee * '::help kmem_slabs' at the mdb prompt. 8177c478bd9Sstevel@tonic-gate */ 8187c478bd9Sstevel@tonic-gate 8197c478bd9Sstevel@tonic-gate #include <sys/kmem_impl.h> 8207c478bd9Sstevel@tonic-gate #include <sys/vmem_impl.h> 8217c478bd9Sstevel@tonic-gate #include <sys/param.h> 8227c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h> 8237c478bd9Sstevel@tonic-gate #include <sys/vm.h> 8247c478bd9Sstevel@tonic-gate #include <sys/proc.h> 8257c478bd9Sstevel@tonic-gate #include <sys/tuneable.h> 8267c478bd9Sstevel@tonic-gate #include <sys/systm.h> 8277c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h> 8287c478bd9Sstevel@tonic-gate #include <sys/debug.h> 829b5fca8f8Stomee #include <sys/sdt.h> 8307c478bd9Sstevel@tonic-gate #include <sys/mutex.h> 8317c478bd9Sstevel@tonic-gate #include <sys/bitmap.h> 8327c478bd9Sstevel@tonic-gate #include <sys/atomic.h> 8337c478bd9Sstevel@tonic-gate #include <sys/kobj.h> 8347c478bd9Sstevel@tonic-gate #include <sys/disp.h> 8357c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h> 8367c478bd9Sstevel@tonic-gate #include <sys/log.h> 8377c478bd9Sstevel@tonic-gate #include <sys/callb.h> 8387c478bd9Sstevel@tonic-gate #include <sys/taskq.h> 8397c478bd9Sstevel@tonic-gate #include <sys/modctl.h> 8407c478bd9Sstevel@tonic-gate #include <sys/reboot.h> 8417c478bd9Sstevel@tonic-gate #include <sys/id32.h> 8427c478bd9Sstevel@tonic-gate #include <sys/zone.h> 843f4b3ec61Sdh #include <sys/netstack.h> 844b5fca8f8Stomee #ifdef DEBUG 845b5fca8f8Stomee #include <sys/random.h> 846b5fca8f8Stomee #endif 8477c478bd9Sstevel@tonic-gate 8487c478bd9Sstevel@tonic-gate extern void streams_msg_init(void); 8497c478bd9Sstevel@tonic-gate extern int segkp_fromheap; 8507c478bd9Sstevel@tonic-gate extern void segkp_cache_free(void); 8516e00b116SPeter Telford extern int callout_init_done; 8527c478bd9Sstevel@tonic-gate 8537c478bd9Sstevel@tonic-gate struct kmem_cache_kstat { 8547c478bd9Sstevel@tonic-gate kstat_named_t kmc_buf_size; 8557c478bd9Sstevel@tonic-gate kstat_named_t kmc_align; 8567c478bd9Sstevel@tonic-gate kstat_named_t kmc_chunk_size; 8577c478bd9Sstevel@tonic-gate kstat_named_t kmc_slab_size; 8587c478bd9Sstevel@tonic-gate kstat_named_t kmc_alloc; 8597c478bd9Sstevel@tonic-gate kstat_named_t kmc_alloc_fail; 8607c478bd9Sstevel@tonic-gate kstat_named_t kmc_free; 8617c478bd9Sstevel@tonic-gate kstat_named_t kmc_depot_alloc; 8627c478bd9Sstevel@tonic-gate kstat_named_t kmc_depot_free; 8637c478bd9Sstevel@tonic-gate kstat_named_t kmc_depot_contention; 8647c478bd9Sstevel@tonic-gate kstat_named_t kmc_slab_alloc; 8657c478bd9Sstevel@tonic-gate kstat_named_t kmc_slab_free; 8667c478bd9Sstevel@tonic-gate kstat_named_t kmc_buf_constructed; 8677c478bd9Sstevel@tonic-gate kstat_named_t kmc_buf_avail; 8687c478bd9Sstevel@tonic-gate kstat_named_t kmc_buf_inuse; 8697c478bd9Sstevel@tonic-gate kstat_named_t kmc_buf_total; 8707c478bd9Sstevel@tonic-gate kstat_named_t kmc_buf_max; 8717c478bd9Sstevel@tonic-gate kstat_named_t kmc_slab_create; 8727c478bd9Sstevel@tonic-gate kstat_named_t kmc_slab_destroy; 8737c478bd9Sstevel@tonic-gate kstat_named_t kmc_vmem_source; 8747c478bd9Sstevel@tonic-gate kstat_named_t kmc_hash_size; 
8757c478bd9Sstevel@tonic-gate kstat_named_t kmc_hash_lookup_depth; 8767c478bd9Sstevel@tonic-gate kstat_named_t kmc_hash_rescale; 8777c478bd9Sstevel@tonic-gate kstat_named_t kmc_full_magazines; 8787c478bd9Sstevel@tonic-gate kstat_named_t kmc_empty_magazines; 8797c478bd9Sstevel@tonic-gate kstat_named_t kmc_magazine_size; 880686031edSTom Erickson kstat_named_t kmc_reap; /* number of kmem_cache_reap() calls */ 881686031edSTom Erickson kstat_named_t kmc_defrag; /* attempts to defrag all partial slabs */ 882686031edSTom Erickson kstat_named_t kmc_scan; /* attempts to defrag one partial slab */ 883686031edSTom Erickson kstat_named_t kmc_move_callbacks; /* sum of yes, no, later, dn, dk */ 884b5fca8f8Stomee kstat_named_t kmc_move_yes; 885b5fca8f8Stomee kstat_named_t kmc_move_no; 886b5fca8f8Stomee kstat_named_t kmc_move_later; 887b5fca8f8Stomee kstat_named_t kmc_move_dont_need; 888686031edSTom Erickson kstat_named_t kmc_move_dont_know; /* obj unrecognized by client ... */ 889686031edSTom Erickson kstat_named_t kmc_move_hunt_found; /* ... but found in mag layer */ 890686031edSTom Erickson kstat_named_t kmc_move_slabs_freed; /* slabs freed by consolidator */ 891686031edSTom Erickson kstat_named_t kmc_move_reclaimable; /* buffers, if consolidator ran */ 8927c478bd9Sstevel@tonic-gate } kmem_cache_kstat = { 8937c478bd9Sstevel@tonic-gate { "buf_size", KSTAT_DATA_UINT64 }, 8947c478bd9Sstevel@tonic-gate { "align", KSTAT_DATA_UINT64 }, 8957c478bd9Sstevel@tonic-gate { "chunk_size", KSTAT_DATA_UINT64 }, 8967c478bd9Sstevel@tonic-gate { "slab_size", KSTAT_DATA_UINT64 }, 8977c478bd9Sstevel@tonic-gate { "alloc", KSTAT_DATA_UINT64 }, 8987c478bd9Sstevel@tonic-gate { "alloc_fail", KSTAT_DATA_UINT64 }, 8997c478bd9Sstevel@tonic-gate { "free", KSTAT_DATA_UINT64 }, 9007c478bd9Sstevel@tonic-gate { "depot_alloc", KSTAT_DATA_UINT64 }, 9017c478bd9Sstevel@tonic-gate { "depot_free", KSTAT_DATA_UINT64 }, 9027c478bd9Sstevel@tonic-gate { "depot_contention", KSTAT_DATA_UINT64 }, 9037c478bd9Sstevel@tonic-gate { "slab_alloc", KSTAT_DATA_UINT64 }, 9047c478bd9Sstevel@tonic-gate { "slab_free", KSTAT_DATA_UINT64 }, 9057c478bd9Sstevel@tonic-gate { "buf_constructed", KSTAT_DATA_UINT64 }, 9067c478bd9Sstevel@tonic-gate { "buf_avail", KSTAT_DATA_UINT64 }, 9077c478bd9Sstevel@tonic-gate { "buf_inuse", KSTAT_DATA_UINT64 }, 9087c478bd9Sstevel@tonic-gate { "buf_total", KSTAT_DATA_UINT64 }, 9097c478bd9Sstevel@tonic-gate { "buf_max", KSTAT_DATA_UINT64 }, 9107c478bd9Sstevel@tonic-gate { "slab_create", KSTAT_DATA_UINT64 }, 9117c478bd9Sstevel@tonic-gate { "slab_destroy", KSTAT_DATA_UINT64 }, 9127c478bd9Sstevel@tonic-gate { "vmem_source", KSTAT_DATA_UINT64 }, 9137c478bd9Sstevel@tonic-gate { "hash_size", KSTAT_DATA_UINT64 }, 9147c478bd9Sstevel@tonic-gate { "hash_lookup_depth", KSTAT_DATA_UINT64 }, 9157c478bd9Sstevel@tonic-gate { "hash_rescale", KSTAT_DATA_UINT64 }, 9167c478bd9Sstevel@tonic-gate { "full_magazines", KSTAT_DATA_UINT64 }, 9177c478bd9Sstevel@tonic-gate { "empty_magazines", KSTAT_DATA_UINT64 }, 9187c478bd9Sstevel@tonic-gate { "magazine_size", KSTAT_DATA_UINT64 }, 919686031edSTom Erickson { "reap", KSTAT_DATA_UINT64 }, 920686031edSTom Erickson { "defrag", KSTAT_DATA_UINT64 }, 921686031edSTom Erickson { "scan", KSTAT_DATA_UINT64 }, 922b5fca8f8Stomee { "move_callbacks", KSTAT_DATA_UINT64 }, 923b5fca8f8Stomee { "move_yes", KSTAT_DATA_UINT64 }, 924b5fca8f8Stomee { "move_no", KSTAT_DATA_UINT64 }, 925b5fca8f8Stomee { "move_later", KSTAT_DATA_UINT64 }, 926b5fca8f8Stomee { "move_dont_need", KSTAT_DATA_UINT64 }, 927b5fca8f8Stomee { "move_dont_know", 
KSTAT_DATA_UINT64 }, 928b5fca8f8Stomee { "move_hunt_found", KSTAT_DATA_UINT64 }, 929686031edSTom Erickson { "move_slabs_freed", KSTAT_DATA_UINT64 }, 930686031edSTom Erickson { "move_reclaimable", KSTAT_DATA_UINT64 }, 9317c478bd9Sstevel@tonic-gate }; 9327c478bd9Sstevel@tonic-gate 9337c478bd9Sstevel@tonic-gate static kmutex_t kmem_cache_kstat_lock; 9347c478bd9Sstevel@tonic-gate 9357c478bd9Sstevel@tonic-gate /* 9367c478bd9Sstevel@tonic-gate * The default set of caches to back kmem_alloc(). 9377c478bd9Sstevel@tonic-gate * These sizes should be reevaluated periodically. 9387c478bd9Sstevel@tonic-gate * 9397c478bd9Sstevel@tonic-gate * We want allocations that are multiples of the coherency granularity 9407c478bd9Sstevel@tonic-gate * (64 bytes) to be satisfied from a cache which is a multiple of 64 9417c478bd9Sstevel@tonic-gate * bytes, so that it will be 64-byte aligned. For all multiples of 64, 9427c478bd9Sstevel@tonic-gate * the next kmem_cache_size greater than or equal to it must be a 9437c478bd9Sstevel@tonic-gate * multiple of 64. 944dce01e3fSJonathan W Adams * 945dce01e3fSJonathan W Adams * We split the table into two sections: size <= 4k and size > 4k. This 946dce01e3fSJonathan W Adams * saves a lot of space and cache footprint in our cache tables. 9477c478bd9Sstevel@tonic-gate */ 9487c478bd9Sstevel@tonic-gate static const int kmem_alloc_sizes[] = { 9497c478bd9Sstevel@tonic-gate 1 * 8, 9507c478bd9Sstevel@tonic-gate 2 * 8, 9517c478bd9Sstevel@tonic-gate 3 * 8, 9527c478bd9Sstevel@tonic-gate 4 * 8, 5 * 8, 6 * 8, 7 * 8, 9537c478bd9Sstevel@tonic-gate 4 * 16, 5 * 16, 6 * 16, 7 * 16, 9547c478bd9Sstevel@tonic-gate 4 * 32, 5 * 32, 6 * 32, 7 * 32, 9557c478bd9Sstevel@tonic-gate 4 * 64, 5 * 64, 6 * 64, 7 * 64, 9567c478bd9Sstevel@tonic-gate 4 * 128, 5 * 128, 6 * 128, 7 * 128, 9577c478bd9Sstevel@tonic-gate P2ALIGN(8192 / 7, 64), 9587c478bd9Sstevel@tonic-gate P2ALIGN(8192 / 6, 64), 9597c478bd9Sstevel@tonic-gate P2ALIGN(8192 / 5, 64), 9607c478bd9Sstevel@tonic-gate P2ALIGN(8192 / 4, 64), 9617c478bd9Sstevel@tonic-gate P2ALIGN(8192 / 3, 64), 9627c478bd9Sstevel@tonic-gate P2ALIGN(8192 / 2, 64), 9637c478bd9Sstevel@tonic-gate }; 9647c478bd9Sstevel@tonic-gate 965dce01e3fSJonathan W Adams static const int kmem_big_alloc_sizes[] = { 966dce01e3fSJonathan W Adams 2 * 4096, 3 * 4096, 967dce01e3fSJonathan W Adams 2 * 8192, 3 * 8192, 968dce01e3fSJonathan W Adams 4 * 8192, 5 * 8192, 6 * 8192, 7 * 8192, 969dce01e3fSJonathan W Adams 8 * 8192, 9 * 8192, 10 * 8192, 11 * 8192, 970dce01e3fSJonathan W Adams 12 * 8192, 13 * 8192, 14 * 8192, 15 * 8192, 971dce01e3fSJonathan W Adams 16 * 8192 972dce01e3fSJonathan W Adams }; 973dce01e3fSJonathan W Adams 974dce01e3fSJonathan W Adams #define KMEM_MAXBUF 4096 975dce01e3fSJonathan W Adams #define KMEM_BIG_MAXBUF_32BIT 32768 976dce01e3fSJonathan W Adams #define KMEM_BIG_MAXBUF 131072 977dce01e3fSJonathan W Adams 978dce01e3fSJonathan W Adams #define KMEM_BIG_MULTIPLE 4096 /* big_alloc_sizes must be a multiple */ 979dce01e3fSJonathan W Adams #define KMEM_BIG_SHIFT 12 /* lg(KMEM_BIG_MULTIPLE) */ 9807c478bd9Sstevel@tonic-gate 9817c478bd9Sstevel@tonic-gate static kmem_cache_t *kmem_alloc_table[KMEM_MAXBUF >> KMEM_ALIGN_SHIFT]; 982dce01e3fSJonathan W Adams static kmem_cache_t *kmem_big_alloc_table[KMEM_BIG_MAXBUF >> KMEM_BIG_SHIFT]; 983dce01e3fSJonathan W Adams 984dce01e3fSJonathan W Adams #define KMEM_ALLOC_TABLE_MAX (KMEM_MAXBUF >> KMEM_ALIGN_SHIFT) 985dce01e3fSJonathan W Adams static size_t kmem_big_alloc_table_max = 0; /* # of filled elements */ 9867c478bd9Sstevel@tonic-gate 
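/*
 * For illustration, the size-to-cache lookup implied by the two tables above
 * is conceptually the following (a sketch of the fast path only; bookkeeping
 * and the debug paths are omitted):
 *
 *	size_t index;
 *	kmem_cache_t *cp;
 *
 *	if ((index = ((size - 1) >> KMEM_ALIGN_SHIFT)) < KMEM_ALLOC_TABLE_MAX)
 *		cp = kmem_alloc_table[index];
 *	else if ((index = ((size - 1) >> KMEM_BIG_SHIFT)) <
 *	    kmem_big_alloc_table_max)
 *		cp = kmem_big_alloc_table[index];
 *	else
 *		cp = NULL;	(request is satisfied from kmem_oversize_arena)
 */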
9877c478bd9Sstevel@tonic-gate static kmem_magtype_t kmem_magtype[] = { 9887c478bd9Sstevel@tonic-gate { 1, 8, 3200, 65536 }, 9897c478bd9Sstevel@tonic-gate { 3, 16, 256, 32768 }, 9907c478bd9Sstevel@tonic-gate { 7, 32, 64, 16384 }, 9917c478bd9Sstevel@tonic-gate { 15, 64, 0, 8192 }, 9927c478bd9Sstevel@tonic-gate { 31, 64, 0, 4096 }, 9937c478bd9Sstevel@tonic-gate { 47, 64, 0, 2048 }, 9947c478bd9Sstevel@tonic-gate { 63, 64, 0, 1024 }, 9957c478bd9Sstevel@tonic-gate { 95, 64, 0, 512 }, 9967c478bd9Sstevel@tonic-gate { 143, 64, 0, 0 }, 9977c478bd9Sstevel@tonic-gate }; 9987c478bd9Sstevel@tonic-gate 9997c478bd9Sstevel@tonic-gate static uint32_t kmem_reaping; 10007c478bd9Sstevel@tonic-gate static uint32_t kmem_reaping_idspace; 10017c478bd9Sstevel@tonic-gate 10027c478bd9Sstevel@tonic-gate /* 10037c478bd9Sstevel@tonic-gate * kmem tunables 10047c478bd9Sstevel@tonic-gate */ 10057c478bd9Sstevel@tonic-gate clock_t kmem_reap_interval; /* cache reaping rate [15 * HZ ticks] */ 10067c478bd9Sstevel@tonic-gate int kmem_depot_contention = 3; /* max failed tryenters per real interval */ 10077c478bd9Sstevel@tonic-gate pgcnt_t kmem_reapahead = 0; /* start reaping N pages before pageout */ 10087c478bd9Sstevel@tonic-gate int kmem_panic = 1; /* whether to panic on error */ 10097c478bd9Sstevel@tonic-gate int kmem_logging = 1; /* kmem_log_enter() override */ 10107c478bd9Sstevel@tonic-gate uint32_t kmem_mtbf = 0; /* mean time between failures [default: off] */ 10117c478bd9Sstevel@tonic-gate size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */ 10127c478bd9Sstevel@tonic-gate size_t kmem_content_log_size; /* content log size [2% of memory] */ 10137c478bd9Sstevel@tonic-gate size_t kmem_failure_log_size; /* failure log [4 pages per CPU] */ 10147c478bd9Sstevel@tonic-gate size_t kmem_slab_log_size; /* slab create log [4 pages per CPU] */ 1015d1580181SBryan Cantrill size_t kmem_zerosized_log_size; /* zero-sized log [4 pages per CPU] */ 10167c478bd9Sstevel@tonic-gate size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */ 10177c478bd9Sstevel@tonic-gate size_t kmem_lite_minsize = 0; /* minimum buffer size for KMF_LITE */ 10187c478bd9Sstevel@tonic-gate size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */ 10197c478bd9Sstevel@tonic-gate int kmem_lite_pcs = 4; /* number of PCs to store in KMF_LITE mode */ 10207c478bd9Sstevel@tonic-gate size_t kmem_maxverify; /* maximum bytes to inspect in debug routines */ 10217c478bd9Sstevel@tonic-gate size_t kmem_minfirewall; /* hardware-enforced redzone threshold */ 10227c478bd9Sstevel@tonic-gate 1023d1580181SBryan Cantrill #ifdef DEBUG 1024d1580181SBryan Cantrill int kmem_warn_zerosized = 1; /* whether to warn on zero-sized KM_SLEEP */ 1025d1580181SBryan Cantrill #else 1026d1580181SBryan Cantrill int kmem_warn_zerosized = 0; /* whether to warn on zero-sized KM_SLEEP */ 1027d1580181SBryan Cantrill #endif 1028d1580181SBryan Cantrill 1029d1580181SBryan Cantrill int kmem_panic_zerosized = 0; /* whether to panic on zero-sized KM_SLEEP */ 1030d1580181SBryan Cantrill 1031dce01e3fSJonathan W Adams #ifdef _LP64 1032dce01e3fSJonathan W Adams size_t kmem_max_cached = KMEM_BIG_MAXBUF; /* maximum kmem_alloc cache */ 1033dce01e3fSJonathan W Adams #else 1034dce01e3fSJonathan W Adams size_t kmem_max_cached = KMEM_BIG_MAXBUF_32BIT; /* maximum kmem_alloc cache */ 1035dce01e3fSJonathan W Adams #endif 1036dce01e3fSJonathan W Adams 10377c478bd9Sstevel@tonic-gate #ifdef DEBUG 10387c478bd9Sstevel@tonic-gate int kmem_flags = KMF_AUDIT | KMF_DEADBEEF | 
KMF_REDZONE | KMF_CONTENTS; 10397c478bd9Sstevel@tonic-gate #else 10407c478bd9Sstevel@tonic-gate int kmem_flags = 0; 10417c478bd9Sstevel@tonic-gate #endif 10427c478bd9Sstevel@tonic-gate int kmem_ready; 10437c478bd9Sstevel@tonic-gate 10447c478bd9Sstevel@tonic-gate static kmem_cache_t *kmem_slab_cache; 10457c478bd9Sstevel@tonic-gate static kmem_cache_t *kmem_bufctl_cache; 10467c478bd9Sstevel@tonic-gate static kmem_cache_t *kmem_bufctl_audit_cache; 10477c478bd9Sstevel@tonic-gate 10487c478bd9Sstevel@tonic-gate static kmutex_t kmem_cache_lock; /* inter-cache linkage only */ 1049b5fca8f8Stomee static list_t kmem_caches; 10507c478bd9Sstevel@tonic-gate 10517c478bd9Sstevel@tonic-gate static taskq_t *kmem_taskq; 10527c478bd9Sstevel@tonic-gate static kmutex_t kmem_flags_lock; 10537c478bd9Sstevel@tonic-gate static vmem_t *kmem_metadata_arena; 10547c478bd9Sstevel@tonic-gate static vmem_t *kmem_msb_arena; /* arena for metadata caches */ 10557c478bd9Sstevel@tonic-gate static vmem_t *kmem_cache_arena; 10567c478bd9Sstevel@tonic-gate static vmem_t *kmem_hash_arena; 10577c478bd9Sstevel@tonic-gate static vmem_t *kmem_log_arena; 10587c478bd9Sstevel@tonic-gate static vmem_t *kmem_oversize_arena; 10597c478bd9Sstevel@tonic-gate static vmem_t *kmem_va_arena; 10607c478bd9Sstevel@tonic-gate static vmem_t *kmem_default_arena; 10617c478bd9Sstevel@tonic-gate static vmem_t *kmem_firewall_va_arena; 10627c478bd9Sstevel@tonic-gate static vmem_t *kmem_firewall_arena; 10637c478bd9Sstevel@tonic-gate 1064d1580181SBryan Cantrill static int kmem_zerosized; /* # of zero-sized allocs */ 1065d1580181SBryan Cantrill 1066b5fca8f8Stomee /* 1067b5fca8f8Stomee * kmem slab consolidator thresholds (tunables) 1068b5fca8f8Stomee */ 1069686031edSTom Erickson size_t kmem_frag_minslabs = 101; /* minimum total slabs */ 1070686031edSTom Erickson size_t kmem_frag_numer = 1; /* free buffers (numerator) */ 1071686031edSTom Erickson size_t kmem_frag_denom = KMEM_VOID_FRACTION; /* buffers (denominator) */ 1072b5fca8f8Stomee /* 1073b5fca8f8Stomee * Maximum number of slabs from which to move buffers during a single 1074b5fca8f8Stomee * maintenance interval while the system is not low on memory. 1075b5fca8f8Stomee */ 1076686031edSTom Erickson size_t kmem_reclaim_max_slabs = 1; 1077b5fca8f8Stomee /* 1078b5fca8f8Stomee * Number of slabs to scan backwards from the end of the partial slab list 1079b5fca8f8Stomee * when searching for buffers to relocate. 1080b5fca8f8Stomee */ 1081686031edSTom Erickson size_t kmem_reclaim_scan_range = 12; 1082b5fca8f8Stomee 1083b5fca8f8Stomee /* consolidator knobs */ 1084929d5b43SMatthew Ahrens boolean_t kmem_move_noreap; 1085929d5b43SMatthew Ahrens boolean_t kmem_move_blocked; 1086929d5b43SMatthew Ahrens boolean_t kmem_move_fulltilt; 1087929d5b43SMatthew Ahrens boolean_t kmem_move_any_partial; 1088b5fca8f8Stomee 1089b5fca8f8Stomee #ifdef DEBUG 1090b5fca8f8Stomee /* 1091686031edSTom Erickson * kmem consolidator debug tunables: 1092b5fca8f8Stomee * Ensure code coverage by occasionally running the consolidator even when the 1093b5fca8f8Stomee * caches are not fragmented (they may never be). These intervals are mean time 1094b5fca8f8Stomee * in cache maintenance intervals (kmem_cache_update). 1095b5fca8f8Stomee */ 1096686031edSTom Erickson uint32_t kmem_mtb_move = 60; /* defrag 1 slab (~15min) */ 1097686031edSTom Erickson uint32_t kmem_mtb_reap = 1800; /* defrag all slabs (~7.5hrs) */ 1098b5fca8f8Stomee #endif /* DEBUG */ 1099
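/*
 * Taken together, the consolidator tunables above mean that a cache is
 * considered fragmented when its free buffers exceed
 * kmem_frag_numer / kmem_frag_denom (1/8 by default) of cache_buftotal and
 * the cache holds at least kmem_frag_minslabs slabs; as a sketch (not the
 * exact code):
 *
 *	fragmented = (cp->cache_bufslab * kmem_frag_denom >
 *	    cp->cache_buftotal * kmem_frag_numer) &&
 *	    (total slab count >= kmem_frag_minslabs);
 *
 * The DEBUG-only kmem_mtb_move and kmem_mtb_reap values are mean times
 * expressed in 15-second maintenance intervals: 60 * 15s = 900s (about 15
 * minutes) and 1800 * 15s = 27000s (about 7.5 hours), matching the
 * annotations above.
 */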