/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <mtmalloc.h>
#include "mtmalloc_impl.h"
#include <unistd.h>
#include <synch.h>
#include <thread.h>
#include <pthread.h>
#include <stdio.h>
#include <limits.h>
#include <errno.h>
#include <string.h>
#include <strings.h>
#include <sys/param.h>
#include <sys/sysmacros.h>

/*
 * To turn on the asserts just compile -DDEBUG
 */

#ifndef	DEBUG
#define	NDEBUG
#endif

#include <assert.h>

/*
 * The MT hot malloc implementation contained herein is designed to be
 * plug-compatible with the libc version of malloc. It is not intended
 * to replace that implementation until we decide that it is ok to break
 * customer apps (Solaris 3.0).
 *
 * For requests up to 2^16, the allocator initializes itself into NCPUS
 * worth of chains of caches. When a memory request is made, the calling
 * thread is vectored into one of NCPUS worth of caches.  The LWP id gives
 * us a cheap, contention-reducing index to use; eventually, this should be
 * replaced with the actual CPU sequence number, when an interface to get
 * it is available.
 *
 * Once the thread is vectored into one of the lists of caches, the real
 * allocation of the memory begins. The request size determines which
 * bucket the allocation should be satisfied from. The management of free
 * buckets is done via a bitmask. A free bucket is represented by a 1. The
 * first free bit represents the first free bucket. The position of the bit
 * represents the position of the bucket in the arena.
 *
 * When the memory from the arena is handed out, the address of the cache
 * control structure is written in the word preceding the returned memory.
 * This cache control address is used during free() to mark the buffer free
 * in the cache control structure.
 *
 * When all available memory in a cache has been depleted, a new chunk of
 * memory is allocated via sbrk(). The new cache is allocated from this
 * chunk of memory and initialized in the function create_cache(). New
 * caches are installed at the front of a singly linked list of caches of
 * the same buffer size. This helps to ensure that there will tend to be
 * available memory at the beginning of the list.
 *
 * Long linked lists hurt performance. To decrease this effect, there is a
 * tunable, requestsize, that bumps up the sbrk allocation size and thus
 * increases the number of available blocks within an arena.  We also keep
 * a "hint" for each cache list, which is the last cache in the list
 * allocated from.  This lowers the cost of searching if there are a lot
 * of fully allocated blocks at the front of the list.
 *
 * For requests greater than 2^16 (oversize allocations), there are two
 * pieces of overhead. There is the OVERHEAD used to hold the cache addr
 * (&oversize_list), plus an oversize_t structure to further describe the
 * block.
 *
 * The oversize list is kept as defragmented as possible by coalescing
 * freed oversized allocations with adjacent neighbors.
 *
 * Addresses handed out are stored in a hash table, and are aligned on
 * MTMALLOC_MIN_ALIGN-byte boundaries at both ends. Request sizes are
 * rounded up where necessary in order to achieve this. This eases the
 * implementation of MTDEBUGPATTERN and MTINITPATTERN, particularly where
 * coalescing occurs.
 *
 * A memalign allocation takes memalign header overhead.  There are two
 * types of memalign headers, distinguished by MTMALLOC_MEMALIGN_MAGIC
 * and MTMALLOC_MEMALIGN_MIN_MAGIC.  When the distance from the malloc'ed
 * address to the aligned address is exactly the minimum OVERHEAD, we
 * create a header taking only one OVERHEAD space with the magic number
 * MTMALLOC_MEMALIGN_MIN_MAGIC; subtracting OVERHEAD from the memaligned
 * address then gets us back to the malloc'ed address. Otherwise, we create
 * a memalign header taking two OVERHEAD spaces: one stores the
 * MTMALLOC_MEMALIGN_MAGIC magic number, and the other points back to the
 * malloc'ed address.
 */
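
/*
 * Illustrative sketch (not normative): a small cached buffer handed out by
 * malloc() carries OVERHEAD bytes of metadata immediately preceding the
 * address returned to the caller:
 *
 *	+----------------------+----------------------------------+
 *	| cache_t * (OVERHEAD) | caller-visible data              |
 *	+----------------------+----------------------------------+
 *	^                      ^
 *	start of buffer        pointer returned to the caller
 *
 * For oversize allocations that word instead holds MTMALLOC_OVERSIZE_MAGIC
 * and is itself preceded by an oversize_t header.
 */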

#if defined(__i386) || defined(__amd64)
#include <arpa/inet.h>	/* for htonl() */
#endif

static void * morecore(size_t);
static void create_cache(cache_t *, size_t bufsize, uint_t hunks);
static void * malloc_internal(size_t, percpu_t *);
static void * oversize(size_t);
static oversize_t *find_oversize(size_t);
static void add_oversize(oversize_t *);
static void copy_pattern(uint32_t, void *, size_t);
static void * verify_pattern(uint32_t, void *, size_t);
static void reinit_cpu_list(void);
static void reinit_cache(cache_t *);
static void free_oversize(oversize_t *);
static oversize_t *oversize_header_alloc(uintptr_t, size_t);

/*
 * oversize hash table stuff
 */
#define	NUM_BUCKETS	67	/* must be prime */
#define	HASH_OVERSIZE(caddr)	((uintptr_t)(caddr) % NUM_BUCKETS)
oversize_t *ovsz_hashtab[NUM_BUCKETS];

#define	ALIGN(x, a)	((((uintptr_t)(x) + ((uintptr_t)(a) - 1)) \
			& ~((uintptr_t)(a) - 1)))

/* need this to deal with the little-endianness of x86 */
#if defined(__i386) || defined(__amd64)
#define	FLIP_EM(x)	htonl((x))
#else
#define	FLIP_EM(x)	(x)
#endif

#define	INSERT_ONLY			0
#define	COALESCE_LEFT			0x00000001
#define	COALESCE_RIGHT			0x00000002
#define	COALESCE_WITH_BOTH_SIDES	(COALESCE_LEFT | COALESCE_RIGHT)

#define	OVERHEAD	8	/* size needed to write cache addr */
#define	HUNKSIZE	8192	/* just a multiplier */

#define	MAX_CACHED_SHIFT	16	/* 64K is the max cached size */
#define	MAX_CACHED		(1 << MAX_CACHED_SHIFT)
#define	MIN_CACHED_SHIFT	4	/* smaller requests rounded up */
#define	MTMALLOC_MIN_ALIGN	8	/* min guaranteed alignment */

/* maximum size before overflow */
#define	MAX_MTMALLOC	(SIZE_MAX - (SIZE_MAX % MTMALLOC_MIN_ALIGN) \
			- OVSZ_HEADER_SIZE)

#define	NUM_CACHES	(MAX_CACHED_SHIFT - MIN_CACHED_SHIFT + 1)
#define	CACHELIST_SIZE	ALIGN(NUM_CACHES * sizeof (cache_head_t), \
    CACHE_COHERENCY_UNIT)

#define	MINSIZE		9	/* for requestsize, tunable */
#define	MAXSIZE		256	/* arbitrary, big enough, for requestsize */

#define	FREEPATTERN	0xdeadbeef /* debug fill pattern for free buf */
#define	INITPATTERN	0xbaddcafe /* debug fill pattern for new buf */

#define	misaligned(p)	((unsigned)(p) & (sizeof (int) - 1))
#define	IS_OVERSIZE(x, y)	(((x) < (y)) && (((x) > MAX_CACHED)? 1 : 0))

static long requestsize = MINSIZE; /* 9 pages per cache; tunable; 9 is min */

static uint_t cpu_mask;
static curcpu_func curcpu;

static int32_t debugopt;
static int32_t reinit;

static percpu_t *cpu_list;
static oversize_t oversize_list;
static mutex_t oversize_lock = DEFAULTMUTEX;

static int ncpus = 0;

#define	MTMALLOC_OVERSIZE_MAGIC		((uintptr_t)&oversize_list)
#define	MTMALLOC_MEMALIGN_MAGIC		((uintptr_t)&oversize_list + 1)
#define	MTMALLOC_MEMALIGN_MIN_MAGIC	((uintptr_t)&oversize_list + 2)

/*
 * We require allocations handed out to be aligned on MTMALLOC_MIN_ALIGN-byte
 * boundaries. We round up sizeof (oversize_t) (when necessary) to ensure that
 * this is achieved.
 */
#define	OVSZ_SIZE		(ALIGN(sizeof (oversize_t), MTMALLOC_MIN_ALIGN))
#define	OVSZ_HEADER_SIZE	(OVSZ_SIZE + OVERHEAD)

/*
 * A memalign header takes two OVERHEAD words: one holds the memalign magic
 * number, and the other points back to the start address of the originally
 * allocated space.
 */
#define	MEMALIGN_HEADER_SIZE	(2 * OVERHEAD)
#define	MEMALIGN_HEADER_ALLOC(x, shift, malloc_addr)\
	if (shift == OVERHEAD)\
		*((uintptr_t *)((caddr_t)x - OVERHEAD)) = \
			MTMALLOC_MEMALIGN_MIN_MAGIC; \
	else {\
		*((uintptr_t *)((caddr_t)x - OVERHEAD)) = \
			MTMALLOC_MEMALIGN_MAGIC; \
		*((uintptr_t *)((caddr_t)x - 2 * OVERHEAD)) = \
			(uintptr_t)malloc_addr; \
	}
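
/*
 * A rough sketch of the two header forms laid down by
 * MEMALIGN_HEADER_ALLOC() above (addresses grow to the right, each cell is
 * one OVERHEAD-sized word):
 *
 * shift == OVERHEAD:
 *	... | MTMALLOC_MEMALIGN_MIN_MAGIC | aligned buffer ...
 *
 * shift > OVERHEAD:
 *	... | malloc'ed address | MTMALLOC_MEMALIGN_MAGIC | aligned buffer ...
 */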

/*
 * Add big to the oversize hash table at the head of the relevant bucket.
 */
static void
insert_hash(oversize_t *big)
{
	caddr_t ret = big->addr;
	int bucket = HASH_OVERSIZE(ret);

	assert(MUTEX_HELD(&oversize_lock));
	big->hash_next = ovsz_hashtab[bucket];
	ovsz_hashtab[bucket] = big;
}

void *
malloc(size_t bytes)
{
	percpu_t *list_rotor;
	uint_t	list_index;

	if (bytes > MAX_CACHED)
		return (oversize(bytes));

	list_index = (curcpu() & cpu_mask);

	list_rotor = &cpu_list[list_index];

	return (malloc_internal(bytes, list_rotor));
}

void *
realloc(void * ptr, size_t bytes)
{
	void *new, *data_ptr;
	cache_t *cacheptr;
	caddr_t mem;
	size_t shift = 0;

	if (ptr == NULL)
		return (malloc(bytes));

	if (bytes == 0) {
		free(ptr);
		return (NULL);
	}

	data_ptr = ptr;
	mem = (caddr_t)ptr - OVERHEAD;

	/*
	 * Optimization possibility:
	 *	p = malloc(64);
	 *	q = realloc(p, 64);
	 * q can be the same as p.
	 * Apply this optimization only to the normal
	 * sized caches for now.
	 */
	if (*(uintptr_t *)mem < MTMALLOC_OVERSIZE_MAGIC ||
	    *(uintptr_t *)mem > MTMALLOC_MEMALIGN_MIN_MAGIC) {
		cacheptr = (cache_t *)*(uintptr_t *)mem;
		if (bytes <= (cacheptr->mt_size - OVERHEAD))
			return (ptr);
	}

	new = malloc(bytes);

	if (new == NULL)
		return (NULL);

	/*
	 * If new == ptr, ptr has previously been freed. Passing a freed
	 * pointer to realloc() is not allowed unless the caller specifically
	 * permits double frees (MTDOUBLEFREE), in which case we must avoid
	 * freeing ptr (ie new) before we return new. There is (obviously) no
	 * requirement to memcpy() ptr to new before we return.
	 */
	if (new == ptr) {
		if (!(debugopt & MTDOUBLEFREE))
			abort();
		return (new);
	}

	if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MAGIC) {
		mem -= OVERHEAD;
		ptr = (void *)*(uintptr_t *)mem;
		mem = (caddr_t)ptr - OVERHEAD;
		shift = (size_t)((uintptr_t)data_ptr - (uintptr_t)ptr);
	} else if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MIN_MAGIC) {
		ptr = (void *) mem;
		mem -= OVERHEAD;
		shift = OVERHEAD;
	}

	if (*(uintptr_t *)mem == MTMALLOC_OVERSIZE_MAGIC) {
		oversize_t *old;

		old = (oversize_t *)(mem - OVSZ_SIZE);
		(void) memcpy(new, data_ptr, MIN(bytes, old->size - shift));
		free(ptr);
		return (new);
	}

	cacheptr = (cache_t *)*(uintptr_t *)mem;

	(void) memcpy(new, data_ptr,
	    MIN(cacheptr->mt_size - OVERHEAD - shift, bytes));
	free(ptr);

	return (new);
}

void *
calloc(size_t nelem, size_t bytes)
{
	void * ptr;
	size_t size;

	if (nelem == 0 || bytes == 0) {
		size = 0;
	} else {
		size = nelem * bytes;

		/* check for overflow */
		if ((size / nelem) != bytes) {
			errno = ENOMEM;
			return (NULL);
		}
	}

	ptr = malloc(size);
	if (ptr == NULL)
		return (NULL);
	(void) memset(ptr, 0, size);

	return (ptr);
}

void
free(void * ptr)
{
	cache_t *cacheptr;
	caddr_t mem;
	int32_t i;
	caddr_t freeblocks;
	uintptr_t offset;
	uchar_t mask;
	int32_t which_bit, num_bytes;

	if (ptr == NULL)
		return;

	mem = (caddr_t)ptr - OVERHEAD;

	if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MAGIC) {
		mem -= OVERHEAD;
		ptr = (void *)*(uintptr_t *)mem;
		mem = (caddr_t)ptr - OVERHEAD;
	} else if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MIN_MAGIC) {
		ptr = (void *) mem;
		mem -= OVERHEAD;
	}

	if (*(uintptr_t *)mem == MTMALLOC_OVERSIZE_MAGIC) {
		oversize_t *big, **opp;
		int bucket;

		big = (oversize_t *)(mem - OVSZ_SIZE);
		(void) mutex_lock(&oversize_lock);

		bucket = HASH_OVERSIZE(big->addr);
		for (opp = &ovsz_hashtab[bucket]; *opp != NULL;
		    opp = &(*opp)->hash_next)
			if (*opp == big)
				break;

		if (*opp == NULL) {
			if (!(debugopt & MTDOUBLEFREE))
				abort();
			(void) mutex_unlock(&oversize_lock);
			return;
		}

		*opp = big->hash_next;	/* remove big from the hash table */
		big->hash_next = NULL;

		if (debugopt & MTDEBUGPATTERN)
			copy_pattern(FREEPATTERN, ptr, big->size);
		add_oversize(big);
		(void) mutex_unlock(&oversize_lock);
		return;
	}

	cacheptr = (cache_t *)*(uintptr_t *)mem;
	freeblocks = cacheptr->mt_freelist;

	/*
	 * offset is the byte distance into the arena.  There is a 1-1
	 * correlation between the distance into the arena and the
	 * distance into the freelist bitmask.
	 */
	offset = mem - cacheptr->mt_arena;

	/*
	 * i is the total number of bits to offset into the freelist bitmask.
	 */

	i = offset / cacheptr->mt_size;

	num_bytes = i >> 3;

	/*
	 * which_bit is the bit offset into the byte in the freelist.
	 * If our freelist bitmask looks like 0xf3 and we are freeing
	 * block 5 (i.e., the 6th block), our mask will be 0xf7 after
	 * the free. Things go left to right, which is why the mask is
	 * 0x80 and not 0x01.
	 */
	which_bit = i - (num_bytes << 3);

	mask = 0x80 >> which_bit;

	freeblocks += num_bytes;

	if (debugopt & MTDEBUGPATTERN)
		copy_pattern(FREEPATTERN, ptr, cacheptr->mt_size - OVERHEAD);

	(void) mutex_lock(&cacheptr->mt_cache_lock);

	if (*freeblocks & mask) {
		if (!(debugopt & MTDOUBLEFREE))
			abort();
	} else {
		*freeblocks |= mask;
		cacheptr->mt_nfree++;
	}

	(void) mutex_unlock(&cacheptr->mt_cache_lock);
}

void *
memalign(size_t alignment, size_t size)
{
	size_t alloc_size;
	uintptr_t offset;
	void *alloc_buf;
	void *ret_buf;

	if (size == 0 || alignment == 0 || misaligned(alignment) ||
	    (alignment & (alignment - 1)) != 0) {
		errno = EINVAL;
		return (NULL);
	}

	/* <= MTMALLOC_MIN_ALIGN, malloc can provide directly */
	if (alignment <= MTMALLOC_MIN_ALIGN)
		return (malloc(size));

	alloc_size = size + alignment - MTMALLOC_MIN_ALIGN;

	if (alloc_size < size) { /* overflow */
		errno = ENOMEM;
		return (NULL);
	}

	alloc_buf = malloc(alloc_size);

	if (alloc_buf == NULL)
		/* malloc sets errno */
		return (NULL);

	/*
	 * If alloc_size > MAX_CACHED, malloc() will have returned a multiple of
	 * MTMALLOC_MIN_ALIGN, having rounded-up alloc_size if necessary. Since
	 * we will use alloc_size to return the excess fragments to the free
	 * list, we also round-up alloc_size if necessary.
	 */
	if ((alloc_size > MAX_CACHED) &&
	    (alloc_size & (MTMALLOC_MIN_ALIGN - 1)))
		alloc_size = ALIGN(alloc_size, MTMALLOC_MIN_ALIGN);

	if ((offset = (uintptr_t)alloc_buf & (alignment - 1)) == 0) {
		/* aligned correctly */

		size_t frag_size = alloc_size -
		    (size + MTMALLOC_MIN_ALIGN + OVSZ_HEADER_SIZE);

		/*
		 * If the leftover piece of the memory > MAX_CACHED,
		 * split off the piece and return it back to the freelist.
		 */
		if (IS_OVERSIZE(frag_size, alloc_size)) {
			oversize_t *orig, *tail;
			uintptr_t taddr;
			size_t data_size;
			taddr = ALIGN((uintptr_t)alloc_buf + size,
			    MTMALLOC_MIN_ALIGN);
			data_size = taddr - (uintptr_t)alloc_buf;
			orig = (oversize_t *)((uintptr_t)alloc_buf -
			    OVSZ_HEADER_SIZE);
			frag_size = orig->size - data_size -
			    OVSZ_HEADER_SIZE;
			orig->size = data_size;
			tail = oversize_header_alloc(taddr, frag_size);
			free_oversize(tail);
		}
		ret_buf = alloc_buf;
	} else {
		uchar_t	oversize_bits = 0;
		size_t	head_sz, data_sz, tail_sz;
		uintptr_t ret_addr, taddr, shift, tshift;
		oversize_t *orig, *tail, *big;
		size_t tsize;

		/* needs to be aligned */
		shift = alignment - offset;

		assert(shift >= MTMALLOC_MIN_ALIGN);

		ret_addr = ((uintptr_t)alloc_buf + shift);
		ret_buf = (void *)ret_addr;

		if (alloc_size <= MAX_CACHED) {
			MEMALIGN_HEADER_ALLOC(ret_addr, shift, alloc_buf);
			return (ret_buf);
		}

		/*
		 * Only check for the fragments when the memory is allocated
		 * from oversize_list.  Split off a fragment and return it
		 * to the oversize freelist when it's > MAX_CACHED.
		 */

		head_sz = shift - MAX(MEMALIGN_HEADER_SIZE, OVSZ_HEADER_SIZE);

		tail_sz = alloc_size -
		    (shift + size + MTMALLOC_MIN_ALIGN + OVSZ_HEADER_SIZE);

		oversize_bits |= IS_OVERSIZE(head_sz, alloc_size) |
		    IS_OVERSIZE(size, alloc_size) << DATA_SHIFT |
		    IS_OVERSIZE(tail_sz, alloc_size) << TAIL_SHIFT;

		switch (oversize_bits) {
			case NONE_OVERSIZE:
			case DATA_OVERSIZE:
				MEMALIGN_HEADER_ALLOC(ret_addr, shift,
				    alloc_buf);
				break;
			case HEAD_OVERSIZE:
				/*
				 * If we can extend the data to > MAX_CACHED
				 * and the head is still > MAX_CACHED, we
				 * split the head end as in the head-and-data
				 * oversized case; otherwise just create a
				 * memalign header.
				 */
				tsize = (shift + size) - (MAX_CACHED + 8 +
				    MTMALLOC_MIN_ALIGN + OVSZ_HEADER_SIZE);

				if (!IS_OVERSIZE(tsize, alloc_size)) {
					MEMALIGN_HEADER_ALLOC(ret_addr, shift,
					    alloc_buf);
					break;
				} else {
					tsize += OVSZ_HEADER_SIZE;
					taddr = ALIGN((uintptr_t)alloc_buf +
					    tsize, MTMALLOC_MIN_ALIGN);
					tshift = ret_addr - taddr;
					MEMALIGN_HEADER_ALLOC(ret_addr, tshift,
					    taddr);
					ret_addr = taddr;
					shift = ret_addr - (uintptr_t)alloc_buf;
				}
				/* FALLTHROUGH */
			case HEAD_AND_DATA_OVERSIZE:
				/*
				 * Split off the head fragment and
				 * return it to the oversize freelist.
				 * Create an oversize header for the
				 * (data + tail fragment) piece.
				 */
				orig = (oversize_t *)((uintptr_t)alloc_buf -
				    OVSZ_HEADER_SIZE);
				big = oversize_header_alloc(ret_addr -
				    OVSZ_HEADER_SIZE, (orig->size - shift));
				(void) mutex_lock(&oversize_lock);
				insert_hash(big);
				(void) mutex_unlock(&oversize_lock);
				orig->size = shift - OVSZ_HEADER_SIZE;

				/* free up the head fragment */
				free_oversize(orig);
				break;
			case TAIL_OVERSIZE:
				/*
				 * If we can extend the data to > MAX_CACHED
				 * and the tail end is still > MAX_CACHED, we
				 * split the tail end; otherwise just create
				 * a memalign header.
				 */
				orig = (oversize_t *)((uintptr_t)alloc_buf -
				    OVSZ_HEADER_SIZE);
				tsize =  orig->size - (MAX_CACHED + 8 +
				    shift + OVSZ_HEADER_SIZE +
				    MTMALLOC_MIN_ALIGN);
				if (!IS_OVERSIZE(tsize, alloc_size)) {
					MEMALIGN_HEADER_ALLOC(ret_addr, shift,
					    alloc_buf);
					break;
				} else {
					size = MAX_CACHED + 8;
				}
				/* FALLTHROUGH */
			case DATA_AND_TAIL_OVERSIZE:
				/*
				 * Split off the tail fragment and
				 * return it to the oversize freelist.
				 * Create a memalign header and adjust
				 * the size for the
				 * (head fragment + data) piece.
				 */
				taddr = ALIGN(ret_addr + size,
				    MTMALLOC_MIN_ALIGN);
				data_sz = (size_t)(taddr -
				    (uintptr_t)alloc_buf);
				orig = (oversize_t *)((uintptr_t)alloc_buf -
				    OVSZ_HEADER_SIZE);
				tsize = orig->size - data_sz;
				orig->size = data_sz;
				MEMALIGN_HEADER_ALLOC(ret_buf, shift,
				    alloc_buf);
				tsize -= OVSZ_HEADER_SIZE;
				tail = oversize_header_alloc(taddr,  tsize);
				free_oversize(tail);
				break;
			case HEAD_AND_TAIL_OVERSIZE:
				/*
				 * Split off the head fragment.
				 * We try to free up the tail end when we
				 * can extend the data size to
				 * (MAX_CACHED + 8) and the tail end remains
				 * oversized. The bottom line is that all
				 * split pieces must be oversize.
				 */
				orig = (oversize_t *)((uintptr_t)alloc_buf -
				    OVSZ_HEADER_SIZE);
				tsize =  orig->size - (MAX_CACHED + 8 +
				    OVSZ_HEADER_SIZE + shift +
				    MTMALLOC_MIN_ALIGN);

				if (!IS_OVERSIZE(tsize, alloc_size)) {
					/*
					 * If the chunk is not big enough
					 * to make both data and tail
					 * oversize, we just keep them as
					 * one piece.
					 */
					big = oversize_header_alloc(ret_addr -
					    OVSZ_HEADER_SIZE,
					    orig->size - shift);
					(void) mutex_lock(&oversize_lock);
					insert_hash(big);
					(void) mutex_unlock(&oversize_lock);
					orig->size = shift - OVSZ_HEADER_SIZE;
					free_oversize(orig);
					break;
				} else {
					/*
					 * Extend the data size to
					 * > MAX_CACHED and handle it as if
					 * head, data, and tail are all
					 * oversized.
					 */
					size = MAX_CACHED + 8;
				}
				/* FALLTHROUGH */
			case ALL_OVERSIZE:
				/*
				 * Split off the head and tail fragments and
				 * return them to the oversize freelist.
				 * Allocate an oversize header for the data
				 * segment.
				 */
				orig = (oversize_t *)((uintptr_t)alloc_buf -
				    OVSZ_HEADER_SIZE);
				tsize = orig->size;
				orig->size = shift - OVSZ_HEADER_SIZE;
				free_oversize(orig);

				taddr = ALIGN(ret_addr + size,
				    MTMALLOC_MIN_ALIGN);
				data_sz = taddr - ret_addr;
				assert(tsize > (shift + data_sz +
				    OVSZ_HEADER_SIZE));
				tail_sz = tsize -
				    (shift + data_sz + OVSZ_HEADER_SIZE);

				/* create oversize header for data seg */
				big = oversize_header_alloc(ret_addr -
				    OVSZ_HEADER_SIZE, data_sz);
				(void) mutex_lock(&oversize_lock);
				insert_hash(big);
				(void) mutex_unlock(&oversize_lock);

				/* create oversize header for tail fragment */
				tail = oversize_header_alloc(taddr, tail_sz);
				free_oversize(tail);
				break;
			default:
				/* should not reach here */
				assert(0);
		}
	}
	return (ret_buf);
}


void *
valloc(size_t size)
{
	static unsigned pagesize;

	if (size == 0)
		return (NULL);

	if (!pagesize)
		pagesize = sysconf(_SC_PAGESIZE);

	return (memalign(pagesize, size));
}

void
mallocctl(int cmd, long value)
{
	switch (cmd) {

	case MTDEBUGPATTERN:
		/*
		 * Reinitialize free blocks in case malloc() is called prior
		 * to mallocctl().
		 */
		if (value && !(debugopt & cmd)) {
			reinit++;
			debugopt |= cmd;
			reinit_cpu_list();
		}
		/*FALLTHRU*/
	case MTDOUBLEFREE:
	case MTINITBUFFER:
		if (value)
			debugopt |= cmd;
		else
			debugopt &= ~cmd;
		break;
	case MTCHUNKSIZE:
		if (value >= MINSIZE && value <= MAXSIZE)
			requestsize = value;
		break;
	default:
		break;
	}
}
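
/*
 * Illustrative example (values are arbitrary, not requirements): an
 * application linked with this library can tune the allocator through the
 * interfaces declared in <mtmalloc.h>, e.g.:
 *
 *	mallocctl(MTDEBUGPATTERN, 1);	write 0xdeadbeef over freed buffers
 *	mallocctl(MTINITBUFFER, 1);	write 0xbaddcafe over new buffers
 *	mallocctl(MTDOUBLEFREE, 1);	tolerate double frees (no abort())
 *	mallocctl(MTCHUNKSIZE, 64);	sbrk 64 * HUNKSIZE bytes per new chunk
 */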

/*
 * Initialization function, called from the init section of the library.
 * No locking is required here because we are single-threaded during
 * library initialization.
 */
static void
setup_caches(void)
{
	uintptr_t oldbrk;
	uintptr_t newbrk;

	size_t cache_space_needed;
	size_t padding;

	curcpu_func new_curcpu;
	uint_t new_cpu_mask;
	percpu_t *new_cpu_list;

	uint_t i, j;
	uintptr_t list_addr;

	/*
	 * Get a decent "current cpu identifier", to be used to reduce
	 * contention.  Eventually, this should be replaced by an interface
	 * to get the actual CPU sequence number in libthread/liblwp.
	 */
	new_curcpu = (curcpu_func)thr_self;
	if ((ncpus = 2 * sysconf(_SC_NPROCESSORS_CONF)) <= 0)
		ncpus = 4; /* decent default value */

	/* round ncpus up to a power of 2 */
	while (ncpus & (ncpus - 1))
		ncpus++;

	new_cpu_mask = ncpus - 1;	/* create the cpu mask */
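
	/*
	 * For example (illustrative values only): a machine reporting 6 CPUs
	 * starts with ncpus = 12, which the loop above rounds up to 16,
	 * giving new_cpu_mask == 0xf.
	 */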

	/*
	 * We now do some magic with the brk.  What we want to get in the
	 * end is a bunch of well-aligned stuff in a big initial allocation.
	 * Along the way, we do sanity checks to make sure no one else has
	 * touched the brk (which shouldn't happen, but it's always good to
	 * check).
	 *
	 * First, make sure sbrk is sane, and store the current brk in oldbrk.
	 */
	oldbrk = (uintptr_t)sbrk(0);
	if ((void *)oldbrk == (void *)-1)
		abort();	/* sbrk is broken -- we're doomed. */

	/*
	 * Now, align the brk to a multiple of CACHE_COHERENCY_UNIT, so that
	 * the percpu structures and cache lists will be properly aligned.
	 * In addition, all hunks will be page-aligned, assuming
	 * HUNKSIZE >= PAGESIZE, so they can be paged out individually.
	 */
	newbrk = ALIGN(oldbrk, CACHE_COHERENCY_UNIT);
	if (newbrk != oldbrk && (uintptr_t)sbrk(newbrk - oldbrk) != oldbrk)
		abort();	/* sbrk is broken -- we're doomed. */

	/*
	 * For each cpu, there is one percpu_t and a list of caches
	 */
	cache_space_needed = ncpus * (sizeof (percpu_t) + CACHELIST_SIZE);

	new_cpu_list = (percpu_t *)sbrk(cache_space_needed);

	if (new_cpu_list == (percpu_t *)-1 ||
	    (uintptr_t)new_cpu_list != newbrk)
		abort();	/* sbrk is broken -- we're doomed. */

	/*
	 * Finally, align the brk to HUNKSIZE so that all hunks are
	 * page-aligned, to avoid edge-effects.
	 */

	newbrk = (uintptr_t)new_cpu_list + cache_space_needed;

	padding = ALIGN(newbrk, HUNKSIZE) - newbrk;

	if (padding > 0 && (uintptr_t)sbrk(padding) != newbrk)
		abort();	/* sbrk is broken -- we're doomed. */

	list_addr = ((uintptr_t)new_cpu_list + (sizeof (percpu_t) * ncpus));

	/* initialize the percpu list */
	for (i = 0; i < ncpus; i++) {
		new_cpu_list[i].mt_caches = (cache_head_t *)list_addr;
		for (j = 0; j < NUM_CACHES; j++) {
			new_cpu_list[i].mt_caches[j].mt_cache = NULL;
			new_cpu_list[i].mt_caches[j].mt_hint = NULL;
		}

		(void) mutex_init(&new_cpu_list[i].mt_parent_lock,
		    USYNC_THREAD, NULL);

		/* get the correct cache list alignment */
		list_addr += CACHELIST_SIZE;
	}

	/*
	 * Initialize oversize listhead
	 */
	oversize_list.next_bysize = &oversize_list;
	oversize_list.prev_bysize = &oversize_list;
	oversize_list.next_byaddr = &oversize_list;
	oversize_list.prev_byaddr = &oversize_list;
	oversize_list.addr = NULL;
	oversize_list.size = 0;		/* sentinel */

	/*
	 * Now install the global variables.
	 */
	curcpu = new_curcpu;
	cpu_mask = new_cpu_mask;
	cpu_list = new_cpu_list;
}

static void
create_cache(cache_t *cp, size_t size, uint_t chunksize)
{
	long nblocks;

	(void) mutex_init(&cp->mt_cache_lock, USYNC_THREAD, NULL);
	cp->mt_size = size;
	cp->mt_freelist = ((caddr_t)cp + sizeof (cache_t));
	cp->mt_span = chunksize * HUNKSIZE - sizeof (cache_t);
	cp->mt_hunks = chunksize;
	/*
	 * rough calculation. We will need to adjust later.
	 */
	nblocks = cp->mt_span / cp->mt_size;
	nblocks >>= 3;
	if (nblocks == 0) { /* less than 8 free blocks in this pool */
		int32_t numblocks = 0;
		long i = cp->mt_span;
		size_t sub = cp->mt_size;
		uchar_t mask = 0;

		while (i > sub) {
			numblocks++;
			i -= sub;
		}
		nblocks = numblocks;
		cp->mt_arena = (caddr_t)ALIGN(cp->mt_freelist + 8, 8);
		cp->mt_nfree = numblocks;
		while (numblocks--) {
			mask |= 0x80 >> numblocks;
		}
		*(cp->mt_freelist) = mask;
	} else {
		cp->mt_arena = (caddr_t)ALIGN((caddr_t)cp->mt_freelist +
		    nblocks, 32);
		/* recompute nblocks */
		nblocks = (uintptr_t)((caddr_t)cp->mt_freelist +
		    cp->mt_span - cp->mt_arena) / cp->mt_size;
		cp->mt_nfree = ((nblocks >> 3) << 3);
		/* Set everything to free */
		(void) memset(cp->mt_freelist, 0xff, nblocks >> 3);
	}

	if (debugopt & MTDEBUGPATTERN)
		copy_pattern(FREEPATTERN, cp->mt_arena, cp->mt_size * nblocks);

	cp->mt_next = NULL;
}

static void
reinit_cpu_list(void)
{
	oversize_t *wp = oversize_list.next_bysize;
	percpu_t *cpuptr;
	cache_t *thiscache;
	cache_head_t *cachehead;

	/* Reinitialize free oversize blocks. */
	(void) mutex_lock(&oversize_lock);
	if (debugopt & MTDEBUGPATTERN)
		for (; wp != &oversize_list; wp = wp->next_bysize)
			copy_pattern(FREEPATTERN, wp->addr, wp->size);
	(void) mutex_unlock(&oversize_lock);

	/* Reinitialize free blocks. */
	for (cpuptr = &cpu_list[0]; cpuptr < &cpu_list[ncpus]; cpuptr++) {
		(void) mutex_lock(&cpuptr->mt_parent_lock);
		for (cachehead = &cpuptr->mt_caches[0]; cachehead <
		    &cpuptr->mt_caches[NUM_CACHES]; cachehead++) {
			for (thiscache = cachehead->mt_cache; thiscache != NULL;
			    thiscache = thiscache->mt_next) {
				(void) mutex_lock(&thiscache->mt_cache_lock);
				if (thiscache->mt_nfree == 0) {
					(void) mutex_unlock(
					    &thiscache->mt_cache_lock);
					continue;
				}
				if (thiscache != NULL)
					reinit_cache(thiscache);
				(void) mutex_unlock(&thiscache->mt_cache_lock);
			}
		}
		(void) mutex_unlock(&cpuptr->mt_parent_lock);
	}
	reinit = 0;
}

static void
reinit_cache(cache_t *thiscache)
{
	uint32_t *freeblocks; /* not a uintptr_t on purpose */
	int32_t i, n;
	caddr_t ret;

	freeblocks = (uint32_t *)thiscache->mt_freelist;
	while (freeblocks < (uint32_t *)thiscache->mt_arena) {
		if (*freeblocks & 0xffffffff) {
			for (i = 0; i < 32; i++) {
				if (FLIP_EM(*freeblocks) & (0x80000000 >> i)) {
					n = (uintptr_t)(((freeblocks -
					    (uint32_t *)thiscache->mt_freelist)
					    << 5) + i) * thiscache->mt_size;
					ret = thiscache->mt_arena + n;
					ret += OVERHEAD;
					copy_pattern(FREEPATTERN, ret,
					    thiscache->mt_size);
				}
			}
		}
		freeblocks++;
	}
}

static void *
malloc_internal(size_t size, percpu_t *cpuptr)
{
	cache_head_t *cachehead;
	cache_t *thiscache, *hintcache;
	int32_t i, n, logsz, bucket;
	uint32_t index;
	uint32_t *freeblocks; /* not a uintptr_t on purpose */
	caddr_t ret;

	logsz = MIN_CACHED_SHIFT;

	while (size > (1 << logsz))
		logsz++;

	bucket = logsz - MIN_CACHED_SHIFT;

	(void) mutex_lock(&cpuptr->mt_parent_lock);

	/*
	 * Find a cache of the appropriate size with free buffers.
	 *
	 * We don't need to lock each cache as we check their mt_nfree count,
	 * since:
	 *	1.  We are only looking for caches with mt_nfree > 0.  If a
	 *	   free happens during our search, it will increment mt_nfree,
	 *	   which will not affect the test.
	 *	2.  Allocations can decrement mt_nfree, but they can't happen
	 *	   as long as we hold mt_parent_lock.
	 */

	cachehead = &cpuptr->mt_caches[bucket];

	/* Search through the list, starting at the mt_hint */
	thiscache = cachehead->mt_hint;

	while (thiscache != NULL && thiscache->mt_nfree == 0)
		thiscache = thiscache->mt_next;

	if (thiscache == NULL) {
		/* wrap around -- search up to the hint */
		thiscache = cachehead->mt_cache;
		hintcache = cachehead->mt_hint;

		while (thiscache != NULL && thiscache != hintcache &&
		    thiscache->mt_nfree == 0)
			thiscache = thiscache->mt_next;

		if (thiscache == hintcache)
			thiscache = NULL;
	}


	if (thiscache == NULL) { /* there are no free caches */
		int32_t thisrequest = requestsize;
		int32_t buffer_size = (1 << logsz) + OVERHEAD;

		thiscache = (cache_t *)morecore(thisrequest * HUNKSIZE);

		if (thiscache == (cache_t *)-1) {
			(void) mutex_unlock(&cpuptr->mt_parent_lock);
			errno = EAGAIN;
			return (NULL);
		}
		create_cache(thiscache, buffer_size, thisrequest);

		/* link in the new block at the beginning of the list */
		thiscache->mt_next = cachehead->mt_cache;
		cachehead->mt_cache = thiscache;
	}

	/* update the hint to the cache we found or created */
	cachehead->mt_hint = thiscache;

	/* thiscache now points to a cache with available space */
	(void) mutex_lock(&thiscache->mt_cache_lock);

	freeblocks = (uint32_t *)thiscache->mt_freelist;
	while (freeblocks < (uint32_t *)thiscache->mt_arena) {
		if (*freeblocks & 0xffffffff)
			break;
		freeblocks++;
		if (freeblocks < (uint32_t *)thiscache->mt_arena &&
		    *freeblocks & 0xffffffff)
			break;
		freeblocks++;
		if (freeblocks < (uint32_t *)thiscache->mt_arena &&
		    *freeblocks & 0xffffffff)
			break;
		freeblocks++;
		if (freeblocks < (uint32_t *)thiscache->mt_arena &&
		    *freeblocks & 0xffffffff)
			break;
		freeblocks++;
	}

	/*
	 * The offset from mt_freelist to freeblocks, plus the bit offset
	 * within *freeblocks, gives the position of the block; n is the
	 * resulting byte offset into the arena.
	 */
	for (i = 0; i < 32; ) {
		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
			break;
		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
			break;
		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
			break;
		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
			break;
	}
	index = 0x80000000 >> --i;


	*freeblocks &= FLIP_EM(~index);

	thiscache->mt_nfree--;

	(void) mutex_unlock(&thiscache->mt_cache_lock);
	(void) mutex_unlock(&cpuptr->mt_parent_lock);

	n = (uintptr_t)(((freeblocks - (uint32_t *)thiscache->mt_freelist) << 5)
	    + i) * thiscache->mt_size;
	/*
	 * Now we have the offset in n and have cleared the free bit in the
	 * freelist. All that is left is to find the block in the arena, put
	 * the value of thiscache in the word ahead of the handed-out address,
	 * and return the memory to the user.
	 */
	ret = thiscache->mt_arena + n;

	/* Store the cache addr for this buf. Makes free go fast. */
	*(uintptr_t *)ret = (uintptr_t)thiscache;

	/*
	 * This assert makes sure we don't hand out memory that is not
	 * owned by this cache.
	 */
	assert(ret + thiscache->mt_size <= thiscache->mt_freelist +
	    thiscache->mt_span);

	ret += OVERHEAD;

	assert(((uintptr_t)ret & 7) == 0); /* are we 8 byte aligned */

	if (reinit == 0 && (debugopt & MTDEBUGPATTERN))
		if (verify_pattern(FREEPATTERN, ret, size))
			abort();	/* reference after free */

	if (debugopt & MTINITBUFFER)
		copy_pattern(INITPATTERN, ret, size);
	return ((void *)ret);
}

static void *
morecore(size_t bytes)
{
	void * ret;

	if (bytes > LONG_MAX) {
		intptr_t wad;
		/*
		 * The request size is too big. We need to do this in
		 * chunks. sbrk() only takes a signed (intptr_t) increment.
		 */
		if (bytes == ULONG_MAX)
			return ((void *)-1);

		ret = sbrk(0);
		wad = LONG_MAX;
		while (wad > 0) {
			if (sbrk(wad) == (void *)-1) {
				if (ret != sbrk(0))
					(void) sbrk(-LONG_MAX);
				return ((void *)-1);
			}
			bytes -= LONG_MAX;
			wad = bytes;
		}
	} else
		ret = sbrk(bytes);

	return (ret);
}


static void *
oversize(size_t size)
{
	caddr_t ret;
	oversize_t *big;

	/* make sure we will not overflow */
	if (size > MAX_MTMALLOC) {
		errno = ENOMEM;
		return (NULL);
	}

	/*
	 * Since we ensure every address we hand back is
	 * MTMALLOC_MIN_ALIGN-byte aligned, ALIGNing size ensures that the
	 * memory handed out is MTMALLOC_MIN_ALIGN-byte aligned at both ends.
	 * This eases the implementation of MTDEBUGPATTERN and MTINITPATTERN,
	 * particularly where coalescing occurs.
	 */
	size = ALIGN(size, MTMALLOC_MIN_ALIGN);

	/*
	 * The idea with the global lock is that an oversize alloc is
	 * expected to have to call morecore() and block in the kernel
	 * anyway.
	 */
	(void) mutex_lock(&oversize_lock);

	if ((big = find_oversize(size)) != NULL) {
		if (reinit == 0 && (debugopt & MTDEBUGPATTERN))
			if (verify_pattern(FREEPATTERN, big->addr, size))
				abort();	/* reference after free */
	} else {
		/* Get more 8-byte aligned memory from heap */
		ret = morecore(size + OVSZ_HEADER_SIZE);
		if (ret == (caddr_t)-1) {
			(void) mutex_unlock(&oversize_lock);
			errno = ENOMEM;
			return (NULL);
		}
		big = oversize_header_alloc((uintptr_t)ret, size);
	}
	ret = big->addr;

	insert_hash(big);

	if (debugopt & MTINITBUFFER)
		copy_pattern(INITPATTERN, ret, size);

	(void) mutex_unlock(&oversize_lock);
	assert(((uintptr_t)ret & 7) == 0); /* are we 8 byte aligned */
	return ((void *)ret);
}

static void
insert_oversize(oversize_t *op, oversize_t *nx)
{
	oversize_t *sp;

	/* locate correct insertion point in size-ordered list */
	for (sp = oversize_list.next_bysize;
	    sp != &oversize_list && (op->size > sp->size);
	    sp = sp->next_bysize)
		;

	/* link into size-ordered list */
	op->next_bysize = sp;
	op->prev_bysize = sp->prev_bysize;
	op->prev_bysize->next_bysize = op;
	op->next_bysize->prev_bysize = op;

	/*
	 * link item into address-ordered list
	 * (caller provides insertion point as an optimization)
	 */
	op->next_byaddr = nx;
	op->prev_byaddr = nx->prev_byaddr;
	op->prev_byaddr->next_byaddr = op;
	op->next_byaddr->prev_byaddr = op;

}

static void
unlink_oversize(oversize_t *lp)
{
	/* unlink from address list */
	lp->prev_byaddr->next_byaddr = lp->next_byaddr;
	lp->next_byaddr->prev_byaddr = lp->prev_byaddr;

	/* unlink from size list */
	lp->prev_bysize->next_bysize = lp->next_bysize;
	lp->next_bysize->prev_bysize = lp->prev_bysize;
}

static void
position_oversize_by_size(oversize_t *op)
{
	oversize_t *sp;

	if (op->size > op->next_bysize->size ||
	    op->size < op->prev_bysize->size) {

		/* unlink from size list */
		op->prev_bysize->next_bysize = op->next_bysize;
		op->next_bysize->prev_bysize = op->prev_bysize;

		/* locate correct insertion point in size-ordered list */
		for (sp = oversize_list.next_bysize;
		    sp != &oversize_list && (op->size > sp->size);
		    sp = sp->next_bysize)
			;

		/* link into size-ordered list */
		op->next_bysize = sp;
		op->prev_bysize = sp->prev_bysize;
		op->prev_bysize->next_bysize = op;
		op->next_bysize->prev_bysize = op;
	}
}

static void
add_oversize(oversize_t *lp)
{
	int merge_flags = INSERT_ONLY;
	oversize_t *nx;  	/* ptr to item right of insertion point */
	oversize_t *pv;  	/* ptr to item left of insertion point */
	uint_t size_lp, size_pv, size_nx;
	uintptr_t endp_lp, endp_pv, endp_nx;

	/*
	 * Locate insertion point in address-ordered list
	 */

	for (nx = oversize_list.next_byaddr;
	    nx != &oversize_list && (lp->addr > nx->addr);
	    nx = nx->next_byaddr)
		;

	/*
	 * Determine how to add chunk to oversize freelist
	 */

	size_lp = OVSZ_HEADER_SIZE + lp->size;
	endp_lp = ALIGN((uintptr_t)lp + size_lp, MTMALLOC_MIN_ALIGN);
	size_lp = endp_lp - (uintptr_t)lp;

	pv = nx->prev_byaddr;

	if (pv->size) {

		size_pv = OVSZ_HEADER_SIZE + pv->size;
		endp_pv = ALIGN((uintptr_t)pv + size_pv,
		    MTMALLOC_MIN_ALIGN);
		size_pv = endp_pv - (uintptr_t)pv;

		/* Check for adjacency with left chunk */
		if ((uintptr_t)lp == endp_pv)
			merge_flags |= COALESCE_LEFT;
	}

	if (nx->size) {

		/* Check for adjacency with right chunk */
		if ((uintptr_t)nx == endp_lp) {
			size_nx = OVSZ_HEADER_SIZE + nx->size;
			endp_nx = ALIGN((uintptr_t)nx + size_nx,
			    MTMALLOC_MIN_ALIGN);
			size_nx = endp_nx - (uintptr_t)nx;
			merge_flags |= COALESCE_RIGHT;
		}
	}

	/*
	 * If MTDEBUGPATTERN==1, lp->addr will have been overwritten with
	 * FREEPATTERN for lp->size bytes. If we can merge, the oversize
	 * header(s) that will also become part of the memory available for
	 * reallocation (ie lp and/or nx) must also be overwritten with
	 * FREEPATTERN or we will SIGABRT when this memory is next reallocated.
	 */
	switch (merge_flags) {

	case INSERT_ONLY:		/* Coalescing not possible */
		insert_oversize(lp, nx);
		break;
	case COALESCE_LEFT:
		pv->size += size_lp;
		position_oversize_by_size(pv);
		if (debugopt & MTDEBUGPATTERN)
			copy_pattern(FREEPATTERN, lp, OVSZ_HEADER_SIZE);
		break;
	case COALESCE_RIGHT:
		unlink_oversize(nx);
		lp->size += size_nx;
		insert_oversize(lp, pv->next_byaddr);
		if (debugopt & MTDEBUGPATTERN)
			copy_pattern(FREEPATTERN, nx, OVSZ_HEADER_SIZE);
		break;
	case COALESCE_WITH_BOTH_SIDES:	/* Merge (with right) to the left */
		pv->size += size_lp + size_nx;
		unlink_oversize(nx);
		position_oversize_by_size(pv);
		if (debugopt & MTDEBUGPATTERN) {
			copy_pattern(FREEPATTERN, lp, OVSZ_HEADER_SIZE);
			copy_pattern(FREEPATTERN, nx, OVSZ_HEADER_SIZE);
		}
		break;
	}
}

/*
 * Find memory on our list that is at least size big. If we find a block that is
 * big enough, we break it up and return the associated oversize_t struct back
 * to the calling client. Any leftover piece of that block is returned to the
 * freelist.
 */
static oversize_t *
find_oversize(size_t size)
{
	oversize_t *wp = oversize_list.next_bysize;
	while (wp != &oversize_list && size > wp->size)
		wp = wp->next_bysize;

	if (wp == &oversize_list) /* empty list or nothing big enough */
		return (NULL);
	/* breaking up a chunk of memory */
	if ((long)((wp->size - (size + OVSZ_HEADER_SIZE + MTMALLOC_MIN_ALIGN)))
	    > MAX_CACHED) {
		caddr_t off;
		oversize_t *np;
		size_t osize;
		off = (caddr_t)ALIGN(wp->addr + size,
		    MTMALLOC_MIN_ALIGN);
		osize = wp->size;
		wp->size = (size_t)(off - wp->addr);
		np = oversize_header_alloc((uintptr_t)off,
		    osize - (wp->size + OVSZ_HEADER_SIZE));
		if ((long)np->size < 0)
			abort();
		unlink_oversize(wp);
		add_oversize(np);
	} else {
		unlink_oversize(wp);
	}
	return (wp);
}

static void
copy_pattern(uint32_t pattern, void *buf_arg, size_t size)
{
	uint32_t *bufend = (uint32_t *)((char *)buf_arg + size);
	uint32_t *buf = buf_arg;

	while (buf < bufend - 3) {
		buf[3] = buf[2] = buf[1] = buf[0] = pattern;
		buf += 4;
	}
	while (buf < bufend)
		*buf++ = pattern;
}

static void *
verify_pattern(uint32_t pattern, void *buf_arg, size_t size)
{
	uint32_t *bufend = (uint32_t *)((char *)buf_arg + size);
	uint32_t *buf;

	for (buf = buf_arg; buf < bufend; buf++)
		if (*buf != pattern)
			return (buf);
	return (NULL);
}

static void
free_oversize(oversize_t *ovp)
{
	assert(((uintptr_t)ovp->addr & 7) == 0); /* are we 8 byte aligned */
	assert(ovp->size > MAX_CACHED);

	ovp->next_bysize = ovp->prev_bysize = NULL;
	ovp->next_byaddr = ovp->prev_byaddr = NULL;
	(void) mutex_lock(&oversize_lock);
	add_oversize(ovp);
	(void) mutex_unlock(&oversize_lock);
}

static oversize_t *
oversize_header_alloc(uintptr_t mem, size_t size)
{
	oversize_t *ovsz_hdr;

	assert(size > MAX_CACHED);

	ovsz_hdr = (oversize_t *)mem;
	ovsz_hdr->prev_bysize = NULL;
	ovsz_hdr->next_bysize = NULL;
	ovsz_hdr->prev_byaddr = NULL;
	ovsz_hdr->next_byaddr = NULL;
	ovsz_hdr->hash_next = NULL;
	ovsz_hdr->size = size;
	mem += OVSZ_SIZE;
	*(uintptr_t *)mem = MTMALLOC_OVERSIZE_MAGIC;
	mem += OVERHEAD;
	assert(((uintptr_t)mem & 7) == 0); /* are we 8 byte aligned */
	ovsz_hdr->addr = (caddr_t)mem;
	return (ovsz_hdr);
}

static void
malloc_prepare(void)
{
	percpu_t *cpuptr;
	cache_head_t *cachehead;
	cache_t *thiscache;

	(void) mutex_lock(&oversize_lock);
	for (cpuptr = &cpu_list[0]; cpuptr < &cpu_list[ncpus]; cpuptr++) {
		(void) mutex_lock(&cpuptr->mt_parent_lock);
		for (cachehead = &cpuptr->mt_caches[0];
		    cachehead < &cpuptr->mt_caches[NUM_CACHES];
		    cachehead++) {
			for (thiscache = cachehead->mt_cache;
			    thiscache != NULL;
			    thiscache = thiscache->mt_next) {
				(void) mutex_lock(
				    &thiscache->mt_cache_lock);
			}
		}
	}
}

static void
malloc_release(void)
{
	percpu_t *cpuptr;
	cache_head_t *cachehead;
	cache_t *thiscache;

	for (cpuptr = &cpu_list[ncpus - 1]; cpuptr >= &cpu_list[0]; cpuptr--) {
		for (cachehead = &cpuptr->mt_caches[NUM_CACHES - 1];
		    cachehead >= &cpuptr->mt_caches[0];
		    cachehead--) {
			for (thiscache = cachehead->mt_cache;
			    thiscache != NULL;
			    thiscache = thiscache->mt_next) {
				(void) mutex_unlock(
				    &thiscache->mt_cache_lock);
			}
		}
		(void) mutex_unlock(&cpuptr->mt_parent_lock);
	}
	(void) mutex_unlock(&oversize_lock);
}

#pragma init(malloc_init)
static void
malloc_init(void)
{
	/*
	 * This works in the init section for this library
	 * because setup_caches() doesn't call anything in libc
	 * that calls malloc().  If it did, disaster would ensue.
	 *
	 * For this to work properly, this library must be the first
	 * one to have its init section called (after libc) by the
	 * dynamic linker.  If some other library's init section
	 * ran first and called malloc(), disaster would ensue.
	 * Because this is an interposer library for malloc(), the
	 * dynamic linker arranges for its init section to run first.
	 */
	(void) setup_caches();

	(void) pthread_atfork(malloc_prepare, malloc_release, malloc_release);
}