27c478bdstevel@tonic-gate * CDDL HEADER START
37c478bdstevel@tonic-gate *
47c478bdstevel@tonic-gate * The contents of this file are subject to the terms of the
50efe5e5dv * Common Development and Distribution License (the "License").
60efe5e5dv * You may not use this file except in compliance with the License.
77c478bdstevel@tonic-gate *
87c478bdstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bdstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
107c478bdstevel@tonic-gate * See the License for the specific language governing permissions
117c478bdstevel@tonic-gate * and limitations under the License.
127c478bdstevel@tonic-gate *
137c478bdstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
147c478bdstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bdstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
167c478bdstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
177c478bdstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bdstevel@tonic-gate *
197c478bdstevel@tonic-gate * CDDL HEADER END
207c478bdstevel@tonic-gate */
22753a6d4Sherry Moore * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
237c478bdstevel@tonic-gate * Use is subject to license terms.
2448bbca8Daniel Hoffman * Copyright (c) 2016 by Delphix. All rights reserved.
257c478bdstevel@tonic-gate */
287c478bdstevel@tonic-gate * Big Theory Statement for mutual exclusion locking primitives.
297c478bdstevel@tonic-gate *
307c478bdstevel@tonic-gate * A mutex serializes multiple threads so that only one thread
317c478bdstevel@tonic-gate * (the "owner" of the mutex) is active at a time.  See mutex(9F)
327c478bdstevel@tonic-gate * for a full description of the interfaces and programming model.
337c478bdstevel@tonic-gate * The rest of this comment describes the implementation.
347c478bdstevel@tonic-gate *
357c478bdstevel@tonic-gate * Mutexes come in two flavors: adaptive and spin.  mutex_init(9F)
367c478bdstevel@tonic-gate * determines the type based solely on the iblock cookie (PIL) argument.
377c478bdstevel@tonic-gate * PIL > LOCK_LEVEL implies a spin lock; everything else is adaptive.
387c478bdstevel@tonic-gate *
397c478bdstevel@tonic-gate * Spin mutexes block interrupts and spin until the lock becomes available.
407c478bdstevel@tonic-gate * A thread may not sleep, or call any function that might sleep, while
417c478bdstevel@tonic-gate * holding a spin mutex.  With few exceptions, spin mutexes should only
427c478bdstevel@tonic-gate * be used to synchronize with interrupt handlers.
437c478bdstevel@tonic-gate *
447c478bdstevel@tonic-gate * Adaptive mutexes (the default type) spin if the owner is running on
457c478bdstevel@tonic-gate * another CPU and block otherwise.  This policy is based on the assumption
467c478bdstevel@tonic-gate * that mutex hold times are typically short enough that the time spent
477c478bdstevel@tonic-gate * spinning is less than the time it takes to block.  If you need mutual
487c478bdstevel@tonic-gate * exclusion semantics with long hold times, consider an rwlock(9F) as
497c478bdstevel@tonic-gate * RW_WRITER.  Better still, reconsider the algorithm: if it requires
507c478bdstevel@tonic-gate * mutual exclusion for long periods of time, it's probably not scalable.
517c478bdstevel@tonic-gate *
527c478bdstevel@tonic-gate * Adaptive mutexes are overwhelmingly more common than spin mutexes,
537c478bdstevel@tonic-gate * so mutex_enter() assumes that the lock is adaptive.  We get away
547c478bdstevel@tonic-gate * with this by structuring mutexes so that an attempt to acquire a
557c478bdstevel@tonic-gate * spin mutex as adaptive always fails.  When mutex_enter() fails
567c478bdstevel@tonic-gate * it punts to mutex_vector_enter(), which does all the hard stuff.
577c478bdstevel@tonic-gate *
 * mutex_vector_enter() first checks the type.  If it's a spin mutex,
597c478bdstevel@tonic-gate * we just call lock_set_spl() and return.  If it's an adaptive mutex,
607c478bdstevel@tonic-gate * we check to see what the owner is doing.  If the owner is running,
617c478bdstevel@tonic-gate * we spin until the lock becomes available; if not, we mark the lock
627c478bdstevel@tonic-gate * as having waiters and block.
637c478bdstevel@tonic-gate *
 * Blocking on a mutex is a surprisingly delicate dance because, for speed,
657c478bdstevel@tonic-gate * mutex_exit() doesn't use an atomic instruction.  Thus we have to work
667c478bdstevel@tonic-gate * a little harder in the (rarely-executed) blocking path to make sure
677c478bdstevel@tonic-gate * we don't block on a mutex that's just been released -- otherwise we
687c478bdstevel@tonic-gate * might never be woken up.
697c478bdstevel@tonic-gate *
707c478bdstevel@tonic-gate * The logic for synchronizing mutex_vector_enter() with mutex_exit()
717c478bdstevel@tonic-gate * in the face of preemption and relaxed memory ordering is as follows:
727c478bdstevel@tonic-gate *
737c478bdstevel@tonic-gate * (1) Preemption in the middle of mutex_exit() must cause mutex_exit()
747c478bdstevel@tonic-gate *     to restart.  Each platform must enforce this by checking the
757c478bdstevel@tonic-gate *     interrupted PC in the interrupt handler (or on return from trap --
767c478bdstevel@tonic-gate *     whichever is more convenient for the platform).  If the PC
777c478bdstevel@tonic-gate *     lies within the critical region of mutex_exit(), the interrupt
787c478bdstevel@tonic-gate *     handler must reset the PC back to the beginning of mutex_exit().
797c478bdstevel@tonic-gate *     The critical region consists of all instructions up to, but not
807c478bdstevel@tonic-gate *     including, the store that clears the lock (which, of course,
817c478bdstevel@tonic-gate *     must never be executed twice.)
827c478bdstevel@tonic-gate *
837c478bdstevel@tonic-gate *     This ensures that the owner will always check for waiters after
847c478bdstevel@tonic-gate *     resuming from a previous preemption.
857c478bdstevel@tonic-gate *
867c478bdstevel@tonic-gate * (2) A thread resuming in mutex_exit() does (at least) the following:
877c478bdstevel@tonic-gate *
887c478bdstevel@tonic-gate *	when resuming:	set CPU_THREAD = owner
897c478bdstevel@tonic-gate *			membar #StoreLoad
907c478bdstevel@tonic-gate *
917c478bdstevel@tonic-gate *	in mutex_exit:	check waiters bit; do wakeup if set
927c478bdstevel@tonic-gate *			membar #LoadStore|#StoreStore
937c478bdstevel@tonic-gate *			clear owner
947c478bdstevel@tonic-gate *			(at this point, other threads may or may not grab
957c478bdstevel@tonic-gate *			the lock, and we may or may not reacquire it)
967c478bdstevel@tonic-gate *
977c478bdstevel@tonic-gate *	when blocking:	membar #StoreStore (due to disp_lock_enter())
987c478bdstevel@tonic-gate *			set CPU_THREAD = (possibly) someone else
997c478bdstevel@tonic-gate *
1007c478bdstevel@tonic-gate * (3) A thread blocking in mutex_vector_enter() does the following:
1017c478bdstevel@tonic-gate *
1027c478bdstevel@tonic-gate *			set waiters bit
1037c478bdstevel@tonic-gate *			membar #StoreLoad (via membar_enter())
104575a742pt *			check CPU_THREAD for owner's t_cpu
105575a742pt *				continue if owner running
1067c478bdstevel@tonic-gate *			membar #LoadLoad (via membar_consumer())
1077c478bdstevel@tonic-gate *			check owner and waiters bit; abort if either changed
1087c478bdstevel@tonic-gate *			block
1097c478bdstevel@tonic-gate *
1107c478bdstevel@tonic-gate * Thus the global memory orderings for (2) and (3) are as follows:
1117c478bdstevel@tonic-gate *
1127c478bdstevel@tonic-gate * (2M) mutex_exit() memory order:
1137c478bdstevel@tonic-gate *
1147c478bdstevel@tonic-gate *			STORE	CPU_THREAD = owner
1157c478bdstevel@tonic-gate *			LOAD	waiters bit
1167c478bdstevel@tonic-gate *			STORE	owner = NULL
1177c478bdstevel@tonic-gate *			STORE	CPU_THREAD = (possibly) someone else
1187c478bdstevel@tonic-gate *
1197c478bdstevel@tonic-gate * (3M) mutex_vector_enter() memory order:
1207c478bdstevel@tonic-gate *
1217c478bdstevel@tonic-gate *			STORE	waiters bit = 1
1227c478bdstevel@tonic-gate *			LOAD	CPU_THREAD for each CPU
1237c478bdstevel@tonic-gate *			LOAD	owner and waiters bit
1247c478bdstevel@tonic-gate *
1257c478bdstevel@tonic-gate * It has been verified by exhaustive simulation that all possible global
1267c478bdstevel@tonic-gate * memory orderings of (2M) interleaved with (3M) result in correct
1277c478bdstevel@tonic-gate * behavior.  Moreover, these ordering constraints are minimal: changing
1287c478bdstevel@tonic-gate * the ordering of anything in (2M) or (3M) breaks the algorithm, creating
1297c478bdstevel@tonic-gate * windows for missed wakeups.  Note: the possibility that other threads
1307c478bdstevel@tonic-gate * may grab the lock after the owner drops it can be factored out of the
1317c478bdstevel@tonic-gate * memory ordering analysis because mutex_vector_enter() won't block
1327c478bdstevel@tonic-gate * if the lock isn't still owned by the same thread.
1337c478bdstevel@tonic-gate *
1347c478bdstevel@tonic-gate * The only requirements of code outside the mutex implementation are
1357c478bdstevel@tonic-gate * (1) mutex_exit() preemption fixup in interrupt handlers or trap return,
136575a742pt * (2) a membar #StoreLoad after setting CPU_THREAD in resume(),
137575a742pt * (3) mutex_owner_running() preemption fixup in interrupt handlers
138575a742pt * or trap returns.
1397c478bdstevel@tonic-gate * Note: idle threads cannot grab adaptive locks (since they cannot block),
1407c478bdstevel@tonic-gate * so the membar may be safely omitted when resuming an idle thread.
1417c478bdstevel@tonic-gate *
1427c478bdstevel@tonic-gate * When a mutex has waiters, mutex_vector_exit() has several options:
1437c478bdstevel@tonic-gate *
1447c478bdstevel@tonic-gate * (1) Choose a waiter and make that thread the owner before waking it;
1457c478bdstevel@tonic-gate *     this is known as "direct handoff" of ownership.
1467c478bdstevel@tonic-gate *
1477c478bdstevel@tonic-gate * (2) Drop the lock and wake one waiter.
1487c478bdstevel@tonic-gate *
1497c478bdstevel@tonic-gate * (3) Drop the lock, clear the waiters bit, and wake all waiters.
1507c478bdstevel@tonic-gate *
1517c478bdstevel@tonic-gate * In many ways (1) is the cleanest solution, but if a lock is moderately
1527c478bdstevel@tonic-gate * contended it defeats the adaptive spin logic.  If we make some other
15348bbca8Daniel Hoffman * thread the owner, but it's not ONPROC yet, then all other threads on
1547c478bdstevel@tonic-gate * other cpus that try to get the lock will conclude that the owner is
1557c478bdstevel@tonic-gate * blocked, so they'll block too.  And so on -- it escalates quickly,
1567c478bdstevel@tonic-gate * with every thread taking the blocking path rather than the spin path.
1577c478bdstevel@tonic-gate * Thus, direct handoff is *not* a good idea for adaptive mutexes.
1587c478bdstevel@tonic-gate *
1597c478bdstevel@tonic-gate * Option (2) is the next most natural-seeming option, but it has several
1607c478bdstevel@tonic-gate * annoying properties.  If there's more than one waiter, we must preserve
1617c478bdstevel@tonic-gate * the waiters bit on an unheld lock.  On cas-capable platforms, where
1627c478bdstevel@tonic-gate * the waiters bit is part of the lock word, this means that both 0x0
1637c478bdstevel@tonic-gate * and 0x1 represent unheld locks, so we have to cas against *both*.
1647c478bdstevel@tonic-gate * Priority inheritance also gets more complicated, because a lock can
1657c478bdstevel@tonic-gate * have waiters but no owner to whom priority can be willed.  So while
1667c478bdstevel@tonic-gate * it is possible to make option (2) work, it's surprisingly vile.
1677c478bdstevel@tonic-gate *
1687c478bdstevel@tonic-gate * Option (3), the least-intuitive at first glance, is what we actually do.
1697c478bdstevel@tonic-gate * It has the advantage that because you always wake all waiters, you
1707c478bdstevel@tonic-gate * never have to preserve the waiters bit.  Waking all waiters seems like
1717c478bdstevel@tonic-gate * begging for a thundering herd problem, but consider: under option (2),
1727c478bdstevel@tonic-gate * every thread that grabs and drops the lock will wake one waiter -- so
1737c478bdstevel@tonic-gate * if the lock is fairly active, all waiters will be awakened very quickly
1747c478bdstevel@tonic-gate * anyway.  Moreover, this is how adaptive locks are *supposed* to work.
1757c478bdstevel@tonic-gate * The blocking case is rare; the more common case (by 3-4 orders of
1767c478bdstevel@tonic-gate * magnitude) is that one or more threads spin waiting to get the lock.
1777c478bdstevel@tonic-gate * Only direct handoff can prevent the thundering herd problem, but as
1787c478bdstevel@tonic-gate * mentioned earlier, that would tend to defeat the adaptive spin logic.
1797c478bdstevel@tonic-gate * In practice, option (3) works well because the blocking case is rare.
1807c478bdstevel@tonic-gate */
1837c478bdstevel@tonic-gate * delayed lock retry with exponential delay for spin locks
1847c478bdstevel@tonic-gate *
1857c478bdstevel@tonic-gate * It is noted above that for both the spin locks and the adaptive locks,
 * spinning is the dominant mode of operation.  So long as there is only
1877c478bdstevel@tonic-gate * one thread waiting on a lock, the naive spin loop works very well in
1887c478bdstevel@tonic-gate * cache based architectures.  The lock data structure is pulled into the
1897c478bdstevel@tonic-gate * cache of the processor with the waiting/spinning thread and no further
1907c478bdstevel@tonic-gate * memory traffic is generated until the lock is released.  Unfortunately,
1917c478bdstevel@tonic-gate * once two or more threads are waiting on a lock, the naive spin has
1927c478bdstevel@tonic-gate * the property of generating maximum memory traffic from each spinning
1937c478bdstevel@tonic-gate * thread as the spinning threads contend for the lock data structure.
1947c478bdstevel@tonic-gate *
1957c478bdstevel@tonic-gate * By executing a delay loop before retrying a lock, a waiting thread
1967c478bdstevel@tonic-gate * can reduce its memory traffic by a large factor, depending on the
 * size of the delay loop.  A large delay loop greatly reduces the memory
1987c478bdstevel@tonic-gate * traffic, but has the drawback of having a period of time when
1997c478bdstevel@tonic-gate * no thread is attempting to gain the lock even though several threads
2007c478bdstevel@tonic-gate * might be waiting.  A small delay loop has the drawback of not
2017c478bdstevel@tonic-gate * much reduction in memory traffic, but reduces the potential idle time.
2027c478bdstevel@tonic-gate * The theory of the exponential delay code is to start with a short
2037c478bdstevel@tonic-gate * delay loop and double the waiting time on each iteration, up to
204575a742pt * a preselected maximum.
2057c478bdstevel@tonic-gate */
2077c478bdstevel@tonic-gate#include <sys/param.h>
2087c478bdstevel@tonic-gate#include <sys/time.h>
2097c478bdstevel@tonic-gate#include <sys/cpuvar.h>
2107c478bdstevel@tonic-gate#include <sys/thread.h>
2117c478bdstevel@tonic-gate#include <sys/debug.h>
2127c478bdstevel@tonic-gate#include <sys/cmn_err.h>
2137c478bdstevel@tonic-gate#include <sys/sobject.h>
2147c478bdstevel@tonic-gate#include <sys/turnstile.h>
2157c478bdstevel@tonic-gate#include <sys/systm.h>
2167c478bdstevel@tonic-gate#include <sys/mutex_impl.h>
2177c478bdstevel@tonic-gate#include <sys/spl.h>
2187c478bdstevel@tonic-gate#include <sys/lockstat.h>
2197c478bdstevel@tonic-gate#include <sys/atomic.h>
2207c478bdstevel@tonic-gate#include <sys/cpu.h>
2217c478bdstevel@tonic-gate#include <sys/stack.h>
222843e198johnlev#include <sys/archsystm.h>
223575a742pt#include <sys/machsystm.h>
224575a742pt#include <sys/x_call.h>
2277c478bdstevel@tonic-gate * The sobj_ops vector exports a set of functions needed when a thread
2287c478bdstevel@tonic-gate * is asleep on a synchronization object of this type.
2297c478bdstevel@tonic-gate */
2307c478bdstevel@tonic-gatestatic sobj_ops_t mutex_sobj_ops = {
2317c478bdstevel@tonic-gate	SOBJ_MUTEX, mutex_owner, turnstile_stay_asleep, turnstile_change_pri
2357c478bdstevel@tonic-gate * If the system panics on a mutex, save the address of the offending
2367c478bdstevel@tonic-gate * mutex in panic_mutex_addr, and save the contents in panic_mutex.
2377c478bdstevel@tonic-gate */
2387c478bdstevel@tonic-gatestatic mutex_impl_t panic_mutex;
2397c478bdstevel@tonic-gatestatic mutex_impl_t *panic_mutex_addr;
2417c478bdstevel@tonic-gatestatic void
2427c478bdstevel@tonic-gatemutex_panic(char *msg, mutex_impl_t *lp)
2447c478bdstevel@tonic-gate	if (panicstr)
2457c478bdstevel@tonic-gate		return;
24775d9446Josef 'Jeff' Sipek	if (atomic_cas_ptr(&panic_mutex_addr, NULL, lp) == NULL)
2487c478bdstevel@tonic-gate		panic_mutex = *lp;
2507c478bdstevel@tonic-gate	panic("%s, lp=%p owner=%p thread=%p",
2518793b36Nick Todd	    msg, (void *)lp, (void *)MUTEX_OWNER(&panic_mutex),
2528793b36Nick Todd	    (void *)curthread);
/*
 * "tunables" for per-platform backoff constants.  They are read by
 * default_lock_backoff() and default_lock_delay().
 */
/* Upper bound on the backoff; 0 means use ncpus_online * mutex_cap_factor. */
uint_t mutex_backoff_cap = 0;
/* Starting backoff, and the smallest delay default_lock_delay() will pick. */
ushort_t mutex_backoff_base = MUTEX_BACKOFF_BASE;
/* Per-CPU multiplier used to derive the cap when mutex_backoff_cap is 0. */
ushort_t mutex_cap_factor = MUTEX_CAP_FACTOR;
/* Left-shift applied to the backoff each time it is increased. */
uchar_t mutex_backoff_shift = MUTEX_BACKOFF_SHIFT;
264575a742pt	MUTEX_SYNC();
267575a742pt/* calculate the backoff interval */
269575a742ptdefault_lock_backoff(uint_t backoff)
271575a742pt	uint_t cap;		/* backoff cap calculated */
273575a742pt	if (backoff == 0) {
274575a742pt		backoff = mutex_backoff_base;
275575a742pt		/* first call just sets the base */
276575a742pt		return (backoff);
277575a742pt	}
279575a742pt	/* set cap */
280575a742pt	if (mutex_backoff_cap == 0) {
281575a742pt		/*
282575a742pt		 * For a contended lock, in the worst case a load + cas may
283575a742pt		 * be queued  at the controller for each contending CPU.
284575a742pt		 * Therefore, to avoid queueing, the accesses for all CPUS must
285575a742pt		 * be spread out in time over an interval of (ncpu *
286575a742pt		 * cap-factor).  Maximum backoff is set to this value, and
287575a742pt		 * actual backoff is a random number from 0 to the current max.
288575a742pt		 */
289575a742pt		cap = ncpus_online * mutex_cap_factor;
290575a742pt	} else {
291575a742pt		cap = mutex_backoff_cap;
292575a742pt	}
294575a742pt	/* calculate new backoff value */
295575a742pt	backoff <<= mutex_backoff_shift;	/* increase backoff */
296575a742pt	if (backoff > cap) {
297575a742pt		if (cap < mutex_backoff_base)
298575a742pt			backoff = mutex_backoff_base;
299575a742pt		else
300575a742pt			backoff = cap;
301575a742pt	}
303575a742pt	return (backoff);
307575a742pt * default delay function for mutexes.
308575a742pt */
310575a742ptdefault_lock_delay(uint_t backoff)
312575a742pt	ulong_t rnd;		/* random factor */
313575a742pt	uint_t cur_backoff;	/* calculated backoff */
314575a742pt	uint_t backctr;
316575a742pt	/*
317575a742pt	 * Modify backoff by a random amount to avoid lockstep, and to
318575a742pt	 * make it probable that some thread gets a small backoff, and
319575a742pt	 * re-checks quickly
320575a742pt	 */
321575a742pt	rnd = (((long)curthread >> PTR24_LSB) ^ (long)MUTEX_GETTICK());
322575a742pt	cur_backoff = (uint_t)(rnd % (backoff - mutex_backoff_base + 1)) +
323575a742pt	    mutex_backoff_base;
325575a742pt	/*
326575a742pt	 * Delay before trying
327575a742pt	 * to touch the mutex data structure.
328575a742pt	 */
329575a742pt	for (backctr = cur_backoff; backctr; backctr--) {
330575a742pt		MUTEX_DELAY();
331575a742pt	};
/*
 * Hooks for the backoff machinery, initialized to the default
 * implementations.  Presumably a platform can repoint them at its own
 * routines -- TODO confirm against platform code.
 *
 * mutex_lock_backoff and mutex_lock_delay are called through these
 * pointers by mutex_vector_enter(), lock_set_spin() and lock_set_spl_spin().
 * NOTE(review): mutex_delay is not called, and mutex_delay_default is not
 * defined, in the part of the file visible here.
 */
uint_t (*mutex_lock_backoff)(uint_t) = default_lock_backoff;
void (*mutex_lock_delay)(uint_t) = default_lock_delay;
void (*mutex_delay)(void) = mutex_delay_default;
3397c478bdstevel@tonic-gate * mutex_vector_enter() is called from the assembly mutex_enter() routine
3407c478bdstevel@tonic-gate * if the lock is held or is not of type MUTEX_ADAPTIVE.
3417c478bdstevel@tonic-gate */
3437c478bdstevel@tonic-gatemutex_vector_enter(mutex_impl_t *lp)
3457c478bdstevel@tonic-gate	kthread_id_t	owner;
346575a742pt	kthread_id_t	lastowner = MUTEX_NO_OWNER; /* track owner changes */
3477c478bdstevel@tonic-gate	hrtime_t	sleep_time = 0;	/* how long we slept */
3489d68b18ck	hrtime_t	spin_time = 0;	/* how long we spun */
349575a742pt	cpu_t 		*cpup;
3507c478bdstevel@tonic-gate	turnstile_t	*ts;
3517c478bdstevel@tonic-gate	volatile mutex_impl_t *vlp = (volatile mutex_impl_t *)lp;
352575a742pt	uint_t		backoff = 0;	/* current backoff */
353575a742pt	int		changecnt = 0;	/* count of owner changes */
3557c478bdstevel@tonic-gate	ASSERT_STACK_ALIGNED();
3577c478bdstevel@tonic-gate	if (MUTEX_TYPE_SPIN(lp)) {
3587c478bdstevel@tonic-gate		lock_set_spl(&lp->m_spin.m_spinlock, lp->m_spin.m_minspl,
3597c478bdstevel@tonic-gate		    &lp->m_spin.m_oldspl);
3607c478bdstevel@tonic-gate		return;
3617c478bdstevel@tonic-gate	}
3637c478bdstevel@tonic-gate	if (!MUTEX_TYPE_ADAPTIVE(lp)) {
3647c478bdstevel@tonic-gate		mutex_panic("mutex_enter: bad mutex", lp);
3657c478bdstevel@tonic-gate		return;
3667c478bdstevel@tonic-gate	}
3687c478bdstevel@tonic-gate	/*
3697c478bdstevel@tonic-gate	 * Adaptive mutexes must not be acquired from above LOCK_LEVEL.
3707c478bdstevel@tonic-gate	 * We can migrate after loading CPU but before checking CPU_ON_INTR,
3717c478bdstevel@tonic-gate	 * so we must verify by disabling preemption and loading CPU again.
3727c478bdstevel@tonic-gate	 */
3737c478bdstevel@tonic-gate	cpup = CPU;
3747c478bdstevel@tonic-gate	if (CPU_ON_INTR(cpup) && !panicstr) {
3757c478bdstevel@tonic-gate		kpreempt_disable();
3767c478bdstevel@tonic-gate		if (CPU_ON_INTR(CPU))
3777c478bdstevel@tonic-gate			mutex_panic("mutex_enter: adaptive at high PIL", lp);
3787c478bdstevel@tonic-gate		kpreempt_enable();
3797c478bdstevel@tonic-gate	}
3817c478bdstevel@tonic-gate	CPU_STATS_ADDQ(cpup, sys, mutex_adenters, 1);
3839d68b18ck	spin_time = LOCKSTAT_START_TIME(LS_MUTEX_ENTER_SPIN);
385575a742pt	backoff = mutex_lock_backoff(0);	/* set base backoff */
3867c478bdstevel@tonic-gate	for (;;) {
387575a742pt		mutex_lock_delay(backoff); /* backoff delay */
3897c478bdstevel@tonic-gate		if (panicstr)
3907c478bdstevel@tonic-gate			return;
3927c478bdstevel@tonic-gate		if ((owner = MUTEX_OWNER(vlp)) == NULL) {
393575a742pt			if (mutex_adaptive_tryenter(lp)) {
3947c478bdstevel@tonic-gate				break;
395575a742pt			}
396575a742pt			/* increase backoff only on failed attempt. */
397575a742pt			backoff = mutex_lock_backoff(backoff);
398575a742pt			changecnt++;
3997c478bdstevel@tonic-gate			continue;
400575a742pt		} else if (lastowner != owner) {
401575a742pt			lastowner = owner;
402575a742pt			backoff = mutex_lock_backoff(backoff);
403575a742pt			changecnt++;
404575a742pt		}
406575a742pt		if (changecnt >= ncpus_online) {
407575a742pt			backoff = mutex_lock_backoff(0);
408575a742pt			changecnt = 0;
4097c478bdstevel@tonic-gate		}
4117c478bdstevel@tonic-gate		if (owner == curthread)
4127c478bdstevel@tonic-gate			mutex_panic("recursive mutex_enter", lp);
4147c478bdstevel@tonic-gate		/*
4157c478bdstevel@tonic-gate		 * If lock is held but owner is not yet set, spin.
4167c478bdstevel@tonic-gate		 * (Only relevant for platforms that don't have cas.)
4177c478bdstevel@tonic-gate		 */
4187c478bdstevel@tonic-gate		if (owner == MUTEX_NO_OWNER)
4197c478bdstevel@tonic-gate			continue;
421575a742pt		if (mutex_owner_running(lp) != NULL)  {
422575a742pt			continue;
423575a742pt		}
4257c478bdstevel@tonic-gate		/*
4267c478bdstevel@tonic-gate		 * The owner appears not to be running, so block.
4277c478bdstevel@tonic-gate		 * See the Big Theory Statement for memory ordering issues.
4287c478bdstevel@tonic-gate		 */
4297c478bdstevel@tonic-gate		ts = turnstile_lookup(lp);
4307c478bdstevel@tonic-gate		MUTEX_SET_WAITERS(lp);
4317c478bdstevel@tonic-gate		membar_enter();
4337c478bdstevel@tonic-gate		/*
4347c478bdstevel@tonic-gate		 * Recheck whether owner is running after waiters bit hits
4357c478bdstevel@tonic-gate		 * global visibility (above).  If owner is running, spin.
4367c478bdstevel@tonic-gate		 */
437575a742pt		if (mutex_owner_running(lp) != NULL) {
438575a742pt			turnstile_exit(lp);
439575a742pt			continue;
440575a742pt		}
4417c478bdstevel@tonic-gate		membar_consumer();
4437c478bdstevel@tonic-gate		/*
4447c478bdstevel@tonic-gate		 * If owner and waiters bit are unchanged, block.
4457c478bdstevel@tonic-gate		 */
4467c478bdstevel@tonic-gate		if (MUTEX_OWNER(vlp) == owner && MUTEX_HAS_WAITERS(vlp)) {
4477c478bdstevel@tonic-gate			sleep_time -= gethrtime();
4487c478bdstevel@tonic-gate			(void) turnstile_block(ts, TS_WRITER_Q, lp,
4497c478bdstevel@tonic-gate			    &mutex_sobj_ops, NULL, NULL);
4507c478bdstevel@tonic-gate			sleep_time += gethrtime();
451575a742pt			/* reset backoff after turnstile */
452575a742pt			backoff = mutex_lock_backoff(0);
4537c478bdstevel@tonic-gate		} else {
4547c478bdstevel@tonic-gate			turnstile_exit(lp);
4557c478bdstevel@tonic-gate		}
4567c478bdstevel@tonic-gate	}
4587c478bdstevel@tonic-gate	ASSERT(MUTEX_OWNER(lp) == curthread);
4600efe5e5dv	if (sleep_time != 0) {
4610efe5e5dv		/*
4620efe5e5dv		 * Note, sleep time is the sum of all the sleeping we
4630efe5e5dv		 * did.
4640efe5e5dv		 */
4657c478bdstevel@tonic-gate		LOCKSTAT_RECORD(LS_MUTEX_ENTER_BLOCK, lp, sleep_time);
4667c478bdstevel@tonic-gate	}
4689d68b18ck	/* record spin time, don't count sleep time */
4699d68b18ck	if (spin_time != 0) {
4719d68b18ck		    spin_time + sleep_time);
472575a742pt	}
4747c478bdstevel@tonic-gate	LOCKSTAT_RECORD0(LS_MUTEX_ENTER_ACQUIRE, lp);
4787c478bdstevel@tonic-gate * mutex_vector_tryenter() is called from the assembly mutex_tryenter()
4797c478bdstevel@tonic-gate * routine if the lock is held or is not of type MUTEX_ADAPTIVE.
4807c478bdstevel@tonic-gate */
4827c478bdstevel@tonic-gatemutex_vector_tryenter(mutex_impl_t *lp)
4847c478bdstevel@tonic-gate	int s;
4867c478bdstevel@tonic-gate	if (MUTEX_TYPE_ADAPTIVE(lp))
4877c478bdstevel@tonic-gate		return (0);		/* we already tried in assembly */
4897c478bdstevel@tonic-gate	if (!MUTEX_TYPE_SPIN(lp)) {
4907c478bdstevel@tonic-gate		mutex_panic("mutex_tryenter: bad mutex", lp);
4917c478bdstevel@tonic-gate		return (0);
4927c478bdstevel@tonic-gate	}
4947c478bdstevel@tonic-gate	s = splr(lp->m_spin.m_minspl);
4957c478bdstevel@tonic-gate	if (lock_try(&lp->m_spin.m_spinlock)) {
4967c478bdstevel@tonic-gate		lp->m_spin.m_oldspl = (ushort_t)s;
4977c478bdstevel@tonic-gate		return (1);
4987c478bdstevel@tonic-gate	}
4997c478bdstevel@tonic-gate	splx(s);
5007c478bdstevel@tonic-gate	return (0);
5047c478bdstevel@tonic-gate * mutex_vector_exit() is called from mutex_exit() if the lock is not
5057c478bdstevel@tonic-gate * adaptive, has waiters, or is not owned by the current thread (panic).
5067c478bdstevel@tonic-gate */
5087c478bdstevel@tonic-gatemutex_vector_exit(mutex_impl_t *lp)
5107c478bdstevel@tonic-gate	turnstile_t *ts;
5127c478bdstevel@tonic-gate	if (MUTEX_TYPE_SPIN(lp)) {
5137c478bdstevel@tonic-gate		lock_clear_splx(&lp->m_spin.m_spinlock, lp->m_spin.m_oldspl);
5147c478bdstevel@tonic-gate		return;
5157c478bdstevel@tonic-gate	}
5177c478bdstevel@tonic-gate	if (MUTEX_OWNER(lp) != curthread) {
5187c478bdstevel@tonic-gate		mutex_panic("mutex_exit: not owner", lp);
5197c478bdstevel@tonic-gate		return;
5207c478bdstevel@tonic-gate	}
5227c478bdstevel@tonic-gate	ts = turnstile_lookup(lp);
5237c478bdstevel@tonic-gate	MUTEX_CLEAR_LOCK_AND_WAITERS(lp);
5247c478bdstevel@tonic-gate	if (ts == NULL)
5257c478bdstevel@tonic-gate		turnstile_exit(lp);
5267c478bdstevel@tonic-gate	else
5277c478bdstevel@tonic-gate		turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
5287c478bdstevel@tonic-gate	LOCKSTAT_RECORD0(LS_MUTEX_EXIT_RELEASE, lp);
532b5fca8ftomeemutex_owned(const kmutex_t *mp)
534b5fca8ftomee	const mutex_impl_t *lp = (const mutex_impl_t *)mp;
5361939740Sherry Moore	if (panicstr || quiesce_active)
5377c478bdstevel@tonic-gate		return (1);
5397c478bdstevel@tonic-gate	if (MUTEX_TYPE_ADAPTIVE(lp))
5407c478bdstevel@tonic-gate		return (MUTEX_OWNER(lp) == curthread);
5417c478bdstevel@tonic-gate	return (LOCK_HELD(&lp->m_spin.m_spinlock));
5447c478bdstevel@tonic-gatekthread_t *
545b5fca8ftomeemutex_owner(const kmutex_t *mp)
547b5fca8ftomee	const mutex_impl_t *lp = (const mutex_impl_t *)mp;
5487c478bdstevel@tonic-gate	kthread_id_t t;
5507c478bdstevel@tonic-gate	if (MUTEX_TYPE_ADAPTIVE(lp) && (t = MUTEX_OWNER(lp)) != MUTEX_NO_OWNER)
5517c478bdstevel@tonic-gate		return (t);
5527c478bdstevel@tonic-gate	return (NULL);
5567c478bdstevel@tonic-gate * The iblock cookie 'ibc' is the spl level associated with the lock;
5577c478bdstevel@tonic-gate * this alone determines whether the lock will be ADAPTIVE or SPIN.
5587c478bdstevel@tonic-gate *
5597c478bdstevel@tonic-gate * Adaptive mutexes created in zeroed memory do not need to call
5607c478bdstevel@tonic-gate * mutex_init() as their allocation in this fashion guarantees
5617c478bdstevel@tonic-gate * their initialization.
5627c478bdstevel@tonic-gate *   eg adaptive mutexes created as static within the BSS or allocated
5637c478bdstevel@tonic-gate *      by kmem_zalloc().
5647c478bdstevel@tonic-gate */
5657c478bdstevel@tonic-gate/* ARGSUSED */
5677c478bdstevel@tonic-gatemutex_init(kmutex_t *mp, char *name, kmutex_type_t type, void *ibc)
5697c478bdstevel@tonic-gate	mutex_impl_t *lp = (mutex_impl_t *)mp;
5717c478bdstevel@tonic-gate	ASSERT(ibc < (void *)KERNELBASE);	/* see 1215173 */
5737c478bdstevel@tonic-gate	if ((intptr_t)ibc > ipltospl(LOCK_LEVEL) && ibc < (void *)KERNELBASE) {
5747c478bdstevel@tonic-gate		ASSERT(type != MUTEX_ADAPTIVE && type != MUTEX_DEFAULT);
5757c478bdstevel@tonic-gate		MUTEX_SET_TYPE(lp, MUTEX_SPIN);
5767c478bdstevel@tonic-gate		LOCK_INIT_CLEAR(&lp->m_spin.m_spinlock);
5777c478bdstevel@tonic-gate		LOCK_INIT_HELD(&lp->m_spin.m_dummylock);
5787c478bdstevel@tonic-gate		lp->m_spin.m_minspl = (int)(intptr_t)ibc;
5797c478bdstevel@tonic-gate	} else {
5807f30f49ck#ifdef MUTEX_ALIGN
5817f30f49ck		static int misalign_cnt = 0;
5837f30f49ck		if (((uintptr_t)lp & (uintptr_t)(MUTEX_ALIGN - 1)) &&
5847f30f49ck		    (misalign_cnt < MUTEX_ALIGN_WARNINGS)) {
5857f30f49ck			/*
5867f30f49ck			 * The mutex is not aligned and may cross a cache line.
5877f30f49ck			 * This is not supported and may cause a panic.
5887f30f49ck			 * Show a warning that the mutex is not aligned
5897f30f49ck			 * and attempt to identify the origin.
5907f30f49ck			 * Unaligned mutexes are not (supposed to be)
5917f30f49ck			 * possible on SPARC.
5927f30f49ck			 */
5937f30f49ck			char *funcname;
5947f30f49ck			ulong_t offset = 0;
5967f30f49ck			funcname = modgetsymname((uintptr_t)caller(), &offset);
5977f30f49ck			cmn_err(CE_WARN, "mutex_init: %p is not %d byte "
5987f30f49ck			    "aligned; caller %s+%lx in module %s. "
5997f30f49ck			    "This is unsupported and may cause a panic. "
6007f30f49ck			    "Please report this to the kernel module supplier.",
6011f7e274ck			    (void *)lp, MUTEX_ALIGN,
6027f30f49ck			    funcname ? funcname : "unknown", offset,
6037f30f49ck			    mod_containing_pc(caller()));
6047f30f49ck			misalign_cnt++;
6057f30f49ck			if (misalign_cnt >= MUTEX_ALIGN_WARNINGS) {
6067f30f49ck				cmn_err(CE_WARN, "mutex_init: further unaligned"
6077f30f49ck				    " mutex warnings will be suppressed.");
6087f30f49ck			}
6097f30f49ck		}
6107f30f49ck#endif	/* MUTEX_ALIGN */
6117c478bdstevel@tonic-gate		ASSERT(type != MUTEX_SPIN);
6137c478bdstevel@tonic-gate		MUTEX_SET_TYPE(lp, MUTEX_ADAPTIVE);
6147c478bdstevel@tonic-gate		MUTEX_CLEAR_LOCK_AND_WAITERS(lp);
6157c478bdstevel@tonic-gate	}
6197c478bdstevel@tonic-gatemutex_destroy(kmutex_t *mp)
6217c478bdstevel@tonic-gate	mutex_impl_t *lp = (mutex_impl_t *)mp;
6237c478bdstevel@tonic-gate	if (lp->m_owner == 0 && !MUTEX_HAS_WAITERS(lp)) {
6247c478bdstevel@tonic-gate		MUTEX_DESTROY(lp);
6257c478bdstevel@tonic-gate	} else if (MUTEX_TYPE_SPIN(lp)) {
6267c478bdstevel@tonic-gate		LOCKSTAT_RECORD0(LS_MUTEX_DESTROY_RELEASE, lp);
6277c478bdstevel@tonic-gate		MUTEX_DESTROY(lp);
6287c478bdstevel@tonic-gate	} else if (MUTEX_TYPE_ADAPTIVE(lp)) {
6297c478bdstevel@tonic-gate		LOCKSTAT_RECORD0(LS_MUTEX_DESTROY_RELEASE, lp);
6307c478bdstevel@tonic-gate		if (MUTEX_OWNER(lp) != curthread)
6317c478bdstevel@tonic-gate			mutex_panic("mutex_destroy: not owner", lp);
6327c478bdstevel@tonic-gate		if (MUTEX_HAS_WAITERS(lp)) {
6337c478bdstevel@tonic-gate			turnstile_t *ts = turnstile_lookup(lp);
6347c478bdstevel@tonic-gate			turnstile_exit(lp);
6357c478bdstevel@tonic-gate			if (ts != NULL)
6367c478bdstevel@tonic-gate				mutex_panic("mutex_destroy: has waiters", lp);
6377c478bdstevel@tonic-gate		}
6387c478bdstevel@tonic-gate		MUTEX_DESTROY(lp);
6397c478bdstevel@tonic-gate	} else {
6407c478bdstevel@tonic-gate		mutex_panic("mutex_destroy: bad mutex", lp);
6417c478bdstevel@tonic-gate	}
6457c478bdstevel@tonic-gate * Simple C support for the cases where spin locks miss on the first try.
6467c478bdstevel@tonic-gate */
6487c478bdstevel@tonic-gatelock_set_spin(lock_t *lp)
650575a742pt	int loop_count = 0;
651575a742pt	uint_t backoff = 0;	/* current backoff */
6529d68b18ck	hrtime_t spin_time = 0;	/* how long we spun */
6547c478bdstevel@tonic-gate	if (panicstr)
6557c478bdstevel@tonic-gate		return;
6577c478bdstevel@tonic-gate	if (ncpus == 1)
6588793b36Nick Todd		panic("lock_set: %p lock held and only one CPU", (void *)lp);
6609d68b18ck	spin_time = LOCKSTAT_START_TIME(LS_LOCK_SET_SPIN);
6627c478bdstevel@tonic-gate	while (LOCK_HELD(lp) || !lock_spin_try(lp)) {
6637c478bdstevel@tonic-gate		if (panicstr)
6647c478bdstevel@tonic-gate			return;
665575a742pt		loop_count++;
667575a742pt		if (ncpus_online == loop_count) {
668575a742pt			backoff = mutex_lock_backoff(0);
669575a742pt			loop_count = 0;
670575a742pt		} else {
671575a742pt			backoff = mutex_lock_backoff(backoff);
6727c478bdstevel@tonic-gate		}
673575a742pt		mutex_lock_delay(backoff);
6747c478bdstevel@tonic-gate	}
6769d68b18ck	LOCKSTAT_RECORD_TIME(LS_LOCK_SET_SPIN, lp, spin_time);
6787c478bdstevel@tonic-gate	LOCKSTAT_RECORD0(LS_LOCK_SET_ACQUIRE, lp);
6827c478bdstevel@tonic-gatelock_set_spl_spin(lock_t *lp, int new_pil, ushort_t *old_pil_addr, int old_pil)
684575a742pt	int loop_count = 0;
685575a742pt	uint_t backoff = 0;	/* current backoff */
6869d68b18ck	hrtime_t spin_time = 0;	/* how long we spun */
6887c478bdstevel@tonic-gate	if (panicstr)
6897c478bdstevel@tonic-gate		return;
6917c478bdstevel@tonic-gate	if (ncpus == 1)
6928793b36Nick Todd		panic("lock_set_spl: %p lock held and only one CPU",
6938793b36Nick Todd		    (void *)lp);
6957c478bdstevel@tonic-gate	ASSERT(new_pil > LOCK_LEVEL);
6979d68b18ck	spin_time = LOCKSTAT_START_TIME(LS_LOCK_SET_SPL_SPIN);
6997c478bdstevel@tonic-gate	do {
7007c478bdstevel@tonic-gate		splx(old_pil);
7017c478bdstevel@tonic-gate		while (LOCK_HELD(lp)) {
702575a742pt			loop_count++;
7047c478bdstevel@tonic-gate			if (panicstr) {
7057c478bdstevel@tonic-gate				*old_pil_addr = (ushort_t)splr(new_pil);
7067c478bdstevel@tonic-gate				return;
7077c478bdstevel@tonic-gate			}
708575a742pt			if (ncpus_online == loop_count) {
709575a742pt				backoff = mutex_lock_backoff(0);
710575a742pt				loop_count = 0;
711e603b7dpm			} else {
712575a742pt				backoff = mutex_lock_backoff(backoff);
7137c478bdstevel@tonic-gate			}
714575a742pt			mutex_lock_delay(backoff);
7157c478bdstevel@tonic-gate		}
7167c478bdstevel@tonic-gate		old_pil = splr(new_pil);
7177c478bdstevel@tonic-gate	} while (!lock_spin_try(lp));
7197c478bdstevel@tonic-gate	*old_pil_addr = (ushort_t)old_pil;
7219d68b18ck	LOCKSTAT_RECORD_TIME(LS_LOCK_SET_SPL_SPIN, lp, spin_time);