/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2015, Joyent, Inc.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

#include "lint.h"
#include "thr_uberdata.h"
#include <sys/rtpriocntl.h>
#include <sys/sdt.h>
#include <atomic.h>

#if defined(THREAD_DEBUG)
#define	INCR32(x)	(((x) != UINT32_MAX)? (x)++ : 0)
#define	INCR(x)		((x)++)
#define	DECR(x)		((x)--)
#define	MAXINCR(m, x)	((m < ++x)? (m = x) : 0)
#else
#define	INCR32(x)
#define	INCR(x)
#define	DECR(x)
#define	MAXINCR(m, x)
#endif

/*
 * This mutex is initialized to be held by lwp#1.
 * It is used to block a thread that has returned from a mutex_lock()
 * of a LOCK_PRIO_INHERIT mutex with an unrecoverable error.
 */
mutex_t	stall_mutex = DEFAULTMUTEX;

static int shared_mutex_held(mutex_t *);
static int mutex_queuelock_adaptive(mutex_t *);
static void mutex_wakeup_all(mutex_t *);

/*
 * Lock statistics support functions.
 */
void
record_begin_hold(tdb_mutex_stats_t *msp)
{
	tdb_incr(msp->mutex_lock);
	msp->mutex_begin_hold = gethrtime();
}

hrtime_t
record_hold_time(tdb_mutex_stats_t *msp)
{
	hrtime_t now = gethrtime();

	if (msp->mutex_begin_hold)
		msp->mutex_hold_time += now - msp->mutex_begin_hold;
	msp->mutex_begin_hold = 0;
	return (now);
}
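
/*
 * Illustrative usage (a sketch, not code from this file): the two
 * statistics functions above are used as a pair around a lock hold,
 * roughly
 *
 *	record_begin_hold(msp);		(just after acquiring the mutex)
 *	...critical section...
 *	(void) record_hold_time(msp);	(just before releasing the mutex)
 *
 * so that mutex_hold_time accumulates the total time the lock was held.
 */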

/*
 * Called once at library initialization.
 */
void
mutex_setup(void)
{
	if (set_lock_byte(&stall_mutex.mutex_lockw))
		thr_panic("mutex_setup() cannot acquire stall_mutex");
	stall_mutex.mutex_owner = (uintptr_t)curthread;
}

/*
 * The default spin count of 1000 is experimentally determined.
 * On sun4u machines with any number of processors it could be raised
 * to 10,000 but that (experimentally) makes almost no difference.
 * The environment variable:
 *	_THREAD_ADAPTIVE_SPIN=count
 * can be used to override and set the count in the range [0 .. 1,000,000].
 */
int	thread_adaptive_spin = 1000;
uint_t	thread_max_spinners = 100;
int	thread_queue_verify = 0;
static	int	ncpus;

/*
 * Distinguish spinning for queue locks from spinning for regular locks.
 * We try harder to acquire queue locks by spinning.
 * The environment variable:
 *	_THREAD_QUEUE_SPIN=count
 * can be used to override and set the count in the range [0 .. 1,000,000].
 */
int	thread_queue_spin = 10000;

#define	ALL_ATTRIBUTES				\
	(LOCK_RECURSIVE | LOCK_ERRORCHECK |	\
	LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT |	\
	LOCK_ROBUST)

/*
 * 'type' can be one of USYNC_THREAD, USYNC_PROCESS, or USYNC_PROCESS_ROBUST,
 * augmented by zero or more of the flags:
 *	LOCK_RECURSIVE
 *	LOCK_ERRORCHECK
 *	LOCK_PRIO_INHERIT
 *	LOCK_PRIO_PROTECT
 *	LOCK_ROBUST
 */
#pragma weak _mutex_init = mutex_init
/* ARGSUSED2 */
int
mutex_init(mutex_t *mp, int type, void *arg)
{
	int basetype = (type & ~ALL_ATTRIBUTES);
	const pcclass_t *pccp;
	int error = 0;
	int ceil;

	if (basetype == USYNC_PROCESS_ROBUST) {
		/*
		 * USYNC_PROCESS_ROBUST is a deprecated historical type.
		 * We change it into (USYNC_PROCESS | LOCK_ROBUST) but
		 * retain the USYNC_PROCESS_ROBUST flag so we can return
		 * ELOCKUNMAPPED when necessary (only USYNC_PROCESS_ROBUST
		 * mutexes will ever draw ELOCKUNMAPPED).
		 */
		type |= (USYNC_PROCESS | LOCK_ROBUST);
		basetype = USYNC_PROCESS;
	}

	if (type & LOCK_PRIO_PROTECT)
		pccp = get_info_by_policy(SCHED_FIFO);
	if ((basetype != USYNC_THREAD && basetype != USYNC_PROCESS) ||
	    (type & (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT))
	    == (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT) ||
	    ((type & LOCK_PRIO_PROTECT) &&
	    ((ceil = *(int *)arg) < pccp->pcc_primin ||
	    ceil > pccp->pcc_primax))) {
		error = EINVAL;
	} else if (type & LOCK_ROBUST) {
		/*
		 * Callers of mutex_init() with the LOCK_ROBUST attribute
		 * are required to pass an initially all-zero mutex.
		 * Multiple calls to mutex_init() are allowed; all but
		 * the first return EBUSY.  A call to mutex_init() is
		 * allowed to make an inconsistent robust lock consistent
		 * (for historical usage, even though the proper interface
		 * for this is mutex_consistent()).  Note that we use
		 * atomic_or_16() to set the LOCK_INITED flag so as
		 * not to disturb surrounding bits (LOCK_OWNERDEAD, etc).
		 */
		if (!(mp->mutex_flag & LOCK_INITED)) {
			mp->mutex_type = (uint8_t)type;
			atomic_or_16(&mp->mutex_flag, LOCK_INITED);
			mp->mutex_magic = MUTEX_MAGIC;
		} else if (type != mp->mutex_type ||
		    ((type & LOCK_PRIO_PROTECT) && mp->mutex_ceiling != ceil)) {
			error = EINVAL;
		} else if (mutex_consistent(mp) != 0) {
			error = EBUSY;
		}
		/* register a process robust mutex with the kernel */
		if (basetype == USYNC_PROCESS)
			register_lock(mp);
	} else {
		(void) memset(mp, 0, sizeof (*mp));
		mp->mutex_type = (uint8_t)type;
		mp->mutex_flag = LOCK_INITED;
		mp->mutex_magic = MUTEX_MAGIC;
	}

	if (error == 0 && (type & LOCK_PRIO_PROTECT)) {
		mp->mutex_ceiling = ceil;
	}

	/*
	 * This should be at the beginning of the function,
	 * but for the sake of old broken applications that
	 * do not have proper alignment for their mutexes
	 * (and don't check the return code from mutex_init),
	 * we put it here, after initializing the mutex regardless.
	 */
	if (error == 0 &&
	    ((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
	    curthread->ul_misaligned == 0)
		error = EINVAL;

	return (error);
}
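
/*
 * Illustrative usage (a sketch, not code from this file): a priority-
 * ceiling mutex passes its ceiling through the 'arg' pointer, e.g.
 *
 *	int ceil = 30;
 *	mutex_t m;
 *	int error = mutex_init(&m, USYNC_THREAD | LOCK_PRIO_PROTECT, &ceil);
 *
 * The ceiling value must fall within the SCHED_FIFO priority range
 * reported by get_info_by_policy(); otherwise mutex_init() fails with
 * EINVAL, as does combining LOCK_PRIO_INHERIT with LOCK_PRIO_PROTECT.
 */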

/*
 * Delete mp from list of ceiling mutexes owned by curthread.
 * Return 1 if the head of the chain was updated.
 */
int
_ceil_mylist_del(mutex_t *mp)
{
	ulwp_t *self = curthread;
	mxchain_t **mcpp;
	mxchain_t *mcp;

	for (mcpp = &self->ul_mxchain;
	    (mcp = *mcpp) != NULL;
	    mcpp = &mcp->mxchain_next) {
		if (mcp->mxchain_mx == mp) {
			*mcpp = mcp->mxchain_next;
			lfree(mcp, sizeof (*mcp));
			return (mcpp == &self->ul_mxchain);
		}
	}
	return (0);
}

/*
 * Add mp to the list of ceiling mutexes owned by curthread.
 * Return ENOMEM if no memory could be allocated.
 */
int
_ceil_mylist_add(mutex_t *mp)
{
	ulwp_t *self = curthread;
	mxchain_t *mcp;

	if ((mcp = lmalloc(sizeof (*mcp))) == NULL)
		return (ENOMEM);
	mcp->mxchain_mx = mp;
	mcp->mxchain_next = self->ul_mxchain;
	self->ul_mxchain = mcp;
	return (0);
}

/*
 * Helper function for _ceil_prio_inherit() and _ceil_prio_waive(), below.
 */
static void
set_rt_priority(ulwp_t *self, int prio)
{
	pcparms_t pcparm;

	pcparm.pc_cid = self->ul_rtclassid;
	((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = RT_NOCHANGE;
	((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
	(void) priocntl(P_LWPID, self->ul_lwpid, PC_SETPARMS, &pcparm);
}

/*
 * Inherit priority from ceiling.
 * This changes the effective priority, not the assigned priority.
 */
void
_ceil_prio_inherit(int prio)
{
	ulwp_t *self = curthread;

	self->ul_epri = prio;
	set_rt_priority(self, prio);
}

/*
 * Waive inherited ceiling priority.  Inherit from head of owned ceiling locks
 * if holding at least one ceiling lock.  If no ceiling locks are held at this
 * point, disinherit completely, reverting back to assigned priority.
 */
void
_ceil_prio_waive(void)
{
	ulwp_t *self = curthread;
	mxchain_t *mcp = self->ul_mxchain;
	int prio;

	if (mcp == NULL) {
		prio = self->ul_pri;
		self->ul_epri = 0;
	} else {
		prio = mcp->mxchain_mx->mutex_ceiling;
		self->ul_epri = prio;
	}
	set_rt_priority(self, prio);
}
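
/*
 * Illustrative ordering (a sketch, not code from this file): the ceiling
 * helpers above are used together when locking and unlocking a
 * LOCK_PRIO_PROTECT mutex, roughly
 *
 *	(void) _ceil_mylist_add(mp);
 *	_ceil_prio_inherit(mp->mutex_ceiling);	(on acquisition)
 *	...
 *	(void) _ceil_mylist_del(mp);
 *	_ceil_prio_waive();			(on release)
 *
 * Because ul_mxchain is a LIFO list, _ceil_prio_waive() reverts to the
 * ceiling of the most recently acquired ceiling mutex still held, or to
 * the thread's assigned priority when none remain.
 */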

/*
 * Clear the lock byte.  Retain the waiters byte and the spinners byte.
 * Return the old value of the lock word.
 */
static uint32_t
clear_lockbyte(volatile uint32_t *lockword)
{
	uint32_t old;
	uint32_t new;

	do {
		old = *lockword;
		new = old & ~LOCKMASK;
	} while (atomic_cas_32(lockword, old, new) != old);

	return (old);
}

/*
 * Same as clear_lockbyte(), but operates on mutex_lockword64.
 * The mutex_ownerpid field is cleared along with the lock byte.
 */
static uint64_t
clear_lockbyte64(volatile uint64_t *lockword64)
{
	uint64_t old;
	uint64_t new;

	do {
		old = *lockword64;
		new = old & ~LOCKMASK64;
	} while (atomic_cas_64(lockword64, old, new) != old);

	return (old);
}

/*
 * Similar to set_lock_byte(), which only tries to set the lock byte.
 * Here, we attempt to set the lock byte AND the mutex_ownerpid, keeping
 * the remaining bytes constant.  This atomic operation is required for the
 * correctness of process-shared robust locks, otherwise there would be
 * a window of vulnerability in which the lock byte had been set but the
 * mutex_ownerpid had not yet been set.  If the process were to die in
 * this window of vulnerability (due to some other thread calling exit()
 * or the process receiving a fatal signal), the mutex would be left locked
 * but without a process-ID to determine which process was holding the lock.
 * The kernel would then be unable to mark the robust mutex as LOCK_OWNERDEAD
 * when the process died.  For all other cases of process-shared locks, this
 * operation is just a convenience, for the sake of common code.
 *
 * This operation requires process-shared robust locks to be properly
 * aligned on an 8-byte boundary, at least on sparc machines, lest the
 * operation incur an alignment fault.  This is automatic when locks
 * are declared properly using the mutex_t or pthread_mutex_t data types
 * and the application does not allocate dynamic memory on less than an
 * 8-byte boundary.  See the 'horrible hack' comments below for cases
 * dealing with such broken applications.
 */
static int
set_lock_byte64(volatile uint64_t *lockword64, pid_t ownerpid)
{
	uint64_t old;
	uint64_t new;

	old = *lockword64 & ~LOCKMASK64;
	new = old | ((uint64_t)(uint_t)ownerpid << PIDSHIFT) | LOCKBYTE64;
	if (atomic_cas_64(lockword64, old, new) == old)
		return (LOCKCLEAR);

	return (LOCKSET);
}

/*
 * Increment the spinners count in the mutex lock word.
 * Return 0 on success.  Return -1 if the count would overflow.
 */
static int
spinners_incr(volatile uint32_t *lockword, uint8_t max_spinners)
{
	uint32_t old;
	uint32_t new;

	do {
		old = *lockword;
		if (((old & SPINNERMASK) >> SPINNERSHIFT) >= max_spinners)
			return (-1);
		new = old + (1 << SPINNERSHIFT);
	} while (atomic_cas_32(lockword, old, new) != old);

	return (0);
}

/*
 * Decrement the spinners count in the mutex lock word.
 * Return the new value of the lock word.
 */
static uint32_t
spinners_decr(volatile uint32_t *lockword)
{
	uint32_t old;
	uint32_t new;

	do {
		new = old = *lockword;
		if (new & SPINNERMASK)
			new -= (1 << SPINNERSHIFT);
	} while (atomic_cas_32(lockword, old, new) != old);

	return (new);
}
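
/*
 * Illustrative usage (a sketch, not code from this file): the two
 * functions above bracket an adaptive spin loop, e.g.
 *
 *	if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1)
 *		goto done;	(too many spinners already; don't spin)
 *	...spin, trying to grab the lock...
 *	new_lockword = spinners_decr(&mp->mutex_lockword);
 *
 * so that the number of concurrent spinners on a mutex is bounded by
 * max_spinners (see mutex_trylock_adaptive(), below).
 */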

/*
 * Non-preemptive spin locks.  Used by queue_lock().
 * No lock statistics are gathered for these locks.
 * No DTrace probes are provided for these locks.
 */
void
spin_lock_set(mutex_t *mp)
{
	ulwp_t *self = curthread;

	no_preempt(self);
	if (set_lock_byte(&mp->mutex_lockw) == 0) {
		mp->mutex_owner = (uintptr_t)self;
		return;
	}
	/*
	 * Spin for a while, attempting to acquire the lock.
	 */
	INCR32(self->ul_spin_lock_spin);
	if (mutex_queuelock_adaptive(mp) == 0 ||
	    set_lock_byte(&mp->mutex_lockw) == 0) {
		mp->mutex_owner = (uintptr_t)self;
		return;
	}
	/*
	 * Try harder if we were previously at a no preemption level.
	 */
	if (self->ul_preempt > 1) {
		INCR32(self->ul_spin_lock_spin2);
		if (mutex_queuelock_adaptive(mp) == 0 ||
		    set_lock_byte(&mp->mutex_lockw) == 0) {
			mp->mutex_owner = (uintptr_t)self;
			return;
		}
	}
	/*
	 * Give up and block in the kernel for the mutex.
	 */
	INCR32(self->ul_spin_lock_sleep);
	(void) ___lwp_mutex_timedlock(mp, NULL, self);
}

void
spin_lock_clear(mutex_t *mp)
{
	ulwp_t *self = curthread;

	mp->mutex_owner = 0;
	if (atomic_swap_32(&mp->mutex_lockword, 0) & WAITERMASK) {
		(void) ___lwp_mutex_wakeup(mp, 0);
		INCR32(self->ul_spin_lock_wakeup);
	}
	preempt(self);
}

/*
 * Allocate the sleep queue hash table.
 */
void
queue_alloc(void)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	queue_head_t *qp;
	void *data;
	int i;

	/*
	 * No locks are needed; we call here only when single-threaded.
	 */
	ASSERT(self == udp->ulwp_one);
	ASSERT(!udp->uberflags.uf_mt);
	if ((data = mmap(NULL, 2 * QHASHSIZE * sizeof (queue_head_t),
	    PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, (off_t)0))
	    == MAP_FAILED)
		thr_panic("cannot allocate thread queue_head table");
	udp->queue_head = qp = (queue_head_t *)data;
	for (i = 0; i < 2 * QHASHSIZE; qp++, i++) {
		qp->qh_type = (i < QHASHSIZE)? MX : CV;
		qp->qh_lock.mutex_flag = LOCK_INITED;
		qp->qh_lock.mutex_magic = MUTEX_MAGIC;
		qp->qh_hlist = &qp->qh_def_root;
#if defined(THREAD_DEBUG)
		qp->qh_hlen = 1;
		qp->qh_hmax = 1;
#endif
	}
}

#if defined(THREAD_DEBUG)

/*
 * Debugging: verify correctness of a sleep queue.
 */
void
QVERIFY(queue_head_t *qp)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	queue_root_t *qrp;
	ulwp_t *ulwp;
	ulwp_t *prev;
	uint_t index;
	uint32_t cnt;
	char qtype;
	void *wchan;

	ASSERT(qp >= udp->queue_head && (qp - udp->queue_head) < 2 * QHASHSIZE);
	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
	for (cnt = 0, qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
		cnt++;
		ASSERT((qrp->qr_head != NULL && qrp->qr_tail != NULL) ||
		    (qrp->qr_head == NULL && qrp->qr_tail == NULL));
	}
	ASSERT(qp->qh_hlen == cnt && qp->qh_hmax >= cnt);
	qtype = ((qp - udp->queue_head) < QHASHSIZE)? MX : CV;
	ASSERT(qp->qh_type == qtype);
	if (!thread_queue_verify)
		return;
	/* real expensive stuff, only for _THREAD_QUEUE_VERIFY */
	for (cnt = 0, qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
		for (prev = NULL, ulwp = qrp->qr_head; ulwp != NULL;
		    prev = ulwp, ulwp = ulwp->ul_link) {
			cnt++;
			if (ulwp->ul_writer)
				ASSERT(prev == NULL || prev->ul_writer);
			ASSERT(ulwp->ul_qtype == qtype);
			ASSERT(ulwp->ul_wchan != NULL);
			ASSERT(ulwp->ul_sleepq == qp);
			wchan = ulwp->ul_wchan;
			ASSERT(qrp->qr_wchan == wchan);
			index = QUEUE_HASH(wchan, qtype);
			ASSERT(&udp->queue_head[index] == qp);
		}
		ASSERT(qrp->qr_tail == prev);
	}
	ASSERT(qp->qh_qlen == cnt);
}

#else	/* THREAD_DEBUG */

#define	QVERIFY(qp)

#endif	/* THREAD_DEBUG */

/*
 * Acquire a queue head.
 */
queue_head_t *
queue_lock(void *wchan, int qtype)
{
	uberdata_t *udp = curthread->ul_uberdata;
	queue_head_t *qp;
	queue_root_t *qrp;

	ASSERT(qtype == MX || qtype == CV);

	/*
	 * It is possible that we could be called while still single-threaded.
	 * If so, we call queue_alloc() to allocate the queue_head[] array.
	 */
	if ((qp = udp->queue_head) == NULL) {
		queue_alloc();
		qp = udp->queue_head;
	}
	qp += QUEUE_HASH(wchan, qtype);
	spin_lock_set(&qp->qh_lock);
	for (qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next)
		if (qrp->qr_wchan == wchan)
			break;
	if (qrp == NULL && qp->qh_def_root.qr_head == NULL) {
		/* the default queue root is available; use it */
		qrp = &qp->qh_def_root;
		qrp->qr_wchan = wchan;
		ASSERT(qrp->qr_next == NULL);
		ASSERT(qrp->qr_tail == NULL &&
		    qrp->qr_rtcount == 0 && qrp->qr_qlen == 0);
	}
	qp->qh_wchan = wchan;	/* valid until queue_unlock() is called */
	qp->qh_root = qrp;	/* valid until queue_unlock() is called */
	INCR32(qp->qh_lockcount);
	QVERIFY(qp);
	return (qp);
}

/*
 * Release a queue head.
 */
void
queue_unlock(queue_head_t *qp)
{
	QVERIFY(qp);
	spin_lock_clear(&qp->qh_lock);
}
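
/*
 * Illustrative usage (a sketch, not code from this file): callers
 * bracket all sleep-queue manipulation with the pair above, e.g.
 *
 *	qp = queue_lock(mp, MX);
 *	...enqueue(), dequeue(), queue_waiter(), etc. on qp...
 *	queue_unlock(qp);
 *
 * The qh_wchan and qh_root fields set by queue_lock() are valid only
 * until the matching queue_unlock().
 */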

/*
 * For rwlock queueing, we must queue writers ahead of readers of the
 * same priority.  We do this by making writers appear to have a half
 * point higher priority for purposes of priority comparisons below.
 */
#define	CMP_PRIO(ulwp)	((real_priority(ulwp) << 1) + (ulwp)->ul_writer)
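
/*
 * For example (a sketch, not code from this file): a reader and a writer
 * both at real priority 10 compare as 20 and 21 respectively, so the
 * writer sorts ahead of the reader, while a reader at priority 11
 * (comparing as 22) still sorts ahead of both.
 */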

void
enqueue(queue_head_t *qp, ulwp_t *ulwp, int force_fifo)
{
	queue_root_t *qrp;
	ulwp_t **ulwpp;
	ulwp_t *next;
	int pri = CMP_PRIO(ulwp);

	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
	ASSERT(ulwp->ul_sleepq != qp);

	if ((qrp = qp->qh_root) == NULL) {
		/* use the thread's queue root for the linkage */
		qrp = &ulwp->ul_queue_root;
		qrp->qr_next = qp->qh_hlist;
		qrp->qr_prev = NULL;
		qrp->qr_head = NULL;
		qrp->qr_tail = NULL;
		qrp->qr_wchan = qp->qh_wchan;
		qrp->qr_rtcount = 0;
		qrp->qr_qlen = 0;
		qrp->qr_qmax = 0;
		qp->qh_hlist->qr_prev = qrp;
		qp->qh_hlist = qrp;
		qp->qh_root = qrp;
		MAXINCR(qp->qh_hmax, qp->qh_hlen);
	}

	/*
	 * LIFO queue ordering is unfair and can lead to starvation,
	 * but it gives better performance for heavily contended locks.
	 * We use thread_queue_fifo (range is 0..8) to determine
	 * the frequency of FIFO vs LIFO queuing:
	 *	0 : every 256th time	(almost always LIFO)
	 *	1 : every 128th time
	 *	2 : every 64th  time
	 *	3 : every 32nd  time
	 *	4 : every 16th  time	(the default value, mostly LIFO)
	 *	5 : every 8th   time
	 *	6 : every 4th   time
	 *	7 : every 2nd   time
	 *	8 : every time		(never LIFO, always FIFO)
	 * Note that there is always some degree of FIFO ordering.
	 * This breaks live lock conditions that occur in applications
	 * that are written assuming (incorrectly) that threads acquire
	 * locks fairly, that is, in roughly round-robin order.
	 * In any event, the queue is maintained in kernel priority order.
	 *
	 * If force_fifo is non-zero, fifo queueing is forced.
	 * SUSV3 requires this for semaphores.
	 */
	if (qrp->qr_head == NULL) {
		/*
		 * The queue is empty.  LIFO/FIFO doesn't matter.
		 */
		ASSERT(qrp->qr_tail == NULL);
		ulwpp = &qrp->qr_head;
	} else if (force_fifo |
	    (((++qp->qh_qcnt << curthread->ul_queue_fifo) & 0xff) == 0)) {
		/*
		 * Enqueue after the last thread whose priority is greater
		 * than or equal to the priority of the thread being queued.
		 * Attempt first to go directly onto the tail of the queue.
		 */
		if (pri <= CMP_PRIO(qrp->qr_tail))
			ulwpp = &qrp->qr_tail->ul_link;
		else {
			for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
			    ulwpp = &next->ul_link)
				if (pri > CMP_PRIO(next))
					break;
		}
	} else {
		/*
		 * Enqueue before the first thread whose priority is less
		 * than or equal to the priority of the thread being queued.
		 * Hopefully we can go directly onto the head of the queue.
		 */
		for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
		    ulwpp = &next->ul_link)
			if (pri >= CMP_PRIO(next))
				break;
	}
	if ((ulwp->ul_link = *ulwpp) == NULL)
		qrp->qr_tail = ulwp;
	*ulwpp = ulwp;

	ulwp->ul_sleepq = qp;
	ulwp->ul_wchan = qp->qh_wchan;
	ulwp->ul_qtype = qp->qh_type;
	if ((ulwp->ul_schedctl != NULL &&
	    ulwp->ul_schedctl->sc_cid == ulwp->ul_rtclassid) |
	    ulwp->ul_pilocks) {
		ulwp->ul_rtqueued = 1;
		qrp->qr_rtcount++;
	}
	MAXINCR(qrp->qr_qmax, qrp->qr_qlen);
	MAXINCR(qp->qh_qmax, qp->qh_qlen);
}

/*
 * Helper function for queue_slot() and queue_slot_rt().
 * Try to find a non-suspended thread on the queue.
 */
static ulwp_t **
queue_slot_runnable(ulwp_t **ulwpp, ulwp_t **prevp, int rt)
{
	ulwp_t *ulwp;
	ulwp_t **foundpp = NULL;
	int priority = -1;
	ulwp_t *prev;
	int tpri;

	for (prev = NULL;
	    (ulwp = *ulwpp) != NULL;
	    prev = ulwp, ulwpp = &ulwp->ul_link) {
		if (ulwp->ul_stop)	/* skip suspended threads */
			continue;
		tpri = rt? CMP_PRIO(ulwp) : 0;
		if (tpri > priority) {
			foundpp = ulwpp;
			*prevp = prev;
			priority = tpri;
			if (!rt)
				break;
		}
	}
	return (foundpp);
}

/*
 * For real-time, we search the entire queue because the dispatch
 * (kernel) priorities may have changed since enqueueing.
 */
static ulwp_t **
queue_slot_rt(ulwp_t **ulwpp_org, ulwp_t **prevp)
{
	ulwp_t **ulwpp = ulwpp_org;
	ulwp_t *ulwp = *ulwpp;
	ulwp_t **foundpp = ulwpp;
	int priority = CMP_PRIO(ulwp);
	ulwp_t *prev;
	int tpri;

	for (prev = ulwp, ulwpp = &ulwp->ul_link;
	    (ulwp = *ulwpp) != NULL;
	    prev = ulwp, ulwpp = &ulwp->ul_link) {
		tpri = CMP_PRIO(ulwp);
		if (tpri > priority) {
			foundpp = ulwpp;
			*prevp = prev;
			priority = tpri;
		}
	}
	ulwp = *foundpp;

	/*
	 * Try not to return a suspended thread.
	 * This mimics the old libthread's behavior.
	 */
	if (ulwp->ul_stop &&
	    (ulwpp = queue_slot_runnable(ulwpp_org, prevp, 1)) != NULL) {
		foundpp = ulwpp;
		ulwp = *foundpp;
	}
	ulwp->ul_rt = 1;
	return (foundpp);
}

ulwp_t **
queue_slot(queue_head_t *qp, ulwp_t **prevp, int *more)
{
	queue_root_t *qrp;
	ulwp_t **ulwpp;
	ulwp_t *ulwp;
	int rt;

	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));

	if ((qrp = qp->qh_root) == NULL || (ulwp = qrp->qr_head) == NULL) {
		*more = 0;
		return (NULL);		/* no lwps on the queue */
	}
	rt = (qrp->qr_rtcount != 0);
	*prevp = NULL;
	if (ulwp->ul_link == NULL) {	/* only one lwp on the queue */
		*more = 0;
		ulwp->ul_rt = rt;
		return (&qrp->qr_head);
	}
	*more = 1;

	if (rt)		/* real-time queue */
		return (queue_slot_rt(&qrp->qr_head, prevp));
	/*
	 * Try not to return a suspended thread.
	 * This mimics the old libthread's behavior.
	 */
	if (ulwp->ul_stop &&
	    (ulwpp = queue_slot_runnable(&qrp->qr_head, prevp, 0)) != NULL) {
		ulwp = *ulwpp;
		ulwp->ul_rt = 0;
		return (ulwpp);
	}
	/*
	 * The common case; just pick the first thread on the queue.
	 */
	ulwp->ul_rt = 0;
	return (&qrp->qr_head);
}

/*
 * Common code for unlinking an lwp from a user-level sleep queue.
 */
void
queue_unlink(queue_head_t *qp, ulwp_t **ulwpp, ulwp_t *prev)
{
	queue_root_t *qrp = qp->qh_root;
	queue_root_t *nqrp;
	ulwp_t *ulwp = *ulwpp;
	ulwp_t *next;

	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
	ASSERT(qp->qh_wchan != NULL && ulwp->ul_wchan == qp->qh_wchan);

	DECR(qp->qh_qlen);
	DECR(qrp->qr_qlen);
	if (ulwp->ul_rtqueued) {
		ulwp->ul_rtqueued = 0;
		qrp->qr_rtcount--;
	}
	next = ulwp->ul_link;
	*ulwpp = next;
	ulwp->ul_link = NULL;
	if (qrp->qr_tail == ulwp)
		qrp->qr_tail = prev;
	if (qrp == &ulwp->ul_queue_root) {
		/*
		 * We can't continue to use the unlinked thread's
		 * queue root for the linkage.
		 */
		queue_root_t *qr_next = qrp->qr_next;
		queue_root_t *qr_prev = qrp->qr_prev;

		if (qrp->qr_tail) {
			/* switch to using the last thread's queue root */
			ASSERT(qrp->qr_qlen != 0);
			nqrp = &qrp->qr_tail->ul_queue_root;
			*nqrp = *qrp;
			if (qr_next)
				qr_next->qr_prev = nqrp;
			if (qr_prev)
				qr_prev->qr_next = nqrp;
			else
				qp->qh_hlist = nqrp;
			qp->qh_root = nqrp;
		} else {
			/* empty queue root; just delete from the hash list */
			ASSERT(qrp->qr_qlen == 0);
			if (qr_next)
				qr_next->qr_prev = qr_prev;
			if (qr_prev)
				qr_prev->qr_next = qr_next;
			else
				qp->qh_hlist = qr_next;
			qp->qh_root = NULL;
			DECR(qp->qh_hlen);
		}
	}
}

ulwp_t *
dequeue(queue_head_t *qp, int *more)
{
	ulwp_t **ulwpp;
	ulwp_t *ulwp;
	ulwp_t *prev;

	if ((ulwpp = queue_slot(qp, &prev, more)) == NULL)
		return (NULL);
	ulwp = *ulwpp;
	queue_unlink(qp, ulwpp, prev);
	ulwp->ul_sleepq = NULL;
	ulwp->ul_wchan = NULL;
	return (ulwp);
}

/*
 * Return a pointer to the highest priority thread sleeping on wchan.
 */
ulwp_t *
queue_waiter(queue_head_t *qp)
{
	ulwp_t **ulwpp;
	ulwp_t *prev;
	int more;

	if ((ulwpp = queue_slot(qp, &prev, &more)) == NULL)
		return (NULL);
	return (*ulwpp);
}

int
dequeue_self(queue_head_t *qp)
{
	ulwp_t *self = curthread;
	queue_root_t *qrp;
	ulwp_t **ulwpp;
	ulwp_t *ulwp;
	ulwp_t *prev;
	int found = 0;

	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));

	/* find self on the sleep queue */
	if ((qrp = qp->qh_root) != NULL) {
		for (prev = NULL, ulwpp = &qrp->qr_head;
		    (ulwp = *ulwpp) != NULL;
		    prev = ulwp, ulwpp = &ulwp->ul_link) {
			if (ulwp == self) {
				queue_unlink(qp, ulwpp, prev);
				self->ul_cvmutex = NULL;
				self->ul_sleepq = NULL;
				self->ul_wchan = NULL;
				found = 1;
				break;
			}
		}
	}

	if (!found)
		thr_panic("dequeue_self(): curthread not found on queue");

	return ((qrp = qp->qh_root) != NULL && qrp->qr_head != NULL);
}

/*
 * Called from call_user_handler() and _thrp_suspend() to take
 * ourself off of our sleep queue so we can grab locks.
 */
void
unsleep_self(void)
{
	ulwp_t *self = curthread;
	queue_head_t *qp;

	/*
	 * Calling enter_critical()/exit_critical() here would lead
	 * to recursion.  Just manipulate self->ul_critical directly.
	 */
	self->ul_critical++;
	while (self->ul_sleepq != NULL) {
		qp = queue_lock(self->ul_wchan, self->ul_qtype);
		/*
		 * We may have been moved from a CV queue to a
		 * mutex queue while we were attempting queue_lock().
		 * If so, just loop around and try again.
		 * dequeue_self() clears self->ul_sleepq.
		 */
		if (qp == self->ul_sleepq)
			(void) dequeue_self(qp);
		queue_unlock(qp);
	}
	self->ul_writer = 0;
	self->ul_critical--;
}

/*
 * Common code for calling the ___lwp_mutex_timedlock() system call.
 * Returns with mutex_owner and mutex_ownerpid set correctly.
 */
static int
mutex_lock_kernel(mutex_t *mp, timespec_t *tsp, tdb_mutex_stats_t *msp)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	int mtype = mp->mutex_type;
	hrtime_t begin_sleep;
	int acquired;
	int error;

	self->ul_sp = stkptr();
	self->ul_wchan = mp;
	if (__td_event_report(self, TD_SLEEP, udp)) {
		self->ul_td_evbuf.eventnum = TD_SLEEP;
		self->ul_td_evbuf.eventdata = mp;
		tdb_event(TD_SLEEP, udp);
	}
	if (msp) {
		tdb_incr(msp->mutex_sleep);
		begin_sleep = gethrtime();
	}

	DTRACE_PROBE1(plockstat, mutex__block, mp);

	for (;;) {
		/*
		 * A return value of EOWNERDEAD or ELOCKUNMAPPED
		 * means we successfully acquired the lock.
		 */
		if ((error = ___lwp_mutex_timedlock(mp, tsp, self)) != 0 &&
		    error != EOWNERDEAD && error != ELOCKUNMAPPED) {
			acquired = 0;
			break;
		}

		if (mtype & USYNC_PROCESS) {
			/*
			 * Defend against forkall().  We may be the child,
			 * in which case we don't actually own the mutex.
			 */
			enter_critical(self);
			if (mp->mutex_ownerpid == udp->pid) {
				exit_critical(self);
				acquired = 1;
				break;
			}
			exit_critical(self);
		} else {
			acquired = 1;
			break;
		}
	}

	if (msp)
		msp->mutex_sleep_time += gethrtime() - begin_sleep;
	self->ul_wchan = NULL;
	self->ul_sp = 0;

	if (acquired) {
		ASSERT(mp->mutex_owner == (uintptr_t)self);
		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
	} else {
		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
	}

	return (error);
}

/*
 * Common code for calling the ___lwp_mutex_trylock() system call.
 * Returns with mutex_owner and mutex_ownerpid set correctly.
 */
int
mutex_trylock_kernel(mutex_t *mp)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	int mtype = mp->mutex_type;
	int error;
	int acquired;

	for (;;) {
		/*
		 * A return value of EOWNERDEAD or ELOCKUNMAPPED
		 * means we successfully acquired the lock.
		 */
		if ((error = ___lwp_mutex_trylock(mp, self)) != 0 &&
		    error != EOWNERDEAD && error != ELOCKUNMAPPED) {
			acquired = 0;
			break;
		}

		if (mtype & USYNC_PROCESS) {
			/*
			 * Defend against forkall().  We may be the child,
			 * in which case we don't actually own the mutex.
			 */
			enter_critical(self);
			if (mp->mutex_ownerpid == udp->pid) {
				exit_critical(self);
				acquired = 1;
				break;
			}
			exit_critical(self);
		} else {
			acquired = 1;
			break;
		}
	}

	if (acquired) {
		ASSERT(mp->mutex_owner == (uintptr_t)self);
		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
	} else if (error != EBUSY) {
		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
	}

	return (error);
}

volatile sc_shared_t *
setup_schedctl(void)
{
	ulwp_t *self = curthread;
	volatile sc_shared_t *scp;
	sc_shared_t *tmp;

	if ((scp = self->ul_schedctl) == NULL && /* no shared state yet */
	    !self->ul_vfork &&			/* not a child of vfork() */
	    !self->ul_schedctl_called) {	/* haven't been called before */
		enter_critical(self);
		self->ul_schedctl_called = &self->ul_uberdata->uberflags;
		if ((tmp = __schedctl()) != (sc_shared_t *)(-1))
			self->ul_schedctl = scp = tmp;
		exit_critical(self);
	}
	/*
	 * Unless the call to setup_schedctl() is surrounded
	 * by enter_critical()/exit_critical(), the address
	 * we are returning could be invalid due to a forkall()
	 * having occurred in another thread.
	 */
	return (scp);
}

/*
 * Interfaces from libsched, incorporated into libc.
 * libsched.so.1 is now a filter library onto libc.
 */
#pragma weak schedctl_lookup = schedctl_init
schedctl_t *
schedctl_init(void)
{
	volatile sc_shared_t *scp = setup_schedctl();
	return ((scp == NULL)? NULL : (schedctl_t *)&scp->sc_preemptctl);
}

void
schedctl_exit(void)
{
}

/*
 * Contract private interface for java.
 * Set up the schedctl data if it doesn't exist yet.
 * Return a pointer to the pointer to the schedctl data.
 */
volatile sc_shared_t *volatile *
_thr_schedctl(void)
{
	ulwp_t *self = curthread;
	volatile sc_shared_t *volatile *ptr;

	if (self->ul_vfork)
		return (NULL);
	if (*(ptr = &self->ul_schedctl) == NULL)
		(void) setup_schedctl();
	return (ptr);
}

/*
 * Block signals and attempt to block preemption.
 * no_preempt()/preempt() must be used in pairs but can be nested.
 */
void
no_preempt(ulwp_t *self)
{
	volatile sc_shared_t *scp;

	if (self->ul_preempt++ == 0) {
		enter_critical(self);
		if ((scp = self->ul_schedctl) != NULL ||
		    (scp = setup_schedctl()) != NULL) {
			/*
			 * Save the pre-existing preempt value.
			 */
			self->ul_savpreempt = scp->sc_preemptctl.sc_nopreempt;
			scp->sc_preemptctl.sc_nopreempt = 1;
		}
	}
}

/*
 * Undo the effects of no_preempt().
 */
void
preempt(ulwp_t *self)
{
	volatile sc_shared_t *scp;

	ASSERT(self->ul_preempt > 0);
	if (--self->ul_preempt == 0) {
		if ((scp = self->ul_schedctl) != NULL) {
			/*
			 * Restore the pre-existing preempt value.
			 */
			scp->sc_preemptctl.sc_nopreempt = self->ul_savpreempt;
			if (scp->sc_preemptctl.sc_yield &&
			    scp->sc_preemptctl.sc_nopreempt == 0) {
				yield();
				if (scp->sc_preemptctl.sc_yield) {
					/*
					 * Shouldn't happen.  This is either
					 * a race condition or the thread
					 * just entered the real-time class.
					 */
					yield();
					scp->sc_preemptctl.sc_yield = 0;
				}
			}
		}
		exit_critical(self);
	}
}
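
/*
 * Illustrative usage (a sketch, not code from this file): the two
 * functions above are always used as a (possibly nested) pair, e.g.
 *
 *	no_preempt(self);
 *	...work that must not be preempted (e.g., holding a queue lock)...
 *	preempt(self);
 *
 * spin_lock_set() and spin_lock_clear(), above, are one such pair of
 * callers.
 */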
12187c478bdstevel@tonic-gate
12197c478bdstevel@tonic-gate/*
12207c478bdstevel@tonic-gate * If a call to preempt() would cause the current thread to yield or to
12217c478bdstevel@tonic-gate * take deferred actions in exit_critical(), then unpark the specified
12227c478bdstevel@tonic-gate * lwp so it can run while we delay.  Return the original lwpid if the
12237c478bdstevel@tonic-gate * unpark was not performed, else return zero.  The tests are a repeat
12247c478bdstevel@tonic-gate * of some of the tests in preempt(), above.  This is a statistical
12257c478bdstevel@tonic-gate * optimization solely for cond_sleep_queue(), below.
12267c478bdstevel@tonic-gate */
12277c478bdstevel@tonic-gatestatic lwpid_t
12287c478bdstevel@tonic-gatepreempt_unpark(ulwp_t *self, lwpid_t lwpid)
12297c478bdstevel@tonic-gate{
12307c478bdstevel@tonic-gate	volatile sc_shared_t *scp = self->ul_schedctl;
12317c478bdstevel@tonic-gate
12327c478bdstevel@tonic-gate	ASSERT(self->ul_preempt == 1 && self->ul_critical > 0);
12337c478bdstevel@tonic-gate	if ((scp != NULL && scp->sc_preemptctl.sc_yield) ||
12347c478bdstevel@tonic-gate	    (self->ul_curplease && self->ul_critical == 1)) {
12357c478bdstevel@tonic-gate		(void) __lwp_unpark(lwpid);
12367c478bdstevel@tonic-gate		lwpid = 0;
12377c478bdstevel@tonic-gate	}
12387c478bdstevel@tonic-gate	return (lwpid);
12397c478bdstevel@tonic-gate}

/*
 * Spin for a while (if 'tryhard' is true), trying to grab the lock.
 * If this fails, return EBUSY and let the caller deal with it.
 * If this succeeds, return 0 with mutex_owner set to curthread.
 */
static int
mutex_trylock_adaptive(mutex_t *mp, int tryhard)
{
	ulwp_t *self = curthread;
	int error = EBUSY;
	ulwp_t *ulwp;
	volatile sc_shared_t *scp;
	volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
	volatile uint64_t *ownerp = (volatile uint64_t *)&mp->mutex_owner;
	uint32_t new_lockword;
	int count = 0;
	int max_count;
	uint8_t max_spinners;

	ASSERT(!(mp->mutex_type & USYNC_PROCESS));

	if (MUTEX_OWNED(mp, self))
		return (EBUSY);

	enter_critical(self);

	/* short-cut, not definitive (see below) */
	if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
		ASSERT(mp->mutex_type & LOCK_ROBUST);
		error = ENOTRECOVERABLE;
		goto done;
	}

	/*
	 * Make one attempt to acquire the lock before
	 * incurring the overhead of the spin loop.
	 */
	if (set_lock_byte(lockp) == 0) {
		*ownerp = (uintptr_t)self;
		error = 0;
		goto done;
	}
	if (!tryhard)
		goto done;
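	/*
	 * Compute the spin budget: cap the number of concurrent spinners
	 * (ul_max_spinners) at one less than the number of online CPUs,
	 * and spin for at most ul_adaptive_spin iterations.  A budget of
	 * zero means we do not spin at all.
	 */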
	if (ncpus == 0)
		ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
	if ((max_spinners = self->ul_max_spinners) >= ncpus)
		max_spinners = ncpus - 1;
	max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
	if (max_count == 0)
		goto done;

	/*
	 * This spin loop is unfair to lwps that have already dropped into
	 * the kernel to sleep.  They will starve on a highly-contended mutex.
	 * This is just too bad.  The adaptive spin algorithm is intended
	 * to allow programs with highly-contended locks (that is, broken
	 * programs) to execute with reasonable speed despite their contention.
	 * Being fair would reduce the speed of such programs and well-written
	 * programs will not suffer in any case.
	 */
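	/*
	 * Record ourselves as a spinner.  If the limit on concurrent
	 * spinners for this mutex has already been reached, skip the
	 * spin loop entirely and let the caller deal with the EBUSY.
	 */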
	if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1)
		goto done;
	DTRACE_PROBE1(plockstat, mutex__spin, mp);
	for (count = 1; ; count++) {
		if (*lockp == 0 && set_lock_byte(lockp) == 0) {
			*ownerp = (uintptr_t)self;
			error = 0;
			break;
		}
		if (count == max_count)
			break;
		SMT_PAUSE();
		/*
		 * Stop spinning if the mutex owner is not running on
		 * a processor; it will not drop the lock any time soon
		 * and we would just be wasting time to keep spinning.
		 *
		 * Note that we are looking at another thread (ulwp_t)
		 * without ensuring that the other thread does not exit.
		 * The scheme relies on ulwp_t structures never being
		 * deallocated by the library (the library employs a free
		 * list of ulwp_t structs that are reused when new threads
		 * are created) and on schedctl shared memory never being
		 * deallocated once created via __schedctl().
		 *
		 * Thus, the worst that can happen when the spinning thread
		 * looks at the owner's schedctl data is that it is looking
		 * at some other thread's schedctl data.  This almost never
		 * happens and is benign when it does.
		 */
		if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
		    ((scp = ulwp->ul_schedctl) == NULL ||
		    scp->sc_state != SC_ONPROC))
			break;
	}
	new_lockword = spinners_decr(&mp->mutex_lockword);
	if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
		/*
		 * We haven't yet acquired the lock, the lock
		 * is free, and there are no other spinners.
		 * Make one final attempt to acquire the lock.
		 *
		 * This isn't strictly necessary since mutex_lock_queue()
		 * (the next action this thread will take if it doesn't
		 * acquire the lock here) makes one attempt to acquire
		 * the lock before putting the thread to sleep.
		 *
		 * If the next action for this thread (on failure here)
		 * were not to call mutex_lock_queue(), this would be
		 * necessary for correctness, to avoid ending up with an
		 * unheld mutex with waiters but no one to wake them up.
		 */
		if (set_lock_byte(lockp) == 0) {
			*ownerp = (uintptr_t)self;
			error = 0;
		}
		count++;
	}

done:
	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
		ASSERT(mp->mutex_type & LOCK_ROBUST);
		/*
		 * We shouldn't own the mutex.
		 * Just clear the lock; everyone has already been woken up.
		 */
		*ownerp = 0;
		(void) clear_lockbyte(&mp->mutex_lockword);
		error = ENOTRECOVERABLE;
	}

	exit_critical(self);

	if (error) {
		if (count) {
			DTRACE_PROBE3(plockstat, mutex__spun, mp, 0, count);
		}
		if (error != EBUSY) {
			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
		}
	} else {
		if (count) {
			DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
		}
		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
		if (mp->mutex_flag & LOCK_OWNERDEAD) {
			ASSERT(mp->mutex_type & LOCK_ROBUST);
			error = EOWNERDEAD;
		}
	}

	return (error);
}
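
/*
 * Hedged sketch of how the adaptive trylock is expected to be driven by
 * the acquiring path elsewhere in this file (names and bookkeeping
 * simplified; mutex_lock_queue() is the sleeping slow path):
 *
 *	if ((error = mutex_trylock_adaptive(mp, tryhard)) == EBUSY)
 *		error = mutex_lock_queue(self, msp, mp, tsp);
 *
 * Spinning is purely an optimization; an EBUSY return simply means the
 * thread will queue up and sleep in the kernel.
 */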

/*
 * Same as mutex_trylock_adaptive(), except specifically for queue locks.
 * The owner field is not set here; the caller (spin_lock_set()) sets it.
 */
static int
mutex_queuelock_adaptive(mutex_t *mp)
{
	ulwp_t *ulwp;
	volatile sc_shared_t *scp;
	volatile uint8_t *lockp;
	volatile uint64_t *ownerp;
	int count = curthread->ul_queue_spin;

	ASSERT(mp->mutex_type == USYNC_THREAD);

	if (count == 0)
		return (EBUSY);

	lockp = (volatile uint8_t *)&mp->mutex_lockw;
	ownerp = (volatile uint64_t *)&mp->mutex_owner;
	while (--count >= 0) {
		if (*lockp == 0 && set_lock_byte(lockp) == 0)
			return (0);
		SMT_PAUSE();
		if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
		    ((scp = ulwp->ul_schedctl) == NULL ||
		    scp->sc_state != SC_ONPROC))
			break;
	}

	return (EBUSY);
}
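
/*
 * Note: unlike mutex_trylock_adaptive(), there is no spinner or waiter
 * bookkeeping above.  Queue locks guard libc's internal sleep queues and
 * are expected to be held only briefly, so a short bounded spin of
 * ul_queue_spin iterations is enough; on EBUSY the caller deals with it.
 */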

/*
 * Like mutex_trylock_adaptive(), but for process-shared mutexes.
 * Spin for a while (if 'tryhard' is true), trying to grab the lock.
 * If this fails, return EBUSY and let the caller deal with it.
 * If this succeeds, return 0 with mutex_owner set to curthread
 * and mutex_ownerpid set to the current pid.
 */
static int
mutex_trylock_process(mutex_t *mp, int tryhard)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	int error = EBUSY;
	volatile uint64_t *lockp = (volatile uint64_t *)&mp->mutex_lockword64;
	uint32_t new_lockword;
	int count = 0;
	int max_count;
	uint8_t max_spinners;

#if defined(__sparc) && !defined(_LP64)
	/* horrible hack, necessary only on 32-bit sparc */
	int fix_alignment_problem =
	    (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
	    self->ul_misaligned && !(mp->mutex_type & LOCK_ROBUST));
#endif
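	/*
	 * fix_alignment_problem (above) covers the case where an
	 * application hands us a process-shared mutex that is not
	 * 8-byte aligned.  The 64-bit lockword operations used below
	 * (set_lock_byte64()) presumably require natural alignment on
	 * 32-bit sparc, so such misaligned, non-robust mutexes fall
	 * back to the byte-sized lock plus an explicit store of
	 * mutex_ownerpid.
	 */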
14547c5714fraf
1455883492draf	ASSERT(mp->mutex_type & USYNC_PROCESS);
14567c478bdstevel@tonic-gate
1457883492draf	if (shared_mutex_held(mp))
14587c478bdstevel@tonic-gate		return (EBUSY);
14597c478bdstevel@tonic-gate
1460328cc3eRoger A. Faulkner	enter_critical(self);
1461328cc3eRoger A. Faulkner
1462883492draf	/* short-cut, not definitive (see below) */
1463883492draf	if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
1464883492draf		ASSERT(mp->mutex_type & LOCK_ROBUST);
14655d1dd9araf		error = ENOTRECOVERABLE;
14665d1dd9araf		goto done;
1467883492draf	}
1468883492draf
14695d1dd9araf	/*
14705d1dd9araf	 * Make one attempt to acquire the lock before
14715d1dd9araf	 * incurring the overhead of the spin loop.
14725d1dd9araf	 */
14737c5714fraf#if defined(__sparc) && !defined(_LP64)
14747c5714fraf	/* horrible hack, necessary only on 32-bit sparc */
14757c5714fraf	if (fix_alignment_problem) {
14767c5714fraf		if (set_lock_byte(&mp->mutex_lockw) == 0) {
14777c5714fraf			mp->mutex_ownerpid = udp->pid;
14787c5714fraf			mp->mutex_owner = (uintptr_t)self;
14797c5714fraf			error = 0;
14807c5714fraf			goto done;
14817c5714fraf		}
14827c5714fraf	} else
14837c5714fraf#endif
148431db3c2raf	if (set_lock_byte64(lockp, udp->pid) == 0) {
14855d1dd9araf		mp->mutex_owner = (uintptr_t)self;
148631db3c2raf		/* mp->mutex_ownerpid was set by set_lock_byte64() */
14875d1dd9araf		error = 0;
14885d1dd9araf		goto done;
14895d1dd9araf	}
14905d1dd9araf	if (!tryhard)
14915d1dd9araf		goto done;
1492883492draf	if (ncpus == 0)
1493883492draf		ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
14945d1dd9araf	if ((max_spinners = self->ul_max_spinners) >= ncpus)
14955d1dd9araf		max_spinners = ncpus - 1;
14965d1dd9araf	max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;