xref: /illumos-gate/usr/src/uts/common/vm/page_lock.c (revision 7c478bd9)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - page locking primitives
 */
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/lockstat.h>
#include <sys/condvar_impl.h>
#include <vm/page.h>
#include <vm/seg_enum.h>
#include <vm/vm_dep.h>

/*
 * This global mutex is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 */
kmutex_t page_llock;

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 */
kmutex_t  page_freelock;

/*
 * The hash table, page_hash[], the p_selock fields, and the
 * list of pages associated with vnodes are protected by arrays of mutexes.
 *
 * Unless the hashes are changed radically, the table sizes must be
 * a power of two.  Also, we typically need more mutexes for the
 * vnodes since these locks are occasionally held for long periods.
 * And since there seem to be two special vnodes (kvp and swapvp),
 * we make room for private mutexes for them.
 *
 * The pse_mutex[] array holds the mutexes to protect the p_selock
 * fields of all page_t structures.
 *
 * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
 * when given a pointer to a page_t.
 *
 * PSE_TABLE_SIZE must be a power of two.  One could argue that we
 * should go to the trouble of setting it up at run time and base it
 * on memory size rather than the number of compile time CPUs.
 *
 * XX64	We should be using physmem size to calculate PSE_TABLE_SIZE,
 *	PSE_SHIFT, PIO_SHIFT.
 *
 *	These might break in 64 bit world.
 */
#define	PSE_SHIFT	7		/* log2(PSE_TABLE_SIZE) */

#define	PSE_TABLE_SIZE	128		/* number of mutexes to have */

#define	PIO_SHIFT	PSE_SHIFT	/* next power of 2 bigger than page_t */
#define	PIO_TABLE_SIZE	PSE_TABLE_SIZE	/* number of io mutexes to have */

pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
pad_mutex_t	pse_mutex[PSE_TABLE_SIZE];
kmutex_t	pio_mutex[PIO_TABLE_SIZE];

#define	PAGE_SE_MUTEX(pp) \
	    &pse_mutex[((((uintptr_t)(pp) >> PSE_SHIFT) ^ \
		((uintptr_t)(pp) >> (PSE_SHIFT << 1))) & \
		(PSE_TABLE_SIZE - 1))].pad_mutex

#define	PAGE_IO_MUTEX(pp) \
	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
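
/*
 * Illustrative sketch: PAGE_SE_MUTEX() folds two shifted copies of the
 * page_t address together so that page structures adjacent in memory
 * spread across the table.  With PSE_SHIFT == 7, a hypothetical page_t
 * at address 0x70001234 would index slot
 *
 *	((0x70001234 >> 7) ^ (0x70001234 >> 14)) & (PSE_TABLE_SIZE - 1)
 *
 * The address is made up for illustration; any pointer hashes the
 * same way.
 */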

#define	PSZC_MTX_TABLE_SIZE	128
#define	PSZC_MTX_TABLE_SHIFT	7

static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];

#define	PAGE_SZC_MUTEX(_pp) \
	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex

/*
 * The vph_mutex[] array holds the mutexes to protect the vnode chains
 * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
 * and p_vpnext).
 *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
 *
 * The VP_HASH_FUNC returns the index into the vph_mutex array given
 * an address of a vnode.
 */

/*
 * XX64	VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
 *	Need to review again.
 */
#define	VPH_TABLE_SIZE	(2 << VP_SHIFT)

#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	    ((uintptr_t)(vp) >> 8) + \
	    ((uintptr_t)(vp) >> 10) + \
	    ((uintptr_t)(vp) >> 12)) \
	    & (VPH_TABLE_SIZE - 1))

extern	struct vnode	kvp;

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];
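
/*
 * Layout sketch (illustrative): vph_mutex[] has VPH_TABLE_SIZE hashed
 * slots plus two private slots at the end, matching the two special
 * vnodes noted above:
 *
 *	vph_mutex[0 .. VPH_TABLE_SIZE - 1]	VP_HASH_FUNC(vp) slots
 *	vph_mutex[VPH_TABLE_SIZE + 0]		kvp (see page_vnode_mutex())
 *	vph_mutex[VPH_TABLE_SIZE + 1]		reserved for the swap vnode
 */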

/*
 * Initialize the locks used by the Virtual Memory Management system.
 */
void
page_lock_init()
{
}

/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.  If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1
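
/*
 * Worked example (illustrative): on a 32-bit kernel, with INT_MIN being
 * 0x80000000 and SE_EWANTED the second highest bit (0x40000000), the
 * SE_WRITER expression reduces to
 *
 *	owner = ((uint32_t)curthread | 0x80000000) & ~0x40000000;
 *
 * so the comment's example thread ids collide as described:
 *
 *	0x2faced00 -> 0xafaced00
 *	0xafaced00 -> 0xafaced00
 */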

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function. The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define	SE_DELETED	(1 | INT_MIN)

#ifdef VM_STATS
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;

uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

uint_t	page_trylock_locked;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */

/*
 * Acquire the "shared/exclusive" lock on a page.
 *
 * Returns 1 on success and locks the page appropriately.
 *	   0 on failure and does not lock the page.
 *
 * If `lock' is non-NULL, it will be dropped and reacquired in the
 * failure case.  This routine can block, and if it does
 * it will always return a failure since the page identity [vp, off]
 * or state may have changed.
 */

int
page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
{
	return (page_lock_es(pp, se, lock, reclaim, 0));
}
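
/*
 * Usage sketch (illustrative, not a caller from this file): a lookup
 * path that found `pp' while holding some hash mutex `phm' passes that
 * mutex in so page_lock() can drop it before blocking:
 *
 *	mutex_enter(phm);
 *	pp = (lookup under phm);
 *	if (!page_lock(pp, SE_SHARED, phm, P_RECLAIM)) {
 *		mutex_exit(phm);
 *		(we blocked; identity may have changed, so retry)
 *	}
 *	mutex_exit(phm);
 */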

/*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may prevent shared-lock
 * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
 * In this case, when an exclusive lock cannot be acquired, p_selock's
 * SE_EWANTED bit is set.
 * This bit, along with the se and es parameters, is used to decide
 * if the requested lock should be granted:
 *
 * Lock wanted  SE_EXCL_WANTED  p_selock/SE_EWANTED  Action
 * -----------  --------------  -------------------  ------------------------
 * SE_EXCL      no              don't care/1         deny lock
 * SE_EXCL      any (see note)  unlocked/any         grant, clear SE_EWANTED
 * SE_EXCL      yes             any lock/any         deny, set SE_EWANTED
 * SE_EXCL      no              any lock/any         deny
 * SE_SHARED    not applicable  shared/0             grant
 * SE_SHARED    not applicable  unlocked/0           grant
 * SE_SHARED    not applicable  shared/1             deny
 * SE_SHARED    not applicable  unlocked/1           deny
 * SE_SHARED    not applicable  excl/any             deny
 *
 * Note: the code grants an exclusive lock to the caller and clears
 * SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 * bit's value.  This was deemed acceptable as we are not concerned about
 * exclusive-lock starvation.  If this ever becomes an issue, a priority or
 * FIFO mechanism should also be implemented.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	/*
	 * Current uses of 'es':
	 * es == 1: page_lookup_create will attempt page relocation
	 * es == SE_EXCL_WANTED: caller wants SE_EWANTED set (e.g. the
	 * delete memory thread); this prevents reader-starvation of
	 * waiting writer thread(s).
	 */

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es == SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {
		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, so force the upgrade now.
			 * Again, if the page is no longer free we will fail
			 * to acquire p_selock and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if ((es != SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es == SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		retval = 0;
		if (pp->p_selock >= 0) {
			/* readers are not allowed when excl wanted */
			if (!(pp->p_selock & SE_EWANTED)) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured; since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation), we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (i.e., it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE: page_reclaim() releases the page lock (p_selock)
		 *	if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}

/*
 * Clear the SE_EWANTED bit from p_selock.  This function allows
 * callers of page_lock_es and page_try_reclaim_lock to clear
 * their setting of this bit if they decide they no longer wish
 * to gain exclusive access to the page.  Currently only
 * delete_memory_thread uses this when the delete memory
 * operation is cancelled.
 */
void
page_lock_clr_exclwanted(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	pp->p_selock &= ~SE_EWANTED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

/*
 * Read the comments inside of page_lock_es() carefully.
 *
 * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 * This is used by threads subject to reader-starvation (e.g. memory delete).
 *
 * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 * it is expected that it will retry at a later time.  Threads that will
 * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 * the bit is cleared.)
 */
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es == SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (se == SE_SHARED && es == 1 && old == 0) {
		se = SE_EXCL;
	}

	if (se == SE_SHARED) {
		if (!PP_ISFREE(pp)) {
			if (old >= 0) {
				/* readers are not allowed when excl wanted */
				if (!(old & SE_EWANTED)) {
					pp->p_selock = old + SE_READER;
					mutex_exit(pse);
					return (1);
				}
			}
			mutex_exit(pse);
			return (0);
		}
		/*
		 * The page is free, so we really want SE_EXCL (below)
		 */
		VM_STAT_ADD(page_try_reclaim_upgrade);
	}

	/*
	 * The caller wants a writer lock.  We try for it only if
	 * SE_EWANTED is not set, or if the caller specified
	 * SE_EXCL_WANTED.
	 */
	if (!(old & SE_EWANTED) || (es == SE_EXCL_WANTED)) {
		if ((old & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears out our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	if (es == SE_EXCL_WANTED) {
		/* page is locked, set the SE_EWANTED bit */
		pp->p_selock |= SE_EWANTED;
	}
	mutex_exit(pse);
	return (0);
}
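
/*
 * Usage sketch (illustrative): a reader-starvation-prone caller, such
 * as the memory delete code described above, typically retries this
 * routine and clears SE_EWANTED only if it gives up:
 *
 *	if (!page_try_reclaim_lock(pp, SE_EXCL, SE_EXCL_WANTED)) {
 *		(SE_EWANTED is now set; new readers are refused)
 *		if (cancelled)
 *			page_lock_clr_exclwanted(pp);
 *		else
 *			(retry later)
 *	}
 */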

/*
 * Acquire a page's "shared/exclusive" lock, but never block.
 * Returns 1 on success, 0 on failure.
 */
int
page_trylock(page_t *pp, se_t se)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (pp->p_selock & SE_EWANTED) {
		/* fail if a thread wants exclusive access */
		mutex_exit(pse);
		return (0);
	}

	if (se == SE_EXCL) {
		if (pp->p_selock == 0) {
			THREAD_KPRI_REQUEST();
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	} else {
		if (pp->p_selock >= 0) {
			pp->p_selock += SE_READER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}
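
/*
 * Usage sketch (illustrative): page_trylock() suits callers that
 * already hold one page lock, where blocking on a second page could
 * deadlock, e.g. when locking the constituent pages of a large page:
 *
 *	if (!page_trylock(tpp, SE_EXCL)) {
 *		(back out: unlock the pages already taken and retry)
 *	}
 */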

/*
 * Release the page's "shared/exclusive" lock and wake up anyone
 * who might be waiting for it.
 */
void
page_unlock(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);
	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock: page %p is deleted", pp);
	} else if (old < 0) {
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock: page %p is not locked", pp);
	}
	mutex_exit(pse);
}

/*
 * Try to upgrade the lock on the page from a "shared" to an
 * "exclusive" lock.  Since this upgrade operation is done while
 * holding the mutex protecting this page, no one else can acquire this page's
 * lock and change the page.  Thus, it is safe to drop the "shared"
 * lock and attempt to acquire the "exclusive" lock.
 *
 * Returns 1 on success, 0 on failure.
 */
int
page_tryupgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (!(pp->p_selock & SE_EWANTED)) {
		/* no threads want exclusive access, try upgrade */
		if (pp->p_selock == SE_READER) {
			THREAD_KPRI_REQUEST();
			/* convert to exclusive lock */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}
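
/*
 * Usage sketch (illustrative): a holder of a shared lock that finds it
 * needs exclusive access tries an in-place upgrade first, and only
 * falls back to drop-and-relock when the upgrade is refused:
 *
 *	if (!page_tryupgrade(pp)) {
 *		page_unlock(pp);
 *		if (!page_lock(pp, SE_EXCL, NULL, P_NO_RECLAIM)) {
 *			(we blocked; the identity may have changed)
 *		}
 *	}
 */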

/*
 * Downgrade the "exclusive" lock on the page to a "shared" lock
 * while holding the mutex protecting this page's p_selock field.
 */
void
page_downgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	int excl_waiting;

	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
	ASSERT(PAGE_EXCL(pp));

	mutex_enter(pse);
	excl_waiting = pp->p_selock & SE_EWANTED;
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_READER | excl_waiting;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

void
page_lock_delete(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_vnode == NULL);
	ASSERT(pp->p_offset == (u_offset_t)-1);
	ASSERT(!PP_ISFREE(pp));

	mutex_enter(pse);
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_DELETED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

/*
 * Implement the io lock for pages
 */
void
page_iolock_init(page_t *pp)
{
	pp->p_iolock_state = 0;
	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
}

/*
 * Acquire the i/o lock on a page.
 */
void
page_io_lock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Release the i/o lock on a page.
 */
void
page_io_unlock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	cv_signal(&pp->p_io_cv);
	pp->p_iolock_state &= ~PAGE_IO_INUSE;
	mutex_exit(pio);
}
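
/*
 * Usage sketch (illustrative): the i/o lock serializes physical i/o on
 * a page independently of p_selock.  A pageout-style caller might do:
 *
 *	page_io_lock(pp);	(blocks while another i/o is in flight)
 *	(issue and wait for the i/o)
 *	page_io_unlock(pp);	(wakes a waiter via cv_signal)
 */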

/*
 * Try to acquire the i/o lock on a page without blocking.
 * Returns 1 on success, 0 on failure.
 */
int
page_io_trylock(page_t *pp)
{
	kmutex_t *pio;

	if (pp->p_iolock_state & PAGE_IO_INUSE)
		return (0);

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);

	if (pp->p_iolock_state & PAGE_IO_INUSE) {
		mutex_exit(pio);
		return (0);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);

	return (1);
}

/*
 * Assert that the i/o lock on a page is held.
 * Returns 1 on success, 0 on failure.
 */
int
page_iolock_assert(page_t *pp)
{
	return (pp->p_iolock_state & PAGE_IO_INUSE);
}

/*
 * Wrapper exported to kernel routines that are built
 * platform-independent (the macro is platform-dependent;
 * the size of vph_mutex[] is based on NCPU).
 *
 * Note that you can do stress testing on this by setting the
 * variable page_vnode_mutex_stress to something other than
 * zero in a DEBUG kernel in a debugger after loading the kernel.
 * Setting it after the kernel is running may not work correctly.
 */
#ifdef DEBUG
static int page_vnode_mutex_stress = 0;
#endif

kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
	if (vp == &kvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 0]);
#ifdef DEBUG
	if (page_vnode_mutex_stress != 0)
		return (&vph_mutex[0]);
#endif

	return (&vph_mutex[VP_HASH_FUNC(vp)]);
}

kmutex_t *
page_se_mutex(page_t *pp)
{
	return (PAGE_SE_MUTEX(pp));
}

#ifdef VM_STATS
uint_t pszclck_stat[4];
#endif
/*
 * Find, take and return a mutex held by hat_page_demote().
 * Called by page_demote_vp_pages() before hat_page_demote() call and by
 * routines that want to block hat_page_demote() but can't do it
 * via locking all constituent pages.
 *
 * Return NULL if p_szc is 0.
 *
 * It should only be used for pages that can be demoted by hat_page_demote(),
 * i.e. non swapfs file system pages.  The logic here is lifted from
 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 * since the page is locked and not free.
 *
 * Hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the location
 * of the root, this routine relies on the fact that hat_page_demote() changes
 * the root last.
 *
 * If NULL is returned pp's p_szc is guaranteed to be 0.  If non-NULL is
 * returned pp's p_szc may be any value.
 */
kmutex_t *
page_szc_lock(page_t *pp)
{
	kmutex_t	*mtx;
	page_t		*rootpp;
	uint_t		szc;
	uint_t		rszc;
	uint_t		pszc = pp->p_szc;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_vnode != NULL);
	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
	ASSERT(pp->p_vnode != &kvp);

again:
	if (pszc == 0) {
		VM_STAT_ADD(pszclck_stat[0]);
		return (NULL);
	}

	/* The lock lives in the root page */

	rootpp = PP_GROUPLEADER(pp, pszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);

	/*
	 * Since p_szc can only decrease if pp == rootpp, rootpp will
	 * always be the same, i.e. we have the right root regardless of
	 * rootpp->p_szc.
	 * If the location of pp's root didn't change after we took the
	 * lock, we have the right root.  Return the mutex hashed off it.
	 */
	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
		VM_STAT_ADD(pszclck_stat[1]);
		return (mtx);
	}

	/*
	 * Root location changed because the page got demoted.
	 * Locate the new root.
	 */
	if (rszc < pszc) {
		szc = pp->p_szc;
		ASSERT(szc < pszc);
		mutex_exit(mtx);
		pszc = szc;
		VM_STAT_ADD(pszclck_stat[2]);
		goto again;
	}

	VM_STAT_ADD(pszclck_stat[3]);
	/*
	 * The current hat_page_demote() is not done yet.
	 * Wait for it to finish.
	 */
	mutex_exit(mtx);
	rootpp = PP_GROUPLEADER(rootpp, rszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);
	mutex_exit(mtx);
	ASSERT(rootpp->p_szc < rszc);
	goto again;
}
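
/*
 * Usage sketch (illustrative): a caller that must keep hat_page_demote()
 * from changing p_szc takes the lock around the p_szc-dependent work,
 * remembering that NULL means the page is already a small page:
 *
 *	mtx = page_szc_lock(pp);
 *	(work that relies on pp's large-page size staying intact)
 *	if (mtx != NULL)
 *		mutex_exit(mtx);
 */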

int
page_szc_lock_assert(page_t *pp)
{
	page_t *rootpp = PP_PAGEROOT(pp);
	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);

	return (MUTEX_HELD(mtx));
}