xref: /illumos-gate/usr/src/uts/common/vm/page_lock.c (revision 903a11eb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * VM - page locking primitives
30  */
31 #include <sys/param.h>
32 #include <sys/t_lock.h>
33 #include <sys/vtrace.h>
34 #include <sys/debug.h>
35 #include <sys/cmn_err.h>
36 #include <sys/vnode.h>
37 #include <sys/bitmap.h>
38 #include <sys/lockstat.h>
39 #include <sys/sysmacros.h>
40 #include <sys/condvar_impl.h>
41 #include <vm/page.h>
42 #include <vm/seg_enum.h>
43 #include <vm/vm_dep.h>
44 
/*
 * This global mutex is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 *
 * Only the definition lives here; none of the routines visible in this
 * file acquire it.
 */
kmutex_t page_llock;

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 *
 * As with page_llock, this file only provides the definition.
 */
kmutex_t  page_freelock;
61 
62 /*
63  * The hash table, page_hash[], the p_selock fields, and the
64  * list of pages associated with vnodes are protected by arrays of mutexes.
65  *
66  * Unless the hashes are changed radically, the table sizes must be
67  * a power of two.  Also, we typically need more mutexes for the
68  * vnodes since these locks are occasionally held for long periods.
69  * And since there seem to be two special vnodes (kvp and swapvp),
70  * we make room for private mutexes for them.
71  *
72  * The pse_mutex[] array holds the mutexes to protect the p_selock
73  * fields of all page_t structures.
74  *
75  * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
76  * when given a pointer to a page_t.
77  *
78  * PIO_TABLE_SIZE must be a power of two.  One could argue that we
79  * should go to the trouble of setting it up at run time and base it
80  * on memory size rather than the number of compile time CPUs.
81  *
82  * XX64	We should be using physmem size to calculate PIO_SHIFT.
83  *
84  *	These might break in 64 bit world.
85  */
86 #define	PIO_SHIFT	7	/* log2(sizeof(page_t)) */
87 #define	PIO_TABLE_SIZE	128	/* number of io mutexes to have */
88 
89 pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
90 kmutex_t	pio_mutex[PIO_TABLE_SIZE];
91 
92 #define	PAGE_IO_MUTEX(pp) \
93 	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
94 
95 /*
96  * The pse_mutex[] array is allocated in the platform startup code
97  * based on the size of the machine at startup.
98  */
99 extern pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
100 extern size_t pse_table_size;		/* Number of mutexes in pse_mutex[] */
101 extern int pse_shift;			/* log2(pse_table_size) */
102 #define	PAGE_SE_MUTEX(pp)	&pse_mutex[				\
103 	((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) &	\
104 	(pse_table_size - 1)].pad_mutex
105 
106 #define	PSZC_MTX_TABLE_SIZE	128
107 #define	PSZC_MTX_TABLE_SHIFT	7
108 
109 static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];
110 
111 #define	PAGE_SZC_MUTEX(_pp) \
112 	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
113 		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
114 		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
115 		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex
116 
/*
 * The vph_mutex[] array holds the mutexes to protect the vnode chains,
 * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
 * and p_vpnext).
 *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that some vnodes (kvp and zvp in page_vnode_mutex()) are
 * referenced frequently enough to warrant their own mutexes.
 *
 * The VP_HASH_FUNC returns the index into the vph_mutex array given
 * an address of a vnode.
 */

/*
 * XX64	VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
 *	Need to review again.
 */
#if defined(_LP64)
#define	VPH_TABLE_SIZE  (1 << (VP_SHIFT + 3))
#else	/* 32 bits */
#define	VPH_TABLE_SIZE	(2 << VP_SHIFT)
#endif

/*
 * Sum four differently shifted copies of the vnode address and mask the
 * result down to a valid index in [0, VPH_TABLE_SIZE).  The mask relies on
 * VPH_TABLE_SIZE being a power of two.
 */
#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	    ((uintptr_t)(vp) >> 8) + \
	    ((uintptr_t)(vp) >> 10) + \
	    ((uintptr_t)(vp) >> 12)) \
	    & (VPH_TABLE_SIZE - 1))

extern	struct vnode	kvp;

/*
 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
 * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
 * VPH_TABLE_SIZE + 1.
 */

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];
157 
/*
 * Initialize the locks used by the Virtual Memory Management system.
 *
 * There is currently nothing to do here: the body is intentionally empty
 * and the routine exists only as a hook for callers.  It is declared with
 * an explicit (void) parameter list so that it is a proper prototype and
 * calls passing arguments are diagnosed by the compiler.
 */
void
page_lock_init(void)
{
}
165 
166 /*
167  * Return a value for pse_shift based on npg (the number of physical pages)
168  * and ncpu (the maximum number of CPUs).  This is called by platform startup
169  * code.
170  *
171  * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
172  * locks grew approximately as the square of the number of threads executing.
173  * So the primary scaling factor used is NCPU^2.  The size of the machine in
174  * megabytes is used as an upper bound, particularly for sun4v machines which
175  * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
176  * (128) is used as a minimum.  Since the size of the table has to be a power
177  * of two, the calculated size is rounded up to the next power of two.
178  */
179 /*ARGSUSED*/
180 int
181 size_pse_array(pgcnt_t npg, int ncpu)
182 {
183 	size_t size;
184 	pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;
185 
186 	size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
187 	size += (1 << (highbit(size) - 1)) - 1;
188 	return (highbit(size) - 1);
189 }
190 
/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.  If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 *
 * SE_WRITER is the value stored in p_selock while the page is held
 * exclusively: it always has the INT_MIN (sign) bit set and the
 * SE_EWANTED bit clear.  SE_READER is the increment added to p_selock
 * for each shared holder.
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function. The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 *
 * Code in this file that tests for the marker generally masks off
 * SE_EWANTED first, i.e. (p_selock & ~SE_EWANTED) == SE_DELETED.
 */
#define	SE_DELETED	(1 | INT_MIN)
212 
/*
 * Statistics counters, compiled in only when VM_STATS is defined.
 * The three vph_* counters are not updated by any code in this file.
 */
#ifdef VM_STATS
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;
#endif /* VM_STATS */

/*
 * Counters bumped via VM_STAT_ADD() by the page lock routines below.
 * Counters without a remark are not referenced elsewhere in this file.
 */
#ifdef VM_STATS
uint_t	page_lock_count;		/* calls to page_lock_es() */
uint_t	page_lock_miss;			/* page_lock_es() had to block */
uint_t	page_lock_miss_lock;		/* ... and dropped caller's `lock' */
uint_t	page_lock_reclaim;		/* page_reclaim() succeeded */
uint_t	page_lock_bad_reclaim;		/* page_reclaim() failed */
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;		/* SE_SHARED forced to SE_EXCL */
uint_t	page_lock_retired;		/* denied: page is retired */
uint_t	page_lock_upgrade_failed;	/* forced upgrade did not get lock */
uint_t	page_lock_deleted;		/* denied: page marked SE_DELETED */

uint_t	page_trylock_locked;
uint_t	page_trylock_failed;		/* early denial in the try routines */
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;	/* free page: SE_SHARED -> SE_EXCL */
#endif /* VM_STATS */
237 
238 /*
239  * Acquire the "shared/exclusive" lock on a page.
240  *
241  * Returns 1 on success and locks the page appropriately.
242  *	   0 on failure and does not lock the page.
243  *
244  * If `lock' is non-NULL, it will be dropped and reacquired in the
245  * failure case.  This routine can block, and if it does
246  * it will always return a failure since the page identity [vp, off]
247  * or state may have changed.
248  */
249 
250 int
251 page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
252 {
253 	return (page_lock_es(pp, se, lock, reclaim, 0));
254 }
255 
256 /*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may avoid being starved by
 * shared-lock (reader) requests by setting the es parameter to
 * SE_EXCL_WANTED.  In this case, when an exclusive lock cannot be
 * acquired, p_selock's SE_EWANTED bit is set, and new shared-lock
 * requests are denied while it remains set.  Shared-lock (reader)
 * requests are also denied if the page is slated for retirement.
263  *
264  * The se and es parameters determine if the lock should be granted
265  * based on the following decision table:
266  *
267  * Lock wanted   es flags     p_selock/SE_EWANTED  Action
268  * ----------- -------------- -------------------  ---------
269  * SE_EXCL        any [1][2]   unlocked/any        grant lock, clear SE_EWANTED
270  * SE_EXCL        SE_EWANTED   any lock/any        deny, set SE_EWANTED
271  * SE_EXCL        none         any lock/any        deny
272  * SE_SHARED      n/a [2]        shared/0          grant
273  * SE_SHARED      n/a [2]      unlocked/0          grant
274  * SE_SHARED      n/a            shared/1          deny
275  * SE_SHARED      n/a          unlocked/1          deny
276  * SE_SHARED      n/a              excl/any        deny
277  *
278  * Notes:
279  * [1] The code grants an exclusive lock to the caller and clears the bit
280  *   SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
281  *   bit's value.  This was deemed acceptable as we are not concerned about
282  *   exclusive-lock starvation. If this ever becomes an issue, a priority or
283  *   fifo mechanism should also be implemented. Meantime, the thread that
284  *   set SE_EWANTED should be prepared to catch this condition and reset it
285  *
 * [2] Retired pages may not be locked at any time, regardless of the
 *   disposition of se, unless the es parameter has SE_RETIRED flag set.
288  *
289  * Notes on values of "es":
290  *
291  *   es & 1: page_lookup_create will attempt page relocation
292  *   es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete
293  *       memory thread); this prevents reader-starvation of waiting
294  *       writer thread(s) by giving priority to writers over readers.
295  *   es & SE_RETIRED: caller wants to lock pages even if they are
296  *       retired.  Default is to deny the lock if the page is retired.
297  *
298  * And yes, we know, the semantics of this function are too complicated.
299  * It's on the list to be cleaned up.
300  */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	/* A caller-supplied mutex must already be held on entry. */
	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	/* SE_EXCL_WANTED is only meaningful together with SE_EXCL. */
	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	/* Retired pages are refused outright unless SE_RETIRED was passed. */
	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}

	/*
	 * es == 1 on a completely unlocked page (no holders and
	 * SE_EWANTED clear): turn the shared request into an
	 * exclusive one.
	 */
	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, force the upgrade now.
			 * Again, we will fail to acquire p_selock if the
			 * page is not free and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		/*
		 * Shared request: granted only when no writer holds the
		 * page (p_selock >= 0) and no writer has set SE_EWANTED.
		 */
		retval = 0;
		if (pp->p_selock >= 0) {
			if ((pp->p_selock & SE_EWANTED) == 0) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		/*
		 * A page marked SE_DELETED is reported as a failure
		 * immediately: we return without waiting on p_cv and
		 * without dropping the caller's `lock'.
		 */
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured, since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation) we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (ie, it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE:page_reclaim() releases the page lock (p_selock)
		 *	if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				/*
				 * The caller asked for SE_SHARED; give back
				 * the exclusive hold we forced above.
				 */
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}
476 
477 /*
478  * Clear the SE_EWANTED bit from p_selock.  This function allows
479  * callers of page_lock_es and page_try_reclaim_lock to clear
480  * their setting of this bit if they decide they no longer wish
481  * to gain exclusive access to the page.  Currently only
482  * delete_memory_thread uses this when the delete memory
483  * operation is cancelled.
484  */
485 void
486 page_lock_clr_exclwanted(page_t *pp)
487 {
488 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
489 
490 	mutex_enter(pse);
491 	pp->p_selock &= ~SE_EWANTED;
492 	if (CV_HAS_WAITERS(&pp->p_cv))
493 		cv_broadcast(&pp->p_cv);
494 	mutex_exit(pse);
495 }
496 
497 /*
498  * Read the comments inside of page_lock_es() carefully.
499  *
500  * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
501  * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
502  * This is used by threads subject to reader-starvation (eg. memory delete).
503  *
504  * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
505  * it is expected that it will retry at a later time.  Threads that will
506  * not retry the lock *must* call page_lock_clr_exclwanted to clear the
507  * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
508  * the bit is cleared.)
509  */
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	/* Snapshot of p_selock; all decisions below are made on it. */
	old = pp->p_selock;

	/* SE_EXCL_WANTED is only meaningful together with SE_EXCL. */
	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	/* Retired pages are refused unless the caller passed SE_RETIRED. */
	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	/*
	 * es == 1 on a completely unlocked page: turn the shared
	 * request into an exclusive one.
	 */
	if (se == SE_SHARED && es == 1 && old == 0) {
		se = SE_EXCL;
	}

	if (se == SE_SHARED) {
		if (!PP_ISFREE(pp)) {
			/* old >= 0 means no writer holds the page */
			if (old >= 0) {
				/*
				 * Readers are not allowed when excl wanted
				 */
				if ((old & SE_EWANTED) == 0) {
					pp->p_selock = old + SE_READER;
					mutex_exit(pse);
					return (1);
				}
			}
			mutex_exit(pse);
			return (0);
		}
		/*
		 * The page is free, so we really want SE_EXCL (below)
		 */
		VM_STAT_ADD(page_try_reclaim_upgrade);
	}

	/*
	 * The caller wants a writer lock.  We try for it only if
	 * SE_EWANTED is not set, or if the caller specified
	 * SE_EXCL_WANTED.
	 */
	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
		if ((old & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears out our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	/*
	 * Lock not obtained.  This routine never blocks; it only
	 * records the caller's interest when SE_EXCL_WANTED was given.
	 */
	if (es & SE_EXCL_WANTED) {
		/* page is locked, set the SE_EWANTED bit */
		pp->p_selock |= SE_EWANTED;
	}
	mutex_exit(pse);
	return (0);
}
576 
577 /*
578  * Acquire a page's "shared/exclusive" lock, but never block.
579  * Returns 1 on success, 0 on failure.
580  */
581 int
582 page_trylock(page_t *pp, se_t se)
583 {
584 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
585 
586 	mutex_enter(pse);
587 	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
588 	    (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
589 		/*
590 		 * Fail if a thread wants exclusive access and page is
591 		 * retired, if the page is slated for retirement, or a
592 		 * share lock is requested.
593 		 */
594 		mutex_exit(pse);
595 		VM_STAT_ADD(page_trylock_failed);
596 		return (0);
597 	}
598 
599 	if (se == SE_EXCL) {
600 		if (pp->p_selock == 0) {
601 			THREAD_KPRI_REQUEST();
602 			pp->p_selock = SE_WRITER;
603 			mutex_exit(pse);
604 			return (1);
605 		}
606 	} else {
607 		if (pp->p_selock >= 0) {
608 			pp->p_selock += SE_READER;
609 			mutex_exit(pse);
610 			return (1);
611 		}
612 	}
613 	mutex_exit(pse);
614 	return (0);
615 }
616 
617 /*
618  * Variant of page_unlock() specifically for the page freelist
619  * code. The mere existence of this code is a vile hack that
620  * has resulted due to the backwards locking order of the page
621  * freelist manager; please don't call it.
622  */
623 void
624 page_unlock_nocapture(page_t *pp)
625 {
626 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
627 	selock_t old;
628 
629 	mutex_enter(pse);
630 
631 	old = pp->p_selock;
632 	if ((old & ~SE_EWANTED) == SE_READER) {
633 		pp->p_selock = old & ~SE_READER;
634 		if (CV_HAS_WAITERS(&pp->p_cv))
635 			cv_broadcast(&pp->p_cv);
636 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
637 		panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
638 	} else if (old < 0) {
639 		THREAD_KPRI_RELEASE();
640 		pp->p_selock &= SE_EWANTED;
641 		if (CV_HAS_WAITERS(&pp->p_cv))
642 			cv_broadcast(&pp->p_cv);
643 	} else if ((old & ~SE_EWANTED) > SE_READER) {
644 		pp->p_selock = old - SE_READER;
645 	} else {
646 		panic("page_unlock_nocapture: page %p is not locked",
647 		    (void *)pp);
648 	}
649 
650 	mutex_exit(pse);
651 }
652 
653 /*
654  * Release the page's "shared/exclusive" lock and wake up anyone
655  * who might be waiting for it.
656  */
657 void
658 page_unlock(page_t *pp)
659 {
660 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
661 	selock_t old;
662 
663 	mutex_enter(pse);
664 
665 	old = pp->p_selock;
666 	if ((old & ~SE_EWANTED) == SE_READER) {
667 		pp->p_selock = old & ~SE_READER;
668 		if (CV_HAS_WAITERS(&pp->p_cv))
669 			cv_broadcast(&pp->p_cv);
670 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
671 		panic("page_unlock: page %p is deleted", (void *)pp);
672 	} else if (old < 0) {
673 		THREAD_KPRI_RELEASE();
674 		pp->p_selock &= SE_EWANTED;
675 		if (CV_HAS_WAITERS(&pp->p_cv))
676 			cv_broadcast(&pp->p_cv);
677 	} else if ((old & ~SE_EWANTED) > SE_READER) {
678 		pp->p_selock = old - SE_READER;
679 	} else {
680 		panic("page_unlock: page %p is not locked", (void *)pp);
681 	}
682 
683 	if (pp->p_selock == 0) {
684 		/*
685 		 * If the T_CAPTURING bit is set, that means that we should
686 		 * not try and capture the page again as we could recurse
687 		 * which could lead to a stack overflow panic or spending a
688 		 * relatively long time in the kernel making no progress.
689 		 */
690 		if ((pp->p_toxic & PR_CAPTURE) &&
691 		    !(curthread->t_flag & T_CAPTURING) &&
692 		    !PP_RETIRED(pp)) {
693 			THREAD_KPRI_REQUEST();
694 			pp->p_selock = SE_WRITER;
695 			mutex_exit(pse);
696 			page_unlock_capture(pp);
697 		} else {
698 			mutex_exit(pse);
699 		}
700 	} else {
701 		mutex_exit(pse);
702 	}
703 }
704 
705 /*
706  * Try to upgrade the lock on the page from a "shared" to an
707  * "exclusive" lock.  Since this upgrade operation is done while
708  * holding the mutex protecting this page, no one else can acquire this page's
709  * lock and change the page. Thus, it is safe to drop the "shared"
710  * lock and attempt to acquire the "exclusive" lock.
711  *
712  * Returns 1 on success, 0 on failure.
713  */
714 int
715 page_tryupgrade(page_t *pp)
716 {
717 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
718 
719 	mutex_enter(pse);
720 	if (!(pp->p_selock & SE_EWANTED)) {
721 		/* no threads want exclusive access, try upgrade */
722 		if (pp->p_selock == SE_READER) {
723 			THREAD_KPRI_REQUEST();
724 			/* convert to exclusive lock */
725 			pp->p_selock = SE_WRITER;
726 			mutex_exit(pse);
727 			return (1);
728 		}
729 	}
730 	mutex_exit(pse);
731 	return (0);
732 }
733 
734 /*
735  * Downgrade the "exclusive" lock on the page to a "shared" lock
736  * while holding the mutex protecting this page's p_selock field.
737  */
738 void
739 page_downgrade(page_t *pp)
740 {
741 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
742 	int excl_waiting;
743 
744 	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
745 	ASSERT(PAGE_EXCL(pp));
746 
747 	mutex_enter(pse);
748 	excl_waiting =  pp->p_selock & SE_EWANTED;
749 	THREAD_KPRI_RELEASE();
750 	pp->p_selock = SE_READER | excl_waiting;
751 	if (CV_HAS_WAITERS(&pp->p_cv))
752 		cv_broadcast(&pp->p_cv);
753 	mutex_exit(pse);
754 }
755 
756 void
757 page_lock_delete(page_t *pp)
758 {
759 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
760 
761 	ASSERT(PAGE_EXCL(pp));
762 	ASSERT(pp->p_vnode == NULL);
763 	ASSERT(pp->p_offset == (u_offset_t)-1);
764 	ASSERT(!PP_ISFREE(pp));
765 
766 	mutex_enter(pse);
767 	THREAD_KPRI_RELEASE();
768 	pp->p_selock = SE_DELETED;
769 	if (CV_HAS_WAITERS(&pp->p_cv))
770 		cv_broadcast(&pp->p_cv);
771 	mutex_exit(pse);
772 }
773 
774 int
775 page_deleted(page_t *pp)
776 {
777 	return (pp->p_selock == SE_DELETED);
778 }
779 
780 /*
781  * Implement the io lock for pages
782  */
783 void
784 page_iolock_init(page_t *pp)
785 {
786 	pp->p_iolock_state = 0;
787 	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
788 }
789 
790 /*
791  * Acquire the i/o lock on a page.
792  */
793 void
794 page_io_lock(page_t *pp)
795 {
796 	kmutex_t *pio;
797 
798 	pio = PAGE_IO_MUTEX(pp);
799 	mutex_enter(pio);
800 	while (pp->p_iolock_state & PAGE_IO_INUSE) {
801 		cv_wait(&(pp->p_io_cv), pio);
802 	}
803 	pp->p_iolock_state |= PAGE_IO_INUSE;
804 	mutex_exit(pio);
805 }
806 
807 /*
808  * Release the i/o lock on a page.
809  */
810 void
811 page_io_unlock(page_t *pp)
812 {
813 	kmutex_t *pio;
814 
815 	pio = PAGE_IO_MUTEX(pp);
816 	mutex_enter(pio);
817 	cv_broadcast(&pp->p_io_cv);
818 	pp->p_iolock_state &= ~PAGE_IO_INUSE;
819 	mutex_exit(pio);
820 }
821 
822 /*
823  * Try to acquire the i/o lock on a page without blocking.
824  * Returns 1 on success, 0 on failure.
825  */
826 int
827 page_io_trylock(page_t *pp)
828 {
829 	kmutex_t *pio;
830 
831 	if (pp->p_iolock_state & PAGE_IO_INUSE)
832 		return (0);
833 
834 	pio = PAGE_IO_MUTEX(pp);
835 	mutex_enter(pio);
836 
837 	if (pp->p_iolock_state & PAGE_IO_INUSE) {
838 		mutex_exit(pio);
839 		return (0);
840 	}
841 	pp->p_iolock_state |= PAGE_IO_INUSE;
842 	mutex_exit(pio);
843 
844 	return (1);
845 }
846 
847 /*
848  * Wait until the i/o lock is not held.
849  */
850 void
851 page_io_wait(page_t *pp)
852 {
853 	kmutex_t *pio;
854 
855 	pio = PAGE_IO_MUTEX(pp);
856 	mutex_enter(pio);
857 	while (pp->p_iolock_state & PAGE_IO_INUSE) {
858 		cv_wait(&(pp->p_io_cv), pio);
859 	}
860 	mutex_exit(pio);
861 }
862 
863 /*
864  * Returns 1 on success, 0 on failure.
865  */
866 int
867 page_io_locked(page_t *pp)
868 {
869 	return (pp->p_iolock_state & PAGE_IO_INUSE);
870 }
871 
872 /*
873  * Assert that the i/o lock on a page is held.
874  * Returns 1 on success, 0 on failure.
875  */
876 int
877 page_iolock_assert(page_t *pp)
878 {
879 	return (page_io_locked(pp));
880 }
881 
882 /*
883  * Wrapper exported to kernel routines that are built
884  * platform-independent (the macro is platform-dependent;
885  * the size of vph_mutex[] is based on NCPU).
886  *
887  * Note that you can do stress testing on this by setting the
888  * variable page_vnode_mutex_stress to something other than
889  * zero in a DEBUG kernel in a debugger after loading the kernel.
890  * Setting it after the kernel is running may not work correctly.
891  */
892 #ifdef DEBUG
893 static int page_vnode_mutex_stress = 0;
894 #endif
895 
896 kmutex_t *
897 page_vnode_mutex(vnode_t *vp)
898 {
899 	if (vp == &kvp)
900 		return (&vph_mutex[VPH_TABLE_SIZE + 0]);
901 
902 	if (vp == &zvp)
903 		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
904 #ifdef DEBUG
905 	if (page_vnode_mutex_stress != 0)
906 		return (&vph_mutex[0]);
907 #endif
908 
909 	return (&vph_mutex[VP_HASH_FUNC(vp)]);
910 }
911 
912 kmutex_t *
913 page_se_mutex(page_t *pp)
914 {
915 	return (PAGE_SE_MUTEX(pp));
916 }
917 
918 #ifdef VM_STATS
919 uint_t pszclck_stat[4];
920 #endif
921 /*
922  * Find, take and return a mutex held by hat_page_demote().
923  * Called by page_demote_vp_pages() before hat_page_demote() call and by
924  * routines that want to block hat_page_demote() but can't do it
925  * via locking all constituent pages.
926  *
927  * Return NULL if p_szc is 0.
928  *
929  * It should only be used for pages that can be demoted by hat_page_demote()
930  * i.e. non swapfs file system pages.  The logic here is lifted from
931  * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
932  * since the page is locked and not free.
933  *
934  * Hash of the root page is used to find the lock.
935  * To find the root in the presense of hat_page_demote() chageing the location
936  * of the root this routine relies on the fact that hat_page_demote() changes
937  * root last.
938  *
939  * If NULL is returned pp's p_szc is guaranteed to be 0. If non NULL is
940  * returned pp's p_szc may be any value.
941  */
942 kmutex_t *
943 page_szc_lock(page_t *pp)
944 {
945 	kmutex_t	*mtx;
946 	page_t		*rootpp;
947 	uint_t		szc;
948 	uint_t		rszc;
949 	uint_t		pszc = pp->p_szc;
950 
951 	ASSERT(pp != NULL);
952 	ASSERT(PAGE_LOCKED(pp));
953 	ASSERT(!PP_ISFREE(pp));
954 	ASSERT(pp->p_vnode != NULL);
955 	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
956 	ASSERT(!PP_ISKAS(pp));
957 
958 again:
959 	if (pszc == 0) {
960 		VM_STAT_ADD(pszclck_stat[0]);
961 		return (NULL);
962 	}
963 
964 	/* The lock lives in the root page */
965 
966 	rootpp = PP_GROUPLEADER(pp, pszc);
967 	mtx = PAGE_SZC_MUTEX(rootpp);
968 	mutex_enter(mtx);
969 
970 	/*
971 	 * since p_szc can only decrease if pp == rootpp
972 	 * rootpp will be always the same i.e we have the right root
973 	 * regardless of rootpp->p_szc.
974 	 * If location of pp's root didn't change after we took
975 	 * the lock we have the right root. return mutex hashed off it.
976 	 */
977 	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
978 		VM_STAT_ADD(pszclck_stat[1]);
979 		return (mtx);
980 	}
981 
982 	/*
983 	 * root location changed because page got demoted.
984 	 * locate the new root.
985 	 */
986 	if (rszc < pszc) {
987 		szc = pp->p_szc;
988 		ASSERT(szc < pszc);
989 		mutex_exit(mtx);
990 		pszc = szc;
991 		VM_STAT_ADD(pszclck_stat[2]);
992 		goto again;
993 	}
994 
995 	VM_STAT_ADD(pszclck_stat[3]);
996 	/*
997 	 * current hat_page_demote not done yet.
998 	 * wait for it to finish.
999 	 */
1000 	mutex_exit(mtx);
1001 	rootpp = PP_GROUPLEADER(rootpp, rszc);
1002 	mtx = PAGE_SZC_MUTEX(rootpp);
1003 	mutex_enter(mtx);
1004 	mutex_exit(mtx);
1005 	ASSERT(rootpp->p_szc < rszc);
1006 	goto again;
1007 }
1008 
1009 int
1010 page_szc_lock_assert(page_t *pp)
1011 {
1012 	page_t *rootpp = PP_PAGEROOT(pp);
1013 	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);
1014 
1015 	return (MUTEX_HELD(mtx));
1016 }
1017 
/*
 * memseg locking
 *
 * Reader/writer lock manipulated through memsegs_lock(),
 * memsegs_unlock() and memsegs_lock_held() below.
 */
static krwlock_t memsegslock;

/*
 * memlist (phys_install, phys_avail) locking.
 *
 * Reader/writer lock manipulated through the memlist_read_*() and
 * memlist_write_*() routines below.
 */
static krwlock_t memlists_lock;
1027 
1028 void
1029 memsegs_lock(int writer)
1030 {
1031 	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
1032 }
1033 
1034 /*ARGSUSED*/
1035 void
1036 memsegs_unlock(int writer)
1037 {
1038 	rw_exit(&memsegslock);
1039 }
1040 
1041 int
1042 memsegs_lock_held(void)
1043 {
1044 	return (RW_LOCK_HELD(&memsegslock));
1045 }
1046 
1047 void
1048 memlist_read_lock(void)
1049 {
1050 	rw_enter(&memlists_lock, RW_READER);
1051 }
1052 
1053 void
1054 memlist_read_unlock(void)
1055 {
1056 	rw_exit(&memlists_lock);
1057 }
1058 
1059 void
1060 memlist_write_lock(void)
1061 {
1062 	rw_enter(&memlists_lock, RW_WRITER);
1063 }
1064 
1065 void
1066 memlist_write_unlock(void)
1067 {
1068 	rw_exit(&memlists_lock);
1069 }
1070