xref: /illumos-gate/usr/src/uts/common/vm/page_lock.c (revision db874c57)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * VM - page locking primitives
31  */
32 #include <sys/param.h>
33 #include <sys/t_lock.h>
34 #include <sys/vtrace.h>
35 #include <sys/debug.h>
36 #include <sys/cmn_err.h>
37 #include <sys/vnode.h>
38 #include <sys/bitmap.h>
39 #include <sys/lockstat.h>
40 #include <sys/condvar_impl.h>
41 #include <vm/page.h>
42 #include <vm/seg_enum.h>
43 #include <vm/vm_dep.h>
44 
/*
 * This global mutex is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 *
 * NOTE(review): no routine visible in this file acquires page_llock;
 * it is only defined here.
 */
kmutex_t page_llock;

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 *
 * NOTE(review): no routine visible in this file acquires page_freelock;
 * it is only defined here.
 */
kmutex_t  page_freelock;
61 
62 /*
63  * The hash table, page_hash[], the p_selock fields, and the
64  * list of pages associated with vnodes are protected by arrays of mutexes.
65  *
66  * Unless the hashes are changed radically, the table sizes must be
67  * a power of two.  Also, we typically need more mutexes for the
68  * vnodes since these locks are occasionally held for long periods.
69  * And since there seem to be two special vnodes (kvp and swapvp),
70  * we make room for private mutexes for them.
71  *
72  * The pse_mutex[] array holds the mutexes to protect the p_selock
73  * fields of all page_t structures.
74  *
75  * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
76  * when given a pointer to a page_t.
77  *
78  * PSE_TABLE_SIZE must be a power of two.  One could argue that we
79  * should go to the trouble of setting it up at run time and base it
80  * on memory size rather than the number of compile time CPUs.
81  *
82  * XX64	We should be using physmem size to calculate PSE_TABLE_SIZE,
83  *	PSE_SHIFT, PIO_SHIFT.
84  *
85  *	These might break in 64 bit world.
86  */
87 #define	PSE_SHIFT	7		/* log2(PSE_TABLE_SIZE) */
88 
89 #define	PSE_TABLE_SIZE	128		/* number of mutexes to have */
90 
91 #define	PIO_SHIFT	PSE_SHIFT	/* next power of 2 bigger than page_t */
92 #define	PIO_TABLE_SIZE	PSE_TABLE_SIZE	/* number of io mutexes to have */
93 
94 pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
95 pad_mutex_t	pse_mutex[PSE_TABLE_SIZE];
96 kmutex_t	pio_mutex[PIO_TABLE_SIZE];
97 
98 #define	PAGE_SE_MUTEX(pp) \
99 	    &pse_mutex[((((uintptr_t)(pp) >> PSE_SHIFT) ^ \
100 		((uintptr_t)(pp) >> (PSE_SHIFT << 1))) & \
101 		(PSE_TABLE_SIZE - 1))].pad_mutex
102 
103 #define	PAGE_IO_MUTEX(pp) \
104 	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
105 
106 #define	PSZC_MTX_TABLE_SIZE	128
107 #define	PSZC_MTX_TABLE_SHIFT	7
108 
109 static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];
110 
111 #define	PAGE_SZC_MUTEX(_pp) \
112 	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
113 		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
114 		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
115 		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex
116 
117 /*
118  * The vph_mutex[] array  holds the mutexes to protect the vnode chains,
119  * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
120  * and p_vpnext).
121  *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
126  *
127  * The VP_HASH_FUNC returns the index into the vph_mutex array given
128  * an address of a vnode.
129  */
130 
131 /*
132  * XX64	VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
133  *	Need to review again.
134  */
135 #define	VPH_TABLE_SIZE	(2 << VP_SHIFT)
136 
137 #define	VP_HASH_FUNC(vp) \
138 	((((uintptr_t)(vp) >> 6) + \
139 	    ((uintptr_t)(vp) >> 8) + \
140 	    ((uintptr_t)(vp) >> 10) + \
141 	    ((uintptr_t)(vp) >> 12)) \
142 	    & (VPH_TABLE_SIZE - 1))
143 
144 extern	struct vnode	kvp;
145 
146 kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];
147 
/*
 * Initialize the locks used by the Virtual Memory Management system.
 *
 * Currently a no-op: the body is empty and no lock in this file is
 * set up here.  The parameter list is spelled (void) so the definition
 * is a proper prototype rather than an old-style unprototyped one.
 */
void
page_lock_init(void)
{
}
155 
/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.  If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 *
 * SE_WRITER is the value stored in p_selock for an exclusive hold: the
 * curthread pointer truncated to selock_t with INT_MIN or'ed in and the
 * SE_EWANTED bit masked off.
 * SE_READER is the amount added to / subtracted from p_selock for each
 * shared hold.
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function. The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define	SE_DELETED	(1 | INT_MIN)
177 
#ifdef VM_STATS
/* vnode-mutex counters; not updated by any code visible in this file */
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;

/* page_lock_* counters (page_lock_es() bumps most of them) */
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_retired;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

/* page_trylock() / page_try_reclaim_lock() counters */
uint_t	page_trylock_locked;
uint_t	page_trylock_failed;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */
202 
/*
 * Acquire the "shared/exclusive" lock on a page.
 *
 * This is page_lock_es() called with es == 0, i.e. with none of the
 * optional "es" behaviours requested.
 *
 * Returns 1 on success and locks the page appropriately.
 *	   0 on failure and does not lock the page.
 *
 * If `lock' is non-NULL, it will be dropped and reacquired in the
 * failure case.  This routine can block, and if it does
 * it will always return a failure since the page identity [vp, off]
 * or state may have changed.
 *
 * The retired-page and deleted-page failures in page_lock_es() return
 * immediately, without sleeping and without dropping `lock'.
 */

int
page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
{
	return (page_lock_es(pp, se, lock, reclaim, 0));
}
220 
221 /*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may avoid being starved by
 * shared-lock (reader) holders by setting the es parameter to
 * SE_EXCL_WANTED.  In this case, when an exclusive lock cannot be
 * acquired, p_selock's SE_EWANTED bit is set, and new shared-lock
 * (reader) requests are denied while the bit remains set.  Shared-lock
 * (reader) requests are also denied if the page is slated for retirement.
228  *
229  * The se and es parameters determine if the lock should be granted
230  * based on the following decision table:
231  *
232  * Lock wanted   es flags     p_selock/SE_EWANTED  Action
233  * ----------- -------------- -------------------  ---------
234  * SE_EXCL        any [1][2]   unlocked/any        grant lock, clear SE_EWANTED
235  * SE_EXCL        SE_EWANTED   any lock/any        deny, set SE_EWANTED
236  * SE_EXCL        none         any lock/any        deny
237  * SE_SHARED      n/a [2][3]     shared/0          grant
238  * SE_SHARED      n/a [2][3]   unlocked/0          grant
239  * SE_SHARED      n/a            shared/1          deny
240  * SE_SHARED      n/a          unlocked/1          deny
241  * SE_SHARED      n/a              excl/any        deny
242  *
243  * Notes:
244  * [1] The code grants an exclusive lock to the caller and clears the bit
245  *   SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
246  *   bit's value.  This was deemed acceptable as we are not concerned about
247  *   exclusive-lock starvation. If this ever becomes an issue, a priority or
248  *   fifo mechanism should also be implemented. Meantime, the thread that
249  *   set SE_EWANTED should be prepared to catch this condition and reset it
250  *
 * [2] Retired pages may not be locked at any time, regardless of the
 *   disposition of se, unless the es parameter has SE_RETIRED flag set.
253  *
254  * [3] If the page is slated for retirement the lock is denied.
255  *
256  * Notes on values of "es":
257  *
258  *   es & 1: page_lookup_create will attempt page relocation
259  *   es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete
260  *       memory thread); this prevents reader-starvation of waiting
261  *       writer thread(s) by giving priority to writers over readers.
262  *   es & SE_RETIRED: caller wants to lock pages even if they are
263  *       retired.  Default is to deny the lock if the page is retired.
264  *
265  * And yes, we know, the semantics of this function are too complicated.
266  * It's on the list to be cleaned up.
267  */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	/* If the caller passed a mutex, it must already be holding it. */
	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	/* SE_EXCL_WANTED may only accompany an SE_EXCL request. */
	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	/*
	 * Retired pages are refused outright unless the caller passed
	 * SE_RETIRED.  This failure does not sleep and leaves `lock' held.
	 */
	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}

	/*
	 * es == 1 with a completely idle page: turn the shared request
	 * into an exclusive one.
	 */
	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, force the upgrade now.
			 * Again, we will fail to acquire p_selock if the
			 * page is not free and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		retval = 0;
		/* p_selock >= 0 means no writer holds the page */
		if (pp->p_selock >= 0) {
			/*
			 * Readers are not allowed when excl wanted or
			 * a retire is pending. Since kvp pages can take
			 * a long time to be retired, we make an exception
			 * for them to avoid hanging threads unnecessarily.
			 */
			if ((pp->p_selock & SE_EWANTED) == 0) {
				if (!PP_PR_REQ(pp) || pp->p_vnode == &kvp) {
					pp->p_selock += SE_READER;
					retval = 1;
				}
			}
		}
	}

	if (retval == 0) {
		/*
		 * The page has been marked deleted (see page_lock_delete());
		 * return failure without sleeping and without dropping
		 * `lock'.
		 */
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		/* Drop the caller's mutex before going to sleep. */
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured, since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation) we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (ie, it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE:page_reclaim() releases the page lock (p_selock)
		 *	if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				/*
				 * The caller asked for a shared lock; give
				 * back the exclusive one we forced above.
				 */
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}
451 
452 /*
453  * Clear the SE_EWANTED bit from p_selock.  This function allows
454  * callers of page_lock_es and page_try_reclaim_lock to clear
455  * their setting of this bit if they decide they no longer wish
456  * to gain exclusive access to the page.  Currently only
457  * delete_memory_thread uses this when the delete memory
458  * operation is cancelled.
459  */
460 void
461 page_lock_clr_exclwanted(page_t *pp)
462 {
463 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
464 
465 	mutex_enter(pse);
466 	pp->p_selock &= ~SE_EWANTED;
467 	if (CV_HAS_WAITERS(&pp->p_cv))
468 		cv_broadcast(&pp->p_cv);
469 	mutex_exit(pse);
470 }
471 
472 /*
473  * Read the comments inside of page_lock_es() carefully.
474  *
475  * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
476  * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
477  * This is used by threads subject to reader-starvation (eg. memory delete).
478  *
479  * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
480  * it is expected that it will retry at a later time.  Threads that will
481  * not retry the lock *must* call page_lock_clr_exclwanted to clear the
482  * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
483  * the bit is cleared.)
484  */
485 int
486 page_try_reclaim_lock(page_t *pp, se_t se, int es)
487 {
488 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
489 	selock_t old;
490 
491 	mutex_enter(pse);
492 
493 	old = pp->p_selock;
494 
495 	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
496 	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
497 
498 	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
499 		mutex_exit(pse);
500 		VM_STAT_ADD(page_trylock_failed);
501 		return (0);
502 	}
503 
504 	if (se == SE_SHARED && es == 1 && old == 0) {
505 		se = SE_EXCL;
506 	}
507 
508 	if (se == SE_SHARED) {
509 		if (!PP_ISFREE(pp)) {
510 			if (old >= 0) {
511 				/*
512 				 * Readers are not allowed when excl wanted
513 				 * or a retire is pending. Since kvp pages can
514 				 * take a long time to be retired, we make an
515 				 * exception for them to avoid hanging threads
516 				 * unnecessarily.
517 				 */
518 				if ((old & SE_EWANTED) == 0) {
519 					if (!PP_PR_REQ(pp) ||
520 					    pp->p_vnode == &kvp) {
521 						pp->p_selock = old + SE_READER;
522 						mutex_exit(pse);
523 						return (1);
524 					}
525 				}
526 			}
527 			mutex_exit(pse);
528 			return (0);
529 		}
530 		/*
531 		 * The page is free, so we really want SE_EXCL (below)
532 		 */
533 		VM_STAT_ADD(page_try_reclaim_upgrade);
534 	}
535 
536 	/*
537 	 * The caller wants a writer lock.  We try for it only if
538 	 * SE_EWANTED is not set, or if the caller specified
539 	 * SE_EXCL_WANTED.
540 	 */
541 	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
542 		if ((old & ~SE_EWANTED) == 0) {
543 			/* no reader/writer lock held */
544 			THREAD_KPRI_REQUEST();
545 			/* this clears out our setting of the SE_EWANTED bit */
546 			pp->p_selock = SE_WRITER;
547 			mutex_exit(pse);
548 			return (1);
549 		}
550 	}
551 	if (es & SE_EXCL_WANTED) {
552 		/* page is locked, set the SE_EWANTED bit */
553 		pp->p_selock |= SE_EWANTED;
554 	}
555 	mutex_exit(pse);
556 	return (0);
557 }
558 
559 /*
560  * Acquire a page's "shared/exclusive" lock, but never block.
561  * Returns 1 on success, 0 on failure.
562  */
563 int
564 page_trylock(page_t *pp, se_t se)
565 {
566 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
567 
568 	mutex_enter(pse);
569 	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
570 	    (se == SE_SHARED && PP_PR_REQ(pp) && pp->p_vnode != &kvp)) {
571 		/*
572 		 * Fail if a thread wants exclusive access and page is
573 		 * retired, if the page is slated for retirement, or a
574 		 * share lock is requested.
575 		 */
576 		mutex_exit(pse);
577 		VM_STAT_ADD(page_trylock_failed);
578 		return (0);
579 	}
580 
581 	if (se == SE_EXCL) {
582 		if (pp->p_selock == 0) {
583 			THREAD_KPRI_REQUEST();
584 			pp->p_selock = SE_WRITER;
585 			mutex_exit(pse);
586 			return (1);
587 		}
588 	} else {
589 		if (pp->p_selock >= 0) {
590 			pp->p_selock += SE_READER;
591 			mutex_exit(pse);
592 			return (1);
593 		}
594 	}
595 	mutex_exit(pse);
596 	return (0);
597 }
598 
599 /*
600  * Variant of page_unlock() specifically for the page freelist
601  * code. The mere existence of this code is a vile hack that
602  * has resulted due to the backwards locking order of the page
603  * freelist manager; please don't call it.
604  */
605 void
606 page_unlock_noretire(page_t *pp)
607 {
608 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
609 	selock_t old;
610 
611 	mutex_enter(pse);
612 
613 	old = pp->p_selock;
614 	if ((old & ~SE_EWANTED) == SE_READER) {
615 		pp->p_selock = old & ~SE_READER;
616 		if (CV_HAS_WAITERS(&pp->p_cv))
617 			cv_broadcast(&pp->p_cv);
618 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
619 		panic("page_unlock_noretire: page %p is deleted", pp);
620 	} else if (old < 0) {
621 		THREAD_KPRI_RELEASE();
622 		pp->p_selock &= SE_EWANTED;
623 		if (CV_HAS_WAITERS(&pp->p_cv))
624 			cv_broadcast(&pp->p_cv);
625 	} else if ((old & ~SE_EWANTED) > SE_READER) {
626 		pp->p_selock = old - SE_READER;
627 	} else {
628 		panic("page_unlock_noretire: page %p is not locked", pp);
629 	}
630 
631 	mutex_exit(pse);
632 }
633 
634 /*
635  * Release the page's "shared/exclusive" lock and wake up anyone
636  * who might be waiting for it.
637  */
638 void
639 page_unlock(page_t *pp)
640 {
641 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
642 	selock_t old;
643 
644 	mutex_enter(pse);
645 
646 	old = pp->p_selock;
647 	if ((old & ~SE_EWANTED) == SE_READER) {
648 		pp->p_selock = old & ~SE_READER;
649 		if (CV_HAS_WAITERS(&pp->p_cv))
650 			cv_broadcast(&pp->p_cv);
651 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
652 		panic("page_unlock: page %p is deleted", pp);
653 	} else if (old < 0) {
654 		THREAD_KPRI_RELEASE();
655 		pp->p_selock &= SE_EWANTED;
656 		if (CV_HAS_WAITERS(&pp->p_cv))
657 			cv_broadcast(&pp->p_cv);
658 	} else if ((old & ~SE_EWANTED) > SE_READER) {
659 		pp->p_selock = old - SE_READER;
660 	} else {
661 		panic("page_unlock: page %p is not locked", pp);
662 	}
663 
664 	if (pp->p_selock == 0 && PP_PR_REQ(pp)) {
665 		/*
666 		 * Try to retire the page. If it retires, great.
667 		 * If not, oh well, we'll get it in the next unlock
668 		 * request, and repeat the cycle.  Regardless,
669 		 * page_tryretire() will drop the page lock.
670 		 */
671 		if ((pp->p_toxic & PR_BUSY) == 0) {
672 			THREAD_KPRI_REQUEST();
673 			pp->p_selock = SE_WRITER;
674 			page_settoxic(pp, PR_BUSY);
675 			mutex_exit(pse);
676 			page_tryretire(pp);
677 		} else {
678 			pp->p_selock = SE_WRITER;
679 			page_clrtoxic(pp, PR_BUSY);
680 			pp->p_selock = 0;
681 			mutex_exit(pse);
682 		}
683 	} else {
684 		mutex_exit(pse);
685 	}
686 }
687 
688 /*
689  * Try to upgrade the lock on the page from a "shared" to an
690  * "exclusive" lock.  Since this upgrade operation is done while
691  * holding the mutex protecting this page, no one else can acquire this page's
692  * lock and change the page. Thus, it is safe to drop the "shared"
693  * lock and attempt to acquire the "exclusive" lock.
694  *
695  * Returns 1 on success, 0 on failure.
696  */
697 int
698 page_tryupgrade(page_t *pp)
699 {
700 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
701 
702 	mutex_enter(pse);
703 	if (!(pp->p_selock & SE_EWANTED)) {
704 		/* no threads want exclusive access, try upgrade */
705 		if (pp->p_selock == SE_READER) {
706 			THREAD_KPRI_REQUEST();
707 			/* convert to exclusive lock */
708 			pp->p_selock = SE_WRITER;
709 			mutex_exit(pse);
710 			return (1);
711 		}
712 	}
713 	mutex_exit(pse);
714 	return (0);
715 }
716 
717 /*
718  * Downgrade the "exclusive" lock on the page to a "shared" lock
719  * while holding the mutex protecting this page's p_selock field.
720  */
721 void
722 page_downgrade(page_t *pp)
723 {
724 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
725 	int excl_waiting;
726 
727 	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
728 	ASSERT(PAGE_EXCL(pp));
729 
730 	mutex_enter(pse);
731 	excl_waiting =  pp->p_selock & SE_EWANTED;
732 	THREAD_KPRI_RELEASE();
733 	pp->p_selock = SE_READER | excl_waiting;
734 	if (CV_HAS_WAITERS(&pp->p_cv))
735 		cv_broadcast(&pp->p_cv);
736 	mutex_exit(pse);
737 }
738 
739 void
740 page_lock_delete(page_t *pp)
741 {
742 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
743 
744 	ASSERT(PAGE_EXCL(pp));
745 	ASSERT(pp->p_vnode == NULL);
746 	ASSERT(pp->p_offset == (u_offset_t)-1);
747 	ASSERT(!PP_ISFREE(pp));
748 
749 	mutex_enter(pse);
750 	THREAD_KPRI_RELEASE();
751 	pp->p_selock = SE_DELETED;
752 	if (CV_HAS_WAITERS(&pp->p_cv))
753 		cv_broadcast(&pp->p_cv);
754 	mutex_exit(pse);
755 }
756 
/*
 * Implement the io lock for pages
 *
 * page_iolock_init() puts a page's i/o lock into its initial state:
 * p_iolock_state is zeroed (so PAGE_IO_INUSE is clear) and p_io_cv, the
 * condition variable page_io_lock() sleeps on, is initialized.
 */
void
page_iolock_init(page_t *pp)
{
	pp->p_iolock_state = 0;
	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
}
766 
767 /*
768  * Acquire the i/o lock on a page.
769  */
770 void
771 page_io_lock(page_t *pp)
772 {
773 	kmutex_t *pio;
774 
775 	pio = PAGE_IO_MUTEX(pp);
776 	mutex_enter(pio);
777 	while (pp->p_iolock_state & PAGE_IO_INUSE) {
778 		cv_wait(&(pp->p_io_cv), pio);
779 	}
780 	pp->p_iolock_state |= PAGE_IO_INUSE;
781 	mutex_exit(pio);
782 }
783 
784 /*
785  * Release the i/o lock on a page.
786  */
787 void
788 page_io_unlock(page_t *pp)
789 {
790 	kmutex_t *pio;
791 
792 	pio = PAGE_IO_MUTEX(pp);
793 	mutex_enter(pio);
794 	cv_signal(&pp->p_io_cv);
795 	pp->p_iolock_state &= ~PAGE_IO_INUSE;
796 	mutex_exit(pio);
797 }
798 
799 /*
800  * Try to acquire the i/o lock on a page without blocking.
801  * Returns 1 on success, 0 on failure.
802  */
803 int
804 page_io_trylock(page_t *pp)
805 {
806 	kmutex_t *pio;
807 
808 	if (pp->p_iolock_state & PAGE_IO_INUSE)
809 		return (0);
810 
811 	pio = PAGE_IO_MUTEX(pp);
812 	mutex_enter(pio);
813 
814 	if (pp->p_iolock_state & PAGE_IO_INUSE) {
815 		mutex_exit(pio);
816 		return (0);
817 	}
818 	pp->p_iolock_state |= PAGE_IO_INUSE;
819 	mutex_exit(pio);
820 
821 	return (1);
822 }
823 
/*
 * Report whether the i/o lock on a page is held, for use in assertions.
 * Returns non-zero (the PAGE_IO_INUSE bit of p_iolock_state) if the lock
 * is held, 0 if it is not.  The flag is read without taking the page's
 * i/o mutex, and the check does not identify which thread holds the lock.
 */
int
page_iolock_assert(page_t *pp)
{
	return (pp->p_iolock_state & PAGE_IO_INUSE);
}
833 
834 /*
835  * Wrapper exported to kernel routines that are built
836  * platform-independent (the macro is platform-dependent;
837  * the size of vph_mutex[] is based on NCPU).
838  *
839  * Note that you can do stress testing on this by setting the
840  * variable page_vnode_mutex_stress to something other than
841  * zero in a DEBUG kernel in a debugger after loading the kernel.
842  * Setting it after the kernel is running may not work correctly.
843  */
844 #ifdef DEBUG
845 static int page_vnode_mutex_stress = 0;
846 #endif
847 
848 kmutex_t *
849 page_vnode_mutex(vnode_t *vp)
850 {
851 	if (vp == &kvp)
852 		return (&vph_mutex[VPH_TABLE_SIZE + 0]);
853 #ifdef DEBUG
854 	if (page_vnode_mutex_stress != 0)
855 		return (&vph_mutex[0]);
856 #endif
857 
858 	return (&vph_mutex[VP_HASH_FUNC(vp)]);
859 }
860 
/*
 * Function wrapper around the file-private PAGE_SE_MUTEX() macro:
 * returns the address of the pse_mutex[] entry that protects the
 * p_selock field of the given page.
 */
kmutex_t *
page_se_mutex(page_t *pp)
{
	return (PAGE_SE_MUTEX(pp));
}
866 
867 #ifdef VM_STATS
868 uint_t pszclck_stat[4];
869 #endif
870 /*
871  * Find, take and return a mutex held by hat_page_demote().
872  * Called by page_demote_vp_pages() before hat_page_demote() call and by
873  * routines that want to block hat_page_demote() but can't do it
874  * via locking all constituent pages.
875  *
876  * Return NULL if p_szc is 0.
877  *
878  * It should only be used for pages that can be demoted by hat_page_demote()
879  * i.e. non swapfs file system pages.  The logic here is lifted from
880  * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
881  * since the page is locked and not free.
882  *
883  * Hash of the root page is used to find the lock.
884  * To find the root in the presense of hat_page_demote() chageing the location
885  * of the root this routine relies on the fact that hat_page_demote() changes
886  * root last.
887  *
888  * If NULL is returned pp's p_szc is guaranteed to be 0. If non NULL is
889  * returned pp's p_szc may be any value.
890  */
891 kmutex_t *
892 page_szc_lock(page_t *pp)
893 {
894 	kmutex_t	*mtx;
895 	page_t		*rootpp;
896 	uint_t		szc;
897 	uint_t		rszc;
898 	uint_t		pszc = pp->p_szc;
899 
900 	ASSERT(pp != NULL);
901 	ASSERT(PAGE_LOCKED(pp));
902 	ASSERT(!PP_ISFREE(pp));
903 	ASSERT(pp->p_vnode != NULL);
904 	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
905 	ASSERT(pp->p_vnode != &kvp);
906 
907 again:
908 	if (pszc == 0) {
909 		VM_STAT_ADD(pszclck_stat[0]);
910 		return (NULL);
911 	}
912 
913 	/* The lock lives in the root page */
914 
915 	rootpp = PP_GROUPLEADER(pp, pszc);
916 	mtx = PAGE_SZC_MUTEX(rootpp);
917 	mutex_enter(mtx);
918 
919 	/*
920 	 * since p_szc can only decrease if pp == rootpp
921 	 * rootpp will be always the same i.e we have the right root
922 	 * regardless of rootpp->p_szc.
923 	 * If location of pp's root didn't change after we took
924 	 * the lock we have the right root. return mutex hashed off it.
925 	 */
926 	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
927 		VM_STAT_ADD(pszclck_stat[1]);
928 		return (mtx);
929 	}
930 
931 	/*
932 	 * root location changed because page got demoted.
933 	 * locate the new root.
934 	 */
935 	if (rszc < pszc) {
936 		szc = pp->p_szc;
937 		ASSERT(szc < pszc);
938 		mutex_exit(mtx);
939 		pszc = szc;
940 		VM_STAT_ADD(pszclck_stat[2]);
941 		goto again;
942 	}
943 
944 	VM_STAT_ADD(pszclck_stat[3]);
945 	/*
946 	 * current hat_page_demote not done yet.
947 	 * wait for it to finish.
948 	 */
949 	mutex_exit(mtx);
950 	rootpp = PP_GROUPLEADER(rootpp, rszc);
951 	mtx = PAGE_SZC_MUTEX(rootpp);
952 	mutex_enter(mtx);
953 	mutex_exit(mtx);
954 	ASSERT(rootpp->p_szc < rszc);
955 	goto again;
956 }
957 
958 int
959 page_szc_lock_assert(page_t *pp)
960 {
961 	page_t *rootpp = PP_PAGEROOT(pp);
962 	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);
963 
964 	return (MUTEX_HELD(mtx));
965 }
966