/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 */


/*
 * VM - page locking primitives
 */
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/bitmap.h>
#include <sys/lockstat.h>
#include <sys/sysmacros.h>
#include <sys/condvar_impl.h>
#include <vm/page.h>
#include <vm/seg_enum.h>
#include <vm/vm_dep.h>
#include <vm/seg_kmem.h>

/*
 * This global mutex array is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 */
pad_mutex_t page_llocks[8 * NCPU_P2];

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 */
kmutex_t  page_freelock;

/*
 * The hash table, page_hash[], the p_selock fields, and the
 * list of pages associated with vnodes are protected by arrays of mutexes.
 *
 * Unless the hashes are changed radically, the table sizes must be
 * powers of two.  Also, we typically need more mutexes for the
 * vnodes since these locks are occasionally held for long periods.
 * And since there seem to be two special vnodes (kvp and swapvp),
 * we make room for private mutexes for them.
 *
 * The pse_mutex[] array holds the mutexes to protect the p_selock
 * fields of all page_t structures.
 *
 * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
 * when given a pointer to a page_t.
 *
 * PIO_TABLE_SIZE must be a power of two.  One could argue that we
 * should go to the trouble of setting it up at run time and base it
 * on memory size rather than the number of compile-time CPUs.
 *
 * XX64	We should be using physmem size to calculate PIO_SHIFT.
 *
 *	These might break in a 64-bit world.
 */
#define	PIO_SHIFT	7	/* log2(sizeof(page_t)) */
#define	PIO_TABLE_SIZE	128	/* number of io mutexes to have */

pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
kmutex_t	pio_mutex[PIO_TABLE_SIZE];

#define	PAGE_IO_MUTEX(pp) \
	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
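
/*
 * For illustration (using a hypothetical page_t address): with
 * PIO_SHIFT == 7 and PIO_TABLE_SIZE == 128, a page_t at
 * 0xfffff00012345680 maps to (0xfffff00012345680 >> 7) & 127 == 45,
 * i.e. pio_mutex[45].  Shifting by log2(sizeof (page_t)) first turns
 * the address into a page_t index, so adjacent page_t's tend to hash
 * to different mutexes.
 */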

/*
 * The pse_mutex[] array is allocated in the platform startup code
 * based on the size of the machine at startup.
 */
extern pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
extern size_t pse_table_size;		/* Number of mutexes in pse_mutex[] */
extern int pse_shift;			/* log2(pse_table_size) */
#define	PAGE_SE_MUTEX(pp)	&pse_mutex[				\
	((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) &	\
	(pse_table_size - 1)].pad_mutex

#define	PSZC_MTX_TABLE_SIZE	128
#define	PSZC_MTX_TABLE_SHIFT	7

static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];

#define	PAGE_SZC_MUTEX(_pp) \
	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex

/*
 * The vph_mutex[] array holds the mutexes to protect the vnode chains
 * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
 * and p_vpnext).
 *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
 *
 * The VP_HASH_FUNC returns the index into the vph_mutex array given
 * an address of a vnode.
 */

#if defined(_LP64)
#define	VPH_TABLE_SIZE  (8 * NCPU_P2)
#else	/* 32 bits */
#define	VPH_TABLE_SIZE	(2 * NCPU_P2)
#endif

#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	    ((uintptr_t)(vp) >> 8) + \
	    ((uintptr_t)(vp) >> 10) + \
	    ((uintptr_t)(vp) >> 12)) \
	    & (VPH_TABLE_SIZE - 1))

/*
 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes,
 * one for kvps[KV_ZVP], and one for other kvps[] users.
 */

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];

/*
 * Initialize the locks used by the Virtual Memory Management system.
 */
void
page_lock_init()
{
}

/*
 * Return a value for pse_shift based on npg (the number of physical pages)
 * and ncpu (the maximum number of CPUs).  This is called by platform startup
 * code.
 *
 * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
 * locks grew approximately as the square of the number of threads executing.
 * So the primary scaling factor used is NCPU^2.  The size of the machine in
 * megabytes is used as an upper bound, particularly for sun4v machines which
 * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
 * (128) is used as a minimum.  Since the size of the table has to be a power
 * of two, the calculated size is rounded up to the next power of two.
 */
/*ARGSUSED*/
int
size_pse_array(pgcnt_t npg, int ncpu)
{
	size_t size;
	pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;

	size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
	size += (1 << (highbit(size) - 1)) - 1;
	return (highbit(size) - 1);
}
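
/*
 * A worked example (hypothetical machine, assuming 4K pages, so
 * pp_per_mb == 256): for 8GB of memory (npg == 2097152) and ncpu == 64,
 * npg / pp_per_mb == 8192 and 2 * ncpu * ncpu == 8192, so size starts
 * at 8192.  Rounding up to a power of two leaves it unchanged, and the
 * function returns pse_shift == 13, i.e. a pse_mutex[] table of
 * 2^13 == 8192 entries.
 */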

/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second-highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.  If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1
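
/*
 * To summarize the resulting p_selock encoding: a value of zero
 * (ignoring SE_EWANTED) means the page is unlocked, a positive value
 * counts the shared holders in units of SE_READER, and a negative
 * value means the page is exclusively locked.  The SE_EWANTED bit may
 * be set in any of these states to record that a writer is waiting.
 */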

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function. The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define	SE_DELETED	(1 | INT_MIN)

#ifdef VM_STATS
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;
#endif /* VM_STATS */

#ifdef VM_STATS
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_retired;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

uint_t	page_trylock_locked;
uint_t	page_trylock_failed;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */

/*
 * Acquire the "shared/exclusive" lock on a page.
 *
 * Returns 1 on success and locks the page appropriately.
 *	   0 on failure and does not lock the page.
 *
 * If `lock' is non-NULL, it will be dropped and reacquired in the
 * failure case.  This routine can block, and if it does
 * it will always return a failure since the page identity [vp, off]
 * or state may have changed.
 */

int
page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
{
	return (page_lock_es(pp, se, lock, reclaim, 0));
}

/*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may prevent shared-lock
 * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
 * In this case, when an exclusive lock cannot be acquired, p_selock's
 * SE_EWANTED bit is set. Shared-lock (reader) requests are also denied
 * if the page is slated for retirement.
 *
 * The se and es parameters determine if the lock should be granted
 * based on the following decision table:
 *
 * Lock wanted   es flags     p_selock/SE_EWANTED  Action
 * ----------- -------------- -------------------  ---------
 * SE_EXCL        any [1][2]   unlocked/any        grant lock, clear SE_EWANTED
 * SE_EXCL        SE_EWANTED   any lock/any        deny, set SE_EWANTED
 * SE_EXCL        none         any lock/any        deny
 * SE_SHARED      n/a [2]        shared/0          grant
 * SE_SHARED      n/a [2]      unlocked/0          grant
 * SE_SHARED      n/a            shared/1          deny
 * SE_SHARED      n/a          unlocked/1          deny
 * SE_SHARED      n/a              excl/any        deny
 *
 * Notes:
 * [1] The code grants an exclusive lock to the caller and clears the bit
 *   SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 *   bit's value.  This was deemed acceptable as we are not concerned about
 *   exclusive-lock starvation. If this ever becomes an issue, a priority or
 *   FIFO mechanism should also be implemented.  Meantime, the thread that
 *   set SE_EWANTED should be prepared to catch this condition and reset it.
 *
 * [2] Retired pages may not be locked at any time, regardless of the
 *   disposition of se, unless the es parameter has the SE_RETIRED flag set.
 *
 * Notes on values of "es":
 *
 *   es & 1: page_lookup_create will attempt page relocation
 *   es & SE_EXCL_WANTED: caller wants SE_EWANTED set (e.g. the delete
 *       memory thread); this prevents reader-starvation of waiting
 *       writer thread(s) by giving priority to writers over readers.
 *   es & SE_RETIRED: caller wants to lock pages even if they are
 *       retired.  Default is to deny the lock if the page is retired.
 *
 * And yes, we know, the semantics of this function are too complicated.
 * It's on the list to be cleaned up.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the lock is currently held SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, so force the upgrade now.
			 * Again, we will fail to acquire p_selock if the
			 * page is not free, and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		retval = 0;
		if (pp->p_selock >= 0) {
			if ((pp->p_selock & SE_EWANTED) == 0) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured: since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation), we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (i.e., it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE: page_reclaim() releases the page lock (p_selock)
		 *	if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}
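
/*
 * A minimal caller sketch (hypothetical, not part of this file's
 * interface): a lookup-style caller passes the mutex that stabilizes
 * the page's identity as `lock', and must re-search if page_lock()
 * fails, since the identity [vp, off] may have changed while it
 * blocked.  Because `lock' is reacquired in the failure case, the
 * caller still holds it at the retry point:
 *
 *	mutex_enter(phm);
 * top:
 *	pp = ...find the page on the hash chain...;
 *	if (pp != NULL && !page_lock(pp, SE_SHARED, phm, P_RECLAIM)) {
 *		...phm was dropped and reacquired; pp may be stale...
 *		goto top;
 *	}
 *	mutex_exit(phm);
 */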

/*
 * Clear the SE_EWANTED bit from p_selock.  This function allows
 * callers of page_lock_es and page_try_reclaim_lock to clear
 * their setting of this bit if they decide they no longer wish
 * to gain exclusive access to the page.  Currently only
 * delete_memory_thread uses this when the delete memory
 * operation is cancelled.
 */
void
page_lock_clr_exclwanted(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	pp->p_selock &= ~SE_EWANTED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

/*
 * Read the comments inside of page_lock_es() carefully.
 *
 * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 * This is used by threads subject to reader-starvation (e.g. memory delete).
 *
 * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 * it is expected that it will retry at a later time.  Threads that will
 * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 * the bit is cleared.)  A sketch of this protocol follows the function.
 */
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && old == 0) {
		se = SE_EXCL;
	}

	if (se == SE_SHARED) {
		if (!PP_ISFREE(pp)) {
			if (old >= 0) {
				/*
				 * Readers are not allowed when excl wanted
				 */
				if ((old & SE_EWANTED) == 0) {
					pp->p_selock = old + SE_READER;
					mutex_exit(pse);
					return (1);
				}
			}
			mutex_exit(pse);
			return (0);
		}
		/*
		 * The page is free, so we really want SE_EXCL (below)
		 */
		VM_STAT_ADD(page_try_reclaim_upgrade);
	}

	/*
	 * The caller wants a writer lock.  We try for it only if
	 * SE_EWANTED is not set, or if the caller specified
	 * SE_EXCL_WANTED.
	 */
	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
		if ((old & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			/* this clears out our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	if (es & SE_EXCL_WANTED) {
		/* page is locked, set the SE_EWANTED bit */
		pp->p_selock |= SE_EWANTED;
	}
	mutex_exit(pse);
	return (0);
}
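
/*
 * A sketch of the expected SE_EXCL_WANTED protocol (hypothetical
 * caller, modeled on the comment above; `cancelled' and the backoff
 * are placeholders): a thread such as delete_memory_thread retries
 * until it gets the lock, and must clear SE_EWANTED itself if it
 * gives up:
 *
 *	while (!page_try_reclaim_lock(pp, SE_EXCL, SE_EXCL_WANTED)) {
 *		if (cancelled) {
 *			page_lock_clr_exclwanted(pp);
 *			return;
 *		}
 *		delay(1);	...back off, then retry...
 *	}
 */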

/*
 * Acquire a page's "shared/exclusive" lock, but never block.
 * Returns 1 on success, 0 on failure.
 */
int
page_trylock(page_t *pp, se_t se)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
	    (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
		/*
		 * Fail if a waiting writer has set SE_EWANTED, if the
		 * page is retired, or if a shared lock is requested on
		 * a page that is slated for retirement.
		 */
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_EXCL) {
		if (pp->p_selock == 0) {
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	} else {
		if (pp->p_selock >= 0) {
			pp->p_selock += SE_READER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}

/*
 * Variant of page_unlock() specifically for the page freelist
 * code.  The mere existence of this code is a vile hack that
 * exists because of the backwards locking order of the page
 * freelist manager; please don't call it.
 */
void
page_unlock_nocapture(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
	} else if (old < 0) {
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock_nocapture: page %p is not locked",
		    (void *)pp);
	}

	mutex_exit(pse);
}

/*
 * Release the page's "shared/exclusive" lock and wake up anyone
 * who might be waiting for it.
 */
void
page_unlock(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock: page %p is deleted", (void *)pp);
	} else if (old < 0) {
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock: page %p is not locked", (void *)pp);
	}

	if (pp->p_selock == 0) {
		/*
		 * If the T_CAPTURING bit is set, that means that we should
		 * not try to capture the page again, as we could recurse,
		 * which could lead to a stack overflow panic or to spending
		 * a relatively long time in the kernel making no progress.
		 */
		if ((pp->p_toxic & PR_CAPTURE) &&
		    !(curthread->t_flag & T_CAPTURING) &&
		    !PP_RETIRED(pp)) {
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			page_unlock_capture(pp);
		} else {
			mutex_exit(pse);
		}
	} else {
		mutex_exit(pse);
	}
}

/*
 * Try to upgrade the lock on the page from a "shared" to an
 * "exclusive" lock.  Since this upgrade operation is done while
 * holding the mutex protecting this page, no one else can acquire this page's
 * lock and change the page.  Thus, it is safe to drop the "shared"
 * lock and attempt to acquire the "exclusive" lock.
 *
 * Returns 1 on success, 0 on failure.
 */
int
page_tryupgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (!(pp->p_selock & SE_EWANTED)) {
		/* no threads want exclusive access, try upgrade */
		if (pp->p_selock == SE_READER) {
			/* convert to exclusive lock */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}
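
/*
 * A typical use (sketch only): when the non-blocking upgrade fails,
 * the caller must drop its shared lock before blocking for the
 * exclusive lock, and must re-validate the page afterwards since its
 * identity can change while it is unlocked:
 *
 *	if (!page_tryupgrade(pp)) {
 *		page_unlock(pp);
 *		...reacquire with page_lock(pp, SE_EXCL, ...) and
 *		   re-check the page's identity...
 *	}
 */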

/*
 * Downgrade the "exclusive" lock on the page to a "shared" lock
 * while holding the mutex protecting this page's p_selock field.
 */
void
page_downgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	int excl_waiting;

	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
	ASSERT(PAGE_EXCL(pp));

	mutex_enter(pse);
	excl_waiting = pp->p_selock & SE_EWANTED;
	pp->p_selock = SE_READER | excl_waiting;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

void
page_lock_delete(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_vnode == NULL);
	ASSERT(pp->p_offset == (u_offset_t)-1);
	ASSERT(!PP_ISFREE(pp));

	mutex_enter(pse);
	pp->p_selock = SE_DELETED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

int
page_deleted(page_t *pp)
{
	return (pp->p_selock == SE_DELETED);
}

/*
 * Implement the i/o lock for pages
 */
void
page_iolock_init(page_t *pp)
{
	pp->p_iolock_state = 0;
	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
}

/*
 * Acquire the i/o lock on a page.
 */
void
page_io_lock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Release the i/o lock on a page.
 */
void
page_io_unlock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	cv_broadcast(&pp->p_io_cv);
	pp->p_iolock_state &= ~PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Try to acquire the i/o lock on a page without blocking.
 * Returns 1 on success, 0 on failure.
 */
int
page_io_trylock(page_t *pp)
{
	kmutex_t *pio;

	if (pp->p_iolock_state & PAGE_IO_INUSE)
		return (0);

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);

	if (pp->p_iolock_state & PAGE_IO_INUSE) {
		mutex_exit(pio);
		return (0);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);

	return (1);
}

/*
 * Wait until the i/o lock is not held.
 */
void
page_io_wait(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	mutex_exit(pio);
}

/*
 * Returns non-zero if the page's i/o lock is held, 0 otherwise.
 */
int
page_io_locked(page_t *pp)
{
	return (pp->p_iolock_state & PAGE_IO_INUSE);
}

/*
 * For use in ASSERT()s: returns non-zero if the i/o lock on the page
 * is held, 0 otherwise.
 */
int
page_iolock_assert(page_t *pp)
{
	return (page_io_locked(pp));
}
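
/*
 * A usage sketch (hypothetical caller): the i/o lock serializes i/o
 * on a page independently of p_selock:
 *
 *	page_io_lock(pp);
 *	...initiate and wait for i/o on pp...
 *	page_io_unlock(pp);
 *
 * Threads blocked in page_io_lock() or page_io_wait() are woken by
 * the cv_broadcast() in page_io_unlock().
 */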

/*
 * Wrapper exported to kernel routines that are built
 * platform-independent (the macro is platform-dependent;
 * the size of vph_mutex[] is based on NCPU).
 *
 * Note that you can do stress testing on this by setting the
 * variable page_vnode_mutex_stress to something other than
 * zero in a DEBUG kernel in a debugger after loading the kernel.
 * Setting it once the kernel is up and running may not work correctly.
 */
#ifdef DEBUG
static int page_vnode_mutex_stress = 0;
#endif

kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
	if (vp == &kvp || vp == &kvps[KV_VVP])
		return (&vph_mutex[VPH_TABLE_SIZE + 0]);

	if (vp == &kvps[KV_ZVP])
		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
#ifdef DEBUG
	if (page_vnode_mutex_stress != 0)
		return (&vph_mutex[0]);
#endif

	return (&vph_mutex[VP_HASH_FUNC(vp)]);
}

kmutex_t *
page_se_mutex(page_t *pp)
{
	return (PAGE_SE_MUTEX(pp));
}

#ifdef VM_STATS
uint_t pszclck_stat[4];
#endif
/*
 * Find, take, and return the mutex held by hat_page_demote().
 * Called by page_demote_vp_pages() before the hat_page_demote() call and by
 * routines that want to block hat_page_demote() but can't do it
 * via locking all constituent pages.
 *
 * Returns NULL if p_szc is 0.
 *
 * It should only be used for pages that can be demoted by hat_page_demote(),
 * i.e., non-swapfs file system pages.  The logic here is lifted from
 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increasing
 * since the page is locked and not free.
 *
 * The hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the location
 * of the root, this routine relies on the fact that hat_page_demote() changes
 * the root last.
 *
 * If NULL is returned, pp's p_szc is guaranteed to be 0.  If non-NULL is
 * returned, pp's p_szc may be any value.
 */
kmutex_t *
page_szc_lock(page_t *pp)
{
	kmutex_t	*mtx;
	page_t		*rootpp;
	uint_t		szc;
	uint_t		rszc;
	uint_t		pszc = pp->p_szc;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_vnode != NULL);
	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
	ASSERT(!PP_ISKAS(pp));

again:
	if (pszc == 0) {
		VM_STAT_ADD(pszclck_stat[0]);
		return (NULL);
	}

	/* The lock lives in the root page */

	rootpp = PP_GROUPLEADER(pp, pszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);

	/*
	 * Since p_szc can only decrease if pp == rootpp, rootpp
	 * will always be the same, i.e., we have the right root
	 * regardless of rootpp->p_szc.
	 * If the location of pp's root didn't change after we took
	 * the lock, we have the right root; return the mutex hashed
	 * off of it.
	 */
	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
		VM_STAT_ADD(pszclck_stat[1]);
		return (mtx);
	}

	/*
	 * The root location changed because the page got demoted;
	 * locate the new root.
	 */
	if (rszc < pszc) {
		szc = pp->p_szc;
		ASSERT(szc < pszc);
		mutex_exit(mtx);
		pszc = szc;
		VM_STAT_ADD(pszclck_stat[2]);
		goto again;
	}

	VM_STAT_ADD(pszclck_stat[3]);
	/*
	 * The current hat_page_demote() is not done yet;
	 * wait for it to finish.
	 */
	mutex_exit(mtx);
	rootpp = PP_GROUPLEADER(rootpp, rszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);
	mutex_exit(mtx);
	ASSERT(rootpp->p_szc < rszc);
	goto again;
}
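
/*
 * A caller sketch (hypothetical): to block hat_page_demote() across
 * a critical section:
 *
 *	mtx = page_szc_lock(pp);
 *	...pp's p_szc is now stable; it is 0 if mtx == NULL...
 *	if (mtx != NULL)
 *		mutex_exit(mtx);
 */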

int
page_szc_lock_assert(page_t *pp)
{
	page_t *rootpp = PP_PAGEROOT(pp);
	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);

	return (MUTEX_HELD(mtx));
}

/*
 * memseg locking
 */
static krwlock_t memsegslock;

/*
 * memlist (phys_install, phys_avail) locking.
 */
static krwlock_t memlists_lock;

int
memsegs_trylock(int writer)
{
	return (rw_tryenter(&memsegslock, writer ? RW_WRITER : RW_READER));
}

void
memsegs_lock(int writer)
{
	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
}

/*ARGSUSED*/
void
memsegs_unlock(int writer)
{
	rw_exit(&memsegslock);
}

int
memsegs_lock_held(void)
{
	return (RW_LOCK_HELD(&memsegslock));
}

void
memlist_read_lock(void)
{
	rw_enter(&memlists_lock, RW_READER);
}

void
memlist_read_unlock(void)
{
	rw_exit(&memlists_lock);
}

void
memlist_write_lock(void)
{
	rw_enter(&memlists_lock, RW_WRITER);
}

void
memlist_write_unlock(void)
{
	rw_exit(&memlists_lock);
}