/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/callb.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/systm.h>		/* for bzero */
#include <sys/memlist.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>	/* for NOMEMWAIT() */
#include <sys/atomic.h>		/* used to update kcage_freemem */
#include <sys/kmem.h>		/* for kmem_reap */
#include <sys/errno.h>
#include <sys/mem_cage.h>
#include <vm/seg_kmem.h>
#include <vm/page.h>
#include <vm/hat.h>
#include <vm/vm_dep.h>
#include <sys/mem_config.h>
#include <sys/lgrp.h>
#include <sys/rwlock.h>
#include <sys/cpupart.h>

extern pri_t maxclsyspri;

#ifdef DEBUG
#define	KCAGE_STATS
#endif

#ifdef KCAGE_STATS

#define	KCAGE_STATS_VERSION 9	/* can help report generators */
#define	KCAGE_STATS_NSCANS 256	/* depth of scan statistics buffer */

struct kcage_stats_scan {
	/* managed by KCAGE_STAT_* macros */
	clock_t	scan_lbolt;
	uint_t	scan_id;

	/* set in kcage_cageout() */
	uint_t	kt_passes;
	clock_t	kt_ticks;
	pgcnt_t	kt_kcage_freemem_start;
	pgcnt_t	kt_kcage_freemem_end;
	pgcnt_t kt_freemem_start;
	pgcnt_t kt_freemem_end;
	uint_t	kt_examined;
	uint_t	kt_cantlock;
	uint_t	kt_gotone;
	uint_t	kt_gotonefree;
	uint_t	kt_skipshared;
	uint_t	kt_skiprefd;
	uint_t	kt_destroy;

	/* set in kcage_invalidate_page() */
	uint_t	kip_reloclocked;
	uint_t	kip_relocmod;
	uint_t	kip_destroy;
	uint_t	kip_nomem;
	uint_t	kip_demotefailed;

	/* set in kcage_expand() */
	uint_t	ke_wanted;
	uint_t	ke_examined;
	uint_t	ke_lefthole;
	uint_t	ke_gotone;
	uint_t	ke_gotonefree;
};

struct kcage_stats {
	/* managed by KCAGE_STAT_* macros */
	uint_t	version;
	uint_t	size;

	/* set in kcage_cageout */
	uint_t	kt_wakeups;
	uint_t	kt_scans;
	uint_t	kt_cageout_break;

	/* set in kcage_expand */
	uint_t	ke_calls;
	uint_t	ke_nopfn;
	uint_t	ke_nopaget;
	uint_t	ke_isnoreloc;
	uint_t	ke_deleting;
	uint_t	ke_lowfreemem;
	uint_t	ke_terminate;

	/* set in kcage_freemem_add() */
	uint_t	kfa_throttlewake;

	/* set in kcage_freemem_sub() */
	uint_t	kfs_cagewake;

	/* set in kcage_create_throttle */
	uint_t	kct_calls;
	uint_t	kct_cageout;
	uint_t	kct_critical;
	uint_t	kct_exempt;
	uint_t	kct_cagewake;
	uint_t	kct_wait;
	uint_t	kct_progress;
	uint_t	kct_noprogress;
	uint_t	kct_timeout;

	/* set in kcage_cageout_wakeup */
	uint_t	kcw_expandearly;

	/* managed by KCAGE_STAT_* macros */
	uint_t	scan_array_size;
	uint_t	scan_index;
	struct kcage_stats_scan scans[KCAGE_STATS_NSCANS];
};

static struct kcage_stats kcage_stats;
static struct kcage_stats_scan kcage_stats_scan_zero;

/*
 * No real need for atomics here. For the most part the incs and sets are
 * done by the kernel cage thread. There are a few that are done by any
 * number of other threads. Those cases are noted by comments.
 */
#define	KCAGE_STAT_INCR(m)	kcage_stats.m++

#define	KCAGE_STAT_NINCR(m, v) kcage_stats.m += (v)

#define	KCAGE_STAT_INCR_SCAN(m)	\
	KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m)

#define	KCAGE_STAT_NINCR_SCAN(m, v) \
	KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_SET(m, v)	kcage_stats.m = (v)

#define	KCAGE_STAT_SETZ(m, v)	\
	if (kcage_stats.m == 0) kcage_stats.m = (v)

#define	KCAGE_STAT_SET_SCAN(m, v)	\
	KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_SETZ_SCAN(m, v)	\
	KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_INC_SCAN_INDEX \
	KCAGE_STAT_SET_SCAN(scan_lbolt, ddi_get_lbolt()); \
	KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \
	kcage_stats.scan_index = \
	(kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \
	kcage_stats.scans[kcage_stats.scan_index] = kcage_stats_scan_zero

#define	KCAGE_STAT_INIT_SCAN_INDEX \
	kcage_stats.version = KCAGE_STATS_VERSION; \
	kcage_stats.size = sizeof (kcage_stats); \
	kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \
	kcage_stats.scan_index = 0

#else /* KCAGE_STATS */

#define	KCAGE_STAT_INCR(v)
#define	KCAGE_STAT_NINCR(m, v)
#define	KCAGE_STAT_INCR_SCAN(v)
#define	KCAGE_STAT_NINCR_SCAN(m, v)
#define	KCAGE_STAT_SET(m, v)
#define	KCAGE_STAT_SETZ(m, v)
#define	KCAGE_STAT_SET_SCAN(m, v)
#define	KCAGE_STAT_SETZ_SCAN(m, v)
#define	KCAGE_STAT_INC_SCAN_INDEX
#define	KCAGE_STAT_INIT_SCAN_INDEX

#endif /* KCAGE_STATS */

static kmutex_t kcage_throttle_mutex;	/* protects kcage_throttle_cv */
static kcondvar_t kcage_throttle_cv;

static kmutex_t kcage_cageout_mutex;	/* protects cv and ready flag */
static kcondvar_t kcage_cageout_cv;	/* cageout thread naps here */
static int kcage_cageout_ready;		/* nonzero when cageout thread ready */
kthread_id_t kcage_cageout_thread;	/* to aid debugging */

static krwlock_t kcage_range_rwlock;	/* protects kcage_glist elements */

/*
 * Cage expansion happens within a range.
 */
struct kcage_glist {
	struct kcage_glist	*next;
	pfn_t			base;
	pfn_t			lim;
	pfn_t			curr;
	int			decr;
};

static struct kcage_glist *kcage_glist;
static struct kcage_glist *kcage_current_glist;

/*
 * The firstfree element is provided so that kmem_alloc can be avoided
 * until that cage has somewhere to go. This is not currently a problem
 * as early kmem_alloc's use BOP_ALLOC instead of page_create_va.
 */
static vmem_t *kcage_arena;
static struct kcage_glist kcage_glist_firstfree;
static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree;

/*
 * Miscellaneous forward references
 */
static struct kcage_glist *kcage_glist_alloc(void);
static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **);
static void kcage_cageout(void);
static int kcage_invalidate_page(page_t *, pgcnt_t *);
static int kcage_setnoreloc_pages(page_t *, se_t);
static int kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t);
static void kcage_init(pgcnt_t preferred_size);
static int kcage_range_delete_internal(pfn_t base, pgcnt_t npgs);

/*
 * Kernel Memory Cage counters and thresholds.
 */
int kcage_on = 0;
pgcnt_t kcage_freemem;
pgcnt_t kcage_needfree;
pgcnt_t kcage_lotsfree;
pgcnt_t kcage_desfree;
pgcnt_t kcage_minfree;
pgcnt_t kcage_throttlefree;
pgcnt_t	kcage_reserve;
int kcage_maxwait = 10;	/* in seconds */

/* when we use lp for kmem we start the cage at a higher initial value */
pgcnt_t kcage_kmemlp_mincage;

#ifdef DEBUG
pgcnt_t	kcage_pagets;
#define	KCAGEPAGETS_INC()	kcage_pagets++
#else
#define	KCAGEPAGETS_INC()
#endif

/* kstats to export what pages are currently caged */
kmutex_t kcage_kstat_lock;
static int kcage_kstat_update(kstat_t *ksp, int rw);
static int kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

/*
 * Startup and Dynamic Reconfiguration interfaces.
 * kcage_range_add()
 * kcage_range_del()
 * kcage_range_delete_post_mem_del()
 * kcage_range_init()
 * kcage_set_thresholds()
 */

/*
 * Called from page_get_contig_pages to get the approximate kcage pfn range
 * for exclusion from search for contiguous pages. This routine is called
 * without kcage_range lock (kcage routines can call page_get_contig_pages
 * through page_relocate) and with the assumption, based on kcage_range_add,
 * that kcage_current_glist always contains a valid pointer.
 */

int
kcage_current_pfn(pfn_t *pfncur)
{
	struct kcage_glist *lp = kcage_current_glist;

	ASSERT(kcage_on);

	ASSERT(lp != NULL);

	*pfncur = lp->curr;

	return (lp->decr);
}

/*
 * Called from vm_pagelist.c during coalesce to find kernel cage regions
 * within an mnode. Looks for the lowest range between lo and hi.
 *
 * Kernel cage memory is defined between kcage_glist and kcage_current_glist.
 * Non-cage memory is defined between kcage_current_glist and list end.
 *
 * If incage is set, returns the lowest kcage range. Otherwise returns lowest
 * non-cage range.
 *
 * Returns zero on success and nlo, nhi:
 * 	lo <= nlo < nhi <= hi
 * Returns non-zero if no overlapping range is found.
 */
int
kcage_next_range(int incage, pfn_t lo, pfn_t hi,
    pfn_t *nlo, pfn_t *nhi)
{
	struct kcage_glist *lp;
	pfn_t tlo = hi;
	pfn_t thi = hi;

	ASSERT(lo <= hi);

	/*
	 * Reader lock protects the list, but kcage_get_pfn
	 * running concurrently may advance kcage_current_glist
	 * and also update kcage_current_glist->curr. Page
	 * coalesce can handle this race condition.
	 */
	rw_enter(&kcage_range_rwlock, RW_READER);

	for (lp = incage ? kcage_glist : kcage_current_glist;
	    lp != NULL; lp = lp->next) {

		pfn_t klo, khi;

		/* find the range limits in this element */
		if ((incage && lp->decr) || (!incage && !lp->decr)) {
			klo = lp->curr;
			khi = lp->lim;
		} else {
			klo = lp->base;
			khi = lp->curr;
		}

		/* handle overlap */
		if (klo < tlo && klo < khi && lo < khi && klo < hi) {
			tlo = MAX(lo, klo);
			thi = MIN(hi, khi);
			if (tlo == lo)
				break;
		}

		/* check end of kcage */
		if (incage && lp == kcage_current_glist) {
			break;
		}
	}

	rw_exit(&kcage_range_rwlock);

	/* return non-zero if no overlapping range found */
	if (tlo == thi)
		return (1);

	ASSERT(lo <= tlo && tlo < thi && thi <= hi);

	/* return overlapping range */
	*nlo = tlo;
	*nhi = thi;
	return (0);
}
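
/*
 * Illustrative usage sketch (hypothetical caller, variable names made up
 * for the example): every non-cage sub-range of a pfn span can be walked
 * with kcage_next_range() alone, advancing lo past each returned range.
 *
 *	pfn_t lo = span_lo, hi = span_hi, nlo, nhi;
 *
 *	while (lo < hi && kcage_next_range(0, lo, hi, &nlo, &nhi) == 0) {
 *		(process the non-cage pfns in [nlo, nhi) here)
 *		lo = nhi;
 *	}
 *
 * Passing 1 as the first argument walks the caged sub-ranges instead.
 */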

void
kcage_range_init(struct memlist *ml, kcage_dir_t d, pgcnt_t preferred_size)
{
	int ret = 0;

	ASSERT(kcage_arena == NULL);
	kcage_arena = vmem_create("kcage_arena", NULL, 0, sizeof (uint64_t),
	    segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP);
	ASSERT(kcage_arena != NULL);

	if (d == KCAGE_DOWN) {
		while (ml->ml_next != NULL)
			ml = ml->ml_next;
	}

	rw_enter(&kcage_range_rwlock, RW_WRITER);

	while (ml != NULL) {
		ret = kcage_range_add_internal(btop(ml->ml_address),
		    btop(ml->ml_size), d);
		if (ret)
			panic("kcage_range_add_internal failed: "
			    "ml=%p, ret=0x%x\n", (void *)ml, ret);

		ml = (d == KCAGE_DOWN ? ml->ml_prev : ml->ml_next);
	}

	rw_exit(&kcage_range_rwlock);

	if (ret == 0)
		kcage_init(preferred_size);
}
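
/*
 * Typical startup usage sketch (hypothetical; the memlist and the page
 * count are placeholders, and the direction is a platform decision): the
 * platform code seeds the ranges and builds the cage with a single call,
 * e.g.
 *
 *	kcage_range_init(phys_avail, KCAGE_DOWN, preferred_pages);
 *
 * KCAGE_DOWN is used when the kernel has been loaded into the upper end
 * of physical memory; the increasing-pfn direction is used when it has
 * been loaded at the low end (see the comment in kcage_init() below).
 */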

/*
 * Third arg controls direction of growth: 0: increasing pfns,
 * 1: decreasing.
 */
static int
kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t d)
{
	struct kcage_glist *new, **lpp;
	pfn_t lim;

	ASSERT(rw_write_held(&kcage_range_rwlock));

	ASSERT(npgs != 0);
	if (npgs == 0)
		return (EINVAL);

	lim = base + npgs;

	ASSERT(lim > base);
	if (lim <= base)
		return (EINVAL);

	new = kcage_glist_alloc();
	if (new == NULL) {
		return (ENOMEM);
	}

	new->base = base;
	new->lim = lim;
	new->decr = (d == KCAGE_DOWN);
	if (new->decr != 0)
		new->curr = new->lim;
	else
		new->curr = new->base;
	/*
	 * Any overlapping existing ranges are removed by deleting
	 * from the new list as we search for the tail.
	 */
	lpp = &kcage_glist;
	while (*lpp != NULL) {
		int ret;
		ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new);
		if (ret != 0)
			return (ret);
		lpp = &(*lpp)->next;
	}

	*lpp = new;

	if (kcage_current_glist == NULL) {
		kcage_current_glist = kcage_glist;
	}

	return (0);
}

int
kcage_range_add(pfn_t base, pgcnt_t npgs, kcage_dir_t d)
{
	int ret;

	rw_enter(&kcage_range_rwlock, RW_WRITER);
	ret = kcage_range_add_internal(base, npgs, d);
	rw_exit(&kcage_range_rwlock);
	return (ret);
}

/*
 * Calls to add and delete must be protected by kcage_range_rwlock
 */
static int
kcage_range_delete_internal(pfn_t base, pgcnt_t npgs)
{
	struct kcage_glist *lp;
	pfn_t lim;

	ASSERT(rw_write_held(&kcage_range_rwlock));

	ASSERT(npgs != 0);
	if (npgs == 0)
		return (EINVAL);

	lim = base + npgs;

	ASSERT(lim > base);
	if (lim <= base)
		return (EINVAL);

	/*
	 * Check if the delete is OK first as a number of elements
	 * might be involved and it will be difficult to go
	 * back and undo (can't just add the range back in).
	 */
	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
		/*
		 * If there have been no pages allocated from this
		 * element, we don't need to check it.
		 */
		if ((lp->decr == 0 && lp->curr == lp->base) ||
		    (lp->decr != 0 && lp->curr == lp->lim))
			continue;
		/*
		 * If the element does not overlap, it's OK.
		 */
		if (base >= lp->lim || lim <= lp->base)
			continue;
		/*
		 * Overlapping element: Does the range to be deleted
		 * overlap the area already used? If so fail.
		 */
		if (lp->decr == 0 && base < lp->curr && lim >= lp->base) {
			return (EBUSY);
		}
		if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) {
			return (EBUSY);
		}
	}
	return (kcage_glist_delete(base, lim, &kcage_glist));
}

int
kcage_range_delete(pfn_t base, pgcnt_t npgs)
{
	int ret;

	rw_enter(&kcage_range_rwlock, RW_WRITER);
	ret = kcage_range_delete_internal(base, npgs);
	rw_exit(&kcage_range_rwlock);
	return (ret);
}

/*
 * Calls to add and delete must be protected by kcage_range_rwlock.
 * This routine is called from the DR post-memory-delete routines
 * after a successful Solaris memory delete operation.
 */
static int
kcage_range_delete_post_mem_del_internal(pfn_t base, pgcnt_t npgs)
{
	pfn_t lim;

	ASSERT(rw_write_held(&kcage_range_rwlock));

	ASSERT(npgs != 0);
	if (npgs == 0)
		return (EINVAL);

	lim = base + npgs;

	ASSERT(lim > base);
	if (lim <= base)
		return (EINVAL);

	return (kcage_glist_delete(base, lim, &kcage_glist));
}

int
kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs)
{
	int ret;

	rw_enter(&kcage_range_rwlock, RW_WRITER);
	ret = kcage_range_delete_post_mem_del_internal(base, npgs);
	rw_exit(&kcage_range_rwlock);
	return (ret);
}

/*
 * No locking is required here as the whole operation is covered
 * by kcage_range_rwlock writer lock.
 */
static struct kcage_glist *
kcage_glist_alloc(void)
{
	struct kcage_glist *new;

	if ((new = kcage_glist_freelist) != NULL) {
		kcage_glist_freelist = new->next;
	} else if (kernel_cage_enable) {
		new = vmem_alloc(kcage_arena, sizeof (*new), VM_NOSLEEP);
	} else {
		/*
		 * On DR supported platforms we allow memory add
		 * even when kernel cage is disabled. "kcage_arena" is
		 * created only when kernel cage is enabled.
		 */
		new = kmem_zalloc(sizeof (*new), KM_NOSLEEP);
	}

	if (new != NULL)
		bzero(new, sizeof (*new));

	return (new);
}

static void
kcage_glist_free(struct kcage_glist *lp)
{
	lp->next = kcage_glist_freelist;
	kcage_glist_freelist = lp;
}

static int
kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp)
{
	struct kcage_glist *lp, *prev = *lpp;

	while ((lp = *lpp) != NULL) {
		if (lim > lp->base && base < lp->lim) {
			/* The delete range overlaps this element. */
			if (base <= lp->base && lim >= lp->lim) {
				/* Delete whole element. */
				*lpp = lp->next;
				if (lp == kcage_current_glist) {
					/* This can never happen. */
					ASSERT(kcage_current_glist != prev);
					kcage_current_glist = prev;
				}
				kcage_glist_free(lp);
				continue;
			}

			/* Partial delete. */
			if (base > lp->base && lim < lp->lim) {
				struct kcage_glist *new;

				/*
				 * Remove a section from the middle,
				 * need to allocate a new element.
				 */
				new = kcage_glist_alloc();
				if (new == NULL) {
					return (ENOMEM);
				}

				/*
				 * Transfer unused range to new.
				 * Edit lp in place to preserve
				 * kcage_current_glist.
				 */
				new->decr = lp->decr;
				if (new->decr != 0) {
					new->base = lp->base;
					new->lim = base;
					new->curr = base;

					lp->base = lim;
				} else {
					new->base = lim;
					new->lim = lp->lim;
					new->curr = new->base;

					lp->lim = base;
				}

				/* Insert new. */
				new->next = lp->next;
				lp->next = new;
				lpp = &lp->next;
			} else {
				/* Delete part of current block. */
				if (base > lp->base) {
					ASSERT(lim >= lp->lim);
					ASSERT(base < lp->lim);
					if (lp->decr != 0 &&
					    lp->curr == lp->lim)
						lp->curr = base;
					lp->lim = base;
				} else {
					ASSERT(base <= lp->base);
					ASSERT(lim > lp->base);
					if (lp->decr == 0 &&
					    lp->curr == lp->base)
						lp->curr = lim;
					lp->base = lim;
				}
			}
		}
		prev = *lpp;
		lpp = &(*lpp)->next;
	}

	return (0);
}

/*
 * If lockit is 1, kcage_get_pfn holds the
 * reader lock for kcage_range_rwlock.
 * Changes to lp->curr can cause race conditions, but
 * they are handled by higher level code (see kcage_next_range.)
 */
static pfn_t
kcage_get_pfn(int lockit)
{
	struct kcage_glist *lp;
	pfn_t pfn = PFN_INVALID;

	if (lockit && !rw_tryenter(&kcage_range_rwlock, RW_READER))
		return (pfn);

	lp = kcage_current_glist;
	while (lp != NULL) {
		if (lp->decr != 0) {
			if (lp->curr != lp->base) {
				pfn = --lp->curr;
				break;
			}
		} else {
			if (lp->curr != lp->lim) {
				pfn = lp->curr++;
				break;
			}
		}

		lp = lp->next;
		if (lp)
			kcage_current_glist = lp;
	}

	if (lockit)
		rw_exit(&kcage_range_rwlock);
	return (pfn);
}

/*
 * Walk the physical address space of the cage.
 * This routine does not guarantee to return PFNs in the order
 * in which they were allocated to the cage. Instead, it walks
 * each range as it appears on the growth list, returning the PFNs
 * in each range in ascending order.
 *
 * To begin scanning at lower edge of cage, reset should be nonzero.
 * To step through cage, reset should be zero.
 *
 * PFN_INVALID will be returned when the upper end of the cage is
 * reached -- indicating a full scan of the cage has been completed since
 * previous reset. PFN_INVALID will continue to be returned until
 * kcage_walk_cage is reset.
 *
 * It is possible to receive a PFN_INVALID result on reset if a growth
 * list is not installed or if none of the PFNs in the installed list have
 * been allocated to the cage. In other words, there is no cage.
 *
 * Caller need not hold kcage_range_rwlock while calling this function
 * as the front part of the list is static - pages never come out of
 * the cage.
 *
 * The caller is expected to only be kcage_cageout().
 */
static pfn_t
kcage_walk_cage(int reset)
{
	static struct kcage_glist *lp = NULL;
	static pfn_t pfn;

	if (reset)
		lp = NULL;
	if (lp == NULL) {
		lp = kcage_glist;
		pfn = PFN_INVALID;
	}
again:
	if (pfn == PFN_INVALID) {
		if (lp == NULL)
			return (PFN_INVALID);

		if (lp->decr != 0) {
			/*
			 * In this range the cage grows from the highest
			 * address towards the lowest.
			 * Arrange to return pfns from curr to lim-1,
			 * inclusive, in ascending order.
			 */

			pfn = lp->curr;
		} else {
			/*
			 * In this range the cage grows from the lowest
			 * address towards the highest.
			 * Arrange to return pfns from base to curr,
			 * inclusive, in ascending order.
			 */

			pfn = lp->base;
		}
	}

	if (lp->decr != 0) {		/* decrementing pfn */
		if (pfn == lp->lim) {
			/* Don't go beyond the static part of the glist. */
			if (lp == kcage_current_glist)
				lp = NULL;
			else
				lp = lp->next;
			pfn = PFN_INVALID;
			goto again;
		}

		ASSERT(pfn >= lp->curr && pfn < lp->lim);
	} else {			/* incrementing pfn */
		if (pfn == lp->curr) {
			/* Don't go beyond the static part of the glist. */
			if (lp == kcage_current_glist)
				lp = NULL;
			else
				lp = lp->next;
			pfn = PFN_INVALID;
			goto again;
		}

		ASSERT(pfn >= lp->base && pfn < lp->curr);
	}

	return (pfn++);
}

/*
 * Callback functions to recalculate cage thresholds after
 * Kphysm memory add/delete operations.
 */
/*ARGSUSED*/
static void
kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages)
{
	kcage_recalc_thresholds();
}

/*ARGSUSED*/
static int
kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages)
{
	/* TODO: when should cage refuse memory delete requests? */
	return (0);
}

/*ARGSUSED*/
static  void
kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled)
{
	kcage_recalc_thresholds();
}

static kphysm_setup_vector_t kcage_kphysm_vectors = {
	KPHYSM_SETUP_VECTOR_VERSION,
	kcage_kphysm_postadd_cb,
	kcage_kphysm_predel_cb,
	kcage_kphysm_postdel_cb
};

/*
 * This is called before a CPR suspend and after a CPR resume.  We have to
 * turn off kcage_cageout_ready before a suspend, and turn it back on after a
 * restart.
 */
/*ARGSUSED*/
static boolean_t
kcage_cageout_cpr(void *arg, int code)
{
	if (code == CB_CODE_CPR_CHKPT) {
		ASSERT(kcage_cageout_ready);
		kcage_cageout_ready = 0;
		return (B_TRUE);
	} else if (code == CB_CODE_CPR_RESUME) {
		ASSERT(kcage_cageout_ready == 0);
		kcage_cageout_ready = 1;
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * kcage_recalc_preferred_size() increases initial cage size to improve large
 * page availability when lp for kmem is enabled and kpr is disabled
 */
static pgcnt_t
kcage_recalc_preferred_size(pgcnt_t preferred_size)
{
	if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) {
		pgcnt_t lpmincage = kcage_kmemlp_mincage;
		if (lpmincage == 0) {
			lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8),
			    segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE;
		}
		kcage_kmemlp_mincage = MIN(lpmincage,
		    (segkmem_kmemlp_max / PAGESIZE));
		preferred_size = MAX(kcage_kmemlp_mincage, preferred_size);
	}
	return (preferred_size);
}
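
/*
 * Illustrative figures (assumptions, not values taken from this file):
 * with an 8K page size and 8 GB of physical memory, (physmem * PAGESIZE)
 * / 8 is 1 GB, the MIN() against 0x40000000UL leaves it at 1 GB, and the
 * divide by PAGESIZE yields roughly 128K pages of minimum initial cage,
 * before the segkmem_kmemlp_max cap is applied.
 */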

/*
 * Kcage_init() builds the cage and initializes the cage thresholds.
 * The size of the cage is determined by the argument preferred_size,
 * or the actual amount of memory, whichever is smaller.
 */
static void
kcage_init(pgcnt_t preferred_size)
{
	pgcnt_t wanted;
	pfn_t pfn;
	page_t *pp;
	kstat_t *ksp;

	extern void page_list_noreloc_startup(page_t *);

	ASSERT(!kcage_on);

	/* increase preferred cage size for lp for kmem */
	preferred_size = kcage_recalc_preferred_size(preferred_size);

	/* Debug note: initialize this now so early expansions can stat */
	KCAGE_STAT_INIT_SCAN_INDEX;

	/*
	 * Initialize cage thresholds and install kphysm callback.
	 * If we can't arrange to have the thresholds track with
	 * available physical memory, then the cage thresholds may
	 * end up over time at levels that adversely affect system
	 * performance; so, bail out.
	 */
	kcage_recalc_thresholds();
	if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) {
		ASSERT(0);		/* Catch this in DEBUG kernels. */
		return;
	}

	/*
	 * Limit startup cage size within the range of kcage_minfree
	 * and availrmem, inclusively.
	 */
	wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem);

	/*
	 * Construct the cage. PFNs are allocated from the glist. It
	 * is assumed that the list has been properly ordered for the
	 * platform by the platform code. Typically, this is as simple
	 * as calling kcage_range_init(phys_avail, d, preferred_size),
	 * where d is KCAGE_DOWN if the kernel has been loaded into the
	 * upper end of physical memory, or KCAGE_UP if it has been
	 * loaded at the low end.
	 *
	 * Note: it is assumed that we are in the startup flow, so there
	 * is no reason to grab the page lock.
	 */
	kcage_freemem = 0;
	pfn = PFN_INVALID;			/* prime for alignment test */
	while (wanted != 0) {
		if ((pfn = kcage_get_pfn(0)) == PFN_INVALID)
			break;

		if ((pp = page_numtopp_nolock(pfn)) != NULL) {
			KCAGEPAGETS_INC();
			/*
			 * Set the noreloc state on the page.
			 * If the page is free and not already
			 * on the noreloc list then move it.
			 */
			if (PP_ISFREE(pp)) {
				if (PP_ISNORELOC(pp) == 0)
					page_list_noreloc_startup(pp);
			} else {
				ASSERT(pp->p_szc == 0);
				PP_SETNORELOC(pp);
			}
		}
		PLCNT_XFER_NORELOC(pp);
		wanted -= 1;
	}

	/*
	 * Need to go through and find kernel allocated pages
	 * and capture them into the Cage.  These will primarily
	 * be pages gotten through boot_alloc().
	 */
	if (kvp.v_pages) {

		pp = kvp.v_pages;
		do {
			ASSERT(!PP_ISFREE(pp));
			ASSERT(pp->p_szc == 0);
			if (PP_ISNORELOC(pp) == 0) {
				PP_SETNORELOC(pp);
				PLCNT_XFER_NORELOC(pp);
			}
		} while ((pp = pp->p_vpnext) != kvp.v_pages);

	}

	kcage_on = 1;

	/*
	 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend()
	 * after the cageout thread is blocked, and executes from cpr_resume()
	 * before the cageout thread is restarted.  By executing in this class,
	 * we are assured that the kernel cage thread won't miss wakeup calls
	 * and also CPR's larger kmem_alloc requests will not fail after
	 * CPR shuts down the cageout kernel thread.
	 */
	(void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL,
	    "cageout");

	/*
	 * Coalesce pages to improve large page availability. A better fix
	 * would be to coalesce pages as they are included in the cage.
	 */
	if (SEGKMEM_USE_LARGEPAGES) {
		extern void page_freelist_coalesce_all(int mnode);
		page_freelist_coalesce_all(-1);	/* do all mnodes */
	}

	ksp = kstat_create("kcage", 0, "kcage_page_list", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_update = kcage_kstat_update;
		ksp->ks_snapshot = kcage_kstat_snapshot;
		ksp->ks_lock = &kcage_kstat_lock; /* XXX - not really needed */
		kstat_install(ksp);
	}
}

static int
kcage_kstat_update(kstat_t *ksp, int rw)
{
	struct kcage_glist *lp;
	uint_t count;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	count = 0;
	rw_enter(&kcage_range_rwlock, RW_WRITER);
	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
		if (lp->decr) {
			if (lp->curr != lp->lim) {
				count++;
			}
		} else {
			if (lp->curr != lp->base) {
				count++;
			}
		}
	}
	rw_exit(&kcage_range_rwlock);

	ksp->ks_ndata = count;
	ksp->ks_data_size = count * 2 * sizeof (uint64_t);

	return (0);
}

static int
kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
	struct kcage_glist *lp;
	struct memunit {
		uint64_t address;
		uint64_t size;
	} *kspmem;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ksp->ks_snaptime = gethrtime();

	kspmem = (struct memunit *)buf;
	rw_enter(&kcage_range_rwlock, RW_WRITER);
	for (lp = kcage_glist; lp != NULL; lp = lp->next, kspmem++) {
		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
			break;

		if (lp->decr) {
			if (lp->curr != lp->lim) {
				kspmem->address = ptob(lp->curr);
				kspmem->size = ptob(lp->lim - lp->curr);
			}
		} else {
			if (lp->curr != lp->base) {
				kspmem->address = ptob(lp->base);
				kspmem->size = ptob(lp->curr - lp->base);
			}
		}
	}
	rw_exit(&kcage_range_rwlock);

	return (0);
}

void
kcage_recalc_thresholds()
{
	static int first = 1;
	static pgcnt_t init_lotsfree;
	static pgcnt_t init_desfree;
	static pgcnt_t init_minfree;
	static pgcnt_t init_throttlefree;
	static pgcnt_t init_reserve;

	/* TODO: any reason to take more care than this with live editing? */
	mutex_enter(&kcage_cageout_mutex);
	mutex_enter(&freemem_lock);

	if (first) {
		first = 0;
		init_lotsfree = kcage_lotsfree;
		init_desfree = kcage_desfree;
		init_minfree = kcage_minfree;
		init_throttlefree = kcage_throttlefree;
		init_reserve = kcage_reserve;
	} else {
		kcage_lotsfree = init_lotsfree;
		kcage_desfree = init_desfree;
		kcage_minfree = init_minfree;
		kcage_throttlefree = init_throttlefree;
		kcage_reserve = init_reserve;
	}

	if (kcage_lotsfree == 0)
		kcage_lotsfree = MAX(32, total_pages / 256);

	if (kcage_minfree == 0)
		kcage_minfree = MAX(32, kcage_lotsfree / 2);

	if (kcage_desfree == 0)
		kcage_desfree = MAX(32, kcage_minfree);

	if (kcage_throttlefree == 0)
		kcage_throttlefree = MAX(32, kcage_minfree / 2);

	if (kcage_reserve == 0)
		kcage_reserve = MIN(32, kcage_throttlefree / 2);

	mutex_exit(&freemem_lock);
	mutex_exit(&kcage_cageout_mutex);

	if (kcage_cageout_ready) {
		if (kcage_freemem < kcage_desfree)
			kcage_cageout_wakeup();

		if (kcage_needfree) {
			mutex_enter(&kcage_throttle_mutex);
			cv_broadcast(&kcage_throttle_cv);
			mutex_exit(&kcage_throttle_mutex);
		}
	}
}
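
/*
 * Worked example of the default chain above (illustrative figures only,
 * assuming none of the kcage_* tunables have been set): with total_pages
 * at 1M pages, kcage_lotsfree = 1M / 256 = 4096, kcage_minfree = 2048,
 * kcage_desfree = 2048, kcage_throttlefree = 1024 and kcage_reserve =
 * MIN(32, 512) = 32.
 */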

/*
 * Pageout interface:
 * kcage_cageout_init()
 */
void
kcage_cageout_init()
{
	if (kcage_on) {
		(void) lwp_kernel_create(proc_pageout, kcage_cageout, NULL,
		    TS_RUN, maxclsyspri - 1);
	}
}


/*
 * VM Interfaces:
 * kcage_create_throttle()
 * kcage_freemem_add()
 * kcage_freemem_sub()
 */

/*
 * Wakeup cageout thread and throttle waiting for the number of pages
 * requested to become available.  For non-critical requests, a
 * timeout is added, since freemem accounting is separate from cage
 * freemem accounting: it's possible for us to get stuck and not make
 * forward progress even though there was sufficient freemem before
 * arriving here.
 */
int
kcage_create_throttle(pgcnt_t npages, int flags)
{

	KCAGE_STAT_INCR(kct_calls);		/* unprotected incr. */

	/*
	 * Obviously, we can't throttle the cageout thread since
	 * we depend on it.  We also can't throttle the panic thread.
	 */
	if (curthread == kcage_cageout_thread || panicstr) {
		KCAGE_STAT_INCR(kct_cageout);	/* unprotected incr. */
		return (KCT_CRIT);
	}

	/*
	 * Don't throttle threads which are critical for proper
	 * vm management if we're above kcage_throttlefree or
	 * if freemem is very low.
	 */
	if (NOMEMWAIT()) {
		if (kcage_freemem > kcage_throttlefree + npages) {
			KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
			return (KCT_CRIT);
		} else if (freemem < minfree) {
			KCAGE_STAT_INCR(kct_critical);  /* unprotected incr. */
			return (KCT_CRIT);
		}
	}

	/*
	 * Don't throttle real-time threads if kcage_freemem > kcage_reserve.
	 */
	if (DISP_PRIO(curthread) > maxclsyspri &&
	    kcage_freemem > kcage_reserve) {
		KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
		return (KCT_CRIT);
	}

	/*
	 * Cause all other threads (which are assumed to not be
	 * critical to cageout) to wait here until their request
	 * can be satisfied. Be a little paranoid and wake the
	 * kernel cage on each loop through this logic.
	 */
	while (kcage_freemem < kcage_throttlefree + npages) {
		ASSERT(kcage_on);
		if (kcage_cageout_ready) {
			mutex_enter(&kcage_throttle_mutex);

			kcage_needfree += npages;
			KCAGE_STAT_INCR(kct_wait);

			kcage_cageout_wakeup();
			KCAGE_STAT_INCR(kct_cagewake);

			cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex);

			kcage_needfree -= npages;

			mutex_exit(&kcage_throttle_mutex);
		} else {
			/*
			 * NOTE: atomics are used just in case we enter
			 * mp operation before the cageout thread is ready.
			 */
			atomic_add_long(&kcage_needfree, npages);

			kcage_cageout_wakeup();
			KCAGE_STAT_INCR(kct_cagewake);	/* unprotected incr. */

			atomic_add_long(&kcage_needfree, -npages);
		}

		if (NOMEMWAIT() && freemem < minfree) {
			return (KCT_CRIT);
		}
		if ((flags & PG_WAIT) == 0) {
			pgcnt_t limit = (flags & PG_NORMALPRI) ?
			    throttlefree : pageout_reserve;

			if ((kcage_freemem < kcage_throttlefree + npages) &&
			    (freemem < limit + npages)) {
				return (KCT_FAILURE);
			} else {
				return (KCT_NONCRIT);
			}
		}
	}
	return (KCT_NONCRIT);
}
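
/*
 * Caller-side sketch (hypothetical; no such caller lives in this file):
 * an allocation path asking for npages of cage memory would act on the
 * return value roughly as follows, giving up only when a non-blocking
 * request cannot be satisfied.
 *
 *	switch (kcage_create_throttle(npages, flags)) {
 *	case KCT_FAILURE:
 *		return (NULL);		(non-blocking request, give up)
 *	case KCT_CRIT:
 *	case KCT_NONCRIT:
 *	default:
 *		break;		(proceed with the allocation attempt)
 *	}
 */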

void
kcage_freemem_add(pgcnt_t npages)
{
	extern void wakeup_pcgs(void);

	atomic_add_long(&kcage_freemem, npages);

	wakeup_pcgs();  /* wakeup threads in pcgs() */

	if (kcage_needfree != 0 &&
	    kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {

		mutex_enter(&kcage_throttle_mutex);
		cv_broadcast(&kcage_throttle_cv);
		KCAGE_STAT_INCR(kfa_throttlewake);
		mutex_exit(&kcage_throttle_mutex);
	}
}

void
kcage_freemem_sub(pgcnt_t npages)
{
	atomic_add_long(&kcage_freemem, -npages);

	if (kcage_freemem < kcage_desfree) {
		kcage_cageout_wakeup();
		KCAGE_STAT_INCR(kfs_cagewake); /* unprotected incr. */
	}
}

/*
 * return 0 on failure and 1 on success.
 */
static int
kcage_setnoreloc_pages(page_t *rootpp, se_t se)
{
	pgcnt_t npgs, i;
	page_t *pp;
	pfn_t rootpfn = page_pptonum(rootpp);
	uint_t szc;

	ASSERT(!PP_ISFREE(rootpp));
	ASSERT(PAGE_LOCKED_SE(rootpp, se));
	if (!group_page_trylock(rootpp, se)) {
		return (0);
	}
	szc = rootpp->p_szc;
	if (szc == 0) {
		/*
		 * The szc of a locked page can only change for pages that are
		 * non-swapfs (i.e. anonymous memory) file system pages.
		 */
		ASSERT(rootpp->p_vnode != NULL &&
		    !PP_ISKAS(rootpp) &&
		    !IS_SWAPFSVP(rootpp->p_vnode));
		PP_SETNORELOC(rootpp);
		return (1);
	}
	npgs = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(rootpfn, npgs));
	pp = rootpp;
	for (i = 0; i < npgs; i++, pp++) {
		ASSERT(PAGE_LOCKED_SE(pp, se));
		ASSERT(!PP_ISFREE(pp));
		ASSERT(pp->p_szc == szc);
		PP_SETNORELOC(pp);
	}
	group_page_unlock(rootpp);
	return (1);
}

/*
 * Attempt to convert page to a caged page (set the P_NORELOC flag).
 * If successful and the page is free, move the page to the tail of whichever
 * list it is on.
 * Returns:
 *   EBUSY  page already locked, assimilated but not free.
 *   ENOMEM page assimilated, but memory too low to relocate. Page not free.
 *   EAGAIN page not assimilated. Page not free.
 *   ERANGE page assimilated. Page not root.
 *   0      page assimilated. Page free.
 *   *nfreedp number of pages freed.
 * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way
 * to distinguish between a page that was already a NORELOC page from
 * those newly converted to NORELOC pages by this invocation of
 * kcage_assimilate_page.
 */
static int
kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp)
{
	if (page_trylock(pp, SE_EXCL)) {
		if (PP_ISNORELOC(pp)) {
check_free_and_return:
			if (PP_ISFREE(pp)) {
				page_unlock(pp);
				*nfreedp = 0;
				return (0);
			} else {
				page_unlock(pp);
				return (EBUSY);
			}
			/*NOTREACHED*/
		}
	} else {
		if (page_trylock(pp, SE_SHARED)) {
			if (PP_ISNORELOC(pp))
				goto check_free_and_return;
		} else {
			return (EAGAIN);
		}
		if (!PP_ISFREE(pp)) {
			page_unlock(pp);
			return (EAGAIN);
		}

		/*
		 * Need to upgrade the lock on it and set the NORELOC
		 * bit. If it is free then remove it from the free
		 * list so that the platform free list code can keep
		 * NORELOC pages where they should be.
		 */
		/*
		 * Before doing anything, get the exclusive lock.
		 * This may fail (eg ISM pages are left shared locked).
		 * If the page is free this will leave a hole in the
		 * cage. There is no solution yet to this.
		 */
		if (!page_tryupgrade(pp)) {
			page_unlock(pp);
			return (EAGAIN);
		}
	}

	ASSERT(PAGE_EXCL(pp));

	if (PP_ISFREE(pp)) {
		int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST;

		page_list_sub(pp, which);
		ASSERT(pp->p_szc == 0);
		PP_SETNORELOC(pp);
		PLCNT_XFER_NORELOC(pp);
		page_list_add(pp, which | PG_LIST_TAIL);

		page_unlock(pp);
		*nfreedp = 1;
		return (0);
	} else {
		if (pp->p_szc != 0) {
			if (!kcage_setnoreloc_pages(pp, SE_EXCL)) {
				page_unlock(pp);
				return (EAGAIN);
			}
			ASSERT(PP_ISNORELOC(pp));
		} else {
			PP_SETNORELOC(pp);
		}
		PLCNT_XFER_NORELOC(pp);
		return (kcage_invalidate_page(pp, nfreedp));
	}
	/*NOTREACHED*/
}

static int
kcage_expand()
{
	int did_something = 0;

	spgcnt_t wanted;
	pfn_t pfn;
	page_t *pp;
	/* TODO: we don't really need n any more? */
	pgcnt_t n;
	pgcnt_t nf, nfreed;

	/*
	 * Expand the cage if available cage memory is really low. Calculate
	 * the amount required to return kcage_freemem to the level of
	 * kcage_lotsfree, or to satisfy throttled requests, whichever is
	 * more.  It is rare for their sum to create an artificial threshold
	 * above kcage_lotsfree, but it is possible.
	 *
	 * Exit early if expansion amount is equal to or less than zero.
	 * (<0 is possible if kcage_freemem rises suddenly.)
	 *
	 * Exit early when freemem drops below pageout_reserve plus the request.
	 */
	wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree)
	    - kcage_freemem;
	if (wanted <= 0) {
		return (0);
	} else if (freemem < pageout_reserve + wanted) {
		KCAGE_STAT_INCR(ke_lowfreemem);
		return (0);
	}

	KCAGE_STAT_INCR(ke_calls);
	KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted);

	/*
	 * Assimilate more pages from the global page pool into the cage.
	 */
	n = 0;				/* number of pages PP_SETNORELOC'd */
	nf = 0;				/* number of those actually free */
	while (kcage_on && nf < wanted) {
		pfn = kcage_get_pfn(1);
		if (pfn == PFN_INVALID) {	/* eek! no where to grow */
			KCAGE_STAT_INCR(ke_nopfn);
			goto terminate;
		}

		KCAGE_STAT_INCR_SCAN(ke_examined);

		if ((pp = page_numtopp_nolock(pfn)) == NULL) {
			KCAGE_STAT_INCR(ke_nopaget);
			continue;
		}
		KCAGEPAGETS_INC();
		/*
		 * Sanity check. Skip this pfn if it is
		 * being deleted.
		 */
		if (pfn_is_being_deleted(pfn)) {
			KCAGE_STAT_INCR(ke_deleting);
			continue;
		}

		if (PP_ISNORELOC(pp)) {
			KCAGE_STAT_INCR(ke_isnoreloc);
			continue;
		}

		switch (kcage_assimilate_page(pp, &nfreed)) {
			case 0:		/* assimilated, page is free */
				KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed);
				did_something = 1;
				nf += nfreed;
				n++;
				break;

			case EBUSY:	/* assimilated, page not free */
			case ERANGE:	/* assimilated, page not root */
				KCAGE_STAT_INCR_SCAN(ke_gotone);
				did_something = 1;
				n++;
				break;

			case ENOMEM:	/* assimilated, but no mem */
				KCAGE_STAT_INCR(ke_terminate);
				did_something = 1;
				n++;
				goto terminate;

			case EAGAIN:	/* can't assimilate */
				KCAGE_STAT_INCR_SCAN(ke_lefthole);
				break;

			default:	/* catch this with debug kernels */
				ASSERT(0);
				break;
		}
	}

	/*
	 * Realign cage edge with the nearest physical address
	 * boundary for big pages. This is done to give us a
	 * better chance of actually getting usable big pages
	 * in the cage.
	 */

terminate:

	return (did_something);
}

/*
 * Relocate page opp (Original Page Pointer) from cage pool to page rpp
 * (Replacement Page Pointer) in the global pool. Page opp will be freed
 * if relocation is successful, otherwise it is only unlocked.
 * On entry, page opp must be exclusively locked and not free.
 * *nfreedp: number of pages freed.
 */
static int
kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp)
{
	page_t *opp = pp;
	page_t *rpp = NULL;
	spgcnt_t npgs;
	int result;

	ASSERT(!PP_ISFREE(opp));
	ASSERT(PAGE_EXCL(opp));

	result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL);
	*nfreedp = npgs;
	if (result == 0) {
		while (npgs-- > 0) {
			page_t *tpp;

			ASSERT(rpp != NULL);
			tpp = rpp;
			page_sub(&rpp, tpp);
			page_unlock(tpp);
		}

		ASSERT(rpp == NULL);

		return (0);		/* success */
	}

	page_unlock(opp);
	return (result);
}

/*
 * Based on page_invalidate_pages()
 *
 * Kcage_invalidate_page() uses page_relocate() twice. Both instances
 * of use must be updated to match the new page_relocate() when it
 * becomes available.
 *
 * Return result of kcage_relocate_page or zero if page was directly freed.
 * *nfreedp: number of pages freed.
 */
static int
kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp)
{
	int result;

#if defined(__sparc)
	ASSERT(pp->p_vnode != &promvp);
#endif /* __sparc */
	ASSERT(!PP_ISFREE(pp));
	ASSERT(PAGE_EXCL(pp));

	/*
	 * Is this page involved in some I/O? shared?
	 * The page_struct_lock need not be acquired to
	 * examine these fields since the page has an
	 * "exclusive" lock.
	 */
	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
		result = kcage_relocate_page(pp, nfreedp);
#ifdef KCAGE_STATS
		if (result == 0)
			KCAGE_STAT_INCR_SCAN(kip_reloclocked);
		else if (result == ENOMEM)
			KCAGE_STAT_INCR_SCAN(kip_nomem);
#endif
		return (result);
	}

	ASSERT(pp->p_vnode->v_type != VCHR);

	/*
	 * Unload the mappings and check if mod bit is set.
	 */
	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);

	if (hat_ismod(pp)) {
		result = kcage_relocate_page(pp, nfreedp);
#ifdef KCAGE_STATS
		if (result == 0)
			KCAGE_STAT_INCR_SCAN(kip_relocmod);
		else if (result == ENOMEM)
			KCAGE_STAT_INCR_SCAN(kip_nomem);
#endif
		return (result);
	}

	if (!page_try_demote_pages(pp)) {
		KCAGE_STAT_INCR_SCAN(kip_demotefailed);
		page_unlock(pp);
		return (EAGAIN);
	}

	/* LINTED: constant in conditional context */
	VN_DISPOSE(pp, B_INVAL, 0, kcred);
	KCAGE_STAT_INCR_SCAN(kip_destroy);
	*nfreedp = 1;
	return (0);
}

/*
 * Expand cage only if there is not enough memory to satisfy
 * current request. We only do one (complete) scan of the cage.
 * Dirty pages and pages with shared mappings are skipped;
 * locked pages (p_lckcnt and p_cowcnt) are also skipped.
 * All other pages are freed (if they can be locked).
 * This may affect caching of user pages which are in cage by freeing/
 * reclaiming them more often. However cage is mainly for kernel (heap)
 * pages and we want to keep user pages outside of cage. The above policy
 * should also reduce cage expansion plus it should speed up cage mem
 * allocations.
 */
static void
kcage_cageout()
{
	pfn_t pfn;
	page_t *pp;
	callb_cpr_t cprinfo;
	int did_something;
	pfn_t start_pfn;
	ulong_t shared_level = 8;
	pgcnt_t nfreed;
#ifdef KCAGE_STATS
	clock_t scan_start;
#endif

	CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex,
	    callb_generic_cpr, "cageout");

	mutex_enter(&kcage_cageout_mutex);
	kcage_cageout_thread = curthread;

	pfn = PFN_INVALID;		/* force scan reset */
	start_pfn = PFN_INVALID;	/* force init with 1st cage pfn */
	kcage_cageout_ready = 1;	/* switch kcage_cageout_wakeup mode */

loop:
	/*
	 * Wait here. Sooner or later, kcage_freemem_sub() will notice
	 * that kcage_freemem is less than kcage_desfree. When it does
	 * notice, kcage_freemem_sub() will wake us up via call to
	 * kcage_cageout_wakeup().
	 */
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex);
	CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex);

	KCAGE_STAT_INCR(kt_wakeups);
	KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem);
	KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem);
#ifdef KCAGE_STATS
	scan_start = ddi_get_lbolt();
#endif
	if (!kcage_on)
		goto loop;

	KCAGE_STAT_INCR(kt_scans);
	KCAGE_STAT_INCR_SCAN(kt_passes);

	did_something = 0;
	while (kcage_freemem < kcage_lotsfree + kcage_needfree) {

		if ((pfn = kcage_walk_cage(pfn == PFN_INVALID)) ==
		    PFN_INVALID) {
			break;
		}

		if (start_pfn == PFN_INVALID)
			start_pfn = pfn;
		else if (start_pfn == pfn) {
			/*
			 * Did a complete walk of kernel cage, but didn't free
			 * any pages.  If only one cpu is active then
			 * stop kernel cage walk and try expanding.
			 */
			if (cp_default.cp_ncpus == 1 && did_something == 0) {
				KCAGE_STAT_INCR(kt_cageout_break);
				break;
			}
		}

		pp = page_numtopp_nolock(pfn);
		if (pp == NULL) {
			continue;
		}

		KCAGE_STAT_INCR_SCAN(kt_examined);

		/*
		 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside
		 * of the lock. If one is missed it will be seen next
		 * time through.
		 *
		 * Skip non-caged pages. These pages can exist in the cage
		 * because, if a page encountered during cage expansion is
		 * long-term locked, the lock prevents the expansion logic
		 * from setting the P_NORELOC flag. Hence, non-caged pages
		 * can end up surrounded by caged pages.
		 */
		if (!PP_ISNORELOC(pp)) {
			switch (kcage_assimilate_page(pp, &nfreed)) {
				case 0:
					did_something = 1;
					KCAGE_STAT_NINCR_SCAN(kt_gotonefree,
					    nfreed);
					break;

				case EBUSY:
				case ERANGE:
					did_something = 1;
					KCAGE_STAT_INCR_SCAN(kt_gotone);
					break;

				case EAGAIN:
				case ENOMEM:
					break;

				default:
					/* catch this with debug kernels */
					ASSERT(0);
					break;
			}

			continue;
		} else {
			if (PP_ISFREE(pp)) {
				continue;
			}

			if ((PP_ISKAS(pp) && pp->p_lckcnt > 0) ||
			    !page_trylock(pp, SE_EXCL)) {
				KCAGE_STAT_INCR_SCAN(kt_cantlock);
				continue;
			}

			/* P_NORELOC bit should not have gone away. */
			ASSERT(PP_ISNORELOC(pp));
			if (PP_ISFREE(pp) || (PP_ISKAS(pp) &&
			    pp->p_lckcnt > 0)) {
				page_unlock(pp);
				continue;
			}

			if (hat_page_checkshare(pp, shared_level)) {
				page_unlock(pp);
				KCAGE_STAT_INCR_SCAN(kt_skipshared);
				continue;
			}

			if (kcage_invalidate_page(pp, &nfreed) == 0) {
				did_something = 1;
				KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed);
			}

			/*
			 * No need to drop the page lock here.
			 * Kcage_invalidate_page has done that for us
			 * either explicitly or through a page_free.
			 */
		}
	}

	if (kcage_freemem < kcage_throttlefree + kcage_needfree)
		(void) kcage_expand();

	if (kcage_on && kcage_cageout_ready)
		cv_broadcast(&kcage_throttle_cv);

	KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem);
	KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem);
	KCAGE_STAT_SET_SCAN(kt_ticks, ddi_get_lbolt() - scan_start);
	KCAGE_STAT_INC_SCAN_INDEX;
	goto loop;

	/*NOTREACHED*/
}

void
kcage_cageout_wakeup()
{
	if (mutex_tryenter(&kcage_cageout_mutex)) {
		if (kcage_cageout_ready) {
			cv_signal(&kcage_cageout_cv);
		} else if (kcage_freemem < kcage_minfree || kcage_needfree) {
			/*
			 * Available cage memory is really low. Time to
			 * start expanding the cage. However, the
			 * kernel cage thread is not yet ready to
			 * do the work. Use *this* thread, which is
			 * most likely to be t0, to do the work.
			 */
			KCAGE_STAT_INCR(kcw_expandearly);
			(void) kcage_expand();
			KCAGE_STAT_INC_SCAN_INDEX;
		}

		mutex_exit(&kcage_cageout_mutex);
	}
	/* else, kernel cage thread is already running */
}

void
kcage_tick()
{
	/*
	 * Once per second we wake up all the threads throttled
	 * waiting for cage memory, in case we've become stuck
	 * and haven't made forward progress expanding the cage.
	 */
	if (kcage_on && kcage_cageout_ready)
		cv_broadcast(&kcage_throttle_cv);
}