xref: /illumos-gate/usr/src/uts/common/os/mem_cage.c (revision 6899cf3f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/types.h>
26 #include <sys/param.h>
27 #include <sys/thread.h>
28 #include <sys/proc.h>
29 #include <sys/callb.h>
30 #include <sys/vnode.h>
31 #include <sys/debug.h>
32 #include <sys/systm.h>		/* for bzero */
33 #include <sys/memlist.h>
34 #include <sys/cmn_err.h>
35 #include <sys/sysmacros.h>
36 #include <sys/vmsystm.h>	/* for NOMEMWAIT() */
37 #include <sys/atomic.h>		/* used to update kcage_freemem */
38 #include <sys/kmem.h>		/* for kmem_reap */
39 #include <sys/errno.h>
40 #include <sys/mem_cage.h>
41 #include <vm/seg_kmem.h>
42 #include <vm/page.h>
43 #include <vm/hat.h>
44 #include <vm/vm_dep.h>
45 #include <sys/mem_config.h>
46 #include <sys/lgrp.h>
47 #include <sys/rwlock.h>
48 #include <sys/cpupart.h>
49 
50 extern pri_t maxclsyspri;
51 
52 #ifdef DEBUG
53 #define	KCAGE_STATS
54 #endif
55 
56 #ifdef KCAGE_STATS
57 
58 #define	KCAGE_STATS_VERSION 9	/* can help report generators */
59 #define	KCAGE_STATS_NSCANS 256	/* depth of scan statistics buffer */
60 
61 struct kcage_stats_scan {
62 	/* managed by KCAGE_STAT_* macros */
63 	clock_t	scan_lbolt;
64 	uint_t	scan_id;
65 
66 	/* set in kcage_cageout() */
67 	uint_t	kt_passes;
68 	clock_t	kt_ticks;
69 	pgcnt_t	kt_kcage_freemem_start;
70 	pgcnt_t	kt_kcage_freemem_end;
71 	pgcnt_t kt_freemem_start;
72 	pgcnt_t kt_freemem_end;
73 	uint_t	kt_examined;
74 	uint_t	kt_cantlock;
75 	uint_t	kt_gotone;
76 	uint_t	kt_gotonefree;
77 	uint_t	kt_skipshared;
78 	uint_t	kt_skiprefd;
79 	uint_t	kt_destroy;
80 
81 	/* set in kcage_invalidate_page() */
82 	uint_t	kip_reloclocked;
83 	uint_t	kip_relocmod;
84 	uint_t	kip_destroy;
85 	uint_t	kip_nomem;
86 	uint_t	kip_demotefailed;
87 
88 	/* set in kcage_expand() */
89 	uint_t	ke_wanted;
90 	uint_t	ke_examined;
91 	uint_t	ke_lefthole;
92 	uint_t	ke_gotone;
93 	uint_t	ke_gotonefree;
94 };
95 
96 struct kcage_stats {
97 	/* managed by KCAGE_STAT_* macros */
98 	uint_t	version;
99 	uint_t	size;
100 
101 	/* set in kcage_cageout */
102 	uint_t	kt_wakeups;
103 	uint_t	kt_scans;
104 	uint_t	kt_cageout_break;
105 
106 	/* set in kcage_expand */
107 	uint_t	ke_calls;
108 	uint_t	ke_nopfn;
109 	uint_t	ke_nopaget;
110 	uint_t	ke_isnoreloc;
111 	uint_t	ke_deleting;
112 	uint_t	ke_lowfreemem;
113 	uint_t	ke_terminate;
114 
115 	/* set in kcage_freemem_add() */
116 	uint_t	kfa_trottlewake;
117 
118 	/* set in kcage_freemem_sub() */
119 	uint_t	kfs_cagewake;
120 
121 	/* set in kcage_create_throttle */
122 	uint_t	kct_calls;
123 	uint_t	kct_cageout;
124 	uint_t	kct_critical;
125 	uint_t	kct_exempt;
126 	uint_t	kct_cagewake;
127 	uint_t	kct_wait;
128 	uint_t	kct_progress;
129 	uint_t	kct_noprogress;
130 	uint_t	kct_timeout;
131 
132 	/* set in kcage_cageout_wakeup */
133 	uint_t	kcw_expandearly;
134 
135 	/* managed by KCAGE_STAT_* macros */
136 	uint_t	scan_array_size;
137 	uint_t	scan_index;
138 	struct kcage_stats_scan scans[KCAGE_STATS_NSCANS];
139 };
140 
141 static struct kcage_stats kcage_stats;
142 static struct kcage_stats_scan kcage_stats_scan_zero;
143 
144 /*
145  * No real need for atomics here. For the most part the incs and sets are
146  * done by the kernel cage thread. There are a few that are done by any
147  * number of other threads. Those cases are noted by comments.
148  */
149 #define	KCAGE_STAT_INCR(m)	kcage_stats.m++
150 
151 #define	KCAGE_STAT_NINCR(m, v) kcage_stats.m += (v)
152 
153 #define	KCAGE_STAT_INCR_SCAN(m)	\
154 	KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m)
155 
156 #define	KCAGE_STAT_NINCR_SCAN(m, v) \
157 	KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v)
158 
159 #define	KCAGE_STAT_SET(m, v)	kcage_stats.m = (v)
160 
161 #define	KCAGE_STAT_SETZ(m, v)	\
162 	if (kcage_stats.m == 0) kcage_stats.m = (v)
163 
164 #define	KCAGE_STAT_SET_SCAN(m, v)	\
165 	KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v)
166 
167 #define	KCAGE_STAT_SETZ_SCAN(m, v)	\
168 	KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v)
169 
170 #define	KCAGE_STAT_INC_SCAN_INDEX \
171 	KCAGE_STAT_SET_SCAN(scan_lbolt, ddi_get_lbolt()); \
172 	KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \
173 	kcage_stats.scan_index = \
174 	(kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \
175 	kcage_stats.scans[kcage_stats.scan_index] = kcage_stats_scan_zero
176 
177 #define	KCAGE_STAT_INIT_SCAN_INDEX \
178 	kcage_stats.version = KCAGE_STATS_VERSION; \
179 	kcage_stats.size = sizeof (kcage_stats); \
180 	kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \
181 	kcage_stats.scan_index = 0
182 
183 #else /* KCAGE_STATS */
184 
185 #define	KCAGE_STAT_INCR(v)
186 #define	KCAGE_STAT_NINCR(m, v)
187 #define	KCAGE_STAT_INCR_SCAN(v)
188 #define	KCAGE_STAT_NINCR_SCAN(m, v)
189 #define	KCAGE_STAT_SET(m, v)
190 #define	KCAGE_STAT_SETZ(m, v)
191 #define	KCAGE_STAT_SET_SCAN(m, v)
192 #define	KCAGE_STAT_SETZ_SCAN(m, v)
193 #define	KCAGE_STAT_INC_SCAN_INDEX
194 #define	KCAGE_STAT_INIT_SCAN_INDEX
195 
196 #endif /* KCAGE_STATS */
197 
198 static kmutex_t kcage_throttle_mutex;	/* protects kcage_throttle_cv */
199 static kcondvar_t kcage_throttle_cv;
200 
201 static kmutex_t kcage_cageout_mutex;	/* protects cv and ready flag */
202 static kcondvar_t kcage_cageout_cv;	/* cageout thread naps here */
203 static int kcage_cageout_ready;		/* nonzero when cageout thread ready */
204 kthread_id_t kcage_cageout_thread;	/* to aid debugging */
205 
206 static krwlock_t kcage_range_rwlock;	/* protects kcage_glist elements */
207 
208 /*
209  * Cage expansion happens within a range.
210  */
211 struct kcage_glist {
212 	struct kcage_glist	*next;
213 	pfn_t			base;
214 	pfn_t			lim;
215 	pfn_t			curr;
216 	int			decr;
217 };
218 
219 static struct kcage_glist *kcage_glist;
220 static struct kcage_glist *kcage_current_glist;
221 
222 /*
223  * The firstfree element is provided so that kmem_alloc can be avoided
224  * until that cage has somewhere to go. This is not currently a problem
225  * as early kmem_alloc's use BOP_ALLOC instead of page_create_va.
226  */
227 static vmem_t *kcage_arena;
228 static struct kcage_glist kcage_glist_firstfree;
229 static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree;
230 
231 /*
232  * Miscellaneous forward references
233  */
234 static struct kcage_glist *kcage_glist_alloc(void);
235 static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **);
236 static void kcage_cageout(void);
237 static int kcage_invalidate_page(page_t *, pgcnt_t *);
238 static int kcage_setnoreloc_pages(page_t *, se_t);
239 static int kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t);
240 static void kcage_init(pgcnt_t preferred_size);
241 static int kcage_range_delete_internal(pfn_t base, pgcnt_t npgs);
242 
243 /*
244  * Kernel Memory Cage counters and thresholds.
245  */
246 int kcage_on = 0;
247 pgcnt_t kcage_freemem;
248 pgcnt_t kcage_needfree;
249 pgcnt_t kcage_lotsfree;
250 pgcnt_t kcage_desfree;
251 pgcnt_t kcage_minfree;
252 pgcnt_t kcage_throttlefree;
253 pgcnt_t	kcage_reserve;
254 int kcage_maxwait = 10;	/* in seconds */
255 
256 /* when we use lp for kmem we start the cage at a higher initial value */
257 pgcnt_t kcage_kmemlp_mincage;
258 
259 #ifdef DEBUG
260 pgcnt_t	kcage_pagets;
261 #define	KCAGEPAGETS_INC()	kcage_pagets++
262 #else
263 #define	KCAGEPAGETS_INC()
264 #endif
265 
266 /* kstats to export what pages are currently caged */
267 kmutex_t kcage_kstat_lock;
268 static int kcage_kstat_update(kstat_t *ksp, int rw);
269 static int kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
270 
/*
 * Startup and Dynamic Reconfiguration interfaces.
 * kcage_range_add()
 * kcage_range_delete()
 * kcage_range_delete_post_mem_del()
 * kcage_range_init()
 * kcage_set_thresholds()
 */
279 
280 /*
281  * Called from page_get_contig_pages to get the approximate kcage pfn range
282  * for exclusion from search for contiguous pages. This routine is called
283  * without kcage_range lock (kcage routines can call page_get_contig_pages
284  * through page_relocate) and with the assumption, based on kcage_range_add,
285  * that kcage_current_glist always contain a valid pointer.
286  */
287 
288 int
kcage_current_pfn(pfn_t * pfncur)289 kcage_current_pfn(pfn_t *pfncur)
290 {
291 	struct kcage_glist *lp = kcage_current_glist;
292 
293 	ASSERT(kcage_on);
294 
295 	ASSERT(lp != NULL);
296 
297 	*pfncur = lp->curr;
298 
299 	return (lp->decr);
300 }
301 
302 /*
303  * Called from vm_pagelist.c during coalesce to find kernel cage regions
304  * within an mnode. Looks for the lowest range between lo and hi.
305  *
306  * Kernel cage memory is defined between kcage_glist and kcage_current_glist.
307  * Non-cage memory is defined between kcage_current_glist and list end.
308  *
309  * If incage is set, returns the lowest kcage range. Otherwise returns lowest
310  * non-cage range.
311  *
312  * Returns zero on success and nlo, nhi:
313  * 	lo <= nlo < nhi <= hi
314  * Returns non-zero if no overlapping range is found.
315  */
316 int
kcage_next_range(int incage,pfn_t lo,pfn_t hi,pfn_t * nlo,pfn_t * nhi)317 kcage_next_range(int incage, pfn_t lo, pfn_t hi,
318     pfn_t *nlo, pfn_t *nhi)
319 {
320 	struct kcage_glist *lp;
321 	pfn_t tlo = hi;
322 	pfn_t thi = hi;
323 
324 	ASSERT(lo <= hi);
325 
326 	/*
327 	 * Reader lock protects the list, but kcage_get_pfn
328 	 * running concurrently may advance kcage_current_glist
329 	 * and also update kcage_current_glist->curr. Page
330 	 * coalesce can handle this race condition.
331 	 */
332 	rw_enter(&kcage_range_rwlock, RW_READER);
333 
334 	for (lp = incage ? kcage_glist : kcage_current_glist;
335 	    lp != NULL; lp = lp->next) {
336 
337 		pfn_t klo, khi;
338 
339 		/* find the range limits in this element */
340 		if ((incage && lp->decr) || (!incage && !lp->decr)) {
341 			klo = lp->curr;
342 			khi = lp->lim;
343 		} else {
344 			klo = lp->base;
345 			khi = lp->curr;
346 		}
347 
348 		/* handle overlap */
349 		if (klo < tlo && klo < khi && lo < khi && klo < hi) {
350 			tlo = MAX(lo, klo);
351 			thi = MIN(hi, khi);
352 			if (tlo == lo)
353 				break;
354 		}
355 
356 		/* check end of kcage */
357 		if (incage && lp == kcage_current_glist) {
358 			break;
359 		}
360 	}
361 
362 	rw_exit(&kcage_range_rwlock);
363 
364 	/* return non-zero if no overlapping range found */
365 	if (tlo == thi)
366 		return (1);
367 
368 	ASSERT(lo <= tlo && tlo < thi && thi <= hi);
369 
370 	/* return overlapping range */
371 	*nlo = tlo;
372 	*nhi = thi;
373 	return (0);
374 }
375 
376 void
kcage_range_init(struct memlist * ml,kcage_dir_t d,pgcnt_t preferred_size)377 kcage_range_init(struct memlist *ml, kcage_dir_t d, pgcnt_t preferred_size)
378 {
379 	int ret = 0;
380 
381 	ASSERT(kcage_arena == NULL);
382 	kcage_arena = vmem_create("kcage_arena", NULL, 0, sizeof (uint64_t),
383 	    segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP);
384 	ASSERT(kcage_arena != NULL);
385 
386 	if (d == KCAGE_DOWN) {
387 		while (ml->ml_next != NULL)
388 			ml = ml->ml_next;
389 	}
390 
391 	rw_enter(&kcage_range_rwlock, RW_WRITER);
392 
393 	while (ml != NULL) {
394 		ret = kcage_range_add_internal(btop(ml->ml_address),
395 		    btop(ml->ml_size), d);
396 		if (ret)
397 			panic("kcage_range_add_internal failed: "
398 			    "ml=%p, ret=0x%x\n", (void *)ml, ret);
399 
400 		ml = (d == KCAGE_DOWN ? ml->ml_prev : ml->ml_next);
401 	}
402 
403 	rw_exit(&kcage_range_rwlock);
404 
405 	if (ret == 0)
406 		kcage_init(preferred_size);
407 }
408 
409 /*
410  * Third arg controls direction of growth: 0: increasing pfns,
411  * 1: decreasing.
412  */
413 static int
kcage_range_add_internal(pfn_t base,pgcnt_t npgs,kcage_dir_t d)414 kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t d)
415 {
416 	struct kcage_glist *new, **lpp;
417 	pfn_t lim;
418 
419 	ASSERT(rw_write_held(&kcage_range_rwlock));
420 
421 	ASSERT(npgs != 0);
422 	if (npgs == 0)
423 		return (EINVAL);
424 
425 	lim = base + npgs;
426 
427 	ASSERT(lim > base);
428 	if (lim <= base)
429 		return (EINVAL);
430 
431 	new = kcage_glist_alloc();
432 	if (new == NULL) {
433 		return (ENOMEM);
434 	}
435 
436 	new->base = base;
437 	new->lim = lim;
438 	new->decr = (d == KCAGE_DOWN);
439 	if (new->decr != 0)
440 		new->curr = new->lim;
441 	else
442 		new->curr = new->base;
443 	/*
444 	 * Any overlapping existing ranges are removed by deleting
445 	 * from the new list as we search for the tail.
446 	 */
447 	lpp = &kcage_glist;
448 	while (*lpp != NULL) {
449 		int ret;
450 		ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new);
451 		if (ret != 0)
452 			return (ret);
453 		lpp = &(*lpp)->next;
454 	}
455 
456 	*lpp = new;
457 
458 	if (kcage_current_glist == NULL) {
459 		kcage_current_glist = kcage_glist;
460 	}
461 
462 	return (0);
463 }
464 
465 int
kcage_range_add(pfn_t base,pgcnt_t npgs,kcage_dir_t d)466 kcage_range_add(pfn_t base, pgcnt_t npgs, kcage_dir_t d)
467 {
468 	int ret;
469 
470 	rw_enter(&kcage_range_rwlock, RW_WRITER);
471 	ret = kcage_range_add_internal(base, npgs, d);
472 	rw_exit(&kcage_range_rwlock);
473 	return (ret);
474 }
475 
476 /*
477  * Calls to add and delete must be protected by kcage_range_rwlock
478  */
479 static int
kcage_range_delete_internal(pfn_t base,pgcnt_t npgs)480 kcage_range_delete_internal(pfn_t base, pgcnt_t npgs)
481 {
482 	struct kcage_glist *lp;
483 	pfn_t lim;
484 
485 	ASSERT(rw_write_held(&kcage_range_rwlock));
486 
487 	ASSERT(npgs != 0);
488 	if (npgs == 0)
489 		return (EINVAL);
490 
491 	lim = base + npgs;
492 
493 	ASSERT(lim > base);
494 	if (lim <= base)
495 		return (EINVAL);
496 
497 	/*
498 	 * Check if the delete is OK first as a number of elements
499 	 * might be involved and it will be difficult to go
500 	 * back and undo (can't just add the range back in).
501 	 */
502 	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
503 		/*
504 		 * If there have been no pages allocated from this
505 		 * element, we don't need to check it.
506 		 */
507 		if ((lp->decr == 0 && lp->curr == lp->base) ||
508 		    (lp->decr != 0 && lp->curr == lp->lim))
509 			continue;
510 		/*
511 		 * If the element does not overlap, its OK.
512 		 */
513 		if (base >= lp->lim || lim <= lp->base)
514 			continue;
515 		/*
516 		 * Overlapping element: Does the range to be deleted
517 		 * overlap the area already used? If so fail.
518 		 */
519 		if (lp->decr == 0 && base < lp->curr && lim >= lp->base) {
520 			return (EBUSY);
521 		}
522 		if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) {
523 			return (EBUSY);
524 		}
525 	}
526 	return (kcage_glist_delete(base, lim, &kcage_glist));
527 }
528 
529 int
kcage_range_delete(pfn_t base,pgcnt_t npgs)530 kcage_range_delete(pfn_t base, pgcnt_t npgs)
531 {
532 	int ret;
533 
534 	rw_enter(&kcage_range_rwlock, RW_WRITER);
535 	ret = kcage_range_delete_internal(base, npgs);
536 	rw_exit(&kcage_range_rwlock);
537 	return (ret);
538 }
539 
540 /*
541  * Calls to add and delete must be protected by kcage_range_rwlock.
542  * This routine gets called after successful Solaris memory
543  * delete operation from DR post memory delete routines.
544  */
545 static int
kcage_range_delete_post_mem_del_internal(pfn_t base,pgcnt_t npgs)546 kcage_range_delete_post_mem_del_internal(pfn_t base, pgcnt_t npgs)
547 {
548 	pfn_t lim;
549 
550 	ASSERT(rw_write_held(&kcage_range_rwlock));
551 
552 	ASSERT(npgs != 0);
553 	if (npgs == 0)
554 		return (EINVAL);
555 
556 	lim = base + npgs;
557 
558 	ASSERT(lim > base);
559 	if (lim <= base)
560 		return (EINVAL);
561 
562 	return (kcage_glist_delete(base, lim, &kcage_glist));
563 }
564 
565 int
kcage_range_delete_post_mem_del(pfn_t base,pgcnt_t npgs)566 kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs)
567 {
568 	int ret;
569 
570 	rw_enter(&kcage_range_rwlock, RW_WRITER);
571 	ret = kcage_range_delete_post_mem_del_internal(base, npgs);
572 	rw_exit(&kcage_range_rwlock);
573 	return (ret);
574 }
575 
576 /*
577  * No locking is required here as the whole operation is covered
578  * by kcage_range_rwlock writer lock.
579  */
580 static struct kcage_glist *
kcage_glist_alloc(void)581 kcage_glist_alloc(void)
582 {
583 	struct kcage_glist *new;
584 
585 	if ((new = kcage_glist_freelist) != NULL) {
586 		kcage_glist_freelist = new->next;
587 	} else if (kernel_cage_enable) {
588 		new = vmem_alloc(kcage_arena, sizeof (*new), VM_NOSLEEP);
589 	} else {
590 		/*
591 		 * On DR supported platforms we allow memory add
592 		 * even when kernel cage is disabled. "kcage_arena" is
593 		 * created only when kernel cage is enabled.
594 		 */
595 		new = kmem_zalloc(sizeof (*new), KM_NOSLEEP);
596 	}
597 
598 	if (new != NULL)
599 		bzero(new, sizeof (*new));
600 
601 	return (new);
602 }
603 
604 static void
kcage_glist_free(struct kcage_glist * lp)605 kcage_glist_free(struct kcage_glist *lp)
606 {
607 	lp->next = kcage_glist_freelist;
608 	kcage_glist_freelist = lp;
609 }
610 
611 static int
kcage_glist_delete(pfn_t base,pfn_t lim,struct kcage_glist ** lpp)612 kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp)
613 {
614 	struct kcage_glist *lp, *prev = *lpp;
615 
616 	while ((lp = *lpp) != NULL) {
617 		if (lim > lp->base && base < lp->lim) {
618 			/* The delete range overlaps this element. */
619 			if (base <= lp->base && lim >= lp->lim) {
620 				/* Delete whole element. */
621 				*lpp = lp->next;
622 				if (lp == kcage_current_glist) {
623 					/* This can never happen. */
624 					ASSERT(kcage_current_glist != prev);
625 					kcage_current_glist = prev;
626 				}
627 				kcage_glist_free(lp);
628 				continue;
629 			}
630 
631 			/* Partial delete. */
632 			if (base > lp->base && lim < lp->lim) {
633 				struct kcage_glist *new;
634 
635 				/*
636 				 * Remove a section from the middle,
637 				 * need to allocate a new element.
638 				 */
639 				new = kcage_glist_alloc();
640 				if (new == NULL) {
641 					return (ENOMEM);
642 				}
643 
644 				/*
645 				 * Tranfser unused range to new.
646 				 * Edit lp in place to preserve
647 				 * kcage_current_glist.
648 				 */
649 				new->decr = lp->decr;
650 				if (new->decr != 0) {
651 					new->base = lp->base;
652 					new->lim = base;
653 					new->curr = base;
654 
655 					lp->base = lim;
656 				} else {
657 					new->base = lim;
658 					new->lim = lp->lim;
659 					new->curr = new->base;
660 
661 					lp->lim = base;
662 				}
663 
664 				/* Insert new. */
665 				new->next = lp->next;
666 				lp->next = new;
667 				lpp = &lp->next;
668 			} else {
669 				/* Delete part of current block. */
670 				if (base > lp->base) {
671 					ASSERT(lim >= lp->lim);
672 					ASSERT(base < lp->lim);
673 					if (lp->decr != 0 &&
674 					    lp->curr == lp->lim)
675 						lp->curr = base;
676 					lp->lim = base;
677 				} else {
678 					ASSERT(base <= lp->base);
679 					ASSERT(lim > lp->base);
680 					if (lp->decr == 0 &&
681 					    lp->curr == lp->base)
682 						lp->curr = lim;
683 					lp->base = lim;
684 				}
685 			}
686 		}
687 		prev = *lpp;
688 		lpp = &(*lpp)->next;
689 	}
690 
691 	return (0);
692 }
693 
694 /*
695  * If lockit is 1, kcage_get_pfn holds the
696  * reader lock for kcage_range_rwlock.
697  * Changes to lp->curr can cause race conditions, but
698  * they are handled by higher level code (see kcage_next_range.)
699  */
700 static pfn_t
kcage_get_pfn(int lockit)701 kcage_get_pfn(int lockit)
702 {
703 	struct kcage_glist *lp;
704 	pfn_t pfn = PFN_INVALID;
705 
706 	if (lockit && !rw_tryenter(&kcage_range_rwlock, RW_READER))
707 		return (pfn);
708 
709 	lp = kcage_current_glist;
710 	while (lp != NULL) {
711 		if (lp->decr != 0) {
712 			if (lp->curr != lp->base) {
713 				pfn = --lp->curr;
714 				break;
715 			}
716 		} else {
717 			if (lp->curr != lp->lim) {
718 				pfn = lp->curr++;
719 				break;
720 			}
721 		}
722 
723 		lp = lp->next;
724 		if (lp)
725 			kcage_current_glist = lp;
726 	}
727 
728 	if (lockit)
729 		rw_exit(&kcage_range_rwlock);
730 	return (pfn);
731 }
732 
733 /*
734  * Walk the physical address space of the cage.
735  * This routine does not guarantee to return PFNs in the order
736  * in which they were allocated to the cage. Instead, it walks
737  * each range as they appear on the growth list returning the PFNs
738  * range in ascending order.
739  *
740  * To begin scanning at lower edge of cage, reset should be nonzero.
741  * To step through cage, reset should be zero.
742  *
743  * PFN_INVALID will be returned when the upper end of the cage is
744  * reached -- indicating a full scan of the cage has been completed since
745  * previous reset. PFN_INVALID will continue to be returned until
746  * kcage_walk_cage is reset.
747  *
748  * It is possible to receive a PFN_INVALID result on reset if a growth
749  * list is not installed or if none of the PFNs in the installed list have
750  * been allocated to the cage. In otherwords, there is no cage.
751  *
752  * Caller need not hold kcage_range_rwlock while calling this function
753  * as the front part of the list is static - pages never come out of
754  * the cage.
755  *
756  * The caller is expected to only be kcage_cageout().
757  */
758 static pfn_t
kcage_walk_cage(int reset)759 kcage_walk_cage(int reset)
760 {
761 	static struct kcage_glist *lp = NULL;
762 	static pfn_t pfn;
763 
764 	if (reset)
765 		lp = NULL;
766 	if (lp == NULL) {
767 		lp = kcage_glist;
768 		pfn = PFN_INVALID;
769 	}
770 again:
771 	if (pfn == PFN_INVALID) {
772 		if (lp == NULL)
773 			return (PFN_INVALID);
774 
775 		if (lp->decr != 0) {
776 			/*
777 			 * In this range the cage grows from the highest
778 			 * address towards the lowest.
779 			 * Arrange to return pfns from curr to lim-1,
780 			 * inclusive, in ascending order.
781 			 */
782 
783 			pfn = lp->curr;
784 		} else {
785 			/*
786 			 * In this range the cage grows from the lowest
787 			 * address towards the highest.
788 			 * Arrange to return pfns from base to curr,
789 			 * inclusive, in ascending order.
790 			 */
791 
792 			pfn = lp->base;
793 		}
794 	}
795 
796 	if (lp->decr != 0) {		/* decrementing pfn */
797 		if (pfn == lp->lim) {
798 			/* Don't go beyond the static part of the glist. */
799 			if (lp == kcage_current_glist)
800 				lp = NULL;
801 			else
802 				lp = lp->next;
803 			pfn = PFN_INVALID;
804 			goto again;
805 		}
806 
807 		ASSERT(pfn >= lp->curr && pfn < lp->lim);
808 	} else {			/* incrementing pfn */
809 		if (pfn == lp->curr) {
810 			/* Don't go beyond the static part of the glist. */
811 			if (lp == kcage_current_glist)
812 				lp = NULL;
813 			else
814 				lp = lp->next;
815 			pfn = PFN_INVALID;
816 			goto again;
817 		}
818 
819 		ASSERT(pfn >= lp->base && pfn < lp->curr);
820 	}
821 
822 	return (pfn++);
823 }
824 
825 /*
826  * Callback functions for to recalc cage thresholds after
827  * Kphysm memory add/delete operations.
828  */
829 /*ARGSUSED*/
830 static void
kcage_kphysm_postadd_cb(void * arg,pgcnt_t delta_pages)831 kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages)
832 {
833 	kcage_recalc_thresholds();
834 }
835 
836 /*ARGSUSED*/
837 static int
kcage_kphysm_predel_cb(void * arg,pgcnt_t delta_pages)838 kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages)
839 {
840 	/* TODO: when should cage refuse memory delete requests? */
841 	return (0);
842 }
843 
844 /*ARGSUSED*/
845 static  void
kcage_kphysm_postdel_cb(void * arg,pgcnt_t delta_pages,int cancelled)846 kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled)
847 {
848 	kcage_recalc_thresholds();
849 }
850 
851 static kphysm_setup_vector_t kcage_kphysm_vectors = {
852 	KPHYSM_SETUP_VECTOR_VERSION,
853 	kcage_kphysm_postadd_cb,
854 	kcage_kphysm_predel_cb,
855 	kcage_kphysm_postdel_cb
856 };
857 
858 /*
859  * This is called before a CPR suspend and after a CPR resume.  We have to
860  * turn off kcage_cageout_ready before a suspend, and turn it back on after a
861  * restart.
862  */
863 /*ARGSUSED*/
864 static boolean_t
kcage_cageout_cpr(void * arg,int code)865 kcage_cageout_cpr(void *arg, int code)
866 {
867 	if (code == CB_CODE_CPR_CHKPT) {
868 		ASSERT(kcage_cageout_ready);
869 		kcage_cageout_ready = 0;
870 		return (B_TRUE);
871 	} else if (code == CB_CODE_CPR_RESUME) {
872 		ASSERT(kcage_cageout_ready == 0);
873 		kcage_cageout_ready = 1;
874 		return (B_TRUE);
875 	}
876 	return (B_FALSE);
877 }
878 
879 /*
880  * kcage_recalc_preferred_size() increases initial cage size to improve large
881  * page availability when lp for kmem is enabled and kpr is disabled
882  */
883 static pgcnt_t
kcage_recalc_preferred_size(pgcnt_t preferred_size)884 kcage_recalc_preferred_size(pgcnt_t preferred_size)
885 {
886 	if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) {
887 		pgcnt_t lpmincage = kcage_kmemlp_mincage;
888 		if (lpmincage == 0) {
889 			lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8),
890 			    segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE;
891 		}
892 		kcage_kmemlp_mincage = MIN(lpmincage,
893 		    (segkmem_kmemlp_max / PAGESIZE));
894 		preferred_size = MAX(kcage_kmemlp_mincage, preferred_size);
895 	}
896 	return (preferred_size);
897 }
898 
899 /*
900  * Kcage_init() builds the cage and initializes the cage thresholds.
901  * The size of the cage is determined by the argument preferred_size.
902  * or the actual amount of memory, whichever is smaller.
903  */
904 static void
kcage_init(pgcnt_t preferred_size)905 kcage_init(pgcnt_t preferred_size)
906 {
907 	pgcnt_t wanted;
908 	pfn_t pfn;
909 	page_t *pp;
910 	kstat_t *ksp;
911 
912 	extern void page_list_noreloc_startup(page_t *);
913 
914 	ASSERT(!kcage_on);
915 
916 	/* increase preferred cage size for lp for kmem */
917 	preferred_size = kcage_recalc_preferred_size(preferred_size);
918 
919 	/* Debug note: initialize this now so early expansions can stat */
920 	KCAGE_STAT_INIT_SCAN_INDEX;
921 
922 	/*
923 	 * Initialize cage thresholds and install kphysm callback.
924 	 * If we can't arrange to have the thresholds track with
925 	 * available physical memory, then the cage thresholds may
926 	 * end up over time at levels that adversly effect system
927 	 * performance; so, bail out.
928 	 */
929 	kcage_recalc_thresholds();
930 	if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) {
931 		ASSERT(0);		/* Catch this in DEBUG kernels. */
932 		return;
933 	}
934 
935 	/*
936 	 * Limit startup cage size within the range of kcage_minfree
937 	 * and availrmem, inclusively.
938 	 */
939 	wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem);
940 
941 	/*
942 	 * Construct the cage. PFNs are allocated from the glist. It
943 	 * is assumed that the list has been properly ordered for the
944 	 * platform by the platform code. Typically, this is as simple
945 	 * as calling kcage_range_init(phys_avail, decr), where decr is
946 	 * 1 if the kernel has been loaded into upper end of physical
947 	 * memory, or 0 if the kernel has been loaded at the low end.
948 	 *
949 	 * Note: it is assumed that we are in the startup flow, so there
950 	 * is no reason to grab the page lock.
951 	 */
952 	kcage_freemem = 0;
953 	pfn = PFN_INVALID;			/* prime for alignment test */
954 	while (wanted != 0) {
955 		if ((pfn = kcage_get_pfn(0)) == PFN_INVALID)
956 			break;
957 
958 		if ((pp = page_numtopp_nolock(pfn)) != NULL) {
959 			KCAGEPAGETS_INC();
960 			/*
961 			 * Set the noreloc state on the page.
962 			 * If the page is free and not already
963 			 * on the noreloc list then move it.
964 			 */
965 			if (PP_ISFREE(pp)) {
966 				if (PP_ISNORELOC(pp) == 0)
967 					page_list_noreloc_startup(pp);
968 			} else {
969 				ASSERT(pp->p_szc == 0);
970 				PP_SETNORELOC(pp);
971 			}
972 		}
973 		PLCNT_XFER_NORELOC(pp);
974 		wanted -= 1;
975 	}
976 
977 	/*
978 	 * Need to go through and find kernel allocated pages
979 	 * and capture them into the Cage.  These will primarily
980 	 * be pages gotten through boot_alloc().
981 	 */
982 	if (kvp.v_pages) {
983 
984 		pp = kvp.v_pages;
985 		do {
986 			ASSERT(!PP_ISFREE(pp));
987 			ASSERT(pp->p_szc == 0);
988 			if (PP_ISNORELOC(pp) == 0) {
989 				PP_SETNORELOC(pp);
990 				PLCNT_XFER_NORELOC(pp);
991 			}
992 		} while ((pp = pp->p_vpnext) != kvp.v_pages);
993 
994 	}
995 
996 	kcage_on = 1;
997 
998 	/*
999 	 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend()
1000 	 * after the cageout thread is blocked, and executes from cpr_resume()
1001 	 * before the cageout thread is restarted.  By executing in this class,
1002 	 * we are assured that the kernel cage thread won't miss wakeup calls
1003 	 * and also CPR's larger kmem_alloc requests will not fail after
1004 	 * CPR shuts down the cageout kernel thread.
1005 	 */
1006 	(void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL,
1007 	    "cageout");
1008 
1009 	/*
1010 	 * Coalesce pages to improve large page availability. A better fix
1011 	 * would to coalesce pages as they are included in the cage
1012 	 */
1013 	if (SEGKMEM_USE_LARGEPAGES) {
1014 		extern void page_freelist_coalesce_all(int mnode);
1015 		page_freelist_coalesce_all(-1);	/* do all mnodes */
1016 	}
1017 
1018 	ksp = kstat_create("kcage", 0, "kcage_page_list", "misc",
1019 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
1020 	if (ksp != NULL) {
1021 		ksp->ks_update = kcage_kstat_update;
1022 		ksp->ks_snapshot = kcage_kstat_snapshot;
1023 		ksp->ks_lock = &kcage_kstat_lock; /* XXX - not really needed */
1024 		kstat_install(ksp);
1025 	}
1026 }
1027 
1028 static int
kcage_kstat_update(kstat_t * ksp,int rw)1029 kcage_kstat_update(kstat_t *ksp, int rw)
1030 {
1031 	struct kcage_glist *lp;
1032 	uint_t count;
1033 
1034 	if (rw == KSTAT_WRITE)
1035 		return (EACCES);
1036 
1037 	count = 0;
1038 	rw_enter(&kcage_range_rwlock, RW_WRITER);
1039 	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
1040 		if (lp->decr) {
1041 			if (lp->curr != lp->lim) {
1042 				count++;
1043 			}
1044 		} else {
1045 			if (lp->curr != lp->base) {
1046 				count++;
1047 			}
1048 		}
1049 	}
1050 	rw_exit(&kcage_range_rwlock);
1051 
1052 	ksp->ks_ndata = count;
1053 	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
1054 
1055 	return (0);
1056 }
1057 
1058 static int
kcage_kstat_snapshot(kstat_t * ksp,void * buf,int rw)1059 kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
1060 {
1061 	struct kcage_glist *lp;
1062 	struct memunit {
1063 		uint64_t address;
1064 		uint64_t size;
1065 	} *kspmem;
1066 
1067 	if (rw == KSTAT_WRITE)
1068 		return (EACCES);
1069 
1070 	ksp->ks_snaptime = gethrtime();
1071 
1072 	kspmem = (struct memunit *)buf;
1073 	rw_enter(&kcage_range_rwlock, RW_WRITER);
1074 	for (lp = kcage_glist; lp != NULL; lp = lp->next, kspmem++) {
1075 		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
1076 			break;
1077 
1078 		if (lp->decr) {
1079 			if (lp->curr != lp->lim) {
1080 				kspmem->address = ptob(lp->curr);
1081 				kspmem->size = ptob(lp->lim - lp->curr);
1082 			}
1083 		} else {
1084 			if (lp->curr != lp->base) {
1085 				kspmem->address = ptob(lp->base);
1086 				kspmem->size = ptob(lp->curr - lp->base);
1087 			}
1088 		}
1089 	}
1090 	rw_exit(&kcage_range_rwlock);
1091 
1092 	return (0);
1093 }
1094 
1095 void
kcage_recalc_thresholds()1096 kcage_recalc_thresholds()
1097 {
1098 	static int first = 1;
1099 	static pgcnt_t init_lotsfree;
1100 	static pgcnt_t init_desfree;
1101 	static pgcnt_t init_minfree;
1102 	static pgcnt_t init_throttlefree;
1103 	static pgcnt_t init_reserve;
1104 
1105 	/* TODO: any reason to take more care than this with live editing? */
1106 	mutex_enter(&kcage_cageout_mutex);
1107 	mutex_enter(&freemem_lock);
1108 
1109 	if (first) {
1110 		first = 0;
1111 		init_lotsfree = kcage_lotsfree;
1112 		init_desfree = kcage_desfree;
1113 		init_minfree = kcage_minfree;
1114 		init_throttlefree = kcage_throttlefree;
1115 		init_reserve = kcage_reserve;
1116 	} else {
1117 		kcage_lotsfree = init_lotsfree;
1118 		kcage_desfree = init_desfree;
1119 		kcage_minfree = init_minfree;
1120 		kcage_throttlefree = init_throttlefree;
1121 		kcage_reserve = init_reserve;
1122 	}
1123 
1124 	if (kcage_lotsfree == 0)
1125 		kcage_lotsfree = MAX(32, total_pages / 256);
1126 
1127 	if (kcage_minfree == 0)
1128 		kcage_minfree = MAX(32, kcage_lotsfree / 2);
1129 
1130 	if (kcage_desfree == 0)
1131 		kcage_desfree = MAX(32, kcage_minfree);
1132 
1133 	if (kcage_throttlefree == 0)
1134 		kcage_throttlefree = MAX(32, kcage_minfree / 2);
1135 
1136 	if (kcage_reserve == 0)
1137 		kcage_reserve = MIN(32, kcage_throttlefree / 2);
1138 
1139 	mutex_exit(&freemem_lock);
1140 	mutex_exit(&kcage_cageout_mutex);
1141 
1142 	if (kcage_cageout_ready) {
1143 		if (kcage_freemem < kcage_desfree)
1144 			kcage_cageout_wakeup();
1145 
1146 		if (kcage_needfree) {
1147 			mutex_enter(&kcage_throttle_mutex);
1148 			cv_broadcast(&kcage_throttle_cv);
1149 			mutex_exit(&kcage_throttle_mutex);
1150 		}
1151 	}
1152 }
1153 
1154 /*
1155  * Pageout interface:
1156  * kcage_cageout_init()
1157  */
1158 void
kcage_cageout_init()1159 kcage_cageout_init()
1160 {
1161 	if (kcage_on) {
1162 		(void) lwp_kernel_create(proc_pageout, kcage_cageout, NULL,
1163 		    TS_RUN, maxclsyspri - 1);
1164 	}
1165 }
1166 
1167 
1168 /*
1169  * VM Interfaces:
1170  * kcage_create_throttle()
1171  * kcage_freemem_add()
1172  * kcage_freemem_sub()
1173  */
1174 
1175 /*
1176  * Wakeup cageout thread and throttle waiting for the number of pages
1177  * requested to become available.  For non-critical requests, a
1178  * timeout is added, since freemem accounting is separate from cage
1179  * freemem accounting: it's possible for us to get stuck and not make
1180  * forward progress even though there was sufficient freemem before
1181  * arriving here.
1182  */
1183 int
kcage_create_throttle(pgcnt_t npages,int flags)1184 kcage_create_throttle(pgcnt_t npages, int flags)
1185 {
1186 
1187 	KCAGE_STAT_INCR(kct_calls);		/* unprotected incr. */
1188 
1189 	/*
1190 	 * Obviously, we can't throttle the cageout thread since
1191 	 * we depend on it.  We also can't throttle the panic thread.
1192 	 */
1193 	if (curthread == kcage_cageout_thread || panicstr) {
1194 		KCAGE_STAT_INCR(kct_cageout);	/* unprotected incr. */
1195 		return (KCT_CRIT);
1196 	}
1197 
1198 	/*
1199 	 * Don't throttle threads which are critical for proper
1200 	 * vm management if we're above kcage_throttlefree or
1201 	 * if freemem is very low.
1202 	 */
1203 	if (NOMEMWAIT()) {
1204 		if (kcage_freemem > kcage_throttlefree + npages) {
1205 			KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
1206 			return (KCT_CRIT);
1207 		} else if (freemem < minfree) {
1208 			KCAGE_STAT_INCR(kct_critical);  /* unprotected incr. */
1209 			return (KCT_CRIT);
1210 		}
1211 	}
1212 
1213 	/*
1214 	 * Don't throttle real-time threads if kcage_freemem > kcage_reserve.
1215 	 */
1216 	if (DISP_PRIO(curthread) > maxclsyspri &&
1217 	    kcage_freemem > kcage_reserve) {
1218 		KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
1219 		return (KCT_CRIT);
1220 	}
1221 
1222 	/*
1223 	 * Cause all other threads (which are assumed to not be
1224 	 * critical to cageout) to wait here until their request
1225 	 * can be satisfied. Be a little paranoid and wake the
1226 	 * kernel cage on each loop through this logic.
1227 	 */
1228 	while (kcage_freemem < kcage_throttlefree + npages) {
1229 		ASSERT(kcage_on);
1230 		if (kcage_cageout_ready) {
1231 			mutex_enter(&kcage_throttle_mutex);
1232 
1233 			kcage_needfree += npages;
1234 			KCAGE_STAT_INCR(kct_wait);
1235 
1236 			kcage_cageout_wakeup();
1237 			KCAGE_STAT_INCR(kct_cagewake);
1238 
1239 			cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex);
1240 
1241 			kcage_needfree -= npages;
1242 
1243 			mutex_exit(&kcage_throttle_mutex);
1244 		} else {
1245 			/*
1246 			 * NOTE: atomics are used just in case we enter
1247 			 * mp operation before the cageout thread is ready.
1248 			 */
1249 			atomic_add_long(&kcage_needfree, npages);
1250 
1251 			kcage_cageout_wakeup();
1252 			KCAGE_STAT_INCR(kct_cagewake);	/* unprotected incr. */
1253 
1254 			atomic_add_long(&kcage_needfree, -npages);
1255 		}
1256 
1257 		if (NOMEMWAIT() && freemem < minfree) {
1258 			return (KCT_CRIT);
1259 		}
1260 		if ((flags & PG_WAIT) == 0) {
1261 			pgcnt_t limit = (flags & PG_NORMALPRI) ?
1262 			    throttlefree : pageout_reserve;
1263 
1264 			if ((kcage_freemem < kcage_throttlefree + npages) &&
1265 			    (freemem < limit + npages)) {
1266 				return (KCT_FAILURE);
1267 			} else {
1268 				return (KCT_NONCRIT);
1269 			}
1270 		}
1271 	}
1272 	return (KCT_NONCRIT);
1273 }
1274 
1275 void
kcage_freemem_add(pgcnt_t npages)1276 kcage_freemem_add(pgcnt_t npages)
1277 {
1278 	extern void wakeup_pcgs(void);
1279 
1280 	atomic_add_long(&kcage_freemem, npages);
1281 
1282 	wakeup_pcgs();  /* wakeup threads in pcgs() */
1283 
1284 	if (kcage_needfree != 0 &&
1285 	    kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {
1286 
1287 		mutex_enter(&kcage_throttle_mutex);
1288 		cv_broadcast(&kcage_throttle_cv);
1289 		KCAGE_STAT_INCR(kfa_trottlewake);
1290 		mutex_exit(&kcage_throttle_mutex);
1291 	}
1292 }
1293 
1294 void
kcage_freemem_sub(pgcnt_t npages)1295 kcage_freemem_sub(pgcnt_t npages)
1296 {
1297 	atomic_add_long(&kcage_freemem, -npages);
1298 
1299 	if (kcage_freemem < kcage_desfree) {
1300 		kcage_cageout_wakeup();
1301 		KCAGE_STAT_INCR(kfs_cagewake); /* unprotected incr. */
1302 	}
1303 }
1304 
1305 /*
1306  * return 0 on failure and 1 on success.
1307  */
1308 static int
kcage_setnoreloc_pages(page_t * rootpp,se_t se)1309 kcage_setnoreloc_pages(page_t *rootpp, se_t se)
1310 {
1311 	pgcnt_t npgs, i;
1312 	page_t *pp;
1313 	pfn_t rootpfn = page_pptonum(rootpp);
1314 	uint_t szc;
1315 
1316 	ASSERT(!PP_ISFREE(rootpp));
1317 	ASSERT(PAGE_LOCKED_SE(rootpp, se));
1318 	if (!group_page_trylock(rootpp, se)) {
1319 		return (0);
1320 	}
1321 	szc = rootpp->p_szc;
1322 	if (szc == 0) {
1323 		/*
1324 		 * The szc of a locked page can only change for pages that are
1325 		 * non-swapfs (i.e. anonymous memory) file system pages.
1326 		 */
1327 		ASSERT(rootpp->p_vnode != NULL &&
1328 		    !PP_ISKAS(rootpp) &&
1329 		    !IS_SWAPFSVP(rootpp->p_vnode));
1330 		PP_SETNORELOC(rootpp);
1331 		return (1);
1332 	}
1333 	npgs = page_get_pagecnt(szc);
1334 	ASSERT(IS_P2ALIGNED(rootpfn, npgs));
1335 	pp = rootpp;
1336 	for (i = 0; i < npgs; i++, pp++) {
1337 		ASSERT(PAGE_LOCKED_SE(pp, se));
1338 		ASSERT(!PP_ISFREE(pp));
1339 		ASSERT(pp->p_szc == szc);
1340 		PP_SETNORELOC(pp);
1341 	}
1342 	group_page_unlock(rootpp);
1343 	return (1);
1344 }
1345 
1346 /*
1347  * Attempt to convert page to a caged page (set the P_NORELOC flag).
1348  * If successful and pages is free, move page to the tail of whichever
1349  * list it is on.
1350  * Returns:
1351  *   EBUSY  page already locked, assimilated but not free.
1352  *   ENOMEM page assimilated, but memory too low to relocate. Page not free.
1353  *   EAGAIN page not assimilated. Page not free.
1354  *   ERANGE page assimilated. Page not root.
1355  *   0      page assimilated. Page free.
1356  *   *nfreedp number of pages freed.
1357  * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way
1358  * to distinguish between a page that was already a NORELOC page from
1359  * those newly converted to NORELOC pages by this invocation of
1360  * kcage_assimilate_page.
1361  */
1362 static int
kcage_assimilate_page(page_t * pp,pgcnt_t * nfreedp)1363 kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp)
1364 {
1365 	if (page_trylock(pp, SE_EXCL)) {
1366 		if (PP_ISNORELOC(pp)) {
1367 check_free_and_return:
1368 			if (PP_ISFREE(pp)) {
1369 				page_unlock(pp);
1370 				*nfreedp = 0;
1371 				return (0);
1372 			} else {
1373 				page_unlock(pp);
1374 				return (EBUSY);
1375 			}
1376 			/*NOTREACHED*/
1377 		}
1378 	} else {
1379 		if (page_trylock(pp, SE_SHARED)) {
1380 			if (PP_ISNORELOC(pp))
1381 				goto check_free_and_return;
1382 		} else {
1383 			return (EAGAIN);
1384 		}
1385 		if (!PP_ISFREE(pp)) {
1386 			page_unlock(pp);
1387 			return (EAGAIN);
1388 		}
1389 
1390 		/*
1391 		 * Need to upgrade the lock on it and set the NORELOC
1392 		 * bit. If it is free then remove it from the free
1393 		 * list so that the platform free list code can keep
1394 		 * NORELOC pages where they should be.
1395 		 */
1396 		/*
1397 		 * Before doing anything, get the exclusive lock.
1398 		 * This may fail (eg ISM pages are left shared locked).
1399 		 * If the page is free this will leave a hole in the
1400 		 * cage. There is no solution yet to this.
1401 		 */
1402 		if (!page_tryupgrade(pp)) {
1403 			page_unlock(pp);
1404 			return (EAGAIN);
1405 		}
1406 	}
1407 
1408 	ASSERT(PAGE_EXCL(pp));
1409 
1410 	if (PP_ISFREE(pp)) {
1411 		int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST;
1412 
1413 		page_list_sub(pp, which);
1414 		ASSERT(pp->p_szc == 0);
1415 		PP_SETNORELOC(pp);
1416 		PLCNT_XFER_NORELOC(pp);
1417 		page_list_add(pp, which | PG_LIST_TAIL);
1418 
1419 		page_unlock(pp);
1420 		*nfreedp = 1;
1421 		return (0);
1422 	} else {
1423 		if (pp->p_szc != 0) {
1424 			if (!kcage_setnoreloc_pages(pp, SE_EXCL)) {
1425 				page_unlock(pp);
1426 				return (EAGAIN);
1427 			}
1428 			ASSERT(PP_ISNORELOC(pp));
1429 		} else {
1430 			PP_SETNORELOC(pp);
1431 		}
1432 		PLCNT_XFER_NORELOC(pp);
1433 		return (kcage_invalidate_page(pp, nfreedp));
1434 	}
1435 	/*NOTREACHED*/
1436 }
1437 
1438 static int
kcage_expand()1439 kcage_expand()
1440 {
1441 	int did_something = 0;
1442 
1443 	spgcnt_t wanted;
1444 	pfn_t pfn;
1445 	page_t *pp;
1446 	/* TODO: we don't really need n any more? */
1447 	pgcnt_t n;
1448 	pgcnt_t nf, nfreed;
1449 
1450 	/*
1451 	 * Expand the cage if available cage memory is really low. Calculate
1452 	 * the amount required to return kcage_freemem to the level of
1453 	 * kcage_lotsfree, or to satisfy throttled requests, whichever is
1454 	 * more.  It is rare for their sum to create an artificial threshold
1455 	 * above kcage_lotsfree, but it is possible.
1456 	 *
1457 	 * Exit early if expansion amount is equal to or less than zero.
1458 	 * (<0 is possible if kcage_freemem rises suddenly.)
1459 	 *
1460 	 * Exit early when freemem drops below pageout_reserve plus the request.
1461 	 */
1462 	wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree)
1463 	    - kcage_freemem;
1464 	if (wanted <= 0) {
1465 		return (0);
1466 	} else if (freemem < pageout_reserve + wanted) {
1467 		KCAGE_STAT_INCR(ke_lowfreemem);
1468 		return (0);
1469 	}
1470 
1471 	KCAGE_STAT_INCR(ke_calls);
1472 	KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted);
1473 
1474 	/*
1475 	 * Assimilate more pages from the global page pool into the cage.
1476 	 */
1477 	n = 0;				/* number of pages PP_SETNORELOC'd */
1478 	nf = 0;				/* number of those actually free */
1479 	while (kcage_on && nf < wanted) {
1480 		pfn = kcage_get_pfn(1);
1481 		if (pfn == PFN_INVALID) {	/* eek! no where to grow */
1482 			KCAGE_STAT_INCR(ke_nopfn);
1483 			goto terminate;
1484 		}
1485 
1486 		KCAGE_STAT_INCR_SCAN(ke_examined);
1487 
1488 		if ((pp = page_numtopp_nolock(pfn)) == NULL) {
1489 			KCAGE_STAT_INCR(ke_nopaget);
1490 			continue;
1491 		}
1492 		KCAGEPAGETS_INC();
1493 		/*
1494 		 * Sanity check. Skip this pfn if it is
1495 		 * being deleted.
1496 		 */
1497 		if (pfn_is_being_deleted(pfn)) {
1498 			KCAGE_STAT_INCR(ke_deleting);
1499 			continue;
1500 		}
1501 
1502 		if (PP_ISNORELOC(pp)) {
1503 			KCAGE_STAT_INCR(ke_isnoreloc);
1504 			continue;
1505 		}
1506 
1507 		switch (kcage_assimilate_page(pp, &nfreed)) {
1508 			case 0:		/* assimilated, page is free */
1509 				KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed);
1510 				did_something = 1;
1511 				nf += nfreed;
1512 				n++;
1513 				break;
1514 
1515 			case EBUSY:	/* assimilated, page not free */
1516 			case ERANGE:	/* assimilated, page not root */
1517 				KCAGE_STAT_INCR_SCAN(ke_gotone);
1518 				did_something = 1;
1519 				n++;
1520 				break;
1521 
1522 			case ENOMEM:	/* assimilated, but no mem */
1523 				KCAGE_STAT_INCR(ke_terminate);
1524 				did_something = 1;
1525 				n++;
1526 				goto terminate;
1527 
1528 			case EAGAIN:	/* can't assimilate */
1529 				KCAGE_STAT_INCR_SCAN(ke_lefthole);
1530 				break;
1531 
1532 			default:	/* catch this with debug kernels */
1533 				ASSERT(0);
1534 				break;
1535 		}
1536 	}
1537 
1538 	/*
1539 	 * Realign cage edge with the nearest physical address
1540 	 * boundry for big pages. This is done to give us a
1541 	 * better chance of actually getting usable big pages
1542 	 * in the cage.
1543 	 */
1544 
1545 terminate:
1546 
1547 	return (did_something);
1548 }
1549 
1550 /*
1551  * Relocate page opp (Original Page Pointer) from cage pool to page rpp
1552  * (Replacement Page Pointer) in the global pool. Page opp will be freed
1553  * if relocation is successful, otherwise it is only unlocked.
1554  * On entry, page opp must be exclusively locked and not free.
1555  * *nfreedp: number of pages freed.
1556  */
1557 static int
kcage_relocate_page(page_t * pp,pgcnt_t * nfreedp)1558 kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp)
1559 {
1560 	page_t *opp = pp;
1561 	page_t *rpp = NULL;
1562 	spgcnt_t npgs;
1563 	int result;
1564 
1565 	ASSERT(!PP_ISFREE(opp));
1566 	ASSERT(PAGE_EXCL(opp));
1567 
1568 	result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL);
1569 	*nfreedp = npgs;
1570 	if (result == 0) {
1571 		while (npgs-- > 0) {
1572 			page_t *tpp;
1573 
1574 			ASSERT(rpp != NULL);
1575 			tpp = rpp;
1576 			page_sub(&rpp, tpp);
1577 			page_unlock(tpp);
1578 		}
1579 
1580 		ASSERT(rpp == NULL);
1581 
1582 		return (0);		/* success */
1583 	}
1584 
1585 	page_unlock(opp);
1586 	return (result);
1587 }
1588 
1589 /*
1590  * Based on page_invalidate_pages()
1591  *
1592  * Kcage_invalidate_page() uses page_relocate() twice. Both instances
1593  * of use must be updated to match the new page_relocate() when it
1594  * becomes available.
1595  *
1596  * Return result of kcage_relocate_page or zero if page was directly freed.
1597  * *nfreedp: number of pages freed.
1598  */
1599 static int
kcage_invalidate_page(page_t * pp,pgcnt_t * nfreedp)1600 kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp)
1601 {
1602 	int result;
1603 
1604 #if defined(__sparc)
1605 	ASSERT(pp->p_vnode != &promvp);
1606 #endif /* __sparc */
1607 	ASSERT(!PP_ISFREE(pp));
1608 	ASSERT(PAGE_EXCL(pp));
1609 
1610 	/*
1611 	 * Is this page involved in some I/O? shared?
1612 	 * The page_struct_lock need not be acquired to
1613 	 * examine these fields since the page has an
1614 	 * "exclusive" lock.
1615 	 */
1616 	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1617 		result = kcage_relocate_page(pp, nfreedp);
1618 #ifdef KCAGE_STATS
1619 		if (result == 0)
1620 			KCAGE_STAT_INCR_SCAN(kip_reloclocked);
1621 		else if (result == ENOMEM)
1622 			KCAGE_STAT_INCR_SCAN(kip_nomem);
1623 #endif
1624 		return (result);
1625 	}
1626 
1627 	ASSERT(pp->p_vnode->v_type != VCHR);
1628 
1629 	/*
1630 	 * Unload the mappings and check if mod bit is set.
1631 	 */
1632 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1633 
1634 	if (hat_ismod(pp)) {
1635 		result = kcage_relocate_page(pp, nfreedp);
1636 #ifdef KCAGE_STATS
1637 		if (result == 0)
1638 			KCAGE_STAT_INCR_SCAN(kip_relocmod);
1639 		else if (result == ENOMEM)
1640 			KCAGE_STAT_INCR_SCAN(kip_nomem);
1641 #endif
1642 		return (result);
1643 	}
1644 
1645 	if (!page_try_demote_pages(pp)) {
1646 		KCAGE_STAT_INCR_SCAN(kip_demotefailed);
1647 		page_unlock(pp);
1648 		return (EAGAIN);
1649 	}
1650 
1651 	/* LINTED: constant in conditional context */
1652 	VN_DISPOSE(pp, B_INVAL, 0, kcred);
1653 	KCAGE_STAT_INCR_SCAN(kip_destroy);
1654 	*nfreedp = 1;
1655 	return (0);
1656 }
1657 
1658 /*
1659  * Expand cage only if there is not enough memory to satisfy
1660  * current request. We only do one (complete) scan of the cage.
1661  * Dirty pages and pages with shared mappings are skipped;
1662  * Locked pages (p_lckcnt and p_cowcnt) are also skipped.
1663  * All other pages are freed (if they can be locked).
1664  * This may affect caching of user pages which are in cage by freeing/
1665  * reclaiming them more often. However cage is mainly for kernel (heap)
1666  * pages and we want to keep user pages outside of cage. The above policy
1667  * should also reduce cage expansion plus it should speed up cage mem
1668  * allocations.
1669  */
1670 static void
kcage_cageout()1671 kcage_cageout()
1672 {
1673 	pfn_t pfn;
1674 	page_t *pp;
1675 	callb_cpr_t cprinfo;
1676 	int did_something;
1677 	pfn_t start_pfn;
1678 	ulong_t shared_level = 8;
1679 	pgcnt_t nfreed;
1680 #ifdef KCAGE_STATS
1681 	clock_t scan_start;
1682 #endif
1683 
1684 	CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex,
1685 	    callb_generic_cpr, "cageout");
1686 
1687 	mutex_enter(&kcage_cageout_mutex);
1688 	kcage_cageout_thread = curthread;
1689 
1690 	pfn = PFN_INVALID;		/* force scan reset */
1691 	start_pfn = PFN_INVALID;	/* force init with 1st cage pfn */
1692 	kcage_cageout_ready = 1;	/* switch kcage_cageout_wakeup mode */
1693 
1694 loop:
1695 	/*
1696 	 * Wait here. Sooner or later, kcage_freemem_sub() will notice
1697 	 * that kcage_freemem is less than kcage_desfree. When it does
1698 	 * notice, kcage_freemem_sub() will wake us up via call to
1699 	 * kcage_cageout_wakeup().
1700 	 */
1701 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
1702 	cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex);
1703 	CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex);
1704 
1705 	KCAGE_STAT_INCR(kt_wakeups);
1706 	KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem);
1707 	KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem);
1708 #ifdef KCAGE_STATS
1709 	scan_start = ddi_get_lbolt();
1710 #endif
1711 	if (!kcage_on)
1712 		goto loop;
1713 
1714 	KCAGE_STAT_INCR(kt_scans);
1715 	KCAGE_STAT_INCR_SCAN(kt_passes);
1716 
1717 	did_something = 0;
1718 	while (kcage_freemem < kcage_lotsfree + kcage_needfree) {
1719 
1720 		if ((pfn = kcage_walk_cage(pfn == PFN_INVALID)) ==
1721 		    PFN_INVALID) {
1722 			break;
1723 		}
1724 
1725 		if (start_pfn == PFN_INVALID)
1726 			start_pfn = pfn;
1727 		else if (start_pfn == pfn) {
1728 			/*
1729 			 * Did a complete walk of kernel cage, but didn't free
1730 			 * any pages.  If only one cpu is active then
1731 			 * stop kernel cage walk and try expanding.
1732 			 */
1733 			if (cp_default.cp_ncpus == 1 && did_something == 0) {
1734 				KCAGE_STAT_INCR(kt_cageout_break);
1735 				break;
1736 			}
1737 		}
1738 
1739 		pp = page_numtopp_nolock(pfn);
1740 		if (pp == NULL) {
1741 			continue;
1742 		}
1743 
1744 		KCAGE_STAT_INCR_SCAN(kt_examined);
1745 
1746 		/*
1747 		 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside
1748 		 * of the lock. If one is missed it will be seen next
1749 		 * time through.
1750 		 *
1751 		 * Skip non-caged-pages. These pages can exist in the cage
1752 		 * because, if during cage expansion, a page is
1753 		 * encountered that is long-term locked the lock prevents the
1754 		 * expansion logic from setting the P_NORELOC flag. Hence,
1755 		 * non-caged-pages surrounded by caged-pages.
1756 		 */
1757 		if (!PP_ISNORELOC(pp)) {
1758 			switch (kcage_assimilate_page(pp, &nfreed)) {
1759 				case 0:
1760 					did_something = 1;
1761 					KCAGE_STAT_NINCR_SCAN(kt_gotonefree,
1762 					    nfreed);
1763 					break;
1764 
1765 				case EBUSY:
1766 				case ERANGE:
1767 					did_something = 1;
1768 					KCAGE_STAT_INCR_SCAN(kt_gotone);
1769 					break;
1770 
1771 				case EAGAIN:
1772 				case ENOMEM:
1773 					break;
1774 
1775 				default:
1776 					/* catch this with debug kernels */
1777 					ASSERT(0);
1778 					break;
1779 			}
1780 
1781 			continue;
1782 		} else {
1783 			if (PP_ISFREE(pp)) {
1784 				continue;
1785 			}
1786 
1787 			if ((PP_ISKAS(pp) && pp->p_lckcnt > 0) ||
1788 			    !page_trylock(pp, SE_EXCL)) {
1789 				KCAGE_STAT_INCR_SCAN(kt_cantlock);
1790 				continue;
1791 			}
1792 
1793 			/* P_NORELOC bit should not have gone away. */
1794 			ASSERT(PP_ISNORELOC(pp));
1795 			if (PP_ISFREE(pp) || (PP_ISKAS(pp) &&
1796 			    pp->p_lckcnt > 0)) {
1797 				page_unlock(pp);
1798 				continue;
1799 			}
1800 
1801 			if (hat_page_checkshare(pp, shared_level)) {
1802 				page_unlock(pp);
1803 				KCAGE_STAT_INCR_SCAN(kt_skipshared);
1804 				continue;
1805 			}
1806 
1807 			if (kcage_invalidate_page(pp, &nfreed) == 0) {
1808 				did_something = 1;
1809 				KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed);
1810 			}
1811 
1812 			/*
1813 			 * No need to drop the page lock here.
1814 			 * Kcage_invalidate_page has done that for us
1815 			 * either explicitly or through a page_free.
1816 			 */
1817 		}
1818 	}
1819 
1820 	if (kcage_freemem < kcage_throttlefree + kcage_needfree)
1821 		(void) kcage_expand();
1822 
1823 	if (kcage_on && kcage_cageout_ready)
1824 		cv_broadcast(&kcage_throttle_cv);
1825 
1826 	KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem);
1827 	KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem);
1828 	KCAGE_STAT_SET_SCAN(kt_ticks, ddi_get_lbolt() - scan_start);
1829 	KCAGE_STAT_INC_SCAN_INDEX;
1830 	goto loop;
1831 
1832 	/*NOTREACHED*/
1833 }
1834 
1835 void
kcage_cageout_wakeup()1836 kcage_cageout_wakeup()
1837 {
1838 	if (mutex_tryenter(&kcage_cageout_mutex)) {
1839 		if (kcage_cageout_ready) {
1840 			cv_signal(&kcage_cageout_cv);
1841 		} else if (kcage_freemem < kcage_minfree || kcage_needfree) {
1842 			/*
1843 			 * Available cage memory is really low. Time to
1844 			 * start expanding the cage. However, the
1845 			 * kernel cage thread is not yet ready to
1846 			 * do the work. Use *this* thread, which is
1847 			 * most likely to be t0, to do the work.
1848 			 */
1849 			KCAGE_STAT_INCR(kcw_expandearly);
1850 			(void) kcage_expand();
1851 			KCAGE_STAT_INC_SCAN_INDEX;
1852 		}
1853 
1854 		mutex_exit(&kcage_cageout_mutex);
1855 	}
1856 	/* else, kernel cage thread is already running */
1857 }
1858 
1859 void
kcage_tick()1860 kcage_tick()
1861 {
1862 	/*
1863 	 * Once per second we wake up all the threads throttled
1864 	 * waiting for cage memory, in case we've become stuck
1865 	 * and haven't made forward progress expanding the cage.
1866 	 */
1867 	if (kcage_on && kcage_cageout_ready)
1868 		cv_broadcast(&kcage_throttle_cv);
1869 }
1870