/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2021 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vm.h>
#include <sys/vmparam.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/user.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>
#include <sys/mem_cage.h>
#include <sys/time.h>
#include <sys/stdbool.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>

/*
 * FREE MEMORY MANAGEMENT
 *
 * Management of the pool of free pages is a tricky business. There are
 * several critical threshold values which constrain our allocation of new
 * pages and inform the rate of paging out of memory to swap. These threshold
 * values, and the behaviour they induce, are described below in descending
 * order of size -- and thus increasing order of severity!
 *
 *   +---------------------------------------------------- physmem (all memory)
 *   |
 *   | Ordinarily there are no particular constraints placed on page
 *   v allocation. The page scanner is not running and page_create_va()
 *   | will effectively grant all page requests (whether from the kernel
 *   | or from user processes) without artificial delay.
 *   |
 *   +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB)
 *   |
 *   | When we have less than "lotsfree" pages, pageout_scanner() is
 *   v signalled by schedpaging() to begin looking for pages that can
 *   | be evicted to disk to bring us back above lotsfree. At this
 *   | stage there is still no constraint on allocation of free pages.
 *   |
 *   | For small systems, we set a lower bound of 16MB for lotsfree;
 *   v this is the natural value for a system with 1GB memory. This is
 *   | to ensure that the pageout reserve pool contains at least 4MB
 *   | for use by ZFS.
 *   |
 *   | For systems with a large amount of memory, we constrain lotsfree
 *   | to be at most 2GB (with a pageout reserve of around 0.5GB), as
 *   v at some point the required slack relates more closely to the
 *   | rate at which paging can occur than to the total amount of memory.
 *   |
 *   +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB)
 *   |
 *   | When we drop below desfree, a number of kernel facilities will
 *   v wait before allocating more memory, under the assumption that
 *   | pageout or reaping will make progress and free up some memory.
 *   | This behaviour is not especially coordinated; look for comparisons
 *   | of desfree and freemem.
 *   |
 *   | In addition to various attempts at advisory caution, clock()
 *   | will wake up the thread that is ordinarily parked in sched().
 *   | This routine is responsible for the heavy-handed swapping out
 *   v of entire processes in an attempt to arrest the slide of free
 *   | memory. See comments in sched.c for more details.
 *   |
 *   +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB)
 *   |
 *   | These two separate tunables have, by default, the same value.
 *   v Various parts of the kernel use minfree to signal the need for
 *   | more aggressive reclamation of memory, and sched() is more
 *   | aggressive at swapping processes out.
 *   |
 *   | If free memory falls below throttlefree, page_create_va() will
 *   | use page_create_throttle() to begin holding most requests for
 *   | new pages while pageout and reaping free up memory. Sleeping
 *   v allocations (e.g., KM_SLEEP) are held here while we wait for
 *   | more memory. Non-sleeping allocations are generally allowed to
 *   | proceed, unless their priority is explicitly lowered with
 *   | KM_NORMALPRI.
 *   |
 *   +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB)
 *   |
 *   | When we hit throttlefree, the situation is already dire. The
 *   v system is generally paging out memory and swapping out entire
 *   | processes in order to free up memory for continued operation.
 *   |
 *   | Unfortunately, evicting memory to disk generally requires short
 *   | term use of additional memory; e.g., allocation of buffers for
 *   | storage drivers, updating maps of free and used blocks, etc.
 *   | As such, pageout_reserve is the number of pages that we keep in
 *   | special reserve for use by pageout() and sched() and by any
 *   v other parts of the kernel that need to be working for those to
 *   | make forward progress such as the ZFS I/O pipeline.
 *   |
 *   | When we are below pageout_reserve, we fail or hold any allocation
 *   | that has not explicitly requested access to the reserve pool.
 *   | Access to the reserve is generally granted via the KM_PUSHPAGE
 *   | flag, or by marking a thread T_PUSHPAGE such that all allocations
 *   | can implicitly tap the reserve. For more details, see the
 *   v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE
 *   | and VM_PUSHPAGE allocation flags, and page_create_throttle().
 *   |
 *   +---------------------------------------------------------- no free memory
 *   |
 *   | If we have arrived here, things are very bad indeed. It is
 *   v surprisingly difficult to tell if this condition is even fatal,
 *   | as enough memory may have been granted to pageout() and to the
 *   | ZFS I/O pipeline that requests for eviction that have already been
 *   | made will complete and free up memory some time soon.
 *   |
 *   | If free memory does not materialise, the system generally remains
 *   | deadlocked. The pageout_deadman() below is run once per second
 *   | from clock(), seeking to limit the amount of time a single request
 *   v to page out can be blocked before the system panics to get a crash
 *   | dump and return to service.
 *   |
 *   +-------------------------------------------------------------------------
 */
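
/*
 * To make these thresholds concrete, consider a hypothetical machine with
 * 64GB of physical memory and all tunables at their defaults (figures are
 * illustrative only):
 *
 *	lotsfree	= 64GB / 64		= 1GB
 *	desfree		= lotsfree / 2		= 512MB
 *	minfree		= 3 * desfree / 4	= 384MB
 *	throttlefree	= minfree		= 384MB
 *	pageout_reserve	= 3 * throttlefree / 4	= 288MB
 *
 * Here lotsfree falls within the [16MB, 2GB] clamp, so no adjustment
 * applies; see setupclock() for the precise calculations.
 */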

/*
 * The following parameters control operation of the page replacement
 * algorithm. They are initialized to 0, and then computed at boot time based
 * on the size of the system; see setupclock(). If they are patched non-zero
 * in a loaded vmunix they are left alone and may thus be changed per system
 * using "mdb -kw" on the loaded system.
 */
pgcnt_t slowscan = 0;
pgcnt_t fastscan = 0;

static pgcnt_t handspreadpages = 0;

/*
 * looppages:
 *	Cached copy of the total number of pages in the system (total_pages).
 *
 * loopfraction:
 *	Divisor used to relate fastscan to looppages in setupclock().
 */
static uint_t loopfraction = 2;
static pgcnt_t looppages;

static uint_t min_percent_cpu = 4;
static uint_t max_percent_cpu = 80;
static pgcnt_t maxfastscan = 0;
static pgcnt_t maxslowscan = 100;

#define	MEGABYTES		(1024ULL * 1024ULL)

/*
 * pageout_threshold_style:
 *	set to 1 to use the previous default threshold size calculation;
 *	i.e., each threshold is half of the next largest value.
 */
uint_t pageout_threshold_style = 0;

/*
 * The operator may override these tunables to request a different minimum or
 * maximum lotsfree value, or to change the divisor we use for automatic
 * sizing.
 *
 * By default, we make lotsfree 1/64th of the total memory in the machine. The
 * minimum and maximum are specified in bytes, rather than pages; a zero value
 * means the default values (below) are used.
 */
uint_t lotsfree_fraction = 64;
pgcnt_t lotsfree_min = 0;
pgcnt_t lotsfree_max = 0;

#define	LOTSFREE_MIN_DEFAULT	(16 * MEGABYTES)
#define	LOTSFREE_MAX_DEFAULT	(2048 * MEGABYTES)
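
/*
 * As an illustration (values are hypothetical, not recommendations), an
 * operator wanting a larger free-memory target might set, in /etc/system:
 *
 *	set lotsfree_fraction = 32
 *	set lotsfree_min = 0x2000000
 *
 * i.e., size lotsfree at 1/32nd of memory, with a floor of 32MB. Overrides
 * are captured in the "clockinit" struct on the first call to setupclock()
 * and reapplied on any later recalculation.
 */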

/*
 * If these tunables are set to non-zero values in /etc/system, and provided
 * the value is not larger than the threshold above, the specified value will
 * be used directly without any additional calculation or adjustment. The boot
 * time value of these overrides is preserved in the "clockinit" struct. More
 * detail is available in the comment at the top of the file.
 */
pgcnt_t maxpgio = 0;
pgcnt_t minfree = 0;
pgcnt_t desfree = 0;
pgcnt_t lotsfree = 0;
pgcnt_t needfree = 0;
pgcnt_t throttlefree = 0;
pgcnt_t pageout_reserve = 0;

pgcnt_t deficit;
pgcnt_t nscan;
pgcnt_t desscan;

/*
 * Values for min_pageout_nsec, max_pageout_nsec and pageout_nsec are the
 * number of nanoseconds in each wakeup cycle that gives the equivalent of some
 * underlying %CPU duty cycle.
 *
 * min_pageout_nsec:
 *	nanoseconds/wakeup equivalent of min_percent_cpu.
 *
 * max_pageout_nsec:
 *	nanoseconds/wakeup equivalent of max_percent_cpu.
 *
 * pageout_nsec:
 *	Number of nanoseconds budgeted for each wakeup cycle.
 *	Computed each time around by schedpaging().
 *	Varies between min_pageout_nsec and max_pageout_nsec,
 *	depending on memory pressure.
 */
static hrtime_t min_pageout_nsec;
static hrtime_t max_pageout_nsec;
static hrtime_t pageout_nsec;

static uint_t reset_hands;

#define	PAGES_POLL_MASK	1023

/*
 * pageout_sample_lim:
 *	The limit on the number of samples needed to establish a value for new
 *	pageout parameters: fastscan, slowscan, pageout_new_spread, and
 *	handspreadpages.
 *
 * pageout_sample_cnt:
 *	Current sample number. Once the sample gets large enough, set new
 *	values for handspreadpages, pageout_new_spread, fastscan and slowscan.
 *
 * pageout_sample_pages:
 *	The accumulated number of pages scanned during sampling.
 *
 * pageout_sample_etime:
 *	The accumulated nanoseconds for the sample.
 *
 * pageout_rate:
 *	Rate in pages/nanosecond, computed at the end of sampling.
 *
 * pageout_new_spread:
 *	Initially zero while the system scan rate is measured by
 *	pageout_scanner(), which then sets this value once per system boot
 *	after enough samples have been recorded (pageout_sample_cnt). Once
 *	set, this new value is used for fastscan and handspreadpages.
 *
 * sample_start, sample_end:
 *	The hrtime at which the last pageout_scanner() sample began and ended.
 */
typedef hrtime_t hrrate_t;

static uint64_t pageout_sample_lim = 4;
static uint64_t pageout_sample_cnt = 0;
static pgcnt_t pageout_sample_pages = 0;
static hrrate_t pageout_rate = 0;
static pgcnt_t pageout_new_spread = 0;

static hrtime_t pageout_cycle_nsec;
static hrtime_t sample_start, sample_end;
static hrtime_t pageout_sample_etime = 0;

/*
 * Record number of times a pageout_scanner() wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages.
 */
uint64_t pageout_timeouts = 0;

#ifdef VM_STATS
static struct pageoutvmstats_str {
	ulong_t	checkpage[3];
} pageoutvmstats;
#endif	/* VM_STATS */

/*
 * Threads waiting for free memory use this condition variable and lock until
 * memory becomes available.
 */
kmutex_t memavail_lock;
kcondvar_t memavail_cv;

typedef enum pageout_hand {
	POH_FRONT = 1,
	POH_BACK,
} pageout_hand_t;

typedef enum {
	CKP_INELIGIBLE,
	CKP_NOT_FREED,
	CKP_FREED,
} checkpage_result_t;

static checkpage_result_t checkpage(page_t *, pageout_hand_t);

static struct clockinit {
	bool ci_init;
	pgcnt_t ci_lotsfree_min;
	pgcnt_t ci_lotsfree_max;
	pgcnt_t ci_lotsfree;
	pgcnt_t ci_desfree;
	pgcnt_t ci_minfree;
	pgcnt_t ci_throttlefree;
	pgcnt_t ci_pageout_reserve;
	pgcnt_t ci_maxpgio;
	pgcnt_t ci_maxfastscan;
	pgcnt_t ci_fastscan;
	pgcnt_t ci_slowscan;
	pgcnt_t ci_handspreadpages;
} clockinit = { .ci_init = false };

static pgcnt_t
clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
{
	if (value < minimum) {
		return (minimum);
	} else if (value > maximum) {
		return (maximum);
	} else {
		return (value);
	}
}

static pgcnt_t
tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
{
	if (initval == 0 || initval >= initval_ceiling) {
		return (defval);
	} else {
		return (initval);
	}
}

/*
 * Set up the paging constants for the clock algorithm used by
 * pageout_scanner(), and by the virtual memory system overall. See the
 * comments at the top of this file for more information about the threshold
 * values and system responses to memory pressure.
 *
 * This routine is called once by main() at startup, after the initial size of
 * physical memory is determined. It may be called again later if memory is
 * added to or removed from the system, or if new measurements of the page scan
 * rate become available.
 */
void
setupclock(void)
{
	pgcnt_t defval;
	bool half = (pageout_threshold_style == 1);
	bool recalc = true;

	looppages = total_pages;

	/*
	 * The operator may have provided specific values for some of the
	 * tunables via /etc/system. On our first call, we preserve those
	 * values so that they can be used for subsequent recalculations.
	 *
	 * A value of zero for any tunable means we will use the default
	 * sizing.
	 */
	if (!clockinit.ci_init) {
		clockinit.ci_init = true;

		clockinit.ci_lotsfree_min = lotsfree_min;
		clockinit.ci_lotsfree_max = lotsfree_max;
		clockinit.ci_lotsfree = lotsfree;
		clockinit.ci_desfree = desfree;
		clockinit.ci_minfree = minfree;
		clockinit.ci_throttlefree = throttlefree;
		clockinit.ci_pageout_reserve = pageout_reserve;
		clockinit.ci_maxpgio = maxpgio;
		clockinit.ci_maxfastscan = maxfastscan;
		clockinit.ci_fastscan = fastscan;
		clockinit.ci_slowscan = slowscan;
		clockinit.ci_handspreadpages = handspreadpages;

		/*
		 * The first call does not trigger a recalculation, only
		 * subsequent calls.
		 */
		recalc = false;
	}

	/*
	 * Configure paging threshold values. For more details on what each
	 * threshold signifies, see the comments at the top of this file.
	 */
	lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
	    btop(LOTSFREE_MAX_DEFAULT));
	lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
	    btop(LOTSFREE_MIN_DEFAULT));

	lotsfree = tune(clockinit.ci_lotsfree, looppages,
	    clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));

	desfree = tune(clockinit.ci_desfree, lotsfree,
	    lotsfree / 2);

	minfree = tune(clockinit.ci_minfree, desfree,
	    half ? desfree / 2 : 3 * desfree / 4);

	throttlefree = tune(clockinit.ci_throttlefree, desfree,
	    minfree);

	pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree,
	    half ? throttlefree / 2 : 3 * throttlefree / 4);

	/*
	 * maxpgio bounds how much paging I/O is acceptable. The historical
	 * model is that a disk arm that is 2/3 busy with paging is the most
	 * that is tolerable, and that each paging operation takes one disk
	 * revolution.
	 *
	 * XXX - Does not account for multiple swap devices.
	 */
	if (clockinit.ci_maxpgio == 0) {
		maxpgio = (DISKRPM * 2) / 3;
	} else {
		maxpgio = clockinit.ci_maxpgio;
	}
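
	/*
	 * For illustration: DISKRPM has historically modelled a 3600 RPM
	 * disk, i.e. 60 revolutions per second, making the default maxpgio
	 * (60 * 2) / 3 = 40 page I/Os per second. The precise value is
	 * platform-dependent.
	 */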

	/*
	 * The clock scan rate varies between fastscan and slowscan
	 * based on the amount of free memory available. Fastscan
	 * rate should be set based on the number of pages that can be
	 * scanned per sec using ~10% of processor time. Since this
	 * value depends on the processor, MMU, MHz etc., it is
	 * difficult to determine it in a generic manner for all
	 * architectures.
	 *
	 * Instead of trying to determine the number of pages scanned
	 * per sec for every processor, fastscan is set to be the smaller
	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
	 * time is limited to ~4% of processor time.
	 *
	 * Setting fastscan to be 1/2 of memory allows pageout to scan
	 * all of memory in ~2 secs. This implies that user pages not
	 * accessed within 1 sec (assuming, handspreadpages == fastscan)
	 * can be reclaimed when free memory is very low. Stealing pages
	 * not accessed within 1 sec seems reasonable and ensures that
	 * active user processes don't thrash.
	 *
	 * Smaller values of fastscan result in scanning fewer pages
	 * every second and consequently pageout may not be able to free
	 * sufficient memory to maintain the minimum threshold. Larger
	 * values of fastscan result in scanning a lot more pages which
	 * could lead to thrashing and higher CPU usage.
	 *
	 * Fastscan needs to be limited to a maximum value and should not
	 * scale with memory to prevent pageout from consuming too much
	 * time for scanning on slow CPUs and avoid thrashing, as a
	 * result of scanning too many pages, on faster CPUs.
	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
	 * (the upper bound for fastscan) based on the average number
	 * of pages that can potentially be scanned in ~1 sec (using ~4%
	 * of the CPU) on some of the following machines that currently
	 * run Solaris 2.x:
	 *
	 *			average memory scanned in ~1 sec
	 *
	 *	25 MHz SS1+:		23 Meg
	 *	LX:			37 Meg
	 *	50 MHz SC2000:		68 Meg
	 *
	 *	40 MHz 486:		26 Meg
	 *	66 MHz 486:		42 Meg
	 *
	 * When free memory falls just below lotsfree, the scan rate
	 * goes from 0 to slowscan (i.e., pageout starts running). This
	 * transition needs to be smooth and is achieved by ensuring that
	 * pageout scans a small number of pages to satisfy the transient
	 * memory demand. This is set to not exceed 100 pages/sec (25 per
	 * wakeup) since scanning that many pages has no noticeable impact
	 * on system performance.
	 *
	 * In addition to setting fastscan and slowscan, pageout is
	 * limited to using ~4% of the CPU. This results in increasing
	 * the time taken to scan all of memory, which in turn means that
	 * user processes have a better opportunity of preventing their
	 * pages from being stolen. This has a positive effect on
	 * interactive and overall system performance when memory demand
	 * is high.
	 *
	 * Thus, the rate at which pages are scanned for replacement will
	 * vary linearly between slowscan and the number of pages that
	 * can be scanned using ~4% of processor time instead of varying
	 * linearly between slowscan and fastscan.
	 *
	 * Also, the processor time used by pageout will vary from ~1%
	 * at slowscan to ~4% at fastscan instead of varying between
	 * ~1% at slowscan and ~10% at fastscan.
	 *
	 * The values chosen for the various VM parameters (fastscan,
	 * handspreadpages, etc) are not universally true for all machines,
	 * but appear to be a good rule of thumb for the machines we've
	 * tested. They have the following ranges:
	 *
	 *	cpu speed:	20 to 70 MHz
	 *	page size:	4K to 8K
	 *	memory size:	16M to 5G
	 *	page scan rate:	4000 - 17400 4K pages per sec
	 *
	 * The values need to be re-examined for machines which don't
	 * fall into the various ranges (e.g., slower or faster CPUs,
	 * smaller or larger pagesizes etc) shown above.
	 *
	 * On an MP machine, pageout is often unable to maintain the
	 * minimum paging thresholds under heavy load. This is due to
	 * the fact that user processes running on other CPUs can be
	 * dirtying memory at a much faster pace than pageout can find
	 * pages to free. The memory demands could be met by enabling
	 * more than one CPU to run the clock algorithm in such a manner
	 * that the various clock hands don't overlap. This also makes
	 * it more difficult to determine the values for fastscan, slowscan
	 * and handspreadpages.
	 *
	 * The swapper is currently used to free up memory when pageout
	 * is unable to meet memory demands by swapping out processes.
	 * In addition to freeing up memory, swapping also reduces the
	 * demand for memory by preventing user processes from running
	 * and thereby consuming memory.
	 */
	if (clockinit.ci_maxfastscan == 0) {
		if (pageout_new_spread != 0) {
			maxfastscan = pageout_new_spread;
		} else {
			maxfastscan = MAXHANDSPREADPAGES;
		}
	} else {
		maxfastscan = clockinit.ci_maxfastscan;
	}

	if (clockinit.ci_fastscan == 0) {
		fastscan = MIN(looppages / loopfraction, maxfastscan);
	} else {
		fastscan = clockinit.ci_fastscan;
	}

	if (fastscan > looppages / loopfraction) {
		fastscan = looppages / loopfraction;
	}

	/*
	 * Set slow scan time to 1/10 the fast scan time, but
	 * not to exceed maxslowscan.
	 */
	if (clockinit.ci_slowscan == 0) {
		slowscan = MIN(fastscan / 10, maxslowscan);
	} else {
		slowscan = clockinit.ci_slowscan;
	}

	if (slowscan > fastscan / 2) {
		slowscan = fastscan / 2;
	}

	/*
	 * Handspreadpages is the distance (in pages) between front and back
	 * pageout daemon hands. The amount of time to reclaim a page
	 * once pageout examines it increases with this distance and
	 * decreases as the scan rate rises. It must be < the amount
	 * of pageable memory.
	 *
	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
	 * to be "fastscan" results in the front hand being a few secs
	 * (varies based on the processor speed) ahead of the back hand
	 * at fastscan rates. This distance can be further reduced, if
	 * necessary, by increasing the processor time used by pageout
	 * to be more than ~4% and preferably not more than ~10%.
	 *
	 * As a result, user processes have a much better chance of
	 * referencing their pages before the back hand examines them.
	 * This also significantly lowers the number of reclaims from
	 * the freelist since pageout does not end up freeing pages which
	 * may be referenced a sec later.
	 */
	if (clockinit.ci_handspreadpages == 0) {
		handspreadpages = fastscan;
	} else {
		handspreadpages = clockinit.ci_handspreadpages;
	}

	/*
	 * Make sure that back hand follows front hand by at least
	 * 1/SCHEDPAGING_HZ seconds. Without this test, it is possible for the
	 * back hand to look at a page during the same wakeup of the pageout
	 * daemon in which the front hand cleared its ref bit.
	 */
	if (handspreadpages >= looppages) {
		handspreadpages = looppages - 1;
	}

	/*
	 * If we have been called to recalculate the parameters, set a flag to
	 * re-evaluate the clock hand pointers.
	 */
	if (recalc) {
		reset_hands = 1;
	}
}

/*
 * Pageout scheduling.
 *
 * Schedpaging controls the rate at which the page out daemon runs by
 * setting the global variables nscan and desscan SCHEDPAGING_HZ
 * times a second. Nscan records the number of pages pageout has examined
 * in its current pass; schedpaging() resets this value to zero each time
 * it runs. Desscan records the number of pages pageout should examine
 * in its next pass; schedpaging() sets this value based on the amount of
 * currently available memory.
 */
#define	SCHEDPAGING_HZ	4
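
/*
 * Note that schedpaging() re-arms its own timeout for hz / SCHEDPAGING_HZ
 * clock ticks, so with SCHEDPAGING_HZ at 4 each wakeup cycle discussed below
 * spans 250ms of wall time.
 */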

static kmutex_t pageout_mutex;	/* held while pageout or schedpaging running */

/*
 * Pool of available async pageout putpage requests.
 */
static struct async_reqs *push_req;
static struct async_reqs *req_freelist;	/* available req structs */
static struct async_reqs *push_list;	/* pending reqs */
static kmutex_t push_lock;		/* protects req pool */
static kcondvar_t push_cv;

/*
 * If pageout() is stuck on a single push for this many seconds,
 * pageout_deadman() will assume the system has hit a memory deadlock. If set
 * to 0, the deadman will have no effect.
 *
 * Note that we are only looking for stalls in the calls that pageout() makes
 * to VOP_PUTPAGE(). These calls are merely asynchronous requests for paging
 * I/O, which should not take long unless the underlying strategy call blocks
 * indefinitely for memory. The actual I/O request happens (or fails) later.
 */
uint_t pageout_deadman_seconds = 90;

static uint_t pageout_stucktime = 0;
static bool pageout_pushing = false;
static uint64_t pageout_pushcount = 0;
static uint64_t pageout_pushcount_seen = 0;

static int async_list_size = 256;	/* number of async request structs */

static void pageout_scanner(void);

/*
 * If a page is being shared more than "po_share" times
 * then leave it alone; don't page it out.
 */
#define	MIN_PO_SHARE	(8)
#define	MAX_PO_SHARE	((MIN_PO_SHARE) << 24)
ulong_t po_share = MIN_PO_SHARE;
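
/*
 * po_share is adjusted dynamically: pageout_scanner() doubles it (up to
 * MAX_PO_SHARE) whenever a full lap around the page loop fails to free
 * enough memory, and schedpaging() halves it (down to MIN_PO_SHARE) once
 * free memory is plentiful again. See the laps handling in
 * pageout_scanner() below.
 */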

/*
 * Schedule rate for paging.
 * Rate is linear interpolation between
 * slowscan with lotsfree and fastscan when out of memory.
 */
static void
schedpaging(void *arg)
{
	spgcnt_t vavail;

	if (freemem < lotsfree + needfree + kmem_reapahead)
		kmem_reap();

	if (freemem < lotsfree + needfree)
		seg_preap();

	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
		kcage_cageout_wakeup();

	if (mutex_tryenter(&pageout_mutex)) {
		/* pageout() not running */
		nscan = 0;
		vavail = freemem - deficit;
		if (pageout_new_spread != 0)
			vavail -= needfree;
		if (vavail < 0)
			vavail = 0;
		if (vavail > lotsfree)
			vavail = lotsfree;

		/*
		 * Fix for 1161438 (CRS SPR# 73922). All variables
		 * in the original calculation for desscan were 32 bit signed
		 * ints. As freemem approaches 0x0 on a system with 1 Gig or
		 * more of memory, the calculation can overflow. When this
		 * happens, desscan becomes negative and pageout_scanner()
		 * stops paging out.
		 */
		if (needfree > 0 && pageout_new_spread == 0) {
			/*
			 * If we've not yet collected enough samples to
			 * calculate a spread, use the old logic of kicking
			 * into high gear anytime needfree is non-zero.
			 */
			desscan = fastscan / SCHEDPAGING_HZ;
		} else {
			/*
			 * Once we've calculated a spread based on system
			 * memory and usage, just treat needfree as another
			 * form of deficit.
			 */
			spgcnt_t faststmp, slowstmp, result;

			slowstmp = slowscan * vavail;
			faststmp = fastscan * (lotsfree - vavail);
			result = (slowstmp + faststmp) /
			    nz(lotsfree) / SCHEDPAGING_HZ;
			desscan = (pgcnt_t)result;
		}
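
		/*
		 * To illustrate the interpolation (figures are hypothetical):
		 * with lotsfree at 262144 pages, slowscan at 100, fastscan at
		 * 131072, and vavail exactly halfway at 131072 pages, desscan
		 * works out to (100 * 131072 + 131072 * 131072) / 262144 / 4,
		 * or about 16396 pages per wakeup; that is, roughly
		 * (fastscan + slowscan) / 2 pages per second split across the
		 * four wakeups, as expected at the halfway point.
		 */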

		pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
		    (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);

		if (freemem < lotsfree + needfree ||
		    pageout_sample_cnt < pageout_sample_lim) {
			/*
			 * Either we need more memory, or we still need to
			 * measure the average scan rate. Wake the scanner.
			 */
			DTRACE_PROBE(pageout__cv__signal);
			cv_signal(&proc_pageout->p_cv);
		} else {
			/*
			 * There are enough free pages, no need to
			 * kick the scanner thread. And next time
			 * around, keep more of the `highly shared'
			 * pages.
			 */
			cv_signal_pageout();
			if (po_share > MIN_PO_SHARE) {
				po_share >>= 1;
			}
		}
		mutex_exit(&pageout_mutex);
	}

	/*
	 * Signal threads waiting for available memory.
	 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
	 * in this case it is not needed; the waiters will be woken up during
	 * the next invocation of this function.
	 */
	if (kmem_avail() > 0)
		cv_broadcast(&memavail_cv);

	(void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
}

pgcnt_t pushes;
ulong_t push_list_size;		/* # of requests on pageout queue */

/*
 * Paging out should always be enabled. This tunable exists to hold pageout
 * for debugging purposes. If set to 0, pageout_scanner() will go back to
 * sleep each time it is woken by schedpaging().
 */
uint_t dopageout = 1;

/*
 * The page out daemon, which runs as process 2.
 *
 * As long as there are at least lotsfree pages,
 * this process is not run. When the number of free
 * pages stays in the range desfree to lotsfree,
 * this daemon runs through the pages in the loop
 * at a rate determined in schedpaging(). Pageout manages
 * two hands on the clock. The front hand moves through
 * memory, clearing the reference bit,
 * and stealing pages from procs that are over maxrss.
 * The back hand travels a distance behind the front hand,
 * freeing the pages that have not been referenced in the time
 * since the front hand passed. If modified, they are pushed to
 * swap before being freed.
 *
 * There are 2 threads that act on behalf of the pageout process.
 * One thread scans pages (pageout_scanner) and frees them up if
 * they don't require any VOP_PUTPAGE operation. If a page must be
 * written back to its backing store, the request is put on a list
 * and the other (pageout) thread is signaled. The pageout thread
 * grabs VOP_PUTPAGE requests from the list, and processes them.
 * Some filesystems may require resources for the VOP_PUTPAGE
 * operations (like memory) and hence can block the pageout
 * thread, but the scanner thread can still operate. There is still
 * no guarantee that memory deadlocks cannot occur.
 *
 * For now, this thing is in very rough form.
 */
void
pageout()
{
	struct async_reqs *arg;
	pri_t pageout_pri;
	int i;
	pgcnt_t max_pushes;
	callb_cpr_t cprinfo;

	proc_pageout = ttoproc(curthread);
	proc_pageout->p_cstime = 0;
	proc_pageout->p_stime = 0;
	proc_pageout->p_cutime = 0;
	proc_pageout->p_utime = 0;
	bcopy("pageout", PTOU(curproc)->u_psargs, 8);
	bcopy("pageout", PTOU(curproc)->u_comm, 7);

	/*
	 * Create pageout scanner thread
	 */
	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate and initialize the async request structures
	 * for pageout.
	 */
	push_req = (struct async_reqs *)
	    kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);

	req_freelist = push_req;
	for (i = 0; i < async_list_size - 1; i++) {
		push_req[i].a_next = &push_req[i + 1];
	}

	pageout_pri = curthread->t_pri;

	/* Create the pageout scanner thread. */
	(void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
	    pageout_pri - 1);

	/*
	 * kick off pageout scheduler.
	 */
	schedpaging(NULL);

	/*
	 * Create kernel cage thread.
	 * The kernel cage thread is started under the pageout process
	 * to take advantage of the less restricted page allocation
	 * in page_create_throttle().
	 */
	kcage_cageout_init();

	/*
	 * Limit pushes to avoid saturating pageout devices.
	 */
	max_pushes = maxpgio / SCHEDPAGING_HZ;
	CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");

	for (;;) {
		mutex_enter(&push_lock);

		while ((arg = push_list) == NULL || pushes > max_pushes) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&push_cv, &push_lock);
			pushes = 0;
			CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
		}
		push_list = arg->a_next;
		arg->a_next = NULL;
		pageout_pushing = true;
		mutex_exit(&push_lock);

		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
		    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
			pushes++;
		}

		/* vp held by checkpage() */
		VN_RELE(arg->a_vp);

		mutex_enter(&push_lock);
		pageout_pushing = false;
		pageout_pushcount++;
		arg->a_next = req_freelist;	/* back on freelist */
		req_freelist = arg;
		push_list_size--;
		mutex_exit(&push_lock);
	}
}

/*
 * Kernel thread that scans pages looking for ones to free
 */
static void
pageout_scanner(void)
{
	struct page *fronthand, *backhand;
	uint_t laps;
	callb_cpr_t cprinfo;
	pgcnt_t nscan_limit;
	pgcnt_t pcount;
	bool sampling;

	CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
	mutex_enter(&pageout_mutex);

	/*
	 * The restart case does not attempt to point the hands at roughly
	 * the right point on the assumption that after one circuit things
	 * will have settled down, and restarts shouldn't be that often.
	 */

	/*
	 * Set the two clock hands to be separated by a reasonable amount,
	 * but no more than 360 degrees apart.
	 */
	backhand = page_first();
	if (handspreadpages >= total_pages) {
		fronthand = page_nextn(backhand, total_pages - 1);
	} else {
		fronthand = page_nextn(backhand, handspreadpages);
	}

	/*
	 * Establish the minimum and maximum length of time to be spent
	 * scanning pages per wakeup, limiting the scanner duty cycle. The
	 * input percentage values (0-100) must be converted to a fraction of
	 * the number of nanoseconds in a second of wall time, then further
	 * scaled down by the number of scanner wakeups in a second:
	 */
	min_pageout_nsec = MAX(1,
	    NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
	max_pageout_nsec = MAX(min_pageout_nsec,
	    NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
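
	/*
	 * With the default min_percent_cpu (4) and max_percent_cpu (80), and
	 * SCHEDPAGING_HZ wakeups per second, this budget works out to between
	 * 10ms and 200ms of scanning per 250ms wakeup cycle.
	 */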

loop:
	cv_signal_pageout();

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&proc_pageout->p_cv, &pageout_mutex);
	CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);

	/*
	 * Check if pageout has been disabled for debugging purposes:
	 */
	if (!dopageout) {
		goto loop;
	}

	/*
	 * One may reset the clock hands for debugging purposes. Hands will
	 * also be reset if memory is added to or removed from the system.
	 */
	if (reset_hands) {
		reset_hands = 0;

		backhand = page_first();
		if (handspreadpages >= total_pages) {
			fronthand = page_nextn(backhand, total_pages - 1);
		} else {
			fronthand = page_nextn(backhand, handspreadpages);
		}
	}

	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);

	/*
	 * Keep track of the number of times we have scanned all the way around
	 * the loop:
	 */
	laps = 0;

	DTRACE_PROBE(pageout__start);

	/*
	 * Track the number of pages visited during this scan so that we can
	 * periodically measure our duty cycle.
	 */
	pcount = 0;

	if (pageout_sample_cnt < pageout_sample_lim) {
		/*
		 * We need to measure the rate at which the system is able to
		 * scan pages of memory. Each of these initial samples is a
		 * scan of all system memory, regardless of whether or not we
		 * are experiencing memory pressure.
		 */
		nscan_limit = total_pages;
		sampling = true;
	} else {
		nscan_limit = desscan;
		sampling = false;
	}

	sample_start = gethrtime();

	/*
	 * Scan the appropriate number of pages for a single duty cycle.
	 */
	while (nscan < nscan_limit) {
		checkpage_result_t rvfront, rvback;

		if (!sampling && freemem >= lotsfree + needfree) {
			/*
			 * We are not sampling and enough memory has become
			 * available that scanning is no longer required.
			 */
			break;
		}

		/*
		 * Periodically check to see if we have exceeded the CPU duty
		 * cycle for a single wakeup.
		 */
		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
			pageout_cycle_nsec = gethrtime() - sample_start;
			if (pageout_cycle_nsec >= pageout_nsec) {
				++pageout_timeouts;
				break;
			}
		}

		/*
		 * If checkpage manages to add a page to the free list,
		 * we give ourselves another couple of trips around the loop.
		 */
		if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) {
			laps = 0;
		}
		if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) {
			laps = 0;
		}

		++pcount;

		/*
		 * Protected by pageout_mutex instead of cpu_stat_lock:
		 */
		CPU_STATS_ADDQ(CPU, vm, scan, 1);

		/*
		 * Don't include ineligible pages in the number scanned.
		 */
		if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
			nscan++;
		}

		backhand = page_next(backhand);
		fronthand = page_next(fronthand);

		/*
		 * The front hand has wrapped around to the first page in the
		 * loop.
		 */
		if (fronthand == page_first()) {
			laps++;
			DTRACE_PROBE1(pageout__hand__wrap, uint_t, laps);

			/*
			 * Protected by pageout_mutex instead of cpu_stat_lock:
			 */
			CPU_STATS_ADDQ(CPU, vm, rev, 1);

			if (laps > 1) {
				/*
				 * Extremely unlikely, but it happens.
				 * We went around the loop at least once
				 * and didn't get far enough.
				 * If we are still skipping `highly shared'
				 * pages, skip fewer of them. Otherwise,
				 * give up till the next clock tick.
				 */
				if (po_share < MAX_PO_SHARE) {
					po_share <<= 1;
				} else {
					break;
				}
			}
		}
	}

	sample_end = gethrtime();

	DTRACE_PROBE1(pageout__end, uint_t, laps);

	if (pageout_new_spread == 0) {
		if (pageout_sample_cnt < pageout_sample_lim) {
			/*
			 * Continue accumulating samples until we have enough
			 * to get a reasonable value for average scan rate:
			 */
			pageout_sample_pages += pcount;
			pageout_sample_etime += sample_end - sample_start;
			++pageout_sample_cnt;
		}

		if (pageout_sample_cnt >= pageout_sample_lim) {
			/*
			 * We have enough samples, set the spread.
			 */
			pageout_rate = (hrrate_t)pageout_sample_pages *
			    (hrrate_t)(NANOSEC) / pageout_sample_etime;
			pageout_new_spread = pageout_rate / 10;
			setupclock();
		}
	}
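
	/*
	 * As an illustration (figures are hypothetical): if the sample passes
	 * visited a total of 2 million pages over half a second of
	 * accumulated scan time, pageout_rate is 4 million pages/second and
	 * pageout_new_spread becomes 400000 pages, which setupclock() then
	 * uses to bound fastscan and handspreadpages.
	 */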

	goto loop;
}

/*
 * The pageout deadman is run once per second by clock().
 */
void
pageout_deadman(void)
{
	if (panicstr != NULL) {
		/*
		 * There is no pageout after panic.
		 */
		return;
	}

	if (pageout_deadman_seconds == 0) {
		/*
		 * The deadman is not enabled.
		 */
		return;
	}

	if (!pageout_pushing) {
		goto reset;
	}

	/*
	 * We are pushing a page. Check to see if it is the same call we saw
	 * last time we looked:
	 */
	if (pageout_pushcount != pageout_pushcount_seen) {
		/*
		 * It is a different call from the last check, so we are not
		 * stuck.
		 */
		goto reset;
	}

	if (++pageout_stucktime >= pageout_deadman_seconds) {
		panic("pageout_deadman: stuck pushing the same page for %d "
		    "seconds (freemem is %lu)", pageout_deadman_seconds,
		    freemem);
	}

	return;

reset:
	/*
	 * Reset our tracking state to reflect that we are not stuck:
	 */
	pageout_stucktime = 0;
	pageout_pushcount_seen = pageout_pushcount;
}

/*
 * Look at the page at hand. If it is locked (e.g., for physical i/o),
 * system (u., page table) or free, then leave it alone. Otherwise,
 * if we are running the front hand, turn off the page's reference bit.
 * If the proc is over maxrss, we take it. If running the back hand,
 * check whether the page has been reclaimed. If not, free the page,
 * pushing it to disk first if necessary.
 *
 * Return values:
 *	CKP_INELIGIBLE if the page is not a candidate at all,
 *	CKP_NOT_FREED if the page was not freed, or
 *	CKP_FREED if we freed it.
 */
static checkpage_result_t
checkpage(struct page *pp, pageout_hand_t whichhand)
{
	int ppattr;
	int isfs = 0;
	int isexec = 0;
	int pagesync_flag;

	/*
	 * Skip pages:
	 *	- associated with the kernel vnode, since
	 *	    they are always "exclusively" locked.
	 *	- that are free
	 *	- that are shared more than po_share times
	 *	- that are already locked
	 *
	 * NOTE: These optimizations assume that reads are atomic.
	 */

	if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
	    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
	    hat_page_checkshare(pp, po_share)) {
		return (CKP_INELIGIBLE);
	}

	if (!page_trylock(pp, SE_EXCL)) {
		/*
		 * Skip the page if we can't acquire the "exclusive" lock.
		 */
		return (CKP_INELIGIBLE);
	} else if (PP_ISFREE(pp)) {
		/*
		 * It became free between the above check and our actually
		 * locking the page. Oh well, there will be other pages.
		 */
		page_unlock(pp);
		return (CKP_INELIGIBLE);
	}

	/*
	 * Reject pages that cannot be freed. The page_struct_lock
	 * need not be acquired to examine these
	 * fields since the page has an "exclusive" lock.
	 */
	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
		page_unlock(pp);
		return (CKP_INELIGIBLE);
	}

	/*
	 * Maintain statistics for what we are freeing
	 */
	if (pp->p_vnode != NULL) {
		if (pp->p_vnode->v_flag & VVMEXEC)
			isexec = 1;

		if (!IS_SWAPFSVP(pp->p_vnode))
			isfs = 1;
	}

	/*
	 * Turn off REF and MOD bits with the front hand.
	 * The back hand examines the REF bit and always considers
	 * SHARED pages as referenced.
	 */
	if (whichhand == POH_FRONT) {
		pagesync_flag = HAT_SYNC_ZERORM;
	} else {
		pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
		    HAT_SYNC_STOPON_SHARED;
	}

	ppattr = hat_pagesync(pp, pagesync_flag);

recheck:
	/*
	 * If the page is referenced, make it unreferenced but reclaimable.
	 * If it is not referenced, then it must be reclaimable and we can
	 * add it to the free list.
	 */
	if (ppattr & P_REF) {
		DTRACE_PROBE2(pageout__isref, page_t *, pp,
		    pageout_hand_t, whichhand);

		if (whichhand == POH_FRONT) {
			/*
			 * Checking of rss or madvise flags needed here...
			 *
			 * If not "well-behaved", fall through into the code
			 * for not referenced.
			 */
			hat_clrref(pp);
		}

		/*
		 * Somebody referenced the page since the front
		 * hand went by, so it's not a candidate for
		 * freeing up.
		 */
		page_unlock(pp);
		return (CKP_NOT_FREED);
	}

	VM_STAT_ADD(pageoutvmstats.checkpage[0]);

	/*
	 * If large page, attempt to demote it. If successfully demoted,
	 * retry the checkpage.
	 */
	if (pp->p_szc != 0) {
		if (!page_try_demote_pages(pp)) {
			VM_STAT_ADD(pageoutvmstats.checkpage[1]);
			page_unlock(pp);
			return (CKP_INELIGIBLE);
		}

		ASSERT(pp->p_szc == 0);
		VM_STAT_ADD(pageoutvmstats.checkpage[2]);

		/*
		 * Since page_try_demote_pages() could have unloaded some
		 * mappings it makes sense to reload ppattr.
		 */
		ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	}

	/*
	 * If the page is currently dirty, we have to arrange to have it
	 * cleaned before it can be freed.
	 *
	 * XXX - ASSERT(pp->p_vnode != NULL);
	 */
	if ((ppattr & P_MOD) && pp->p_vnode != NULL) {
		struct vnode *vp = pp->p_vnode;
		u_offset_t offset = pp->p_offset;

		/*
		 * XXX - Test for process being swapped out or about to exit?
		 * [Can't get back to process(es) using the page.]
		 */

		/*
		 * Hold the vnode before releasing the page lock to
		 * prevent it from being freed and re-used by some
		 * other thread.
		 */
		VN_HOLD(vp);
		page_unlock(pp);

		/*
		 * Queue I/O request for the pageout thread.
		 */
		if (!queue_io_request(vp, offset)) {
			VN_RELE(vp);
			return (CKP_NOT_FREED);
		}
		return (CKP_FREED);
	}

	/*
	 * Now we unload all the translations and put the page back on to the
	 * free list. If the page was used (referenced or modified) after the
	 * pagesync but before it was unloaded we catch it and handle the page
	 * properly.
	 */
	DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand);
	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) {
		goto recheck;
	}

	VN_DISPOSE(pp, B_FREE, 0, kcred);

	CPU_STATS_ADD_K(vm, dfree, 1);

	if (isfs) {
		if (isexec) {
			CPU_STATS_ADD_K(vm, execfree, 1);
		} else {
			CPU_STATS_ADD_K(vm, fsfree, 1);
		}
	} else {
		CPU_STATS_ADD_K(vm, anonfree, 1);
	}

	return (CKP_FREED);
}

/*
 * Queue async i/o request from pageout_scanner and segment swapout
 * routines on one common list. This ensures that pageout devices (swap)
 * are not saturated by pageout_scanner or swapout requests.
 * The pageout thread empties this list by initiating i/o operations.
 */
int
queue_io_request(vnode_t *vp, u_offset_t off)
{
	struct async_reqs *arg;

	/*
	 * If we cannot allocate an async request struct,
	 * skip this page.
	 */
	mutex_enter(&push_lock);
	if ((arg = req_freelist) == NULL) {
		mutex_exit(&push_lock);
		return (0);
	}
	req_freelist = arg->a_next;	/* adjust freelist */
	push_list_size++;

	arg->a_vp = vp;
	arg->a_off = off;
	arg->a_len = PAGESIZE;
	arg->a_flags = B_ASYNC | B_FREE;
	arg->a_cred = kcred;		/* always held */

	/*
	 * Add to list of pending write requests.
	 */
	arg->a_next = push_list;
	push_list = arg;

	if (req_freelist == NULL) {
		/*
		 * No free async requests left. The lock is held so we
		 * might as well signal the pusher thread now.
		 */
		cv_signal(&push_cv);
	}
	mutex_exit(&push_lock);
	return (1);
}

/*
 * Wakeup pageout to initiate i/o if push_list is not empty.
 */
void
cv_signal_pageout()
{
	if (push_list != NULL) {
		mutex_enter(&push_lock);
		cv_signal(&push_cv);
		mutex_exit(&push_lock);
	}
}
1466