/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - paged vnode.
 *
 * This file supplies vm support for the vnode operations that deal with pages.
 */
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/vmsystm.h>
#include <sys/mman.h>
#include <sys/vfs.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/rm.h>
#include <vm/pvn.h>
#include <vm/page.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <sys/fs/swapnode.h>

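/*
 * Tunables: when set non-zero, pvn_nofodklust disables klustering in
 * pvn_read_kluster() and pvn_write_noklust disables klustering in
 * pvn_write_kluster().
 */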
int pvn_nofodklust = 0;
int pvn_write_noklust = 0;

uint_t pvn_vmodsort_supported = 0;	/* set if HAT supports VMODSORT */
uint_t pvn_vmodsort_disable = 0;	/* set in /etc/system to disable HAT */
					/* support for vmodsort for testing */

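/* Cache of marker pages used by pvn_vplist_dirty() to walk v_pages lists */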
static struct kmem_cache *marker_cache = NULL;

/*
 * Find the largest contiguous run of pages containing `addr' (at file
 * offset `off') that stays within the file system limits (`vp_off' and
 * `vp_len') and the address space limits, for which no pages currently
 * exist and which map to consecutive file offsets.
 */
page_t *
pvn_read_kluster(
	struct vnode *vp,
	u_offset_t off,
	struct seg *seg,
	caddr_t addr,
	u_offset_t *offp,			/* return values */
	size_t *lenp,				/* return values */
	u_offset_t vp_off,
	size_t vp_len,
	int isra)
{
	ssize_t deltaf, deltab;
	page_t *pp;
	page_t *plist = NULL;
	spgcnt_t pagesavail;
	u_offset_t vp_end;

	ASSERT(off >= vp_off && off < vp_off + vp_len);

	/*
	 * We only want to do klustering/read ahead if there
	 * are more than minfree pages currently available.
	 */
	pagesavail = freemem - minfree;

	if (pagesavail <= 0)
		if (isra)
			return ((page_t *)NULL);    /* ra case - give up */
		else
			pagesavail = 1;		    /* must return a page */

	/* Calculate in pages instead of bytes to avoid 32-bit overflow */
	if (pagesavail < (spgcnt_t)btopr(vp_len)) {
		/*
		 * Don't have enough free memory for the
		 * max request, try sizing down vp request.
		 */
		deltab = (ssize_t)(off - vp_off);
		vp_len -= deltab;
		vp_off += deltab;
		if (pagesavail < btopr(vp_len)) {
			/*
			 * Still not enough memory, just settle for
			 * pagesavail which is at least 1.
			 */
			vp_len = ptob(pagesavail);
		}
	}

	vp_end = vp_off + vp_len;
	ASSERT(off >= vp_off && off < vp_end);

	if (isra && SEGOP_KLUSTER(seg, addr, 0))
		return ((page_t *)NULL);	/* segment driver says no */

	if ((plist = page_create_va(vp, off,
	    PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
		return ((page_t *)NULL);

	if (vp_len <= PAGESIZE || pvn_nofodklust) {
		*offp = off;
		*lenp = MIN(vp_len, PAGESIZE);
	} else {
		/*
		 * Scan back from front by incrementing "deltab" and
		 * comparing "off" with "vp_off + deltab" to avoid
		 * "signed" versus "unsigned" conversion problems.
		 */
		for (deltab = PAGESIZE; off >= vp_off + deltab;
		    deltab += PAGESIZE) {
			/*
			 * Call back to the segment driver to verify that
			 * the klustering/read ahead operation makes sense.
			 */
			if (SEGOP_KLUSTER(seg, addr, -deltab))
				break;		/* page not eligible */
			if ((pp = page_create_va(vp, off - deltab,
			    PAGESIZE, PG_EXCL, seg, addr - deltab))
			    == NULL)
				break;		/* already have the page */
			/*
			 * Add page to front of page list.
			 */
			page_add(&plist, pp);
		}
		deltab -= PAGESIZE;

		/* scan forward from front */
		for (deltaf = PAGESIZE; off + deltaf < vp_end;
		    deltaf += PAGESIZE) {
			/*
			 * Call back to the segment driver to verify that
			 * the klustering/read ahead operation makes sense.
			 */
			if (SEGOP_KLUSTER(seg, addr, deltaf))
				break;		/* page not file extension */
			if ((pp = page_create_va(vp, off + deltaf,
			    PAGESIZE, PG_EXCL, seg, addr + deltaf))
			    == NULL)
				break;		/* already have page */

			/*
			 * Add page to end of page list.
			 */
			page_add(&plist, pp);
			plist = plist->p_next;
		}
		*offp = off = off - deltab;
		*lenp = deltab + deltaf;
		ASSERT(off >= vp_off);

		/*
		 * If we ended up getting more than was actually
		 * requested, retract the returned length to only
		 * reflect what was requested.  This might happen
		 * if we were allowed to kluster pages across a
		 * span of (say) 5 frags, and frag size is less
		 * than PAGESIZE.  We need a whole number of
		 * pages to contain those frags, but the returned
		 * size should only allow the returned range to
		 * extend as far as the end of the frags.
		 */
		if ((vp_off + vp_len) < (off + *lenp)) {
			ASSERT(vp_end > off);
			*lenp = vp_end - off;
		}
	}
	TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
	    "pvn_read_kluster:seg %p addr %x isra %x",
	    seg, addr, isra);
	return (plist);
}

/*
 * Handle pages for this vnode on either side of the page "pp"
 * which has been locked by the caller.  This routine will also
 * do klustering in the range [vp_off, vp_off + vp_len] until it
 * hits a page that is not found.  The offset and length
 * of the pages included is returned in "*offp" and "*lenp".
 *
 * Returns a list of dirty locked pages all ready to be
 * written back.
 */
page_t *
pvn_write_kluster(
	struct vnode *vp,
	page_t *pp,
	u_offset_t *offp,		/* return values */
	size_t *lenp,			/* return values */
	u_offset_t vp_off,
	size_t vp_len,
	int flags)
{
	u_offset_t off;
	page_t *dirty;
	size_t deltab, deltaf;
	se_t se;
	u_offset_t vp_end;

	off = pp->p_offset;

	/*
	 * Klustering should not be done if we are invalidating
	 * pages since we could destroy pages that belong to
	 * some other process if this is a swap vnode.
	 */
	if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
		*offp = off;
		*lenp = PAGESIZE;
		return (pp);
	}

	if (flags & (B_FREE | B_INVAL))
		se = SE_EXCL;
	else
		se = SE_SHARED;

	dirty = pp;
	/*
	 * Scan backwards looking for pages to kluster by incrementing
	 * "deltab" and comparing "off" with "vp_off + deltab" to
	 * avoid "signed" versus "unsigned" conversion problems.
	 */
	for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
		pp = page_lookup_nowait(vp, off - deltab, se);
		if (pp == NULL)
			break;		/* page not found */
		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
			break;
		page_add(&dirty, pp);
	}
	deltab -= PAGESIZE;

	vp_end = vp_off + vp_len;
	/* now scan forwards looking for pages to kluster */
	for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
		pp = page_lookup_nowait(vp, off + deltaf, se);
		if (pp == NULL)
			break;		/* page not found */
		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
			break;
		page_add(&dirty, pp);
		dirty = dirty->p_next;
	}

	*offp = off - deltab;
	*lenp = deltab + deltaf;
	return (dirty);
}

/*
 * Generic entry point used to release the "shared/exclusive" lock
 * and the "p_iolock" on pages after i/o is complete.
 */
void
pvn_io_done(page_t *plist)
{
	page_t *pp;

	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		page_unlock(pp);
	}
}

/*
 * Entry point to be used by file system getpage subr's and
 * other such routines which either want to unlock pages (B_ASYNC
 * request) or destroy a list of pages if an error occurred.
 */
void
pvn_read_done(page_t *plist, int flags)
{
	page_t *pp;

	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		if (flags & B_ERROR) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else {
			(void) page_release(pp, 0);
		}
	}
}

/*
 * Automagic pageout.
 * When memory gets tight, start freeing pages popping out of the
 * write queue.
 */
int	write_free = 1;
pgcnt_t	pages_before_pager = 200;	/* LMXXX */

/*
 * Routine to be called when page-outs complete.
 * The caller, typically VOP_PUTPAGE, has to explicitly call this routine
 * after waiting for i/o to complete (biowait) to free the list of
 * pages associated with the buffer.  These pages must be locked
 * before i/o is initiated.
 *
 * If a write error occurs, the pages are marked as modified
 * so the write will be retried later.
 */

void
pvn_write_done(page_t *plist, int flags)
{
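	/*
	 * Per-call counters; these are folded into the per-CPU vm
	 * statistics once the page list has been processed.
	 */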
	int dfree = 0;
	int pgrec = 0;
	int pgout = 0;
	int pgpgout = 0;
	int anonpgout = 0;
	int anonfree = 0;
	int fspgout = 0;
	int fsfree = 0;
	int execpgout = 0;
	int execfree = 0;
	page_t *pp;
	struct cpu *cpup;
	struct vnode *vp = NULL;	/* for probe */
	uint_t ppattr;
	kmutex_t *vphm = NULL;

	ASSERT((flags & B_READ) == 0);

	/*
	 * If we are about to start paging anyway, start freeing pages.
	 */
	if (write_free && freemem < lotsfree + pages_before_pager &&
	    (flags & B_ERROR) == 0) {
		flags |= B_FREE;
	}

	/*
	 * Handle each page involved in the i/o operation.
	 */
	while (plist != NULL) {
		pp = plist;
		ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
		page_sub(&plist, pp);

		/* Kernel probe support */
		if (vp == NULL)
			vp = pp->p_vnode;

		if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
			/*
			 * Move page to the top of the v_pages list.
			 * Skip pages modified during IO.
			 */
			vphm = page_vnode_mutex(vp);
			mutex_enter(vphm);
			if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
				page_vpsub(&vp->v_pages, pp);
				page_vpadd(&vp->v_pages, pp);
			}
			mutex_exit(vphm);
		}

		if (flags & B_ERROR) {
			/*
			 * Write operation failed.  We don't want
			 * to destroy (or free) the page unless B_FORCE
			 * is set.  We set the mod bit again and release
			 * all locks on the page so that it will get written
			 * back again later when things are hopefully
			 * better.
			 * If B_INVAL and B_FORCE are both set we really have
			 * to destroy the page.
			 */
			if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
				page_io_unlock(pp);
				/*LINTED: constant in conditional context*/
				VN_DISPOSE(pp, B_INVAL, 0, kcred);
			} else {
				hat_setmod_only(pp);
				page_io_unlock(pp);
				page_unlock(pp);
			}
		} else if (flags & B_INVAL) {
			/*
			 * XXX - Failed writes with B_INVAL set are
			 * not handled appropriately.
			 */
			page_io_unlock(pp);
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else if ((flags & B_FREE) || !hat_page_is_mapped(pp)) {
			/*
			 * Update statistics for pages being paged out
			 */
			if (pp->p_vnode) {
				if (IS_SWAPFSVP(pp->p_vnode)) {
					anonpgout++;
				} else {
					if (pp->p_vnode->v_flag & VVMEXEC) {
						execpgout++;
					} else {
						fspgout++;
					}
				}
			}
			page_io_unlock(pp);
			pgout = 1;
			pgpgout++;
			TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
			    "page_ws_out:pp %p", pp);

			/*
			 * The page_struct_lock need not be acquired to
			 * examine "p_lckcnt" and "p_cowcnt" since we'll
			 * have an "exclusive" lock if the upgrade succeeds.
			 */
			if (page_tryupgrade(pp) &&
			    pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
				/*
				 * Check if someone has reclaimed the
				 * page.  If ref and mod are not set, no
				 * one is using it so we can free it.
				 * The rest of the system is careful
				 * to use the NOSYNC flag to unload
				 * translations set up for i/o w/o
				 * affecting ref and mod bits.
				 *
				 * Obtain a copy of the real hardware
				 * mod bit using hat_pagesync(pp,
				 * HAT_SYNC_DONTZERO) to avoid having to
				 * flush the cache.
				 */
				ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
				    HAT_SYNC_STOPON_MOD);
			ck_refmod:
				if (!(ppattr & (P_REF | P_MOD))) {
					if (hat_page_is_mapped(pp)) {
						/*
						 * Doesn't look like the page
						 * was modified so now we
						 * really have to unload the
						 * translations.  Meanwhile
						 * another CPU could've
						 * modified it so we have to
						 * check again.  We don't loop
						 * forever here because now
						 * the translations are gone
						 * and no one can get a new one
						 * since we have the "exclusive"
						 * lock on the page.
						 */
						(void) hat_pageunload(pp,
						    HAT_FORCE_PGUNLOAD);
						ppattr = hat_page_getattr(pp,
						    P_REF | P_MOD);
						goto ck_refmod;
					}
					/*
					 * Update statistics for pages being
					 * freed
					 */
					if (pp->p_vnode) {
						if (IS_SWAPFSVP(pp->p_vnode)) {
							anonfree++;
						} else {
							if (pp->p_vnode->v_flag
							    & VVMEXEC) {
								execfree++;
							} else {
								fsfree++;
							}
						}
					}
					/*LINTED: constant in conditional ctx*/
					VN_DISPOSE(pp, B_FREE,
					    (flags & B_DONTNEED), kcred);
					dfree++;
				} else {
					page_unlock(pp);
					pgrec++;
					TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
					    "page_ws_free:pp %p", pp);
				}
			} else {
				/*
				 * Page is either `locked' in memory
				 * or was reclaimed and now has a
				 * "shared" lock, so release it.
				 */
				page_unlock(pp);
			}
		} else {
			/*
			 * Neither B_FREE nor B_INVAL nor B_ERROR.
			 * Just release locks.
			 */
			page_io_unlock(pp);
			page_unlock(pp);
		}
	}

	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get cpup now that CPU cannot change */
	CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
	CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
	CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
	CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
	CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
	CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
	CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
	CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
	CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
	CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
	CPU_STATS_EXIT_K();

	/* Kernel probe */
	TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
	    tnf_opaque,	vnode,			vp,
	    tnf_ulong,	pages_pageout,		pgpgout,
	    tnf_ulong,	pages_freed,		dfree,
	    tnf_ulong,	pages_reclaimed,	pgrec);
}

/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
 * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
 * operation and is only to be considered if it doesn't involve any
 * waiting here.  B_TRUNC indicates that the file is being truncated
 * and so no i/o needs to be done.  B_FORCE indicates that the page
 * must be destroyed so don't try writing it out.
 *
 * The caller must ensure that the page is locked.  Returns 1 if
 * the page should be written back (the "iolock" is held in this
 * case), or 0 if the page has been dealt with or has been
 * unlocked.
 */
int
pvn_getdirty(page_t *pp, int flags)
{
	ASSERT((flags & (B_INVAL | B_FREE)) ?
	    PAGE_EXCL(pp) : PAGE_SHARED(pp));
	ASSERT(PP_ISFREE(pp) == 0);

	/*
	 * If trying to invalidate or free a logically `locked' page,
	 * forget it.  Don't need page_struct_lock to check p_lckcnt and
	 * p_cowcnt as the page is exclusively locked.
	 */
	if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
	    (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
		page_unlock(pp);
		return (0);
	}

	/*
	 * Now acquire the i/o lock so we can add it to the dirty
	 * list (if necessary).  We avoid blocking on the i/o lock
	 * in the following cases:
	 *
	 *	If B_DELWRI is set, which implies that this request is
	 *	due to a klustering operation.
	 *
	 *	If this is an async (B_ASYNC) operation and we are not doing
	 *	invalidation (B_INVAL) [The current i/o or fsflush will ensure
	 *	that the page is written out].
	 */
	if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
		if (!page_io_trylock(pp)) {
			page_unlock(pp);
			return (0);
		}
	} else {
		page_io_lock(pp);
	}

	/*
	 * If we want to free or invalidate the page then
	 * we need to unload it so that anyone who wants
	 * it will have to take a minor fault to get it.
	 * Otherwise, we're just writing the page back so we
	 * need to sync up the hardware and software mod bit to
	 * detect any future modifications.  We clear the
	 * software mod bit when we put the page on the dirty
	 * list.
	 */
	if (flags & (B_INVAL | B_FREE)) {
		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	} else {
		(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
	}

	if (!hat_ismod(pp) || (flags & B_TRUNC)) {
		/*
		 * Don't need to add it to the
		 * list after all.
		 */
		page_io_unlock(pp);
		if (flags & B_INVAL) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else if (flags & B_FREE) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
		} else {
			/*
			 * This is the advisory path for the callers
			 * of VOP_PUTPAGE() who prefer freeing the
			 * page _only_ if no one else is accessing it,
			 * e.g. segmap_release().
			 *
			 * The above hat_ismod() check is useless because:
			 * (1) we may not be holding SE_EXCL lock;
			 * (2) we've not unloaded _all_ translations
			 *
			 * Let page_release() do the heavy-lifting.
			 */
			(void) page_release(pp, 1);
		}
		return (0);
	}

	/*
	 * Page is dirty, get it ready for the write back
	 * and add page to the dirty list.
	 */
	hat_clrrefmod(pp);

	/*
	 * If we're going to free the page when we're done
	 * then we can let others try to use it starting now.
	 * We'll detect the fact that they used it when the
	 * i/o is done and avoid freeing the page.
	 */
	if (flags & B_FREE)
		page_downgrade(pp);

	TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);

	return (1);
}

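/*
 * Constructor for vplist marker pages: zeroed page_t's tagged via p_hash
 * (PVN_VPLIST_HASH_TAG) so that list walkers such as pvn_vplist_setdirty()
 * can recognize and skip them.
 */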
/*ARGSUSED*/
static int
marker_constructor(void *buf, void *cdrarg, int kmflags)
{
	page_t *mark = buf;
	bzero(mark, sizeof (page_t));
	mark->p_hash = PVN_VPLIST_HASH_TAG;
	return (0);
}

void
pvn_init()
{
	if (pvn_vmodsort_disable == 0)
		pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
	marker_cache = kmem_cache_create("marker_cache",
	    sizeof (page_t), 0, marker_constructor,
	    NULL, NULL, NULL, NULL, 0);
}

/*
 * Process a vnode's page list for all pages whose offset is >= off.
 * Pages are either to be freed, invalidated, or written back to disk.
 *
 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 * is specified, otherwise they are "shared" locked.
 *
 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 *
 * Special marker page_t's are inserted in the list in order
 * to keep track of where we are in the list when locks are dropped.
 *
 * Note the list is circular and insertions can happen only at the
 * head and tail of the list. The algorithm ensures visiting all pages
 * on the list in the following way:
 *
 *    Drop two marker pages at the end of the list.
 *
 *    Move one marker page backwards towards the start of the list until
 *    it is at the list head, processing the pages passed along the way.
 *
 *    Due to race conditions when the vphm mutex is dropped, additional pages
 *    can be added to either end of the list, so we'll continue to move
 *    the marker and process pages until it is up against the end marker.
 *
 * There is one special exit condition. If we are processing a VMODSORT
 * vnode and only writing back modified pages, we can stop as soon as
 * we run into an unmodified page.  This makes fsync(3) operations fast.
 */
int
pvn_vplist_dirty(
	vnode_t		*vp,
	u_offset_t	off,
	int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
			size_t *, int, cred_t *),
	int		flags,
	cred_t		*cred)
{
	page_t		*pp;
	page_t		*mark;		/* marker page that moves toward head */
	page_t		*end;		/* marker page at end of list */
	int		err = 0;
	int		error;
	kmutex_t	*vphm;
	se_t		se;
	page_t		**where_to_move;

	ASSERT(vp->v_type != VCHR);

	if (vp->v_pages == NULL)
		return (0);

	/*
	 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
	 *
	 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
	 * from getting blocked while flushing pages to a dead NFS server.
	 */
	mutex_enter(&vp->v_lock);
	if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
		mutex_exit(&vp->v_lock);
		return (EAGAIN);
	}

	while (vp->v_flag & VVMLOCK)
		cv_wait(&vp->v_cv, &vp->v_lock);

	if (vp->v_pages == NULL) {
		mutex_exit(&vp->v_lock);
		return (0);
	}

	vp->v_flag |= VVMLOCK;
	mutex_exit(&vp->v_lock);

	/*
	 * Set up the marker pages used to walk the list.
	 */
	end = kmem_cache_alloc(marker_cache, KM_SLEEP);
	end->p_vnode = vp;
	end->p_offset = (u_offset_t)-2;
	mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
	mark->p_vnode = vp;
	mark->p_offset = (u_offset_t)-1;

	/*
	 * Grab the lock protecting the vnode's page list;
	 * note that this lock is dropped at times in the loop.
	 */
	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);
	if (vp->v_pages == NULL)
		goto leave;

	/*
	 * Insert the markers and loop through the list of pages.
	 */
	page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
	page_vpadd(&mark->p_vpnext, end);
	for (;;) {

		/*
		 * If only doing an async write back, then we can
		 * stop as soon as we get to the start of the list.
		 */
		if (flags == B_ASYNC && vp->v_pages == mark)
			break;

		/*
		 * otherwise stop when we've gone through all the pages
		 */
		if (mark->p_vpprev == end)
			break;

		pp = mark->p_vpprev;
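		/*
		 * Remember where "mark" should be re-inserted once "pp"
		 * has been examined: just in front of "pp", so the marker
		 * steps backwards through the list.
		 */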
		if (vp->v_pages == pp)
			where_to_move = &vp->v_pages;
		else
			where_to_move = &pp->p_vpprev->p_vpnext;

		ASSERT(pp->p_vnode == vp);

		/*
		 * If we are just flushing dirty pages to disk and this vnode
		 * is using a sorted list of pages, we can stop processing
		 * as soon as we find an unmodified page, since all the
		 * modified pages are visited first.
		 */
		if (IS_VMODSORT(vp) &&
		    !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
			if (!hat_ismod(pp) && !page_io_locked(pp)) {
#ifdef  DEBUG
				/*
				 * For debug kernels examine what should be
				 * all the remaining clean pages, asserting
				 * that they are not modified.
				 */
				page_t	*chk = pp;
				int	attr;

				page_vpsub(&vp->v_pages, mark);
				page_vpadd(where_to_move, mark);
				do {
					chk = chk->p_vpprev;
					ASSERT(chk != end);
					if (chk == mark)
						continue;
					attr = hat_page_getattr(chk, P_MOD |
					    P_REF);
					if ((attr & P_MOD) == 0)
						continue;
					panic("v_pages list not all clean: "
					    "page_t*=%p vnode=%p off=%lx "
					    "attr=0x%x last clean page_t*=%p\n",
					    (void *)chk, (void *)chk->p_vnode,
					    (long)chk->p_offset, attr,
					    (void *)pp);
				} while (chk != vp->v_pages);
#endif
				break;
			} else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
				/*
				 * Couldn't get the io lock, wait until IO is
				 * done.  Block only for sync IO since we
				 * don't want to block async IO.
				 */
				mutex_exit(vphm);
				page_io_wait(pp);
				mutex_enter(vphm);
				continue;
			}
		}

		/*
		 * Skip this page if the offset is out of the desired range.
		 * Just move the marker and continue.
		 */
		if (pp->p_offset < off) {
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
			continue;
		}

		/*
		 * If we are supposed to invalidate or free this
		 * page, then we need an exclusive lock.
		 */
		se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		/*
		 * We must acquire the page lock for all synchronous
		 * operations (invalidate, free and write).
		 */
		if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
			/*
			 * If the page_lock() drops the mutex
			 * we must retry the loop.
			 */
			if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
				continue;

			/*
			 * It's ok to move the marker page now.
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
		} else {

			/*
			 * Update the marker page for all remaining cases.
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);

			/*
			 * For write backs, if we can't lock the page, it's
			 * invalid or in the process of being destroyed.  Skip
			 * it, assuming someone else is writing it.
			 */
			if (!page_trylock(pp, se))
				continue;
		}

		ASSERT(pp->p_vnode == vp);

		/*
		 * Successfully locked the page, now figure out what to
		 * do with it.  Free pages are easily dealt with, invalidate
		 * if desired or just go on to the next page.
		 */
		if (PP_ISFREE(pp)) {
			if ((flags & B_INVAL) == 0) {
				page_unlock(pp);
				continue;
			}

			/*
			 * Invalidate (destroy) the page.
			 */
			mutex_exit(vphm);
			page_destroy_free(pp);
			mutex_enter(vphm);
			continue;
		}

		/*
		 * pvn_getdirty() figures out what to do with a dirty page.
		 * If the page is dirty, the putapage() routine will write it
		 * and will kluster any other adjacent dirty pages it can.
		 *
		 * pvn_getdirty() and `(*putapage)' unlock the page.
		 */
		mutex_exit(vphm);
		if (pvn_getdirty(pp, flags)) {
			error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
			if (!err)
				err = error;
		}
		mutex_enter(vphm);
	}
	page_vpsub(&vp->v_pages, mark);
	page_vpsub(&vp->v_pages, end);

leave:
	/*
	 * Release the v_pages mutex and VVMLOCK, and wake up blocked threads.
	 */
	mutex_exit(vphm);
	kmem_cache_free(marker_cache, mark);
	kmem_cache_free(marker_cache, end);
	mutex_enter(&vp->v_lock);
	vp->v_flag &= ~VVMLOCK;
	cv_broadcast(&vp->v_cv);
	mutex_exit(&vp->v_lock);
	return (err);
}

/*
 * Walk the vp->v_pages list; for every page call the callback function
 * pointed to by page_check. If page_check returns non-zero, then mark the
 * page as modified and, if VMODSORT is set, move it to the end of the
 * v_pages list. Moving makes sense only if we have at least two pages;
 * this also avoids v_pages temporarily becoming NULL after the call to
 * page_vpsub() if there was just one page.
 */
void
pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
{
	page_t	*pp, *next, *end;
	kmutex_t	*vphm;
	int	shuffle;

	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);

	if (vp->v_pages == NULL) {
		mutex_exit(vphm);
		return;
	}

	end = vp->v_pages->p_vpprev;
	shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
	pp = vp->v_pages;

	for (;;) {
		next = pp->p_vpnext;
		if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
			/*
			 * hat_setmod_only(), in contrast to hat_setmod(),
			 * does not shuffle the pages and does not grab the
			 * page_vnode_mutex.  Exactly what we need.
			 */
			hat_setmod_only(pp);
			if (shuffle) {
				page_vpsub(&vp->v_pages, pp);
				ASSERT(vp->v_pages != NULL);
				page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
				    pp);
			}
		}
		/* Stop if we have just processed the last page. */
		if (pp == end)
			break;
		pp = next;
	}

	mutex_exit(vphm);
}

/*
 * Zero out zbytes worth of data. Caller should be aware that this
 * routine may enter back into the fs layer (xxx_getpage). Locks
 * that the xxx_getpage routine may need should not be held while
 * calling this.
 */
void
pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
{
	caddr_t addr;

	ASSERT(vp->v_type != VCHR);

	if (vp->v_pages == NULL)
		return;

	/*
	 * zbytes may be zero but there still may be some portion of
	 * a page which needs clearing (since zbytes is a function
	 * of filesystem block size, not pagesize.)
	 */
	if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
		return;

	/*
	 * We get the last page and handle the partial
	 * zeroing via kernel mappings.  This will make the page
	 * dirty so that we know that when this page is written
	 * back, the zeroed information will go out with it.  If
	 * the page is not currently in memory, then the kzero
	 * operation will cause it to be brought in.  We use kzero
	 * instead of bzero so that if the page cannot be read in
	 * for any reason, the system will not panic.  We need
	 * to zero out at least the zbytes given by the fs, but we
	 * might also have to do more to get the entire last page.
	 */

	if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
		panic("pvn_vptrunc zbytes");
	addr = segmap_getmapflt(segkmap, vp, vplen,
	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
	(void) kzero(addr + (vplen & MAXBOFFSET),
	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
	(void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
}

/*
 * Handles common work of the VOP_GETPAGE routines by iterating page by page,
 * calling the getpage helper for each.
 */
int
pvn_getpages(
	int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
		size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	uint_t *protp,
	page_t *pl[],
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cred)
{
	page_t **ppp;
	u_offset_t o, eoff;
	size_t sz, xlen;
	int err;

	/* ensure that we have enough space */
	ASSERT(pl == NULL || plsz >= len);

	/*
	 * Loop one page at a time and let the getapage function fill
	 * in the next page in the array.  We only allow one page to be
	 * returned at a time (except for the last page) so that we
	 * don't have any problems with duplicates and other such
	 * painful problems.  This is a very simple minded algorithm,
	 * but it does the job correctly.  We hope that the cost of a
	 * getapage call for a resident page that we might have been
	 * able to get from an earlier call doesn't cost too much.
	 */
	ppp = pl;
	sz = (pl != NULL) ? PAGESIZE : 0;
	eoff = off + len;
	xlen = len;
	for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
	    xlen -= PAGESIZE) {
		if (o + PAGESIZE >= eoff && pl != NULL) {
			/*
			 * Last time through - allow all of
			 * what's left of the pl[] array to be used.
			 */
			sz = plsz - (o - off);
		}
		err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
		    rw, cred);
		if (err) {
			/*
			 * Release any pages we already got.
			 */
			if (o > off && pl != NULL) {
				for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
					(void) page_release(*ppp, 1);
			}
			break;
		}
		if (pl != NULL)
			ppp++;
	}
	return (err);
}

/*
 * Initialize the page list array.
 */
/*ARGSUSED*/
void
pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
    u_offset_t off, size_t io_len, enum seg_rw rw)
{
	ssize_t sz;
	page_t *ppcur, **ppp;

	/*
	 * Set up to load plsz worth
	 * starting at the needed page.
	 */
	while (pp != NULL && pp->p_offset != off) {
		/*
		 * Remove page from the i/o list,
		 * release the i/o and the page lock.
		 */
		ppcur = pp;
		page_sub(&pp, ppcur);
		page_io_unlock(ppcur);
		(void) page_release(ppcur, 1);
	}

	if (pp == NULL) {
		pl[0] = NULL;
		return;
	}

	sz = plsz;

	/*
	 * Initialize the page list array.
	 */
	ppp = pl;
	do {
		ppcur = pp;
		*ppp++ = ppcur;
		page_sub(&pp, ppcur);
		page_io_unlock(ppcur);
		if (rw != S_CREATE)
			page_downgrade(ppcur);
		sz -= PAGESIZE;
	} while (sz > 0 && pp != NULL);
	*ppp = NULL;		/* terminate list */

	/*
	 * Now free the remaining pages that weren't
	 * loaded in the page list.
	 */
	while (pp != NULL) {
		ppcur = pp;
		page_sub(&pp, ppcur);
		page_io_unlock(ppcur);
		(void) page_release(ppcur, 1);
	}
}