/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
*/

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * VM - paged vnode.
 *
 * This file supplies vm support for the vnode operations that deal with pages.
46*7c478bd9Sstevel@tonic-gate */ 47*7c478bd9Sstevel@tonic-gate #include <sys/types.h> 48*7c478bd9Sstevel@tonic-gate #include <sys/t_lock.h> 49*7c478bd9Sstevel@tonic-gate #include <sys/param.h> 50*7c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h> 51*7c478bd9Sstevel@tonic-gate #include <sys/systm.h> 52*7c478bd9Sstevel@tonic-gate #include <sys/time.h> 53*7c478bd9Sstevel@tonic-gate #include <sys/buf.h> 54*7c478bd9Sstevel@tonic-gate #include <sys/vnode.h> 55*7c478bd9Sstevel@tonic-gate #include <sys/uio.h> 56*7c478bd9Sstevel@tonic-gate #include <sys/vmmeter.h> 57*7c478bd9Sstevel@tonic-gate #include <sys/vmsystm.h> 58*7c478bd9Sstevel@tonic-gate #include <sys/mman.h> 59*7c478bd9Sstevel@tonic-gate #include <sys/vfs.h> 60*7c478bd9Sstevel@tonic-gate #include <sys/cred.h> 61*7c478bd9Sstevel@tonic-gate #include <sys/user.h> 62*7c478bd9Sstevel@tonic-gate #include <sys/kmem.h> 63*7c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h> 64*7c478bd9Sstevel@tonic-gate #include <sys/debug.h> 65*7c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h> 66*7c478bd9Sstevel@tonic-gate #include <sys/vtrace.h> 67*7c478bd9Sstevel@tonic-gate #include <sys/tnf_probe.h> 68*7c478bd9Sstevel@tonic-gate 69*7c478bd9Sstevel@tonic-gate #include <vm/hat.h> 70*7c478bd9Sstevel@tonic-gate #include <vm/as.h> 71*7c478bd9Sstevel@tonic-gate #include <vm/seg.h> 72*7c478bd9Sstevel@tonic-gate #include <vm/rm.h> 73*7c478bd9Sstevel@tonic-gate #include <vm/pvn.h> 74*7c478bd9Sstevel@tonic-gate #include <vm/page.h> 75*7c478bd9Sstevel@tonic-gate #include <vm/seg_map.h> 76*7c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h> 77*7c478bd9Sstevel@tonic-gate #include <sys/fs/swapnode.h> 78*7c478bd9Sstevel@tonic-gate 79*7c478bd9Sstevel@tonic-gate int pvn_nofodklust = 0; 80*7c478bd9Sstevel@tonic-gate int pvn_write_noklust = 0; 81*7c478bd9Sstevel@tonic-gate 82*7c478bd9Sstevel@tonic-gate uint_t pvn_vmodsort_supported = 0; /* set if HAT supports VMODSORT */ 83*7c478bd9Sstevel@tonic-gate uint_t pvn_vmodsort_disable = 0; /* set in 
/etc/system to disable HAT */ 84*7c478bd9Sstevel@tonic-gate /* support for vmodsort for testing */ 85*7c478bd9Sstevel@tonic-gate 86*7c478bd9Sstevel@tonic-gate static struct kmem_cache *marker_cache = NULL; 87*7c478bd9Sstevel@tonic-gate 88*7c478bd9Sstevel@tonic-gate /* 89*7c478bd9Sstevel@tonic-gate * Find the largest contiguous block which contains `addr' for file offset 90*7c478bd9Sstevel@tonic-gate * `offset' in it while living within the file system block sizes (`vp_off' 91*7c478bd9Sstevel@tonic-gate * and `vp_len') and the address space limits for which no pages currently 92*7c478bd9Sstevel@tonic-gate * exist and which map to consecutive file offsets. 93*7c478bd9Sstevel@tonic-gate */ 94*7c478bd9Sstevel@tonic-gate page_t * 95*7c478bd9Sstevel@tonic-gate pvn_read_kluster( 96*7c478bd9Sstevel@tonic-gate struct vnode *vp, 97*7c478bd9Sstevel@tonic-gate u_offset_t off, 98*7c478bd9Sstevel@tonic-gate struct seg *seg, 99*7c478bd9Sstevel@tonic-gate caddr_t addr, 100*7c478bd9Sstevel@tonic-gate u_offset_t *offp, /* return values */ 101*7c478bd9Sstevel@tonic-gate size_t *lenp, /* return values */ 102*7c478bd9Sstevel@tonic-gate u_offset_t vp_off, 103*7c478bd9Sstevel@tonic-gate size_t vp_len, 104*7c478bd9Sstevel@tonic-gate int isra) 105*7c478bd9Sstevel@tonic-gate { 106*7c478bd9Sstevel@tonic-gate ssize_t deltaf, deltab; 107*7c478bd9Sstevel@tonic-gate page_t *pp; 108*7c478bd9Sstevel@tonic-gate page_t *plist = NULL; 109*7c478bd9Sstevel@tonic-gate spgcnt_t pagesavail; 110*7c478bd9Sstevel@tonic-gate u_offset_t vp_end; 111*7c478bd9Sstevel@tonic-gate 112*7c478bd9Sstevel@tonic-gate ASSERT(off >= vp_off && off < vp_off + vp_len); 113*7c478bd9Sstevel@tonic-gate 114*7c478bd9Sstevel@tonic-gate /* 115*7c478bd9Sstevel@tonic-gate * We only want to do klustering/read ahead if there 116*7c478bd9Sstevel@tonic-gate * is more than minfree pages currently available. 
117*7c478bd9Sstevel@tonic-gate */ 118*7c478bd9Sstevel@tonic-gate pagesavail = freemem - minfree; 119*7c478bd9Sstevel@tonic-gate 120*7c478bd9Sstevel@tonic-gate if (pagesavail <= 0) 121*7c478bd9Sstevel@tonic-gate if (isra) 122*7c478bd9Sstevel@tonic-gate return ((page_t *)NULL); /* ra case - give up */ 123*7c478bd9Sstevel@tonic-gate else 124*7c478bd9Sstevel@tonic-gate pagesavail = 1; /* must return a page */ 125*7c478bd9Sstevel@tonic-gate 126*7c478bd9Sstevel@tonic-gate /* We calculate in pages instead of bytes due to 32-bit overflows */ 127*7c478bd9Sstevel@tonic-gate if (pagesavail < (spgcnt_t)btopr(vp_len)) { 128*7c478bd9Sstevel@tonic-gate /* 129*7c478bd9Sstevel@tonic-gate * Don't have enough free memory for the 130*7c478bd9Sstevel@tonic-gate * max request, try sizing down vp request. 131*7c478bd9Sstevel@tonic-gate */ 132*7c478bd9Sstevel@tonic-gate deltab = (ssize_t)(off - vp_off); 133*7c478bd9Sstevel@tonic-gate vp_len -= deltab; 134*7c478bd9Sstevel@tonic-gate vp_off += deltab; 135*7c478bd9Sstevel@tonic-gate if (pagesavail < btopr(vp_len)) { 136*7c478bd9Sstevel@tonic-gate /* 137*7c478bd9Sstevel@tonic-gate * Still not enough memory, just settle for 138*7c478bd9Sstevel@tonic-gate * pagesavail which is at least 1. 
139*7c478bd9Sstevel@tonic-gate */ 140*7c478bd9Sstevel@tonic-gate vp_len = ptob(pagesavail); 141*7c478bd9Sstevel@tonic-gate } 142*7c478bd9Sstevel@tonic-gate } 143*7c478bd9Sstevel@tonic-gate 144*7c478bd9Sstevel@tonic-gate vp_end = vp_off + vp_len; 145*7c478bd9Sstevel@tonic-gate ASSERT(off >= vp_off && off < vp_end); 146*7c478bd9Sstevel@tonic-gate 147*7c478bd9Sstevel@tonic-gate if (isra && SEGOP_KLUSTER(seg, addr, 0)) 148*7c478bd9Sstevel@tonic-gate return ((page_t *)NULL); /* segment driver says no */ 149*7c478bd9Sstevel@tonic-gate 150*7c478bd9Sstevel@tonic-gate if ((plist = page_create_va(vp, off, 151*7c478bd9Sstevel@tonic-gate PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL) 152*7c478bd9Sstevel@tonic-gate return ((page_t *)NULL); 153*7c478bd9Sstevel@tonic-gate 154*7c478bd9Sstevel@tonic-gate if (vp_len <= PAGESIZE || pvn_nofodklust) { 155*7c478bd9Sstevel@tonic-gate *offp = off; 156*7c478bd9Sstevel@tonic-gate *lenp = MIN(vp_len, PAGESIZE); 157*7c478bd9Sstevel@tonic-gate } else { 158*7c478bd9Sstevel@tonic-gate /* 159*7c478bd9Sstevel@tonic-gate * Scan back from front by incrementing "deltab" and 160*7c478bd9Sstevel@tonic-gate * comparing "off" with "vp_off + deltab" to avoid 161*7c478bd9Sstevel@tonic-gate * "signed" versus "unsigned" conversion problems. 162*7c478bd9Sstevel@tonic-gate */ 163*7c478bd9Sstevel@tonic-gate for (deltab = PAGESIZE; off >= vp_off + deltab; 164*7c478bd9Sstevel@tonic-gate deltab += PAGESIZE) { 165*7c478bd9Sstevel@tonic-gate /* 166*7c478bd9Sstevel@tonic-gate * Call back to the segment driver to verify that 167*7c478bd9Sstevel@tonic-gate * the klustering/read ahead operation makes sense. 
168*7c478bd9Sstevel@tonic-gate */ 169*7c478bd9Sstevel@tonic-gate if (SEGOP_KLUSTER(seg, addr, -deltab)) 170*7c478bd9Sstevel@tonic-gate break; /* page not eligible */ 171*7c478bd9Sstevel@tonic-gate if ((pp = page_create_va(vp, off - deltab, 172*7c478bd9Sstevel@tonic-gate PAGESIZE, PG_EXCL, seg, addr - deltab)) 173*7c478bd9Sstevel@tonic-gate == NULL) 174*7c478bd9Sstevel@tonic-gate break; /* already have the page */ 175*7c478bd9Sstevel@tonic-gate /* 176*7c478bd9Sstevel@tonic-gate * Add page to front of page list. 177*7c478bd9Sstevel@tonic-gate */ 178*7c478bd9Sstevel@tonic-gate page_add(&plist, pp); 179*7c478bd9Sstevel@tonic-gate } 180*7c478bd9Sstevel@tonic-gate deltab -= PAGESIZE; 181*7c478bd9Sstevel@tonic-gate 182*7c478bd9Sstevel@tonic-gate /* scan forward from front */ 183*7c478bd9Sstevel@tonic-gate for (deltaf = PAGESIZE; off + deltaf < vp_end; 184*7c478bd9Sstevel@tonic-gate deltaf += PAGESIZE) { 185*7c478bd9Sstevel@tonic-gate /* 186*7c478bd9Sstevel@tonic-gate * Call back to the segment driver to verify that 187*7c478bd9Sstevel@tonic-gate * the klustering/read ahead operation makes sense. 188*7c478bd9Sstevel@tonic-gate */ 189*7c478bd9Sstevel@tonic-gate if (SEGOP_KLUSTER(seg, addr, deltaf)) 190*7c478bd9Sstevel@tonic-gate break; /* page not file extension */ 191*7c478bd9Sstevel@tonic-gate if ((pp = page_create_va(vp, off + deltaf, 192*7c478bd9Sstevel@tonic-gate PAGESIZE, PG_EXCL, seg, addr + deltaf)) 193*7c478bd9Sstevel@tonic-gate == NULL) 194*7c478bd9Sstevel@tonic-gate break; /* already have page */ 195*7c478bd9Sstevel@tonic-gate 196*7c478bd9Sstevel@tonic-gate /* 197*7c478bd9Sstevel@tonic-gate * Add page to end of page list. 
198*7c478bd9Sstevel@tonic-gate */ 199*7c478bd9Sstevel@tonic-gate page_add(&plist, pp); 200*7c478bd9Sstevel@tonic-gate plist = plist->p_next; 201*7c478bd9Sstevel@tonic-gate } 202*7c478bd9Sstevel@tonic-gate *offp = off = off - deltab; 203*7c478bd9Sstevel@tonic-gate *lenp = deltab + deltaf; 204*7c478bd9Sstevel@tonic-gate ASSERT(off >= vp_off); 205*7c478bd9Sstevel@tonic-gate 206*7c478bd9Sstevel@tonic-gate /* 207*7c478bd9Sstevel@tonic-gate * If we ended up getting more than was actually 208*7c478bd9Sstevel@tonic-gate * requested, retract the returned length to only 209*7c478bd9Sstevel@tonic-gate * reflect what was requested. This might happen 210*7c478bd9Sstevel@tonic-gate * if we were allowed to kluster pages across a 211*7c478bd9Sstevel@tonic-gate * span of (say) 5 frags, and frag size is less 212*7c478bd9Sstevel@tonic-gate * than PAGESIZE. We need a whole number of 213*7c478bd9Sstevel@tonic-gate * pages to contain those frags, but the returned 214*7c478bd9Sstevel@tonic-gate * size should only allow the returned range to 215*7c478bd9Sstevel@tonic-gate * extend as far as the end of the frags. 216*7c478bd9Sstevel@tonic-gate */ 217*7c478bd9Sstevel@tonic-gate if ((vp_off + vp_len) < (off + *lenp)) { 218*7c478bd9Sstevel@tonic-gate ASSERT(vp_end > off); 219*7c478bd9Sstevel@tonic-gate *lenp = vp_end - off; 220*7c478bd9Sstevel@tonic-gate } 221*7c478bd9Sstevel@tonic-gate } 222*7c478bd9Sstevel@tonic-gate TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER, 223*7c478bd9Sstevel@tonic-gate "pvn_read_kluster:seg %p addr %x isra %x", 224*7c478bd9Sstevel@tonic-gate seg, addr, isra); 225*7c478bd9Sstevel@tonic-gate return (plist); 226*7c478bd9Sstevel@tonic-gate } 227*7c478bd9Sstevel@tonic-gate 228*7c478bd9Sstevel@tonic-gate /* 229*7c478bd9Sstevel@tonic-gate * Handle pages for this vnode on either side of the page "pp" 230*7c478bd9Sstevel@tonic-gate * which has been locked by the caller. 
This routine will also 231*7c478bd9Sstevel@tonic-gate * do klustering in the range [vp_off, vp_off + vp_len] up 232*7c478bd9Sstevel@tonic-gate * until a page which is not found. The offset and length 233*7c478bd9Sstevel@tonic-gate * of pages included is returned in "*offp" and "*lenp". 234*7c478bd9Sstevel@tonic-gate * 235*7c478bd9Sstevel@tonic-gate * Returns a list of dirty locked pages all ready to be 236*7c478bd9Sstevel@tonic-gate * written back. 237*7c478bd9Sstevel@tonic-gate */ 238*7c478bd9Sstevel@tonic-gate page_t * 239*7c478bd9Sstevel@tonic-gate pvn_write_kluster( 240*7c478bd9Sstevel@tonic-gate struct vnode *vp, 241*7c478bd9Sstevel@tonic-gate page_t *pp, 242*7c478bd9Sstevel@tonic-gate u_offset_t *offp, /* return values */ 243*7c478bd9Sstevel@tonic-gate size_t *lenp, /* return values */ 244*7c478bd9Sstevel@tonic-gate u_offset_t vp_off, 245*7c478bd9Sstevel@tonic-gate size_t vp_len, 246*7c478bd9Sstevel@tonic-gate int flags) 247*7c478bd9Sstevel@tonic-gate { 248*7c478bd9Sstevel@tonic-gate u_offset_t off; 249*7c478bd9Sstevel@tonic-gate page_t *dirty; 250*7c478bd9Sstevel@tonic-gate size_t deltab, deltaf; 251*7c478bd9Sstevel@tonic-gate se_t se; 252*7c478bd9Sstevel@tonic-gate u_offset_t vp_end; 253*7c478bd9Sstevel@tonic-gate 254*7c478bd9Sstevel@tonic-gate off = pp->p_offset; 255*7c478bd9Sstevel@tonic-gate 256*7c478bd9Sstevel@tonic-gate /* 257*7c478bd9Sstevel@tonic-gate * Kustering should not be done if we are invalidating 258*7c478bd9Sstevel@tonic-gate * pages since we could destroy pages that belong to 259*7c478bd9Sstevel@tonic-gate * some other process if this is a swap vnode. 
260*7c478bd9Sstevel@tonic-gate */ 261*7c478bd9Sstevel@tonic-gate if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) { 262*7c478bd9Sstevel@tonic-gate *offp = off; 263*7c478bd9Sstevel@tonic-gate *lenp = PAGESIZE; 264*7c478bd9Sstevel@tonic-gate return (pp); 265*7c478bd9Sstevel@tonic-gate } 266*7c478bd9Sstevel@tonic-gate 267*7c478bd9Sstevel@tonic-gate if (flags & (B_FREE | B_INVAL)) 268*7c478bd9Sstevel@tonic-gate se = SE_EXCL; 269*7c478bd9Sstevel@tonic-gate else 270*7c478bd9Sstevel@tonic-gate se = SE_SHARED; 271*7c478bd9Sstevel@tonic-gate 272*7c478bd9Sstevel@tonic-gate dirty = pp; 273*7c478bd9Sstevel@tonic-gate /* 274*7c478bd9Sstevel@tonic-gate * Scan backwards looking for pages to kluster by incrementing 275*7c478bd9Sstevel@tonic-gate * "deltab" and comparing "off" with "vp_off + deltab" to 276*7c478bd9Sstevel@tonic-gate * avoid "signed" versus "unsigned" conversion problems. 277*7c478bd9Sstevel@tonic-gate */ 278*7c478bd9Sstevel@tonic-gate for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) { 279*7c478bd9Sstevel@tonic-gate pp = page_lookup_nowait(vp, off - deltab, se); 280*7c478bd9Sstevel@tonic-gate if (pp == NULL) 281*7c478bd9Sstevel@tonic-gate break; /* page not found */ 282*7c478bd9Sstevel@tonic-gate if (pvn_getdirty(pp, flags | B_DELWRI) == 0) 283*7c478bd9Sstevel@tonic-gate break; 284*7c478bd9Sstevel@tonic-gate page_add(&dirty, pp); 285*7c478bd9Sstevel@tonic-gate } 286*7c478bd9Sstevel@tonic-gate deltab -= PAGESIZE; 287*7c478bd9Sstevel@tonic-gate 288*7c478bd9Sstevel@tonic-gate vp_end = vp_off + vp_len; 289*7c478bd9Sstevel@tonic-gate /* now scan forwards looking for pages to kluster */ 290*7c478bd9Sstevel@tonic-gate for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) { 291*7c478bd9Sstevel@tonic-gate pp = page_lookup_nowait(vp, off + deltaf, se); 292*7c478bd9Sstevel@tonic-gate if (pp == NULL) 293*7c478bd9Sstevel@tonic-gate break; /* page not found */ 294*7c478bd9Sstevel@tonic-gate if (pvn_getdirty(pp, flags | B_DELWRI) == 0) 
295*7c478bd9Sstevel@tonic-gate break; 296*7c478bd9Sstevel@tonic-gate page_add(&dirty, pp); 297*7c478bd9Sstevel@tonic-gate dirty = dirty->p_next; 298*7c478bd9Sstevel@tonic-gate } 299*7c478bd9Sstevel@tonic-gate 300*7c478bd9Sstevel@tonic-gate *offp = off - deltab; 301*7c478bd9Sstevel@tonic-gate *lenp = deltab + deltaf; 302*7c478bd9Sstevel@tonic-gate return (dirty); 303*7c478bd9Sstevel@tonic-gate } 304*7c478bd9Sstevel@tonic-gate 305*7c478bd9Sstevel@tonic-gate /* 306*7c478bd9Sstevel@tonic-gate * Generic entry point used to release the "shared/exclusive" lock 307*7c478bd9Sstevel@tonic-gate * and the "p_iolock" on pages after i/o is complete. 308*7c478bd9Sstevel@tonic-gate */ 309*7c478bd9Sstevel@tonic-gate void 310*7c478bd9Sstevel@tonic-gate pvn_io_done(page_t *plist) 311*7c478bd9Sstevel@tonic-gate { 312*7c478bd9Sstevel@tonic-gate page_t *pp; 313*7c478bd9Sstevel@tonic-gate 314*7c478bd9Sstevel@tonic-gate while (plist != NULL) { 315*7c478bd9Sstevel@tonic-gate pp = plist; 316*7c478bd9Sstevel@tonic-gate page_sub(&plist, pp); 317*7c478bd9Sstevel@tonic-gate page_io_unlock(pp); 318*7c478bd9Sstevel@tonic-gate page_unlock(pp); 319*7c478bd9Sstevel@tonic-gate } 320*7c478bd9Sstevel@tonic-gate } 321*7c478bd9Sstevel@tonic-gate 322*7c478bd9Sstevel@tonic-gate /* 323*7c478bd9Sstevel@tonic-gate * Entry point to be used by file system getpage subr's and 324*7c478bd9Sstevel@tonic-gate * other such routines which either want to unlock pages (B_ASYNC 325*7c478bd9Sstevel@tonic-gate * request) or destroy a list of pages if an error occurred. 
326*7c478bd9Sstevel@tonic-gate */ 327*7c478bd9Sstevel@tonic-gate void 328*7c478bd9Sstevel@tonic-gate pvn_read_done(page_t *plist, int flags) 329*7c478bd9Sstevel@tonic-gate { 330*7c478bd9Sstevel@tonic-gate page_t *pp; 331*7c478bd9Sstevel@tonic-gate 332*7c478bd9Sstevel@tonic-gate while (plist != NULL) { 333*7c478bd9Sstevel@tonic-gate pp = plist; 334*7c478bd9Sstevel@tonic-gate page_sub(&plist, pp); 335*7c478bd9Sstevel@tonic-gate page_io_unlock(pp); 336*7c478bd9Sstevel@tonic-gate if (flags & B_ERROR) { 337*7c478bd9Sstevel@tonic-gate /*LINTED: constant in conditional context*/ 338*7c478bd9Sstevel@tonic-gate VN_DISPOSE(pp, B_INVAL, 0, kcred); 339*7c478bd9Sstevel@tonic-gate } else { 340*7c478bd9Sstevel@tonic-gate (void) page_release(pp, 0); 341*7c478bd9Sstevel@tonic-gate } 342*7c478bd9Sstevel@tonic-gate } 343*7c478bd9Sstevel@tonic-gate } 344*7c478bd9Sstevel@tonic-gate 345*7c478bd9Sstevel@tonic-gate /* 346*7c478bd9Sstevel@tonic-gate * Automagic pageout. 347*7c478bd9Sstevel@tonic-gate * When memory gets tight, start freeing pages popping out of the 348*7c478bd9Sstevel@tonic-gate * write queue. 349*7c478bd9Sstevel@tonic-gate */ 350*7c478bd9Sstevel@tonic-gate int write_free = 1; 351*7c478bd9Sstevel@tonic-gate pgcnt_t pages_before_pager = 200; /* LMXXX */ 352*7c478bd9Sstevel@tonic-gate 353*7c478bd9Sstevel@tonic-gate /* 354*7c478bd9Sstevel@tonic-gate * Routine to be called when page-out's complete. 355*7c478bd9Sstevel@tonic-gate * The caller, typically VOP_PUTPAGE, has to explicity call this routine 356*7c478bd9Sstevel@tonic-gate * after waiting for i/o to complete (biowait) to free the list of 357*7c478bd9Sstevel@tonic-gate * pages associated with the buffer. These pages must be locked 358*7c478bd9Sstevel@tonic-gate * before i/o is initiated. 359*7c478bd9Sstevel@tonic-gate * 360*7c478bd9Sstevel@tonic-gate * If a write error occurs, the pages are marked as modified 361*7c478bd9Sstevel@tonic-gate * so the write will be re-tried later. 
362*7c478bd9Sstevel@tonic-gate */ 363*7c478bd9Sstevel@tonic-gate 364*7c478bd9Sstevel@tonic-gate void 365*7c478bd9Sstevel@tonic-gate pvn_write_done(page_t *plist, int flags) 366*7c478bd9Sstevel@tonic-gate { 367*7c478bd9Sstevel@tonic-gate int dfree = 0; 368*7c478bd9Sstevel@tonic-gate int pgrec = 0; 369*7c478bd9Sstevel@tonic-gate int pgout = 0; 370*7c478bd9Sstevel@tonic-gate int pgpgout = 0; 371*7c478bd9Sstevel@tonic-gate int anonpgout = 0; 372*7c478bd9Sstevel@tonic-gate int anonfree = 0; 373*7c478bd9Sstevel@tonic-gate int fspgout = 0; 374*7c478bd9Sstevel@tonic-gate int fsfree = 0; 375*7c478bd9Sstevel@tonic-gate int execpgout = 0; 376*7c478bd9Sstevel@tonic-gate int execfree = 0; 377*7c478bd9Sstevel@tonic-gate page_t *pp; 378*7c478bd9Sstevel@tonic-gate struct cpu *cpup; 379*7c478bd9Sstevel@tonic-gate struct vnode *vp = NULL; /* for probe */ 380*7c478bd9Sstevel@tonic-gate uint_t ppattr; 381*7c478bd9Sstevel@tonic-gate 382*7c478bd9Sstevel@tonic-gate ASSERT((flags & B_READ) == 0); 383*7c478bd9Sstevel@tonic-gate 384*7c478bd9Sstevel@tonic-gate /* 385*7c478bd9Sstevel@tonic-gate * If we are about to start paging anyway, start freeing pages. 386*7c478bd9Sstevel@tonic-gate */ 387*7c478bd9Sstevel@tonic-gate if (write_free && freemem < lotsfree + pages_before_pager && 388*7c478bd9Sstevel@tonic-gate (flags & B_ERROR) == 0) { 389*7c478bd9Sstevel@tonic-gate flags |= B_FREE; 390*7c478bd9Sstevel@tonic-gate } 391*7c478bd9Sstevel@tonic-gate 392*7c478bd9Sstevel@tonic-gate /* 393*7c478bd9Sstevel@tonic-gate * Handle each page involved in the i/o operation. 
394*7c478bd9Sstevel@tonic-gate */ 395*7c478bd9Sstevel@tonic-gate while (plist != NULL) { 396*7c478bd9Sstevel@tonic-gate pp = plist; 397*7c478bd9Sstevel@tonic-gate ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp)); 398*7c478bd9Sstevel@tonic-gate page_sub(&plist, pp); 399*7c478bd9Sstevel@tonic-gate 400*7c478bd9Sstevel@tonic-gate /* Kernel probe support */ 401*7c478bd9Sstevel@tonic-gate if (vp == NULL) 402*7c478bd9Sstevel@tonic-gate vp = pp->p_vnode; 403*7c478bd9Sstevel@tonic-gate 404*7c478bd9Sstevel@tonic-gate if (flags & B_ERROR) { 405*7c478bd9Sstevel@tonic-gate /* 406*7c478bd9Sstevel@tonic-gate * Write operation failed. We don't want 407*7c478bd9Sstevel@tonic-gate * to destroy (or free) the page unless B_FORCE 408*7c478bd9Sstevel@tonic-gate * is set. We set the mod bit again and release 409*7c478bd9Sstevel@tonic-gate * all locks on the page so that it will get written 410*7c478bd9Sstevel@tonic-gate * back again later when things are hopefully 411*7c478bd9Sstevel@tonic-gate * better again. 412*7c478bd9Sstevel@tonic-gate * If B_INVAL and B_FORCE is set we really have 413*7c478bd9Sstevel@tonic-gate * to destroy the page. 414*7c478bd9Sstevel@tonic-gate */ 415*7c478bd9Sstevel@tonic-gate if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) { 416*7c478bd9Sstevel@tonic-gate page_io_unlock(pp); 417*7c478bd9Sstevel@tonic-gate /*LINTED: constant in conditional context*/ 418*7c478bd9Sstevel@tonic-gate VN_DISPOSE(pp, B_INVAL, 0, kcred); 419*7c478bd9Sstevel@tonic-gate } else { 420*7c478bd9Sstevel@tonic-gate hat_setmod(pp); 421*7c478bd9Sstevel@tonic-gate page_io_unlock(pp); 422*7c478bd9Sstevel@tonic-gate page_unlock(pp); 423*7c478bd9Sstevel@tonic-gate } 424*7c478bd9Sstevel@tonic-gate } else if (flags & B_INVAL) { 425*7c478bd9Sstevel@tonic-gate /* 426*7c478bd9Sstevel@tonic-gate * XXX - Failed writes with B_INVAL set are 427*7c478bd9Sstevel@tonic-gate * not handled appropriately. 
428*7c478bd9Sstevel@tonic-gate */ 429*7c478bd9Sstevel@tonic-gate page_io_unlock(pp); 430*7c478bd9Sstevel@tonic-gate /*LINTED: constant in conditional context*/ 431*7c478bd9Sstevel@tonic-gate VN_DISPOSE(pp, B_INVAL, 0, kcred); 432*7c478bd9Sstevel@tonic-gate } else if (flags & B_FREE ||!hat_page_is_mapped(pp)) { 433*7c478bd9Sstevel@tonic-gate /* 434*7c478bd9Sstevel@tonic-gate * Update statistics for pages being paged out 435*7c478bd9Sstevel@tonic-gate */ 436*7c478bd9Sstevel@tonic-gate if (pp->p_vnode) { 437*7c478bd9Sstevel@tonic-gate if (IS_SWAPFSVP(pp->p_vnode)) { 438*7c478bd9Sstevel@tonic-gate anonpgout++; 439*7c478bd9Sstevel@tonic-gate } else { 440*7c478bd9Sstevel@tonic-gate if (pp->p_vnode->v_flag & VVMEXEC) { 441*7c478bd9Sstevel@tonic-gate execpgout++; 442*7c478bd9Sstevel@tonic-gate } else { 443*7c478bd9Sstevel@tonic-gate fspgout++; 444*7c478bd9Sstevel@tonic-gate } 445*7c478bd9Sstevel@tonic-gate } 446*7c478bd9Sstevel@tonic-gate } 447*7c478bd9Sstevel@tonic-gate page_io_unlock(pp); 448*7c478bd9Sstevel@tonic-gate pgout = 1; 449*7c478bd9Sstevel@tonic-gate pgpgout++; 450*7c478bd9Sstevel@tonic-gate TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT, 451*7c478bd9Sstevel@tonic-gate "page_ws_out:pp %p", pp); 452*7c478bd9Sstevel@tonic-gate 453*7c478bd9Sstevel@tonic-gate /* 454*7c478bd9Sstevel@tonic-gate * The page_struct_lock need not be acquired to 455*7c478bd9Sstevel@tonic-gate * examine "p_lckcnt" and "p_cowcnt" since we'll 456*7c478bd9Sstevel@tonic-gate * have an "exclusive" lock if the upgrade succeeds. 457*7c478bd9Sstevel@tonic-gate */ 458*7c478bd9Sstevel@tonic-gate if (page_tryupgrade(pp) && 459*7c478bd9Sstevel@tonic-gate pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 460*7c478bd9Sstevel@tonic-gate /* 461*7c478bd9Sstevel@tonic-gate * Check if someone has reclaimed the 462*7c478bd9Sstevel@tonic-gate * page. If ref and mod are not set, no 463*7c478bd9Sstevel@tonic-gate * one is using it so we can free it. 
464*7c478bd9Sstevel@tonic-gate * The rest of the system is careful 465*7c478bd9Sstevel@tonic-gate * to use the NOSYNC flag to unload 466*7c478bd9Sstevel@tonic-gate * translations set up for i/o w/o 467*7c478bd9Sstevel@tonic-gate * affecting ref and mod bits. 468*7c478bd9Sstevel@tonic-gate * 469*7c478bd9Sstevel@tonic-gate * Obtain a copy of the real hardware 470*7c478bd9Sstevel@tonic-gate * mod bit using hat_pagesync(pp, HAT_DONTZERO) 471*7c478bd9Sstevel@tonic-gate * to avoid having to flush the cache. 472*7c478bd9Sstevel@tonic-gate */ 473*7c478bd9Sstevel@tonic-gate ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO | 474*7c478bd9Sstevel@tonic-gate HAT_SYNC_STOPON_MOD); 475*7c478bd9Sstevel@tonic-gate ck_refmod: 476*7c478bd9Sstevel@tonic-gate if (!(ppattr & (P_REF | P_MOD))) { 477*7c478bd9Sstevel@tonic-gate if (hat_page_is_mapped(pp)) { 478*7c478bd9Sstevel@tonic-gate /* 479*7c478bd9Sstevel@tonic-gate * Doesn't look like the page 480*7c478bd9Sstevel@tonic-gate * was modified so now we 481*7c478bd9Sstevel@tonic-gate * really have to unload the 482*7c478bd9Sstevel@tonic-gate * translations. Meanwhile 483*7c478bd9Sstevel@tonic-gate * another CPU could've 484*7c478bd9Sstevel@tonic-gate * modified it so we have to 485*7c478bd9Sstevel@tonic-gate * check again. We don't loop 486*7c478bd9Sstevel@tonic-gate * forever here because now 487*7c478bd9Sstevel@tonic-gate * the translations are gone 488*7c478bd9Sstevel@tonic-gate * and no one can get a new one 489*7c478bd9Sstevel@tonic-gate * since we have the "exclusive" 490*7c478bd9Sstevel@tonic-gate * lock on the page. 
491*7c478bd9Sstevel@tonic-gate */ 492*7c478bd9Sstevel@tonic-gate (void) hat_pageunload(pp, 493*7c478bd9Sstevel@tonic-gate HAT_FORCE_PGUNLOAD); 494*7c478bd9Sstevel@tonic-gate ppattr = hat_page_getattr(pp, 495*7c478bd9Sstevel@tonic-gate P_REF | P_MOD); 496*7c478bd9Sstevel@tonic-gate goto ck_refmod; 497*7c478bd9Sstevel@tonic-gate } 498*7c478bd9Sstevel@tonic-gate /* 499*7c478bd9Sstevel@tonic-gate * Update statistics for pages being 500*7c478bd9Sstevel@tonic-gate * freed 501*7c478bd9Sstevel@tonic-gate */ 502*7c478bd9Sstevel@tonic-gate if (pp->p_vnode) { 503*7c478bd9Sstevel@tonic-gate if (IS_SWAPFSVP(pp->p_vnode)) { 504*7c478bd9Sstevel@tonic-gate anonfree++; 505*7c478bd9Sstevel@tonic-gate } else { 506*7c478bd9Sstevel@tonic-gate if (pp->p_vnode->v_flag 507*7c478bd9Sstevel@tonic-gate & VVMEXEC) { 508*7c478bd9Sstevel@tonic-gate execfree++; 509*7c478bd9Sstevel@tonic-gate } else { 510*7c478bd9Sstevel@tonic-gate fsfree++; 511*7c478bd9Sstevel@tonic-gate } 512*7c478bd9Sstevel@tonic-gate } 513*7c478bd9Sstevel@tonic-gate } 514*7c478bd9Sstevel@tonic-gate /*LINTED: constant in conditional ctx*/ 515*7c478bd9Sstevel@tonic-gate VN_DISPOSE(pp, B_FREE, 516*7c478bd9Sstevel@tonic-gate (flags & B_DONTNEED), kcred); 517*7c478bd9Sstevel@tonic-gate dfree++; 518*7c478bd9Sstevel@tonic-gate } else { 519*7c478bd9Sstevel@tonic-gate page_unlock(pp); 520*7c478bd9Sstevel@tonic-gate pgrec++; 521*7c478bd9Sstevel@tonic-gate TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE, 522*7c478bd9Sstevel@tonic-gate "page_ws_free:pp %p", pp); 523*7c478bd9Sstevel@tonic-gate } 524*7c478bd9Sstevel@tonic-gate } else { 525*7c478bd9Sstevel@tonic-gate /* 526*7c478bd9Sstevel@tonic-gate * Page is either `locked' in memory 527*7c478bd9Sstevel@tonic-gate * or was reclaimed and now has a 528*7c478bd9Sstevel@tonic-gate * "shared" lock, so release it. 
529*7c478bd9Sstevel@tonic-gate */ 530*7c478bd9Sstevel@tonic-gate page_unlock(pp); 531*7c478bd9Sstevel@tonic-gate } 532*7c478bd9Sstevel@tonic-gate } else { 533*7c478bd9Sstevel@tonic-gate /* 534*7c478bd9Sstevel@tonic-gate * Neither B_FREE nor B_INVAL nor B_ERROR. 535*7c478bd9Sstevel@tonic-gate * Just release locks. 536*7c478bd9Sstevel@tonic-gate */ 537*7c478bd9Sstevel@tonic-gate page_io_unlock(pp); 538*7c478bd9Sstevel@tonic-gate page_unlock(pp); 539*7c478bd9Sstevel@tonic-gate } 540*7c478bd9Sstevel@tonic-gate } 541*7c478bd9Sstevel@tonic-gate 542*7c478bd9Sstevel@tonic-gate CPU_STATS_ENTER_K(); 543*7c478bd9Sstevel@tonic-gate cpup = CPU; /* get cpup now that CPU cannot change */ 544*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, dfree, dfree); 545*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec); 546*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, pgout, pgout); 547*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout); 548*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout); 549*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree); 550*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout); 551*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree); 552*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout); 553*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, execfree, execfree); 554*7c478bd9Sstevel@tonic-gate CPU_STATS_EXIT_K(); 555*7c478bd9Sstevel@tonic-gate 556*7c478bd9Sstevel@tonic-gate /* Kernel probe */ 557*7c478bd9Sstevel@tonic-gate TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */, 558*7c478bd9Sstevel@tonic-gate tnf_opaque, vnode, vp, 559*7c478bd9Sstevel@tonic-gate tnf_ulong, pages_pageout, pgpgout, 560*7c478bd9Sstevel@tonic-gate tnf_ulong, pages_freed, dfree, 561*7c478bd9Sstevel@tonic-gate tnf_ulong, pages_reclaimed, pgrec); 562*7c478bd9Sstevel@tonic-gate } 563*7c478bd9Sstevel@tonic-gate 564*7c478bd9Sstevel@tonic-gate /* 
565*7c478bd9Sstevel@tonic-gate * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI, 566*7c478bd9Sstevel@tonic-gate * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster 567*7c478bd9Sstevel@tonic-gate * operation and is only to be considered if it doesn't involve any 568*7c478bd9Sstevel@tonic-gate * waiting here. B_TRUNC indicates that the file is being truncated 569*7c478bd9Sstevel@tonic-gate * and so no i/o needs to be done. B_FORCE indicates that the page 570*7c478bd9Sstevel@tonic-gate * must be destroyed so don't try wrting it out. 571*7c478bd9Sstevel@tonic-gate * 572*7c478bd9Sstevel@tonic-gate * The caller must ensure that the page is locked. Returns 1, if 573*7c478bd9Sstevel@tonic-gate * the page should be written back (the "iolock" is held in this 574*7c478bd9Sstevel@tonic-gate * case), or 0 if the page has been dealt with or has been 575*7c478bd9Sstevel@tonic-gate * unlocked. 576*7c478bd9Sstevel@tonic-gate */ 577*7c478bd9Sstevel@tonic-gate int 578*7c478bd9Sstevel@tonic-gate pvn_getdirty(page_t *pp, int flags) 579*7c478bd9Sstevel@tonic-gate { 580*7c478bd9Sstevel@tonic-gate ASSERT((flags & (B_INVAL | B_FREE)) ? 581*7c478bd9Sstevel@tonic-gate PAGE_EXCL(pp) : PAGE_SHARED(pp)); 582*7c478bd9Sstevel@tonic-gate ASSERT(PP_ISFREE(pp) == 0); 583*7c478bd9Sstevel@tonic-gate 584*7c478bd9Sstevel@tonic-gate /* 585*7c478bd9Sstevel@tonic-gate * If trying to invalidate or free a logically `locked' page, 586*7c478bd9Sstevel@tonic-gate * forget it. Don't need page_struct_lock to check p_lckcnt and 587*7c478bd9Sstevel@tonic-gate * p_cowcnt as the page is exclusively locked. 
588*7c478bd9Sstevel@tonic-gate */ 589*7c478bd9Sstevel@tonic-gate if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) && 590*7c478bd9Sstevel@tonic-gate (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) { 591*7c478bd9Sstevel@tonic-gate page_unlock(pp); 592*7c478bd9Sstevel@tonic-gate return (0); 593*7c478bd9Sstevel@tonic-gate } 594*7c478bd9Sstevel@tonic-gate 595*7c478bd9Sstevel@tonic-gate /* 596*7c478bd9Sstevel@tonic-gate * Now acquire the i/o lock so we can add it to the dirty 597*7c478bd9Sstevel@tonic-gate * list (if necessary). We avoid blocking on the i/o lock 598*7c478bd9Sstevel@tonic-gate * in the following cases: 599*7c478bd9Sstevel@tonic-gate * 600*7c478bd9Sstevel@tonic-gate * If B_DELWRI is set, which implies that this request is 601*7c478bd9Sstevel@tonic-gate * due to a klustering operartion. 602*7c478bd9Sstevel@tonic-gate * 603*7c478bd9Sstevel@tonic-gate * If this is an async (B_ASYNC) operation and we are not doing 604*7c478bd9Sstevel@tonic-gate * invalidation (B_INVAL) [The current i/o or fsflush will ensure 605*7c478bd9Sstevel@tonic-gate * that the the page is written out]. 606*7c478bd9Sstevel@tonic-gate */ 607*7c478bd9Sstevel@tonic-gate if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) { 608*7c478bd9Sstevel@tonic-gate if (!page_io_trylock(pp)) { 609*7c478bd9Sstevel@tonic-gate page_unlock(pp); 610*7c478bd9Sstevel@tonic-gate return (0); 611*7c478bd9Sstevel@tonic-gate } 612*7c478bd9Sstevel@tonic-gate } else { 613*7c478bd9Sstevel@tonic-gate page_io_lock(pp); 614*7c478bd9Sstevel@tonic-gate } 615*7c478bd9Sstevel@tonic-gate 616*7c478bd9Sstevel@tonic-gate /* 617*7c478bd9Sstevel@tonic-gate * If we want to free or invalidate the page then 618*7c478bd9Sstevel@tonic-gate * we need to unload it so that anyone who wants 619*7c478bd9Sstevel@tonic-gate * it will have to take a minor fault to get it. 
620*7c478bd9Sstevel@tonic-gate * Otherwise, we're just writing the page back so we 621*7c478bd9Sstevel@tonic-gate * need to sync up the hardwre and software mod bit to 622*7c478bd9Sstevel@tonic-gate * detect any future modifications. We clear the 623*7c478bd9Sstevel@tonic-gate * software mod bit when we put the page on the dirty 624*7c478bd9Sstevel@tonic-gate * list. 625*7c478bd9Sstevel@tonic-gate */ 626*7c478bd9Sstevel@tonic-gate if (flags & (B_INVAL | B_FREE)) { 627*7c478bd9Sstevel@tonic-gate (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 628*7c478bd9Sstevel@tonic-gate } else { 629*7c478bd9Sstevel@tonic-gate (void) hat_pagesync(pp, HAT_SYNC_ZERORM); 630*7c478bd9Sstevel@tonic-gate } 631*7c478bd9Sstevel@tonic-gate 632*7c478bd9Sstevel@tonic-gate if (!hat_ismod(pp) || (flags & B_TRUNC)) { 633*7c478bd9Sstevel@tonic-gate /* 634*7c478bd9Sstevel@tonic-gate * Don't need to add it to the 635*7c478bd9Sstevel@tonic-gate * list after all. 636*7c478bd9Sstevel@tonic-gate */ 637*7c478bd9Sstevel@tonic-gate page_io_unlock(pp); 638*7c478bd9Sstevel@tonic-gate if (flags & B_INVAL) { 639*7c478bd9Sstevel@tonic-gate /*LINTED: constant in conditional context*/ 640*7c478bd9Sstevel@tonic-gate VN_DISPOSE(pp, B_INVAL, 0, kcred); 641*7c478bd9Sstevel@tonic-gate } else if (flags & B_FREE) { 642*7c478bd9Sstevel@tonic-gate /*LINTED: constant in conditional context*/ 643*7c478bd9Sstevel@tonic-gate VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred); 644*7c478bd9Sstevel@tonic-gate } else { 645*7c478bd9Sstevel@tonic-gate /* 646*7c478bd9Sstevel@tonic-gate * This is advisory path for the callers 647*7c478bd9Sstevel@tonic-gate * of VOP_PUTPAGE() who prefer freeing the 648*7c478bd9Sstevel@tonic-gate * page _only_ if no one else is accessing it. 649*7c478bd9Sstevel@tonic-gate * E.g. 
segmap_release() 650*7c478bd9Sstevel@tonic-gate * 651*7c478bd9Sstevel@tonic-gate * The above hat_ismod() check is useless because: 652*7c478bd9Sstevel@tonic-gate * (1) we may not be holding SE_EXCL lock; 653*7c478bd9Sstevel@tonic-gate * (2) we've not unloaded _all_ translations 654*7c478bd9Sstevel@tonic-gate * 655*7c478bd9Sstevel@tonic-gate * Let page_release() do the heavy-lifting. 656*7c478bd9Sstevel@tonic-gate */ 657*7c478bd9Sstevel@tonic-gate (void) page_release(pp, 1); 658*7c478bd9Sstevel@tonic-gate } 659*7c478bd9Sstevel@tonic-gate return (0); 660*7c478bd9Sstevel@tonic-gate } 661*7c478bd9Sstevel@tonic-gate 662*7c478bd9Sstevel@tonic-gate /* 663*7c478bd9Sstevel@tonic-gate * Page is dirty, get it ready for the write back 664*7c478bd9Sstevel@tonic-gate * and add page to the dirty list. 665*7c478bd9Sstevel@tonic-gate */ 666*7c478bd9Sstevel@tonic-gate hat_clrrefmod(pp); 667*7c478bd9Sstevel@tonic-gate 668*7c478bd9Sstevel@tonic-gate /* 669*7c478bd9Sstevel@tonic-gate * If we're going to free the page when we're done 670*7c478bd9Sstevel@tonic-gate * then we can let others try to use it starting now. 671*7c478bd9Sstevel@tonic-gate * We'll detect the fact that they used it when the 672*7c478bd9Sstevel@tonic-gate * i/o is done and avoid freeing the page. 
673*7c478bd9Sstevel@tonic-gate */ 674*7c478bd9Sstevel@tonic-gate if (flags & B_FREE) 675*7c478bd9Sstevel@tonic-gate page_downgrade(pp); 676*7c478bd9Sstevel@tonic-gate 677*7c478bd9Sstevel@tonic-gate 678*7c478bd9Sstevel@tonic-gate TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp); 679*7c478bd9Sstevel@tonic-gate 680*7c478bd9Sstevel@tonic-gate return (1); 681*7c478bd9Sstevel@tonic-gate } 682*7c478bd9Sstevel@tonic-gate 683*7c478bd9Sstevel@tonic-gate 684*7c478bd9Sstevel@tonic-gate /*ARGSUSED*/ 685*7c478bd9Sstevel@tonic-gate static int 686*7c478bd9Sstevel@tonic-gate marker_constructor(void *buf, void *cdrarg, int kmflags) 687*7c478bd9Sstevel@tonic-gate { 688*7c478bd9Sstevel@tonic-gate page_t *mark = buf; 689*7c478bd9Sstevel@tonic-gate bzero(mark, sizeof (page_t)); 690*7c478bd9Sstevel@tonic-gate return (0); 691*7c478bd9Sstevel@tonic-gate } 692*7c478bd9Sstevel@tonic-gate 693*7c478bd9Sstevel@tonic-gate void 694*7c478bd9Sstevel@tonic-gate pvn_init() 695*7c478bd9Sstevel@tonic-gate { 696*7c478bd9Sstevel@tonic-gate if (pvn_vmodsort_disable == 0) 697*7c478bd9Sstevel@tonic-gate pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL); 698*7c478bd9Sstevel@tonic-gate marker_cache = kmem_cache_create("marker_cache", 699*7c478bd9Sstevel@tonic-gate sizeof (page_t), 0, marker_constructor, 700*7c478bd9Sstevel@tonic-gate NULL, NULL, NULL, NULL, 0); 701*7c478bd9Sstevel@tonic-gate } 702*7c478bd9Sstevel@tonic-gate 703*7c478bd9Sstevel@tonic-gate 704*7c478bd9Sstevel@tonic-gate /* 705*7c478bd9Sstevel@tonic-gate * Process a vnode's page list for all pages whose offset is >= off. 706*7c478bd9Sstevel@tonic-gate * Pages are to either be free'd, invalidated, or written back to disk. 707*7c478bd9Sstevel@tonic-gate * 708*7c478bd9Sstevel@tonic-gate * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE 709*7c478bd9Sstevel@tonic-gate * is specified, otherwise they are "shared" locked. 
710*7c478bd9Sstevel@tonic-gate * 711*7c478bd9Sstevel@tonic-gate * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC} 712*7c478bd9Sstevel@tonic-gate * 713*7c478bd9Sstevel@tonic-gate * Special marker page_t's are inserted in the list in order 714*7c478bd9Sstevel@tonic-gate * to keep track of where we are in the list when locks are dropped. 715*7c478bd9Sstevel@tonic-gate * 716*7c478bd9Sstevel@tonic-gate * Note the list is circular and insertions can happen only at the 717*7c478bd9Sstevel@tonic-gate * head and tail of the list. The algorithm ensures visiting all pages 718*7c478bd9Sstevel@tonic-gate * on the list in the following way: 719*7c478bd9Sstevel@tonic-gate * 720*7c478bd9Sstevel@tonic-gate * Drop two marker pages at the end of the list. 721*7c478bd9Sstevel@tonic-gate * 722*7c478bd9Sstevel@tonic-gate * Move one marker page backwards towards the start of the list until 723*7c478bd9Sstevel@tonic-gate * it is at the list head, processing the pages passed along the way. 724*7c478bd9Sstevel@tonic-gate * 725*7c478bd9Sstevel@tonic-gate * Due to race conditions when the vphm mutex is dropped, additional pages 726*7c478bd9Sstevel@tonic-gate * can be added to either end of the list, so we'll continue to move 727*7c478bd9Sstevel@tonic-gate * the marker and process pages until it is up against the end marker. 728*7c478bd9Sstevel@tonic-gate * 729*7c478bd9Sstevel@tonic-gate * There is one special exit condition. If we are processing a VMODSORT 730*7c478bd9Sstevel@tonic-gate * vnode and only writing back modified pages, we can stop as soon as 731*7c478bd9Sstevel@tonic-gate * we run into an unmodified page. This makes fsync(3) operations fast. 
732*7c478bd9Sstevel@tonic-gate */ 733*7c478bd9Sstevel@tonic-gate int 734*7c478bd9Sstevel@tonic-gate pvn_vplist_dirty( 735*7c478bd9Sstevel@tonic-gate vnode_t *vp, 736*7c478bd9Sstevel@tonic-gate u_offset_t off, 737*7c478bd9Sstevel@tonic-gate int (*putapage)(vnode_t *, page_t *, u_offset_t *, 738*7c478bd9Sstevel@tonic-gate size_t *, int, cred_t *), 739*7c478bd9Sstevel@tonic-gate int flags, 740*7c478bd9Sstevel@tonic-gate cred_t *cred) 741*7c478bd9Sstevel@tonic-gate { 742*7c478bd9Sstevel@tonic-gate page_t *pp; 743*7c478bd9Sstevel@tonic-gate page_t *mark; /* marker page that moves toward head */ 744*7c478bd9Sstevel@tonic-gate page_t *end; /* marker page at end of list */ 745*7c478bd9Sstevel@tonic-gate int err = 0; 746*7c478bd9Sstevel@tonic-gate int error; 747*7c478bd9Sstevel@tonic-gate kmutex_t *vphm; 748*7c478bd9Sstevel@tonic-gate se_t se; 749*7c478bd9Sstevel@tonic-gate page_t **where_to_move; 750*7c478bd9Sstevel@tonic-gate 751*7c478bd9Sstevel@tonic-gate ASSERT(vp->v_type != VCHR); 752*7c478bd9Sstevel@tonic-gate 753*7c478bd9Sstevel@tonic-gate if (vp->v_pages == NULL) 754*7c478bd9Sstevel@tonic-gate return (0); 755*7c478bd9Sstevel@tonic-gate 756*7c478bd9Sstevel@tonic-gate 757*7c478bd9Sstevel@tonic-gate /* 758*7c478bd9Sstevel@tonic-gate * Serialize vplist_dirty operations on this vnode by setting VVMLOCK. 759*7c478bd9Sstevel@tonic-gate * 760*7c478bd9Sstevel@tonic-gate * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync() 761*7c478bd9Sstevel@tonic-gate * from getting blocked while flushing pages to a dead NFS server. 
762*7c478bd9Sstevel@tonic-gate */ 763*7c478bd9Sstevel@tonic-gate mutex_enter(&vp->v_lock); 764*7c478bd9Sstevel@tonic-gate if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) { 765*7c478bd9Sstevel@tonic-gate mutex_exit(&vp->v_lock); 766*7c478bd9Sstevel@tonic-gate return (EAGAIN); 767*7c478bd9Sstevel@tonic-gate } 768*7c478bd9Sstevel@tonic-gate 769*7c478bd9Sstevel@tonic-gate while (vp->v_flag & VVMLOCK) 770*7c478bd9Sstevel@tonic-gate cv_wait(&vp->v_cv, &vp->v_lock); 771*7c478bd9Sstevel@tonic-gate 772*7c478bd9Sstevel@tonic-gate if (vp->v_pages == NULL) { 773*7c478bd9Sstevel@tonic-gate mutex_exit(&vp->v_lock); 774*7c478bd9Sstevel@tonic-gate return (0); 775*7c478bd9Sstevel@tonic-gate } 776*7c478bd9Sstevel@tonic-gate 777*7c478bd9Sstevel@tonic-gate vp->v_flag |= VVMLOCK; 778*7c478bd9Sstevel@tonic-gate mutex_exit(&vp->v_lock); 779*7c478bd9Sstevel@tonic-gate 780*7c478bd9Sstevel@tonic-gate 781*7c478bd9Sstevel@tonic-gate /* 782*7c478bd9Sstevel@tonic-gate * Set up the marker pages used to walk the list 783*7c478bd9Sstevel@tonic-gate */ 784*7c478bd9Sstevel@tonic-gate end = kmem_cache_alloc(marker_cache, KM_SLEEP); 785*7c478bd9Sstevel@tonic-gate end->p_vnode = vp; 786*7c478bd9Sstevel@tonic-gate end->p_offset = (u_offset_t)-2; 787*7c478bd9Sstevel@tonic-gate mark = kmem_cache_alloc(marker_cache, KM_SLEEP); 788*7c478bd9Sstevel@tonic-gate mark->p_vnode = vp; 789*7c478bd9Sstevel@tonic-gate mark->p_offset = (u_offset_t)-1; 790*7c478bd9Sstevel@tonic-gate 791*7c478bd9Sstevel@tonic-gate /* 792*7c478bd9Sstevel@tonic-gate * Grab the lock protecting the vnode's page list 793*7c478bd9Sstevel@tonic-gate * note that this lock is dropped at times in the loop. 
794*7c478bd9Sstevel@tonic-gate */ 795*7c478bd9Sstevel@tonic-gate vphm = page_vnode_mutex(vp); 796*7c478bd9Sstevel@tonic-gate mutex_enter(vphm); 797*7c478bd9Sstevel@tonic-gate if (vp->v_pages == NULL) 798*7c478bd9Sstevel@tonic-gate goto leave; 799*7c478bd9Sstevel@tonic-gate 800*7c478bd9Sstevel@tonic-gate /* 801*7c478bd9Sstevel@tonic-gate * insert the markers and loop through the list of pages 802*7c478bd9Sstevel@tonic-gate */ 803*7c478bd9Sstevel@tonic-gate page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark); 804*7c478bd9Sstevel@tonic-gate page_vpadd(&mark->p_vpnext, end); 805*7c478bd9Sstevel@tonic-gate for (;;) { 806*7c478bd9Sstevel@tonic-gate 807*7c478bd9Sstevel@tonic-gate /* 808*7c478bd9Sstevel@tonic-gate * If only doing an async write back, then we can 809*7c478bd9Sstevel@tonic-gate * stop as soon as we get to start of the list. 810*7c478bd9Sstevel@tonic-gate */ 811*7c478bd9Sstevel@tonic-gate if (flags == B_ASYNC && vp->v_pages == mark) 812*7c478bd9Sstevel@tonic-gate break; 813*7c478bd9Sstevel@tonic-gate 814*7c478bd9Sstevel@tonic-gate /* 815*7c478bd9Sstevel@tonic-gate * otherwise stop when we've gone through all the pages 816*7c478bd9Sstevel@tonic-gate */ 817*7c478bd9Sstevel@tonic-gate if (mark->p_vpprev == end) 818*7c478bd9Sstevel@tonic-gate break; 819*7c478bd9Sstevel@tonic-gate 820*7c478bd9Sstevel@tonic-gate pp = mark->p_vpprev; 821*7c478bd9Sstevel@tonic-gate if (vp->v_pages == pp) 822*7c478bd9Sstevel@tonic-gate where_to_move = &vp->v_pages; 823*7c478bd9Sstevel@tonic-gate else 824*7c478bd9Sstevel@tonic-gate where_to_move = &pp->p_vpprev->p_vpnext; 825*7c478bd9Sstevel@tonic-gate 826*7c478bd9Sstevel@tonic-gate ASSERT(pp->p_vnode == vp); 827*7c478bd9Sstevel@tonic-gate 828*7c478bd9Sstevel@tonic-gate /* 829*7c478bd9Sstevel@tonic-gate * Skip this page if the offset is out of the desired range. 830*7c478bd9Sstevel@tonic-gate * Just move the marker and continue. 
831*7c478bd9Sstevel@tonic-gate */ 832*7c478bd9Sstevel@tonic-gate if (pp->p_offset < off) { 833*7c478bd9Sstevel@tonic-gate page_vpsub(&vp->v_pages, mark); 834*7c478bd9Sstevel@tonic-gate page_vpadd(where_to_move, mark); 835*7c478bd9Sstevel@tonic-gate continue; 836*7c478bd9Sstevel@tonic-gate } 837*7c478bd9Sstevel@tonic-gate 838*7c478bd9Sstevel@tonic-gate /* 839*7c478bd9Sstevel@tonic-gate * If just flushing dirty pages to disk and this vnode 840*7c478bd9Sstevel@tonic-gate * is using a sorted list of pages, we can stop processing 841*7c478bd9Sstevel@tonic-gate * as soon as we find an unmodified page. Since all the 842*7c478bd9Sstevel@tonic-gate * modified pages are visited first. 843*7c478bd9Sstevel@tonic-gate */ 844*7c478bd9Sstevel@tonic-gate if (IS_VMODSORT(vp) && 845*7c478bd9Sstevel@tonic-gate !(flags & (B_INVAL | B_FREE | B_TRUNC)) && 846*7c478bd9Sstevel@tonic-gate !hat_ismod(pp)) { 847*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 848*7c478bd9Sstevel@tonic-gate /* 849*7c478bd9Sstevel@tonic-gate * For debug kernels examine what should be all the 850*7c478bd9Sstevel@tonic-gate * remaining clean pages, asserting that they are 851*7c478bd9Sstevel@tonic-gate * not modified. 
852*7c478bd9Sstevel@tonic-gate */ 853*7c478bd9Sstevel@tonic-gate page_t *chk = pp; 854*7c478bd9Sstevel@tonic-gate int attr; 855*7c478bd9Sstevel@tonic-gate 856*7c478bd9Sstevel@tonic-gate page_vpsub(&vp->v_pages, mark); 857*7c478bd9Sstevel@tonic-gate page_vpadd(where_to_move, mark); 858*7c478bd9Sstevel@tonic-gate do { 859*7c478bd9Sstevel@tonic-gate chk = chk->p_vpprev; 860*7c478bd9Sstevel@tonic-gate ASSERT(chk != end); 861*7c478bd9Sstevel@tonic-gate if (chk == mark) 862*7c478bd9Sstevel@tonic-gate continue; 863*7c478bd9Sstevel@tonic-gate attr = hat_page_getattr(chk, P_MOD | P_REF); 864*7c478bd9Sstevel@tonic-gate if ((attr & P_MOD) == 0) 865*7c478bd9Sstevel@tonic-gate continue; 866*7c478bd9Sstevel@tonic-gate panic("v_pages list not all clean: " 867*7c478bd9Sstevel@tonic-gate "page_t*=%p vnode=%p off=%lx " 868*7c478bd9Sstevel@tonic-gate "attr=0x%x last clean page_t*=%p\n", 869*7c478bd9Sstevel@tonic-gate (void *)chk, (void *)chk->p_vnode, 870*7c478bd9Sstevel@tonic-gate (long)chk->p_offset, attr, (void *)pp); 871*7c478bd9Sstevel@tonic-gate } while (chk != vp->v_pages); 872*7c478bd9Sstevel@tonic-gate #endif 873*7c478bd9Sstevel@tonic-gate break; 874*7c478bd9Sstevel@tonic-gate } 875*7c478bd9Sstevel@tonic-gate 876*7c478bd9Sstevel@tonic-gate /* 877*7c478bd9Sstevel@tonic-gate * If we are supposed to invalidate or free this 878*7c478bd9Sstevel@tonic-gate * page, then we need an exclusive lock. 879*7c478bd9Sstevel@tonic-gate */ 880*7c478bd9Sstevel@tonic-gate se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED; 881*7c478bd9Sstevel@tonic-gate 882*7c478bd9Sstevel@tonic-gate /* 883*7c478bd9Sstevel@tonic-gate * We must acquire the page lock for all synchronous 884*7c478bd9Sstevel@tonic-gate * operations (invalidate, free and write). 
885*7c478bd9Sstevel@tonic-gate */ 886*7c478bd9Sstevel@tonic-gate if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) { 887*7c478bd9Sstevel@tonic-gate /* 888*7c478bd9Sstevel@tonic-gate * If the page_lock() drops the mutex 889*7c478bd9Sstevel@tonic-gate * we must retry the loop. 890*7c478bd9Sstevel@tonic-gate */ 891*7c478bd9Sstevel@tonic-gate if (!page_lock(pp, se, vphm, P_NO_RECLAIM)) 892*7c478bd9Sstevel@tonic-gate continue; 893*7c478bd9Sstevel@tonic-gate 894*7c478bd9Sstevel@tonic-gate /* 895*7c478bd9Sstevel@tonic-gate * It's ok to move the marker page now. 896*7c478bd9Sstevel@tonic-gate */ 897*7c478bd9Sstevel@tonic-gate page_vpsub(&vp->v_pages, mark); 898*7c478bd9Sstevel@tonic-gate page_vpadd(where_to_move, mark); 899*7c478bd9Sstevel@tonic-gate } else { 900*7c478bd9Sstevel@tonic-gate 901*7c478bd9Sstevel@tonic-gate /* 902*7c478bd9Sstevel@tonic-gate * update the marker page for all remaining cases 903*7c478bd9Sstevel@tonic-gate */ 904*7c478bd9Sstevel@tonic-gate page_vpsub(&vp->v_pages, mark); 905*7c478bd9Sstevel@tonic-gate page_vpadd(where_to_move, mark); 906*7c478bd9Sstevel@tonic-gate 907*7c478bd9Sstevel@tonic-gate /* 908*7c478bd9Sstevel@tonic-gate * For write backs, If we can't lock the page, it's 909*7c478bd9Sstevel@tonic-gate * invalid or in the process of being destroyed. Skip 910*7c478bd9Sstevel@tonic-gate * it, assuming someone else is writing it. 911*7c478bd9Sstevel@tonic-gate */ 912*7c478bd9Sstevel@tonic-gate if (!page_trylock(pp, se)) 913*7c478bd9Sstevel@tonic-gate continue; 914*7c478bd9Sstevel@tonic-gate } 915*7c478bd9Sstevel@tonic-gate 916*7c478bd9Sstevel@tonic-gate ASSERT(pp->p_vnode == vp); 917*7c478bd9Sstevel@tonic-gate 918*7c478bd9Sstevel@tonic-gate /* 919*7c478bd9Sstevel@tonic-gate * Successfully locked the page, now figure out what to 920*7c478bd9Sstevel@tonic-gate * do with it. Free pages are easily dealt with, invalidate 921*7c478bd9Sstevel@tonic-gate * if desired or just go on to the next page. 
922*7c478bd9Sstevel@tonic-gate */ 923*7c478bd9Sstevel@tonic-gate if (PP_ISFREE(pp)) { 924*7c478bd9Sstevel@tonic-gate if ((flags & B_INVAL) == 0) { 925*7c478bd9Sstevel@tonic-gate page_unlock(pp); 926*7c478bd9Sstevel@tonic-gate continue; 927*7c478bd9Sstevel@tonic-gate } 928*7c478bd9Sstevel@tonic-gate 929*7c478bd9Sstevel@tonic-gate /* 930*7c478bd9Sstevel@tonic-gate * Invalidate (destroy) the page. 931*7c478bd9Sstevel@tonic-gate */ 932*7c478bd9Sstevel@tonic-gate mutex_exit(vphm); 933*7c478bd9Sstevel@tonic-gate page_destroy_free(pp); 934*7c478bd9Sstevel@tonic-gate mutex_enter(vphm); 935*7c478bd9Sstevel@tonic-gate continue; 936*7c478bd9Sstevel@tonic-gate } 937*7c478bd9Sstevel@tonic-gate 938*7c478bd9Sstevel@tonic-gate /* 939*7c478bd9Sstevel@tonic-gate * pvn_getdirty() figures out what do do with a dirty page. 940*7c478bd9Sstevel@tonic-gate * If the page is dirty, the putapage() routine will write it 941*7c478bd9Sstevel@tonic-gate * and will kluster any other adjacent dirty pages it can. 942*7c478bd9Sstevel@tonic-gate * 943*7c478bd9Sstevel@tonic-gate * pvn_getdirty() and `(*putapage)' unlock the page. 
944*7c478bd9Sstevel@tonic-gate */ 945*7c478bd9Sstevel@tonic-gate mutex_exit(vphm); 946*7c478bd9Sstevel@tonic-gate if (pvn_getdirty(pp, flags)) { 947*7c478bd9Sstevel@tonic-gate error = (*putapage)(vp, pp, NULL, NULL, flags, cred); 948*7c478bd9Sstevel@tonic-gate if (!err) 949*7c478bd9Sstevel@tonic-gate err = error; 950*7c478bd9Sstevel@tonic-gate } 951*7c478bd9Sstevel@tonic-gate mutex_enter(vphm); 952*7c478bd9Sstevel@tonic-gate } 953*7c478bd9Sstevel@tonic-gate page_vpsub(&vp->v_pages, mark); 954*7c478bd9Sstevel@tonic-gate page_vpsub(&vp->v_pages, end); 955*7c478bd9Sstevel@tonic-gate 956*7c478bd9Sstevel@tonic-gate leave: 957*7c478bd9Sstevel@tonic-gate /* 958*7c478bd9Sstevel@tonic-gate * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds 959*7c478bd9Sstevel@tonic-gate */ 960*7c478bd9Sstevel@tonic-gate mutex_exit(vphm); 961*7c478bd9Sstevel@tonic-gate kmem_cache_free(marker_cache, mark); 962*7c478bd9Sstevel@tonic-gate kmem_cache_free(marker_cache, end); 963*7c478bd9Sstevel@tonic-gate mutex_enter(&vp->v_lock); 964*7c478bd9Sstevel@tonic-gate vp->v_flag &= ~VVMLOCK; 965*7c478bd9Sstevel@tonic-gate cv_broadcast(&vp->v_cv); 966*7c478bd9Sstevel@tonic-gate mutex_exit(&vp->v_lock); 967*7c478bd9Sstevel@tonic-gate return (err); 968*7c478bd9Sstevel@tonic-gate } 969*7c478bd9Sstevel@tonic-gate 970*7c478bd9Sstevel@tonic-gate /* 971*7c478bd9Sstevel@tonic-gate * Zero out zbytes worth of data. Caller should be aware that this 972*7c478bd9Sstevel@tonic-gate * routine may enter back into the fs layer (xxx_getpage). Locks 973*7c478bd9Sstevel@tonic-gate * that the xxx_getpage routine may need should not be held while 974*7c478bd9Sstevel@tonic-gate * calling this. 
975*7c478bd9Sstevel@tonic-gate */ 976*7c478bd9Sstevel@tonic-gate void 977*7c478bd9Sstevel@tonic-gate pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes) 978*7c478bd9Sstevel@tonic-gate { 979*7c478bd9Sstevel@tonic-gate caddr_t addr; 980*7c478bd9Sstevel@tonic-gate 981*7c478bd9Sstevel@tonic-gate ASSERT(vp->v_type != VCHR); 982*7c478bd9Sstevel@tonic-gate 983*7c478bd9Sstevel@tonic-gate if (vp->v_pages == NULL) 984*7c478bd9Sstevel@tonic-gate return; 985*7c478bd9Sstevel@tonic-gate 986*7c478bd9Sstevel@tonic-gate /* 987*7c478bd9Sstevel@tonic-gate * zbytes may be zero but there still may be some portion of 988*7c478bd9Sstevel@tonic-gate * a page which needs clearing (since zbytes is a function 989*7c478bd9Sstevel@tonic-gate * of filesystem block size, not pagesize.) 990*7c478bd9Sstevel@tonic-gate */ 991*7c478bd9Sstevel@tonic-gate if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0) 992*7c478bd9Sstevel@tonic-gate return; 993*7c478bd9Sstevel@tonic-gate 994*7c478bd9Sstevel@tonic-gate /* 995*7c478bd9Sstevel@tonic-gate * We get the last page and handle the partial 996*7c478bd9Sstevel@tonic-gate * zeroing via kernel mappings. This will make the page 997*7c478bd9Sstevel@tonic-gate * dirty so that we know that when this page is written 998*7c478bd9Sstevel@tonic-gate * back, the zeroed information will go out with it. If 999*7c478bd9Sstevel@tonic-gate * the page is not currently in memory, then the kzero 1000*7c478bd9Sstevel@tonic-gate * operation will cause it to be brought it. We use kzero 1001*7c478bd9Sstevel@tonic-gate * instead of bzero so that if the page cannot be read in 1002*7c478bd9Sstevel@tonic-gate * for any reason, the system will not panic. We need 1003*7c478bd9Sstevel@tonic-gate * to zero out a minimum of the fs given zbytes, but we 1004*7c478bd9Sstevel@tonic-gate * might also have to do more to get the entire last page. 
1005*7c478bd9Sstevel@tonic-gate */ 1006*7c478bd9Sstevel@tonic-gate 1007*7c478bd9Sstevel@tonic-gate if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE) 1008*7c478bd9Sstevel@tonic-gate panic("pvn_vptrunc zbytes"); 1009*7c478bd9Sstevel@tonic-gate addr = segmap_getmapflt(segkmap, vp, vplen, 1010*7c478bd9Sstevel@tonic-gate MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE); 1011*7c478bd9Sstevel@tonic-gate (void) kzero(addr + (vplen & MAXBOFFSET), 1012*7c478bd9Sstevel@tonic-gate MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET))); 1013*7c478bd9Sstevel@tonic-gate (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC); 1014*7c478bd9Sstevel@tonic-gate } 1015*7c478bd9Sstevel@tonic-gate 1016*7c478bd9Sstevel@tonic-gate /* 1017*7c478bd9Sstevel@tonic-gate * Handles common work of the VOP_GETPAGE routines when more than 1018*7c478bd9Sstevel@tonic-gate * one page must be returned by calling a file system specific operation 1019*7c478bd9Sstevel@tonic-gate * to do most of the work. Must be called with the vp already locked 1020*7c478bd9Sstevel@tonic-gate * by the VOP_GETPAGE routine. 
1021*7c478bd9Sstevel@tonic-gate */ 1022*7c478bd9Sstevel@tonic-gate int 1023*7c478bd9Sstevel@tonic-gate pvn_getpages( 1024*7c478bd9Sstevel@tonic-gate int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[], 1025*7c478bd9Sstevel@tonic-gate size_t, struct seg *, caddr_t, enum seg_rw, cred_t *), 1026*7c478bd9Sstevel@tonic-gate struct vnode *vp, 1027*7c478bd9Sstevel@tonic-gate u_offset_t off, 1028*7c478bd9Sstevel@tonic-gate size_t len, 1029*7c478bd9Sstevel@tonic-gate uint_t *protp, 1030*7c478bd9Sstevel@tonic-gate page_t *pl[], 1031*7c478bd9Sstevel@tonic-gate size_t plsz, 1032*7c478bd9Sstevel@tonic-gate struct seg *seg, 1033*7c478bd9Sstevel@tonic-gate caddr_t addr, 1034*7c478bd9Sstevel@tonic-gate enum seg_rw rw, 1035*7c478bd9Sstevel@tonic-gate struct cred *cred) 1036*7c478bd9Sstevel@tonic-gate { 1037*7c478bd9Sstevel@tonic-gate page_t **ppp; 1038*7c478bd9Sstevel@tonic-gate u_offset_t o, eoff; 1039*7c478bd9Sstevel@tonic-gate size_t sz, xlen; 1040*7c478bd9Sstevel@tonic-gate int err; 1041*7c478bd9Sstevel@tonic-gate 1042*7c478bd9Sstevel@tonic-gate ASSERT(plsz >= len); /* insure that we have enough space */ 1043*7c478bd9Sstevel@tonic-gate 1044*7c478bd9Sstevel@tonic-gate /* 1045*7c478bd9Sstevel@tonic-gate * Loop one page at a time and let getapage function fill 1046*7c478bd9Sstevel@tonic-gate * in the next page in array. We only allow one page to be 1047*7c478bd9Sstevel@tonic-gate * returned at a time (except for the last page) so that we 1048*7c478bd9Sstevel@tonic-gate * don't have any problems with duplicates and other such 1049*7c478bd9Sstevel@tonic-gate * painful problems. This is a very simple minded algorithm, 1050*7c478bd9Sstevel@tonic-gate * but it does the job correctly. We hope that the cost of a 1051*7c478bd9Sstevel@tonic-gate * getapage call for a resident page that we might have been 1052*7c478bd9Sstevel@tonic-gate * able to get from an earlier call doesn't cost too much. 
1053*7c478bd9Sstevel@tonic-gate */ 1054*7c478bd9Sstevel@tonic-gate ppp = pl; 1055*7c478bd9Sstevel@tonic-gate sz = PAGESIZE; 1056*7c478bd9Sstevel@tonic-gate eoff = off + len; 1057*7c478bd9Sstevel@tonic-gate xlen = len; 1058*7c478bd9Sstevel@tonic-gate for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE, 1059*7c478bd9Sstevel@tonic-gate xlen -= PAGESIZE) { 1060*7c478bd9Sstevel@tonic-gate if (o + PAGESIZE >= eoff) { 1061*7c478bd9Sstevel@tonic-gate /* 1062*7c478bd9Sstevel@tonic-gate * Last time through - allow the all of 1063*7c478bd9Sstevel@tonic-gate * what's left of the pl[] array to be used. 1064*7c478bd9Sstevel@tonic-gate */ 1065*7c478bd9Sstevel@tonic-gate sz = plsz - (o - off); 1066*7c478bd9Sstevel@tonic-gate } 1067*7c478bd9Sstevel@tonic-gate err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr, 1068*7c478bd9Sstevel@tonic-gate rw, cred); 1069*7c478bd9Sstevel@tonic-gate if (err) { 1070*7c478bd9Sstevel@tonic-gate /* 1071*7c478bd9Sstevel@tonic-gate * Release any pages we already got. 1072*7c478bd9Sstevel@tonic-gate */ 1073*7c478bd9Sstevel@tonic-gate if (o > off && pl != NULL) { 1074*7c478bd9Sstevel@tonic-gate for (ppp = pl; *ppp != NULL; *ppp++ = NULL) 1075*7c478bd9Sstevel@tonic-gate (void) page_release(*ppp, 1); 1076*7c478bd9Sstevel@tonic-gate } 1077*7c478bd9Sstevel@tonic-gate break; 1078*7c478bd9Sstevel@tonic-gate } 1079*7c478bd9Sstevel@tonic-gate if (pl != NULL) 1080*7c478bd9Sstevel@tonic-gate ppp++; 1081*7c478bd9Sstevel@tonic-gate } 1082*7c478bd9Sstevel@tonic-gate return (err); 1083*7c478bd9Sstevel@tonic-gate } 1084*7c478bd9Sstevel@tonic-gate 1085*7c478bd9Sstevel@tonic-gate /* 1086*7c478bd9Sstevel@tonic-gate * Initialize the page list array. 
1087*7c478bd9Sstevel@tonic-gate */ 1088*7c478bd9Sstevel@tonic-gate void 1089*7c478bd9Sstevel@tonic-gate pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz, 1090*7c478bd9Sstevel@tonic-gate u_offset_t off, size_t io_len, enum seg_rw rw) 1091*7c478bd9Sstevel@tonic-gate { 1092*7c478bd9Sstevel@tonic-gate ssize_t sz; 1093*7c478bd9Sstevel@tonic-gate page_t *ppcur, **ppp; 1094*7c478bd9Sstevel@tonic-gate 1095*7c478bd9Sstevel@tonic-gate if (plsz >= io_len) { 1096*7c478bd9Sstevel@tonic-gate /* 1097*7c478bd9Sstevel@tonic-gate * Everything fits, set up to load 1098*7c478bd9Sstevel@tonic-gate * all the pages. 1099*7c478bd9Sstevel@tonic-gate */ 1100*7c478bd9Sstevel@tonic-gate sz = io_len; 1101*7c478bd9Sstevel@tonic-gate } else { 1102*7c478bd9Sstevel@tonic-gate /* 1103*7c478bd9Sstevel@tonic-gate * Set up to load plsz worth 1104*7c478bd9Sstevel@tonic-gate * starting at the needed page. 1105*7c478bd9Sstevel@tonic-gate */ 1106*7c478bd9Sstevel@tonic-gate while (pp->p_offset != off) { 1107*7c478bd9Sstevel@tonic-gate /* XXX - Do we need this assert? */ 1108*7c478bd9Sstevel@tonic-gate ASSERT(pp->p_next->p_offset != 1109*7c478bd9Sstevel@tonic-gate pp->p_offset); 1110*7c478bd9Sstevel@tonic-gate /* 1111*7c478bd9Sstevel@tonic-gate * Remove page from the i/o list, 1112*7c478bd9Sstevel@tonic-gate * release the i/o and the page lock. 1113*7c478bd9Sstevel@tonic-gate */ 1114*7c478bd9Sstevel@tonic-gate ppcur = pp; 1115*7c478bd9Sstevel@tonic-gate page_sub(&pp, ppcur); 1116*7c478bd9Sstevel@tonic-gate page_io_unlock(ppcur); 1117*7c478bd9Sstevel@tonic-gate (void) page_release(ppcur, 1); 1118*7c478bd9Sstevel@tonic-gate } 1119*7c478bd9Sstevel@tonic-gate sz = plsz; 1120*7c478bd9Sstevel@tonic-gate } 1121*7c478bd9Sstevel@tonic-gate 1122*7c478bd9Sstevel@tonic-gate /* 1123*7c478bd9Sstevel@tonic-gate * Initialize the page list array. 
1124*7c478bd9Sstevel@tonic-gate */ 1125*7c478bd9Sstevel@tonic-gate ppp = pl; 1126*7c478bd9Sstevel@tonic-gate do { 1127*7c478bd9Sstevel@tonic-gate ppcur = pp; 1128*7c478bd9Sstevel@tonic-gate *ppp++ = ppcur; 1129*7c478bd9Sstevel@tonic-gate page_sub(&pp, ppcur); 1130*7c478bd9Sstevel@tonic-gate page_io_unlock(ppcur); 1131*7c478bd9Sstevel@tonic-gate if (rw != S_CREATE) 1132*7c478bd9Sstevel@tonic-gate page_downgrade(ppcur); 1133*7c478bd9Sstevel@tonic-gate sz -= PAGESIZE; 1134*7c478bd9Sstevel@tonic-gate } while (sz > 0 && pp != NULL); 1135*7c478bd9Sstevel@tonic-gate *ppp = NULL; /* terminate list */ 1136*7c478bd9Sstevel@tonic-gate 1137*7c478bd9Sstevel@tonic-gate /* 1138*7c478bd9Sstevel@tonic-gate * Now free the remaining pages that weren't 1139*7c478bd9Sstevel@tonic-gate * loaded in the page list. 1140*7c478bd9Sstevel@tonic-gate */ 1141*7c478bd9Sstevel@tonic-gate while (pp != NULL) { 1142*7c478bd9Sstevel@tonic-gate ppcur = pp; 1143*7c478bd9Sstevel@tonic-gate page_sub(&pp, ppcur); 1144*7c478bd9Sstevel@tonic-gate page_io_unlock(ppcur); 1145*7c478bd9Sstevel@tonic-gate (void) page_release(ppcur, 1); 1146*7c478bd9Sstevel@tonic-gate } 1147*7c478bd9Sstevel@tonic-gate } 1148