1a5652762Spraks /* 2a5652762Spraks * CDDL HEADER START 3a5652762Spraks * 4a5652762Spraks * The contents of this file are subject to the terms of the 5a5652762Spraks * Common Development and Distribution License (the "License"). 6a5652762Spraks * You may not use this file except in compliance with the License. 7a5652762Spraks * 8a5652762Spraks * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9a5652762Spraks * or http://www.opensolaris.org/os/licensing. 10a5652762Spraks * See the License for the specific language governing permissions 11a5652762Spraks * and limitations under the License. 12a5652762Spraks * 13a5652762Spraks * When distributing Covered Code, include this CDDL HEADER in each 14a5652762Spraks * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15a5652762Spraks * If applicable, add the following below this CDDL HEADER, with the 16a5652762Spraks * fields enclosed by brackets "[]" replaced with your own identifying 17a5652762Spraks * information: Portions Copyright [yyyy] [name of copyright owner] 18a5652762Spraks * 19a5652762Spraks * CDDL HEADER END 20a5652762Spraks */ 21a5652762Spraks /* 22a5652762Spraks * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23a5652762Spraks * Use is subject to license terms. 24a5652762Spraks */ 25a5652762Spraks 26a5652762Spraks #pragma ident "%Z%%M% %I% %E% SMI" 27a5652762Spraks 28a5652762Spraks /* 29a5652762Spraks * VM - generic vnode page mapping interfaces. 30a5652762Spraks * 31a5652762Spraks * Mechanism to provide temporary mappings to vnode pages. 32a5652762Spraks * The typical use would be to copy/access file data. 
33a5652762Spraks */ 34a5652762Spraks 35a5652762Spraks #include <sys/types.h> 36a5652762Spraks #include <sys/t_lock.h> 37a5652762Spraks #include <sys/param.h> 38a5652762Spraks #include <sys/sysmacros.h> 39a5652762Spraks #include <sys/buf.h> 40a5652762Spraks #include <sys/systm.h> 41a5652762Spraks #include <sys/vnode.h> 42a5652762Spraks #include <sys/mman.h> 43a5652762Spraks #include <sys/errno.h> 44a5652762Spraks #include <sys/cred.h> 45a5652762Spraks #include <sys/kmem.h> 46a5652762Spraks #include <sys/vtrace.h> 47a5652762Spraks #include <sys/cmn_err.h> 48a5652762Spraks #include <sys/debug.h> 49a5652762Spraks #include <sys/thread.h> 50a5652762Spraks #include <sys/dumphdr.h> 51a5652762Spraks #include <sys/bitmap.h> 52a5652762Spraks #include <sys/lgrp.h> 53a5652762Spraks 54a5652762Spraks #include <vm/seg_kmem.h> 55a5652762Spraks #include <vm/hat.h> 56a5652762Spraks #include <vm/as.h> 57a5652762Spraks #include <vm/seg.h> 58a5652762Spraks #include <vm/seg_kpm.h> 59a5652762Spraks #include <vm/seg_map.h> 60a5652762Spraks #include <vm/page.h> 61a5652762Spraks #include <vm/pvn.h> 62a5652762Spraks #include <vm/rm.h> 63a5652762Spraks #include <vm/vpm.h> 64a5652762Spraks 65a5652762Spraks /* 66a5652762Spraks * Needs to be enabled by each platform. 
 */
int vpm_enable = 0;		/* master switch; left 0 until a platform enables vpm */

#ifdef	SEGKPM_SUPPORT


int	vpm_cache_enable = 1;	/* can be cleared in /etc/system to disable */
long	vpm_cache_percent = 12;	/* percent of physmem used for the cache */
long	vpm_cache_size;		/* computed cache size (bytes), see vpm_init() */
int	vpm_nfreelist = 0;	/* number of freelists; 0 => use max_ncpus */
int	vpmd_freemsk = 0;	/* vpm_nfreelist - 1; mask to pick a freelist */

/*
 * Per-cpu state.  The union pads each entry to VPM_S_PAD bytes,
 * presumably to keep each cpu's counters in separate cache lines
 * (NOTE(review): padding intent inferred from the layout; confirm).
 */
#define	VPM_S_PAD	64
union vpm_cpu {
	struct {
		int	vcpu_free_ndx;	/* rotor index into the freelists */
		ulong_t	vcpu_hits;	/* page already had a cached vpmap */
		ulong_t	vcpu_misses;	/* had to take a vpmap off a freelist */
	} vcpu;
	char vpm_pad[VPM_S_PAD];
};
static union vpm_cpu	*vpmd_cpu;	/* array of max_ncpus entries */

#define	vfree_ndx	vcpu.vcpu_free_ndx

int	vpm_cachemode = VPMCACHE_LRU;	/* freelist selection policy */

/* Per-page mutex used to serialize updates of pp->p_vpmref. */
#define	PPMTX(pp) (&(pp)->p_ilock)

static struct vpmap *vpmd_vpmap;	/* list of vpmap structs preallocated */
static struct vpmfree *vpmd_free;	/* the vpmap freelists */

/*
 * vpmap <-> freelist/id conversions.  VPMID()/VPMP() use 1-based ids so
 * that a p_vpmref of 0 can mean "no vpmap assigned".
 */
#define	VPMAPMTX(vpm)	(&vpm->vpm_mtx)
#define	VPMAP2VMF(vpm)	(&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk])
#define	VPMAP2VMF_NDX(vpm)	(ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk)
#define	VPMP(id)	(&vpmd_vpmap[id - 1])
#define	VPMID(vpm)	(uint_t)((vpm - vpmd_vpmap) + 1)


#ifdef	DEBUG

/* Event counters, only maintained on DEBUG kernels. */
struct vpm_debug {
	int vpmd_steals;
	int vpmd_contend;
	int vpmd_prevpagelocked;
	int vpmd_getpagefailed;
	int vpmd_zerostart;
	int vpmd_emptyfreelist;
	int vpmd_nofreevpms;
} vpm_debug;

#define	VPM_DEBUG(x)	((vpm_debug.x)++)

/*
 * Mean-time-between-failure style counters: on DEBUG kernels
 * VPM_MTBF() periodically returns 0 so the rare steal/contend
 * paths get exercised.  On non-DEBUG kernels it is always 1.
 */
int	steals;
int	steals_mtbf = 7;
int	contend;
int	contend_mtbf = 127;

#define	VPM_MTBF(v, f)	(((++(v)) & (f)) != (f))

#else	/* DEBUG */

#define	VPM_MTBF(v, f)	(1)
#define	VPM_DEBUG(x)	/* nothing */

#endif

/*
 * The vpm cache.
 *
 * The main purpose of having a cache here is to speed up page_lookup()
 * operations and also provide an LRU(default) behaviour of file pages. The
 * page_lookup() operation tends to be expensive if a page has to be
 * reclaimed from the system page cache("cachelist"). Once we speed up the
 * page_lookup()->page_reclaim() path then there should be no need for
 * this cache. The system page cache(cachelist) should effectively serve the
 * purpose of caching file pages.
 *
 * This cache is very similar to segmap's smap cache. Each page in the
 * cache is tracked by the structure vpmap_t. But unlike segmap, there is no
 * hash table. The page_t has a reference to the vpmap_t when cached. For a
 * given vnode, offset the page is found by means of a page_lookup() operation.
 * Any page which has a mapping(i.e when cached) will not be in the
 * system 'cachelist'. Hence the page_lookup() will not have to do a
 * page_reclaim(). That is how the cache serves to speed up page_lookup()
 * operations.
 *
 * This cache can be disabled by setting vpm_cache_enable = 0 in /etc/system.
154a5652762Spraks */ 155a5652762Spraks 156a5652762Spraks void 157a5652762Spraks vpm_init() 158a5652762Spraks { 159a5652762Spraks long npages; 160a5652762Spraks struct vpmap *vpm; 161a5652762Spraks struct vpmfree *vpmflp; 162a5652762Spraks int i, ndx; 163a5652762Spraks extern void prefetch_smap_w(void *); 164a5652762Spraks 165a5652762Spraks if (!vpm_cache_enable) { 166a5652762Spraks return; 167a5652762Spraks } 168a5652762Spraks 169a5652762Spraks /* 170a5652762Spraks * Set the size of the cache. 171a5652762Spraks */ 172a5652762Spraks vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100); 173a5652762Spraks if (vpm_cache_size < VPMAP_MINCACHE) { 174a5652762Spraks vpm_cache_size = VPMAP_MINCACHE; 175a5652762Spraks } 176a5652762Spraks 177a5652762Spraks /* 178a5652762Spraks * Number of freelists. 179a5652762Spraks */ 180a5652762Spraks if (vpm_nfreelist == 0) { 181a5652762Spraks vpm_nfreelist = max_ncpus; 182a5652762Spraks } else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) { 183a5652762Spraks cmn_err(CE_WARN, "vpmap create : number of freelist " 184a5652762Spraks "vpm_nfreelist %d using %d", vpm_nfreelist, max_ncpus); 185a5652762Spraks vpm_nfreelist = 2 * max_ncpus; 186a5652762Spraks } 187a5652762Spraks 188a5652762Spraks /* 189a5652762Spraks * Round it up to the next power of 2 190a5652762Spraks */ 191a5652762Spraks if (vpm_nfreelist & (vpm_nfreelist - 1)) { 192a5652762Spraks vpm_nfreelist = 1 << (highbit(vpm_nfreelist)); 193a5652762Spraks } 194a5652762Spraks vpmd_freemsk = vpm_nfreelist - 1; 195a5652762Spraks 196a5652762Spraks /* 197a5652762Spraks * Use a per cpu rotor index to spread the allocations evenly 198a5652762Spraks * across the available vpm freelists. 
199a5652762Spraks */ 200a5652762Spraks vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP); 201a5652762Spraks ndx = 0; 202a5652762Spraks for (i = 0; i < max_ncpus; i++) { 203a5652762Spraks 204a5652762Spraks vpmd_cpu[i].vfree_ndx = ndx; 205a5652762Spraks ndx = (ndx + 1) & vpmd_freemsk; 206a5652762Spraks } 207a5652762Spraks 208a5652762Spraks /* 209a5652762Spraks * Allocate and initialize the freelist. 210a5652762Spraks */ 211a5652762Spraks vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree), 212a5652762Spraks KM_SLEEP); 213a5652762Spraks for (i = 0; i < vpm_nfreelist; i++) { 214a5652762Spraks 215a5652762Spraks vpmflp = &vpmd_free[i]; 216a5652762Spraks /* 217a5652762Spraks * Set up initial queue pointers. They will get flipped 218a5652762Spraks * back and forth. 219a5652762Spraks */ 220a5652762Spraks vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ]; 221a5652762Spraks vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ]; 222a5652762Spraks } 223a5652762Spraks 224a5652762Spraks npages = mmu_btop(vpm_cache_size); 225a5652762Spraks 226a5652762Spraks 227a5652762Spraks /* 228a5652762Spraks * Allocate and initialize the vpmap structs. 229a5652762Spraks */ 230a5652762Spraks vpmd_vpmap = kmem_zalloc(sizeof (struct vpmap) * npages, KM_SLEEP); 231a5652762Spraks for (vpm = vpmd_vpmap; vpm <= &vpmd_vpmap[npages - 1]; vpm++) { 232a5652762Spraks struct vpmfree *vpmflp; 233a5652762Spraks union vpm_freeq *releq; 234a5652762Spraks struct vpmap *vpmapf; 235a5652762Spraks 236a5652762Spraks /* 237a5652762Spraks * Use prefetch as we have to walk thru a large number of 238a5652762Spraks * these data structures. We just use the smap's prefetch 239a5652762Spraks * routine as it does the same. This should work fine 240a5652762Spraks * for x64(this needs to be modifed when enabled on sparc). 
241a5652762Spraks */ 242a5652762Spraks prefetch_smap_w((void *)vpm); 243a5652762Spraks 244a5652762Spraks vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm); 245a5652762Spraks 246a5652762Spraks vpmflp = VPMAP2VMF(vpm); 247a5652762Spraks releq = vpmflp->vpm_releq; 248a5652762Spraks 249a5652762Spraks vpmapf = releq->vpmq_free; 250a5652762Spraks if (vpmapf == NULL) { 251a5652762Spraks releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm; 252a5652762Spraks } else { 253a5652762Spraks vpm->vpm_next = vpmapf; 254a5652762Spraks vpm->vpm_prev = vpmapf->vpm_prev; 255a5652762Spraks vpmapf->vpm_prev = vpm; 256a5652762Spraks vpm->vpm_prev->vpm_next = vpm; 257a5652762Spraks releq->vpmq_free = vpm->vpm_next; 258a5652762Spraks } 259a5652762Spraks 260a5652762Spraks /* 261a5652762Spraks * Indicate that the vpmap is on the releq at start 262a5652762Spraks */ 263a5652762Spraks vpm->vpm_ndxflg = VPMRELEQ; 264a5652762Spraks } 265a5652762Spraks } 266a5652762Spraks 267a5652762Spraks 268a5652762Spraks /* 269a5652762Spraks * unhooks vpm from the freelist if it is still on the freelist. 
270a5652762Spraks */ 271a5652762Spraks #define VPMAP_RMFREELIST(vpm) \ 272a5652762Spraks { \ 273a5652762Spraks if (vpm->vpm_next != NULL) { \ 274a5652762Spraks union vpm_freeq *freeq; \ 275a5652762Spraks struct vpmfree *vpmflp; \ 276a5652762Spraks vpmflp = &vpmd_free[vpm->vpm_free_ndx]; \ 277a5652762Spraks freeq = &vpmflp->vpm_freeq[vpm->vpm_ndxflg]; \ 278a5652762Spraks mutex_enter(&freeq->vpmq_mtx); \ 279a5652762Spraks if (freeq->vpmq_free != vpm) { \ 280a5652762Spraks vpm->vpm_prev->vpm_next = vpm->vpm_next; \ 281a5652762Spraks vpm->vpm_next->vpm_prev = vpm->vpm_prev; \ 282a5652762Spraks } else if (vpm == vpm->vpm_next) { \ 283a5652762Spraks freeq->vpmq_free = NULL; \ 284a5652762Spraks } else { \ 285a5652762Spraks freeq->vpmq_free = vpm->vpm_next; \ 286a5652762Spraks vpm->vpm_prev->vpm_next = vpm->vpm_next; \ 287a5652762Spraks vpm->vpm_next->vpm_prev = vpm->vpm_prev; \ 288a5652762Spraks } \ 289a5652762Spraks mutex_exit(&freeq->vpmq_mtx); \ 290a5652762Spraks vpm->vpm_next = vpm->vpm_prev = NULL; \ 291a5652762Spraks } \ 292a5652762Spraks } 293a5652762Spraks 294a5652762Spraks static int 295a5652762Spraks get_freelndx(int mode) 296a5652762Spraks { 297a5652762Spraks int ndx; 298a5652762Spraks 299a5652762Spraks ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk; 300a5652762Spraks switch (mode) { 301a5652762Spraks 302a5652762Spraks case VPMCACHE_LRU: 303a5652762Spraks default: 304a5652762Spraks vpmd_cpu[CPU->cpu_seqid].vfree_ndx++; 305a5652762Spraks break; 306a5652762Spraks } 307a5652762Spraks return (ndx); 308a5652762Spraks } 309a5652762Spraks 310a5652762Spraks 311a5652762Spraks /* 312a5652762Spraks * Find one vpmap structure from the free lists and use it for the newpage. 313a5652762Spraks * The previous page it cached is dissociated and released. The page_t's 314a5652762Spraks * p_vpmref is cleared only when the vpm it is pointing to is locked(or 315a5652762Spraks * for AMD64 when the page is exclusively locked in page_unload. 
That is
 * because the p_vpmref is treated as mapping).
 *
 * The page's p_vpmref is set when the page is
 * locked(at least SHARED locked).
 *
 * Returns with the selected vpmap's mutex held (see get_vpmap()).
 */
static struct vpmap *
get_free_vpmap(page_t *newpage)
{
	struct vpmfree *vpmflp;
	kmutex_t *vmtx;
	struct vpmap *vpm, *first;
	union vpm_freeq *allocq, *releq;
	page_t *pp = NULL;
	int end_ndx, page_locked = 0;
	int free_ndx;

	/*
	 * get the freelist bin index.
	 */
	free_ndx = get_freelndx(vpm_cachemode);

	end_ndx = free_ndx;
	vpmflp = &vpmd_free[free_ndx];

retry_queue:
	allocq = vpmflp->vpm_allocq;
	mutex_enter(&allocq->vpmq_mtx);

	if ((vpm = allocq->vpmq_free) == NULL) {

skip_queue:
		/*
		 * The alloc list is empty or this queue is being skipped;
		 * first see if the allocq toggled.
		 */
		if (vpmflp->vpm_allocq != allocq) {
			/* queue changed */
			mutex_exit(&allocq->vpmq_mtx);
			goto retry_queue;
		}
		releq = vpmflp->vpm_releq;
		if (!mutex_tryenter(&releq->vpmq_mtx)) {
			/* cannot get releq; a free vpmap may be there now */
			mutex_exit(&allocq->vpmq_mtx);

			/*
			 * This loop could spin forever if this thread has
			 * higher priority than the thread that is holding
			 * releq->vpmq_mtx. In order to force the other thread
			 * to run, we'll lock/unlock the mutex which is safe
			 * since we just unlocked the allocq mutex.
			 */
			mutex_enter(&releq->vpmq_mtx);
			mutex_exit(&releq->vpmq_mtx);
			goto retry_queue;
		}
		if (releq->vpmq_free == NULL) {
			VPM_DEBUG(vpmd_emptyfreelist);
			/*
			 * This freelist is empty.
			 * This should not happen unless clients
			 * are failing to release the vpmap after
			 * accessing the data. Before resorting
			 * to sleeping, try the next list of the same color.
			 */
			free_ndx = (free_ndx + 1) & vpmd_freemsk;
			if (free_ndx != end_ndx) {
				mutex_exit(&releq->vpmq_mtx);
				mutex_exit(&allocq->vpmq_mtx);
				vpmflp = &vpmd_free[free_ndx];
				goto retry_queue;
			}
			/*
			 * Tried all freelists.
			 * wait on this list and hope something gets freed.
			 * At this point both freeq mutexes are held; drop
			 * freeq[1] and sleep on the cv with freeq[0] held,
			 * which is the same mutex free_vpmap() signals under.
			 */
			vpmflp->vpm_want++;
			mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx);
			cv_wait(&vpmflp->vpm_free_cv,
			    &vpmflp->vpm_freeq[0].vpmq_mtx);
			vpmflp->vpm_want--;
			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
			vpmflp = &vpmd_free[free_ndx];
			VPM_DEBUG(vpmd_nofreevpms);
			goto retry_queue;
		} else {
			/*
			 * Something on the rele queue; flip the alloc
			 * and rele queues and retry.
			 */
			vpmflp->vpm_allocq = releq;
			vpmflp->vpm_releq = allocq;
			mutex_exit(&allocq->vpmq_mtx);
			mutex_exit(&releq->vpmq_mtx);
			if (page_locked) {
				delay(hz >> 2);
				page_locked = 0;
			}
			goto retry_queue;
		}
	} else {
		int gotnewvpm;
		kmutex_t *pmtx;
		uint_t vpmref;

		/*
		 * Fastpath the case we get the vpmap mutex
		 * on the first try.
		 */
		first = vpm;
next_vpmap:
		vmtx = VPMAPMTX(vpm);
		if (!mutex_tryenter(vmtx)) {
			/*
			 * Another thread is trying to reclaim this slot.
			 * Skip to the next queue or vpmap.
			 */
			if ((vpm = vpm->vpm_next) == first) {
				goto skip_queue;
			} else {
				goto next_vpmap;
			}
		}

		/*
		 * Assign this vpm to the newpage.
		 */
		pmtx = PPMTX(newpage);
		gotnewvpm = 0;
		mutex_enter(pmtx);

		/*
		 * Check if some other thread already assigned a vpm to
		 * this page.
		 */
		if ((vpmref = newpage->p_vpmref) == 0) {
			newpage->p_vpmref = VPMID(vpm);
			gotnewvpm = 1;
		} else {
			VPM_DEBUG(vpmd_contend);
			mutex_exit(vmtx);
		}
		mutex_exit(pmtx);

		if (gotnewvpm) {

			/*
			 * At this point, we've selected the vpm. Remove vpm
			 * from its freelist. If vpm is the first one in
			 * the freelist, update the head of the freelist.
			 */
			if (first == vpm) {
				ASSERT(first == allocq->vpmq_free);
				allocq->vpmq_free = vpm->vpm_next;
			}

			/*
			 * If the head of the freelist still points to vpm,
			 * then there are no more free vpmaps in that list.
			 */
			if (allocq->vpmq_free == vpm)
				/*
				 * Took the last one
				 */
				allocq->vpmq_free = NULL;
			else {
				vpm->vpm_prev->vpm_next = vpm->vpm_next;
				vpm->vpm_next->vpm_prev = vpm->vpm_prev;
			}
			mutex_exit(&allocq->vpmq_mtx);
			vpm->vpm_prev = vpm->vpm_next = NULL;

			/*
			 * Disassociate the previous page. On x64 systems
			 * p_vpmref is used as a mapping reference to the page.
			 */
			if ((pp = vpm->vpm_pp) != NULL &&
			    vpm->vpm_vp == pp->p_vnode &&
			    vpm->vpm_off == pp->p_offset) {

				pmtx = PPMTX(pp);
				if (page_trylock(pp, SE_SHARED)) {
					/*
					 * Now verify that it is the correct
					 * page. If not someone else stole it,
					 * so just unlock it and leave.
					 */
					mutex_enter(pmtx);
					if (PP_ISFREE(pp) ||
					    vpm->vpm_vp != pp->p_vnode ||
					    vpm->vpm_off != pp->p_offset ||
					    pp->p_vpmref != VPMID(vpm)) {
						mutex_exit(pmtx);

						page_unlock(pp);
					} else {
						/*
						 * Release the page.
						 */
						pp->p_vpmref = 0;
						mutex_exit(pmtx);
						hat_kpm_mapout(pp, 0,
						    hat_kpm_page2va(pp, 1));
						(void) page_release(pp, 1);
					}
				} else {
					/*
					 * If the page cannot be locked, just
					 * clear the p_vpmref and go.
					 */
					mutex_enter(pmtx);
					if (pp->p_vpmref == VPMID(vpm)) {
						pp->p_vpmref = 0;
					}
					mutex_exit(pmtx);
					VPM_DEBUG(vpmd_prevpagelocked);
				}
			}

			/*
			 * Setup vpm to point to the new page.
			 */
			vpm->vpm_pp = newpage;
			vpm->vpm_vp = newpage->p_vnode;
			vpm->vpm_off = newpage->p_offset;

		} else {
			int steal = !VPM_MTBF(steals, steals_mtbf);
			/*
			 * Page already has a vpm assigned just use that.
			 * Grab the vpm mutex and verify that it is still
			 * the correct one. The pp->p_vpmref should not change
			 * once we have the vpm mutex and the page lock.
			 */
			mutex_exit(&allocq->vpmq_mtx);
			vpm = VPMP(vpmref);
			vmtx = VPMAPMTX(vpm);
			mutex_enter(vmtx);
			if ((steal && vpm->vpm_refcnt == 0) ||
			    vpm->vpm_pp != newpage) {
				/*
				 * The vpm got stolen, retry.
				 * clear the p_vpmref.
				 */
				pmtx = PPMTX(newpage);
				mutex_enter(pmtx);
				if (newpage->p_vpmref == vpmref) {
					newpage->p_vpmref = 0;
				}
				mutex_exit(pmtx);

				mutex_exit(vmtx);
				VPM_DEBUG(vpmd_steals);
				goto retry_queue;
			} else if (vpm->vpm_refcnt == 0) {
				/*
				 * Remove it from the free list if it
				 * exists there.
				 */
				VPMAP_RMFREELIST(vpm);
			}
		}
		return (vpm);
	}
}

/*
 * Return vpm to the tail of the release queue of its freelist.  The
 * caller must hold the vpmap mutex and the refcnt must be zero (this
 * panics otherwise).  If a thread is waiting for a free vpmap
 * (vpm_want), it is signalled under the vpm_freeq[0] mutex — the same
 * mutex get_free_vpmap() sleeps on.
 */
static void
free_vpmap(struct vpmap *vpm)
{
	struct vpmfree *vpmflp;
	struct vpmap *vpmfreelist;
	union vpm_freeq *releq;

	ASSERT(MUTEX_HELD(VPMAPMTX(vpm)));

	if (vpm->vpm_refcnt != 0) {
		panic("free_vpmap");
		/*NOTREACHED*/
	}

	vpmflp = &vpmd_free[vpm->vpm_free_ndx];
	/*
	 * Add to the tail of the release queue
	 * Note that vpm_releq and vpm_allocq could toggle
	 * before we get the lock. This does not affect
	 * correctness as the 2 queues are only maintained
	 * to reduce lock pressure.
	 */
	releq = vpmflp->vpm_releq;
	if (releq == &vpmflp->vpm_freeq[0]) {
		vpm->vpm_ndxflg = 0;
	} else {
		vpm->vpm_ndxflg = 1;
	}
	mutex_enter(&releq->vpmq_mtx);
	vpmfreelist = releq->vpmq_free;
	if (vpmfreelist == 0) {
		int want;

		releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
		/*
		 * Both queue mutexes are held to set vpm_want;
		 * snapshot the value before dropping releq mutex.
		 * If vpm_want appears after the releq mutex is dropped,
		 * then the vpmap just freed is already gone.
		 */
		want = vpmflp->vpm_want;
		mutex_exit(&releq->vpmq_mtx);
		/*
		 * See if there was a waiter before dropping the releq mutex
		 * then recheck after obtaining vpm_freeq[0] mutex as
		 * the another thread may have already signaled.
		 */
		if (want) {
			mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx);
			if (vpmflp->vpm_want)
				cv_signal(&vpmflp->vpm_free_cv);
			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
		}
	} else {
		/* insert before the current head, i.e. at the tail */
		vpm->vpm_next = vpmfreelist;
		vpm->vpm_prev = vpmfreelist->vpm_prev;
		vpmfreelist->vpm_prev = vpm;
		vpm->vpm_prev->vpm_next = vpm;
		mutex_exit(&releq->vpmq_mtx);
	}
}

/*
 * Get the vpmap for the page.
 * The refcnt of this vpm is incremented.
 *
 * Fast path: the page already carries a valid p_vpmref (a cache hit),
 * in which case the vpmap is pulled off its freelist if idle.  Slow
 * path: a vpmap is taken from the freelists via get_free_vpmap().
 * The page must be at least SHARED locked on entry.
 */
static struct vpmap *
get_vpmap(page_t *pp)
{
	struct vpmap *vpm = NULL;
	kmutex_t *vmtx;
	kmutex_t *pmtx;
	unsigned int refid;

	ASSERT((pp != NULL) && PAGE_LOCKED(pp));

	if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) {
		vpm = VPMP(refid);
		vmtx = VPMAPMTX(vpm);
		mutex_enter(vmtx);
		/*
		 * Since we have the page lock and the vpm mutex, the
		 * pp->p_vpmref cannot change.
		 */
		if (vpm->vpm_pp != pp) {
			pmtx = PPMTX(pp);

			/*
			 * Clear the p_vpmref as it is incorrect.
			 * This can happen if the page was stolen.
			 * On x64 this should not happen as p_vpmref
			 * is treated as a mapping on the page. So
			 * if the page is stolen, the mapping would have
			 * been cleared in page_unload().
			 */
			mutex_enter(pmtx);
			if (pp->p_vpmref == refid)
				pp->p_vpmref = 0;
			mutex_exit(pmtx);

			mutex_exit(vmtx);
			vpm = NULL;
		} else if (vpm->vpm_refcnt == 0) {
			/*
			 * Got the vpm, remove it from the free
			 * list if it exists there.
			 */
			VPMAP_RMFREELIST(vpm);
		}
	}
	if (vpm == NULL) {
		/*
		 * get_free_vpmap() returns with the vpmap mutex held.
		 */
		vpm = get_free_vpmap(pp);
		vmtx = VPMAPMTX(vpm);
		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++;
	} else {
		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++;
	}

	vpm->vpm_refcnt++;
	mutex_exit(vmtx);

	return (vpm);
}

/* END --- vpm cache ---- */

/*
 * The vnode page mapping(vpm) interface routines.
 */

/*
 * Find or create the pages starting from baseoff for specified
 * length 'len'.
 */
/*
 * For each PAGESIZE chunk of [baseoff, baseoff + len), look the page up
 * (SE_SHARED) or create it, and fill vml[] with a kpm mapping for it.
 * The vml[] array is NULL-terminated.  'newpage', if non-NULL, is set
 * to 1 when at least one page was created (it is never cleared here —
 * callers are expected to initialize it).  Always returns 0; a failed
 * page_create_va() panics.
 */
static int
vpm_pagecreate(
	struct vnode *vp,
	u_offset_t baseoff,
	size_t len,
	vmap_t vml[],
	int nseg,
	int *newpage)
{

	page_t *pp = NULL;
	caddr_t base;
	u_offset_t off = baseoff;
	int i;
	ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);

	for (i = 0; len > 0; len -= PAGESIZE, i++) {
		struct vpmap *vpm;


		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {

			base = segkpm_create_va(off);

			/*
			 * the seg pointer passed in is just advisor. Just
			 * pass segkmap for now like segmap does with
			 * segmap_kpm enabled.
			 */
			if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
			    segkmap, base)) == NULL) {
				panic("segmap_pagecreate_vpm: "
				    "page_create failed");
				/*NOTREACHED*/
			}
			if (newpage != NULL)
				*newpage = 1;

			page_io_unlock(pp);
		}

		/*
		 * Get the vpm for this page_t.  When the cache is disabled
		 * the raw page_t is handed back instead and p_vpmref is
		 * cleared so the page carries no stale cache reference.
		 */
		if (vpm_cache_enable) {
			vpm = get_vpmap(pp);
			vml[i].vs_data = (void *)&vpm->vpm_pp;
		} else {
			vml[i].vs_data = (void *)pp;
			pp->p_vpmref = 0;
		}

		vml[i].vs_addr = hat_kpm_mapin(pp, 0);
		vml[i].vs_len = PAGESIZE;

		off += PAGESIZE;
	}
	/* NULL-terminate the scatter/gather list */
	vml[i].vs_data = NULL;
	vml[i].vs_addr = (caddr_t)NULL;
	return (0);
}


/*
 * Returns vpm mappings of pages in the range [off, off+len], where
 * len is rounded up to the PAGESIZE boundary. The list of pages and
 * the page addresses are returned in the SGL vml (vmap_t) array passed in.
 * The nseg is the number of vmap_t entries in the array.
 *
 * Currently max len allowed is MAXBSIZE therefore, it will either
 * fetch/create one or two pages depending on what is the PAGESIZE.
 *
 * The segmap's SM_LOCKPROTO usage is not supported by these interfaces.
 * For such cases, use the seg_map interfaces.
 */
int
vpm_map_pages(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	int fetchpage,
	vmap_t *vml,
	int nseg,
	int *newpage,
	enum seg_rw rw)
{
	extern struct vnode *common_specvp();
	u_offset_t baseoff;
	uint_t prot;
	caddr_t base;
	page_t *pp, *pplist[MAXVMAPS];
	struct vpmap *vpm;
	int i, error = 0;

	ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);
	baseoff = off & (offset_t)PAGEMASK;
	vml[0].vs_data = NULL;
	vml[0].vs_addr = (caddr_t)NULL;
	/*
	 * For now, lets restrict it to MAXBSIZE. XXX - We can allow
	 * len longer then MAXBSIZE, but there should be a limit
	 * which should be determined by how many pages the VOP_GETPAGE()
	 * can fetch.
	 */
	if (off + len > baseoff + MAXBSIZE) {
		panic("vpm_map_pages bad len");
		/*NOTREACHED*/
	}

	/*
	 * If this is a block device we have to be sure to use the
	 * "common" block device vnode for the mapping.
	 */
	if (vp->v_type == VBLK)
		vp = common_specvp(vp);

	/*
	 * round up len to a multiple of PAGESIZE.
	 */
	len = ((off + len - baseoff + PAGESIZE - 1) & (uintptr_t)PAGEMASK);

	if (!fetchpage)
		return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage));

	/*
	 * The third increment expression (pplist[i] = NULL, after i++)
	 * NULL-terminates pplist after each completed iteration, so the
	 * list is always terminated whether the loop ends normally or
	 * via the break below.  NOTE(review): if len rounds to 0 the
	 * loop body never runs and pplist[0] is left uninitialized —
	 * presumably callers never pass len == 0; confirm.
	 */
	for (i = 0; len > 0; len -= PAGESIZE, i++, pplist[i] = NULL) {

		pp = page_lookup(vp, baseoff, SE_SHARED);

		/*
		 * If we did not find the page or if this page was not
		 * in our cache, then let VOP_GETPAGE get all the pages.
		 * We need to call VOP_GETPAGE so that filesytems can do some
		 * (un)necessary tracking for sequential access.
		 */

		if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) ||
		    (rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF)
		    != (P_MOD | P_REF))) {
			if (pp != NULL) {
				page_unlock(pp);
			}

			/*
			 * Pass a dummy address as it will be required
			 * by page_create_va(). We pass segkmap as the seg
			 * as some file systems(UFS) check it.
			 */
			base = segkpm_create_va(baseoff);

			error = VOP_GETPAGE(vp, baseoff, len, &prot, &pplist[i],
			    len, segkmap, base, rw, CRED());
			if (error) {
				VPM_DEBUG(vpmd_getpagefailed);
				pplist[i] = NULL;
			}
			break;
		} else {
			pplist[i] = pp;
			baseoff += PAGESIZE;
		}
	}

	/*
	 * On VOP_GETPAGE failure, drop every page gathered so far and
	 * return an empty (NULL-terminated) vml to the caller.
	 */
	if (error) {
		for (i = 0; pplist[i] != NULL; i++) {
			page_unlock(pplist[i]);
			pplist[i] = NULL;
		}
		vml[0].vs_addr = NULL;
		vml[0].vs_data = NULL;
		return (error);
	}

	/*
	 * Get the vpm's for pages.
	 */
	for (i = 0; pplist[i] != NULL; i++) {
		if (vpm_cache_enable) {
			vpm = get_vpmap(pplist[i]);
			vml[i].vs_data = (void *)&(vpm->vpm_pp);
		} else {
			vml[i].vs_data = (void *)pplist[i];
			pplist[i]->p_vpmref = 0;
		}

		vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0);
		vml[i].vs_len = PAGESIZE;
	}

	/* NULL-terminate the scatter/gather list */
	vml[i].vs_data = NULL;
	vml[i].vs_addr = (caddr_t)NULL;

	return (0);
}

/*
 * Release the vpm mappings on the pages and unlock them.
915a5652762Spraks */ 916a5652762Spraks void 917a5652762Spraks vpm_unmap_pages(vmap_t vml[], enum seg_rw rw) 918a5652762Spraks { 919a5652762Spraks int i; 920a5652762Spraks struct vpmap *vpm; 921a5652762Spraks kmutex_t *mtx; 922a5652762Spraks page_t *pp; 923a5652762Spraks 924a5652762Spraks for (i = 0; vml[i].vs_data != NULL; i++) { 925a5652762Spraks ASSERT(IS_KPM_ADDR(vml[i].vs_addr)); 926a5652762Spraks 927a5652762Spraks if (vpm_cache_enable) { 928a5652762Spraks pp = *(((page_t **)vml[i].vs_data)); 929a5652762Spraks } else { 930a5652762Spraks pp = (page_t *)vml[i].vs_data; 931a5652762Spraks } 932a5652762Spraks 933a5652762Spraks /* 934a5652762Spraks * Mark page as being modified or referenced, bacause vpm pages 935a5652762Spraks * would not cause faults where it would be set normally. 936a5652762Spraks */ 937a5652762Spraks if (rw == S_WRITE) { 938a5652762Spraks hat_setrefmod(pp); 939a5652762Spraks } else { 940a5652762Spraks ASSERT(rw == S_READ); 941a5652762Spraks hat_setref(pp); 942a5652762Spraks } 943a5652762Spraks 944a5652762Spraks if (vpm_cache_enable) { 945a5652762Spraks page_unlock(pp); 946a5652762Spraks vpm = (struct vpmap *)((char *)vml[i].vs_data 947a5652762Spraks - offsetof(struct vpmap, vpm_pp)); 948a5652762Spraks mtx = VPMAPMTX(vpm); 949a5652762Spraks mutex_enter(mtx); 950a5652762Spraks 951a5652762Spraks if (--vpm->vpm_refcnt == 0) { 952a5652762Spraks free_vpmap(vpm); 953a5652762Spraks } 954a5652762Spraks mutex_exit(mtx); 955a5652762Spraks } else { 956a5652762Spraks hat_kpm_mapout(pp, 0, vml[i].vs_addr); 957a5652762Spraks (void) page_release(pp, 1); 958a5652762Spraks } 959a5652762Spraks vml[i].vs_data = NULL; 960a5652762Spraks vml[i].vs_addr = NULL; 961a5652762Spraks } 962a5652762Spraks } 963a5652762Spraks 964a5652762Spraks /* 965a5652762Spraks * Given the vp, off and the uio structure, this routine will do the 966a5652762Spraks * the copy (uiomove). If the last page created is partially written, 967a5652762Spraks * the rest of the page is zeroed out. 
It also zeros the beginning of 968a5652762Spraks * the first page till the start offset if requested(zerostart). 969a5652762Spraks * If pages are to be fetched, it will call the filesystem's getpage 970a5652762Spraks * function (VOP_GETPAGE) to get them, otherwise they will be created if 971a5652762Spraks * not already present in the page cache. 972a5652762Spraks */ 973a5652762Spraks int 974a5652762Spraks vpm_data_copy(struct vnode *vp, 975a5652762Spraks u_offset_t off, 976a5652762Spraks size_t len, 977a5652762Spraks struct uio *uio, 978a5652762Spraks int fetchpage, 979a5652762Spraks int *newpage, 980a5652762Spraks int zerostart, 981a5652762Spraks enum seg_rw rw) 982a5652762Spraks { 983a5652762Spraks int error; 984a5652762Spraks struct vmap vml[MINVMAPS]; 985a5652762Spraks enum uio_rw uiorw; 986a5652762Spraks int npages = 0; 987a5652762Spraks 988a5652762Spraks uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ; 989a5652762Spraks /* 990a5652762Spraks * 'off' will be the offset where the I/O starts. 991a5652762Spraks * We get the pages starting at the (off & PAGEMASK) 992a5652762Spraks * page boundary. 993a5652762Spraks */ 994a5652762Spraks error = vpm_map_pages(vp, off, (uint_t)len, 995a5652762Spraks fetchpage, vml, MINVMAPS, &npages, rw); 996a5652762Spraks 997a5652762Spraks if (newpage != NULL) 998a5652762Spraks *newpage = npages; 999a5652762Spraks if (!error) { 1000a5652762Spraks int i, pn, slen = len; 1001a5652762Spraks int pon = off & PAGEOFFSET; 1002a5652762Spraks 1003a5652762Spraks /* 1004a5652762Spraks * Clear from the beginning of the page to start offset 1005a5652762Spraks * if requested. 
1006a5652762Spraks */ 1007a5652762Spraks if (!fetchpage && zerostart) { 1008a5652762Spraks (void) kzero(vml[0].vs_addr, (uint_t)pon); 1009a5652762Spraks VPM_DEBUG(vpmd_zerostart); 1010a5652762Spraks } 1011a5652762Spraks 1012a5652762Spraks for (i = 0; !error && slen > 0 && 1013a5652762Spraks vml[i].vs_addr != NULL; i++) { 1014a5652762Spraks pn = (int)MIN(slen, (PAGESIZE - pon)); 1015a5652762Spraks error = uiomove(vml[i].vs_addr + pon, 1016a5652762Spraks (long)pn, uiorw, uio); 1017a5652762Spraks slen -= pn; 1018a5652762Spraks pon = 0; 1019a5652762Spraks } 1020a5652762Spraks 1021a5652762Spraks /* 1022a5652762Spraks * When new pages are created, zero out part of the 1023a5652762Spraks * page we did not copy to. 1024a5652762Spraks */ 1025a5652762Spraks if (!fetchpage && npages && 1026a5652762Spraks uio->uio_loffset < roundup(off + len, PAGESIZE)) { 1027a5652762Spraks int nzero; 1028a5652762Spraks 1029a5652762Spraks pon = (uio->uio_loffset & PAGEOFFSET); 1030a5652762Spraks nzero = PAGESIZE - pon; 1031a5652762Spraks i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE; 1032a5652762Spraks (void) kzero(vml[i].vs_addr + pon, (uint_t)nzero); 1033a5652762Spraks } 1034a5652762Spraks vpm_unmap_pages(vml, rw); 1035a5652762Spraks } 1036a5652762Spraks return (error); 1037a5652762Spraks } 1038a5652762Spraks 1039a5652762Spraks /* 1040a5652762Spraks * called to flush pages for the given vnode covering 1041a5652762Spraks * [off, off+len] range. 
1042a5652762Spraks */ 1043a5652762Spraks int 1044a5652762Spraks vpm_sync_pages(struct vnode *vp, 1045a5652762Spraks u_offset_t off, 1046a5652762Spraks size_t len, 1047a5652762Spraks uint_t flags) 1048a5652762Spraks { 1049a5652762Spraks extern struct vnode *common_specvp(); 1050a5652762Spraks int bflags = 0; 1051a5652762Spraks int error = 0; 1052a5652762Spraks size_t psize = roundup(len, PAGESIZE); 1053a5652762Spraks 1054a5652762Spraks /* 1055a5652762Spraks * If this is a block device we have to be sure to use the 1056a5652762Spraks * "common" block device vnode for the mapping. 1057a5652762Spraks */ 1058a5652762Spraks if (vp->v_type == VBLK) 1059a5652762Spraks vp = common_specvp(vp); 1060a5652762Spraks 1061a5652762Spraks if ((flags & ~SM_DONTNEED) != 0) { 1062a5652762Spraks if (flags & SM_ASYNC) 1063a5652762Spraks bflags |= B_ASYNC; 1064a5652762Spraks if (flags & SM_INVAL) 1065a5652762Spraks bflags |= B_INVAL; 1066a5652762Spraks if (flags & SM_DESTROY) 1067a5652762Spraks bflags |= (B_INVAL|B_TRUNC); 1068a5652762Spraks if (flags & SM_FREE) 1069a5652762Spraks bflags |= B_FREE; 1070a5652762Spraks if (flags & SM_DONTNEED) 1071a5652762Spraks bflags |= B_DONTNEED; 1072a5652762Spraks 1073a5652762Spraks error = VOP_PUTPAGE(vp, off, psize, bflags, CRED()); 1074a5652762Spraks } 1075a5652762Spraks 1076a5652762Spraks return (error); 1077a5652762Spraks } 1078a5652762Spraks 1079a5652762Spraks 1080a5652762Spraks #else /* SEGKPM_SUPPORT */ 1081a5652762Spraks 1082a5652762Spraks /* vpm stubs */ 1083a5652762Spraks void 1084a5652762Spraks vpm_init() 1085a5652762Spraks { 1086a5652762Spraks } 1087a5652762Spraks 1088a5652762Spraks /*ARGSUSED*/ 1089a5652762Spraks int 1090a5652762Spraks vpm_pagecreate( 1091a5652762Spraks struct vnode *vp, 1092a5652762Spraks u_offset_t baseoff, 1093a5652762Spraks size_t len, 1094a5652762Spraks vmap_t vml[], 1095a5652762Spraks int nseg, 1096a5652762Spraks int *newpage) 1097a5652762Spraks { 1098a5652762Spraks return (0); 1099a5652762Spraks } 
1100a5652762Spraks 1101a5652762Spraks /*ARGSUSED*/ 1102a5652762Spraks int 1103a5652762Spraks vpm_map_pages( 1104a5652762Spraks struct vnode *vp, 1105a5652762Spraks u_offset_t off, 1106a5652762Spraks size_t len, 1107a5652762Spraks int fetchpage, 1108a5652762Spraks vmap_t vml[], 1109a5652762Spraks int nseg, 1110a5652762Spraks int *newpage, 1111a5652762Spraks enum seg_rw rw) 1112a5652762Spraks { 1113a5652762Spraks return (0); 1114a5652762Spraks } 1115a5652762Spraks 1116a5652762Spraks /*ARGSUSED*/ 1117a5652762Spraks int 1118a5652762Spraks vpm_data_copy(struct vnode *vp, 1119a5652762Spraks u_offset_t off, 1120a5652762Spraks size_t len, 1121a5652762Spraks struct uio *uio, 1122a5652762Spraks int fetchpage, 1123a5652762Spraks int *newpage, 1124a5652762Spraks int zerostart, 1125a5652762Spraks enum seg_rw rw) 1126a5652762Spraks { 1127a5652762Spraks return (0); 1128a5652762Spraks } 1129a5652762Spraks 1130a5652762Spraks /*ARGSUSED*/ 1131a5652762Spraks void 1132a5652762Spraks vpm_unmap_pages(vmap_t vml[], enum seg_rw rw) 1133a5652762Spraks { 1134a5652762Spraks } 1135a5652762Spraks /*ARGSUSED*/ 1136a5652762Spraks int 1137a5652762Spraks vpm_sync_pages(struct vnode *vp, 1138a5652762Spraks u_offset_t off, 1139a5652762Spraks size_t len, 1140a5652762Spraks uint_t flags) 1141a5652762Spraks { 1142a5652762Spraks return (0); 1143a5652762Spraks } 1144a5652762Spraks #endif /* SEGKPM_SUPPORT */ 1145