/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - generic vnode page mapping interfaces.
 *
 * Mechanism to provide temporary mappings to vnode pages.
 * The typical use would be to copy/access file data.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mman.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/thread.h>
#include <sys/dumphdr.h>
#include <sys/bitmap.h>
#include <sys/lgrp.h>

#include <vm/seg_kmem.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kpm.h>
#include <vm/seg_map.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/rm.h>
#include <vm/vpm.h>

/*
 * Needs to be enabled by each platform.
 */
int vpm_enable = 0;

#ifdef	SEGKPM_SUPPORT


int	vpm_cache_enable = 1;
long	vpm_cache_percent = 12;
long	vpm_cache_size;
int	vpm_nfreelist = 0;
int	vpmd_freemsk = 0;

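/*
 * Each CPU gets its own entry in the array below; the entry is padded
 * to VPM_S_PAD bytes so that, on the usual 64-byte cache line size,
 * one CPU's rotor and statistics do not share a cache line with its
 * neighbour's.
 */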
#define	VPM_S_PAD	64
union vpm_cpu {
	struct {
		int	vcpu_free_ndx;
		ulong_t	vcpu_hits;
		ulong_t vcpu_misses;
	} vcpu;
	char vpm_pad[VPM_S_PAD];
};
static union vpm_cpu	*vpmd_cpu;

#define	vfree_ndx	vcpu.vcpu_free_ndx

int	vpm_cachemode = VPMCACHE_LRU;

#define	PPMTX(pp) (&(pp)->p_ilock)

static struct vpmap *vpmd_vpmap;	/* list of vpmap structs preallocated */
static struct vpmfree *vpmd_free;
#define	VPMAPMTX(vpm)	(&vpm->vpm_mtx)
#define	VPMAP2VMF(vpm)	(&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk])
#define	VPMAP2VMF_NDX(vpm)	(ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk)
#define	VPMP(id)	(&vpmd_vpmap[id - 1])
#define	VPMID(vpm)	(uint_t)((vpm - vpmd_vpmap) + 1)
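
/*
 * A vpmap id, as stored in a page's p_vpmref, is the index of the
 * vpmap in vpmd_vpmap plus one, so an id of 0 means "no vpmap
 * assigned".  VPMP() and VPMID() convert between the id and the
 * vpmap pointer.
 */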


#ifdef	DEBUG

struct	vpm_debug {
	int vpmd_steals;
	int vpmd_contend;
	int vpmd_prevpagelocked;
	int vpmd_getpagefailed;
	int vpmd_zerostart;
	int vpmd_emptyfreelist;
	int vpmd_nofreevpms;
} vpm_debug;

#define	VPM_DEBUG(x)	((vpm_debug.x)++)

int	steals;
int	steals_mtbf = 7;
int	contend;
int	contend_mtbf = 127;

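/*
 * With a mask of the form 2^k - 1 (as steals_mtbf and contend_mtbf
 * are), VPM_MTBF(v, f) evaluates to false once every 2^k calls.  The
 * DEBUG kernel uses this to periodically force the vpmap steal and
 * cache-miss slow paths so that they get exercised.
 */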
#define	VPM_MTBF(v, f)	(((++(v)) & (f)) != (f))

#else	/* DEBUG */

#define	VPM_MTBF(v, f)	(1)
#define	VPM_DEBUG(x)	/* nothing */

#endif

/*
 * The vpm cache.
 *
 * The main purpose of having a cache here is to speed up page_lookup()
 * operations and also provide an LRU (default) behaviour of file pages. The
 * page_lookup() operation tends to be expensive if a page has to be
 * reclaimed from the system page cache ("cachelist"). Once we speed up the
 * page_lookup()->page_reclaim() path then there should be no need for
 * this cache. The system page cache (cachelist) should effectively serve the
 * purpose of caching file pages.
 *
 * This cache is very similar to segmap's smap cache. Each page in the
 * cache is tracked by the structure vpmap_t. But unlike segmap, there is no
 * hash table. The page_t has a reference to the vpmap_t when cached. For a
 * given vnode and offset, the page is found by means of a page_lookup()
 * operation. Any page which has a mapping (i.e. when cached) will not be in
 * the system 'cachelist'. Hence page_lookup() will not have to do a
 * page_reclaim(). That is how the cache serves to speed up page_lookup()
 * operations.
 *
 * This cache can be disabled by setting vpm_cache_enable = 0 in /etc/system.
 */
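
/*
 * For example, a line of the form
 *
 *	set vpm_cache_enable = 0
 *
 * in /etc/system turns the cache off while leaving the vpm mapping
 * interfaces themselves available.
 */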

void
vpm_init()
{
	long  npages;
	struct vpmap *vpm;
	struct vpmfree *vpmflp;
	int i, ndx;
	extern void prefetch_smap_w(void *);

	if (!vpm_cache_enable) {
		return;
	}

	/*
	 * Set the size of the cache.
	 */
	vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100);
	if (vpm_cache_size < VPMAP_MINCACHE) {
		vpm_cache_size = VPMAP_MINCACHE;
	}

	/*
	 * Number of freelists.
	 */
	if (vpm_nfreelist == 0) {
		vpm_nfreelist = max_ncpus;
	} else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) {
		cmn_err(CE_WARN, "vpmap create : invalid number of freelists "
		    "vpm_nfreelist %d, using %d", vpm_nfreelist, 2 * max_ncpus);
		vpm_nfreelist = 2 * max_ncpus;
	}

	/*
	 * Round it up to the next power of 2
	 */
	if (vpm_nfreelist & (vpm_nfreelist - 1)) {
		vpm_nfreelist = 1 << (highbit(vpm_nfreelist));
	}
	vpmd_freemsk = vpm_nfreelist - 1;

	/*
	 * Use a per cpu rotor index to spread the allocations evenly
	 * across the available vpm freelists.
	 */
	vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP);
	ndx = 0;
	for (i = 0; i < max_ncpus; i++) {

		vpmd_cpu[i].vfree_ndx = ndx;
		ndx = (ndx + 1) & vpmd_freemsk;
	}

	/*
	 * Allocate and initialize the freelist.
	 */
	vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree),
				KM_SLEEP);
	for (i = 0; i < vpm_nfreelist; i++) {

		vpmflp = &vpmd_free[i];
		/*
		 * Set up initial queue pointers. They will get flipped
		 * back and forth.
		 */
		vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ];
		vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ];
	}

	npages = mmu_btop(vpm_cache_size);


	/*
	 * Allocate and initialize the vpmap structs.
	 */
	vpmd_vpmap = kmem_zalloc(sizeof (struct vpmap) * npages, KM_SLEEP);
	for (vpm = vpmd_vpmap; vpm <= &vpmd_vpmap[npages - 1]; vpm++) {
		struct vpmfree *vpmflp;
		union vpm_freeq *releq;
		struct vpmap *vpmapf;

		/*
		 * Use prefetch as we have to walk through a large number of
		 * these data structures. We just reuse segmap's smap prefetch
		 * routine as it does the same thing. This works fine for
		 * x64 (it will need to be modified when enabled on sparc).
		 */
		prefetch_smap_w((void *)vpm);

		vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm);

		vpmflp = VPMAP2VMF(vpm);
		releq = vpmflp->vpm_releq;

		vpmapf = releq->vpmq_free;
		if (vpmapf == NULL) {
			releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
		} else {
			vpm->vpm_next = vpmapf;
			vpm->vpm_prev = vpmapf->vpm_prev;
			vpmapf->vpm_prev = vpm;
			vpm->vpm_prev->vpm_next = vpm;
			releq->vpmq_free = vpm->vpm_next;
		}

		/*
		 * Indicate that the vpmap is on the releq at start
		 */
		vpm->vpm_ndxflg = VPMRELEQ;
	}
}


/*
 * unhooks vpm from the freelist if it is still on the freelist.
 */
#define	VPMAP_RMFREELIST(vpm) \
	{ \
		if (vpm->vpm_next != NULL) { \
			union vpm_freeq *freeq; \
			struct vpmfree *vpmflp; \
			vpmflp = &vpmd_free[vpm->vpm_free_ndx]; \
			freeq = &vpmflp->vpm_freeq[vpm->vpm_ndxflg]; \
			mutex_enter(&freeq->vpmq_mtx); \
			if (freeq->vpmq_free != vpm) { \
				vpm->vpm_prev->vpm_next = vpm->vpm_next; \
				vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
			} else if (vpm == vpm->vpm_next) { \
				freeq->vpmq_free = NULL; \
			} else { \
				freeq->vpmq_free = vpm->vpm_next; \
				vpm->vpm_prev->vpm_next = vpm->vpm_next; \
				vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
			} \
			mutex_exit(&freeq->vpmq_mtx); \
			vpm->vpm_next = vpm->vpm_prev = NULL; \
		} \
	}

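/*
 * Pick the freelist for this CPU to allocate from and advance the
 * per-CPU rotor so that successive allocations are spread across the
 * available freelists.
 */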
static int
get_freelndx(int mode)
{
	int ndx;

	ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk;
	switch (mode) {

	case	VPMCACHE_LRU:
	default:
			vpmd_cpu[CPU->cpu_seqid].vfree_ndx++;
			break;
	}
	return (ndx);
}


/*
 * Find a vpmap structure on the free lists and use it for the newpage.
 * The previous page it cached is dissociated and released. The page_t's
 * p_vpmref is cleared only when the vpm it points to is locked (or, on
 * AMD64, when the page is exclusively locked in page_unload(), since
 * there the p_vpmref is treated as a mapping).
 *
 * The page's p_vpmref is set while the page is locked (at least
 * SHARED locked).
 */
static struct vpmap *
get_free_vpmap(page_t *newpage)
{
	struct vpmfree *vpmflp;
	kmutex_t *vmtx;
	struct vpmap *vpm, *first;
	union vpm_freeq *allocq, *releq;
	page_t *pp = NULL;
	int end_ndx, page_locked = 0;
	int free_ndx;

	/*
	 * get the freelist bin index.
	 */
	free_ndx = get_freelndx(vpm_cachemode);

	end_ndx = free_ndx;
	vpmflp = &vpmd_free[free_ndx];

retry_queue:
	allocq = vpmflp->vpm_allocq;
	mutex_enter(&allocq->vpmq_mtx);

	if ((vpm = allocq->vpmq_free) == NULL) {

skip_queue:
		/*
		 * The alloc list is empty or this queue is being skipped;
		 * first see if the allocq toggled.
		 */
		if (vpmflp->vpm_allocq != allocq) {
			/* queue changed */
			mutex_exit(&allocq->vpmq_mtx);
			goto retry_queue;
		}
		releq = vpmflp->vpm_releq;
		if (!mutex_tryenter(&releq->vpmq_mtx)) {
			/* cannot get releq; a free vpmap may be there now */
			mutex_exit(&allocq->vpmq_mtx);

			/*
			 * This loop could spin forever if this thread has
			 * higher priority than the thread that is holding
			 * releq->vpmq_mtx. In order to force the other thread
			 * to run, we'll lock/unlock the mutex which is safe
			 * since we just unlocked the allocq mutex.
			 */
			mutex_enter(&releq->vpmq_mtx);
			mutex_exit(&releq->vpmq_mtx);
			goto retry_queue;
		}
		if (releq->vpmq_free == NULL) {
			VPM_DEBUG(vpmd_emptyfreelist);
			/*
			 * This freelist is empty.
			 * This should not happen unless clients
			 * are failing to release the vpmap after
			 * accessing the data. Before resorting
			 * to sleeping, try the next list of the same color.
			 */
			free_ndx = (free_ndx + 1) & vpmd_freemsk;
			if (free_ndx != end_ndx) {
				mutex_exit(&releq->vpmq_mtx);
				mutex_exit(&allocq->vpmq_mtx);
				vpmflp = &vpmd_free[free_ndx];
				goto retry_queue;
			}
			/*
			 * Tried all freelists.
			 * wait on this list and hope something gets freed.
			 */
			vpmflp->vpm_want++;
			mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx);
			cv_wait(&vpmflp->vpm_free_cv,
				&vpmflp->vpm_freeq[0].vpmq_mtx);
			vpmflp->vpm_want--;
			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
			vpmflp = &vpmd_free[free_ndx];
			VPM_DEBUG(vpmd_nofreevpms);
			goto retry_queue;
		} else {
			/*
			 * Something on the rele queue; flip the alloc
			 * and rele queues and retry.
			 */
			vpmflp->vpm_allocq = releq;
			vpmflp->vpm_releq = allocq;
			mutex_exit(&allocq->vpmq_mtx);
			mutex_exit(&releq->vpmq_mtx);
			if (page_locked) {
				delay(hz >> 2);
				page_locked = 0;
			}
			goto retry_queue;
		}
	} else {
		int gotnewvpm;
		kmutex_t *pmtx;
		uint_t vpmref;

		/*
		 * Fastpath the case we get the vpmap mutex
		 * on the first try.
		 */
		first = vpm;
next_vpmap:
		vmtx = VPMAPMTX(vpm);
		if (!mutex_tryenter(vmtx)) {
			/*
			 * Another thread is trying to reclaim this slot.
			 * Skip to the next queue or vpmap.
			 */
			if ((vpm = vpm->vpm_next) == first) {
				goto skip_queue;
			} else {
				goto next_vpmap;
			}
		}

		/*
		 * Assign this vpm to the newpage.
		 */
		pmtx = PPMTX(newpage);
		gotnewvpm = 0;
		mutex_enter(pmtx);

		/*
		 * Check if some other thread already assigned a vpm to
		 * this page.
		 */
		if ((vpmref = newpage->p_vpmref) == 0) {
			newpage->p_vpmref = VPMID(vpm);
			gotnewvpm = 1;
		} else {
			VPM_DEBUG(vpmd_contend);
			mutex_exit(vmtx);
		}
		mutex_exit(pmtx);

		if (gotnewvpm) {

			/*
			 * At this point, we've selected the vpm. Remove vpm
			 * from its freelist. If vpm is the first one in
			 * the freelist, update the head of the freelist.
			 */
			if (first == vpm) {
				ASSERT(first == allocq->vpmq_free);
				allocq->vpmq_free = vpm->vpm_next;
			}

			/*
			 * If the head of the freelist still points to vpm,
			 * then there are no more free vpmaps in that list.
			 */
			if (allocq->vpmq_free == vpm)
				/*
				 * Took the last one
				 */
				allocq->vpmq_free = NULL;
			else {
				vpm->vpm_prev->vpm_next = vpm->vpm_next;
				vpm->vpm_next->vpm_prev = vpm->vpm_prev;
			}
			mutex_exit(&allocq->vpmq_mtx);
			vpm->vpm_prev = vpm->vpm_next = NULL;

			/*
			 * Disassociate the previous page. On x64 systems
			 * p_vpmref is used as a mapping reference to the page.
			 */
			if ((pp = vpm->vpm_pp) != NULL &&
				vpm->vpm_vp == pp->p_vnode &&
				vpm->vpm_off == pp->p_offset) {

				pmtx = PPMTX(pp);
				if (page_trylock(pp, SE_SHARED)) {
					/*
					 * Now verify that it is the correct
					 * page. If not, someone else stole
					 * it, so just unlock it and leave.
					 */
					mutex_enter(pmtx);
					if (PP_ISFREE(pp) ||
						vpm->vpm_vp != pp->p_vnode ||
						vpm->vpm_off != pp->p_offset ||
						pp->p_vpmref != VPMID(vpm)) {
						mutex_exit(pmtx);

						page_unlock(pp);
					} else {
						/*
						 * Release the page.
						 */
						pp->p_vpmref = 0;
						mutex_exit(pmtx);
						hat_kpm_mapout(pp, 0,
							hat_kpm_page2va(pp, 1));
						(void) page_release(pp, 1);
					}
				} else {
					/*
					 * If the page cannot be locked, just
					 * clear the p_vpmref and go.
					 */
					mutex_enter(pmtx);
					if (pp->p_vpmref == VPMID(vpm)) {
						pp->p_vpmref = 0;
					}
					mutex_exit(pmtx);
					VPM_DEBUG(vpmd_prevpagelocked);
				}
			}

			/*
			 * Setup vpm to point to the new page.
			 */
			vpm->vpm_pp = newpage;
			vpm->vpm_vp = newpage->p_vnode;
			vpm->vpm_off = newpage->p_offset;

		} else {
			int steal = !VPM_MTBF(steals, steals_mtbf);
			/*
			 * The page already has a vpm assigned; just use that.
			 * Grab the vpm mutex and verify that it is still
			 * the correct one. The pp->p_vpmref should not change
			 * once we have the vpm mutex and the page lock.
			 */
			mutex_exit(&allocq->vpmq_mtx);
			vpm = VPMP(vpmref);
			vmtx = VPMAPMTX(vpm);
			mutex_enter(vmtx);
			if ((steal && vpm->vpm_refcnt == 0) ||
			    vpm->vpm_pp != newpage) {
				/*
				 * The vpm got stolen; clear the p_vpmref
				 * and retry.
				 */
				pmtx = PPMTX(newpage);
				mutex_enter(pmtx);
				if (newpage->p_vpmref == vpmref) {
					newpage->p_vpmref = 0;
				}
				mutex_exit(pmtx);

				mutex_exit(vmtx);
				VPM_DEBUG(vpmd_steals);
				goto retry_queue;
			} else if (vpm->vpm_refcnt == 0) {
				/*
				 * Remove it from the free list if it
				 * exists there.
				 */
				VPMAP_RMFREELIST(vpm);
			}
		}
		return (vpm);
	}
}

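/*
 * Put a vpmap whose reference count has dropped to zero back on the
 * tail of its release queue and wake up any thread waiting for a free
 * vpmap.  Called with the vpmap mutex held.
 */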
static void
free_vpmap(struct vpmap *vpm)
{
	struct vpmfree *vpmflp;
	struct vpmap *vpmfreelist;
	union vpm_freeq *releq;

	ASSERT(MUTEX_HELD(VPMAPMTX(vpm)));

	if (vpm->vpm_refcnt != 0) {
		panic("free_vpmap");
		/*NOTREACHED*/
	}

	vpmflp = &vpmd_free[vpm->vpm_free_ndx];
	/*
	 * Add to the tail of the release queue.
	 * Note that vpm_releq and vpm_allocq could toggle
	 * before we get the lock. This does not affect
	 * correctness as the 2 queues are only maintained
	 * to reduce lock pressure.
	 */
	releq = vpmflp->vpm_releq;
	if (releq == &vpmflp->vpm_freeq[0]) {
		vpm->vpm_ndxflg = 0;
	} else {
		vpm->vpm_ndxflg = 1;
	}
	mutex_enter(&releq->vpmq_mtx);
	vpmfreelist = releq->vpmq_free;
	if (vpmfreelist == 0) {
		int want;

		releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
		/*
		 * Both queue mutexes are held to set vpm_want;
		 * snapshot the value before dropping releq mutex.
		 * If vpm_want appears after the releq mutex is dropped,
		 * then the vpmap just freed is already gone.
		 */
		want = vpmflp->vpm_want;
		mutex_exit(&releq->vpmq_mtx);
		/*
		 * See if there was a waiter before dropping the releq mutex,
		 * then recheck after obtaining the vpm_freeq[0] mutex, as
		 * another thread may have already signaled.
		 */
		if (want) {
			mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx);
			if (vpmflp->vpm_want)
				cv_signal(&vpmflp->vpm_free_cv);
			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
		}
	} else {
		vpm->vpm_next = vpmfreelist;
		vpm->vpm_prev = vpmfreelist->vpm_prev;
		vpmfreelist->vpm_prev = vpm;
		vpm->vpm_prev->vpm_next = vpm;
		mutex_exit(&releq->vpmq_mtx);
	}
}

/*
 * Get the vpmap for the page.
 * The refcnt of this vpm is incremented.
 */
static struct vpmap *
get_vpmap(page_t *pp)
{
	struct vpmap *vpm = NULL;
	kmutex_t *vmtx;
	kmutex_t *pmtx;
	unsigned int refid;

	ASSERT((pp != NULL) && PAGE_LOCKED(pp));

	if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) {
		vpm = VPMP(refid);
		vmtx = VPMAPMTX(vpm);
		mutex_enter(vmtx);
		/*
		 * Since we have the page lock and the vpm mutex, the
		 * pp->p_vpmref cannot change.
		 */
		if (vpm->vpm_pp != pp) {
			pmtx = PPMTX(pp);

			/*
			 * Clear the p_vpmref as it is incorrect.
			 * This can happen if the page was stolen.
			 * On x64 this should not happen as p_vpmref
			 * is treated as a mapping on the page. So
			 * if the page is stolen, the mapping would have
			 * been cleared in page_unload().
			 */
			mutex_enter(pmtx);
			if (pp->p_vpmref == refid)
				pp->p_vpmref = 0;
			mutex_exit(pmtx);

			mutex_exit(vmtx);
			vpm = NULL;
		} else if (vpm->vpm_refcnt == 0) {
			/*
			 * Got the vpm, remove it from the free
			 * list if it exists there.
			 */
			VPMAP_RMFREELIST(vpm);
		}
	}
	if (vpm == NULL) {
		/*
		 * get_free_vpmap() returns with the vpmap mutex held.
		 */
		vpm = get_free_vpmap(pp);
		vmtx = VPMAPMTX(vpm);
		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++;
	} else {
		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++;
	}

	vpm->vpm_refcnt++;
	mutex_exit(vmtx);

	return (vpm);
}

/* END --- vpm cache ---- */

/*
 * The vnode page mapping (vpm) interface routines.
 */
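
/*
 * Illustrative sketch only (names other than the vpm routines are
 * placeholders, not part of this file): a file system read/write path
 * that has vpm available typically either does the copy in one call,
 *
 *	error = vpm_data_copy(vp, off, n, uio, fetchpage, NULL, 0, rw);
 *
 * or, when it needs the kernel mappings itself, does something like
 *
 *	vmap_t vml[MINVMAPS];
 *
 *	error = vpm_map_pages(vp, off, n, fetchpage, vml, MINVMAPS,
 *	    NULL, rw);
 *	if (error == 0) {
 *		(access vml[i].vs_addr/vs_len until vs_addr == NULL)
 *		vpm_unmap_pages(vml, rw);
 *	}
 */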

/*
 * Find or create the pages starting from baseoff for the specified
 * length 'len'.
 */
static int
vpm_pagecreate(
	struct vnode *vp,
	u_offset_t baseoff,
	size_t len,
	vmap_t vml[],
	int nseg,
	int *newpage)
{

	page_t *pp = NULL;
	caddr_t base;
	u_offset_t off = baseoff;
	int i;
	ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);

	for (i = 0; len > 0; len -= PAGESIZE, i++) {
		struct vpmap *vpm;


		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {

			base = segkpm_create_va(off);

			/*
			 * The seg pointer passed in is just advisory. Just
			 * pass segkmap for now, like segmap does with
			 * segmap_kpm enabled.
			 */
			if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
			    segkmap, base)) == NULL) {
				panic("vpm_pagecreate: "
				    "page_create failed");
				/*NOTREACHED*/
			}
			if (newpage != NULL)
				*newpage = 1;

			page_io_unlock(pp);
		}

		/*
		 * Get the vpm for this page_t.
		 */
		if (vpm_cache_enable) {
			vpm = get_vpmap(pp);
			vml[i].vs_data = (void *)&vpm->vpm_pp;
		} else {
			vml[i].vs_data = (void *)pp;
			pp->p_vpmref = 0;
		}

		vml[i].vs_addr = hat_kpm_mapin(pp, 0);
		vml[i].vs_len = PAGESIZE;

		off += PAGESIZE;
	}
	vml[i].vs_data = NULL;
	vml[i].vs_addr = (caddr_t)NULL;
	return (0);
}


/*
 * Returns vpm mappings of pages in the range [off, off + len], where
 * len is rounded up to the PAGESIZE boundary. The list of pages and
 * the page addresses are returned in the SGL vml (vmap_t) array passed in.
 * The nseg is the number of vmap_t entries in the array.
 *
 * Currently the max len allowed is MAXBSIZE; therefore, it will
 * fetch/create either one or two pages depending on the PAGESIZE.
 *
 * segmap's SM_LOCKPROTO usage is not supported by these interfaces.
 * For such cases, use the seg_map interfaces.
 */
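
/*
 * For example, with a 4K PAGESIZE, a request with off = 6000 and
 * len = 3000 is aligned down to baseoff = 4096 and rounded up to a
 * length of 8192, so two pages (plus the terminating NULL entry) are
 * returned in vml.
 */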
int
vpm_map_pages(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	int fetchpage,
	vmap_t *vml,
	int nseg,
	int  *newpage,
	enum seg_rw rw)
{
	extern struct vnode *common_specvp();
	u_offset_t baseoff;
	uint_t prot;
	caddr_t base;
	page_t *pp, *pplist[MAXVMAPS];
	struct vpmap *vpm;
	int i, error = 0;

	ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);
	baseoff = off & (offset_t)PAGEMASK;
	vml[0].vs_data = NULL;
	vml[0].vs_addr = (caddr_t)NULL;
	/*
	 * For now, let's restrict it to MAXBSIZE. XXX - We could allow
	 * len longer than MAXBSIZE, but there should be a limit,
	 * which should be determined by how many pages the VOP_GETPAGE()
	 * can fetch.
	 */
	if (off + len > baseoff + MAXBSIZE) {
		panic("vpm_map_pages bad len");
		/*NOTREACHED*/
	}

	/*
	 * If this is a block device we have to be sure to use the
	 * "common" block device vnode for the mapping.
	 */
	if (vp->v_type == VBLK)
		vp = common_specvp(vp);

	/*
	 * round up len to a multiple of PAGESIZE.
	 */
	len = ((off + len - baseoff + PAGESIZE - 1) & (uintptr_t)PAGEMASK);

	if (!fetchpage)
		return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage));

	for (i = 0; len > 0; len -= PAGESIZE, i++, pplist[i] = NULL) {

		pp = page_lookup(vp, baseoff, SE_SHARED);

		/*
		 * If we did not find the page or if this page was not
		 * in our cache, then let VOP_GETPAGE get all the pages.
		 * We need to call VOP_GETPAGE so that filesystems can do
		 * their (possibly unnecessary) tracking for sequential
		 * access.
		 */

		if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) ||
			(rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF)
							!= (P_MOD | P_REF))) {
			if (pp != NULL) {
				page_unlock(pp);
			}

			/*
			 * Pass a dummy address as it will be required
			 * by page_create_va(). We pass segkmap as the seg
			 * as some file systems (UFS) check it.
			 */
			base = segkpm_create_va(baseoff);

			error = VOP_GETPAGE(vp, baseoff, len, &prot, &pplist[i],
			len, segkmap, base, rw, CRED());
			if (error) {
				VPM_DEBUG(vpmd_getpagefailed);
				pplist[i] = NULL;
			}
			break;
		} else {
			pplist[i] = pp;
			baseoff += PAGESIZE;
		}
	}

	if (error) {
		for (i = 0; pplist[i] != NULL; i++) {
			page_unlock(pplist[i]);
			pplist[i] = NULL;
		}
		vml[0].vs_addr = NULL;
		vml[0].vs_data = NULL;
		return (error);
	}

	/*
	 * Get the vpm's for pages.
	 */
	for (i = 0; pplist[i] != NULL; i++) {
		if (vpm_cache_enable) {
			vpm = get_vpmap(pplist[i]);
			vml[i].vs_data = (void *)&(vpm->vpm_pp);
		} else {
			vml[i].vs_data = (void *)pplist[i];
			pplist[i]->p_vpmref = 0;
		}

		vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0);
		vml[i].vs_len = PAGESIZE;
	}

	vml[i].vs_data = NULL;
	vml[i].vs_addr = (caddr_t)NULL;

	return (0);
}

/*
 * Release the vpm mappings on the pages and unlock them.
 */
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
	int i;
	struct vpmap *vpm;
	kmutex_t *mtx;
	page_t *pp;

	for (i = 0; vml[i].vs_data != NULL; i++) {
		ASSERT(IS_KPM_ADDR(vml[i].vs_addr));

		if (vpm_cache_enable) {
			pp = *(((page_t **)vml[i].vs_data));
		} else {
			pp = (page_t *)vml[i].vs_data;
		}

		/*
		 * Mark the page as modified or referenced, because vpm
		 * pages do not take the faults where these bits would
		 * normally get set.
		 */
		if (rw == S_WRITE) {
			hat_setrefmod(pp);
		} else {
			ASSERT(rw == S_READ);
			hat_setref(pp);
		}

		if (vpm_cache_enable) {
			page_unlock(pp);
			vpm = (struct vpmap *)((char *)vml[i].vs_data
					- offsetof(struct vpmap, vpm_pp));
			mtx = VPMAPMTX(vpm);
			mutex_enter(mtx);

			if (--vpm->vpm_refcnt == 0) {
				free_vpmap(vpm);
			}
			mutex_exit(mtx);
		} else {
			hat_kpm_mapout(pp, 0, vml[i].vs_addr);
			(void) page_release(pp, 1);
		}
		vml[i].vs_data = NULL;
		vml[i].vs_addr = NULL;
	}
}

/*
 * Given the vp, off and the uio structure, this routine will do
 * the copy (uiomove). If the last page created is partially written,
 * the rest of the page is zeroed out. It also zeroes the beginning of
 * the first page up to the start offset if requested (zerostart).
 * If pages are to be fetched, it will call the filesystem's getpage
 * function (VOP_GETPAGE) to get them; otherwise they will be created if
 * not already present in the page cache.
 */
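
/*
 * For example, when the page is newly created (fetchpage is zero),
 * zerostart is set, off = 100 and len = 300, bytes 0-99 of the page
 * are zeroed first, bytes 100-399 are copied in from the uio, and the
 * rest of the page after the copy is zeroed as well.
 */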
int
vpm_data_copy(struct vnode *vp,
	u_offset_t off,
	size_t len,
	struct uio *uio,
	int fetchpage,
	int *newpage,
	int zerostart,
	enum seg_rw rw)
{
	int error;
	struct vmap vml[MINVMAPS];
	enum uio_rw uiorw;
	int npages = 0;

	uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ;
	/*
	 * 'off' will be the offset where the I/O starts.
	 * We get the pages starting at the (off & PAGEMASK)
	 * page boundary.
	 */
	error = vpm_map_pages(vp, off, (uint_t)len,
		fetchpage, vml, MINVMAPS, &npages,  rw);

	if (newpage != NULL)
		*newpage = npages;
	if (!error) {
		int i, pn, slen = len;
		int pon = off & PAGEOFFSET;

		/*
		 * Clear from the beginning of the page to start offset
		 * if requested.
		 */
		if (!fetchpage && zerostart) {
			(void) kzero(vml[0].vs_addr,  (uint_t)pon);
			VPM_DEBUG(vpmd_zerostart);
		}

		for (i = 0; !error && slen > 0 &&
				vml[i].vs_addr != NULL; i++) {
			pn = (int)MIN(slen, (PAGESIZE - pon));
			error = uiomove(vml[i].vs_addr + pon,
				    (long)pn, uiorw, uio);
			slen -= pn;
			pon = 0;
		}

		/*
		 * When new pages are created, zero out part of the
		 * page we did not copy to.
		 */
		if (!fetchpage && npages &&
			uio->uio_loffset < roundup(off + len, PAGESIZE)) {
			int nzero;

			pon = (uio->uio_loffset & PAGEOFFSET);
			nzero = PAGESIZE  - pon;
			i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE;
			(void) kzero(vml[i].vs_addr + pon, (uint_t)nzero);
		}
		vpm_unmap_pages(vml, rw);
	}
	return (error);
}

/*
 * called to flush pages for the given vnode covering
 * [off, off+len] range.
 */
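
/*
 * Note that a flags value of 0, or SM_DONTNEED on its own, results in
 * no VOP_PUTPAGE() call; SM_DONTNEED only takes effect when it is
 * combined with one of the other SM_* flags handled below.
 */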
int
vpm_sync_pages(struct vnode *vp,
		u_offset_t off,
		size_t len,
		uint_t flags)
{
	extern struct vnode *common_specvp();
	int bflags = 0;
	int error = 0;
	size_t psize = roundup(len, PAGESIZE);

	/*
	 * If this is a block device we have to be sure to use the
	 * "common" block device vnode for the mapping.
	 */
	if (vp->v_type == VBLK)
		vp = common_specvp(vp);

	if ((flags & ~SM_DONTNEED) != 0) {
		if (flags & SM_ASYNC)
			bflags |= B_ASYNC;
		if (flags & SM_INVAL)
			bflags |= B_INVAL;
		if (flags & SM_DESTROY)
			bflags |= (B_INVAL|B_TRUNC);
		if (flags & SM_FREE)
			bflags |= B_FREE;
		if (flags & SM_DONTNEED)
			bflags |= B_DONTNEED;

		error = VOP_PUTPAGE(vp, off, psize, bflags, CRED());
	}

	return (error);
}


#else	/* SEGKPM_SUPPORT */

/* vpm stubs */
void
vpm_init()
{
}

/*ARGSUSED*/
int
vpm_pagecreate(
	struct vnode *vp,
	u_offset_t baseoff,
	size_t len,
	vmap_t vml[],
	int nseg,
	int *newpage)
{
	return (0);
}

/*ARGSUSED*/
int
vpm_map_pages(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	int fetchpage,
	vmap_t vml[],
	int nseg,
	int *newpage,
	enum seg_rw rw)
{
	return (0);
}

/*ARGSUSED*/
int
vpm_data_copy(struct vnode *vp,
	u_offset_t off,
	size_t len,
	struct uio *uio,
	int fetchpage,
	int *newpage,
	int zerostart,
	enum seg_rw rw)
{
	return (0);
}

/*ARGSUSED*/
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
}
/*ARGSUSED*/
int
vpm_sync_pages(struct vnode *vp,
		u_offset_t off,
		size_t len,
		uint_t flags)
{
	return (0);
}
#endif	/* SEGKPM_SUPPORT */