/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - generic vnode page mapping interfaces.
 *
 * Mechanism to provide temporary mappings to vnode pages.
 * The typical use would be to copy/access file data.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mman.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/thread.h>
#include <sys/dumphdr.h>
#include <sys/bitmap.h>
#include <sys/lgrp.h>

#include <vm/seg_kmem.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kpm.h>
#include <vm/seg_map.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/rm.h>
#include <vm/vpm.h>

/*
 * Needs to be enabled by each platform.
 */
int vpm_enable = 0;

#ifdef	SEGKPM_SUPPORT


int	vpm_cache_enable = 1;
long	vpm_cache_percent = 12;
long	vpm_cache_size;
int	vpm_nfreelist = 0;
int	vpmd_freemsk = 0;

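/*
 * Each CPU gets its own entry in the array below; the entry is padded
 * to VPM_S_PAD bytes so that, on the usual 64-byte cache line size,
 * one CPU's rotor and statistics do not share a cache line with its
 * neighbour's.
 */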
#define	VPM_S_PAD	64
union vpm_cpu {
	struct {
		int	vcpu_free_ndx;
		ulong_t	vcpu_hits;
		ulong_t vcpu_misses;
	} vcpu;
	char vpm_pad[VPM_S_PAD];
};
static union vpm_cpu	*vpmd_cpu;

#define	vfree_ndx	vcpu.vcpu_free_ndx

int	vpm_cachemode = VPMCACHE_LRU;

#define	PPMTX(pp) (&(pp)->p_ilock)

static struct vpmap *vpmd_vpmap;	/* list of vpmap structs preallocated */
static struct vpmfree *vpmd_free;
#define	VPMAPMTX(vpm)	(&vpm->vpm_mtx)
#define	VPMAP2VMF(vpm)	(&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk])
#define	VPMAP2VMF_NDX(vpm)	(ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk)
#define	VPMP(id)	(&vpmd_vpmap[id - 1])
#define	VPMID(vpm)	(uint_t)((vpm - vpmd_vpmap) + 1)
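
/*
 * A vpmap id, as stored in a page's p_vpmref, is the index of the
 * vpmap in vpmd_vpmap plus one, so an id of 0 means "no vpmap
 * assigned".  VPMP() and VPMID() convert between the id and the
 * vpmap pointer.
 */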


#ifdef	DEBUG

struct	vpm_debug {
	int vpmd_steals;
	int vpmd_contend;
	int vpmd_prevpagelocked;
	int vpmd_getpagefailed;
	int vpmd_zerostart;
	int vpmd_emptyfreelist;
	int vpmd_nofreevpms;
} vpm_debug;

#define	VPM_DEBUG(x)	((vpm_debug.x)++)

int	steals;
int	steals_mtbf = 7;
int	contend;
int	contend_mtbf = 127;

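/*
 * With a mask of the form 2^k - 1 (as steals_mtbf and contend_mtbf
 * are), VPM_MTBF(v, f) evaluates to false once every 2^k calls.  The
 * DEBUG kernel uses this to periodically force the vpmap steal and
 * cache-miss slow paths so that they get exercised.
 */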
#define	VPM_MTBF(v, f)	(((++(v)) & (f)) != (f))

#else	/* DEBUG */

#define	VPM_MTBF(v, f)	(1)
#define	VPM_DEBUG(x)	/* nothing */

#endif

/*
 * The vpm cache.
 *
 * The main purpose of having a cache here is to speed up page_lookup()
 * operations and also provide an LRU (default) behaviour of file pages. The
 * page_lookup() operation tends to be expensive if a page has to be
 * reclaimed from the system page cache ("cachelist"). Once we speed up the
 * page_lookup()->page_reclaim() path then there should be no need for
 * this cache. The system page cache (cachelist) should effectively serve the
 * purpose of caching file pages.
 *
 * This cache is very similar to segmap's smap cache. Each page in the
 * cache is tracked by the structure vpmap_t. But unlike segmap, there is no
 * hash table. The page_t has a reference to the vpmap_t when cached. For a
 * given vnode and offset, the page is found by means of a page_lookup()
 * operation. Any page which has a mapping (i.e. when cached) will not be in
 * the system 'cachelist'. Hence page_lookup() will not have to do a
 * page_reclaim(). That is how the cache serves to speed up page_lookup()
 * operations.
 *
 * This cache can be disabled by setting vpm_cache_enable = 0 in /etc/system.
 */
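
/*
 * For example, a line of the form
 *
 *	set vpm_cache_enable = 0
 *
 * in /etc/system turns the cache off while leaving the vpm mapping
 * interfaces themselves available.
 */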

void
vpm_init()
{
	long  npages;
	struct vpmap *vpm;
	struct vpmfree *vpmflp;
	int i, ndx;
	extern void prefetch_smap_w(void *);

	if (!vpm_cache_enable) {
		return;
	}

	/*
	 * Set the size of the cache.
	 */
	vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100);
	if (vpm_cache_size < VPMAP_MINCACHE) {
		vpm_cache_size = VPMAP_MINCACHE;
	}

	/*
	 * Number of freelists.
	 */
	if (vpm_nfreelist == 0) {
		vpm_nfreelist = max_ncpus;
	} else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) {
		cmn_err(CE_WARN, "vpmap create : invalid number of freelists "
		    "vpm_nfreelist %d, using %d", vpm_nfreelist, 2 * max_ncpus);
		vpm_nfreelist = 2 * max_ncpus;
	}

	/*
	 * Round it up to the next power of 2
	 */
	if (vpm_nfreelist & (vpm_nfreelist - 1)) {
		vpm_nfreelist = 1 << (highbit(vpm_nfreelist));
	}
	vpmd_freemsk = vpm_nfreelist - 1;

	/*
	 * Use a per cpu rotor index to spread the allocations evenly
	 * across the available vpm freelists.
	 */
	vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP);
	ndx = 0;
	for (i = 0; i < max_ncpus; i++) {

		vpmd_cpu[i].vfree_ndx = ndx;
		ndx = (ndx + 1) & vpmd_freemsk;
	}

	/*
	 * Allocate and initialize the freelist.
	 */
	vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree),
				KM_SLEEP);
	for (i = 0; i < vpm_nfreelist; i++) {

		vpmflp = &vpmd_free[i];
		/*
		 * Set up initial queue pointers. They will get flipped
		 * back and forth.
		 */
		vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ];
		vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ];
	}

	npages = mmu_btop(vpm_cache_size);


	/*
	 * Allocate and initialize the vpmap structs.
	 */
	vpmd_vpmap = kmem_zalloc(sizeof (struct vpmap) * npages, KM_SLEEP);
	for (vpm = vpmd_vpmap; vpm <= &vpmd_vpmap[npages - 1]; vpm++) {
		struct vpmfree *vpmflp;
		union vpm_freeq *releq;
		struct vpmap *vpmapf;

		/*
		 * Use prefetch as we have to walk through a large number of
		 * these data structures. We just reuse segmap's smap prefetch
		 * routine as it does the same thing. This works fine for
		 * x64 (it will need to be modified when enabled on sparc).
		 */
		prefetch_smap_w((void *)vpm);

		vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm);

		vpmflp = VPMAP2VMF(vpm);
		releq = vpmflp->vpm_releq;

		vpmapf = releq->vpmq_free;
		if (vpmapf == NULL) {
			releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
		} else {
			vpm->vpm_next = vpmapf;
			vpm->vpm_prev = vpmapf->vpm_prev;
			vpmapf->vpm_prev = vpm;
			vpm->vpm_prev->vpm_next = vpm;
			releq->vpmq_free = vpm->vpm_next;
		}

		/*
		 * Indicate that the vpmap is on the releq at start
		 */
		vpm->vpm_ndxflg = VPMRELEQ;
	}
}


/*
 * unhooks vpm from the freelist if it is still on the freelist.
 */
#define	VPMAP_RMFREELIST(vpm) \
	{ \
		if (vpm->vpm_next != NULL) { \
			union vpm_freeq *freeq; \
			struct vpmfree *vpmflp; \
			vpmflp = &vpmd_free[vpm->vpm_free_ndx]; \
			freeq = &vpmflp->vpm_freeq[vpm->vpm_ndxflg]; \
			mutex_enter(&freeq->vpmq_mtx); \
			if (freeq->vpmq_free != vpm) { \
				vpm->vpm_prev->vpm_next = vpm->vpm_next; \
				vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
			} else if (vpm == vpm->vpm_next) { \
				freeq->vpmq_free = NULL; \
			} else { \
				freeq->vpmq_free = vpm->vpm_next; \
				vpm->vpm_prev->vpm_next = vpm->vpm_next; \
				vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
			} \
			mutex_exit(&freeq->vpmq_mtx); \
			vpm->vpm_next = vpm->vpm_prev = NULL; \
		} \
	}

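/*
 * Pick the freelist for this CPU to allocate from and advance the
 * per-CPU rotor so that successive allocations are spread across the
 * available freelists.
 */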
static int
get_freelndx(int mode)
{
	int ndx;

	ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk;
	switch (mode) {

	case	VPMCACHE_LRU:
	default:
			vpmd_cpu[CPU->cpu_seqid].vfree_ndx++;
			break;
	}
	return (ndx);
}


/*
 * Find a vpmap structure on the free lists and use it for the newpage.
 * The previous page it cached is dissociated and released. The page_t's
 * p_vpmref is cleared only when the vpm it points to is locked (or, on
 * AMD64, when the page is exclusively locked in page_unload(), since
 * there the p_vpmref is treated as a mapping).
 *
 * The page's p_vpmref is set while the page is locked (at least
 * SHARED locked).
 */
static struct vpmap *
get_free_vpmap(page_t *newpage)
{
	struct vpmfree *vpmflp;
	kmutex_t *vmtx;
	struct vpmap *vpm, *first;
	union vpm_freeq *allocq, *releq;
	page_t *pp = NULL;
	int end_ndx, page_locked = 0;
	int free_ndx;

	/*
	 * get the freelist bin index.
	 */
	free_ndx = get_freelndx(vpm_cachemode);

	end_ndx = free_ndx;
	vpmflp = &vpmd_free[free_ndx];

retry_queue:
	allocq = vpmflp->vpm_allocq;
	mutex_enter(&allocq->vpmq_mtx);

	if ((vpm = allocq->vpmq_free) == NULL) {

skip_queue:
		/*
		 * The alloc list is empty or this queue is being skipped;
		 * first see if the allocq toggled.
		 */
		if (vpmflp->vpm_allocq != allocq) {
			/* queue changed */
			mutex_exit(&allocq->vpmq_mtx);
			goto retry_queue;
		}
		releq = vpmflp->vpm_releq;
		if (!mutex_tryenter(&releq->vpmq_mtx)) {
			/* cannot get releq; a free vpmap may be there now */
			mutex_exit(&allocq->vpmq_mtx);

			/*
			 * This loop could spin forever if this thread has
			 * higher priority than the thread that is holding
			 * releq->vpmq_mtx. In order to force the other thread
			 * to run, we'll lock/unlock the mutex which is safe
			 * since we just unlocked the allocq mutex.
			 */
			mutex_enter(&releq->vpmq_mtx);
			mutex_exit(&releq->vpmq_mtx);
			goto retry_queue;
		}
		if (releq->vpmq_free == NULL) {
			VPM_DEBUG(vpmd_emptyfreelist);
			/*
			 * This freelist is empty.
			 * This should not happen unless clients
			 * are failing to release the vpmap after
			 * accessing the data. Before resorting
			 * to sleeping, try the next list of the same color.
			 */
			free_ndx = (free_ndx + 1) & vpmd_freemsk;
			if (free_ndx != end_ndx) {
				mutex_exit(&releq->vpmq_mtx);
				mutex_exit(&allocq->vpmq_mtx);
				vpmflp = &vpmd_free[free_ndx];
				goto retry_queue;
			}
			/*
			 * Tried all freelists.
			 * wait on this list and hope something gets freed.
			 */
			vpmflp->vpm_want++;
			mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx);
			cv_wait(&vpmflp->vpm_free_cv,
				&vpmflp->vpm_freeq[0].vpmq_mtx);
			vpmflp->vpm_want--;
			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
			vpmflp = &vpmd_free[free_ndx];
			VPM_DEBUG(vpmd_nofreevpms);
			goto retry_queue;
		} else {
			/*
			 * Something on the rele queue; flip the alloc
			 * and rele queues and retry.
			 */
			vpmflp->vpm_allocq = releq;
			vpmflp->vpm_releq = allocq;
			mutex_exit(&allocq->vpmq_mtx);
			mutex_exit(&releq->vpmq_mtx);
			if (page_locked) {
				delay(hz >> 2);
				page_locked = 0;
			}
			goto retry_queue;
		}
	} else {
		int gotnewvpm;
		kmutex_t *pmtx;
		uint_t vpmref;

		/*
		 * Fastpath the case we get the vpmap mutex
		 * on the first try.
		 */
		first = vpm;
next_vpmap:
		vmtx = VPMAPMTX(vpm);
		if (!mutex_tryenter(vmtx)) {
			/*
			 * Another thread is trying to reclaim this slot.
			 * Skip to the next queue or vpmap.
			 */
			if ((vpm = vpm->vpm_next) == first) {
				goto skip_queue;
			} else {
				goto next_vpmap;
			}
		}

		/*
		 * Assign this vpm to the newpage.
		 */
		pmtx = PPMTX(newpage);
		gotnewvpm = 0;
		mutex_enter(pmtx);

		/*
		 * Check if some other thread already assigned a vpm to
		 * this page.
		 */
		if ((vpmref = newpage->p_vpmref) == 0) {
			newpage->p_vpmref = VPMID(vpm);
			gotnewvpm = 1;
		} else {
			VPM_DEBUG(vpmd_contend);
			mutex_exit(vmtx);
		}
		mutex_exit(pmtx);

		if (gotnewvpm) {

			/*
			 * At this point, we've selected the vpm. Remove vpm
			 * from its freelist. If vpm is the first one in
			 * the freelist, update the head of the freelist.
			 */
			if (first == vpm) {
				ASSERT(first == allocq->vpmq_free);
				allocq->vpmq_free = vpm->vpm_next;
			}

			/*
			 * If the head of the freelist still points to vpm,
			 * then there are no more free vpmaps in that list.
			 */
			if (allocq->vpmq_free == vpm)
				/*
				 * Took the last one
				 */
				allocq->vpmq_free = NULL;
			else {
				vpm->vpm_prev->vpm_next = vpm->vpm_next;
				vpm->vpm_next->vpm_prev = vpm->vpm_prev;
			}
			mutex_exit(&allocq->vpmq_mtx);
			vpm->vpm_prev = vpm->vpm_next = NULL;

			/*
			 * Disassociate the previous page. On x64 systems
			 * p_vpmref is used as a mapping reference to the page.
			 */
			if ((pp = vpm->vpm_pp) != NULL &&
				vpm->vpm_vp == pp->p_vnode &&
				vpm->vpm_off == pp->p_offset) {

				pmtx = PPMTX(pp);
				if (page_trylock(pp, SE_SHARED)) {
					/*
					 * Now verify that it is the correct
					 * page. If not, someone else stole
					 * it, so just unlock it and leave.
					 */
					mutex_enter(pmtx);
					if (PP_ISFREE(pp) ||
						vpm->vpm_vp != pp->p_vnode ||
						vpm->vpm_off != pp->p_offset ||
						pp->p_vpmref != VPMID(vpm)) {
						mutex_exit(pmtx);

						page_unlock(pp);
					} else {
						/*
						 * Release the page.
						 */
						pp->p_vpmref = 0;
						mutex_exit(pmtx);
						hat_kpm_mapout(pp, 0,
							hat_kpm_page2va(pp, 1));
						(void) page_release(pp, 1);
					}
				} else {
					/*
					 * If the page cannot be locked, just
					 * clear the p_vpmref and go.
					 */
					mutex_enter(pmtx);
					if (pp->p_vpmref == VPMID(vpm)) {
						pp->p_vpmref = 0;
					}
					mutex_exit(pmtx);
					VPM_DEBUG(vpmd_prevpagelocked);
				}
			}

			/*
			 * Setup vpm to point to the new page.
			 */
			vpm->vpm_pp = newpage;
			vpm->vpm_vp = newpage->p_vnode;
			vpm->vpm_off = newpage->p_offset;

		} else {
			int steal = !VPM_MTBF(steals, steals_mtbf);
			/*
			 * The page already has a vpm assigned; just use that.
			 * Grab the vpm mutex and verify that it is still
			 * the correct one. The pp->p_vpmref should not change
			 * once we have the vpm mutex and the page lock.
			 */
			mutex_exit(&allocq->vpmq_mtx);
			vpm = VPMP(vpmref);
			vmtx = VPMAPMTX(vpm);
			mutex_enter(vmtx);
			if ((steal && vpm->vpm_refcnt == 0) ||
			    vpm->vpm_pp != newpage) {
				/*
				 * The vpm got stolen; clear the p_vpmref
				 * and retry.
				 */
				pmtx = PPMTX(newpage);
				mutex_enter(pmtx);
				if (newpage->p_vpmref == vpmref) {
					newpage->p_vpmref = 0;
				}
				mutex_exit(pmtx);

				mutex_exit(vmtx);
				VPM_DEBUG(vpmd_steals);
				goto retry_queue;
			} else if (vpm->vpm_refcnt == 0) {
				/*
				 * Remove it from the free list if it
				 * exists there.
				 */
				VPMAP_RMFREELIST(vpm);
			}
		}
		return (vpm);
	}
}

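/*
 * Put a vpmap whose reference count has dropped to zero back on the
 * tail of its release queue and wake up any thread waiting for a free
 * vpmap.  Called with the vpmap mutex held.
 */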
static void
free_vpmap(struct vpmap *vpm)
{
	struct vpmfree *vpmflp;
	struct vpmap *vpmfreelist;
	union vpm_freeq *releq;

	ASSERT(MUTEX_HELD(VPMAPMTX(vpm)));

	if (vpm->vpm_refcnt != 0) {
		panic("free_vpmap");
		/*NOTREACHED*/
	}

	vpmflp = &vpmd_free[vpm->vpm_free_ndx];
	/*
	 * Add to the tail of the release queue.
	 * Note that vpm_releq and vpm_allocq could toggle
	 * before we get the lock. This does not affect
	 * correctness as the 2 queues are only maintained
	 * to reduce lock pressure.
	 */
	releq = vpmflp->vpm_releq;
	if (releq == &vpmflp->vpm_freeq[0]) {
		vpm->vpm_ndxflg = 0;
	} else {
		vpm->vpm_ndxflg = 1;
	}
	mutex_enter(&releq->vpmq_mtx);
	vpmfreelist = releq->vpmq_free;
	if (vpmfreelist == 0) {
		int want;

		releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
		/*
		 * Both queue mutexes are held to set vpm_want;
		 * snapshot the value before dropping releq mutex.
		 * If vpm_want appears after the releq mutex is dropped,
		 * then the vpmap just freed is already gone.
		 */
		want = vpmflp->vpm_want;
		mutex_exit(&releq->vpmq_mtx);
		/*
		 * See if there was a waiter before dropping the releq mutex,
		 * then recheck after obtaining the vpm_freeq[0] mutex, as
		 * another thread may have already signaled.
		 */
		if (want) {
			mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx);
			if (vpmflp->vpm_want)
				cv_signal(&vpmflp->vpm_free_cv);
			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
		}
	} else {
		vpm->vpm_next = vpmfreelist;
		vpm->vpm_prev = vpmfreelist->vpm_prev;
		vpmfreelist->vpm_prev = vpm;
		vpm->vpm_prev->vpm_next = vpm;
		mutex_exit(&releq->vpmq_mtx);
	}
}

/*
 * Get the vpmap for the page.
 * The refcnt of this vpm is incremented.
 */
static struct vpmap *
get_vpmap(page_t *pp)
{
	struct vpmap *vpm = NULL;
	kmutex_t *vmtx;
	kmutex_t *pmtx;
	unsigned int refid;

	ASSERT((pp != NULL) && PAGE_LOCKED(pp));

	if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) {
		vpm = VPMP(refid);
		vmtx = VPMAPMTX(vpm);
		mutex_enter(vmtx);
		/*
		 * Since we have the page lock and the vpm mutex, the
		 * pp->p_vpmref cannot change.
		 */
		if (vpm->vpm_pp != pp) {
			pmtx = PPMTX(pp);

			/*
			 * Clear the p_vpmref as it is incorrect.
			 * This can happen if the page was stolen.
			 * On x64 this should not happen as p_vpmref
			 * is treated as a mapping on the page. So
			 * if the page is stolen, the mapping would have
			 * been cleared in page_unload().
			 */
			mutex_enter(pmtx);
			if (pp->p_vpmref == refid)
				pp->p_vpmref = 0;
			mutex_exit(pmtx);

			mutex_exit(vmtx);
			vpm = NULL;
		} else if (vpm->vpm_refcnt == 0) {
			/*
			 * Got the vpm, remove it from the free
			 * list if it exists there.
			 */
			VPMAP_RMFREELIST(vpm);
		}
	}
	if (vpm == NULL) {
		/*
		 * get_free_vpmap() returns with the vpmap mutex held.
		 */
		vpm = get_free_vpmap(pp);
		vmtx = VPMAPMTX(vpm);
		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++;
	} else {
		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++;
	}

	vpm->vpm_refcnt++;
	mutex_exit(vmtx);

	return (vpm);
}

/* END --- vpm cache ---- */

/*
 * The vnode page mapping (vpm) interface routines.
 */
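
/*
 * Illustrative sketch only (names other than the vpm routines are
 * placeholders, not part of this file): a file system read/write path
 * that has vpm available typically either does the copy in one call,
 *
 *	error = vpm_data_copy(vp, off, n, uio, fetchpage, NULL, 0, rw);
 *
 * or, when it needs the kernel mappings itself, does something like
 *
 *	vmap_t vml[MINVMAPS];
 *
 *	error = vpm_map_pages(vp, off, n, fetchpage, vml, MINVMAPS,
 *	    NULL, rw);
 *	if (error == 0) {
 *		(access vml[i].vs_addr/vs_len until vs_addr == NULL)
 *		vpm_unmap_pages(vml, rw);
 *	}
 */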

/*
 * Find or create the pages starting from baseoff for the specified
 * length 'len'.
 */
static int
vpm_pagecreate(
	struct vnode *vp,
	u_offset_t baseoff,
	size_t len,
	vmap_t vml[],
	int nseg,
	int *newpage)
{

	page_t *pp = NULL;
	caddr_t base;
	u_offset_t off = baseoff;
	int i;
	ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);

	for (i = 0; len > 0; len -= PAGESIZE, i++) {
		struct vpmap *vpm;


		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {

			base = segkpm_create_va(off);

			/*
			 * The seg pointer passed in is just advisory. Just
			 * pass segkmap for now, like segmap does with
			 * segmap_kpm enabled.
			 */
			if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
			    segkmap, base)) == NULL) {
				panic("vpm_pagecreate: "
				    "page_create failed");
				/*NOTREACHED*/
			}
			if (newpage != NULL)
				*newpage = 1;

			page_io_unlock(pp);
		}

		/*
		 * Get the vpm for this page_t.
		 */
		if (vpm_cache_enable) {
			vpm = get_vpmap(pp);
			vml[i].vs_data = (void *)&vpm->vpm_pp;
		} else {
			vml[i].vs_data = (void *)pp;
			pp->p_vpmref = 0;
		}

		vml[i].vs_addr = hat_kpm_mapin(pp, 0);
		vml[i].vs_len = PAGESIZE;

		off += PAGESIZE;
	}
	vml[i].vs_data = NULL;
	vml[i].vs_addr = (caddr_t)NULL;
	return (0);
}


/*
 * Returns vpm mappings of pages in the range [off, off + len], where
 * len is rounded up to the PAGESIZE boundary. The list of pages and
 * the page addresses are returned in the SGL vml (vmap_t) array passed in.
 * The nseg is the number of vmap_t entries in the array.
 *
 * Currently the max len allowed is MAXBSIZE; therefore, it will
 * fetch/create either one or two pages depending on the PAGESIZE.
 *
 * segmap's SM_LOCKPROTO usage is not supported by these interfaces.
 * For such cases, use the seg_map interfaces.
 */
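
/*
 * For example, with a 4K PAGESIZE, a request with off = 6000 and
 * len = 3000 is aligned down to baseoff = 4096 and rounded up to a
 * length of 8192, so two pages (plus the terminating NULL entry) are
 * returned in vml.
 */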
int
vpm_map_pages(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	int fetchpage,
	vmap_t *vml,
	int nseg,
	int  *newpage,
	enum seg_rw rw)
{
	extern struct vnode *common_specvp();
	u_offset_t baseoff;
	uint_t prot;
	caddr_t base;
	page_t *pp, *pplist[MAXVMAPS];
	struct vpmap *vpm;
	int i, error = 0;

	ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);
	baseoff = off & (offset_t)PAGEMASK;
	vml[0].vs_data = NULL;
	vml[0].vs_addr = (caddr_t)NULL;
	/*
	 * For now, let's restrict it to MAXBSIZE. XXX - We could allow
	 * len longer than MAXBSIZE, but there should be a limit,
	 * which should be determined by how many pages the VOP_GETPAGE()
	 * can fetch.
	 */
	if (off + len > baseoff + MAXBSIZE) {
		panic("vpm_map_pages bad len");
		/*NOTREACHED*/
	}

	/*
	 * If this is a block device we have to be sure to use the
	 * "common" block device vnode for the mapping.
	 */
	if (vp->v_type == VBLK)
		vp = common_specvp(vp);

	/*
	 * round up len to a multiple of PAGESIZE.
	 */
	len = ((off + len - baseoff + PAGESIZE - 1) & (uintptr_t)PAGEMASK);

	if (!fetchpage)
		return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage));

	for (i = 0; len > 0; len -= PAGESIZE, i++, pplist[i] = NULL) {

		pp = page_lookup(vp, baseoff, SE_SHARED);

		/*
		 * If we did not find the page or if this page was not
		 * in our cache, then let VOP_GETPAGE get all the pages.
		 * We need to call VOP_GETPAGE so that filesystems can do
		 * their (possibly unnecessary) tracking for sequential
		 * access.
		 */

		if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) ||
			(rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF)
							!= (P_MOD | P_REF))) {
			if (pp != NULL) {
				page_unlock(pp);
			}

			/*
			 * Pass a dummy address as it will be required
			 * by page_create_va(). We pass segkmap as the seg
			 * as some file systems (UFS) check it.
			 */
			base = segkpm_create_va(baseoff);

			error = VOP_GETPAGE(vp, baseoff, len, &prot, &pplist[i],
			len, segkmap, base, rw, CRED());
			if (error) {
				VPM_DEBUG(vpmd_getpagefailed);
				pplist[i] = NULL;
			}
			break;
		} else {
			pplist[i] = pp;
			baseoff += PAGESIZE;
		}
	}

	if (error) {
		for (i = 0; pplist[i] != NULL; i++) {
			page_unlock(pplist[i]);
			pplist[i] = NULL;
		}
		vml[0].vs_addr = NULL;
		vml[0].vs_data = NULL;
		return (error);
	}

	/*
	 * Get the vpm's for pages.
	 */
	for (i = 0; pplist[i] != NULL; i++) {
		if (vpm_cache_enable) {
			vpm = get_vpmap(pplist[i]);
			vml[i].vs_data = (void *)&(vpm->vpm_pp);
		} else {
			vml[i].vs_data = (void *)pplist[i];
			pplist[i]->p_vpmref = 0;
		}

		vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0);
		vml[i].vs_len = PAGESIZE;
	}

	vml[i].vs_data = NULL;
	vml[i].vs_addr = (caddr_t)NULL;

	return (0);
}

/*
 * Release the vpm mappings on the pages and unlock them.
 */
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
	int i;
	struct vpmap *vpm;
	kmutex_t *mtx;
	page_t *pp;

	for (i = 0; vml[i].vs_data != NULL; i++) {
		ASSERT(IS_KPM_ADDR(vml[i].vs_addr));

		if (vpm_cache_enable) {
			pp = *(((page_t **)vml[i].vs_data));
		} else {
			pp = (page_t *)vml[i].vs_data;
		}

		/*
		 * Mark the page as modified or referenced, because vpm
		 * pages do not take the faults where these bits would
		 * normally get set.
		 */
		if (rw == S_WRITE) {
			hat_setrefmod(pp);
		} else {
			ASSERT(rw == S_READ);
			hat_setref(pp);
		}

		if (vpm_cache_enable) {
			page_unlock(pp);
			vpm = (struct vpmap *)((char *)vml[i].vs_data
					- offsetof(struct vpmap, vpm_pp));
			mtx = VPMAPMTX(vpm);
			mutex_enter(mtx);

			if (--vpm->vpm_refcnt == 0) {
				free_vpmap(vpm);
			}
			mutex_exit(mtx);
		} else {
			hat_kpm_mapout(pp, 0, vml[i].vs_addr);
			(void) page_release(pp, 1);
		}
		vml[i].vs_data = NULL;
		vml[i].vs_addr = NULL;
	}
}

/*
 * Given the vp, off and the uio structure, this routine will do
 * the copy (uiomove). If the last page created is partially written,
 * the rest of the page is zeroed out. It also zeroes the beginning of
 * the first page up to the start offset if requested (zerostart).
 * If pages are to be fetched, it will call the filesystem's getpage
 * function (VOP_GETPAGE) to get them; otherwise they will be created if
 * not already present in the page cache.
 */
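
/*
 * For example, when the page is newly created (fetchpage is zero),
 * zerostart is set, off = 100 and len = 300, bytes 0-99 of the page
 * are zeroed first, bytes 100-399 are copied in from the uio, and the
 * rest of the page after the copy is zeroed as well.
 */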
int
vpm_data_copy(struct vnode *vp,
	u_offset_t off,
	size_t len,
	struct uio *uio,
	int fetchpage,
	int *newpage,
	int zerostart,
	enum seg_rw rw)
{
	int error;
	struct vmap vml[MINVMAPS];
	enum uio_rw uiorw;
	int npages = 0;

	uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ;
	/*
	 * 'off' will be the offset where the I/O starts.
	 * We get the pages starting at the (off & PAGEMASK)
	 * page boundary.
	 */
	error = vpm_map_pages(vp, off, (uint_t)len,
		fetchpage, vml, MINVMAPS, &npages,  rw);

	if (newpage != NULL)
		*newpage = npages;
	if (!error) {
		int i, pn, slen = len;
		int pon = off & PAGEOFFSET;

		/*
		 * Clear from the beginning of the page to start offset
		 * if requested.
		 */
		if (!fetchpage && zerostart) {
			(void) kzero(vml[0].vs_addr,  (uint_t)pon);
			VPM_DEBUG(vpmd_zerostart);
		}

		for (i = 0; !error && slen > 0 &&
				vml[i].vs_addr != NULL; i++) {
			pn = (int)MIN(slen, (PAGESIZE - pon));
			error = uiomove(vml[i].vs_addr + pon,
				    (long)pn, uiorw, uio);
			slen -= pn;
			pon = 0;
		}

		/*
		 * When new pages are created, zero out part of the
		 * page we did not copy to.
		 */
		if (!fetchpage && npages &&
			uio->uio_loffset < roundup(off + len, PAGESIZE)) {
			int nzero;

			pon = (uio->uio_loffset & PAGEOFFSET);
			nzero = PAGESIZE  - pon;
			i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE;
			(void) kzero(vml[i].vs_addr + pon, (uint_t)nzero);
		}
		vpm_unmap_pages(vml, rw);
	}
	return (error);
}

/*
 * called to flush pages for the given vnode covering
 * [off, off+len] range.
 */
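
/*
 * Note that a flags value of 0, or SM_DONTNEED on its own, results in
 * no VOP_PUTPAGE() call; SM_DONTNEED only takes effect when it is
 * combined with one of the other SM_* flags handled below.
 */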
int
vpm_sync_pages(struct vnode *vp,
		u_offset_t off,
		size_t len,
		uint_t flags)
{
	extern struct vnode *common_specvp();
	int bflags = 0;
	int error = 0;
	size_t psize = roundup(len, PAGESIZE);

	/*
	 * If this is a block device we have to be sure to use the
	 * "common" block device vnode for the mapping.
	 */
	if (vp->v_type == VBLK)
		vp = common_specvp(vp);

	if ((flags & ~SM_DONTNEED) != 0) {
		if (flags & SM_ASYNC)
			bflags |= B_ASYNC;
		if (flags & SM_INVAL)
			bflags |= B_INVAL;
		if (flags & SM_DESTROY)
			bflags |= (B_INVAL|B_TRUNC);
		if (flags & SM_FREE)
			bflags |= B_FREE;
		if (flags & SM_DONTNEED)
			bflags |= B_DONTNEED;

		error = VOP_PUTPAGE(vp, off, psize, bflags, CRED());
	}

	return (error);
}


#else	/* SEGKPM_SUPPORT */

/* vpm stubs */
void
vpm_init()
{
}

/*ARGSUSED*/
int
vpm_pagecreate(
	struct vnode *vp,
	u_offset_t baseoff,
	size_t len,
	vmap_t vml[],
	int nseg,
	int *newpage)
{
	return (0);
}

/*ARGSUSED*/
int
vpm_map_pages(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	int fetchpage,
	vmap_t vml[],
	int nseg,
	int *newpage,
	enum seg_rw rw)
{
	return (0);
}

/*ARGSUSED*/
int
vpm_data_copy(struct vnode *vp,
	u_offset_t off,
	size_t len,
	struct uio *uio,
	int fetchpage,
	int *newpage,
	int zerostart,
	enum seg_rw rw)
{
	return (0);
}

/*ARGSUSED*/
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
}
/*ARGSUSED*/
int
vpm_sync_pages(struct vnode *vp,
		u_offset_t off,
		size_t len,
		uint_t flags)
{
	return (0);
}
#endif	/* SEGKPM_SUPPORT */