1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2015, Joyent, Inc. All rights reserved.
24 */
25
26/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27/*	  All Rights Reserved	*/
28
29/*
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
32 * All Rights Reserved
33 *
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
36 * contributors.
37 */
38
39/*
40 * VM - anonymous pages.
41 *
42 * This layer sits immediately above the vm_swap layer.  It manages
43 * physical pages that have no permanent identity in the file system
44 * name space, using the services of the vm_swap layer to allocate
45 * backing storage for these pages.  Since these pages have no external
46 * identity, they are discarded when the last reference is removed.
47 *
48 * An important function of this layer is to manage low-level sharing
49 * of pages that are logically distinct but that happen to be
50 * physically identical (e.g., the corresponding pages of the processes
51 * resulting from a fork before one process or the other changes their
52 * contents).  This pseudo-sharing is present only as an optimization
53 * and is not to be confused with true sharing in which multiple
54 * address spaces deliberately contain references to the same object;
55 * such sharing is managed at a higher level.
56 *
57 * The key data structure here is the anon struct, which contains a
58 * reference count for its associated physical page and a hint about
59 * the identity of that page.  Anon structs typically live in arrays,
60 * with an instance's position in its array determining where the
61 * corresponding backing storage is allocated; however, the swap_xlate()
62 * routine abstracts away this representation information so that the
63 * rest of the anon layer need not know it.  (See the swap layer for
64 * more details on anon struct layout.)
65 *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time, when we can make
 * smarter allocation decisions to improve anonymous klustering.
73 *
74 * Many of the routines defined here take a (struct anon **) argument,
75 * which allows the code at this level to manage anon pages directly,
76 * so that callers can regard anon structs as opaque objects and not be
77 * concerned with assigning or inspecting their contents.
78 *
79 * Clients of this layer refer to anon pages indirectly.  That is, they
80 * maintain arrays of pointers to anon structs rather than maintaining
81 * anon structs themselves.  The (struct anon **) arguments mentioned
82 * above are pointers to entries in these arrays.  It is these arrays
83 * that capture the mapping between offsets within a given segment and
84 * the corresponding anonymous backing storage address.
85 */
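
/*
 * A minimal sketch (illustration only, not one of the interfaces defined
 * below) of how a client such as a segment driver uses this indirection:
 * the segment keeps an anon_map whose anon_hdr (amp->ahp) is created with
 * anon_create(), and a fault at page offset pgoff within the segment is
 * resolved roughly as
 *
 *	struct anon *ap = anon_get_ptr(amp->ahp, anon_index + pgoff);
 *
 * where anon_index is the segment's hypothetical base index into the
 * array.  The anon layer hides whether the header uses a one-level or
 * two-level array of pointers to reach the slot.
 */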
86
87#ifdef DEBUG
88#define	ANON_DEBUG
89#endif
90
91#include <sys/types.h>
92#include <sys/t_lock.h>
93#include <sys/param.h>
94#include <sys/systm.h>
95#include <sys/mman.h>
96#include <sys/cred.h>
97#include <sys/thread.h>
98#include <sys/vnode.h>
99#include <sys/cpuvar.h>
100#include <sys/swap.h>
101#include <sys/cmn_err.h>
102#include <sys/vtrace.h>
103#include <sys/kmem.h>
104#include <sys/sysmacros.h>
105#include <sys/bitmap.h>
106#include <sys/vmsystm.h>
107#include <sys/tuneable.h>
108#include <sys/debug.h>
109#include <sys/fs/swapnode.h>
110#include <sys/tnf_probe.h>
111#include <sys/lgrp.h>
112#include <sys/policy.h>
113#include <sys/condvar_impl.h>
114#include <sys/mutex_impl.h>
115#include <sys/rctl.h>
116
117#include <vm/as.h>
118#include <vm/hat.h>
119#include <vm/anon.h>
120#include <vm/page.h>
121#include <vm/vpage.h>
122#include <vm/seg.h>
123#include <vm/rm.h>
124
125#include <fs/fs_subr.h>
126
127struct vnode *anon_vp;
128
129int anon_debug;
130
131kmutex_t	anoninfo_lock;
132struct		k_anoninfo k_anoninfo;
133ani_free_t	*ani_free_pool;
134pad_mutex_t	anon_array_lock[ANON_LOCKSIZE];
135kcondvar_t	anon_array_cv[ANON_LOCKSIZE];
136
137/*
138 * Global hash table for (vp, off) -> anon slot
139 */
140extern	int swap_maxcontig;
141size_t	anon_hash_size;
142unsigned int anon_hash_shift;
143struct anon **anon_hash;
144
145static struct kmem_cache *anon_cache;
146static struct kmem_cache *anonmap_cache;
147
148pad_mutex_t	*anonhash_lock;
149
150/*
151 * Used to make the increment of all refcnts of all anon slots of a large
152 * page appear to be atomic.  The lock is grabbed for the first anon slot of
153 * a large page.
154 */
155pad_mutex_t	*anonpages_hash_lock;
156
157#define	APH_MUTEX(vp, off)				\
158	(&anonpages_hash_lock[(ANON_HASH((vp), (off)) &	\
159	    (AH_LOCK_SIZE - 1))].pad_mutex)
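
/*
 * For reference: APH_MUTEX() above and AH_MUTEX() (its per-slot
 * counterpart in <vm/anon.h>) both hash a slot's (vp, off) identity into
 * one of the AH_LOCK_SIZE pad_mutex_t's allocated in anon_init() below.
 * AH_MUTEX() guards the hash chain and refcnt of an individual slot;
 * APH_MUTEX(), taken on the first (root) slot of a large page, makes the
 * refcnt updates of all of that page's slots appear atomic.
 */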
160
161#ifdef VM_STATS
162static struct anonvmstats_str {
163	ulong_t getpages[30];
164	ulong_t privatepages[10];
165	ulong_t demotepages[9];
166	ulong_t decrefpages[9];
167	ulong_t	dupfillholes[4];
168	ulong_t freepages[1];
169} anonvmstats;
170#endif /* VM_STATS */
171
172/*ARGSUSED*/
173static int
174anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags)
175{
176	struct anon_map *amp = buf;
177
178	rw_init(&amp->a_rwlock, NULL, RW_DEFAULT, NULL);
179	cv_init(&amp->a_purgecv, NULL, CV_DEFAULT, NULL);
180	mutex_init(&amp->a_pmtx, NULL, MUTEX_DEFAULT, NULL);
181	mutex_init(&amp->a_purgemtx, NULL, MUTEX_DEFAULT, NULL);
182	return (0);
183}
184
185/*ARGSUSED1*/
186static void
187anonmap_cache_destructor(void *buf, void *cdrarg)
188{
189	struct anon_map *amp = buf;
190
191	rw_destroy(&amp->a_rwlock);
192	cv_destroy(&amp->a_purgecv);
193	mutex_destroy(&amp->a_pmtx);
194	mutex_destroy(&amp->a_purgemtx);
195}
196
197void
198anon_init(void)
199{
200	int i;
201	pad_mutex_t *tmp;
202
203	/* These both need to be powers of 2 so round up to the next power */
204	anon_hash_shift = highbit((physmem / ANON_HASHAVELEN) - 1);
205	anon_hash_size = 1L << anon_hash_shift;
206
207	/*
208	 * We need to align the anonhash_lock and anonpages_hash_lock arrays
209	 * to a 64B boundary to avoid false sharing.  We add 63B to our
210	 * allocation so that we can get a 64B aligned address to use.
211	 * We allocate both of these together to avoid wasting an additional
212	 * 63B.
213	 */
214	tmp = kmem_zalloc((2 * AH_LOCK_SIZE * sizeof (pad_mutex_t)) + 63,
215	    KM_SLEEP);
216	anonhash_lock = (pad_mutex_t *)P2ROUNDUP((uintptr_t)tmp, 64);
217	anonpages_hash_lock = anonhash_lock + AH_LOCK_SIZE;
218
219	for (i = 0; i < AH_LOCK_SIZE; i++) {
220		mutex_init(&anonhash_lock[i].pad_mutex, NULL, MUTEX_DEFAULT,
221		    NULL);
222		mutex_init(&anonpages_hash_lock[i].pad_mutex, NULL,
223		    MUTEX_DEFAULT, NULL);
224	}
225
226	for (i = 0; i < ANON_LOCKSIZE; i++) {
227		mutex_init(&anon_array_lock[i].pad_mutex, NULL,
228		    MUTEX_DEFAULT, NULL);
229		cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL);
230	}
231
232	anon_hash = (struct anon **)
233	    kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP);
234	anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon),
235	    AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, KMC_PREFILL);
236	anonmap_cache = kmem_cache_create("anonmap_cache",
237	    sizeof (struct anon_map), 0,
238	    anonmap_cache_constructor, anonmap_cache_destructor, NULL,
239	    NULL, NULL, 0);
240	swap_maxcontig = (1024 * 1024) >> PAGESHIFT;	/* 1MB of pages */
241
242	tmp = kmem_zalloc((ANI_MAX_POOL * sizeof (ani_free_t)) + 63, KM_SLEEP);
243	/* Round ani_free_pool to cacheline boundary to avoid false sharing. */
244	ani_free_pool = (ani_free_t *)P2ROUNDUP((uintptr_t)tmp, 64);
245
246	anon_vp = vn_alloc(KM_SLEEP);
247	vn_setops(anon_vp, swap_vnodeops);
248	anon_vp->v_type = VREG;
249	anon_vp->v_flag |= (VISSWAP|VISSWAPFS);
250}
251
252/*
253 * Global anon slot hash table manipulation.
254 */
255
256static void
257anon_addhash(struct anon *ap)
258{
259	int index;
260
261	ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off)));
262	index = ANON_HASH(ap->an_vp, ap->an_off);
263	ap->an_hash = anon_hash[index];
264	anon_hash[index] = ap;
265}
266
267static void
268anon_rmhash(struct anon *ap)
269{
270	struct anon **app;
271
272	ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off)));
273
274	for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)];
275	    *app; app = &((*app)->an_hash)) {
276		if (*app == ap) {
277			*app = ap->an_hash;
278			break;
279		}
280	}
281}
282
283/*
 * The anon array interfaces: functions for allocating and
 * freeing arrays of pointers, and for getting and setting the
 * entries in the array of pointers for a given index.
287 *
288 * Create the list of pointers
289 */
290struct anon_hdr *
291anon_create(pgcnt_t npages, int flags)
292{
293	struct anon_hdr *ahp;
294	ulong_t nchunks;
295	int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
296
297	if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) {
298		return (NULL);
299	}
300
301	mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL);
302	/*
303	 * Single level case.
304	 */
305	ahp->size = npages;
306	if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) {
307
308		if (flags & ANON_ALLOC_FORCE)
309			ahp->flags |= ANON_ALLOC_FORCE;
310
311		ahp->array_chunk = kmem_zalloc(
312		    ahp->size * sizeof (struct anon *), kmemflags);
313
314		if (ahp->array_chunk == NULL) {
315			kmem_free(ahp, sizeof (struct anon_hdr));
316			return (NULL);
317		}
318	} else {
319		/*
		 * 2 level case.
		 * The anon hdr size needs to be rounded up to a multiple
		 * of ANON_CHUNK_SIZE, as various anon-related functions
		 * depend on this.
324		 * NOTE -
325		 * anon_grow()  makes anon hdr size a multiple of
326		 * ANON_CHUNK_SIZE.
327		 * amp size is <= anon hdr size.
328		 * anon_index + seg_pgs <= anon hdr size.
329		 */
330		ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE);
331		nchunks = ahp->size >> ANON_CHUNK_SHIFT;
332
333		ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *),
334		    kmemflags);
335
336		if (ahp->array_chunk == NULL) {
337			kmem_free(ahp, sizeof (struct anon_hdr));
338			return (NULL);
339		}
340	}
341	return (ahp);
342}
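
/*
 * A hypothetical end-to-end use of the array interfaces in this group,
 * shown only as a sketch (error handling for ANON_NOSLEEP is omitted):
 *
 *	struct anon_hdr *ahp = anon_create(npages, ANON_SLEEP);
 *	(void) anon_set_ptr(ahp, idx, ap, ANON_SLEEP);
 *	ap = anon_get_ptr(ahp, idx);
 *	anon_release(ahp, npages);
 */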
343
344/*
345 * Free the array of pointers
346 */
347void
348anon_release(struct anon_hdr *ahp, pgcnt_t npages)
349{
350	ulong_t i;
351	void **ppp;
352	ulong_t nchunks;
353
354	ASSERT(npages <= ahp->size);
355
356	/*
357	 * Single level case.
358	 */
359	if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
360		kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *));
361	} else {
362		/*
363		 * 2 level case.
364		 */
365		nchunks = ahp->size >> ANON_CHUNK_SHIFT;
366		for (i = 0; i < nchunks; i++) {
367			ppp = &ahp->array_chunk[i];
368			if (*ppp != NULL)
369				kmem_free(*ppp, PAGESIZE);
370		}
371		kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *));
372	}
373	mutex_destroy(&ahp->serial_lock);
374	kmem_free(ahp, sizeof (struct anon_hdr));
375}
376
377/*
378 * Return the pointer from the list for a
379 * specified anon index.
380 */
381struct anon *
382anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx)
383{
384	struct anon **app;
385
386	ASSERT(an_idx < ahp->size);
387
388	/*
389	 * Single level case.
390	 */
391	if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
392		return ((struct anon *)
393		    ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK));
394	} else {
395
396		/*
397		 * 2 level case.
398		 */
399		app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
400		if (app) {
401			return ((struct anon *)
402			    ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] &
403			    ANON_PTRMASK));
404		} else {
405			return (NULL);
406		}
407	}
408}
409
410/*
411 * Return the anon pointer for the first valid entry in the anon list,
412 * starting from the given index.
413 */
414struct anon *
415anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index)
416{
417	struct anon *ap;
418	struct anon **app;
419	ulong_t chunkoff;
420	ulong_t i;
421	ulong_t j;
422	pgcnt_t size;
423
424	i = *index;
425	size = ahp->size;
426
427	ASSERT(i < size);
428
429	if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
430		/*
431		 * 1 level case
432		 */
433		while (i < size) {
434			ap = (struct anon *)
435			    ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK);
436			if (ap) {
437				*index = i;
438				return (ap);
439			}
440			i++;
441		}
442	} else {
443		/*
444		 * 2 level case
445		 */
446		chunkoff = i & ANON_CHUNK_OFF;
447		while (i < size) {
448			app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT];
449			if (app)
450				for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) {
451					ap = (struct anon *)
452					    ((uintptr_t)app[j] & ANON_PTRMASK);
453					if (ap) {
454						*index = i + (j - chunkoff);
455						return (ap);
456					}
457				}
458			chunkoff = 0;
459			i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF;
460		}
461	}
462	*index = size;
463	return (NULL);
464}
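
/*
 * A hypothetical iteration over the valid slots of an anon array, shown
 * only to illustrate how the in/out index parameter of anon_get_next_ptr()
 * is used (anon_free() below follows the same pattern):
 *
 *	ulong_t idx = 0;
 *	struct anon *ap;
 *
 *	while (idx < ahp->size &&
 *	    (ap = anon_get_next_ptr(ahp, &idx)) != NULL) {
 *		... operate on ap, which lives at index idx ...
 *		idx++;
 *	}
 */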
465
466/*
 * Set the list entry to a given pointer for a specified index
468 */
469int
470anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags)
471{
472	void		**ppp;
473	struct anon	**app;
474	int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
475	uintptr_t	*ap_addr;
476
477	ASSERT(an_idx < ahp->size);
478
479	/*
480	 * Single level case.
481	 */
482	if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
483		ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx];
484	} else {
485
486		/*
487		 * 2 level case.
488		 */
489		ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
490
491		ASSERT(ppp != NULL);
492		if (*ppp == NULL) {
493			mutex_enter(&ahp->serial_lock);
494			ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
495			if (*ppp == NULL) {
496				*ppp = kmem_zalloc(PAGESIZE, kmemflags);
497				if (*ppp == NULL) {
498					mutex_exit(&ahp->serial_lock);
499					return (ENOMEM);
500				}
501			}
502			mutex_exit(&ahp->serial_lock);
503		}
504		app = *ppp;
505		ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF];
506	}
507	*ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap;
508	return (0);
509}
510
511/*
512 * Copy anon array into a given new anon array
513 */
514int
515anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, struct anon_hdr *dahp,
516    ulong_t d_idx, pgcnt_t npages, int flags)
517{
518	void **sapp, **dapp;
519	void *ap;
520	int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
521
522	ASSERT((s_idx < sahp->size) && (d_idx < dahp->size));
523	ASSERT((npages <= sahp->size) && (npages <= dahp->size));
524
525	/*
526	 * Both arrays are 1 level.
527	 */
528	if (((sahp->size <= ANON_CHUNK_SIZE) &&
529	    (dahp->size <= ANON_CHUNK_SIZE)) ||
530	    ((sahp->flags & ANON_ALLOC_FORCE) &&
531	    (dahp->flags & ANON_ALLOC_FORCE))) {
532
533		bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx],
534		    npages * sizeof (struct anon *));
535		return (0);
536	}
537
538	/*
539	 * Both arrays are 2 levels.
540	 */
541	if (sahp->size > ANON_CHUNK_SIZE &&
542	    dahp->size > ANON_CHUNK_SIZE &&
543	    ((sahp->flags & ANON_ALLOC_FORCE) == 0) &&
544	    ((dahp->flags & ANON_ALLOC_FORCE) == 0)) {
545
546		ulong_t sapidx, dapidx;
547		ulong_t *sap, *dap;
548		ulong_t chknp;
549
550		while (npages != 0) {
551
552			sapidx = s_idx & ANON_CHUNK_OFF;
553			dapidx = d_idx & ANON_CHUNK_OFF;
554			chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx);
555			if (chknp > npages)
556				chknp = npages;
557
558			sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT];
559			if ((sap = *sapp) != NULL) {
560				dapp = &dahp->array_chunk[d_idx
561				    >> ANON_CHUNK_SHIFT];
562				if ((dap = *dapp) == NULL) {
563					*dapp = kmem_zalloc(PAGESIZE,
564					    kmemflags);
565					if ((dap = *dapp) == NULL)
566						return (ENOMEM);
567				}
568				bcopy((sap + sapidx), (dap + dapidx),
569				    chknp << ANON_PTRSHIFT);
570			}
571			s_idx += chknp;
572			d_idx += chknp;
573			npages -= chknp;
574		}
575		return (0);
576	}
577
578	/*
579	 * At least one of the arrays is 2 level.
580	 */
581	while (npages--) {
582		if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) {
583			ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx)));
584			if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM)
				return (ENOMEM);
586		}
587		s_idx++;
588		d_idx++;
589	}
590	return (0);
591}
592
593
594/*
595 * ANON_INITBUF is a convenience macro for anon_grow() below. It
596 * takes a buffer dst, which is at least as large as buffer src. It
597 * does a bcopy from src into dst, and then bzeros the extra bytes
598 * of dst. If tail is set, the data in src is tail aligned within
599 * dst instead of head aligned.
600 */
601
602#define	ANON_INITBUF(src, srclen, dst, dstsize, tail)			      \
603	if (tail) {							      \
604		bzero((dst), (dstsize) - (srclen));			      \
605		bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \
606	} else {							      \
607		bcopy((src), (dst), (srclen));				      \
608		bzero((char *)(dst) + (srclen), (dstsize) - (srclen));	      \
609	}
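
/*
 * For example, copying a two-slot src into a four-slot dst conceptually
 * yields (slot values shown only for illustration):
 *
 *	tail == 0:	dst = { src[0], src[1], 0, 0 }
 *	tail != 0:	dst = { 0, 0, src[0], src[1] }
 */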
610
611#define	ANON_1_LEVEL_INC	(ANON_CHUNK_SIZE / 8)
612#define	ANON_2_LEVEL_INC	(ANON_1_LEVEL_INC * ANON_CHUNK_SIZE)
613
614/*
615 * anon_grow() is used to efficiently extend an existing anon array.
616 * startidx_p points to the index into the anon array of the first page
617 * that is in use. oldseg_pgs is the number of pages in use, starting at
 * *startidx_p. newseg_pgs is the number of additional pages desired.
619 *
620 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed.
621 *
622 * The growth is done by creating a new top level of the anon array,
623 * and (if the array is 2-level) reusing the existing second level arrays.
624 *
625 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN.
626 *
627 * Returns the new number of pages in the anon array.
628 */
629pgcnt_t
630anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs,
631    pgcnt_t newseg_pgs, int flags)
632{
633	ulong_t startidx = startidx_p ? *startidx_p : 0;
634	pgcnt_t oldamp_pgs = ahp->size, newamp_pgs;
635	pgcnt_t oelems, nelems, totpages;
636	void **level1;
637	int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
638	int growdown = (flags & ANON_GROWDOWN);
639	size_t newarrsz, oldarrsz;
640	void *level2;
641
642	ASSERT(!(startidx_p == NULL && growdown));
643	ASSERT(startidx + oldseg_pgs <= ahp->size);
644
645	/*
646	 * Determine the total number of pages needed in the new
647	 * anon array. If growing down, totpages is all pages from
648	 * startidx through the end of the array, plus <newseg_pgs>
649	 * pages. If growing up, keep all pages from page 0 through
650	 * the last page currently in use, plus <newseg_pgs> pages.
651	 */
652	if (growdown)
653		totpages = oldamp_pgs - startidx + newseg_pgs;
654	else
655		totpages = startidx + oldseg_pgs + newseg_pgs;
656
657	/* If the array is already large enough, just return. */
658
659	if (oldamp_pgs >= totpages) {
660		if (growdown)
661			*startidx_p = oldamp_pgs - totpages;
662		return (oldamp_pgs);
663	}
664
665	/*
666	 * oldamp_pgs/newamp_pgs are the total numbers of pages represented
667	 * by the corresponding arrays.
668	 * oelems/nelems are the number of pointers in the top level arrays
669	 * which may be either level 1 or level 2.
670	 * Will the new anon array be one level or two levels?
671	 */
672	if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
673		newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC);
674		oelems = oldamp_pgs;
675		nelems = newamp_pgs;
676	} else {
677		newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC);
678		oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT;
679		nelems = newamp_pgs >> ANON_CHUNK_SHIFT;
680	}
681
682	newarrsz = nelems * sizeof (void *);
683	level1 = kmem_alloc(newarrsz, kmemflags);
684	if (level1 == NULL)
685		return (0);
686
687	/* Are we converting from a one level to a two level anon array? */
688
689	if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE &&
690	    !(ahp->flags & ANON_ALLOC_FORCE)) {
691
692		/*
693		 * Yes, we're converting to a two level. Reuse old level 1
694		 * as new level 2 if it is exactly PAGESIZE. Otherwise
695		 * alloc a new level 2 and copy the old level 1 data into it.
696		 */
697		if (oldamp_pgs == ANON_CHUNK_SIZE) {
698			level2 = (void *)ahp->array_chunk;
699		} else {
700			level2 = kmem_alloc(PAGESIZE, kmemflags);
701			if (level2 == NULL) {
702				kmem_free(level1, newarrsz);
703				return (0);
704			}
705			oldarrsz = oldamp_pgs * sizeof (void *);
706
707			ANON_INITBUF(ahp->array_chunk, oldarrsz,
708			    level2, PAGESIZE, growdown);
709			kmem_free(ahp->array_chunk, oldarrsz);
710		}
711		bzero(level1, newarrsz);
712		if (growdown)
713			level1[nelems - 1] = level2;
714		else
715			level1[0] = level2;
716	} else {
717		oldarrsz = oelems * sizeof (void *);
718
719		ANON_INITBUF(ahp->array_chunk, oldarrsz,
720		    level1, newarrsz, growdown);
721		kmem_free(ahp->array_chunk, oldarrsz);
722	}
723
724	ahp->array_chunk = level1;
725	ahp->size = newamp_pgs;
726	if (growdown)
727		*startidx_p = newamp_pgs - totpages;
728
729	return (newamp_pgs);
730}
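
/*
 * Worked example (hypothetical sizes, assuming ANON_CHUNK_SIZE is 512,
 * i.e. 4K pages and 8-byte anon pointers): growing a one-level array of
 * 300 slots upward by 400 pages gives totpages = 700.  Since that exceeds
 * ANON_CHUNK_SIZE, the array is converted to two levels: newamp_pgs is
 * rounded up to a multiple of ANON_2_LEVEL_INC, a PAGESIZE level-2 chunk
 * is allocated to hold the old 300 pointers, and that chunk becomes
 * level1[0] of the new top-level array.
 */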
731
732
733/*
734 * Called to sync ani_free value.
735 */
736
737void
738set_anoninfo(void)
739{
740	processorid_t	ix, max_seqid;
741	pgcnt_t		total = 0;
742	static clock_t	last_time;
743	clock_t		new_time;
744
745	if (ani_free_pool == NULL)
746		return;
747
748	/*
	 * Recompute ani_free at most once per tick. Use max_cpu_seqid_ever to
	 * identify the maximum number of CPUs that were ever online.
751	 */
752	new_time = ddi_get_lbolt();
753	if (new_time > last_time) {
754
755		max_seqid = max_cpu_seqid_ever;
756		ASSERT(ANI_MAX_POOL > max_seqid);
757		for (ix = 0; ix <= max_seqid; ix++)
758			total += ani_free_pool[ix].ani_count;
759
760		last_time = new_time;
761		k_anoninfo.ani_free = total;
762	}
763}
764
765/*
766 * Reserve anon space.
767 *
 * It's no longer simply a matter of incrementing ani_resv to
 * reserve swap space; we need to check memory-based as well
 * as disk-backed (physical) swap.  The following algorithm
771 * is used:
772 *	Check the space on physical swap
773 *		i.e. amount needed < ani_max - ani_phys_resv
774 *	If we are swapping on swapfs check
775 *		amount needed < (availrmem - swapfs_minfree)
776 * Since the algorithm to check for the quantity of swap space is
777 * almost the same as that for reserving it, we'll just use anon_resvmem
778 * with a flag to decrement availrmem.
779 *
780 * Return non-zero on success.
781 */
782int
783anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard)
784{
785	pgcnt_t npages = btopr(size);
786	pgcnt_t mswap_pages = 0;
787	pgcnt_t pswap_pages = 0;
788	proc_t *p = curproc;
789
790	if (zone != NULL) {
791		/* test zone.max-swap resource control */
792		mutex_enter(&p->p_lock);
793		if (rctl_incr_swap(p, zone, ptob(npages)) != 0) {
794			mutex_exit(&p->p_lock);
795
796			if (takemem)
797				atomic_add_64(&zone->zone_anon_alloc_fail, 1);
798
799			return (0);
800		}
801
802		if (!takemem)
803			rctl_decr_swap(zone, ptob(npages));
804
805		mutex_exit(&p->p_lock);
806	}
807	mutex_enter(&anoninfo_lock);
808
809	/*
810	 * pswap_pages is the number of pages we can take from
811	 * physical (i.e. disk-backed) swap.
812	 */
813	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
814	pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv;
815
816	ANON_PRINT(A_RESV,
817	    ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n",
818	    npages, takemem, pswap_pages, (void *)caller()));
819
820	if (npages <= pswap_pages) {
821		/*
		 * we have enough space on physical swap
823		 */
824		if (takemem)
825			k_anoninfo.ani_phys_resv += npages;
826		mutex_exit(&anoninfo_lock);
827		return (1);
828	} else if (pswap_pages != 0) {
829		/*
		 * we have some space on physical swap
831		 */
832		if (takemem) {
833			/*
834			 * use up remainder of phys swap
835			 */
836			k_anoninfo.ani_phys_resv += pswap_pages;
837			ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max);
838		}
839	}
840	/*
	 * Since (npages > pswap_pages) we need mem swap;
	 * mswap_pages is the number of pages needed from availrmem
843	 */
844	ASSERT(npages > pswap_pages);
845	mswap_pages = npages - pswap_pages;
846
847	ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n",
848	    mswap_pages));
849
850	/*
851	 * priv processes can reserve memory as swap as long as availrmem
852	 * remains greater than swapfs_minfree; in the case of non-priv
853	 * processes, memory can be reserved as swap only if availrmem
854	 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus,
855	 * swapfs_reserve amount of memswap is not available to non-priv
	 * processes. This protects daemons such as the automounter from dying
	 * as a result of application processes eating away almost the entire
	 * memory-based swap. This safeguard becomes useless if apps are run
	 * with root access.
	 *
	 * swapfs_reserve is the minimum of 4MB or 1/16 of physmem.
862	 *
863	 */
864	if (tryhard) {
865		pgcnt_t floor_pages;
866
867		if (secpolicy_resource_anon_mem(CRED())) {
868			floor_pages = swapfs_minfree;
869		} else {
870			floor_pages = swapfs_minfree + swapfs_reserve;
871		}
872
873		mutex_exit(&anoninfo_lock);
874		(void) page_reclaim_mem(mswap_pages, floor_pages, 0);
875		mutex_enter(&anoninfo_lock);
876	}
877
878	mutex_enter(&freemem_lock);
879	if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) ||
880	    (availrmem > (swapfs_minfree + mswap_pages) &&
881	    secpolicy_resource(CRED()) == 0)) {
882
883		if (takemem) {
884			/*
885			 * Take the memory from the rest of the system.
886			 */
887			availrmem -= mswap_pages;
888			mutex_exit(&freemem_lock);
889			k_anoninfo.ani_mem_resv += mswap_pages;
890			ANI_ADD(mswap_pages);
891			ANON_PRINT((A_RESV | A_MRESV),
892			    ("anon_resvmem: took %ld pages of availrmem\n",
893			    mswap_pages));
894		} else {
895			mutex_exit(&freemem_lock);
896		}
897
898		ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
899		mutex_exit(&anoninfo_lock);
900		return (1);
901	} else {
902		/*
903		 * Fail if not enough memory
904		 */
905		if (takemem) {
906			k_anoninfo.ani_phys_resv -= pswap_pages;
907		}
908
909		mutex_exit(&freemem_lock);
910		mutex_exit(&anoninfo_lock);
911		ANON_PRINT(A_RESV,
912		    ("anon_resvmem: not enough space from swapfs\n"));
913		if (zone != NULL && takemem)
914			rctl_decr_swap(zone, ptob(npages));
915		return (0);
916	}
917}
918
919/*
920 * Give back an anon reservation.
921 */
922void
923anon_unresvmem(size_t size, zone_t *zone)
924{
925	pgcnt_t npages = btopr(size);
926	spgcnt_t mem_free_pages = 0;
927	pgcnt_t phys_free_slots;
928#ifdef	ANON_DEBUG
929	pgcnt_t mem_resv;
930#endif
931	if (zone != NULL)
932		rctl_decr_swap(zone, ptob(npages));
933
934	mutex_enter(&anoninfo_lock);
935
936	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
937
938	/*
	 * If some of this reservation belonged to swapfs,
	 * give it back to availrmem.
	 * ani_mem_resv is the amount of availrmem swapfs has reserved,
	 * but some of that memory could be locked by segspt, so we can only
	 * return the non-locked portion of ani_mem_resv back to availrmem.
944	 */
945	if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) {
946		ANON_PRINT((A_RESV | A_MRESV),
947		    ("anon_unresv: growing availrmem by %ld pages\n",
948		    MIN(k_anoninfo.ani_mem_resv, npages)));
949
950		mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv -
951		    k_anoninfo.ani_locked_swap), npages);
952		mutex_enter(&freemem_lock);
953		availrmem += mem_free_pages;
954		mutex_exit(&freemem_lock);
955		k_anoninfo.ani_mem_resv -= mem_free_pages;
956
957		ANI_ADD(-mem_free_pages);
958	}
959	/*
960	 * The remainder of the pages is returned to phys swap
961	 */
962	ASSERT(npages >= mem_free_pages);
963	phys_free_slots = npages - mem_free_pages;
964
965	if (phys_free_slots) {
966		k_anoninfo.ani_phys_resv -= phys_free_slots;
967	}
968
969#ifdef	ANON_DEBUG
970	mem_resv = k_anoninfo.ani_mem_resv;
971#endif
972
973	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
974	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
975
976	mutex_exit(&anoninfo_lock);
977
978	ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n",
979	    npages, mem_resv, (void *)caller()));
980}
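
/*
 * Sketch of how a caller is expected to pair the two interfaces above
 * (hypothetical fragment; the real callers live elsewhere in the VM
 * subsystem):
 *
 *	if (anon_resvmem(len, B_TRUE, zone, 0) == 0)
 *		return (EAGAIN);		(reservation failed)
 *	...
 *	anon_unresvmem(len, zone);		(give it back when done)
 */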
981
982/*
 * Allocate an anon slot, add it to the global anon hash, and return it.
984 */
985struct anon *
986anon_alloc(struct vnode *vp, anoff_t off)
987{
988	struct anon	*ap;
989	kmutex_t	*ahm;
990
991	ap = kmem_cache_alloc(anon_cache, KM_SLEEP);
992	if (vp == NULL) {
993		swap_alloc(ap);
994	} else {
995		ap->an_vp = vp;
996		ap->an_off = off;
997	}
998	ap->an_refcnt = 1;
999	ap->an_pvp = NULL;
1000	ap->an_poff = 0;
1001	ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1002	mutex_enter(ahm);
1003	anon_addhash(ap);
1004	mutex_exit(ahm);
1005	ANI_ADD(-1);
1006	ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n",
1007	    (void *)ap, (ap ? (void *)ap->an_vp : NULL)));
1008	return (ap);
1009}
1010
1011/*
1012 * Called for pages locked in memory via softlock/pagelock/mlock to make sure
1013 * such pages don't consume any physical swap resources needed for swapping
1014 * unlocked pages.
1015 */
1016void
1017anon_swap_free(struct anon *ap, page_t *pp)
1018{
1019	kmutex_t *ahm;
1020
1021	ASSERT(ap != NULL);
1022	ASSERT(pp != NULL);
1023	ASSERT(PAGE_LOCKED(pp));
1024	ASSERT(pp->p_vnode != NULL);
1025	ASSERT(IS_SWAPFSVP(pp->p_vnode));
1026	ASSERT(ap->an_refcnt != 0);
1027	ASSERT(pp->p_vnode == ap->an_vp);
1028	ASSERT(pp->p_offset == ap->an_off);
1029
1030	if (ap->an_pvp == NULL)
1031		return;
1032
1033	page_io_lock(pp);
1034	ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1035	mutex_enter(ahm);
1036
1037	ASSERT(ap->an_refcnt != 0);
1038	ASSERT(pp->p_vnode == ap->an_vp);
1039	ASSERT(pp->p_offset == ap->an_off);
1040
1041	if (ap->an_pvp != NULL) {
1042		swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
1043		ap->an_pvp = NULL;
1044		ap->an_poff = 0;
1045		mutex_exit(ahm);
1046		hat_setmod(pp);
1047	} else {
1048		mutex_exit(ahm);
1049	}
1050	page_io_unlock(pp);
1051}
1052
1053/*
1054 * Decrement the reference count of an anon page.
 * If the reference count goes to zero, free it and
1056 * its associated page (if any).
1057 */
1058void
1059anon_decref(struct anon *ap)
1060{
1061	page_t *pp;
1062	struct vnode *vp;
1063	anoff_t off;
1064	kmutex_t *ahm;
1065
1066	ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1067	mutex_enter(ahm);
1068	ASSERT(ap->an_refcnt != 0);
1069	if (ap->an_refcnt == 0)
1070		panic("anon_decref: slot count 0");
1071	if (--ap->an_refcnt == 0) {
1072		swap_xlate(ap, &vp, &off);
1073		anon_rmhash(ap);
1074		if (ap->an_pvp != NULL)
1075			swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
1076		mutex_exit(ahm);
1077
1078		/*
1079		 * If there is a page for this anon slot we will need to
1080		 * call VN_DISPOSE to get rid of the vp association and
1081		 * put the page back on the free list as really free.
1082		 * Acquire the "exclusive" lock to ensure that any
1083		 * pending i/o always completes before the swap slot
1084		 * is freed.
1085		 */
1086		pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
1087		if (pp != NULL) {
1088			/*LINTED: constant in conditional context */
1089			VN_DISPOSE(pp, B_INVAL, 0, kcred);
1090		}
1091		ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n",
1092		    (void *)ap, (void *)ap->an_vp));
1093
1094		kmem_cache_free(anon_cache, ap);
1095
1096		ANI_ADD(1);
1097	} else {
1098		mutex_exit(ahm);
1099	}
1100}
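
/*
 * Illustrative lifetime of a single slot (not a prescribed interface):
 * a slot obtained with anon_alloc(NULL, 0) is given a swapfs identity by
 * swap_alloc(), sharers bump an_refcnt under AH_MUTEX(), and the final
 * anon_decref() removes the slot from the hash, frees any physical swap
 * behind it, disposes of the page, and returns the struct to anon_cache.
 */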
1101
1102
1103/*
 * Check an_refcnt of the root anon slot (the anon_index argument is aligned
 * at seg->s_szc level) to determine whether COW processing is required.
 * anonpages_hash_lock[] held on the root ap ensures that if the root's
 * refcnt is 1, all other refcnts are 1 as well (and they can't increase
 * later since this process can't fork while its AS lock is held).
 *
 * Returns 1 if the root anon slot has a refcnt > 1; otherwise returns 0.
1111 */
1112int
1113anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index)
1114{
1115	struct anon	*ap;
1116	kmutex_t	*ahmpages = NULL;
1117
1118	ap = anon_get_ptr(ahp, anon_index);
1119	if (ap == NULL)
1120		return (0);
1121
1122	ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
1123	mutex_enter(ahmpages);
1124	ASSERT(ap->an_refcnt >= 1);
1125	if (ap->an_refcnt == 1) {
1126		mutex_exit(ahmpages);
1127		return (0);
1128	}
1129	mutex_exit(ahmpages);
1130	return (1);
1131}
1132/*
1133 * Check 'nslots' anon slots for refcnt > 1.
1134 *
 * Returns 1 if any of the 'nslots' anon slots has a refcnt > 1; otherwise
 * returns 0.
1137 */
1138static int
1139anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
1140{
1141	struct anon *ap;
1142
1143	while (nslots-- > 0) {
1144		if ((ap = anon_get_ptr(ahp, anon_index)) != NULL &&
1145		    ap->an_refcnt > 1)
1146			return (1);
1147		anon_index++;
1148	}
1149
1150	return (0);
1151}
1152
1153static void
1154anon_decref_pages(
1155	struct anon_hdr *ahp,
1156	ulong_t an_idx,
1157	uint_t szc)
1158{
1159	struct anon *ap = anon_get_ptr(ahp, an_idx);
1160	kmutex_t *ahmpages = NULL;
1161	page_t *pp;
1162	pgcnt_t pgcnt = page_get_pagecnt(szc);
1163	pgcnt_t i;
1164	struct vnode *vp;
1165	anoff_t   off;
1166	kmutex_t *ahm;
1167#ifdef DEBUG
1168	int refcnt = 1;
1169#endif
1170
1171	ASSERT(szc != 0);
1172	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1173	ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
1174	ASSERT(an_idx < ahp->size);
1175
1176	if (ahp->size - an_idx < pgcnt) {
1177		/*
		 * In the case of shared mappings, the total anon map size may
		 * not be aligned to the largest page size.
1180		 */
1181		pgcnt = ahp->size - an_idx;
1182	}
1183
1184	VM_STAT_ADD(anonvmstats.decrefpages[0]);
1185
1186	if (ap != NULL) {
1187		ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
1188		mutex_enter(ahmpages);
1189		ASSERT((refcnt = ap->an_refcnt) != 0);
1190		VM_STAT_ADD(anonvmstats.decrefpages[1]);
1191		if (ap->an_refcnt == 1) {
1192			VM_STAT_ADD(anonvmstats.decrefpages[2]);
1193			ASSERT(!anon_share(ahp, an_idx, pgcnt));
1194			mutex_exit(ahmpages);
1195			ahmpages = NULL;
1196		}
1197	}
1198
1199	i = 0;
1200	while (i < pgcnt) {
1201		if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) {
1202			ASSERT(refcnt == 1 && ahmpages == NULL);
1203			i++;
1204			continue;
1205		}
1206		ASSERT(ap->an_refcnt == refcnt);
1207		ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
1208		ASSERT(ahmpages == NULL || ap->an_refcnt > 1);
1209
1210		if (ahmpages == NULL) {
1211			swap_xlate(ap, &vp, &off);
1212			pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
1213			if (pp == NULL || pp->p_szc == 0) {
1214				VM_STAT_ADD(anonvmstats.decrefpages[3]);
1215				ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1216				(void) anon_set_ptr(ahp, an_idx + i, NULL,
1217				    ANON_SLEEP);
1218				mutex_enter(ahm);
1219				ap->an_refcnt--;
1220				ASSERT(ap->an_refcnt == 0);
1221				anon_rmhash(ap);
1222				if (ap->an_pvp)
1223					swap_phys_free(ap->an_pvp, ap->an_poff,
1224					    PAGESIZE);
1225				mutex_exit(ahm);
1226				if (pp == NULL) {
1227					pp = page_lookup(vp, (u_offset_t)off,
1228					    SE_EXCL);
1229					ASSERT(pp == NULL || pp->p_szc == 0);
1230				}
1231				if (pp != NULL) {
1232					VM_STAT_ADD(anonvmstats.decrefpages[4]);
1233					/*LINTED*/
1234					VN_DISPOSE(pp, B_INVAL, 0, kcred);
1235				}
1236				kmem_cache_free(anon_cache, ap);
1237				ANI_ADD(1);
1238				i++;
1239			} else {
1240				pgcnt_t j;
1241				pgcnt_t curpgcnt =
1242				    page_get_pagecnt(pp->p_szc);
1243				size_t ppasize = curpgcnt * sizeof (page_t *);
1244				page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
1245				int dispose = 0;
1246
1247				VM_STAT_ADD(anonvmstats.decrefpages[5]);
1248
1249				ASSERT(pp->p_szc <= szc);
1250				ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt));
1251				ASSERT(IS_P2ALIGNED(i, curpgcnt));
1252				ASSERT(i + curpgcnt <= pgcnt);
1253				ASSERT(!(page_pptonum(pp) & (curpgcnt - 1)));
1254				ppa[0] = pp;
1255				for (j = i + 1; j < i + curpgcnt; j++) {
1256					ap = anon_get_ptr(ahp, an_idx + j);
1257					ASSERT(ap != NULL &&
1258					    ap->an_refcnt == 1);
1259					swap_xlate(ap, &vp, &off);
1260					pp = page_lookup(vp, (u_offset_t)off,
1261					    SE_EXCL);
1262					if (pp == NULL)
1263						panic("anon_decref_pages: "
1264						    "no page");
1265
1266					(void) hat_pageunload(pp,
1267					    HAT_FORCE_PGUNLOAD);
1268					ASSERT(pp->p_szc == ppa[0]->p_szc);
1269					ASSERT(page_pptonum(pp) - 1 ==
1270					    page_pptonum(ppa[j - i - 1]));
1271					ppa[j - i] = pp;
1272					if (ap->an_pvp != NULL &&
1273					    !vn_matchopval(ap->an_pvp,
1274					    VOPNAME_DISPOSE,
1275					    (fs_generic_func_p)(uintptr_t)
1276					    fs_dispose))
1277						dispose = 1;
1278				}
1279				for (j = i; j < i + curpgcnt; j++) {
1280					ap = anon_get_ptr(ahp, an_idx + j);
1281					ASSERT(ap != NULL &&
1282					    ap->an_refcnt == 1);
1283					ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1284					(void) anon_set_ptr(ahp, an_idx + j,
1285					    NULL, ANON_SLEEP);
1286					mutex_enter(ahm);
1287					ap->an_refcnt--;
1288					ASSERT(ap->an_refcnt == 0);
1289					anon_rmhash(ap);
1290					if (ap->an_pvp)
1291						swap_phys_free(ap->an_pvp,
1292						    ap->an_poff, PAGESIZE);
1293					mutex_exit(ahm);
1294					kmem_cache_free(anon_cache, ap);
1295					ANI_ADD(1);
1296				}
1297				if (!dispose) {
1298					VM_STAT_ADD(anonvmstats.decrefpages[6]);
1299					page_destroy_pages(ppa[0]);
1300				} else {
1301					VM_STAT_ADD(anonvmstats.decrefpages[7]);
1302					for (j = 0; j < curpgcnt; j++) {
1303						ASSERT(PAGE_EXCL(ppa[j]));
1304						ppa[j]->p_szc = 0;
1305					}
1306					for (j = 0; j < curpgcnt; j++) {
1307						ASSERT(!hat_page_is_mapped(
1308						    ppa[j]));
1309						/*LINTED*/
1310						VN_DISPOSE(ppa[j], B_INVAL, 0,
1311						    kcred);
1312					}
1313				}
1314				kmem_free(ppa, ppasize);
1315				i += curpgcnt;
1316			}
1317		} else {
1318			VM_STAT_ADD(anonvmstats.decrefpages[8]);
1319			(void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP);
1320			ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1321			mutex_enter(ahm);
1322			ap->an_refcnt--;
1323			mutex_exit(ahm);
1324			i++;
1325		}
1326	}
1327
1328	if (ahmpages != NULL) {
1329		mutex_exit(ahmpages);
1330	}
1331}
1332
1333/*
1334 * Duplicate references to size bytes worth of anon pages.
1335 * Used when duplicating a segment that contains private anon pages.
 * This code assumes that the procedure calling this one has already used
 * hat_chgprot() to disable write access to the range of addresses
 * that *old actually refers to.
1339 */
1340void
1341anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new,
1342    ulong_t new_idx, size_t size)
1343{
1344	spgcnt_t npages;
1345	kmutex_t *ahm;
1346	struct anon *ap;
1347	ulong_t off;
1348	ulong_t index;
1349
1350	npages = btopr(size);
1351	while (npages > 0) {
1352		index = old_idx;
1353		if ((ap = anon_get_next_ptr(old, &index)) == NULL)
1354			break;
1355
1356		ASSERT(!ANON_ISBUSY(anon_get_slot(old, index)));
1357		off = index - old_idx;
1358		npages -= off;
1359		if (npages <= 0)
1360			break;
1361
1362		(void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP);
1363		ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1364
1365		mutex_enter(ahm);
1366		ap->an_refcnt++;
1367		mutex_exit(ahm);
1368
1369		off++;
1370		new_idx += off;
1371		old_idx += off;
1372		npages--;
1373	}
1374}
1375
1376/*
1377 * Just like anon_dup but also guarantees there are no holes (unallocated anon
1378 * slots) within any large page region. That means if a large page region is
 * empty in the old array it will be skipped. If there are one or more valid
 * slots in the large page region of the old array, any unallocated ones will
 * be filled in and also copied to the new array. If noalloc is 1, a large
 * page region should either have no valid anon slots or all slots should be
 * valid.
1384 */
1385void
1386anon_dup_fill_holes(
1387	struct anon_hdr *old,
1388	ulong_t old_idx,
1389	struct anon_hdr *new,
1390	ulong_t new_idx,
1391	size_t size,
1392	uint_t szc,
1393	int noalloc)
1394{
1395	struct anon	*ap;
1396	spgcnt_t	npages;
1397	kmutex_t	*ahm, *ahmpages = NULL;
1398	pgcnt_t		pgcnt, i;
1399	ulong_t		index, off;
1400#ifdef DEBUG
1401	int		refcnt;
1402#endif
1403
1404	ASSERT(szc != 0);
1405	pgcnt = page_get_pagecnt(szc);
1406	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1407	npages = btopr(size);
1408	ASSERT(IS_P2ALIGNED(npages, pgcnt));
1409	ASSERT(IS_P2ALIGNED(old_idx, pgcnt));
1410
1411	VM_STAT_ADD(anonvmstats.dupfillholes[0]);
1412
1413	while (npages > 0) {
1414		index = old_idx;
1415
1416		/*
1417		 * Find the next valid slot.
1418		 */
1419		if (anon_get_next_ptr(old, &index) == NULL)
1420			break;
1421
1422		ASSERT(!ANON_ISBUSY(anon_get_slot(old, index)));
1423		/*
1424		 * Now backup index to the beginning of the
1425		 * current large page region of the old array.
1426		 */
1427		index = P2ALIGN(index, pgcnt);
1428		off = index - old_idx;
1429		ASSERT(IS_P2ALIGNED(off, pgcnt));
1430		npages -= off;
1431		if (npages <= 0)
1432			break;
1433
1434		/*
1435		 * Fill and copy a large page regions worth
1436		 * of anon slots.
1437		 */
1438		for (i = 0; i < pgcnt; i++) {
1439			if ((ap = anon_get_ptr(old, index + i)) == NULL) {
1440				if (noalloc) {
1441					panic("anon_dup_fill_holes: "
1442					    "empty anon slot\n");
1443				}
1444				VM_STAT_ADD(anonvmstats.dupfillholes[1]);
1445				ap = anon_alloc(NULL, 0);
1446				(void) anon_set_ptr(old, index + i, ap,
1447				    ANON_SLEEP);
1448			} else if (i == 0) {
1449				/*
1450				 * make the increment of all refcnts of all
1451				 * anon slots of a large page appear atomic by
1452				 * getting an anonpages_hash_lock for the
1453				 * first anon slot of a large page.
1454				 */
1455				VM_STAT_ADD(anonvmstats.dupfillholes[2]);
1456
1457				ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
1458				mutex_enter(ahmpages);
1459				/*LINTED*/
1460				ASSERT(refcnt = ap->an_refcnt);
1461
1462				VM_STAT_COND_ADD(ap->an_refcnt > 1,
1463				    anonvmstats.dupfillholes[3]);
1464			}
1465			(void) anon_set_ptr(new, new_idx + off + i, ap,
1466			    ANON_SLEEP);
1467			ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1468			mutex_enter(ahm);
1469			ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
1470			ASSERT(i == 0 || ahmpages == NULL ||
1471			    refcnt == ap->an_refcnt);
1472			ap->an_refcnt++;
1473			mutex_exit(ahm);
1474		}
1475		if (ahmpages != NULL) {
1476			mutex_exit(ahmpages);
1477			ahmpages = NULL;
1478		}
1479		off += pgcnt;
1480		new_idx += off;
1481		old_idx += off;
1482		npages -= pgcnt;
1483	}
1484}
1485
1486/*
 * Used when a segment with a vnode changes szc. Similarly to
 * anon_dup_fill_holes(), it makes sure each large page region either has no
 * anon slots or all of them, but new slots are created by COWing the file
 * pages. On entry, no anon slots should be shared.
1491 */
1492int
1493anon_fill_cow_holes(
1494	struct seg *seg,
1495	caddr_t addr,
1496	struct anon_hdr *ahp,
1497	ulong_t an_idx,
1498	struct vnode *vp,
1499	u_offset_t vp_off,
1500	size_t size,
1501	uint_t szc,
1502	uint_t prot,
1503	struct vpage vpage[],
1504	struct cred *cred)
1505{
1506	struct anon	*ap;
1507	spgcnt_t	npages;
1508	pgcnt_t		pgcnt, i;
1509	ulong_t		index, off;
1510	int		err = 0;
1511	int		pageflags = 0;
1512
1513	ASSERT(szc != 0);
1514	pgcnt = page_get_pagecnt(szc);
1515	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1516	npages = btopr(size);
1517	ASSERT(IS_P2ALIGNED(npages, pgcnt));
1518	ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
1519
1520	while (npages > 0) {
1521		index = an_idx;
1522
1523		/*
1524		 * Find the next valid slot.
1525		 */
1526		if (anon_get_next_ptr(ahp, &index) == NULL) {
1527			break;
1528		}
1529
1530		ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
1531		/*
1532		 * Now backup index to the beginning of the
1533		 * current large page region of the anon array.
1534		 */
1535		index = P2ALIGN(index, pgcnt);
1536		off = index - an_idx;
1537		ASSERT(IS_P2ALIGNED(off, pgcnt));
1538		npages -= off;
1539		if (npages <= 0)
1540			break;
1541		an_idx += off;
1542		vp_off += ptob(off);
1543		addr += ptob(off);
1544		if (vpage != NULL) {
1545			vpage += off;
1546		}
1547
1548		for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) {
1549			if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) {
1550				page_t *pl[1 + 1];
1551				page_t *pp;
1552
1553				err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL,
1554				    pl, PAGESIZE, seg, addr, S_READ, cred,
1555				    NULL);
1556				if (err) {
1557					break;
1558				}
1559				if (vpage != NULL) {
1560					prot = VPP_PROT(vpage);
1561					pageflags = VPP_ISPPLOCK(vpage) ?
1562					    LOCK_PAGE : 0;
1563				}
1564				pp = anon_private(&ap, seg, addr, prot, pl[0],
1565				    pageflags, cred);
1566				if (pp == NULL) {
1567					err = ENOMEM;
1568					break;
1569				}
1570				(void) anon_set_ptr(ahp, an_idx, ap,
1571				    ANON_SLEEP);
1572				page_unlock(pp);
1573			}
1574			ASSERT(ap->an_refcnt == 1);
1575			addr += PAGESIZE;
1576			if (vpage != NULL) {
1577				vpage++;
1578			}
1579		}
1580		npages -= pgcnt;
1581	}
1582
1583	return (err);
1584}
1585
1586/*
1587 * Free a group of "size" anon pages, size in bytes,
1588 * and clear out the pointers to the anon entries.
1589 */
1590void
1591anon_free(struct anon_hdr *ahp, ulong_t index, size_t size)
1592{
1593	spgcnt_t npages;
1594	struct anon *ap;
1595	ulong_t old;
1596
1597	npages = btopr(size);
1598
1599	while (npages > 0) {
1600		old = index;
1601		if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
1602			break;
1603
1604		ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
1605		npages -= index - old;
1606		if (npages <= 0)
1607			break;
1608
1609		(void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP);
1610		anon_decref(ap);
1611		/*
1612		 * Bump index and decrement page count
1613		 */
1614		index++;
1615		npages--;
1616	}
1617}
1618
1619void
1620anon_free_pages(
1621	struct anon_hdr *ahp,
1622	ulong_t an_idx,
1623	size_t size,
1624	uint_t szc)
1625{
1626	spgcnt_t	npages;
1627	pgcnt_t		pgcnt;
1628	ulong_t		index, off;
1629
1630	ASSERT(szc != 0);
1631	pgcnt = page_get_pagecnt(szc);
1632	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1633	npages = btopr(size);
1634	ASSERT(IS_P2ALIGNED(npages, pgcnt));
1635	ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
1636	ASSERT(an_idx < ahp->size);
1637
1638	VM_STAT_ADD(anonvmstats.freepages[0]);
1639
1640	while (npages > 0) {
1641		index = an_idx;
1642
1643		/*
1644		 * Find the next valid slot.
1645		 */
1646		if (anon_get_next_ptr(ahp, &index) == NULL)
1647			break;
1648
1649		ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
1650		/*
1651		 * Now backup index to the beginning of the
1652		 * current large page region of the old array.
1653		 */
1654		index = P2ALIGN(index, pgcnt);
1655		off = index - an_idx;
1656		ASSERT(IS_P2ALIGNED(off, pgcnt));
1657		npages -= off;
1658		if (npages <= 0)
1659			break;
1660
1661		anon_decref_pages(ahp, index, szc);
1662
1663		off += pgcnt;
1664		an_idx += off;
1665		npages -= pgcnt;
1666	}
1667}
1668
1669/*
1670 * Make anonymous pages discardable
1671 */
1672int
1673anon_disclaim(struct anon_map *amp, ulong_t index, size_t size,
1674    uint_t behav, pgcnt_t *purged)
1675{
1676	spgcnt_t npages = btopr(size);
1677	struct anon *ap;
1678	struct vnode *vp;
1679	anoff_t off;
1680	page_t *pp, *root_pp;
1681	kmutex_t *ahm;
1682	pgcnt_t pgcnt, npurged = 0;
1683	ulong_t old_idx, idx, i;
1684	struct anon_hdr *ahp = amp->ahp;
1685	anon_sync_obj_t cookie;
1686	int err = 0;
1687
1688	VERIFY(behav == MADV_FREE || behav == MADV_PURGE);
1689	ASSERT(RW_READ_HELD(&amp->a_rwlock));
1690	pgcnt = 1;
1691	for (; npages > 0; index = (pgcnt == 1) ? index + 1 :
1692	    P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) {
1693
1694		/*
1695		 * get anon pointer and index for the first valid entry
1696		 * in the anon list, starting from "index"
1697		 */
1698		old_idx = index;
1699		if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
1700			break;
1701
1702		/*
1703		 * decrement npages by number of NULL anon slots we skipped
1704		 */
1705		npages -= index - old_idx;
1706		if (npages <= 0)
1707			break;
1708
1709		anon_array_enter(amp, index, &cookie);
1710		ap = anon_get_ptr(ahp, index);
1711		ASSERT(ap != NULL);
1712
1713		/*
1714		 * Get anonymous page and try to lock it SE_EXCL;
1715		 * if we couldn't grab the lock we skip to next page.
1716		 */
1717		swap_xlate(ap, &vp, &off);
1718		pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL);
1719		if (pp == NULL) {
1720			segadvstat.MADV_FREE_miss.value.ul++;
1721			pgcnt = 1;
1722			anon_array_exit(&cookie);
1723			continue;
1724		}
1725		pgcnt = page_get_pagecnt(pp->p_szc);
1726
1727		/*
1728		 * we cannot free a page which is permanently locked.
1729		 * The page_struct_lock need not be acquired to examine
1730		 * these fields since the page has an "exclusive" lock.
1731		 */
1732		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1733			page_unlock(pp);
1734			segadvstat.MADV_FREE_miss.value.ul++;
1735			anon_array_exit(&cookie);
1736			err = EBUSY;
1737			continue;
1738		}
1739
1740		ahm = AH_MUTEX(vp, off);
1741		mutex_enter(ahm);
1742		ASSERT(ap->an_refcnt != 0);
1743		/*
1744		 * skip this one if copy-on-write is not yet broken.
1745		 */
1746		if (ap->an_refcnt > 1) {
1747			mutex_exit(ahm);
1748			page_unlock(pp);
1749			segadvstat.MADV_FREE_miss.value.ul++;
1750			anon_array_exit(&cookie);
1751			continue;
1752		}
1753
1754		if (behav == MADV_PURGE && pp->p_szc != 0) {
1755			/*
1756			 * If we're purging and we have a large page, simplify
1757			 * things a bit by demoting ourselves into the base
1758			 * page case.
1759			 */
1760			(void) page_try_demote_pages(pp);
1761		}
1762
1763		if (pp->p_szc == 0) {
1764			pgcnt = 1;
1765
1766			/*
1767			 * free swap slot;
1768			 */
1769			if (ap->an_pvp) {
1770				swap_phys_free(ap->an_pvp, ap->an_poff,
1771				    PAGESIZE);
1772				ap->an_pvp = NULL;
1773				ap->an_poff = 0;
1774			}
1775
1776			if (behav == MADV_PURGE) {
1777				/*
1778				 * If we're purging (instead of merely freeing),
1779				 * rip out this anon structure entirely to
1780				 * assure that any subsequent fault pulls from
1781				 * the backing vnode (if any).
1782				 */
1783				if (--ap->an_refcnt == 0)
1784					anon_rmhash(ap);
1785
1786				mutex_exit(ahm);
1787				(void) anon_set_ptr(ahp, index,
1788				    NULL, ANON_SLEEP);
1789				npurged++;
1790				ANI_ADD(1);
1791				kmem_cache_free(anon_cache, ap);
1792			} else {
1793				mutex_exit(ahm);
1794			}
1795
1796			segadvstat.MADV_FREE_hit.value.ul++;
1797
1798			/*
1799			 * while we are at it, unload all the translations
1800			 * and attempt to free the page.
1801			 */
1802			(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1803			/*LINTED: constant in conditional context */
1804			VN_DISPOSE(pp,
1805			    behav == MADV_FREE ? B_FREE : B_INVAL, 0, kcred);
1806
1807			anon_array_exit(&cookie);
1808			continue;
1809		}
1810
1811		pgcnt = page_get_pagecnt(pp->p_szc);
1812		if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) {
1813			if (!page_try_demote_pages(pp)) {
1814				mutex_exit(ahm);
1815				page_unlock(pp);
1816				segadvstat.MADV_FREE_miss.value.ul++;
1817				anon_array_exit(&cookie);
1818				err = EBUSY;
1819				continue;
1820			} else {
1821				pgcnt = 1;
1822				if (ap->an_pvp) {
1823					swap_phys_free(ap->an_pvp,
1824					    ap->an_poff, PAGESIZE);
1825					ap->an_pvp = NULL;
1826					ap->an_poff = 0;
1827				}
1828				mutex_exit(ahm);
1829				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1830				/*LINTED*/
1831				VN_DISPOSE(pp, B_FREE, 0, kcred);
1832				segadvstat.MADV_FREE_hit.value.ul++;
1833				anon_array_exit(&cookie);
1834				continue;
1835			}
1836		}
1837		mutex_exit(ahm);
1838		root_pp = pp;
1839
1840		/*
1841		 * try to lock remaining pages
1842		 */
1843		for (idx = 1; idx < pgcnt; idx++) {
1844			pp++;
1845			if (!page_trylock(pp, SE_EXCL))
1846				break;
1847			if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1848				page_unlock(pp);
1849				break;
1850			}
1851		}
1852
1853		if (idx == pgcnt) {
1854			for (i = 0; i < pgcnt; i++) {
1855				ap = anon_get_ptr(ahp, index + i);
1856				if (ap == NULL)
1857					break;
1858				swap_xlate(ap, &vp, &off);
1859				ahm = AH_MUTEX(vp, off);
1860				mutex_enter(ahm);
1861				ASSERT(ap->an_refcnt != 0);
1862
1863				/*
1864				 * skip this one if copy-on-write
1865				 * is not yet broken.
1866				 */
1867				if (ap->an_refcnt > 1) {
1868					mutex_exit(ahm);
1869					goto skiplp;
1870				}
1871				if (ap->an_pvp) {
1872					swap_phys_free(ap->an_pvp,
1873					    ap->an_poff, PAGESIZE);
1874					ap->an_pvp = NULL;
1875					ap->an_poff = 0;
1876				}
1877				mutex_exit(ahm);
1878			}
1879			page_destroy_pages(root_pp);
1880			segadvstat.MADV_FREE_hit.value.ul += pgcnt;
1881			anon_array_exit(&cookie);
1882			continue;
1883		}
1884skiplp:
1885		segadvstat.MADV_FREE_miss.value.ul += pgcnt;
1886		for (i = 0, pp = root_pp; i < idx; pp++, i++)
1887			page_unlock(pp);
1888		anon_array_exit(&cookie);
1889	}
1890
1891	if (purged != NULL)
1892		*purged = npurged;
1893
1894	return (err);
1895}
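
/*
 * Hypothetical caller sketch for anon_disclaim(): a segment driver
 * handling MADV_FREE or MADV_PURGE advice would invoke it with the
 * anon map's rwlock held as reader, e.g.
 *
 *	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
 *	err = anon_disclaim(amp, anon_index, len, behav, &purged);
 *	ANON_LOCK_EXIT(&amp->a_rwlock);
 */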
1896
1897/*
1898 * Return the kept page(s) and protections back to the segment driver.
1899 */
1900int
1901anon_getpage(
1902	struct anon **app,
1903	uint_t *protp,
1904	page_t *pl[],
1905	size_t plsz,
1906	struct seg *seg,
1907	caddr_t addr,
1908	enum seg_rw rw,
1909	struct cred *cred)
1910{
1911	page_t *pp;
1912	struct anon *ap = *app;
1913	struct vnode *vp;
1914	anoff_t off;
1915	int err;
1916	kmutex_t *ahm;
1917
1918	swap_xlate(ap, &vp, &off);
1919
1920	/*
	 * Look up the page. If the page is being paged in,
	 * wait for it to finish, since we must return a list of
	 * pages just as the VOP_GETPAGE routine does.
1925	 */
1926	if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) {
1927		ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1928		mutex_enter(ahm);
1929		if (ap->an_refcnt == 1)
1930			*protp = PROT_ALL;
1931		else
1932			*protp = PROT_ALL & ~PROT_WRITE;
1933		mutex_exit(ahm);
1934		pl[0] = pp;
1935		pl[1] = NULL;
1936		return (0);
1937	}
1938
1939	/*
1940	 * Simply treat it as a vnode fault on the anon vp.
1941	 */
1942
1943	TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE,
1944	    "anon_getpage:seg %x addr %x vp %x",
1945	    seg, addr, vp);
1946
1947	err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz,
1948	    seg, addr, rw, cred, NULL);
1949
1950	if (err == 0 && pl != NULL) {
1951		ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1952		mutex_enter(ahm);
1953		if (ap->an_refcnt != 1)
1954			*protp &= ~PROT_WRITE;	/* make read-only */
1955		mutex_exit(ahm);
1956	}
1957	return (err);
1958}
1959
1960/*
 * Creates or returns kept pages to the segment driver.  Returns -1 if a large
 * page cannot be allocated. Returns -2 if some other process has allocated a
 * larger page.
1964 *
1965 * For cowfault it will allocate any size pages to fill the requested area to
1966 * avoid partially overwriting anon slots (i.e. sharing only some of the anon
1967 * slots within a large page with other processes). This policy greatly
1968 * simplifies large page freeing (which is only freed when all anon slot
1969 * refcnts are 0).
1970 */
1971int
1972anon_map_getpages(
1973	struct anon_map *amp,
1974	ulong_t	start_idx,
1975	uint_t	szc,
1976	struct seg *seg,
1977	caddr_t	addr,
1978	uint_t prot,
1979	uint_t *protp,
1980	page_t	*ppa[],
1981	uint_t	*ppa_szc,
1982	struct vpage vpage[],
1983	enum seg_rw rw,
1984	int brkcow,
1985	int anypgsz,
1986	int pgflags,
1987	struct cred *cred)
1988{
1989	pgcnt_t		pgcnt;
1990	struct anon	*ap;
1991	struct vnode	*vp;
1992	anoff_t		off;
1993	page_t		*pp, *pl[2], *conpp = NULL;
1994	caddr_t		vaddr;
1995	ulong_t		pg_idx, an_idx, i;
1996	spgcnt_t	nreloc = 0;
1997	int		prealloc = 1;
1998	int		err, slotcreate;
1999	uint_t		vpprot;
2000	int		upsize = (szc < seg->s_szc);
2001
2002#if !defined(__i386) && !defined(__amd64)
2003	ASSERT(seg->s_szc != 0);
2004#endif
2005	ASSERT(szc <= seg->s_szc);
2006	ASSERT(ppa_szc != NULL);
2007	ASSERT(rw != S_CREATE);
2008
2009	*protp = PROT_ALL;
2010
2011	VM_STAT_ADD(anonvmstats.getpages[0]);
2012
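	/*
	 * szc == 0 is the simple PAGESIZE case: either return the existing
	 * anon page (reporting a larger underlying page size via *ppa_szc
	 * when upsizing is allowed) or allocate a new zero-filled page for
	 * an empty slot.
	 */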
2013	if (szc == 0) {
2014		VM_STAT_ADD(anonvmstats.getpages[1]);
2015		if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) {
2016			err = anon_getpage(&ap, protp, pl, PAGESIZE, seg,
2017			    addr, rw, cred);
2018			if (err)
2019				return (err);
2020			ppa[0] = pl[0];
2021			if (brkcow == 0 || (*protp & PROT_WRITE)) {
2022				VM_STAT_ADD(anonvmstats.getpages[2]);
2023				if (ppa[0]->p_szc != 0 && upsize) {
2024					VM_STAT_ADD(anonvmstats.getpages[3]);
2025					*ppa_szc = MIN(ppa[0]->p_szc,
2026					    seg->s_szc);
2027					page_unlock(ppa[0]);
2028					return (-2);
2029				}
2030				return (0);
2031			}
2032			panic("anon_map_getpages: cowfault for szc 0");
2033		} else {
2034			VM_STAT_ADD(anonvmstats.getpages[4]);
2035			ppa[0] = anon_zero(seg, addr, &ap, cred);
2036			if (ppa[0] == NULL)
2037				return (ENOMEM);
2038			(void) anon_set_ptr(amp->ahp, start_idx, ap,
2039			    ANON_SLEEP);
2040			return (0);
2041		}
2042	}
2043
2044	pgcnt = page_get_pagecnt(szc);
2045	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
2046	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
2047
	/*
	 * First we check for the case that the requested large
	 * page or a larger page already exists in the system.
	 * Actually we only check if the first constituent page
	 * exists and only preallocate if it's not found.
	 */
2054	ap = anon_get_ptr(amp->ahp, start_idx);
2055	if (ap) {
2056		uint_t pszc;
2057		swap_xlate(ap, &vp, &off);
2058		if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) {
2059			if (pszc > szc && upsize) {
2060				*ppa_szc = MIN(pszc, seg->s_szc);
2061				return (-2);
2062			}
2063			if (pszc >= szc) {
2064				prealloc = 0;
2065			}
2066		}
2067	}
2068
2069	VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]);
2070	VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]);
2071
2072top:
2073	/*
2074	 * If a smaller page or no page at all was found,
2075	 * grab a large page off the freelist.
2076	 */
2077	if (prealloc) {
2078		ASSERT(conpp == NULL);
2079		if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa,
2080		    szc, 0, pgflags) != 0) {
2081			VM_STAT_ADD(anonvmstats.getpages[7]);
2082			if (brkcow == 0 || szc < seg->s_szc ||
2083			    !anon_szcshare(amp->ahp, start_idx)) {
				/*
				 * If the refcnts of all anon slots are <= 1
				 * they can't increase since we are holding
				 * the address space's lock.  So segvn can
				 * safely decrease szc without risk of
				 * generating a cow fault for the region
				 * smaller than the segment's largest page
				 * size.
				 */
2092				VM_STAT_ADD(anonvmstats.getpages[8]);
2093				return (-1);
2094			}
2095		docow:
2096			/*
			 * This is a cow fault.  Copy away the entire large
2098			 * page region of this segment.
2099			 */
2100			if (szc != seg->s_szc)
2101				panic("anon_map_getpages: cowfault for szc %d",
2102				    szc);
2103			vaddr = addr;
2104			for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
2105			    pg_idx++, an_idx++, vaddr += PAGESIZE) {
2106				if ((ap = anon_get_ptr(amp->ahp, an_idx)) !=
2107				    NULL) {
2108					err = anon_getpage(&ap, &vpprot, pl,
2109					    PAGESIZE, seg, vaddr, rw, cred);
2110					if (err) {
2111						for (i = 0; i < pg_idx; i++) {
2112							if ((pp = ppa[i]) !=
2113							    NULL)
2114								page_unlock(pp);
2115						}
2116						return (err);
2117					}
2118					ppa[pg_idx] = pl[0];
2119				} else {
2120					/*
2121					 * Since this is a cowfault we know
2122					 * that this address space has a
					 * parent or children, which means
2124					 * anon_dup_fill_holes() has initialized
2125					 * all anon slots within a large page
2126					 * region that had at least one anon
2127					 * slot at the time of fork().
2128					 */
2129					panic("anon_map_getpages: "
2130					    "cowfault but anon slot is empty");
2131				}
2132			}
2133			VM_STAT_ADD(anonvmstats.getpages[9]);
2134			*protp = PROT_ALL;
2135			return (anon_map_privatepages(amp, start_idx, szc, seg,
2136			    addr, prot, ppa, vpage, anypgsz, pgflags, cred));
2137		}
2138	}
2139
2140	VM_STAT_ADD(anonvmstats.getpages[10]);
2141
2142	an_idx = start_idx;
2143	pg_idx = 0;
2144	vaddr = addr;
2145	while (pg_idx < pgcnt) {
2146		slotcreate = 0;
2147		if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) {
2148			VM_STAT_ADD(anonvmstats.getpages[11]);
			/*
			 * For us to have decided not to preallocate
			 * would have meant that a large page was
			 * found, which also means that all of the
			 * anon slots for that page have already been
			 * created for us.
			 */
2156			if (prealloc == 0)
2157				panic("anon_map_getpages: prealloc = 0");
2158
2159			slotcreate = 1;
2160			ap = anon_alloc(NULL, 0);
2161		}
2162		swap_xlate(ap, &vp, &off);
2163
		/*
		 * Now set up our preallocated page to pass down
		 * to swap_getconpage().
		 */
2168		if (prealloc) {
2169			ASSERT(ppa[pg_idx]->p_szc == szc);
2170			conpp = ppa[pg_idx];
2171		}
2172		ASSERT(prealloc || conpp == NULL);
2173
2174		/*
2175		 * If we just created this anon slot then call
2176		 * with S_CREATE to prevent doing IO on the page.
2177		 * Similar to the anon_zero case.
2178		 */
2179		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE,
2180		    NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr,
2181		    slotcreate == 1 ? S_CREATE : rw, cred);
2182
2183		if (err) {
2184			ASSERT(err != -2 || upsize);
2185			VM_STAT_ADD(anonvmstats.getpages[12]);
2186			ASSERT(slotcreate == 0);
2187			goto io_err;
2188		}
2189
2190		pp = pl[0];
2191
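		/*
		 * The page we found is not the size we asked for: either it
		 * is smaller than szc (retry from the top with a freshly
		 * preallocated large page) or, when upsizing is allowed,
		 * larger than szc (report its size back via *ppa_szc and
		 * return -2).
		 */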
2192		if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) {
2193			VM_STAT_ADD(anonvmstats.getpages[13]);
2194			ASSERT(slotcreate == 0);
2195			ASSERT(prealloc == 0);
2196			ASSERT(pg_idx == 0);
2197			if (pp->p_szc > szc) {
2198				ASSERT(upsize);
2199				*ppa_szc = MIN(pp->p_szc, seg->s_szc);
2200				page_unlock(pp);
2201				VM_STAT_ADD(anonvmstats.getpages[14]);
2202				return (-2);
2203			}
2204			page_unlock(pp);
2205			prealloc = 1;
2206			goto top;
2207		}
2208
		/*
		 * If we decided to preallocate but VOP_GETPAGE
		 * found a page in the system that satisfies our
		 * request then free up our preallocated large page
		 * and continue looping across the existing large
		 * page via VOP_GETPAGE.
		 */
2216		if (prealloc && pp != ppa[pg_idx]) {
2217			VM_STAT_ADD(anonvmstats.getpages[15]);
2218			ASSERT(slotcreate == 0);
2219			ASSERT(pg_idx == 0);
2220			conpp = NULL;
2221			prealloc = 0;
2222			page_free_pages(ppa[0]);
2223		}
2224
2225		if (prealloc && nreloc > 1) {
			/*
			 * We have relocated out of a smaller large page.
			 * Advance the loop indices by npgs since those
			 * constituent pages were handled by the relocation.
			 */
2231			spgcnt_t npgs = nreloc;
2232
2233			VM_STAT_ADD(anonvmstats.getpages[16]);
2234
2235			ASSERT(pp == ppa[pg_idx]);
2236			ASSERT(slotcreate == 0);
2237			ASSERT(pg_idx + npgs <= pgcnt);
2238			if ((*protp & PROT_WRITE) &&
2239			    anon_share(amp->ahp, an_idx, npgs)) {
2240				*protp &= ~PROT_WRITE;
2241			}
2242			pg_idx += npgs;
2243			an_idx += npgs;
2244			vaddr += PAGESIZE * npgs;
2245			continue;
2246		}
2247
2248		VM_STAT_ADD(anonvmstats.getpages[17]);
2249
2250		/*
2251		 * Anon_zero case.
2252		 */
2253		if (slotcreate) {
2254			ASSERT(prealloc);
2255			pagezero(pp, 0, PAGESIZE);
2256			CPU_STATS_ADD_K(vm, zfod, 1);
2257			hat_setrefmod(pp);
2258		}
2259
2260		ASSERT(prealloc == 0 || ppa[pg_idx] == pp);
2261		ASSERT(prealloc != 0 || PAGE_SHARED(pp));
2262		ASSERT(prealloc == 0 || PAGE_EXCL(pp));
2263
2264		if (pg_idx > 0 &&
2265		    ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) ||
2266		    (pp->p_szc != ppa[pg_idx - 1]->p_szc))) {
2267			panic("anon_map_getpages: unexpected page");
2268		} else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) {
2269			panic("anon_map_getpages: unaligned page");
2270		}
2271
2272		if (prealloc == 0) {
2273			ppa[pg_idx] = pp;
2274		}
2275
2276		if (ap->an_refcnt > 1) {
2277			VM_STAT_ADD(anonvmstats.getpages[18]);
2278			*protp &= ~PROT_WRITE;
2279		}
2280
2281		/*
2282		 * If this is a new anon slot then initialize
2283		 * the anon array entry.
2284		 */
2285		if (slotcreate) {
2286			(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
2287		}
2288		pg_idx++;
2289		an_idx++;
2290		vaddr += PAGESIZE;
2291	}
2292
2293	/*
2294	 * Since preallocated pages come off the freelist
2295	 * they are locked SE_EXCL. Simply downgrade and return.
2296	 */
2297	if (prealloc) {
2298		VM_STAT_ADD(anonvmstats.getpages[19]);
2299		conpp = NULL;
2300		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2301			page_downgrade(ppa[pg_idx]);
2302		}
2303	}
2304	ASSERT(conpp == NULL);
2305
2306	if (brkcow == 0 || (*protp & PROT_WRITE)) {
2307		VM_STAT_ADD(anonvmstats.getpages[20]);
2308		return (0);
2309	}
2310
2311	if (szc < seg->s_szc)
2312		panic("anon_map_getpages: cowfault for szc %d", szc);
2313
2314	VM_STAT_ADD(anonvmstats.getpages[21]);
2315
2316	*protp = PROT_ALL;
2317	return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot,
2318	    ppa, vpage, anypgsz, pgflags, cred));
2319io_err:
	/*
	 * We got an IO error somewhere in our large page.
	 * If we were using a preallocated page then just demote
	 * all the constituent pages that we've succeeded with so far
	 * to PAGESIZE pages and leave them in the system
	 * unlocked.
	 */
2327
2328	ASSERT(err != -2 || ((pg_idx == 0) && upsize));
2329
2330	VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]);
2331	VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]);
2332	VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]);
2333
2334	if (prealloc) {
2335		conpp = NULL;
2336		if (pg_idx > 0) {
2337			VM_STAT_ADD(anonvmstats.getpages[25]);
2338			for (i = 0; i < pgcnt; i++) {
2339				pp = ppa[i];
2340				ASSERT(PAGE_EXCL(pp));
2341				ASSERT(pp->p_szc == szc);
2342				pp->p_szc = 0;
2343			}
2344			for (i = 0; i < pg_idx; i++) {
2345				ASSERT(!hat_page_is_mapped(ppa[i]));
2346				page_unlock(ppa[i]);
2347			}
2348			/*
2349			 * Now free up the remaining unused constituent
2350			 * pages.
2351			 */
2352			while (pg_idx < pgcnt) {
2353				ASSERT(!hat_page_is_mapped(ppa[pg_idx]));
2354				page_free(ppa[pg_idx], 0);
2355				pg_idx++;
2356			}
2357		} else {
2358			VM_STAT_ADD(anonvmstats.getpages[26]);
2359			page_free_pages(ppa[0]);
2360		}
2361	} else {
2362		VM_STAT_ADD(anonvmstats.getpages[27]);
2363		ASSERT(err > 0);
2364		for (i = 0; i < pg_idx; i++)
2365			page_unlock(ppa[i]);
2366	}
2367	ASSERT(conpp == NULL);
2368	if (err != -1)
2369		return (err);
2370	/*
	 * We are here because we failed to relocate.
2372	 */
2373	ASSERT(prealloc);
2374	if (brkcow == 0 || szc < seg->s_szc ||
2375	    !anon_szcshare(amp->ahp, start_idx)) {
2376		VM_STAT_ADD(anonvmstats.getpages[28]);
2377		return (-1);
2378	}
2379	VM_STAT_ADD(anonvmstats.getpages[29]);
2380	goto docow;
2381}
2382
2383
2384/*
2385 * Turn a reference to an object or shared anon page
2386 * into a private page with a copy of the data from the
2387 * original page which is always locked by the caller.
2388 * This routine unloads the translation and unlocks the
2389 * original page, if it isn't being stolen, before returning
2390 * to the caller.
2391 *
 * NOTE:  The original anon slot is not freed by this routine.
2393 *	  It must be freed by the caller while holding the
2394 *	  "anon_map" lock to prevent races which can occur if
2395 *	  a process has multiple lwps in its address space.
2396 */
2397page_t *
2398anon_private(
2399	struct anon **app,
2400	struct seg *seg,
2401	caddr_t addr,
2402	uint_t	prot,
2403	page_t *opp,
2404	int oppflags,
2405	struct cred *cred)
2406{
2407	struct anon *old = *app;
2408	struct anon *new;
2409	page_t *pp = NULL;
2410	struct vnode *vp;
2411	anoff_t off;
2412	page_t *anon_pl[1 + 1];
2413	int err;
2414
2415	if (oppflags & STEAL_PAGE)
2416		ASSERT(PAGE_EXCL(opp));
2417	else
2418		ASSERT(PAGE_LOCKED(opp));
2419
2420	CPU_STATS_ADD_K(vm, cow_fault, 1);
2421
2422	/* Kernel probe */
2423	TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */,
2424		tnf_opaque,	address,	addr);
2425
2426	*app = new = anon_alloc(NULL, 0);
2427	swap_xlate(new, &vp, &off);
2428
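	/*
	 * If the caller is giving up the original page, simply rename it
	 * to the new anon slot's swap vnode/offset; no data copy is needed.
	 */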
2429	if (oppflags & STEAL_PAGE) {
2430		page_rename(opp, vp, (u_offset_t)off);
2431		pp = opp;
2432		TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE,
2433		    "anon_private:seg %p addr %x pp %p vp %p off %lx",
2434		    seg, addr, pp, vp, off);
2435		hat_setmod(pp);
2436
2437		/* bug 4026339 */
2438		page_downgrade(pp);
2439		return (pp);
2440	}
2441
2442	/*
2443	 * Call the VOP_GETPAGE routine to create the page, thereby
2444	 * enabling the vnode driver to allocate any filesystem
2445	 * space (e.g., disk block allocation for UFS).  This also
2446	 * prevents more than one page from being added to the
2447	 * vnode at the same time.
2448	 */
2449	err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL,
2450	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
2451	if (err)
2452		goto out;
2453
2454	pp = anon_pl[0];
2455
	/*
	 * If the original page was locked, we need to move the lock
	 * to the new page by transferring 'cowcnt/lckcnt' of the original
	 * page to 'cowcnt/lckcnt' of the new page.
	 *
	 * See the statement at the beginning of segvn_lockop() and
	 * comments in page_pp_useclaim() regarding the way
	 * cowcnts/lckcnts are handled.
	 *
	 * Also availrmem must be decremented up front for a read only
	 * mapping before calling page_pp_useclaim.  page_pp_useclaim will
	 * bump it back if availrmem did not need to be decremented
	 * after all.
	 */
2469	if (oppflags & LOCK_PAGE) {
2470		if ((prot & PROT_WRITE) == 0) {
2471			mutex_enter(&freemem_lock);
2472			if (availrmem > pages_pp_maximum) {
2473				availrmem--;
2474				pages_useclaim++;
2475			} else {
2476				mutex_exit(&freemem_lock);
2477				goto out;
2478			}
2479			mutex_exit(&freemem_lock);
2480		}
2481		page_pp_useclaim(opp, pp, prot & PROT_WRITE);
2482	}
2483
2484	/*
2485	 * Now copy the contents from the original page,
2486	 * which is locked and loaded in the MMU by
2487	 * the caller to prevent yet another page fault.
2488	 */
2489	/* XXX - should set mod bit in here */
2490	if (ppcopy(opp, pp) == 0) {
		/*
		 * Before ppcopy could handle UE or other faults, we
		 * would have panicked here, and still have no option
		 * but to do so now.
		 */
2496		panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p",
2497		    (void *)opp, (void *)pp);
2498	}
2499
2500	hat_setrefmod(pp);		/* mark as modified */
2501
2502	/*
2503	 * Unload the old translation.
2504	 */
2505	hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD);
2506
	/*
	 * Free the unmapped, unmodified original page,
	 * or release the lock on the original page;
	 * otherwise the process will sleep forever in
	 * anon_decref() waiting for the "exclusive" lock
	 * on the page.
	 */
2514	(void) page_release(opp, 1);
2515
	/*
	 * We are done with page creation, so downgrade the new
	 * page's selock to shared.  This helps when multiple
	 * as_fault(...SOFTLOCK...) calls are done to the same
	 * page (e.g., aio).
	 */
2522	page_downgrade(pp);
2523
2524	/*
2525	 * NOTE:  The original anon slot must be freed by the
2526	 * caller while holding the "anon_map" lock, if we
2527	 * copied away from an anonymous page.
2528	 */
2529	return (pp);
2530
2531out:
2532	*app = old;
2533	if (pp)
2534		page_unlock(pp);
2535	anon_decref(new);
2536	page_unlock(opp);
2537	return ((page_t *)NULL);
2538}
2539
2540int
2541anon_map_privatepages(
2542	struct anon_map *amp,
2543	ulong_t	start_idx,
2544	uint_t	szc,
2545	struct seg *seg,
2546	caddr_t addr,
2547	uint_t	prot,
2548	page_t	*ppa[],
2549	struct vpage vpage[],
2550	int anypgsz,
2551	int pgflags,
2552	struct cred *cred)
2553{
2554	pgcnt_t		pgcnt;
2555	struct vnode	*vp;
2556	anoff_t		off;
2557	page_t		*pl[2], *conpp = NULL;
2558	int		err;
2559	int		prealloc = 1;
2560	struct anon	*ap, *oldap;
2561	caddr_t		vaddr;
2562	page_t		*pplist, *pp;
2563	ulong_t		pg_idx, an_idx;
2564	spgcnt_t	nreloc = 0;
2565	int		pagelock = 0;
2566	kmutex_t	*ahmpages = NULL;
2567#ifdef DEBUG
2568	int		refcnt;
2569#endif
2570
2571	ASSERT(szc != 0);
2572	ASSERT(szc == seg->s_szc);
2573
2574	VM_STAT_ADD(anonvmstats.privatepages[0]);
2575
2576	pgcnt = page_get_pagecnt(szc);
2577	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
2578	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
2579
2580	ASSERT(amp != NULL);
2581	ap = anon_get_ptr(amp->ahp, start_idx);
2582	ASSERT(ap == NULL || ap->an_refcnt >= 1);
2583
2584	VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]);
2585
	/*
	 * Now try to allocate the large page.  If we fail then just
	 * let VOP_GETPAGE give us PAGESIZE pages.  Normally we let
	 * the caller make this decision but to avoid added complexity
	 * it's simpler to handle that case here.
	 */
2592	if (anypgsz == -1) {
2593		VM_STAT_ADD(anonvmstats.privatepages[2]);
2594		prealloc = 0;
2595	} else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc,
2596	    anypgsz, pgflags) != 0) {
2597		VM_STAT_ADD(anonvmstats.privatepages[3]);
2598		prealloc = 0;
2599	}
2600
2601	/*
	 * Make the decrement of all refcnts of all
2603	 * anon slots of a large page appear atomic by
2604	 * getting an anonpages_hash_lock for the
2605	 * first anon slot of a large page.
2606	 */
2607	if (ap != NULL) {
2608		ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
2609		mutex_enter(ahmpages);
2610		if (ap->an_refcnt == 1) {
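			/*
			 * The anon slots are no longer shared, so no copy is
			 * needed.  Release any preallocated replacement
			 * pages; if the existing page is already the right
			 * size we are done, otherwise unlock the constituent
			 * pages and return -1.
			 */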
2611			VM_STAT_ADD(anonvmstats.privatepages[4]);
2612			ASSERT(!anon_share(amp->ahp, start_idx, pgcnt));
2613			mutex_exit(ahmpages);
2614
2615			if (prealloc) {
2616				page_free_replacement_page(pplist);
2617				page_create_putback(pgcnt);
2618			}
2619			ASSERT(ppa[0]->p_szc <= szc);
2620			if (ppa[0]->p_szc == szc) {
2621				VM_STAT_ADD(anonvmstats.privatepages[5]);
2622				return (0);
2623			}
2624			for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2625				ASSERT(ppa[pg_idx] != NULL);
2626				page_unlock(ppa[pg_idx]);
2627			}
2628			return (-1);
2629		}
2630	}
2631
	/*
	 * If we are passed in the vpage array and this is
	 * not PROT_WRITE then we need to decrement availrmem
	 * up front before we try anything.  If we need to and
	 * can't decrement availrmem then it's better to fail now
	 * than in the middle of processing the new large page.
	 * page_pp_useclaim() on behalf of each constituent page
	 * below will adjust availrmem back for the cases not needed.
	 */
2641	if (vpage != NULL && (prot & PROT_WRITE) == 0) {
2642		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2643			if (VPP_ISPPLOCK(&vpage[pg_idx])) {
2644				pagelock = 1;
2645				break;
2646			}
2647		}
2648		if (pagelock) {
2649			VM_STAT_ADD(anonvmstats.privatepages[6]);
2650			mutex_enter(&freemem_lock);
2651			if (availrmem >= pages_pp_maximum + pgcnt) {
2652				availrmem -= pgcnt;
2653				pages_useclaim += pgcnt;
2654			} else {
2655				VM_STAT_ADD(anonvmstats.privatepages[7]);
2656				mutex_exit(&freemem_lock);
2657				if (ahmpages != NULL) {
2658					mutex_exit(ahmpages);
2659				}
2660				if (prealloc) {
2661					page_free_replacement_page(pplist);
2662					page_create_putback(pgcnt);
2663				}
2664				for (pg_idx = 0; pg_idx < pgcnt; pg_idx++)
2665					if (ppa[pg_idx] != NULL)
2666						page_unlock(ppa[pg_idx]);
2667				return (ENOMEM);
2668			}
2669			mutex_exit(&freemem_lock);
2670		}
2671	}
2672
2673	CPU_STATS_ADD_K(vm, cow_fault, pgcnt);
2674
2675	VM_STAT_ADD(anonvmstats.privatepages[8]);
2676
2677	an_idx = start_idx;
2678	pg_idx = 0;
2679	vaddr = addr;
2680	for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) {
2681		ASSERT(ppa[pg_idx] != NULL);
2682		oldap = anon_get_ptr(amp->ahp, an_idx);
2683		ASSERT(ahmpages != NULL || oldap == NULL);
2684		ASSERT(ahmpages == NULL || oldap != NULL);
2685		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
2686		ASSERT(ahmpages == NULL || pg_idx != 0 ||
2687		    (refcnt = oldap->an_refcnt));
2688		ASSERT(ahmpages == NULL || pg_idx == 0 ||
2689		    refcnt == oldap->an_refcnt);
2690
2691		ap = anon_alloc(NULL, 0);
2692
2693		swap_xlate(ap, &vp, &off);
2694
		/*
		 * Now set up our preallocated page to pass down to
		 * swap_getconpage().
		 */
2699		if (prealloc) {
2700			pp = pplist;
2701			page_sub(&pplist, pp);
2702			conpp = pp;
2703		}
2704
2705		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl,
2706		    PAGESIZE, conpp, NULL, &nreloc, seg, vaddr,
2707		    S_CREATE, cred);
2708
		/*
		 * Impossible to fail since this is S_CREATE.
		 */
2712		if (err)
2713			panic("anon_map_privatepages: VOP_GETPAGE failed");
2714
2715		ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0);
2716		ASSERT(prealloc == 0 || nreloc == 1);
2717
2718		pp = pl[0];
2719
		/*
		 * If the original page was locked, we need to move
		 * the lock to the new page by transferring
		 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt'
		 * of the new page.  pg_idx can be used to index
		 * into the vpage array since the caller will guarantee
		 * that the vpage struct passed in corresponds to addr
		 * and forward.
		 */
2729		if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) {
2730			page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE);
2731		} else if (pagelock) {
2732			mutex_enter(&freemem_lock);
2733			availrmem++;
2734			pages_useclaim--;
2735			mutex_exit(&freemem_lock);
2736		}
2737
2738		/*
2739		 * Now copy the contents from the original page.
2740		 */
2741		if (ppcopy(ppa[pg_idx], pp) == 0) {
			/*
			 * Before ppcopy could handle UE or other faults, we
			 * would have panicked here, and still have no option
			 * but to do so now.
			 */
2747			panic("anon_map_privatepages, ppcopy failed");
2748		}
2749
2750		hat_setrefmod(pp);		/* mark as modified */
2751
		/*
		 * Release the lock on the original page,
		 * decrement the old slot, and downgrade the lock
		 * on the new copy.
		 */
2757		page_unlock(ppa[pg_idx]);
2758
2759		if (!prealloc)
2760			page_downgrade(pp);
2761
2762		ppa[pg_idx] = pp;
2763
2764		/*
2765		 * Now reflect the copy in the new anon array.
2766		 */
2767		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
2768		if (oldap != NULL)
2769			anon_decref(oldap);
2770		(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
2771	}
2772
2773	/*
2774	 * Unload the old large page translation.
2775	 */
2776	hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD);
2777
2778	if (ahmpages != NULL) {
2779		mutex_exit(ahmpages);
2780	}
2781	ASSERT(prealloc == 0 || pplist == NULL);
2782	if (prealloc) {
2783		VM_STAT_ADD(anonvmstats.privatepages[9]);
2784		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2785			page_downgrade(ppa[pg_idx]);
2786		}
2787	}
2788
2789	return (0);
2790}
2791
2792/*
2793 * Allocate a private zero-filled anon page.
2794 */
2795page_t *
2796anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred)
2797{
2798	struct anon *ap;
2799	page_t *pp;
2800	struct vnode *vp;
2801	anoff_t off;
2802	page_t *anon_pl[1 + 1];
2803	int err;
2804
2805	/* Kernel probe */
2806	TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */,
2807		tnf_opaque,	address,	addr);
2808
2809	*app = ap = anon_alloc(NULL, 0);
2810	swap_xlate(ap, &vp, &off);
2811
	/*
	 * Call the VOP_GETPAGE routine to create the page, thereby
	 * enabling the vnode driver to allocate any filesystem
	 * dependent structures (e.g., disk block allocation for UFS).
	 * This also prevents more than one page from being added to
	 * the vnode at the same time since it is locked.
	 */
2819	err = VOP_GETPAGE(vp, off, PAGESIZE, NULL,
2820	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
2821	if (err) {
2822		*app = NULL;
2823		anon_decref(ap);
2824		return (NULL);
2825	}
2826	pp = anon_pl[0];
2827
2828	pagezero(pp, 0, PAGESIZE);	/* XXX - should set mod bit */
2829	page_downgrade(pp);
2830	CPU_STATS_ADD_K(vm, zfod, 1);
2831	hat_setrefmod(pp);	/* mark as modified so pageout writes back */
2832	return (pp);
2833}
2834
2835
/*
 * Allocate an array of private zero-filled anon pages for empty slots
 * and kept pages for non-empty slots within the given range.
 *
 * NOTE: This routine will try to use large pages
 *	if available and supported by the underlying platform.
 */
2843int
2844anon_map_createpages(
2845	struct anon_map *amp,
2846	ulong_t start_index,
2847	size_t len,
2848	page_t *ppa[],
2849	struct seg *seg,
2850	caddr_t addr,
2851	enum seg_rw rw,
2852	struct cred *cred)
2853{
2854
2855	struct anon	*ap;
2856	struct vnode	*ap_vp;
2857	page_t		*pp, *pplist, *anon_pl[1 + 1], *conpp = NULL;
2858	int		err = 0;
2859	ulong_t		p_index, index;
2860	pgcnt_t		npgs, pg_cnt;
2861	spgcnt_t	nreloc = 0;
2862	uint_t		l_szc, szc, prot;
2863	anoff_t		ap_off;
2864	size_t		pgsz;
2865	lgrp_t		*lgrp;
2866	kmutex_t	*ahm;
2867
2868	/*
2869	 * XXX For now only handle S_CREATE.
2870	 */
2871	ASSERT(rw == S_CREATE);
2872
2873	index	= start_index;
2874	p_index	= 0;
2875	npgs = btopr(len);
2876
	/*
	 * If this platform supports multiple page sizes
	 * then try to allocate directly from the free
	 * list for pages larger than PAGESIZE.
	 *
	 * NOTE: When we have page_create_ru we can stop
	 *	directly allocating from the freelist.
	 */
2885	l_szc  = seg->s_szc;
2886	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2887	while (npgs) {
2888
		/*
		 * If the anon slot already exists, the page has been
		 * created, so:
		 *    1) look up the page
		 *    2) if the page is still in memory, get it
		 *    3) if not, create a page and page it in from the
		 *	  physical swap device.
		 * These steps are done in anon_getpage().
		 */
2898		ap = anon_get_ptr(amp->ahp, index);
2899		if (ap) {
2900			err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE,
2901			    seg, addr, S_READ, cred);
2902			if (err) {
2903				ANON_LOCK_EXIT(&amp->a_rwlock);
2904				panic("anon_map_createpages: anon_getpage");
2905			}
2906			pp = anon_pl[0];
2907			ppa[p_index++] = pp;
2908
			/*
			 * an_pvp can become non-NULL after a SysV page was
			 * paged out before ISM was attached to this SysV
			 * shared memory segment.  So free the swap slot if
			 * needed.
			 */
2914			if (ap->an_pvp != NULL) {
2915				page_io_lock(pp);
2916				ahm = AH_MUTEX(ap->an_vp, ap->an_off);
2917				mutex_enter(ahm);
2918				if (ap->an_pvp != NULL) {
2919					swap_phys_free(ap->an_pvp,
2920					    ap->an_poff, PAGESIZE);
2921					ap->an_pvp = NULL;
2922					ap->an_poff = 0;
2923					mutex_exit(ahm);
2924					hat_setmod(pp);
2925				} else {
2926					mutex_exit(ahm);
2927				}
2928				page_io_unlock(pp);
2929			}
2930
2931			addr += PAGESIZE;
2932			index++;
2933			npgs--;
2934			continue;
2935		}
		/*
		 * Now try to allocate the largest page possible
		 * for the current address and range.
		 * Keep dropping down in page size until:
		 *
		 *	1) it is properly aligned
		 *	2) it does not overlap existing anon pages
		 *	3) it fits in the remaining range
		 *	4) we are able to allocate one.
		 *
		 * NOTE: XXX When page_create_ru is completed this code
		 *	 will change.
		 */
2949		szc    = l_szc;
2950		pplist = NULL;
2951		pg_cnt = 0;
2952		while (szc) {
2953			pgsz	= page_get_pagesize(szc);
2954			pg_cnt	= pgsz >> PAGESHIFT;
2955			if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs &&
2956			    anon_pages(amp->ahp, index, pg_cnt) == 0) {
2957				/*
2958				 * XXX
2959				 * Since we are faking page_create()
2960				 * we also need to do the freemem and
2961				 * pcf accounting.
2962				 */
2963				(void) page_create_wait(pg_cnt, PG_WAIT);
2964
				/*
				 * Get the lgroup to allocate the next page of
				 * shared memory from, and use it to specify
				 * where to allocate the physical memory.
				 */
2970				lgrp = lgrp_mem_choose(seg, addr, pgsz);
2971
2972				pplist = page_get_freelist(
2973				    anon_vp, (u_offset_t)0, seg,
2974				    addr, pgsz, 0, lgrp);
2975
2976				if (pplist == NULL) {
2977					page_create_putback(pg_cnt);
2978				}
2979
2980				/*
2981				 * If a request for a page of size
2982				 * larger than PAGESIZE failed
2983				 * then don't try that size anymore.
2984				 */
2985				if (pplist == NULL) {
2986					l_szc = szc - 1;
2987				} else {
2988					break;
2989				}
2990			}
2991			szc--;
2992		}
2993
2994		/*
2995		 * If just using PAGESIZE pages then don't
2996		 * directly allocate from the free list.
2997		 */
2998		if (pplist == NULL) {
2999			ASSERT(szc == 0);
3000			pp = anon_zero(seg, addr, &ap, cred);
3001			if (pp == NULL) {
3002				ANON_LOCK_EXIT(&amp->a_rwlock);
3003				panic("anon_map_createpages: anon_zero");
3004			}
3005			ppa[p_index++] = pp;
3006
3007			ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
3008			(void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);
3009
3010			addr += PAGESIZE;
3011			index++;
3012			npgs--;
3013			continue;
3014		}
3015
3016		/*
3017		 * pplist is a list of pg_cnt PAGESIZE pages.
3018		 * These pages are locked SE_EXCL since they
3019		 * came directly off the free list.
3020		 */
3021		ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt));
3022		ASSERT(IS_P2ALIGNED(index, pg_cnt));
3023		ASSERT(conpp == NULL);
3024		while (pg_cnt--) {
3025
3026			ap = anon_alloc(NULL, 0);
3027			swap_xlate(ap, &ap_vp, &ap_off);
3028
3029			ASSERT(pplist != NULL);
3030			pp = pplist;
3031			page_sub(&pplist, pp);
3032			PP_CLRFREE(pp);
3033			PP_CLRAGED(pp);
3034			conpp = pp;
3035
3036			err = swap_getconpage(ap_vp, ap_off, PAGESIZE,
3037			    (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL,
3038			    &nreloc, seg, addr, S_CREATE, cred);
3039
3040			if (err) {
3041				ANON_LOCK_EXIT(&amp->a_rwlock);
3042				panic("anon_map_createpages: S_CREATE");
3043			}
3044
3045			ASSERT(anon_pl[0] == pp);
3046			ASSERT(nreloc == 1);
3047			pagezero(pp, 0, PAGESIZE);
3048			CPU_STATS_ADD_K(vm, zfod, 1);
3049			hat_setrefmod(pp);
3050
3051			ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
3052			(void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);
3053
3054			ppa[p_index++] = pp;
3055
3056			addr += PAGESIZE;
3057			index++;
3058			npgs--;
3059		}
3060		conpp = NULL;
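		/*
		 * The constituent pages were taken off the freelist locked
		 * SE_EXCL; rewind p_index and downgrade them to shared
		 * before handing them back in ppa[].
		 */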
3061		pg_cnt	= pgsz >> PAGESHIFT;
3062		p_index = p_index - pg_cnt;
3063		while (pg_cnt--) {
3064			page_downgrade(ppa[p_index++]);
3065		}
3066	}
3067	ANON_LOCK_EXIT(&amp->a_rwlock);
3068	return (0);
3069}
3070
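/*
 * Try to demote the large page(s) covering the pgcnt anon slots starting
 * at sidx to PAGESIZE pages.  Returns 0, without changing anything, if the
 * first anon slot is still shared (refcnt > 1); returns 1 once the pages
 * have been demoted and unlocked.  If ppa is NULL a temporary page array
 * is allocated and freed here.
 */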
3071static int
3072anon_try_demote_pages(
3073	struct anon_hdr *ahp,
3074	ulong_t sidx,
3075	uint_t szc,
3076	page_t **ppa,
3077	int private)
3078{
3079	struct anon	*ap;
3080	pgcnt_t		pgcnt = page_get_pagecnt(szc);
3081	page_t		*pp;
3082	pgcnt_t		i;
3083	kmutex_t	*ahmpages = NULL;
3084	int		root = 0;
3085	pgcnt_t		npgs;
3086	pgcnt_t		curnpgs = 0;
3087	size_t		ppasize = 0;
3088
3089	ASSERT(szc != 0);
3090	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
3091	ASSERT(IS_P2ALIGNED(sidx, pgcnt));
3092	ASSERT(sidx < ahp->size);
3093
3094	if (ppa == NULL) {
3095		ppasize = pgcnt * sizeof (page_t *);
3096		ppa = kmem_alloc(ppasize, KM_SLEEP);
3097	}
3098
3099	ap = anon_get_ptr(ahp, sidx);
3100	if (ap != NULL && private) {
3101		VM_STAT_ADD(anonvmstats.demotepages[1]);
3102		ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
3103		mutex_enter(ahmpages);
3104	}
3105
3106	if (ap != NULL && ap->an_refcnt > 1) {
3107		if (ahmpages != NULL) {
3108			VM_STAT_ADD(anonvmstats.demotepages[2]);
3109			mutex_exit(ahmpages);
3110		}
3111		if (ppasize != 0) {
3112			kmem_free(ppa, ppasize);
3113		}
3114		return (0);
3115	}
3116	if (ahmpages != NULL) {
3117		mutex_exit(ahmpages);
3118	}
3119	if (ahp->size - sidx < pgcnt) {
3120		ASSERT(private == 0);
3121		pgcnt = ahp->size - sidx;
3122	}
3123	for (i = 0; i < pgcnt; i++, sidx++) {
3124		ap = anon_get_ptr(ahp, sidx);
3125		if (ap != NULL) {
3126			if (ap->an_refcnt != 1) {
3127				panic("anon_try_demote_pages: an_refcnt != 1");
3128			}
3129			pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off,
3130			    SE_EXCL);
3131			if (pp != NULL) {
3132				(void) hat_pageunload(pp,
3133				    HAT_FORCE_PGUNLOAD);
3134			}
3135		} else {
3136			ppa[i] = NULL;
3137		}
3138	}
3139	for (i = 0; i < pgcnt; i++) {
3140		if ((pp = ppa[i]) != NULL && pp->p_szc != 0) {
3141			ASSERT(pp->p_szc <= szc);
3142			if (!root) {
3143				VM_STAT_ADD(anonvmstats.demotepages[3]);
3144				if (curnpgs != 0)
3145					panic("anon_try_demote_pages: "
3146					    "bad large page");
3147
3148				root = 1;
3149				curnpgs = npgs =
3150				    page_get_pagecnt(pp->p_szc);
3151
3152				ASSERT(npgs <= pgcnt);
3153				ASSERT(IS_P2ALIGNED(npgs, npgs));
3154				ASSERT(!(page_pptonum(pp) & (npgs - 1)));
3155			} else {
3156				ASSERT(i > 0);
3157				ASSERT(page_pptonum(pp) - 1 ==
3158				    page_pptonum(ppa[i - 1]));
3159				if ((page_pptonum(pp) & (npgs - 1)) ==
3160				    npgs - 1)
3161					root = 0;
3162			}
3163			ASSERT(PAGE_EXCL(pp));
3164			pp->p_szc = 0;
3165			ASSERT(curnpgs > 0);
3166			curnpgs--;
3167		}
3168	}
3169	if (root != 0 || curnpgs != 0)
3170		panic("anon_try_demote_pages: bad large page");
3171
3172	for (i = 0; i < pgcnt; i++) {
3173		if ((pp = ppa[i]) != NULL) {
3174			ASSERT(!hat_page_is_mapped(pp));
3175			ASSERT(pp->p_szc == 0);
3176			page_unlock(pp);
3177		}
3178	}
3179	if (ppasize != 0) {
3180		kmem_free(ppa, ppasize);
3181	}
3182	return (1);
3183}
3184
3185/*
 * anon_map_demotepages() can only be called for MAP_PRIVATE segments.
3187 */
3188int
3189anon_map_demotepages(
3190	struct anon_map *amp,
3191	ulong_t	start_idx,
3192	struct seg *seg,
3193	caddr_t addr,
3194	uint_t prot,
3195	struct vpage vpage[],
3196	struct cred *cred)
3197{
3198	struct anon	*ap;
3199	uint_t		szc = seg->s_szc;
3200	pgcnt_t		pgcnt = page_get_pagecnt(szc);
3201	size_t		ppasize = pgcnt * sizeof (page_t *);
3202	page_t		**ppa = kmem_alloc(ppasize, KM_SLEEP);
3203	page_t		*pp;
3204	page_t		*pl[2];
3205	pgcnt_t		i, pg_idx;
3206	ulong_t		an_idx;
3207	caddr_t		vaddr;
3208	int		err;
3209	int		retry = 0;
3210	uint_t		vpprot;
3211
3212	ASSERT(RW_WRITE_HELD(&amp->a_rwlock));
3213	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
3214	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
3215	ASSERT(ppa != NULL);
3216	ASSERT(szc != 0);
3217	ASSERT(szc == amp->a_szc);
3218
3219	VM_STAT_ADD(anonvmstats.demotepages[0]);
3220
3221top:
3222	if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) {
3223		kmem_free(ppa, ppasize);
3224		return (0);
3225	}
3226
3227	VM_STAT_ADD(anonvmstats.demotepages[4]);
3228
3229	ASSERT(retry == 0); /* we can be here only once */
3230
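	/*
	 * The demotion failed because some anon slots are still shared.
	 * Bring in every constituent page, privatize the whole large page
	 * region, and retry the demotion if the new pages are still part
	 * of a large page.
	 */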
3231	vaddr = addr;
3232	for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
3233	    pg_idx++, an_idx++, vaddr += PAGESIZE) {
3234		ap = anon_get_ptr(amp->ahp, an_idx);
3235		if (ap == NULL)
3236			panic("anon_map_demotepages: no anon slot");
3237		err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr,
3238		    S_READ, cred);
3239		if (err) {
3240			for (i = 0; i < pg_idx; i++) {
3241				if ((pp = ppa[i]) != NULL)
3242					page_unlock(pp);
3243			}
3244			kmem_free(ppa, ppasize);
3245			return (err);
3246		}
3247		ppa[pg_idx] = pl[0];
3248	}
3249
3250	err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa,
3251	    vpage, -1, 0, cred);
3252	if (err > 0) {
3253		VM_STAT_ADD(anonvmstats.demotepages[5]);
3254		kmem_free(ppa, ppasize);
3255		return (err);
3256	}
3257	ASSERT(err == 0 || err == -1);
3258	if (err == -1) {
3259		VM_STAT_ADD(anonvmstats.demotepages[6]);
3260		retry = 1;
3261		goto top;
3262	}
3263	for (i = 0; i < pgcnt; i++) {
3264		ASSERT(ppa[i] != NULL);
3265		if (ppa[i]->p_szc != 0)
3266			retry = 1;
3267		page_unlock(ppa[i]);
3268	}
3269	if (retry) {
3270		VM_STAT_ADD(anonvmstats.demotepages[7]);
3271		goto top;
3272	}
3273
3274	VM_STAT_ADD(anonvmstats.demotepages[8]);
3275
3276	kmem_free(ppa, ppasize);
3277
3278	return (0);
3279}
3280
/*
 * Free pages of a shared anon map.  It's assumed that anon maps don't share
 * anon structures with private anon maps.  Therefore all anon structures
 * should have at most one reference at this point.  This means underlying
 * pages can be exclusively locked and demoted or freed.  If we are not
 * freeing entire large pages, demote the ends of the region we free so that
 * their subpages can be freed.  Page roots correspond to aligned index
 * positions in the anon map.
 */
3289void
3290anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len)
3291{
3292	ulong_t eidx = sidx + btopr(len);
3293	pgcnt_t pages = page_get_pagecnt(amp->a_szc);
3294	struct anon_hdr *ahp = amp->ahp;
3295	ulong_t tidx;
3296	size_t size;
3297	ulong_t sidx_aligned;
3298	ulong_t eidx_aligned;
3299
3300	ASSERT(ANON_WRITE_HELD(&amp->a_rwlock));
3301	ASSERT(amp->refcnt <= 1);
3302	ASSERT(amp->a_szc > 0);
3303	ASSERT(eidx <= ahp->size);
3304	ASSERT(!anon_share(ahp, sidx, btopr(len)));
3305
3306	if (len == 0) {	/* XXX */
3307		return;
3308	}
3309
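	/*
	 * If the range does not start on a large page boundary, or ends
	 * inside the first large page, demote that large page first so
	 * only the slots within the range are freed as PAGESIZE pages.
	 */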
3310	sidx_aligned = P2ALIGN(sidx, pages);
3311	if (sidx_aligned != sidx ||
3312	    (eidx < sidx_aligned + pages && eidx < ahp->size)) {
3313		if (!anon_try_demote_pages(ahp, sidx_aligned,
3314		    amp->a_szc, NULL, 0)) {
3315			panic("anon_shmap_free_pages: demote failed");
3316		}
3317		size = (eidx <= sidx_aligned + pages) ? (eidx - sidx) :
3318		    P2NPHASE(sidx, pages);
3319		size <<= PAGESHIFT;
3320		anon_free(ahp, sidx, size);
3321		sidx = sidx_aligned + pages;
3322		if (eidx <= sidx) {
3323			return;
3324		}
3325	}
3326	eidx_aligned = P2ALIGN(eidx, pages);
3327	if (sidx < eidx_aligned) {
3328		anon_free_pages(ahp, sidx,
3329		    (eidx_aligned - sidx) << PAGESHIFT,
3330		    amp->a_szc);
3331		sidx = eidx_aligned;
3332	}
3333	ASSERT(sidx == eidx_aligned);
3334	if (eidx == eidx_aligned) {
3335		return;
3336	}
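	/*
	 * Handle the trailing partial large page: if anon slots beyond
	 * eidx within this large page are still in use, demote it and
	 * free only our portion; otherwise free the whole large page.
	 */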
3337	tidx = eidx;
3338	if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL &&
3339	    tidx - sidx < pages) {
3340		if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) {
3341			panic("anon_shmap_free_pages: demote failed");
3342		}
3343		size = (eidx - sidx) << PAGESHIFT;
3344		anon_free(ahp, sidx, size);
3345	} else {
3346		anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc);
3347	}
3348}
3349
/*
 * This routine should be called with amp's writer lock held when there are
 * no other users of amp.  All pcache entries of this amp must have already
 * been inactivated.  We must not drop a_rwlock here, to prevent new users
 * from attaching to this amp.
 */
3356void
3357anonmap_purge(struct anon_map *amp)
3358{
3359	ASSERT(ANON_WRITE_HELD(&amp->a_rwlock));
3360	ASSERT(amp->refcnt <= 1);
3361
3362	if (amp->a_softlockcnt != 0) {
3363		seg_ppurge(NULL, amp, 0);
3364	}
3365
	/*
	 * Since all pcache entries were already inactive before this routine
	 * was called, seg_ppurge() couldn't return while there are still
	 * entries that can be found via the list anchored at a_phead.  So we
	 * can assert this list is empty now.  a_softlockcnt may still be
	 * non-zero if the asynchronous thread that manages pcache has already
	 * removed pcache entries but hasn't unlocked the pages yet.  If
	 * a_softlockcnt is non-zero we just wait on a_purgecv for
	 * shamp_reclaim() to finish.  Even if a_softlockcnt is 0 we grab
	 * a_purgemtx to avoid freeing the anon map before shamp_reclaim() is
	 * done with it.  a_purgemtx, which is also taken by shamp_reclaim()
	 * while a_softlockcnt is still non-zero, acts as a barrier that
	 * prevents anonmap_purge() from completing while shamp_reclaim() may
	 * still be referencing this amp.
	 */
3380	ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3381	ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3382
3383	mutex_enter(&amp->a_purgemtx);
3384	while (amp->a_softlockcnt != 0) {
3385		ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3386		ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3387		amp->a_purgewait = 1;
3388		cv_wait(&amp->a_purgecv, &amp->a_purgemtx);
3389	}
3390	mutex_exit(&amp->a_purgemtx);
3391
3392	ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3393	ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3394	ASSERT(amp->a_softlockcnt == 0);
3395}
3396
3397/*
 * Allocate and initialize an anon_map structure for seg,
3399 * associating the given swap reservation with the new anon_map.
3400 */
3401struct anon_map *
3402anonmap_alloc(size_t size, size_t swresv, int flags)
3403{
3404	struct anon_map *amp;
3405	int kmflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
3406
3407	amp = kmem_cache_alloc(anonmap_cache, kmflags);
3408	if (amp == NULL) {
3409		ASSERT(kmflags == KM_NOSLEEP);
3410		return (NULL);
3411	}
3412
3413	amp->ahp = anon_create(btopr(size), flags);
3414	if (amp->ahp == NULL) {
3415		ASSERT(flags == ANON_NOSLEEP);
3416		kmem_cache_free(anonmap_cache, amp);
3417		return (NULL);
3418	}
3419	amp->refcnt = 1;
3420	amp->size = size;
3421	amp->swresv = swresv;
3422	amp->locality = 0;
3423	amp->a_szc = 0;
3424	amp->a_sp = NULL;
3425	amp->a_softlockcnt = 0;
3426	amp->a_purgewait = 0;
3427	amp->a_phead.p_lnext = &amp->a_phead;
3428	amp->a_phead.p_lprev = &amp->a_phead;
3429
3430	return (amp);
3431}
3432
3433void
3434anonmap_free(struct anon_map *amp)
3435{
3436	ASSERT(amp->ahp != NULL);
3437	ASSERT(amp->refcnt == 0);
3438	ASSERT(amp->a_softlockcnt == 0);
3439	ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3440	ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3441
3442	lgrp_shm_policy_fini(amp, NULL);
3443	anon_release(amp->ahp, btopr(amp->size));
3444	kmem_cache_free(anonmap_cache, amp);
3445}
3446
/*
 * Returns true if the anon array has some empty slots.
 * The offp and lenp parameters are in/out parameters.  On entry
 * these values represent the starting offset and length of the
 * mapping.  When true is returned, these values may be modified
 * to be the largest range which includes empty slots.
 */
3454int
3455non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp,
3456    size_t *lenp)
3457{
3458	ulong_t i, el;
3459	ssize_t low, high;
3460	struct anon *ap;
3461
3462	low = -1;
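	/*
	 * Scan the slots covering the mapping, recording the first (low)
	 * and last (high) byte offsets whose anon slots are empty.
	 */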
3463	for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) {
3464		ap = anon_get_ptr(ahp, anon_idx);
3465		if (ap == NULL) {
3466			if (low == -1)
3467				low = i;
3468			high = i;
3469		}
3470	}
3471	if (low != -1) {
3472		/*
3473		 * Found at least one non-anon page.
3474		 * Set up the off and len return values.
3475		 */
3476		if (low != 0)
3477			*offp += low;
3478		*lenp = high - low + PAGESIZE;
3479		return (1);
3480	}
3481	return (0);
3482}
3483
/*
 * Return a count of the number of existing anon pages in the anon array
 * ahp in the range (anon_index, anon_index + nslots).  The array and
 * slots must be guaranteed stable by the caller.
 */
3489pgcnt_t
3490anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
3491{
3492	pgcnt_t cnt = 0;
3493
3494	while (nslots-- > 0) {
3495		if ((anon_get_ptr(ahp, anon_index)) != NULL)
3496			cnt++;
3497		anon_index++;
3498	}
3499	return (cnt);
3500}
3501
3502/*
3503 * Move reserved phys swap into memory swap (unreserve phys swap
3504 * and reserve mem swap by the same amount).
 * Used by segspt when it needs to lock reserved swap npages in memory.
3506 */
3507int
3508anon_swap_adjust(pgcnt_t npages)
3509{
3510	pgcnt_t unlocked_mem_swap;
3511
3512	mutex_enter(&anoninfo_lock);
3513
3514	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
3515	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
3516
3517	unlocked_mem_swap = k_anoninfo.ani_mem_resv
3518	    - k_anoninfo.ani_locked_swap;
3519	if (npages > unlocked_mem_swap) {
3520		spgcnt_t adjusted_swap = npages - unlocked_mem_swap;
3521
		/*
		 * If there is not enough unlocked mem swap we take the
		 * missing amount from phys swap and give it to mem swap.
		 */
3526		if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) {
3527			mutex_exit(&anoninfo_lock);
3528			return (ENOMEM);
3529		}
3530
3531		k_anoninfo.ani_mem_resv += adjusted_swap;
3532		ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap);
3533		k_anoninfo.ani_phys_resv -= adjusted_swap;
3534
3535		ANI_ADD(adjusted_swap);
3536	}
3537	k_anoninfo.ani_locked_swap += npages;
3538
3539	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
3540	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
3541
3542	mutex_exit(&anoninfo_lock);
3543
3544	return (0);
3545}
3546
3547/*
 * 'Unlock' reserved mem swap so that when it is unreserved it
 * can be moved back to phys (disk) swap.
3550 */
3551void
3552anon_swap_restore(pgcnt_t npages)
3553{
3554	mutex_enter(&anoninfo_lock);
3555
3556	ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);
3557
3558	ASSERT(k_anoninfo.ani_locked_swap >= npages);
3559	k_anoninfo.ani_locked_swap -= npages;
3560
3561	ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);
3562
3563	mutex_exit(&anoninfo_lock);
3564}
3565
3566/*
3567 * Return the pointer from the list for a
3568 * specified anon index.
3569 */
3570ulong_t *
3571anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx)
3572{
3573	struct anon	**app;
3574	void		**ppp;
3575
3576	ASSERT(an_idx < ahp->size);
3577
3578	/*
3579	 * Single level case.
3580	 */
3581	if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
3582		return ((ulong_t *)&ahp->array_chunk[an_idx]);
3583	} else {
3584
3585		/*
3586		 * 2 level case.
3587		 */
3588		ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
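		/*
		 * Allocate the second-level chunk lazily; recheck under
		 * serial_lock in case another thread allocated it first.
		 */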
3589		if (*ppp == NULL) {
3590			mutex_enter(&ahp->serial_lock);
3591			ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
3592			if (*ppp == NULL)
3593				*ppp = kmem_zalloc(PAGESIZE, KM_SLEEP);
3594			mutex_exit(&ahp->serial_lock);
3595		}
3596		app = *ppp;
3597		return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]);
3598	}
3599}
3600
3601void
3602anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj)
3603{
3604	ulong_t		*ap_slot;
3605	kmutex_t	*mtx;
3606	kcondvar_t	*cv;
3607	int		hash;
3608
3609	/*
3610	 * Use szc to determine anon slot(s) to appear atomic.
3611	 * If szc = 0, then lock the anon slot and mark it busy.
3612	 * If szc > 0, then lock the range of slots by getting the
3613	 * anon_array_lock for the first anon slot, and mark only the
3614	 * first anon slot busy to represent whole range being busy.
3615	 */
3616
3617	ASSERT(RW_READ_HELD(&amp->a_rwlock));
3618	an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
3619	hash = ANON_ARRAY_HASH(amp, an_idx);
3620	sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
3621	sobj->sync_cv = cv = &anon_array_cv[hash];
3622	mutex_enter(mtx);
3623	ap_slot = anon_get_slot(amp->ahp, an_idx);
3624	while (ANON_ISBUSY(ap_slot))
3625		cv_wait(cv, mtx);
3626	ANON_SETBUSY(ap_slot);
3627	sobj->sync_data = ap_slot;
3628	mutex_exit(mtx);
3629}
3630
3631int
3632anon_array_try_enter(struct anon_map *amp, ulong_t an_idx,
3633    anon_sync_obj_t *sobj)
3634{
3635	ulong_t		*ap_slot;
3636	kmutex_t	*mtx;
3637	int		hash;
3638
3639	/*
3640	 * Try to lock a range of anon slots.
3641	 * Use szc to determine anon slot(s) to appear atomic.
3642	 * If szc = 0, then lock the anon slot and mark it busy.
3643	 * If szc > 0, then lock the range of slots by getting the
3644	 * anon_array_lock for the first anon slot, and mark only the
3645	 * first anon slot busy to represent whole range being busy.
3646	 * Fail if the mutex or the anon_array are busy.
3647	 */
3648
3649	ASSERT(RW_READ_HELD(&amp->a_rwlock));
3650	an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
3651	hash = ANON_ARRAY_HASH(amp, an_idx);
3652	sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
3653	sobj->sync_cv = &anon_array_cv[hash];
3654	if (!mutex_tryenter(mtx)) {
3655		return (EWOULDBLOCK);
3656	}
3657	ap_slot = anon_get_slot(amp->ahp, an_idx);
3658	if (ANON_ISBUSY(ap_slot)) {
3659		mutex_exit(mtx);
3660		return (EWOULDBLOCK);
3661	}
3662	ANON_SETBUSY(ap_slot);
3663	sobj->sync_data = ap_slot;
3664	mutex_exit(mtx);
3665	return (0);
3666}
3667
3668void
3669anon_array_exit(anon_sync_obj_t *sobj)
3670{
3671	mutex_enter(sobj->sync_mutex);
3672	ASSERT(ANON_ISBUSY(sobj->sync_data));
3673	ANON_CLRBUSY(sobj->sync_data);
3674	if (CV_HAS_WAITERS(sobj->sync_cv))
3675		cv_broadcast(sobj->sync_cv);
3676	mutex_exit(sobj->sync_mutex);
3677}
3678