xref: /illumos-gate/usr/src/uts/common/vm/seg_vn.c (revision c6f039c7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2018 Joyent, Inc.
24  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 /*
41  * VM - shared or copy-on-write from a vnode/anonymous memory.
42  */
43 
44 #include <sys/types.h>
45 #include <sys/param.h>
46 #include <sys/t_lock.h>
47 #include <sys/errno.h>
48 #include <sys/systm.h>
49 #include <sys/mman.h>
50 #include <sys/debug.h>
51 #include <sys/cred.h>
52 #include <sys/vmsystm.h>
53 #include <sys/tuneable.h>
54 #include <sys/bitmap.h>
55 #include <sys/swap.h>
56 #include <sys/kmem.h>
57 #include <sys/sysmacros.h>
58 #include <sys/vtrace.h>
59 #include <sys/cmn_err.h>
60 #include <sys/callb.h>
61 #include <sys/vm.h>
62 #include <sys/dumphdr.h>
63 #include <sys/lgrp.h>
64 
65 #include <vm/hat.h>
66 #include <vm/as.h>
67 #include <vm/seg.h>
68 #include <vm/seg_vn.h>
69 #include <vm/pvn.h>
70 #include <vm/anon.h>
71 #include <vm/page.h>
72 #include <vm/vpage.h>
73 #include <sys/proc.h>
74 #include <sys/task.h>
75 #include <sys/project.h>
76 #include <sys/zone.h>
77 #include <sys/shm_impl.h>
78 
79 /*
80  * segvn_fault needs a temporary page list array.  To avoid calling kmem all
81  * the time, it creates a small (PVN_GETPAGE_NUM entry) array and uses it if
82  * it can.  In the rare case when this page list is not large enough, it
83  * goes and gets a large enough array from kmem.
84  *
85  * This small page list array covers either 8 pages or 64kB worth of pages -
86  * whichever is smaller.
87  */
88 #define	PVN_MAX_GETPAGE_SZ	0x10000
89 #define	PVN_MAX_GETPAGE_NUM	0x8
90 
91 #if PVN_MAX_GETPAGE_SZ > PVN_MAX_GETPAGE_NUM * PAGESIZE
92 #define	PVN_GETPAGE_SZ	ptob(PVN_MAX_GETPAGE_NUM)
93 #define	PVN_GETPAGE_NUM	PVN_MAX_GETPAGE_NUM
94 #else
95 #define	PVN_GETPAGE_SZ	PVN_MAX_GETPAGE_SZ
96 #define	PVN_GETPAGE_NUM	btop(PVN_MAX_GETPAGE_SZ)
97 #endif
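
/*
 * A worked illustration of the two macros above: they resolve to whichever
 * is smaller, 8 pages or 64kB worth of pages, given the platform PAGESIZE.
 *
 *	PAGESIZE = 4kB:	8 * 4kB = 32kB < 64kB,
 *			so PVN_GETPAGE_NUM = 8 and PVN_GETPAGE_SZ = 32kB.
 *	PAGESIZE = 8kB:	8 * 8kB = 64kB, so the #if above is false and
 *			PVN_GETPAGE_SZ = 64kB, PVN_GETPAGE_NUM = 8.
 */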
98 
99 /*
100  * Private seg op routines.
101  */
102 static int	segvn_dup(struct seg *seg, struct seg *newseg);
103 static int	segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
104 static void	segvn_free(struct seg *seg);
105 static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
106 		    caddr_t addr, size_t len, enum fault_type type,
107 		    enum seg_rw rw);
108 static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
109 static int	segvn_setprot(struct seg *seg, caddr_t addr,
110 		    size_t len, uint_t prot);
111 static int	segvn_checkprot(struct seg *seg, caddr_t addr,
112 		    size_t len, uint_t prot);
113 static int	segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
114 static size_t	segvn_swapout(struct seg *seg);
115 static int	segvn_sync(struct seg *seg, caddr_t addr, size_t len,
116 		    int attr, uint_t flags);
117 static size_t	segvn_incore(struct seg *seg, caddr_t addr, size_t len,
118 		    char *vec);
119 static int	segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
120 		    int attr, int op, ulong_t *lockmap, size_t pos);
121 static int	segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
122 		    uint_t *protv);
123 static u_offset_t	segvn_getoffset(struct seg *seg, caddr_t addr);
124 static int	segvn_gettype(struct seg *seg, caddr_t addr);
125 static int	segvn_getvp(struct seg *seg, caddr_t addr,
126 		    struct vnode **vpp);
127 static int	segvn_advise(struct seg *seg, caddr_t addr, size_t len,
128 		    uint_t behav);
129 static void	segvn_dump(struct seg *seg);
130 static int	segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
131 		    struct page ***ppp, enum lock_type type, enum seg_rw rw);
132 static int	segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
133 		    uint_t szc);
134 static int	segvn_getmemid(struct seg *seg, caddr_t addr,
135 		    memid_t *memidp);
136 static lgrp_mem_policy_info_t	*segvn_getpolicy(struct seg *, caddr_t);
137 static int	segvn_capable(struct seg *seg, segcapability_t capable);
138 static int	segvn_inherit(struct seg *, caddr_t, size_t, uint_t);
139 
140 struct	seg_ops segvn_ops = {
141 	segvn_dup,
142 	segvn_unmap,
143 	segvn_free,
144 	segvn_fault,
145 	segvn_faulta,
146 	segvn_setprot,
147 	segvn_checkprot,
148 	segvn_kluster,
149 	segvn_swapout,
150 	segvn_sync,
151 	segvn_incore,
152 	segvn_lockop,
153 	segvn_getprot,
154 	segvn_getoffset,
155 	segvn_gettype,
156 	segvn_getvp,
157 	segvn_advise,
158 	segvn_dump,
159 	segvn_pagelock,
160 	segvn_setpagesize,
161 	segvn_getmemid,
162 	segvn_getpolicy,
163 	segvn_capable,
164 	segvn_inherit
165 };
166 
167 /*
168  * Common zfod structures, provided as a shorthand for others to use.
169  */
170 static segvn_crargs_t zfod_segvn_crargs =
171 	SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
172 static segvn_crargs_t kzfod_segvn_crargs =
173 	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
174 	PROT_ALL & ~PROT_USER);
175 static segvn_crargs_t stack_noexec_crargs =
176 	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);
177 
178 caddr_t	zfod_argsp = (caddr_t)&zfod_segvn_crargs;	/* user zfod argsp */
179 caddr_t	kzfod_argsp = (caddr_t)&kzfod_segvn_crargs;	/* kernel zfod argsp */
180 caddr_t	stack_exec_argsp = (caddr_t)&zfod_segvn_crargs;	/* executable stack */
181 caddr_t	stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */
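
/*
 * Usage sketch: a zero-fill-on-demand segment is created by handing one of
 * the argsp pointers above, together with segvn_create, to as_map(). A
 * minimal sketch, assuming "as", "addr" and "len" come from the caller and
 * with error handling elided:
 *
 *	error = as_map(as, addr, len, segvn_create, zfod_argsp);
 */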
182 
183 #define	vpgtob(n)	((n) * sizeof (struct vpage))	/* For brevity */
184 
185 size_t	segvn_comb_thrshld = UINT_MAX;	/* patchable -- see 1196681 */
186 
187 size_t	segvn_pglock_comb_thrshld = (1UL << 16);	/* 64K */
188 size_t	segvn_pglock_comb_balign = (1UL << 16);		/* 64K */
189 uint_t	segvn_pglock_comb_bshift;
190 size_t	segvn_pglock_comb_palign;
191 
192 static int	segvn_concat(struct seg *, struct seg *, int);
193 static int	segvn_extend_prev(struct seg *, struct seg *,
194 		    struct segvn_crargs *, size_t);
195 static int	segvn_extend_next(struct seg *, struct seg *,
196 		    struct segvn_crargs *, size_t);
197 static void	segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
198 static void	segvn_pagelist_rele(page_t **);
199 static void	segvn_setvnode_mpss(vnode_t *);
200 static void	segvn_relocate_pages(page_t **, page_t *);
201 static int	segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
202 static int	segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
203     uint_t, page_t **, page_t **, uint_t *, int *);
204 static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
205     caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
206 static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
207     caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
208 static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
209     u_offset_t, struct vpage *, page_t **, uint_t,
210     enum fault_type, enum seg_rw, int);
211 static void	segvn_vpage(struct seg *);
212 static size_t	segvn_count_swap_by_vpages(struct seg *);
213 
214 static void segvn_purge(struct seg *seg);
215 static int segvn_reclaim(void *, caddr_t, size_t, struct page **,
216     enum seg_rw, int);
217 static int shamp_reclaim(void *, caddr_t, size_t, struct page **,
218     enum seg_rw, int);
219 
220 static int sameprot(struct seg *, caddr_t, size_t);
221 
222 static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t);
223 static int segvn_clrszc(struct seg *);
224 static struct seg *segvn_split_seg(struct seg *, caddr_t);
225 static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
226     ulong_t, uint_t);
227 
228 static void segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t,
229     size_t, void *, u_offset_t);
230 
231 static struct kmem_cache *segvn_cache;
232 static struct kmem_cache **segvn_szc_cache;
233 
234 #ifdef VM_STATS
235 static struct segvnvmstats_str {
236 	ulong_t	fill_vp_pages[31];
237 	ulong_t fltvnpages[49];
238 	ulong_t	fullszcpages[10];
239 	ulong_t	relocatepages[3];
240 	ulong_t	fltanpages[17];
241 	ulong_t pagelock[2];
242 	ulong_t	demoterange[3];
243 } segvnvmstats;
244 #endif /* VM_STATS */
245 
246 #define	SDR_RANGE	1		/* demote entire range */
247 #define	SDR_END		2		/* demote non aligned ends only */
248 
249 #define	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) {	    \
250 		if ((len) != 0) { 		      	      		      \
251 			lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);  \
252 			ASSERT(lpgaddr >= (seg)->s_base);	      	      \
253 			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) +    \
254 			    (len)), pgsz);				      \
255 			ASSERT(lpgeaddr > lpgaddr);		      	      \
256 			ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size);    \
257 		} else {					      	      \
258 			lpgeaddr = lpgaddr = (addr);	      		      \
259 		}							      \
260 	}
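
/*
 * A worked example of CALC_LPG_REGION: with pgsz = 4MB (0x400000),
 * addr = 0x12345000 and len = 0x10000 it yields
 *	lpgaddr  = P2ALIGN(0x12345000, 0x400000)   = 0x12000000
 *	lpgeaddr = P2ROUNDUP(0x12355000, 0x400000) = 0x12400000
 * i.e. the smallest pgsz-aligned region covering [addr, addr + len).
 */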
261 
262 /*ARGSUSED*/
263 static int
264 segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
265 {
266 	struct segvn_data *svd = buf;
267 
268 	rw_init(&svd->lock, NULL, RW_DEFAULT, NULL);
269 	mutex_init(&svd->segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
270 	svd->svn_trnext = svd->svn_trprev = NULL;
271 	return (0);
272 }
273 
274 /*ARGSUSED1*/
275 static void
276 segvn_cache_destructor(void *buf, void *cdrarg)
277 {
278 	struct segvn_data *svd = buf;
279 
280 	rw_destroy(&svd->lock);
281 	mutex_destroy(&svd->segfree_syncmtx);
282 }
283 
284 /*ARGSUSED*/
285 static int
286 svntr_cache_constructor(void *buf, void *cdrarg, int kmflags)
287 {
288 	bzero(buf, sizeof (svntr_t));
289 	return (0);
290 }
291 
292 /*
293  * Patching this variable to non-zero allows the system to run with
294  * stacks marked as "not executable".  It's a bit of a kludge, but is
295  * provided as a tweakable for platforms that export those ABIs
296  * (e.g. sparc V8) that have executable stacks enabled by default.
297  * There are also some restrictions for platforms that don't actually
298  * implement 'noexec' protections.
299  *
300  * Once enabled, the system is (therefore) unable to provide a fully
301  * ABI-compliant execution environment, though practically speaking,
302  * most everything works.  The exceptions are generally some interpreters
303  * and debuggers that create executable code on the stack and jump
304  * into it (without explicitly mprotecting the address range to include
305  * PROT_EXEC).
306  *
307  * One important class of applications that this disables is those
308  * that have been transformed into malicious agents using one of the
309  * numerous "buffer overflow" attacks.  See 4007890.
310  */
311 int noexec_user_stack = 0;
312 int noexec_user_stack_log = 1;
313 
314 int segvn_lpg_disable = 0;
315 uint_t segvn_maxpgszc = 0;
316 
317 ulong_t segvn_vmpss_clrszc_cnt;
318 ulong_t segvn_vmpss_clrszc_err;
319 ulong_t segvn_fltvnpages_clrszc_cnt;
320 ulong_t segvn_fltvnpages_clrszc_err;
321 ulong_t segvn_setpgsz_align_err;
322 ulong_t segvn_setpgsz_anon_align_err;
323 ulong_t segvn_setpgsz_getattr_err;
324 ulong_t segvn_setpgsz_eof_err;
325 ulong_t segvn_faultvnmpss_align_err1;
326 ulong_t segvn_faultvnmpss_align_err2;
327 ulong_t segvn_faultvnmpss_align_err3;
328 ulong_t segvn_faultvnmpss_align_err4;
329 ulong_t segvn_faultvnmpss_align_err5;
330 ulong_t	segvn_vmpss_pageio_deadlk_err;
331 
332 int segvn_use_regions = 1;
333 
334 /*
335  * Segvn supports a text replication optimization for NUMA platforms. Text
336  * replicas are represented by anon maps (amp). There's one amp per text file
337  * region per lgroup. A process chooses the amp for each of its text mappings
338  * based on the lgroup assignment of its main thread (t_tid = 1). All
339  * processes that want a replica on a particular lgroup for the same text file
340  * mapping share the same amp. amps are looked up in the svntr_hashtab hash
341  * table with vp,off,size,szc used as a key. Text replication segments are
342  * read-only MAP_PRIVATE|MAP_TEXT segments that map a vnode. Replication is
343  * achieved by forcing COW faults from vnode to amp and mapping amp pages
344  * instead of vnode pages. A replication amp is assigned to a segment when it
345  * takes its first pagefault. To handle main thread lgroup rehoming,
346  * segvn_trasync_thread periodically rechecks whether the process still maps
347  * an amp local to the main thread. If not, the async thread forces the
348  * process to remap to an amp in the main thread's new home lgroup. The
349  * current text replication implementation only benefits workloads that do
350  * most of their work in the main thread of a process, or whose threads all
351  * run in the same lgroup. To extend the benefit of text replication to
352  * other types of multithreaded workloads, further work would be needed in
353  * the hat layer to allow the same virtual address in the same hat to
354  * simultaneously map different physical addresses (i.e. page table
355  * replication would be needed for x86).
356  *
357  * amp pages are used instead of vnode pages only as long as the segment has
358  * a very simple life cycle: it's created via segvn_create(), handles S_EXEC
359  * (S_READ) pagefaults, and is fully unmapped.  If anything more complicated
360  * happens, such as a protection change, a real COW fault, a pagesize change,
361  * an MC_LOCK request, or a partial unmap, we turn off text replication by
362  * converting the segment back to a vnode-only segment (unmap the segment's
363  * address range and set svd->amp to NULL).
364  *
365  * The original file can be changed after an amp is inserted into
366  * svntr_hashtab. Processes that are launched after the file has already
367  * changed can't use the replicas created prior to the file change. To
368  * implement this, hash entries are timestamped. Replicas can only be used
369  * if the current file modification time is the same as the timestamp saved
370  * when the hash entry was created. However, timestamps alone are not
371  * sufficient to detect file modification via mmap(MAP_SHARED) mappings, so
372  * we deal with file changes via MAP_SHARED mappings differently. When
373  * writable MAP_SHARED mappings are created to vnodes marked as executable,
374  * we mark all existing replicas for this vnode as not usable for future
375  * text mappings, and we don't create new replicas for files that currently
376  * have potentially writable MAP_SHARED mappings (i.e. vn_is_mapped(V_WRITE)
377  * is true).
378  */
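
/*
 * A minimal sketch of the replica lookup described above (the real hash
 * function and svntr_t layout live in seg_vn.h; the names below are
 * illustrative only):
 *
 *	bucket = &svntr_hashtab[HASH(vp) % svntr_hashtab_sz];
 *	walk bucket->tr_head matching on (vp, off, size, szc) and
 *	    comparing the saved timestamp against the file's current mtime;
 */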
379 
380 #define	SEGVN_TEXTREPL_MAXBYTES_FACTOR	(20)
381 size_t	segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR;
382 
383 static ulong_t			svntr_hashtab_sz = 512;
384 static svntr_bucket_t		*svntr_hashtab = NULL;
385 static struct kmem_cache	*svntr_cache;
386 static svntr_stats_t		*segvn_textrepl_stats;
387 static ksema_t 			segvn_trasync_sem;
388 
389 int				segvn_disable_textrepl = 1;
390 size_t				textrepl_size_thresh = (size_t)-1;
391 size_t				segvn_textrepl_bytes = 0;
392 size_t				segvn_textrepl_max_bytes = 0;
393 clock_t				segvn_update_textrepl_interval = 0;
394 int				segvn_update_tr_time = 10;
395 int				segvn_disable_textrepl_update = 0;
396 
397 static void segvn_textrepl(struct seg *);
398 static void segvn_textunrepl(struct seg *, int);
399 static void segvn_inval_trcache(vnode_t *);
400 static void segvn_trasync_thread(void);
401 static void segvn_trupdate_wakeup(void *);
402 static void segvn_trupdate(void);
403 static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *,
404     ulong_t);
405 
406 /*
407  * Initialize segvn data structures
408  */
409 void
410 segvn_init(void)
411 {
412 	uint_t maxszc;
413 	uint_t szc;
414 	size_t pgsz;
415 
416 	segvn_cache = kmem_cache_create("segvn_cache",
417 	    sizeof (struct segvn_data), 0,
418 	    segvn_cache_constructor, segvn_cache_destructor, NULL,
419 	    NULL, NULL, 0);
420 
421 	if (segvn_lpg_disable == 0) {
422 		szc = maxszc = page_num_pagesizes() - 1;
423 		if (szc == 0) {
424 			segvn_lpg_disable = 1;
425 		}
426 		if (page_get_pagesize(0) != PAGESIZE) {
427 			panic("segvn_init: bad szc 0");
428 			/*NOTREACHED*/
429 		}
430 		while (szc != 0) {
431 			pgsz = page_get_pagesize(szc);
432 			if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) {
433 				panic("segvn_init: bad szc %d", szc);
434 				/*NOTREACHED*/
435 			}
436 			szc--;
437 		}
438 		if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc)
439 			segvn_maxpgszc = maxszc;
440 	}
441 
442 	if (segvn_maxpgszc) {
443 		segvn_szc_cache = (struct kmem_cache **)kmem_alloc(
444 		    (segvn_maxpgszc + 1) * sizeof (struct kmem_cache *),
445 		    KM_SLEEP);
446 	}
447 
448 	for (szc = 1; szc <= segvn_maxpgszc; szc++) {
449 		char	str[32];
450 
451 		(void) sprintf(str, "segvn_szc_cache%d", szc);
452 		segvn_szc_cache[szc] = kmem_cache_create(str,
453 		    page_get_pagecnt(szc) * sizeof (page_t *), 0,
454 		    NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
455 	}
456 
457 
458 	if (segvn_use_regions && !hat_supported(HAT_SHARED_REGIONS, NULL))
459 		segvn_use_regions = 0;
460 
461 	/*
462 	 * For now shared regions and text replication segvn support
463 	 * are mutually exclusive. This is acceptable because a
464 	 * significant benefit from text replication has so far only
465 	 * been observed on AMD64 NUMA platforms (due to their
466 	 * relatively small L2$ size), and we don't currently
467 	 * support shared regions on x86.
468 	 */
469 	if (segvn_use_regions && !segvn_disable_textrepl) {
470 		segvn_disable_textrepl = 1;
471 	}
472 
473 #if defined(_LP64)
474 	if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 &&
475 	    !segvn_disable_textrepl) {
476 		ulong_t i;
477 		size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t);
478 
479 		svntr_cache = kmem_cache_create("svntr_cache",
480 		    sizeof (svntr_t), 0, svntr_cache_constructor, NULL,
481 		    NULL, NULL, NULL, 0);
482 		svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP);
483 		for (i = 0; i < svntr_hashtab_sz; i++) {
484 			mutex_init(&svntr_hashtab[i].tr_lock,  NULL,
485 			    MUTEX_DEFAULT, NULL);
486 		}
487 		segvn_textrepl_max_bytes = ptob(physmem) /
488 		    segvn_textrepl_max_bytes_factor;
489 		segvn_textrepl_stats = kmem_zalloc(NCPU *
490 		    sizeof (svntr_stats_t), KM_SLEEP);
491 		sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL);
492 		(void) thread_create(NULL, 0, segvn_trasync_thread,
493 		    NULL, 0, &p0, TS_RUN, minclsyspri);
494 	}
495 #endif
496 
497 	if (!ISP2(segvn_pglock_comb_balign) ||
498 	    segvn_pglock_comb_balign < PAGESIZE) {
499 		segvn_pglock_comb_balign = 1UL << 16; /* 64K */
500 	}
501 	segvn_pglock_comb_bshift = highbit(segvn_pglock_comb_balign) - 1;
502 	segvn_pglock_comb_palign = btop(segvn_pglock_comb_balign);
503 }
504 
505 #define	SEGVN_PAGEIO	((void *)0x1)
506 #define	SEGVN_NOPAGEIO	((void *)0x2)
507 
508 static void
509 segvn_setvnode_mpss(vnode_t *vp)
510 {
511 	int err;
512 
513 	ASSERT(vp->v_mpssdata == NULL ||
514 	    vp->v_mpssdata == SEGVN_PAGEIO ||
515 	    vp->v_mpssdata == SEGVN_NOPAGEIO);
516 
517 	if (vp->v_mpssdata == NULL) {
518 		if (vn_vmpss_usepageio(vp)) {
519 			err = VOP_PAGEIO(vp, (page_t *)NULL,
520 			    (u_offset_t)0, 0, 0, CRED(), NULL);
521 		} else {
522 			err = ENOSYS;
523 		}
524 		/*
525 		 * set v_mpssdata just once per vnode life
526 		 * so that it never changes.
527 		 */
528 		mutex_enter(&vp->v_lock);
529 		if (vp->v_mpssdata == NULL) {
530 			if (err == EINVAL) {
531 				vp->v_mpssdata = SEGVN_PAGEIO;
532 			} else {
533 				vp->v_mpssdata = SEGVN_NOPAGEIO;
534 			}
535 		}
536 		mutex_exit(&vp->v_lock);
537 	}
538 }
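
/*
 * Note on the probe above: the degenerate VOP_PAGEIO() call (NULL page
 * list, zero length and offset) only tests whether the filesystem
 * implements pageio. An implementation rejects the empty request with
 * EINVAL, which is recorded as SEGVN_PAGEIO; any other result (including
 * the ENOSYS set when vn_vmpss_usepageio() says pageio can't be used)
 * is recorded as SEGVN_NOPAGEIO.
 */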
539 
540 int
541 segvn_create(struct seg **segpp, void *argsp)
542 {
543 	struct seg *seg = *segpp;
544 	extern lgrp_mem_policy_t lgrp_mem_default_policy;
545 	struct segvn_crargs *a = (struct segvn_crargs *)argsp;
546 	struct segvn_data *svd;
547 	size_t swresv = 0;
548 	struct cred *cred;
549 	struct anon_map *amp;
550 	int error = 0;
551 	size_t pgsz;
552 	lgrp_mem_policy_t mpolicy = lgrp_mem_default_policy;
553 	int use_rgn = 0;
554 	int trok = 0;
555 
556 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
557 
558 	if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) {
559 		panic("segvn_create type");
560 		/*NOTREACHED*/
561 	}
562 
563 	/*
564 	 * Check arguments.  If a shared anon structure is given then
565 	 * it is illegal to also specify a vp.
566 	 */
567 	if (a->amp != NULL && a->vp != NULL) {
568 		panic("segvn_create anon_map");
569 		/*NOTREACHED*/
570 	}
571 
572 	if (a->type == MAP_PRIVATE && (a->flags & MAP_TEXT) &&
573 	    a->vp != NULL && a->prot == (PROT_USER | PROT_READ | PROT_EXEC) &&
574 	    segvn_use_regions) {
575 		use_rgn = 1;
576 	}
577 
578 	/* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */
579 	if (a->type == MAP_SHARED)
580 		a->flags &= ~MAP_NORESERVE;
581 
582 	if (a->szc != 0) {
583 		if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) ||
584 		    (a->amp != NULL && a->type == MAP_PRIVATE) ||
585 		    (a->flags & MAP_NORESERVE) || seg->s_as == &kas) {
586 			a->szc = 0;
587 		} else {
588 			if (a->szc > segvn_maxpgszc)
589 				a->szc = segvn_maxpgszc;
590 			pgsz = page_get_pagesize(a->szc);
591 			if (!IS_P2ALIGNED(seg->s_base, pgsz) ||
592 			    !IS_P2ALIGNED(seg->s_size, pgsz)) {
593 				a->szc = 0;
594 			} else if (a->vp != NULL) {
595 				if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) {
596 					/*
597 					 * paranoid check.
598 					 * hat_page_demote() is not supported
599 					 * on swapfs pages.
600 					 */
601 					a->szc = 0;
602 				} else if (map_addr_vacalign_check(seg->s_base,
603 				    a->offset & PAGEMASK)) {
604 					a->szc = 0;
605 				}
606 			} else if (a->amp != NULL) {
607 				pgcnt_t anum = btopr(a->offset);
608 				pgcnt_t pgcnt = page_get_pagecnt(a->szc);
609 				if (!IS_P2ALIGNED(anum, pgcnt)) {
610 					a->szc = 0;
611 				}
612 			}
613 		}
614 	}
615 
616 	/*
617 	 * If segment may need private pages, reserve them now.
618 	 */
619 	if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) ||
620 	    (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) {
621 		if (anon_resv_zone(seg->s_size,
622 		    seg->s_as->a_proc->p_zone) == 0)
623 			return (EAGAIN);
624 		swresv = seg->s_size;
625 		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
626 		    seg, swresv, 1);
627 	}
628 
629 	/*
630 	 * Reserve any mapping structures that may be required.
631 	 *
632 	 * Don't do it for segments that may use regions. It's currently a
633 	 * noop in the hat implementations anyway.
634 	 */
635 	if (!use_rgn) {
636 		hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP);
637 	}
638 
639 	if (a->cred) {
640 		cred = a->cred;
641 		crhold(cred);
642 	} else {
643 		crhold(cred = CRED());
644 	}
645 
646 	/* Inform the vnode of the new mapping */
647 	if (a->vp != NULL) {
648 		error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK,
649 		    seg->s_as, seg->s_base, seg->s_size, a->prot,
650 		    a->maxprot, a->type, cred, NULL);
651 		if (error) {
652 			if (swresv != 0) {
653 				anon_unresv_zone(swresv,
654 				    seg->s_as->a_proc->p_zone);
655 				TRACE_3(TR_FAC_VM, TR_ANON_PROC,
656 				    "anon proc:%p %lu %u", seg, swresv, 0);
657 			}
658 			crfree(cred);
659 			if (!use_rgn) {
660 				hat_unload(seg->s_as->a_hat, seg->s_base,
661 				    seg->s_size, HAT_UNLOAD_UNMAP);
662 			}
663 			return (error);
664 		}
665 		/*
666 		 * svntr_hashtab will be NULL if we support shared regions.
667 		 */
668 		trok = ((a->flags & MAP_TEXT) &&
669 		    (seg->s_size > textrepl_size_thresh ||
670 		    (a->flags & _MAP_TEXTREPL)) &&
671 		    lgrp_optimizations() && svntr_hashtab != NULL &&
672 		    a->type == MAP_PRIVATE && swresv == 0 &&
673 		    !(a->flags & MAP_NORESERVE) &&
674 		    seg->s_as != &kas && a->vp->v_type == VREG);
675 
676 		ASSERT(!trok || !use_rgn);
677 	}
678 
679 	/*
680 	 * MAP_NORESERVE mappings don't count towards the VSZ of a process
681 	 * until we fault the pages in.
682 	 */
683 	if ((a->vp == NULL || a->vp->v_type != VREG) &&
684 	    a->flags & MAP_NORESERVE) {
685 		seg->s_as->a_resvsize -= seg->s_size;
686 	}
687 
688 	/*
689 	 * If more than one segment in the address space, and they're adjacent
690 	 * virtually, try to concatenate them.  Don't concatenate if an
691 	 * explicit anon_map structure was supplied (e.g., SystemV shared
692 	 * memory) or if we'll use text replication for this segment.
693 	 */
694 	if (a->amp == NULL && !use_rgn && !trok) {
695 		struct seg *pseg, *nseg;
696 		struct segvn_data *psvd, *nsvd;
697 		lgrp_mem_policy_t ppolicy, npolicy;
698 		uint_t	lgrp_mem_policy_flags = 0;
699 
700 		/*
701 		 * Memory policy flags (lgrp_mem_policy_flags) are valid when
702 		 * extending stack/heap segments.
703 		 */
704 		if ((a->vp == NULL) && (a->type == MAP_PRIVATE) &&
705 		    !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) {
706 			lgrp_mem_policy_flags = a->lgrp_mem_policy_flags;
707 		} else {
708 			/*
709 			 * Get policy when not extending it from another segment
710 			 */
711 			mpolicy = lgrp_mem_policy_default(seg->s_size, a->type);
712 		}
713 
714 		/*
715 		 * First, try to concatenate the previous and new segments
716 		 */
717 		pseg = AS_SEGPREV(seg->s_as, seg);
718 		if (pseg != NULL &&
719 		    pseg->s_base + pseg->s_size == seg->s_base &&
720 		    pseg->s_ops == &segvn_ops) {
721 			/*
722 			 * Get memory allocation policy from previous segment.
723 			 * When extension is specified (e.g. for heap) apply
724 			 * this policy to the new segment regardless of the
725 			 * outcome of segment concatenation.  Extension occurs
726 			 * for a non-default policy; otherwise the default policy
727 			 * is used, based on the extended segment size.
728 			 */
729 			psvd = (struct segvn_data *)pseg->s_data;
730 			ppolicy = psvd->policy_info.mem_policy;
731 			if (lgrp_mem_policy_flags ==
732 			    LGRP_MP_FLAG_EXTEND_UP) {
733 				if (ppolicy != lgrp_mem_default_policy) {
734 					mpolicy = ppolicy;
735 				} else {
736 					mpolicy = lgrp_mem_policy_default(
737 					    pseg->s_size + seg->s_size,
738 					    a->type);
739 				}
740 			}
741 
742 			if (mpolicy == ppolicy &&
743 			    (pseg->s_size + seg->s_size <=
744 			    segvn_comb_thrshld || psvd->amp == NULL) &&
745 			    segvn_extend_prev(pseg, seg, a, swresv) == 0) {
746 				/*
747 				 * success! now try to concatenate
748 				 * with following seg
749 				 */
750 				crfree(cred);
751 				nseg = AS_SEGNEXT(pseg->s_as, pseg);
752 				if (nseg != NULL &&
753 				    nseg != pseg &&
754 				    nseg->s_ops == &segvn_ops &&
755 				    pseg->s_base + pseg->s_size ==
756 				    nseg->s_base)
757 					(void) segvn_concat(pseg, nseg, 0);
758 				ASSERT(pseg->s_szc == 0 ||
759 				    (a->szc == pseg->s_szc &&
760 				    IS_P2ALIGNED(pseg->s_base, pgsz) &&
761 				    IS_P2ALIGNED(pseg->s_size, pgsz)));
762 				/*
763 				 * Communicate out the newly concatenated
764 				 * segment as part of the result.
765 				 */
766 				*segpp = pseg;
767 				return (0);
768 			}
769 		}
770 
771 		/*
772 		 * Failed, so try to concatenate with following seg
773 		 */
774 		nseg = AS_SEGNEXT(seg->s_as, seg);
775 		if (nseg != NULL &&
776 		    seg->s_base + seg->s_size == nseg->s_base &&
777 		    nseg->s_ops == &segvn_ops) {
778 			/*
779 			 * Get memory allocation policy from next segment.
780 			 * When extension is specified (e.g. for stack) apply
781 			 * this policy to the new segment regardless of the
782 			 * outcome of segment concatenation.  Extension occurs
783 			 * for a non-default policy; otherwise the default policy
784 			 * is used, based on the extended segment size.
785 			 */
786 			nsvd = (struct segvn_data *)nseg->s_data;
787 			npolicy = nsvd->policy_info.mem_policy;
788 			if (lgrp_mem_policy_flags ==
789 			    LGRP_MP_FLAG_EXTEND_DOWN) {
790 				if (npolicy != lgrp_mem_default_policy) {
791 					mpolicy = npolicy;
792 				} else {
793 					mpolicy = lgrp_mem_policy_default(
794 					    nseg->s_size + seg->s_size,
795 					    a->type);
796 				}
797 			}
798 
799 			if (mpolicy == npolicy &&
800 			    segvn_extend_next(seg, nseg, a, swresv) == 0) {
801 				crfree(cred);
802 				ASSERT(nseg->s_szc == 0 ||
803 				    (a->szc == nseg->s_szc &&
804 				    IS_P2ALIGNED(nseg->s_base, pgsz) &&
805 				    IS_P2ALIGNED(nseg->s_size, pgsz)));
806 				/*
807 				 * Communicate out the newly concatenated
808 				 * segment as part of the result.
809 				 */
810 				*segpp = nseg;
811 				return (0);
812 			}
813 		}
814 	}
815 
816 	if (a->vp != NULL) {
817 		VN_HOLD(a->vp);
818 		if (a->type == MAP_SHARED)
819 			lgrp_shm_policy_init(NULL, a->vp);
820 	}
821 	svd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
822 
823 	seg->s_ops = &segvn_ops;
824 	seg->s_data = (void *)svd;
825 	seg->s_szc = a->szc;
826 
827 	svd->seg = seg;
828 	svd->vp = a->vp;
829 	/*
830 	 * Anonymous mappings have no backing file so the offset is meaningless.
831 	 */
832 	svd->offset = a->vp ? (a->offset & PAGEMASK) : 0;
833 	svd->prot = a->prot;
834 	svd->maxprot = a->maxprot;
835 	svd->pageprot = 0;
836 	svd->type = a->type;
837 	svd->vpage = NULL;
838 	svd->cred = cred;
839 	svd->advice = MADV_NORMAL;
840 	svd->pageadvice = 0;
841 	svd->flags = (ushort_t)a->flags;
842 	svd->softlockcnt = 0;
843 	svd->softlockcnt_sbase = 0;
844 	svd->softlockcnt_send = 0;
845 	svd->svn_inz = 0;
846 	svd->rcookie = HAT_INVALID_REGION_COOKIE;
847 	svd->pageswap = 0;
848 
849 	if (a->szc != 0 && a->vp != NULL) {
850 		segvn_setvnode_mpss(a->vp);
851 	}
852 	if (svd->type == MAP_SHARED && svd->vp != NULL &&
853 	    (svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) {
854 		ASSERT(vn_is_mapped(svd->vp, V_WRITE));
855 		segvn_inval_trcache(svd->vp);
856 	}
857 
858 	amp = a->amp;
859 	if ((svd->amp = amp) == NULL) {
860 		svd->anon_index = 0;
861 		if (svd->type == MAP_SHARED) {
862 			svd->swresv = 0;
863 			/*
864 			 * Shared mappings to a vp need no other setup.
865 			 * If we have a shared mapping to an anon_map object
866 			 * which hasn't been allocated yet,  allocate the
867 			 * struct now so that it will be properly shared
868 			 * by remembering the swap reservation there.
869 			 */
870 			if (a->vp == NULL) {
871 				svd->amp = anonmap_alloc(seg->s_size, swresv,
872 				    ANON_SLEEP);
873 				svd->amp->a_szc = seg->s_szc;
874 			}
875 		} else {
876 			/*
877 			 * Private mapping (with or without a vp).
878 			 * Allocate anon_map when needed.
879 			 */
880 			svd->swresv = swresv;
881 		}
882 	} else {
883 		pgcnt_t anon_num;
884 
885 		/*
886 		 * Mapping to an existing anon_map structure without a vp.
887 		 * For now we will ensure that the segment size isn't larger
888 		 * than the size - offset gives us.  Later on we may wish to
889 		 * have the anon array dynamically allocated itself so that
890 		 * we don't always have to allocate all the anon pointer slots.
891 		 * This of course involves adding extra code to check that we
892 		 * aren't trying to use an anon pointer slot beyond the end
893 		 * of the currently allocated anon array.
894 		 */
895 		if ((amp->size - a->offset) < seg->s_size) {
896 			panic("segvn_create anon_map size");
897 			/*NOTREACHED*/
898 		}
899 
900 		anon_num = btopr(a->offset);
901 
902 		if (a->type == MAP_SHARED) {
903 			/*
904 			 * SHARED mapping to a given anon_map.
905 			 */
906 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
907 			amp->refcnt++;
908 			if (a->szc > amp->a_szc) {
909 				amp->a_szc = a->szc;
910 			}
911 			ANON_LOCK_EXIT(&amp->a_rwlock);
912 			svd->anon_index = anon_num;
913 			svd->swresv = 0;
914 		} else {
915 			/*
916 			 * PRIVATE mapping to a given anon_map.
917 			 * Make sure that all the needed anon
918 			 * structures are created (so that we will
919 			 * share the underlying pages if nothing
920 			 * is written by this mapping) and then
921 			 * duplicate the anon array as is done
922 			 * when a privately mapped segment is dup'ed.
923 			 */
924 			struct anon *ap;
925 			caddr_t addr;
926 			caddr_t eaddr;
927 			ulong_t	anon_idx;
928 			int hat_flag = HAT_LOAD;
929 
930 			if (svd->flags & MAP_TEXT) {
931 				hat_flag |= HAT_LOAD_TEXT;
932 			}
933 
934 			svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
935 			svd->amp->a_szc = seg->s_szc;
936 			svd->anon_index = 0;
937 			svd->swresv = swresv;
938 
939 			/*
940 			 * Prevent 2 threads from allocating anon
941 			 * slots simultaneously.
942 			 */
943 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
944 			eaddr = seg->s_base + seg->s_size;
945 
946 			for (anon_idx = anon_num, addr = seg->s_base;
947 			    addr < eaddr; addr += PAGESIZE, anon_idx++) {
948 				page_t *pp;
949 
950 				if ((ap = anon_get_ptr(amp->ahp,
951 				    anon_idx)) != NULL)
952 					continue;
953 
954 				/*
955 				 * Allocate the anon struct now.
956 				 * Might as well load up translation
957 				 * to the page while we're at it...
958 				 */
959 				pp = anon_zero(seg, addr, &ap, cred);
960 				if (ap == NULL || pp == NULL) {
961 					panic("segvn_create anon_zero");
962 					/*NOTREACHED*/
963 				}
964 
965 				/*
966 				 * Re-acquire the anon_map lock and
967 				 * initialize the anon array entry.
968 				 */
969 				ASSERT(anon_get_ptr(amp->ahp,
970 				    anon_idx) == NULL);
971 				(void) anon_set_ptr(amp->ahp, anon_idx, ap,
972 				    ANON_SLEEP);
973 
974 				ASSERT(seg->s_szc == 0);
975 				ASSERT(!IS_VMODSORT(pp->p_vnode));
976 
977 				ASSERT(use_rgn == 0);
978 				hat_memload(seg->s_as->a_hat, addr, pp,
979 				    svd->prot & ~PROT_WRITE, hat_flag);
980 
981 				page_unlock(pp);
982 			}
983 			ASSERT(seg->s_szc == 0);
984 			anon_dup(amp->ahp, anon_num, svd->amp->ahp,
985 			    0, seg->s_size);
986 			ANON_LOCK_EXIT(&amp->a_rwlock);
987 		}
988 	}
989 
990 	/*
991 	 * Set default memory allocation policy for segment
992 	 *
993 	 * Always set policy for private memory at least for initialization
994 	 * even if this is a shared memory segment
995 	 */
996 	(void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size);
997 
998 	if (svd->type == MAP_SHARED)
999 		(void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index,
1000 		    svd->vp, svd->offset, seg->s_size);
1001 
1002 	if (use_rgn) {
1003 		ASSERT(!trok);
1004 		ASSERT(svd->amp == NULL);
1005 		svd->rcookie = hat_join_region(seg->s_as->a_hat, seg->s_base,
1006 		    seg->s_size, (void *)svd->vp, svd->offset, svd->prot,
1007 		    (uchar_t)seg->s_szc, segvn_hat_rgn_unload_callback,
1008 		    HAT_REGION_TEXT);
1009 	}
1010 
1011 	ASSERT(!trok || !(svd->prot & PROT_WRITE));
1012 	svd->tr_state = trok ? SEGVN_TR_INIT : SEGVN_TR_OFF;
1013 
1014 	return (0);
1015 }
1016 
1017 /*
1018  * Concatenate two existing segments, if possible.
1019  * Return 0 on success, -1 if the two segments are not compatible,
1020  * or -2 on memory allocation failure.
1021  * If amp_cat == 1 then try to concatenate segments with anon maps.
1022  */
1023 static int
1024 segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat)
1025 {
1026 	struct segvn_data *svd1 = seg1->s_data;
1027 	struct segvn_data *svd2 = seg2->s_data;
1028 	struct anon_map *amp1 = svd1->amp;
1029 	struct anon_map *amp2 = svd2->amp;
1030 	struct vpage *vpage1 = svd1->vpage;
1031 	struct vpage *vpage2 = svd2->vpage, *nvpage = NULL;
1032 	size_t size, nvpsize;
1033 	pgcnt_t npages1, npages2;
1034 
1035 	ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as);
1036 	ASSERT(AS_WRITE_HELD(seg1->s_as));
1037 	ASSERT(seg1->s_ops == seg2->s_ops);
1038 
1039 	if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie) ||
1040 	    HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
1041 		return (-1);
1042 	}
1043 
1044 	/* both segments exist, try to merge them */
1045 #define	incompat(x)	(svd1->x != svd2->x)
1046 	if (incompat(vp) || incompat(maxprot) ||
1047 	    (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) ||
1048 	    (!svd1->pageprot && !svd2->pageprot && incompat(prot)) ||
1049 	    incompat(type) || incompat(cred) || incompat(flags) ||
1050 	    seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) ||
1051 	    (svd2->softlockcnt > 0) || svd1->softlockcnt_send > 0)
1052 		return (-1);
1053 #undef incompat
1054 
1055 	/*
1056 	 * vp == NULL implies zfod, offset doesn't matter
1057 	 */
1058 	if (svd1->vp != NULL &&
1059 	    svd1->offset + seg1->s_size != svd2->offset) {
1060 		return (-1);
1061 	}
1062 
1063 	/*
1064 	 * Don't concatenate if either segment uses text replication.
1065 	 */
1066 	if (svd1->tr_state != SEGVN_TR_OFF || svd2->tr_state != SEGVN_TR_OFF) {
1067 		return (-1);
1068 	}
1069 
1070 	/*
1071 	 * Fail early if we're not supposed to concatenate
1072 	 * segments with non NULL amp.
1073 	 */
1074 	if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) {
1075 		return (-1);
1076 	}
1077 
1078 	if (svd1->vp == NULL && svd1->type == MAP_SHARED) {
1079 		if (amp1 != amp2) {
1080 			return (-1);
1081 		}
1082 		if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) !=
1083 		    svd2->anon_index) {
1084 			return (-1);
1085 		}
1086 		ASSERT(amp1 == NULL || amp1->refcnt >= 2);
1087 	}
1088 
1089 	/*
1090 	 * If either seg has vpages, create a new merged vpage array.
1091 	 */
1092 	if (vpage1 != NULL || vpage2 != NULL) {
1093 		struct vpage *vp, *evp;
1094 
1095 		npages1 = seg_pages(seg1);
1096 		npages2 = seg_pages(seg2);
1097 		nvpsize = vpgtob(npages1 + npages2);
1098 
1099 		if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) {
1100 			return (-2);
1101 		}
1102 
1103 		if (vpage1 != NULL) {
1104 			bcopy(vpage1, nvpage, vpgtob(npages1));
1105 		} else {
1106 			evp = nvpage + npages1;
1107 			for (vp = nvpage; vp < evp; vp++) {
1108 				VPP_SETPROT(vp, svd1->prot);
1109 				VPP_SETADVICE(vp, svd1->advice);
1110 			}
1111 		}
1112 
1113 		if (vpage2 != NULL) {
1114 			bcopy(vpage2, nvpage + npages1, vpgtob(npages2));
1115 		} else {
1116 			evp = nvpage + npages1 + npages2;
1117 			for (vp = nvpage + npages1; vp < evp; vp++) {
1118 				VPP_SETPROT(vp, svd2->prot);
1119 				VPP_SETADVICE(vp, svd2->advice);
1120 			}
1121 		}
1122 
1123 		if (svd2->pageswap && (!svd1->pageswap && svd1->swresv)) {
1124 			ASSERT(svd1->swresv == seg1->s_size);
1125 			ASSERT(!(svd1->flags & MAP_NORESERVE));
1126 			ASSERT(!(svd2->flags & MAP_NORESERVE));
1127 			evp = nvpage + npages1;
1128 			for (vp = nvpage; vp < evp; vp++) {
1129 				VPP_SETSWAPRES(vp);
1130 			}
1131 		}
1132 
1133 		if (svd1->pageswap && (!svd2->pageswap && svd2->swresv)) {
1134 			ASSERT(svd2->swresv == seg2->s_size);
1135 			ASSERT(!(svd1->flags & MAP_NORESERVE));
1136 			ASSERT(!(svd2->flags & MAP_NORESERVE));
1137 			vp = nvpage + npages1;
1138 			evp = vp + npages2;
1139 			for (; vp < evp; vp++) {
1140 				VPP_SETSWAPRES(vp);
1141 			}
1142 		}
1143 	}
1144 	ASSERT((vpage1 != NULL || vpage2 != NULL) ||
1145 	    (svd1->pageswap == 0 && svd2->pageswap == 0));
1146 
1147 	/*
1148 	 * If either segment has private pages, create a new merged anon
1149 	 * array. If merging shared anon segments, just decrement the anon
1150 	 * map's refcnt.
1151 	 */
1152 	if (amp1 != NULL && svd1->type == MAP_SHARED) {
1153 		ASSERT(amp1 == amp2 && svd1->vp == NULL);
1154 		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
1155 		ASSERT(amp1->refcnt >= 2);
1156 		amp1->refcnt--;
1157 		ANON_LOCK_EXIT(&amp1->a_rwlock);
1158 		svd2->amp = NULL;
1159 	} else if (amp1 != NULL || amp2 != NULL) {
1160 		struct anon_hdr *nahp;
1161 		struct anon_map *namp = NULL;
1162 		size_t asize;
1163 
1164 		ASSERT(svd1->type == MAP_PRIVATE);
1165 
1166 		asize = seg1->s_size + seg2->s_size;
1167 		if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
1168 			if (nvpage != NULL) {
1169 				kmem_free(nvpage, nvpsize);
1170 			}
1171 			return (-2);
1172 		}
1173 		if (amp1 != NULL) {
1174 			/*
1175 			 * XXX anon rwlock is not really needed because
1176 			 * this is a private segment and we are writers.
1177 			 */
1178 			ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
1179 			ASSERT(amp1->refcnt == 1);
1180 			if (anon_copy_ptr(amp1->ahp, svd1->anon_index,
1181 			    nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) {
1182 				anon_release(nahp, btop(asize));
1183 				ANON_LOCK_EXIT(&amp1->a_rwlock);
1184 				if (nvpage != NULL) {
1185 					kmem_free(nvpage, nvpsize);
1186 				}
1187 				return (-2);
1188 			}
1189 		}
1190 		if (amp2 != NULL) {
1191 			ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
1192 			ASSERT(amp2->refcnt == 1);
1193 			if (anon_copy_ptr(amp2->ahp, svd2->anon_index,
1194 			    nahp, btop(seg1->s_size), btop(seg2->s_size),
1195 			    ANON_NOSLEEP)) {
1196 				anon_release(nahp, btop(asize));
1197 				ANON_LOCK_EXIT(&amp2->a_rwlock);
1198 				if (amp1 != NULL) {
1199 					ANON_LOCK_EXIT(&amp1->a_rwlock);
1200 				}
1201 				if (nvpage != NULL) {
1202 					kmem_free(nvpage, nvpsize);
1203 				}
1204 				return (-2);
1205 			}
1206 		}
1207 		if (amp1 != NULL) {
1208 			namp = amp1;
1209 			anon_release(amp1->ahp, btop(amp1->size));
1210 		}
1211 		if (amp2 != NULL) {
1212 			if (namp == NULL) {
1213 				ASSERT(amp1 == NULL);
1214 				namp = amp2;
1215 				anon_release(amp2->ahp, btop(amp2->size));
1216 			} else {
1217 				amp2->refcnt--;
1218 				ANON_LOCK_EXIT(&amp2->a_rwlock);
1219 				anonmap_free(amp2);
1220 			}
1221 			svd2->amp = NULL; /* needed for seg_free */
1222 		}
1223 		namp->ahp = nahp;
1224 		namp->size = asize;
1225 		svd1->amp = namp;
1226 		svd1->anon_index = 0;
1227 		ANON_LOCK_EXIT(&namp->a_rwlock);
1228 	}
1229 	/*
1230 	 * Now free the old vpage structures.
1231 	 */
1232 	if (nvpage != NULL) {
1233 		if (vpage1 != NULL) {
1234 			kmem_free(vpage1, vpgtob(npages1));
1235 		}
1236 		if (vpage2 != NULL) {
1237 			svd2->vpage = NULL;
1238 			kmem_free(vpage2, vpgtob(npages2));
1239 		}
1240 		if (svd2->pageprot) {
1241 			svd1->pageprot = 1;
1242 		}
1243 		if (svd2->pageadvice) {
1244 			svd1->pageadvice = 1;
1245 		}
1246 		if (svd2->pageswap) {
1247 			svd1->pageswap = 1;
1248 		}
1249 		svd1->vpage = nvpage;
1250 	}
1251 
1252 	/* all looks ok, merge segments */
1253 	svd1->swresv += svd2->swresv;
1254 	svd2->swresv = 0;  /* so seg_free doesn't release swap space */
1255 	size = seg2->s_size;
1256 	seg_free(seg2);
1257 	seg1->s_size += size;
1258 	return (0);
1259 }
1260 
1261 /*
1262  * Extend the previous segment (seg1) to include the
1263  * new segment (seg2 + a), if possible.
1264  * Return 0 on success.
1265  */
1266 static int
1267 segvn_extend_prev(struct seg *seg1, struct seg *seg2, struct segvn_crargs *a,
1268     size_t swresv)
1269 {
1270 	struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data;
1271 	size_t size;
1272 	struct anon_map *amp1;
1273 	struct vpage *new_vpage;
1274 
1275 	/*
1276 	 * We don't need any segment level locks for "segvn" data
1277 	 * since the address space is "write" locked.
1278 	 */
1279 	ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as));
1280 
1281 	if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie)) {
1282 		return (-1);
1283 	}
1284 
1285 	/* second segment is new, try to extend first */
1286 	/* XXX - should also check cred */
1287 	if (svd1->vp != a->vp || svd1->maxprot != a->maxprot ||
1288 	    (!svd1->pageprot && (svd1->prot != a->prot)) ||
1289 	    svd1->type != a->type || svd1->flags != a->flags ||
1290 	    seg1->s_szc != a->szc || svd1->softlockcnt_send > 0)
1291 		return (-1);
1292 
1293 	/* vp == NULL implies zfod, offset doesn't matter */
1294 	if (svd1->vp != NULL &&
1295 	    svd1->offset + seg1->s_size != (a->offset & PAGEMASK))
1296 		return (-1);
1297 
1298 	if (svd1->tr_state != SEGVN_TR_OFF) {
1299 		return (-1);
1300 	}
1301 
1302 	amp1 = svd1->amp;
1303 	if (amp1) {
1304 		pgcnt_t newpgs;
1305 
1306 		/*
1307 		 * Segment has private pages, can data structures
1308 		 * be expanded?
1309 		 *
1310 		 * Acquire the anon_map lock to prevent it from changing,
1311 		 * if it is shared.  This ensures that the anon_map
1312 		 * will not change while a thread which has a read/write
1313 		 * lock on an address space references it.
1314 		 * XXX - Don't need the anon_map lock at all if "refcnt"
1315 		 * is 1.
1316 		 *
1317 		 * Can't grow a MAP_SHARED segment with an anonmap because
1318 		 * there may be existing anon slots where we want to extend
1319 		 * the segment and we wouldn't know what to do with them
1320 		 * (e.g., for tmpfs right thing is to just leave them there,
1321 		 * for /dev/zero they should be cleared out).
1322 		 */
1323 		if (svd1->type == MAP_SHARED)
1324 			return (-1);
1325 
1326 		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
1327 		if (amp1->refcnt > 1) {
1328 			ANON_LOCK_EXIT(&amp1->a_rwlock);
1329 			return (-1);
1330 		}
1331 		newpgs = anon_grow(amp1->ahp, &svd1->anon_index,
1332 		    btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP);
1333 
1334 		if (newpgs == 0) {
1335 			ANON_LOCK_EXIT(&amp1->a_rwlock);
1336 			return (-1);
1337 		}
1338 		amp1->size = ptob(newpgs);
1339 		ANON_LOCK_EXIT(&amp1->a_rwlock);
1340 	}
1341 	if (svd1->vpage != NULL) {
1342 		struct vpage *vp, *evp;
1343 		new_vpage =
1344 		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
1345 		    KM_NOSLEEP);
1346 		if (new_vpage == NULL)
1347 			return (-1);
1348 		bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1)));
1349 		kmem_free(svd1->vpage, vpgtob(seg_pages(seg1)));
1350 		svd1->vpage = new_vpage;
1351 
1352 		vp = new_vpage + seg_pages(seg1);
1353 		evp = vp + seg_pages(seg2);
1354 		for (; vp < evp; vp++)
1355 			VPP_SETPROT(vp, a->prot);
1356 		if (svd1->pageswap && swresv) {
1357 			ASSERT(!(svd1->flags & MAP_NORESERVE));
1358 			ASSERT(swresv == seg2->s_size);
1359 			vp = new_vpage + seg_pages(seg1);
1360 			for (; vp < evp; vp++) {
1361 				VPP_SETSWAPRES(vp);
1362 			}
1363 		}
1364 	}
1365 	ASSERT(svd1->vpage != NULL || svd1->pageswap == 0);
1366 	size = seg2->s_size;
1367 	seg_free(seg2);
1368 	seg1->s_size += size;
1369 	svd1->swresv += swresv;
1370 	if (svd1->pageprot && (a->prot & PROT_WRITE) &&
1371 	    svd1->type == MAP_SHARED && svd1->vp != NULL &&
1372 	    (svd1->vp->v_flag & VVMEXEC)) {
1373 		ASSERT(vn_is_mapped(svd1->vp, V_WRITE));
1374 		segvn_inval_trcache(svd1->vp);
1375 	}
1376 	return (0);
1377 }
1378 
1379 /*
1380  * Extend the next segment (seg2) to include the
1381  * new segment (seg1 + a), if possible.
1382  * Return 0 on success.
1383  */
1384 static int
1385 segvn_extend_next(struct seg *seg1, struct seg *seg2, struct segvn_crargs *a,
1386     size_t swresv)
1387 {
1388 	struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data;
1389 	size_t size;
1390 	struct anon_map *amp2;
1391 	struct vpage *new_vpage;
1392 
1393 	/*
1394 	 * We don't need any segment level locks for "segvn" data
1395 	 * since the address space is "write" locked.
1396 	 */
1397 	ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as));
1398 
1399 	if (HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
1400 		return (-1);
1401 	}
1402 
1403 	/* first segment is new, try to extend second */
1404 	/* XXX - should also check cred */
1405 	if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
1406 	    (!svd2->pageprot && (svd2->prot != a->prot)) ||
1407 	    svd2->type != a->type || svd2->flags != a->flags ||
1408 	    seg2->s_szc != a->szc || svd2->softlockcnt_sbase > 0)
1409 		return (-1);
1410 	/* vp == NULL implies zfod, offset doesn't matter */
1411 	if (svd2->vp != NULL &&
1412 	    (a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
1413 		return (-1);
1414 
1415 	if (svd2->tr_state != SEGVN_TR_OFF) {
1416 		return (-1);
1417 	}
1418 
1419 	amp2 = svd2->amp;
1420 	if (amp2) {
1421 		pgcnt_t newpgs;
1422 
1423 		/*
1424 		 * Segment has private pages, can data structures
1425 		 * be expanded?
1426 		 *
1427 		 * Acquire the anon_map lock to prevent it from changing,
1428 		 * if it is shared.  This ensures that the anon_map
1429 		 * will not change while a thread which has a read/write
1430 		 * lock on an address space references it.
1431 		 *
1432 		 * XXX - Don't need the anon_map lock at all if "refcnt"
1433 		 * is 1.
1434 		 */
1435 		if (svd2->type == MAP_SHARED)
1436 			return (-1);
1437 
1438 		ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
1439 		if (amp2->refcnt > 1) {
1440 			ANON_LOCK_EXIT(&amp2->a_rwlock);
1441 			return (-1);
1442 		}
1443 		newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
1444 		    btop(seg2->s_size), btop(seg1->s_size),
1445 		    ANON_NOSLEEP | ANON_GROWDOWN);
1446 
1447 		if (newpgs == 0) {
1448 			ANON_LOCK_EXIT(&amp2->a_rwlock);
1449 			return (-1);
1450 		}
1451 		amp2->size = ptob(newpgs);
1452 		ANON_LOCK_EXIT(&amp2->a_rwlock);
1453 	}
1454 	if (svd2->vpage != NULL) {
1455 		struct vpage *vp, *evp;
1456 		new_vpage =
1457 		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
1458 		    KM_NOSLEEP);
1459 		if (new_vpage == NULL) {
1460 			/* Not merging segments so adjust anon_index back */
1461 			if (amp2)
1462 				svd2->anon_index += seg_pages(seg1);
1463 			return (-1);
1464 		}
1465 		bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
1466 		    vpgtob(seg_pages(seg2)));
1467 		kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
1468 		svd2->vpage = new_vpage;
1469 
1470 		vp = new_vpage;
1471 		evp = vp + seg_pages(seg1);
1472 		for (; vp < evp; vp++)
1473 			VPP_SETPROT(vp, a->prot);
1474 		if (svd2->pageswap && swresv) {
1475 			ASSERT(!(svd2->flags & MAP_NORESERVE));
1476 			ASSERT(swresv == seg1->s_size);
1477 			vp = new_vpage;
1478 			for (; vp < evp; vp++) {
1479 				VPP_SETSWAPRES(vp);
1480 			}
1481 		}
1482 	}
1483 	ASSERT(svd2->vpage != NULL || svd2->pageswap == 0);
1484 	size = seg1->s_size;
1485 	seg_free(seg1);
1486 	seg2->s_size += size;
1487 	seg2->s_base -= size;
1488 	svd2->offset -= size;
1489 	svd2->swresv += swresv;
1490 	if (svd2->pageprot && (a->prot & PROT_WRITE) &&
1491 	    svd2->type == MAP_SHARED && svd2->vp != NULL &&
1492 	    (svd2->vp->v_flag & VVMEXEC)) {
1493 		ASSERT(vn_is_mapped(svd2->vp, V_WRITE));
1494 		segvn_inval_trcache(svd2->vp);
1495 	}
1496 	return (0);
1497 }
1498 
1499 /*
1500  * Duplicate all the pages in the segment. This may break COW sharing for a
1501  * given page. If the page is marked with inherit zero set, then instead of
1502  * duplicating the page, we zero the page.
1503  */
1504 static int
1505 segvn_dup_pages(struct seg *seg, struct seg *newseg)
1506 {
1507 	int error;
1508 	uint_t prot;
1509 	page_t *pp;
1510 	struct anon *ap, *newap;
1511 	size_t i;
1512 	caddr_t addr;
1513 
1514 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
1515 	struct segvn_data *newsvd = (struct segvn_data *)newseg->s_data;
1516 	ulong_t old_idx = svd->anon_index;
1517 	ulong_t new_idx = 0;
1518 
1519 	i = btopr(seg->s_size);
1520 	addr = seg->s_base;
1521 
1522 	/*
1523 	 * XXX break cow sharing using PAGESIZE
1524 	 * pages. They will be relocated into larger
1525 	 * pages at fault time.
1526 	 */
1527 	while (i-- > 0) {
1528 		if ((ap = anon_get_ptr(svd->amp->ahp, old_idx)) != NULL) {
1529 			struct vpage *vpp;
1530 
1531 			vpp = &svd->vpage[seg_page(seg, addr)];
1532 
1533 			/*
1534 			 * prot need not be computed below because anon_private
1535 			 * is going to ignore it anyway, as the child doesn't
1536 			 * inherit the pagelock from the parent.
1537 			 */
1538 			prot = svd->pageprot ? VPP_PROT(vpp) : svd->prot;
1539 
1540 			/*
1541 			 * Check whether we should zero this or dup it.
1542 			 */
1543 			if (svd->svn_inz == SEGVN_INZ_ALL ||
1544 			    (svd->svn_inz == SEGVN_INZ_VPP &&
1545 			    VPP_ISINHZERO(vpp))) {
1546 				pp = anon_zero(newseg, addr, &newap,
1547 				    newsvd->cred);
1548 			} else {
1549 				page_t *anon_pl[1+1];
1550 				uint_t vpprot;
1551 				error = anon_getpage(&ap, &vpprot, anon_pl,
1552 				    PAGESIZE, seg, addr, S_READ, svd->cred);
1553 				if (error != 0)
1554 					return (error);
1555 
1556 				pp = anon_private(&newap, newseg, addr, prot,
1557 				    anon_pl[0], 0, newsvd->cred);
1558 			}
1559 			if (pp == NULL) {
1560 				return (ENOMEM);
1561 			}
1562 			(void) anon_set_ptr(newsvd->amp->ahp, new_idx, newap,
1563 			    ANON_SLEEP);
1564 			page_unlock(pp);
1565 		}
1566 		addr += PAGESIZE;
1567 		old_idx++;
1568 		new_idx++;
1569 	}
1570 
1571 	return (0);
1572 }
1573 
1574 static int
1575 segvn_dup(struct seg *seg, struct seg *newseg)
1576 {
1577 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
1578 	struct segvn_data *newsvd;
1579 	pgcnt_t npages = seg_pages(seg);
1580 	int error = 0;
1581 	size_t len;
1582 	struct anon_map *amp;
1583 
1584 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1585 	ASSERT(newseg->s_as->a_proc->p_parent == curproc);
1586 
1587 	/*
1588 	 * If segment has anon reserved, reserve more for the new seg.
1589 	 * For a MAP_NORESERVE segment swresv will be a count of all the
1590 	 * allocated anon slots; thus we reserve for the child as many slots
1591 	 * as the parent has allocated. This semantic prevents the child or
1592 	 * parent from dying during a copy-on-write fault caused by trying
1593 	 * to write a shared pre-existing anon page.
1594 	 */
1595 	if ((len = svd->swresv) != 0) {
1596 		if (anon_resv(svd->swresv) == 0)
1597 			return (ENOMEM);
1598 
1599 		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
1600 		    seg, len, 0);
1601 	}
1602 
1603 	newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
1604 
1605 	newseg->s_ops = &segvn_ops;
1606 	newseg->s_data = (void *)newsvd;
1607 	newseg->s_szc = seg->s_szc;
1608 
1609 	newsvd->seg = newseg;
1610 	if ((newsvd->vp = svd->vp) != NULL) {
1611 		VN_HOLD(svd->vp);
1612 		if (svd->type == MAP_SHARED)
1613 			lgrp_shm_policy_init(NULL, svd->vp);
1614 	}
1615 	newsvd->offset = svd->offset;
1616 	newsvd->prot = svd->prot;
1617 	newsvd->maxprot = svd->maxprot;
1618 	newsvd->pageprot = svd->pageprot;
1619 	newsvd->type = svd->type;
1620 	newsvd->cred = svd->cred;
1621 	crhold(newsvd->cred);
1622 	newsvd->advice = svd->advice;
1623 	newsvd->pageadvice = svd->pageadvice;
1624 	newsvd->svn_inz = svd->svn_inz;
1625 	newsvd->swresv = svd->swresv;
1626 	newsvd->pageswap = svd->pageswap;
1627 	newsvd->flags = svd->flags;
1628 	newsvd->softlockcnt = 0;
1629 	newsvd->softlockcnt_sbase = 0;
1630 	newsvd->softlockcnt_send = 0;
1631 	newsvd->policy_info = svd->policy_info;
1632 	newsvd->rcookie = HAT_INVALID_REGION_COOKIE;
1633 
1634 	if ((amp = svd->amp) == NULL || svd->tr_state == SEGVN_TR_ON) {
1635 		/*
1636 		 * Not attaching to a shared anon object.
1637 		 */
1638 		ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie) ||
1639 		    svd->tr_state == SEGVN_TR_OFF);
1640 		if (svd->tr_state == SEGVN_TR_ON) {
1641 			ASSERT(newsvd->vp != NULL && amp != NULL);
1642 			newsvd->tr_state = SEGVN_TR_INIT;
1643 		} else {
1644 			newsvd->tr_state = svd->tr_state;
1645 		}
1646 		newsvd->amp = NULL;
1647 		newsvd->anon_index = 0;
1648 	} else {
1649 		/* regions for now are only used on pure vnode segments */
1650 		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
1651 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
1652 		newsvd->tr_state = SEGVN_TR_OFF;
1653 		if (svd->type == MAP_SHARED) {
1654 			ASSERT(svd->svn_inz == SEGVN_INZ_NONE);
1655 			newsvd->amp = amp;
1656 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1657 			amp->refcnt++;
1658 			ANON_LOCK_EXIT(&amp->a_rwlock);
1659 			newsvd->anon_index = svd->anon_index;
1660 		} else {
1661 			int reclaim = 1;
1662 
1663 			/*
1664 			 * Allocate and initialize new anon_map structure.
1665 			 */
1666 			newsvd->amp = anonmap_alloc(newseg->s_size, 0,
1667 			    ANON_SLEEP);
1668 			newsvd->amp->a_szc = newseg->s_szc;
1669 			newsvd->anon_index = 0;
1670 			ASSERT(svd->svn_inz == SEGVN_INZ_NONE ||
1671 			    svd->svn_inz == SEGVN_INZ_ALL ||
1672 			    svd->svn_inz == SEGVN_INZ_VPP);
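			/*
			 * The inherit-zero states asserted above are,
			 * roughly: SEGVN_INZ_NONE - no pages marked
			 * inherit-zero; SEGVN_INZ_ALL - the whole
			 * segment is marked; SEGVN_INZ_VPP - the
			 * marking is kept per-page in the vpage array.
			 */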
1673 
1674 			/*
1675 			 * We don't have to acquire the anon_map lock
1676 			 * for the new segment (since it belongs to an
1677 			 * address space that is still not associated
1678 			 * with any process), or the segment in the old
1679 			 * address space (since all threads in it
1680 			 * are stopped while duplicating the address space).
1681 			 */
1682 
1683 			/*
1684 			 * The goal of the following code is to make sure that
1685 			 * softlocked pages do not end up as copy on write
1686 			 * pages.  This would cause problems where one
1687 			 * thread writes to a page that is COW and a different
1688 			 * thread in the same process has softlocked it.  The
1689 			 * softlock lock would move away from this process
1690 			 * because the write would cause this process to get
1691 			 * a copy (without the softlock).
1692 			 *
1693 			 * The strategy here is to just break the
1694 			 * sharing on pages that could possibly be
1695 			 * softlocked.
1696 			 *
1697 			 * In addition, if any pages have been marked that they
1698 			 * should be inherited as zero, then we immediately go
1699 			 * ahead and break COW and zero them. In the case of a
1700 			 * softlocked page that should be inherited zero, we
1701 			 * break COW and just get a zero page.
1702 			 */
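			/*
			 * An illustrative timeline of the hazard
			 * (hypothetical threads T1 and T2 in the parent):
			 *
			 *	T1: softlocks page P for pending I/O
			 *	parent: fork()s; P is now COW-shared
			 *	T2: writes P; parent gets private copy P'
			 *	T1: I/O lands in P, but the parent now
			 *	    maps P' - the data is lost
			 *
			 * Copying softlocked pages eagerly in
			 * segvn_dup_pages() closes this window.
			 */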
1703 retry:
1704 			if (svd->softlockcnt ||
1705 			    svd->svn_inz != SEGVN_INZ_NONE) {
1706 				/*
1707 				 * The softlock count might be non-zero
1708 				 * because some pages are still stuck in the
1709 				 * cache for lazy reclaim. Flush the cache
1710 				 * now; this should drop the count to zero,
1711 				 * or else there really is I/O going on to
1712 				 * these pages. Note that we hold the writer's
1713 				 * lock, so nothing gets inserted during the flush.
1714 				 */
1715 				if (svd->softlockcnt && reclaim == 1) {
1716 					segvn_purge(seg);
1717 					reclaim = 0;
1718 					goto retry;
1719 				}
1720 
1721 				error = segvn_dup_pages(seg, newseg);
1722 				if (error != 0) {
1723 					newsvd->vpage = NULL;
1724 					goto out;
1725 				}
1726 			} else {	/* common case */
1727 				if (seg->s_szc != 0) {
1728 					/*
1729 					 * If at least one of the anon slots of
1730 					 * a large page exists, then make sure
1731 					 * all anon slots of that large page
1732 					 * exist to avoid partial COW sharing
1733 					 * of a large page in the future.
1734 					 */
1735 					anon_dup_fill_holes(amp->ahp,
1736 					    svd->anon_index, newsvd->amp->ahp,
1737 					    0, seg->s_size, seg->s_szc,
1738 					    svd->vp != NULL);
1739 				} else {
1740 					anon_dup(amp->ahp, svd->anon_index,
1741 					    newsvd->amp->ahp, 0, seg->s_size);
1742 				}
1743 
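				/*
				 * Write-protect the parent's existing
				 * translations so its next store to any
				 * now-shared anon page faults and breaks
				 * COW.
				 */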
1744 				hat_clrattr(seg->s_as->a_hat, seg->s_base,
1745 				    seg->s_size, PROT_WRITE);
1746 			}
1747 		}
1748 	}
1749 	/*
1750 	 * If necessary, create a vpage structure for the new segment.
1751 	 * Do not copy any page lock indications.
1752 	 */
1753 	if (svd->vpage != NULL) {
1754 		uint_t i;
1755 		struct vpage *ovp = svd->vpage;
1756 		struct vpage *nvp;
1757 
1758 		nvp = newsvd->vpage =
1759 		    kmem_alloc(vpgtob(npages), KM_SLEEP);
1760 		for (i = 0; i < npages; i++) {
1761 			*nvp = *ovp++;
1762 			VPP_CLRPPLOCK(nvp++);
1763 		}
1764 	} else
1765 		newsvd->vpage = NULL;
1766 
1767 	/* Inform the vnode of the new mapping */
1768 	if (newsvd->vp != NULL) {
1769 		error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset,
1770 		    newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot,
1771 		    newsvd->maxprot, newsvd->type, newsvd->cred, NULL);
1772 	}
1773 out:
1774 	if (error == 0 && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
1775 		ASSERT(newsvd->amp == NULL);
1776 		ASSERT(newsvd->tr_state == SEGVN_TR_OFF);
1777 		newsvd->rcookie = svd->rcookie;
1778 		hat_dup_region(newseg->s_as->a_hat, newsvd->rcookie);
1779 	}
1780 	return (error);
1781 }
1782 
1783 
1784 /*
1785  * callback function to invoke free_vp_pages() for only those pages actually
1786  * processed by the HAT when a shared region is destroyed.
1787  */
1788 extern int free_pages;
1789 
1790 static void
1791 segvn_hat_rgn_unload_callback(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr,
1792     size_t r_size, void *r_obj, u_offset_t r_objoff)
1793 {
1794 	u_offset_t off;
1795 	size_t len;
1796 	vnode_t *vp = (vnode_t *)r_obj;
1797 
1798 	ASSERT(eaddr > saddr);
1799 	ASSERT(saddr >= r_saddr);
1800 	ASSERT(saddr < r_saddr + r_size);
1801 	ASSERT(eaddr > r_saddr);
1802 	ASSERT(eaddr <= r_saddr + r_size);
1803 	ASSERT(vp != NULL);
1804 
1805 	if (!free_pages) {
1806 		return;
1807 	}
1808 
1809 	len = eaddr - saddr;
1810 	off = (saddr - r_saddr) + r_objoff;
1811 	free_vp_pages(vp, off, len);
1812 }
1813 
1814 /*
1815  * callback function used by segvn_unmap to invoke free_vp_pages() for only
1816  * those pages actually processed by the HAT
1817  */
1818 static void
1819 segvn_hat_unload_callback(hat_callback_t *cb)
1820 {
1821 	struct seg		*seg = cb->hcb_data;
1822 	struct segvn_data	*svd = (struct segvn_data *)seg->s_data;
1823 	size_t			len;
1824 	u_offset_t		off;
1825 
1826 	ASSERT(svd->vp != NULL);
1827 	ASSERT(cb->hcb_end_addr > cb->hcb_start_addr);
1828 	ASSERT(cb->hcb_start_addr >= seg->s_base);
1829 
1830 	len = cb->hcb_end_addr - cb->hcb_start_addr;
1831 	off = cb->hcb_start_addr - seg->s_base;
1832 	free_vp_pages(svd->vp, svd->offset + off, len);
1833 }
1834 
1835 /*
1836  * This function determines the number of bytes of swap reserved by
1837  * a segment for which per-page accounting is present. It is used to
1838  * calculate the correct value of a segvn_data's swresv.
1839  */
1840 static size_t
1841 segvn_count_swap_by_vpages(struct seg *seg)
1842 {
1843 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
1844 	struct vpage *vp, *evp;
1845 	size_t nswappages = 0;
1846 
1847 	ASSERT(svd->pageswap);
1848 	ASSERT(svd->vpage != NULL);
1849 
1850 	evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)];
1851 
1852 	for (vp = svd->vpage; vp < evp; vp++) {
1853 		if (VPP_ISSWAPRES(vp))
1854 			nswappages++;
1855 	}
1856 
1857 	return (nswappages << PAGESHIFT);
1858 }
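
/*
 * For example (hypothetical layout): in an 8-page segment where swap was
 * reserved page-by-page for pages 2, 5 and 6 only, three vpage entries
 * test true for VPP_ISSWAPRES() and the routine returns
 *
 *	3 << PAGESHIFT == ptob(3)
 *
 * bytes, which becomes the segment's recomputed swresv after a partial
 * unmap.
 */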
1859 
1860 static int
1861 segvn_unmap(struct seg *seg, caddr_t addr, size_t len)
1862 {
1863 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
1864 	struct segvn_data *nsvd;
1865 	struct seg *nseg;
1866 	struct anon_map *amp;
1867 	pgcnt_t	opages;		/* old segment size in pages */
1868 	pgcnt_t	npages;		/* new segment size in pages */
1869 	pgcnt_t	dpages;		/* pages being deleted (unmapped) */
1870 	hat_callback_t callback;	/* used for free_vp_pages() */
1871 	hat_callback_t *cbp = NULL;
1872 	caddr_t nbase;
1873 	size_t nsize;
1874 	size_t oswresv;
1875 	int reclaim = 1;
1876 
1877 	/*
1878 	 * We don't need any segment level locks for "segvn" data
1879 	 * since the address space is "write" locked.
1880 	 */
1881 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1882 
1883 	/*
1884 	 * Fail the unmap if pages are SOFTLOCKed through this mapping.
1885 	 * softlockcnt is protected from change by the as write lock.
1886 	 */
1887 retry:
1888 	if (svd->softlockcnt > 0) {
1889 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
1890 
1891 		/*
1892 		 * If this is a shared segment, a non-zero softlockcnt
1893 		 * means locked pages are still in use.
1894 		 */
1895 		if (svd->type == MAP_SHARED) {
1896 			return (EAGAIN);
1897 		}
1898 
1899 		/*
1900 		 * Since we hold the writer's lock, nobody can fill
1901 		 * the cache during the purge. The flush either succeeds
1902 		 * or we still have pending I/Os.
1903 		 */
1904 		if (reclaim == 1) {
1905 			segvn_purge(seg);
1906 			reclaim = 0;
1907 			goto retry;
1908 		}
1909 		return (EAGAIN);
1910 	}
1911 
1912 	/*
1913 	 * Check for bad sizes
1914 	 */
1915 	if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size ||
1916 	    (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) {
1917 		panic("segvn_unmap");
1918 		/*NOTREACHED*/
1919 	}
1920 
1921 	if (seg->s_szc != 0) {
1922 		size_t pgsz = page_get_pagesize(seg->s_szc);
1923 		int err;
1924 		if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
1925 			ASSERT(seg->s_base != addr || seg->s_size != len);
1926 			if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
1927 				ASSERT(svd->amp == NULL);
1928 				ASSERT(svd->tr_state == SEGVN_TR_OFF);
1929 				hat_leave_region(seg->s_as->a_hat,
1930 				    svd->rcookie, HAT_REGION_TEXT);
1931 				svd->rcookie = HAT_INVALID_REGION_COOKIE;
1932 				/*
1933 				 * could pass a flag to segvn_demote_range()
1934 				 * below to tell it not to do any unloads but
1935 				 * this case is rare enough to not bother for
1936 				 * now.
1937 				 */
1938 			} else if (svd->tr_state == SEGVN_TR_INIT) {
1939 				svd->tr_state = SEGVN_TR_OFF;
1940 			} else if (svd->tr_state == SEGVN_TR_ON) {
1941 				ASSERT(svd->amp != NULL);
1942 				segvn_textunrepl(seg, 1);
1943 				ASSERT(svd->amp == NULL);
1944 				ASSERT(svd->tr_state == SEGVN_TR_OFF);
1945 			}
1946 			VM_STAT_ADD(segvnvmstats.demoterange[0]);
1947 			err = segvn_demote_range(seg, addr, len, SDR_END, 0);
1948 			if (err == 0) {
1949 				return (IE_RETRY);
1950 			}
1951 			return (err);
1952 		}
1953 	}
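	/*
	 * For example (hypothetical page sizes): unmapping a single 8K page
	 * from the interior of a segment backed by 4M large pages is not
	 * large-page aligned, so segvn_demote_range() above first splits the
	 * affected large pages down to the base page size and the caller is
	 * asked to retry the unmap (IE_RETRY).
	 */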
1954 
1955 	/* Inform the vnode of the unmapping. */
1956 	if (svd->vp) {
1957 		int error;
1958 
1959 		error = VOP_DELMAP(svd->vp,
1960 		    (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base),
1961 		    seg->s_as, addr, len, svd->prot, svd->maxprot,
1962 		    svd->type, svd->cred, NULL);
1963 
1964 		if (error == EAGAIN)
1965 			return (error);
1966 	}
1967 
1968 	/*
1969 	 * Remove any page locks set through this mapping.
1970 	 * If text replication is not off, no page locks could have
1971 	 * been established via this mapping.
1972 	 */
1973 	if (svd->tr_state == SEGVN_TR_OFF) {
1974 		(void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0);
1975 	}
1976 
1977 	if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
1978 		ASSERT(svd->amp == NULL);
1979 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
1980 		ASSERT(svd->type == MAP_PRIVATE);
1981 		hat_leave_region(seg->s_as->a_hat, svd->rcookie,
1982 		    HAT_REGION_TEXT);
1983 		svd->rcookie = HAT_INVALID_REGION_COOKIE;
1984 	} else if (svd->tr_state == SEGVN_TR_ON) {
1985 		ASSERT(svd->amp != NULL);
1986 		ASSERT(svd->pageprot == 0 && !(svd->prot & PROT_WRITE));
1987 		segvn_textunrepl(seg, 1);
1988 		ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
1989 	} else {
1990 		if (svd->tr_state != SEGVN_TR_OFF) {
1991 			ASSERT(svd->tr_state == SEGVN_TR_INIT);
1992 			svd->tr_state = SEGVN_TR_OFF;
1993 		}
1994 		/*
1995 		 * Unload any hardware translations in the range to be taken
1996 		 * out. Use a callback to invoke free_vp_pages() effectively.
1997 		 */
1998 		if (svd->vp != NULL && free_pages != 0) {
1999 			callback.hcb_data = seg;
2000 			callback.hcb_function = segvn_hat_unload_callback;
2001 			cbp = &callback;
2002 		}
2003 		hat_unload_callback(seg->s_as->a_hat, addr, len,
2004 		    HAT_UNLOAD_UNMAP, cbp);
2005 
2006 		if (svd->type == MAP_SHARED && svd->vp != NULL &&
2007 		    (svd->vp->v_flag & VVMEXEC) &&
2008 		    ((svd->prot & PROT_WRITE) || svd->pageprot)) {
2009 			segvn_inval_trcache(svd->vp);
2010 		}
2011 	}
2012 
2013 	/*
2014 	 * Check for entire segment
2015 	 */
2016 	if (addr == seg->s_base && len == seg->s_size) {
2017 		seg_free(seg);
2018 		return (0);
2019 	}
2020 
2021 	opages = seg_pages(seg);
2022 	dpages = btop(len);
2023 	npages = opages - dpages;
2024 	amp = svd->amp;
2025 	ASSERT(amp == NULL || amp->a_szc >= seg->s_szc);
2026 
2027 	/*
2028 	 * Check for beginning of segment
2029 	 */
2030 	if (addr == seg->s_base) {
2031 		if (svd->vpage != NULL) {
2032 			size_t nbytes;
2033 			struct vpage *ovpage;
2034 
2035 			ovpage = svd->vpage;	/* keep pointer to vpage */
2036 
2037 			nbytes = vpgtob(npages);
2038 			svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
2039 			bcopy(&ovpage[dpages], svd->vpage, nbytes);
2040 
2041 			/* free up old vpage */
2042 			kmem_free(ovpage, vpgtob(opages));
2043 		}
2044 		if (amp != NULL) {
2045 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2046 			if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
2047 				/*
2048 				 * Shared anon map is no longer in use. Before
2049 				 * freeing its pages purge all entries from
2050 				 * pcache that belong to this amp.
2051 				 */
2052 				if (svd->type == MAP_SHARED) {
2053 					ASSERT(amp->refcnt == 1);
2054 					ASSERT(svd->softlockcnt == 0);
2055 					anonmap_purge(amp);
2056 				}
2057 				/*
2058 				 * Free up now unused parts of anon_map array.
2059 				 */
2060 				if (amp->a_szc == seg->s_szc) {
2061 					if (seg->s_szc != 0) {
2062 						anon_free_pages(amp->ahp,
2063 						    svd->anon_index, len,
2064 						    seg->s_szc);
2065 					} else {
2066 						anon_free(amp->ahp,
2067 						    svd->anon_index,
2068 						    len);
2069 					}
2070 				} else {
2071 					ASSERT(svd->type == MAP_SHARED);
2072 					ASSERT(amp->a_szc > seg->s_szc);
2073 					anon_shmap_free_pages(amp,
2074 					    svd->anon_index, len);
2075 				}
2076 
2077 				/*
2078 				 * Unreserve swap space for the
2079 				 * unmapped chunk of this segment in
2080 				 * case it's MAP_SHARED
2081 				 */
2082 				if (svd->type == MAP_SHARED) {
2083 					anon_unresv_zone(len,
2084 					    seg->s_as->a_proc->p_zone);
2085 					amp->swresv -= len;
2086 				}
2087 			}
2088 			ANON_LOCK_EXIT(&amp->a_rwlock);
2089 			svd->anon_index += dpages;
2090 		}
2091 		if (svd->vp != NULL)
2092 			svd->offset += len;
2093 
2094 		seg->s_base += len;
2095 		seg->s_size -= len;
2096 
2097 		if (svd->swresv) {
2098 			if (svd->flags & MAP_NORESERVE) {
2099 				ASSERT(amp);
2100 				oswresv = svd->swresv;
2101 
2102 				svd->swresv = ptob(anon_pages(amp->ahp,
2103 				    svd->anon_index, npages));
2104 				anon_unresv_zone(oswresv - svd->swresv,
2105 				    seg->s_as->a_proc->p_zone);
2106 				if (SEG_IS_PARTIAL_RESV(seg))
2107 					seg->s_as->a_resvsize -= oswresv -
2108 					    svd->swresv;
2109 			} else {
2110 				size_t unlen;
2111 
2112 				if (svd->pageswap) {
2113 					oswresv = svd->swresv;
2114 					svd->swresv =
2115 					    segvn_count_swap_by_vpages(seg);
2116 					ASSERT(oswresv >= svd->swresv);
2117 					unlen = oswresv - svd->swresv;
2118 				} else {
2119 					svd->swresv -= len;
2120 					ASSERT(svd->swresv == seg->s_size);
2121 					unlen = len;
2122 				}
2123 				anon_unresv_zone(unlen,
2124 				    seg->s_as->a_proc->p_zone);
2125 			}
2126 			TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
2127 			    seg, len, 0);
2128 		}
2129 
2130 		return (0);
2131 	}
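	/*
	 * Worked example for the MAP_NORESERVE arithmetic above
	 * (hypothetical numbers): a 10-page segment with 4 allocated anon
	 * slots has swresv == ptob(4). Unmapping its first 5 pages, 1 of
	 * which was allocated, leaves anon_pages() == 3, so
	 *
	 *	oswresv - svd->swresv == ptob(4) - ptob(3) == ptob(1)
	 *
	 * and exactly one page of swap is unreserved.
	 */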
2132 
2133 	/*
2134 	 * Check for end of segment
2135 	 */
2136 	if (addr + len == seg->s_base + seg->s_size) {
2137 		if (svd->vpage != NULL) {
2138 			size_t nbytes;
2139 			struct vpage *ovpage;
2140 
2141 			ovpage = svd->vpage;	/* keep pointer to vpage */
2142 
2143 			nbytes = vpgtob(npages);
2144 			svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
2145 			bcopy(ovpage, svd->vpage, nbytes);
2146 
2147 			/* free up old vpage */
2148 			kmem_free(ovpage, vpgtob(opages));
2149 
2150 		}
2151 		if (amp != NULL) {
2152 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2153 			if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
2154 				/*
2155 				 * Free up now unused parts of anon_map array.
2156 				 */
2157 				ulong_t an_idx = svd->anon_index + npages;
2158 
2159 				/*
2160 				 * Shared anon map is no longer in use. Before
2161 				 * freeing its pages purge all entries from
2162 				 * pcache that belong to this amp.
2163 				 */
2164 				if (svd->type == MAP_SHARED) {
2165 					ASSERT(amp->refcnt == 1);
2166 					ASSERT(svd->softlockcnt == 0);
2167 					anonmap_purge(amp);
2168 				}
2169 
2170 				if (amp->a_szc == seg->s_szc) {
2171 					if (seg->s_szc != 0) {
2172 						anon_free_pages(amp->ahp,
2173 						    an_idx, len,
2174 						    seg->s_szc);
2175 					} else {
2176 						anon_free(amp->ahp, an_idx,
2177 						    len);
2178 					}
2179 				} else {
2180 					ASSERT(svd->type == MAP_SHARED);
2181 					ASSERT(amp->a_szc > seg->s_szc);
2182 					anon_shmap_free_pages(amp,
2183 					    an_idx, len);
2184 				}
2185 
2186 				/*
2187 				 * Unreserve swap space for the
2188 				 * unmapped chunk of this segment in
2189 				 * case it's MAP_SHARED
2190 				 */
2191 				if (svd->type == MAP_SHARED) {
2192 					anon_unresv_zone(len,
2193 					    seg->s_as->a_proc->p_zone);
2194 					amp->swresv -= len;
2195 				}
2196 			}
2197 			ANON_LOCK_EXIT(&amp->a_rwlock);
2198 		}
2199 
2200 		seg->s_size -= len;
2201 
2202 		if (svd->swresv) {
2203 			if (svd->flags & MAP_NORESERVE) {
2204 				ASSERT(amp);
2205 				oswresv = svd->swresv;
2206 				svd->swresv = ptob(anon_pages(amp->ahp,
2207 				    svd->anon_index, npages));
2208 				anon_unresv_zone(oswresv - svd->swresv,
2209 				    seg->s_as->a_proc->p_zone);
2210 				if (SEG_IS_PARTIAL_RESV(seg))
2211 					seg->s_as->a_resvsize -= oswresv -
2212 					    svd->swresv;
2213 			} else {
2214 				size_t unlen;
2215 
2216 				if (svd->pageswap) {
2217 					oswresv = svd->swresv;
2218 					svd->swresv =
2219 					    segvn_count_swap_by_vpages(seg);
2220 					ASSERT(oswresv >= svd->swresv);
2221 					unlen = oswresv - svd->swresv;
2222 				} else {
2223 					svd->swresv -= len;
2224 					ASSERT(svd->swresv == seg->s_size);
2225 					unlen = len;
2226 				}
2227 				anon_unresv_zone(unlen,
2228 				    seg->s_as->a_proc->p_zone);
2229 			}
2230 			TRACE_3(TR_FAC_VM, TR_ANON_PROC,
2231 			    "anon proc:%p %lu %u", seg, len, 0);
2232 		}
2233 
2234 		return (0);
2235 	}
2236 
2237 	/*
2238 	 * The section to go is in the middle of the segment, so we
2239 	 * have to make it into two segments.  nseg is made for
2240 	 * the high end while seg is cut down at the low end.
2241 	 */
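	/*
	 * Pictorially (illustrative):
	 *
	 *	before:	[ seg ................................... ]
	 *		s_base       addr       addr+len    s_base+s_size
	 *	after:	[ seg ...... ]          [ nseg .... ]
	 */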
2242 	nbase = addr + len;				/* new seg base */
2243 	nsize = (seg->s_base + seg->s_size) - nbase;	/* new seg size */
2244 	seg->s_size = addr - seg->s_base;		/* shrink old seg */
2245 	nseg = seg_alloc(seg->s_as, nbase, nsize);
2246 	if (nseg == NULL) {
2247 		panic("segvn_unmap seg_alloc");
2248 		/*NOTREACHED*/
2249 	}
2250 	nseg->s_ops = seg->s_ops;
2251 	nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
2252 	nseg->s_data = (void *)nsvd;
2253 	nseg->s_szc = seg->s_szc;
2254 	*nsvd = *svd;
2255 	nsvd->seg = nseg;
2256 	nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base);
2257 	nsvd->swresv = 0;
2258 	nsvd->softlockcnt = 0;
2259 	nsvd->softlockcnt_sbase = 0;
2260 	nsvd->softlockcnt_send = 0;
2261 	nsvd->svn_inz = svd->svn_inz;
2262 	ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
2263 
2264 	if (svd->vp != NULL) {
2265 		VN_HOLD(nsvd->vp);
2266 		if (nsvd->type == MAP_SHARED)
2267 			lgrp_shm_policy_init(NULL, nsvd->vp);
2268 	}
2269 	crhold(svd->cred);
2270 
2271 	if (svd->vpage == NULL) {
2272 		nsvd->vpage = NULL;
2273 	} else {
2274 		/* need to split vpage into two arrays */
2275 		size_t nbytes;
2276 		struct vpage *ovpage;
2277 
2278 		ovpage = svd->vpage;		/* keep pointer to vpage */
2279 
2280 		npages = seg_pages(seg);	/* seg has shrunk */
2281 		nbytes = vpgtob(npages);
2282 		svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
2283 
2284 		bcopy(ovpage, svd->vpage, nbytes);
2285 
2286 		npages = seg_pages(nseg);
2287 		nbytes = vpgtob(npages);
2288 		nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);
2289 
2290 		bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes);
2291 
2292 		/* free up old vpage */
2293 		kmem_free(ovpage, vpgtob(opages));
2294 	}
2295 
2296 	if (amp == NULL) {
2297 		nsvd->amp = NULL;
2298 		nsvd->anon_index = 0;
2299 	} else {
2300 		/*
2301 		 * Need to create a new anon map for the new segment.
2302 		 * We'll also allocate a new smaller array for the old
2303 		 * smaller segment to save space.
2304 		 */
2305 		opages = btop((uintptr_t)(addr - seg->s_base));
2306 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2307 		if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
2308 			/*
2309 			 * Free up now unused parts of anon_map array.
2310 			 */
2311 			ulong_t an_idx = svd->anon_index + opages;
2312 
2313 			/*
2314 			 * Shared anon map is no longer in use. Before
2315 			 * freeing its pages purge all entries from
2316 			 * pcache that belong to this amp.
2317 			 */
2318 			if (svd->type == MAP_SHARED) {
2319 				ASSERT(amp->refcnt == 1);
2320 				ASSERT(svd->softlockcnt == 0);
2321 				anonmap_purge(amp);
2322 			}
2323 
2324 			if (amp->a_szc == seg->s_szc) {
2325 				if (seg->s_szc != 0) {
2326 					anon_free_pages(amp->ahp, an_idx, len,
2327 					    seg->s_szc);
2328 				} else {
2329 					anon_free(amp->ahp, an_idx,
2330 					    len);
2331 				}
2332 			} else {
2333 				ASSERT(svd->type == MAP_SHARED);
2334 				ASSERT(amp->a_szc > seg->s_szc);
2335 				anon_shmap_free_pages(amp, an_idx, len);
2336 			}
2337 
2338 			/*
2339 			 * Unreserve swap space for the
2340 			 * unmapped chunk of this segment in
2341 			 * case it's MAP_SHARED
2342 			 */
2343 			if (svd->type == MAP_SHARED) {
2344 				anon_unresv_zone(len,
2345 				    seg->s_as->a_proc->p_zone);
2346 				amp->swresv -= len;
2347 			}
2348 		}
2349 		nsvd->anon_index = svd->anon_index +
2350 		    btop((uintptr_t)(nseg->s_base - seg->s_base));
2351 		if (svd->type == MAP_SHARED) {
2352 			amp->refcnt++;
2353 			nsvd->amp = amp;
2354 		} else {
2355 			struct anon_map *namp;
2356 			struct anon_hdr *nahp;
2357 
2358 			ASSERT(svd->type == MAP_PRIVATE);
2359 			nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
2360 			namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP);
2361 			namp->a_szc = seg->s_szc;
2362 			(void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp,
2363 			    0, btop(seg->s_size), ANON_SLEEP);
2364 			(void) anon_copy_ptr(amp->ahp, nsvd->anon_index,
2365 			    namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);
2366 			anon_release(amp->ahp, btop(amp->size));
2367 			svd->anon_index = 0;
2368 			nsvd->anon_index = 0;
2369 			amp->ahp = nahp;
2370 			amp->size = seg->s_size;
2371 			nsvd->amp = namp;
2372 		}
2373 		ANON_LOCK_EXIT(&amp->a_rwlock);
2374 	}
2375 	if (svd->swresv) {
2376 		if (svd->flags & MAP_NORESERVE) {
2377 			ASSERT(amp);
2378 			oswresv = svd->swresv;
2379 			svd->swresv = ptob(anon_pages(amp->ahp,
2380 			    svd->anon_index, btop(seg->s_size)));
2381 			nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
2382 			    nsvd->anon_index, btop(nseg->s_size)));
2383 			ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
2384 			anon_unresv_zone(oswresv - (svd->swresv + nsvd->swresv),
2385 			    seg->s_as->a_proc->p_zone);
2386 			if (SEG_IS_PARTIAL_RESV(seg))
2387 				seg->s_as->a_resvsize -= oswresv -
2388 				    (svd->swresv + nsvd->swresv);
2389 		} else {
2390 			size_t unlen;
2391 
2392 			if (svd->pageswap) {
2393 				oswresv = svd->swresv;
2394 				svd->swresv = segvn_count_swap_by_vpages(seg);
2395 				nsvd->swresv = segvn_count_swap_by_vpages(nseg);
2396 				ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
2397 				unlen = oswresv - (svd->swresv + nsvd->swresv);
2398 			} else {
2399 				if (seg->s_size + nseg->s_size + len !=
2400 				    svd->swresv) {
2401 					panic("segvn_unmap: cannot split "
2402 					    "swap reservation");
2403 					/*NOTREACHED*/
2404 				}
2405 				svd->swresv = seg->s_size;
2406 				nsvd->swresv = nseg->s_size;
2407 				unlen = len;
2408 			}
2409 			anon_unresv_zone(unlen,
2410 			    seg->s_as->a_proc->p_zone);
2411 		}
2412 		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
2413 		    seg, len, 0);
2414 	}
2415 
2416 	return (0);			/* I'm glad that's all over with! */
2417 }
2418 
2419 static void
2420 segvn_free(struct seg *seg)
2421 {
2422 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
2423 	pgcnt_t npages = seg_pages(seg);
2424 	struct anon_map *amp;
2425 	size_t len;
2426 
2427 	/*
2428 	 * We don't need any segment level locks for "segvn" data
2429 	 * since the address space is "write" locked.
2430 	 */
2431 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
2432 	ASSERT(svd->tr_state == SEGVN_TR_OFF);
2433 
2434 	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
2435 
2436 	/*
2437 	 * Be sure to unlock pages. XXX Why do things get free'ed instead
2438 	 * of unmapped? XXX
2439 	 */
2440 	(void) segvn_lockop(seg, seg->s_base, seg->s_size,
2441 	    0, MC_UNLOCK, NULL, 0);
2442 
2443 	/*
2444 	 * Deallocate the vpage and anon pointers if necessary and possible.
2445 	 */
2446 	if (svd->vpage != NULL) {
2447 		kmem_free(svd->vpage, vpgtob(npages));
2448 		svd->vpage = NULL;
2449 	}
2450 	if ((amp = svd->amp) != NULL) {
2451 		/*
2452 		 * If there are no more references to this anon_map
2453 		 * structure, then deallocate the structure after freeing
2454 		 * up all the anon slot pointers that we can.
2455 		 */
2456 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2457 		ASSERT(amp->a_szc >= seg->s_szc);
2458 		if (--amp->refcnt == 0) {
2459 			if (svd->type == MAP_PRIVATE) {
2460 				/*
2461 				 * Private - we only need to anon_free
2462 				 * the part that this segment refers to.
2463 				 */
2464 				if (seg->s_szc != 0) {
2465 					anon_free_pages(amp->ahp,
2466 					    svd->anon_index, seg->s_size,
2467 					    seg->s_szc);
2468 				} else {
2469 					anon_free(amp->ahp, svd->anon_index,
2470 					    seg->s_size);
2471 				}
2472 			} else {
2473 
2474 				/*
2475 				 * Shared anon map is no longer in use. Before
2476 				 * freeing its pages purge all entries from
2477 				 * pcache that belong to this amp.
2478 				 */
2479 				ASSERT(svd->softlockcnt == 0);
2480 				anonmap_purge(amp);
2481 
2482 				/*
2483 				 * Shared - anon_free the entire
2484 				 * anon_map's worth of stuff and
2485 				 * release any swap reservation.
2486 				 */
2487 				if (amp->a_szc != 0) {
2488 					anon_shmap_free_pages(amp, 0,
2489 					    amp->size);
2490 				} else {
2491 					anon_free(amp->ahp, 0, amp->size);
2492 				}
2493 				if ((len = amp->swresv) != 0) {
2494 					anon_unresv_zone(len,
2495 					    seg->s_as->a_proc->p_zone);
2496 					TRACE_3(TR_FAC_VM, TR_ANON_PROC,
2497 					    "anon proc:%p %lu %u", seg, len, 0);
2498 				}
2499 			}
2500 			svd->amp = NULL;
2501 			ANON_LOCK_EXIT(&amp->a_rwlock);
2502 			anonmap_free(amp);
2503 		} else if (svd->type == MAP_PRIVATE) {
2504 			/*
2505 			 * We had a private mapping which still has
2506 			 * a held anon_map so just free up all the
2507 			 * anon slot pointers that we were using.
2508 			 */
2509 			if (seg->s_szc != 0) {
2510 				anon_free_pages(amp->ahp, svd->anon_index,
2511 				    seg->s_size, seg->s_szc);
2512 			} else {
2513 				anon_free(amp->ahp, svd->anon_index,
2514 				    seg->s_size);
2515 			}
2516 			ANON_LOCK_EXIT(&amp->a_rwlock);
2517 		} else {
2518 			ANON_LOCK_EXIT(&amp->a_rwlock);
2519 		}
2520 	}
2521 
2522 	/*
2523 	 * Release swap reservation.
2524 	 */
2525 	if ((len = svd->swresv) != 0) {
2526 		anon_unresv_zone(svd->swresv,
2527 		    seg->s_as->a_proc->p_zone);
2528 		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
2529 		    seg, len, 0);
2530 		if (SEG_IS_PARTIAL_RESV(seg))
2531 			seg->s_as->a_resvsize -= svd->swresv;
2532 		svd->swresv = 0;
2533 	}
2534 	/*
2535 	 * Release claim on vnode, credentials, and finally free the
2536 	 * private data.
2537 	 */
2538 	if (svd->vp != NULL) {
2539 		if (svd->type == MAP_SHARED)
2540 			lgrp_shm_policy_fini(NULL, svd->vp);
2541 		VN_RELE(svd->vp);
2542 		svd->vp = NULL;
2543 	}
2544 	crfree(svd->cred);
2545 	svd->pageprot = 0;
2546 	svd->pageadvice = 0;
2547 	svd->pageswap = 0;
2548 	svd->cred = NULL;
2549 
2550 	/*
2551 	 * Take the segfree_syncmtx lock to let segvn_reclaim() finish if it's
2552 	 * still working with this segment without holding the as lock (in case
2553 	 * it's called by the pcache async thread).
2554 	 */
2555 	ASSERT(svd->softlockcnt == 0);
2556 	mutex_enter(&svd->segfree_syncmtx);
2557 	mutex_exit(&svd->segfree_syncmtx);
2558 
2559 	seg->s_data = NULL;
2560 	kmem_cache_free(segvn_cache, svd);
2561 }
2562 
2563 /*
2564  * Do a F_SOFTUNLOCK call over the range requested.  The range must have
2565  * already been F_SOFTLOCK'ed.
2566  * Caller must always match addr and len of a softunlock with a previous
2567  * softlock with exactly the same addr and len.
2568  */
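/*
 * A minimal sketch of the expected pairing (illustrative only; error
 * handling omitted). A caller that soft-locked a range through as_fault()
 * must later unlock the identical range, which ends up here:
 *
 *	(void) as_fault(as->a_hat, as, addr, len, F_SOFTLOCK, S_WRITE);
 *	... perform I/O on the locked pages ...
 *	(void) as_fault(as->a_hat, as, addr, len, F_SOFTUNLOCK, S_WRITE);
 */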
2569 static void
2570 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
2571 {
2572 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
2573 	page_t *pp;
2574 	caddr_t adr;
2575 	struct vnode *vp;
2576 	u_offset_t offset;
2577 	ulong_t anon_index = 0;
2578 	struct anon_map *amp;
2579 	struct anon *ap = NULL;
2580 
2581 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2582 	ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
2583 
2584 	if ((amp = svd->amp) != NULL)
2585 		anon_index = svd->anon_index + seg_page(seg, addr);
2586 
2587 	if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
2588 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
2589 		hat_unlock_region(seg->s_as->a_hat, addr, len, svd->rcookie);
2590 	} else {
2591 		hat_unlock(seg->s_as->a_hat, addr, len);
2592 	}
2593 	for (adr = addr; adr < addr + len; adr += PAGESIZE) {
2594 		if (amp != NULL) {
2595 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2596 			if ((ap = anon_get_ptr(amp->ahp, anon_index++))
2597 			    != NULL) {
2598 				swap_xlate(ap, &vp, &offset);
2599 			} else {
2600 				vp = svd->vp;
2601 				offset = svd->offset +
2602 				    (uintptr_t)(adr - seg->s_base);
2603 			}
2604 			ANON_LOCK_EXIT(&amp->a_rwlock);
2605 		} else {
2606 			vp = svd->vp;
2607 			offset = svd->offset +
2608 			    (uintptr_t)(adr - seg->s_base);
2609 		}
2610 
2611 		/*
2612 		 * Use page_find() instead of page_lookup() to
2613 		 * find the page since we know that it is locked.
2614 		 */
2615 		pp = page_find(vp, offset);
2616 		if (pp == NULL) {
2617 			panic(
2618 			    "segvn_softunlock: addr %p, ap %p, vp %p, off %llx",
2619 			    (void *)adr, (void *)ap, (void *)vp, offset);
2620 			/*NOTREACHED*/
2621 		}
2622 
2623 		if (rw == S_WRITE) {
2624 			hat_setrefmod(pp);
2625 			if (seg->s_as->a_vbits)
2626 				hat_setstat(seg->s_as, adr, PAGESIZE,
2627 				    P_REF | P_MOD);
2628 		} else if (rw != S_OTHER) {
2629 			hat_setref(pp);
2630 			if (seg->s_as->a_vbits)
2631 				hat_setstat(seg->s_as, adr, PAGESIZE, P_REF);
2632 		}
2633 		TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT,
2634 		    "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset);
2635 		page_unlock(pp);
2636 	}
2637 	ASSERT(svd->softlockcnt >= btop(len));
2638 	if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -btop(len))) {
2639 		/*
2640 		 * All SOFTLOCKS are gone. Wake up any waiting
2641 		 * unmappers so they can try again to unmap.
2642 		 * Check for waiters first without the mutex
2643 		 * held so we don't always grab the mutex on
2644 		 * softunlocks.
2645 		 */
2646 		if (AS_ISUNMAPWAIT(seg->s_as)) {
2647 			mutex_enter(&seg->s_as->a_contents);
2648 			if (AS_ISUNMAPWAIT(seg->s_as)) {
2649 				AS_CLRUNMAPWAIT(seg->s_as);
2650 				cv_broadcast(&seg->s_as->a_cv);
2651 			}
2652 			mutex_exit(&seg->s_as->a_contents);
2653 		}
2654 	}
2655 }
2656 
2657 #define	PAGE_HANDLED	((page_t *)-1)
2658 
2659 /*
2660  * Release all the pages in the NULL terminated ppp list
2661  * which haven't already been converted to PAGE_HANDLED.
2662  */
2663 static void
2664 segvn_pagelist_rele(page_t **ppp)
2665 {
2666 	for (; *ppp != NULL; ppp++) {
2667 		if (*ppp != PAGE_HANDLED)
2668 			page_unlock(*ppp);
2669 	}
2670 }
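
/*
 * The PAGE_HANDLED sentinel is how segvn_faultpage() marks a pl[] entry it
 * has consumed; once a translation is loaded for *ppp it stores
 *
 *	*ppp = PAGE_HANDLED;
 *
 * so a later segvn_pagelist_rele() on the same list skips that slot.
 */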
2671 
2672 static int stealcow = 1;
2673 
2674 /*
2675  * Workaround for viking chip bug.  See bug id 1220902.
2676  * To fix this down in pagefault() would require importing so
2677  * Fixing this down in pagefault() would require importing so
2678  * much of the as and segvn code as to be unmaintainable.
2679 int enable_mbit_wa = 0;
2680 
2681 /*
2682  * Handles all the dirty work of getting the right
2683  * anonymous pages and loading up the translations.
2684  * This routine is called only from segvn_fault()
2685  * when looping over the range of addresses requested.
2686  *
2687  * The basic algorithm here is:
2688  * 	If this is an anon_zero case
2689  *		Call anon_zero to allocate page
2690  *		Load up translation
2691  *		Return
2692  *	endif
2693  *	If this is an anon page
2694  *		Use anon_getpage to get the page
2695  *	else
2696  *		Find page in pl[] list passed in
2697  *	endif
2698  *	If not a cow
2699  *		Load up the translation to the page
2700  *		return
2701  *	endif
2702  *	Call anon_private to handle cow
2703  *	Load up (writable) translation to new page
2704  */
2705 static faultcode_t
2706 segvn_faultpage(
2707 	struct hat *hat,		/* the hat to use for mapping */
2708 	struct seg *seg,		/* seg_vn of interest */
2709 	caddr_t addr,			/* address in as */
2710 	u_offset_t off,			/* offset in vp */
2711 	struct vpage *vpage,		/* pointer to vpage for vp, off */
2712 	page_t *pl[],			/* object source page pointer */
2713 	uint_t vpprot,			/* access allowed to object pages */
2714 	enum fault_type type,		/* type of fault */
2715 	enum seg_rw rw,			/* type of access at fault */
2716 	int brkcow)			/* we may need to break cow */
2717 {
2718 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
2719 	page_t *pp, **ppp;
2720 	uint_t pageflags = 0;
2721 	page_t *anon_pl[1 + 1];
2722 	page_t *opp = NULL;		/* original page */
2723 	uint_t prot;
2724 	int err;
2725 	int cow;
2726 	int claim;
2727 	int steal = 0;
2728 	ulong_t anon_index = 0;
2729 	struct anon *ap, *oldap;
2730 	struct anon_map *amp;
2731 	int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
2732 	int anon_lock = 0;
2733 	anon_sync_obj_t cookie;
2734 
2735 	if (svd->flags & MAP_TEXT) {
2736 		hat_flag |= HAT_LOAD_TEXT;
2737 	}
2738 
2739 	ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock));
2740 	ASSERT(seg->s_szc == 0);
2741 	ASSERT(svd->tr_state != SEGVN_TR_INIT);
2742 
2743 	/*
2744 	 * Initialize protection value for this page.
2745 	 * If we have per-page protection values, check them now.
2746 	 */
2747 	if (svd->pageprot) {
2748 		uint_t protchk;
2749 
2750 		switch (rw) {
2751 		case S_READ:
2752 			protchk = PROT_READ;
2753 			break;
2754 		case S_WRITE:
2755 			protchk = PROT_WRITE;
2756 			break;
2757 		case S_EXEC:
2758 			protchk = PROT_EXEC;
2759 			break;
2760 		case S_OTHER:
2761 		default:
2762 			protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
2763 			break;
2764 		}
2765 
2766 		prot = VPP_PROT(vpage);
2767 		if ((prot & protchk) == 0)
2768 			return (FC_PROT);	/* illegal access type */
2769 	} else {
2770 		prot = svd->prot;
2771 	}
2772 
2773 	if (type == F_SOFTLOCK) {
2774 		atomic_inc_ulong((ulong_t *)&svd->softlockcnt);
2775 	}
2776 
2777 	/*
2778 	 * Always acquire the anon array lock to prevent 2 threads from
2779 	 * allocating separate anon slots for the same "addr".
2780 	 */
2781 
2782 	if ((amp = svd->amp) != NULL) {
2783 		ASSERT(RW_READ_HELD(&amp->a_rwlock));
2784 		anon_index = svd->anon_index + seg_page(seg, addr);
2785 		anon_array_enter(amp, anon_index, &cookie);
2786 		anon_lock = 1;
2787 	}
2788 
2789 	if (svd->vp == NULL && amp != NULL) {
2790 		if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) {
2791 			/*
2792 			 * Allocate a (normally) writable anonymous page of
2793 			 * zeroes. If no advance reservations, reserve now.
2794 			 */
2795 			if (svd->flags & MAP_NORESERVE) {
2796 				if (anon_resv_zone(ptob(1),
2797 				    seg->s_as->a_proc->p_zone)) {
2798 					atomic_add_long(&svd->swresv, ptob(1));
2799 					atomic_add_long(&seg->s_as->a_resvsize,
2800 					    ptob(1));
2801 				} else {
2802 					err = ENOMEM;
2803 					goto out;
2804 				}
2805 			}
2806 			if ((pp = anon_zero(seg, addr, &ap,
2807 			    svd->cred)) == NULL) {
2808 				err = ENOMEM;
2809 				goto out;	/* out of swap space */
2810 			}
2811 			/*
2812 			 * Re-acquire the anon_map lock and
2813 			 * initialize the anon array entry.
2814 			 */
2815 			(void) anon_set_ptr(amp->ahp, anon_index, ap,
2816 			    ANON_SLEEP);
2817 
2818 			ASSERT(pp->p_szc == 0);
2819 
2820 			/*
2821 			 * Handle pages that have been marked for migration
2822 			 */
2823 			if (lgrp_optimizations())
2824 				page_migrate(seg, addr, &pp, 1);
2825 
2826 			if (enable_mbit_wa) {
2827 				if (rw == S_WRITE)
2828 					hat_setmod(pp);
2829 				else if (!hat_ismod(pp))
2830 					prot &= ~PROT_WRITE;
2831 			}
2832 			/*
2833 			 * If AS_PAGLCK is set in a_flags (via memcntl(2)
2834 			 * with MC_LOCKAS, MCL_FUTURE) and this is a
2835 			 * MAP_NORESERVE segment, we may need to
2836 			 * permanently lock the page as it is being faulted
2837 			 * for the first time. The following text applies
2838 			 * only to MAP_NORESERVE segments:
2839 			 *
2840 			 * As per memcntl(2), if this segment was created
2841 			 * after MCL_FUTURE was applied (a "future"
2842 			 * segment), its pages must be locked.  If this
2843 			 * segment existed at MCL_FUTURE application (a
2844 			 * "past" segment), the interface is unclear.
2845 			 *
2846 			 * We decide to lock only if vpage is present:
2847 			 *
2848 			 * - "future" segments will have a vpage array (see
2849 			 *    as_map), and so will be locked as required
2850 			 *
2851 			 * - "past" segments may not have a vpage array,
2852 			 *    depending on whether events (such as
2853 			 *    mprotect) have occurred. Locking if vpage
2854 			 *    exists will preserve legacy behavior.  Not
2855 			 *    locking if vpage is absent, will not break
2856 			 *    the interface or legacy behavior.  Note that
2857 			 *    allocating vpage here if it's absent requires
2858 			 *    upgrading the segvn reader lock, the cost of
2859 			 *    which does not seem worthwhile.
2860 			 *
2861 			 * Usually testing and setting VPP_ISPPLOCK and
2862 			 * VPP_SETPPLOCK requires holding the segvn lock as
2863 			 * writer, but in this case all readers are
2864 			 * serializing on the anon array lock.
2865 			 */
2866 			if (AS_ISPGLCK(seg->s_as) && vpage != NULL &&
2867 			    (svd->flags & MAP_NORESERVE) &&
2868 			    !VPP_ISPPLOCK(vpage)) {
2869 				proc_t *p = seg->s_as->a_proc;
2870 				ASSERT(svd->type == MAP_PRIVATE);
2871 				mutex_enter(&p->p_lock);
2872 				if (rctl_incr_locked_mem(p, NULL, PAGESIZE,
2873 				    1) == 0) {
2874 					claim = VPP_PROT(vpage) & PROT_WRITE;
2875 					if (page_pp_lock(pp, claim, 0)) {
2876 						VPP_SETPPLOCK(vpage);
2877 					} else {
2878 						rctl_decr_locked_mem(p, NULL,
2879 						    PAGESIZE, 1);
2880 					}
2881 				}
2882 				mutex_exit(&p->p_lock);
2883 			}
2884 
2885 			ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
2886 			hat_memload(hat, addr, pp, prot, hat_flag);
2887 
2888 			if (!(hat_flag & HAT_LOAD_LOCK))
2889 				page_unlock(pp);
2890 
2891 			anon_array_exit(&cookie);
2892 			return (0);
2893 		}
2894 	}
2895 
2896 	/*
2897 	 * Obtain the page structure via anon_getpage() if it is
2898 	 * a private copy of an object (the result of a previous
2899 	 * copy-on-write).
2900 	 */
2901 	if (amp != NULL) {
2902 		if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) {
2903 			err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE,
2904 			    seg, addr, rw, svd->cred);
2905 			if (err)
2906 				goto out;
2907 
2908 			if (svd->type == MAP_SHARED) {
2909 				/*
2910 				 * If this is a shared mapping to an
2911 				 * anon_map, then ignore the write
2912 				 * permissions returned by anon_getpage().
2913 				 * They apply to the private mappings
2914 				 * of this anon_map.
2915 				 */
2916 				vpprot |= PROT_WRITE;
2917 			}
2918 			opp = anon_pl[0];
2919 		}
2920 	}
2921 
2922 	/*
2923 	 * Search the pl[] list passed in if it is from the
2924 	 * original object (i.e., not a private copy).
2925 	 */
2926 	if (opp == NULL) {
2927 		/*
2928 		 * Find original page.  We must be bringing it in
2929 		 * from the list in pl[].
2930 		 */
2931 		for (ppp = pl; (opp = *ppp) != NULL; ppp++) {
2932 			if (opp == PAGE_HANDLED)
2933 				continue;
2934 			ASSERT(opp->p_vnode == svd->vp); /* XXX */
2935 			if (opp->p_offset == off)
2936 				break;
2937 		}
2938 		if (opp == NULL) {
2939 			panic("segvn_faultpage not found");
2940 			/*NOTREACHED*/
2941 		}
2942 		*ppp = PAGE_HANDLED;
2943 
2944 	}
2945 
2946 	ASSERT(PAGE_LOCKED(opp));
2947 
2948 	TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT,
2949 	    "segvn_fault:pp %p vp %p offset %llx", opp, NULL, 0);
2950 
2951 	/*
2952 	 * The fault is treated as a copy-on-write fault if a
2953 	 * write occurs on a private segment and the object
2954 	 * page (i.e., mapping) is write protected.  We assume
2955 	 * that fatal protection checks have already been made.
2956 	 */
2957 
2958 	if (brkcow) {
2959 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
2960 		cow = !(vpprot & PROT_WRITE);
2961 	} else if (svd->tr_state == SEGVN_TR_ON) {
2962 		/*
2963 		 * If we are doing text replication, COW on first touch.
2964 		 */
2965 		ASSERT(amp != NULL);
2966 		ASSERT(svd->vp != NULL);
2967 		ASSERT(rw != S_WRITE);
2968 		cow = (ap == NULL);
2969 	} else {
2970 		cow = 0;
2971 	}
2972 
2973 	/*
2974 	 * If not a copy-on-write case load the translation
2975 	 * and return.
2976 	 */
2977 	if (cow == 0) {
2978 
2979 		/*
2980 		 * Handle pages that have been marked for migration
2981 		 */
2982 		if (lgrp_optimizations())
2983 			page_migrate(seg, addr, &opp, 1);
2984 
2985 		if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) {
2986 			if (rw == S_WRITE)
2987 				hat_setmod(opp);
2988 			else if (rw != S_OTHER && !hat_ismod(opp))
2989 				prot &= ~PROT_WRITE;
2990 		}
2991 
2992 		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE ||
2993 		    (!svd->pageprot && svd->prot == (prot & vpprot)));
2994 		ASSERT(amp == NULL ||
2995 		    svd->rcookie == HAT_INVALID_REGION_COOKIE);
2996 		hat_memload_region(hat, addr, opp, prot & vpprot, hat_flag,
2997 		    svd->rcookie);
2998 
2999 		if (!(hat_flag & HAT_LOAD_LOCK))
3000 			page_unlock(opp);
3001 
3002 		if (anon_lock) {
3003 			anon_array_exit(&cookie);
3004 		}
3005 		return (0);
3006 	}
3007 
3008 	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
3009 
3010 	hat_setref(opp);
3011 
3012 	ASSERT(amp != NULL && anon_lock);
3013 
3014 	/*
3015 	 * Steal the page only if it isn't a private page
3016 	 * since stealing a private page is not worth the effort.
3017 	 */
3018 	if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL)
3019 		steal = 1;
3020 
3021 	/*
3022 	 * Steal the original page if the following conditions are true:
3023 	 *
3024 	 * We are low on memory, the page is not private, page is not large,
3025 	 * not shared, not modified, not `locked' or if we have it `locked'
3026 	 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies
3027 	 * that the page is not shared) and if it doesn't have any
3028 	 * translations. page_struct_lock isn't needed to look at p_cowcnt
3029 	 * and p_lckcnt because we first get exclusive lock on page.
3030 	 */
3031 	(void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
3032 
3033 	if (