1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2018 Joyent, Inc.
24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
25 */
26
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 /*
31 * University Copyright- Copyright (c) 1982, 1986, 1988
32 * The Regents of the University of California
33 * All Rights Reserved
34 *
35 * University Acknowledgment- Portions of this document are derived from
36 * software developed by the University of California, Berkeley, and its
37 * contributors.
38 */
39
40 /*
41 * VM - shared or copy-on-write from a vnode/anonymous memory.
42 */
43
44 #include <sys/types.h>
45 #include <sys/param.h>
46 #include <sys/t_lock.h>
47 #include <sys/errno.h>
48 #include <sys/systm.h>
49 #include <sys/mman.h>
50 #include <sys/debug.h>
51 #include <sys/cred.h>
52 #include <sys/vmsystm.h>
53 #include <sys/tuneable.h>
54 #include <sys/bitmap.h>
55 #include <sys/swap.h>
56 #include <sys/kmem.h>
57 #include <sys/sysmacros.h>
58 #include <sys/vtrace.h>
59 #include <sys/cmn_err.h>
60 #include <sys/callb.h>
61 #include <sys/vm.h>
62 #include <sys/dumphdr.h>
63 #include <sys/lgrp.h>
64
65 #include <vm/hat.h>
66 #include <vm/as.h>
67 #include <vm/seg.h>
68 #include <vm/seg_vn.h>
69 #include <vm/pvn.h>
70 #include <vm/anon.h>
71 #include <vm/page.h>
72 #include <vm/vpage.h>
73 #include <sys/proc.h>
74 #include <sys/task.h>
75 #include <sys/project.h>
76 #include <sys/zone.h>
77 #include <sys/shm_impl.h>
78
79 /*
80 * segvn_fault needs a temporary page list array. To avoid calling kmem all
81 * the time, it creates a small (PVN_GETPAGE_NUM entry) array and uses it if
82 * it can. In the rare case when this page list is not large enough, it
83 * goes and gets a large enough array from kmem.
84 *
85 * This small page list array covers either 8 pages or 64kB worth of pages -
86 * whichever is smaller.
87 */
88 #define PVN_MAX_GETPAGE_SZ 0x10000
89 #define PVN_MAX_GETPAGE_NUM 0x8
90
91 #if PVN_MAX_GETPAGE_SZ > PVN_MAX_GETPAGE_NUM * PAGESIZE
92 #define PVN_GETPAGE_SZ ptob(PVN_MAX_GETPAGE_NUM)
93 #define PVN_GETPAGE_NUM PVN_MAX_GETPAGE_NUM
94 #else
95 #define PVN_GETPAGE_SZ PVN_MAX_GETPAGE_SZ
96 #define PVN_GETPAGE_NUM btop(PVN_MAX_GETPAGE_SZ)
97 #endif
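
/*
 * For example, with a 4K base page (x86), 8 * 4K = 32K is below the 64K
 * cap, so PVN_GETPAGE_NUM is 8 and the array covers 32K.  With an 8K base
 * page (sparc), 8 * 8K = 64K, so either branch yields 8 pages / 64K.  A
 * hypothetical larger base page would be capped at 64K worth of pages.
 */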
98
99 /*
100 * Private seg op routines.
101 */
102 static int segvn_dup(struct seg *seg, struct seg *newseg);
103 static int segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
104 static void segvn_free(struct seg *seg);
105 static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
106 caddr_t addr, size_t len, enum fault_type type,
107 enum seg_rw rw);
108 static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
109 static int segvn_setprot(struct seg *seg, caddr_t addr,
110 size_t len, uint_t prot);
111 static int segvn_checkprot(struct seg *seg, caddr_t addr,
112 size_t len, uint_t prot);
113 static int segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
114 static size_t segvn_swapout(struct seg *seg);
115 static int segvn_sync(struct seg *seg, caddr_t addr, size_t len,
116 int attr, uint_t flags);
117 static size_t segvn_incore(struct seg *seg, caddr_t addr, size_t len,
118 char *vec);
119 static int segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
120 int attr, int op, ulong_t *lockmap, size_t pos);
121 static int segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
122 uint_t *protv);
123 static u_offset_t segvn_getoffset(struct seg *seg, caddr_t addr);
124 static int segvn_gettype(struct seg *seg, caddr_t addr);
125 static int segvn_getvp(struct seg *seg, caddr_t addr,
126 struct vnode **vpp);
127 static int segvn_advise(struct seg *seg, caddr_t addr, size_t len,
128 uint_t behav);
129 static void segvn_dump(struct seg *seg);
130 static int segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
131 struct page ***ppp, enum lock_type type, enum seg_rw rw);
132 static int segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
133 uint_t szc);
134 static int segvn_getmemid(struct seg *seg, caddr_t addr,
135 memid_t *memidp);
136 static lgrp_mem_policy_info_t *segvn_getpolicy(struct seg *, caddr_t);
137 static int segvn_capable(struct seg *seg, segcapability_t capable);
138 static int segvn_inherit(struct seg *, caddr_t, size_t, uint_t);
139
140 struct seg_ops segvn_ops = {
141 segvn_dup,
142 segvn_unmap,
143 segvn_free,
144 segvn_fault,
145 segvn_faulta,
146 segvn_setprot,
147 segvn_checkprot,
148 segvn_kluster,
149 segvn_swapout,
150 segvn_sync,
151 segvn_incore,
152 segvn_lockop,
153 segvn_getprot,
154 segvn_getoffset,
155 segvn_gettype,
156 segvn_getvp,
157 segvn_advise,
158 segvn_dump,
159 segvn_pagelock,
160 segvn_setpagesize,
161 segvn_getmemid,
162 segvn_getpolicy,
163 segvn_capable,
164 segvn_inherit
165 };
166
167 /*
168 * Common zfod structures, provided as a shorthand for others to use.
169 */
170 static segvn_crargs_t zfod_segvn_crargs =
171 SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
172 static segvn_crargs_t kzfod_segvn_crargs =
173 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
174 PROT_ALL & ~PROT_USER);
175 static segvn_crargs_t stack_noexec_crargs =
176 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);
177
178 caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs; /* user zfod argsp */
179 caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */
180 caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs; /* executable stack */
181 caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */
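
/*
 * Illustrative use (not from this file): these argsp pointers are what
 * callers pass to as_map() to create zero-fill-on-demand segments, e.g.
 *
 *	(void) as_map(as, addr, len, segvn_create, zfod_argsp);
 *
 * with the protections taken from the corresponding crargs structure above.
 */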
182
183 #define vpgtob(n) ((n) * sizeof (struct vpage)) /* For brevity */
184
185 size_t segvn_comb_thrshld = UINT_MAX; /* patchable -- see 1196681 */
186
187 size_t segvn_pglock_comb_thrshld = (1UL << 16); /* 64K */
188 size_t segvn_pglock_comb_balign = (1UL << 16); /* 64K */
189 uint_t segvn_pglock_comb_bshift;
190 size_t segvn_pglock_comb_palign;
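/*
 * segvn_pglock_comb_bshift and segvn_pglock_comb_palign are derived from
 * segvn_pglock_comb_balign in segvn_init() as its log2 and its size in
 * pages, respectively.
 */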
191
192 static int segvn_concat(struct seg *, struct seg *, int);
193 static int segvn_extend_prev(struct seg *, struct seg *,
194 struct segvn_crargs *, size_t);
195 static int segvn_extend_next(struct seg *, struct seg *,
196 struct segvn_crargs *, size_t);
197 static void segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
198 static void segvn_pagelist_rele(page_t **);
199 static void segvn_setvnode_mpss(vnode_t *);
200 static void segvn_relocate_pages(page_t **, page_t *);
201 static int segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
202 static int segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
203 uint_t, page_t **, page_t **, uint_t *, int *);
204 static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
205 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
206 static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
207 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
208 static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
209 u_offset_t, struct vpage *, page_t **, uint_t,
210 enum fault_type, enum seg_rw, int);
211 static void segvn_vpage(struct seg *);
212 static size_t segvn_count_swap_by_vpages(struct seg *);
213
214 static void segvn_purge(struct seg *seg);
215 static int segvn_reclaim(void *, caddr_t, size_t, struct page **,
216 enum seg_rw, int);
217 static int shamp_reclaim(void *, caddr_t, size_t, struct page **,
218 enum seg_rw, int);
219
220 static int sameprot(struct seg *, caddr_t, size_t);
221
222 static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t);
223 static int segvn_clrszc(struct seg *);
224 static struct seg *segvn_split_seg(struct seg *, caddr_t);
225 static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
226 ulong_t, uint_t);
227
228 static void segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t,
229 size_t, void *, u_offset_t);
230
231 static struct kmem_cache *segvn_cache;
232 static struct kmem_cache **segvn_szc_cache;
233
234 #ifdef VM_STATS
235 static struct segvnvmstats_str {
236 ulong_t fill_vp_pages[31];
237 ulong_t fltvnpages[49];
238 ulong_t fullszcpages[10];
239 ulong_t relocatepages[3];
240 ulong_t fltanpages[17];
241 ulong_t pagelock[2];
242 ulong_t demoterange[3];
243 } segvnvmstats;
244 #endif /* VM_STATS */
245
246 #define SDR_RANGE 1 /* demote entire range */
247 #define SDR_END 2 /* demote non aligned ends only */
248
249 #define CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) { \
250 if ((len) != 0) { \
251 lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); \
252 ASSERT(lpgaddr >= (seg)->s_base); \
253 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) + \
254 (len)), pgsz); \
255 ASSERT(lpgeaddr > lpgaddr); \
256 ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size); \
257 } else { \
258 lpgeaddr = lpgaddr = (addr); \
259 } \
260 }
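
/*
 * CALC_LPG_REGION computes the pgsz-aligned region [lpgaddr, lpgeaddr)
 * that fully contains [addr, addr + len): the start is rounded down and
 * the end rounded up to pgsz, and both are asserted to lie within the
 * segment.  A zero length collapses both bounds to addr.
 */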
261
262 /*ARGSUSED*/
263 static int
264 segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
265 {
266 struct segvn_data *svd = buf;
267
268 rw_init(&svd->lock, NULL, RW_DEFAULT, NULL);
269 mutex_init(&svd->segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
270 svd->svn_trnext = svd->svn_trprev = NULL;
271 return (0);
272 }
273
274 /*ARGSUSED1*/
275 static void
276 segvn_cache_destructor(void *buf, void *cdrarg)
277 {
278 struct segvn_data *svd = buf;
279
280 rw_destroy(&svd->lock);
281 mutex_destroy(&svd->segfree_syncmtx);
282 }
283
284 /*ARGSUSED*/
285 static int
286 svntr_cache_constructor(void *buf, void *cdrarg, int kmflags)
287 {
288 bzero(buf, sizeof (svntr_t));
289 return (0);
290 }
291
292 /*
293 * Patching this variable to non-zero allows the system to run with
294 * stacks marked as "not executable". It's a bit of a kludge, but is
295 * provided as a tweakable for platforms that export those ABIs
296 * (e.g. sparc V8) that have executable stacks enabled by default.
297 * There are also some restrictions for platforms that don't actually
298 * implement 'noexec' protections.
299 *
300 * Once enabled, the system is (therefore) unable to provide a fully
301 * ABI-compliant execution environment, though practically speaking,
302 * most everything works. The exceptions are generally some interpreters
303 * and debuggers that create executable code on the stack and jump
304 * into it (without explicitly mprotecting the address range to include
305 * PROT_EXEC).
306 *
307  * One important class of applications that gets disabled is those
308 * that have been transformed into malicious agents using one of the
309 * numerous "buffer overflow" attacks. See 4007890.
310 */
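/*
 * These have historically been tuned via /etc/system, e.g.:
 *
 *	set noexec_user_stack=1
 *	set noexec_user_stack_log=0
 */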
311 int noexec_user_stack = 0;
312 int noexec_user_stack_log = 1;
313
314 int segvn_lpg_disable = 0;
315 uint_t segvn_maxpgszc = 0;
316
317 ulong_t segvn_vmpss_clrszc_cnt;
318 ulong_t segvn_vmpss_clrszc_err;
319 ulong_t segvn_fltvnpages_clrszc_cnt;
320 ulong_t segvn_fltvnpages_clrszc_err;
321 ulong_t segvn_setpgsz_align_err;
322 ulong_t segvn_setpgsz_anon_align_err;
323 ulong_t segvn_setpgsz_getattr_err;
324 ulong_t segvn_setpgsz_eof_err;
325 ulong_t segvn_faultvnmpss_align_err1;
326 ulong_t segvn_faultvnmpss_align_err2;
327 ulong_t segvn_faultvnmpss_align_err3;
328 ulong_t segvn_faultvnmpss_align_err4;
329 ulong_t segvn_faultvnmpss_align_err5;
330 ulong_t segvn_vmpss_pageio_deadlk_err;
331
332 int segvn_use_regions = 1;
333
334 /*
335 * Segvn supports text replication optimization for NUMA platforms. Text
336  * replicas are represented by anon maps (amp). There's one amp per text file
337 * region per lgroup. A process chooses the amp for each of its text mappings
338 * based on the lgroup assignment of its main thread (t_tid = 1). All
339 * processes that want a replica on a particular lgroup for the same text file
340  * mapping share the same amp. amps are looked up in the svntr_hashtab hash table
341 * with vp,off,size,szc used as a key. Text replication segments are read only
342 * MAP_PRIVATE|MAP_TEXT segments that map vnode. Replication is achieved by
343 * forcing COW faults from vnode to amp and mapping amp pages instead of vnode
344 * pages. Replication amp is assigned to a segment when it gets its first
345  * pagefault. To handle main thread lgroup rehoming, segvn_trasync_thread
346  * periodically rechecks whether the process still maps an amp local to the
347  * main thread. If not, the async thread forces the process to remap to an amp in the new
348 * home lgroup of the main thread. Current text replication implementation
349  * only provides a benefit to workloads that do most of their work in the
350  * main thread of a process, or whose threads all run in the same
351 * lgroup. To extend text replication benefit to different types of
352 * multithreaded workloads further work would be needed in the hat layer to
353 * allow the same virtual address in the same hat to simultaneously map
354 * different physical addresses (i.e. page table replication would be needed
355 * for x86).
356 *
357  * amp pages are used instead of vnode pages as long as the segment has a very
358 * simple life cycle. It's created via segvn_create(), handles S_EXEC
359 * (S_READ) pagefaults and is fully unmapped. If anything more complicated
360  * happens (the protection is changed, a real COW fault occurs, the pagesize is
361  * changed, MC_LOCK is requested, or the segment is partially unmapped), we turn off
362  * text replication by converting the segment back to a vnode-only segment
363 * (unmap segment's address range and set svd->amp to NULL).
364 *
365 * The original file can be changed after amp is inserted into
366 * svntr_hashtab. Processes that are launched after the file is already
367  * changed can't use the replicas created prior to the file change. To
368  * implement this functionality hash entries are timestamped. Replicas can
369 * only be used if current file modification time is the same as the timestamp
370 * saved when hash entry was created. However just timestamps alone are not
371 * sufficient to detect file modification via mmap(MAP_SHARED) mappings. We
372 * deal with file changes via MAP_SHARED mappings differently. When writable
373 * MAP_SHARED mappings are created to vnodes marked as executable we mark all
374  * existing replicas for this vnode as not usable for future text
375  * mappings. And we don't create new replicas for files that currently have
376 * potentially writable MAP_SHARED mappings (i.e. vn_is_mapped(V_WRITE) is
377 * true).
378 */
379
380 #define SEGVN_TEXTREPL_MAXBYTES_FACTOR (20)
381 size_t segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR;
382
383 static ulong_t svntr_hashtab_sz = 512;
384 static svntr_bucket_t *svntr_hashtab = NULL;
385 static struct kmem_cache *svntr_cache;
386 static svntr_stats_t *segvn_textrepl_stats;
387 static ksema_t segvn_trasync_sem;
388
389 int segvn_disable_textrepl = 1;
390 size_t textrepl_size_thresh = (size_t)-1;
391 size_t segvn_textrepl_bytes = 0;
392 size_t segvn_textrepl_max_bytes = 0;
393 clock_t segvn_update_textrepl_interval = 0;
394 int segvn_update_tr_time = 10;
395 int segvn_disable_textrepl_update = 0;
396
397 static void segvn_textrepl(struct seg *);
398 static void segvn_textunrepl(struct seg *, int);
399 static void segvn_inval_trcache(vnode_t *);
400 static void segvn_trasync_thread(void);
401 static void segvn_trupdate_wakeup(void *);
402 static void segvn_trupdate(void);
403 static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *,
404 ulong_t);
405
406 /*
407 * Initialize segvn data structures
408 */
409 void
410 segvn_init(void)
411 {
412 uint_t maxszc;
413 uint_t szc;
414 size_t pgsz;
415
416 segvn_cache = kmem_cache_create("segvn_cache",
417 sizeof (struct segvn_data), 0,
418 segvn_cache_constructor, segvn_cache_destructor, NULL,
419 NULL, NULL, 0);
420
421 if (segvn_lpg_disable == 0) {
422 szc = maxszc = page_num_pagesizes() - 1;
423 if (szc == 0) {
424 segvn_lpg_disable = 1;
425 }
426 if (page_get_pagesize(0) != PAGESIZE) {
427 panic("segvn_init: bad szc 0");
428 /*NOTREACHED*/
429 }
430 while (szc != 0) {
431 pgsz = page_get_pagesize(szc);
432 if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) {
433 panic("segvn_init: bad szc %d", szc);
434 /*NOTREACHED*/
435 }
436 szc--;
437 }
438 if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc)
439 segvn_maxpgszc = maxszc;
440 }
441
442 if (segvn_maxpgszc) {
443 segvn_szc_cache = (struct kmem_cache **)kmem_alloc(
444 (segvn_maxpgszc + 1) * sizeof (struct kmem_cache *),
445 KM_SLEEP);
446 }
447
448 for (szc = 1; szc <= segvn_maxpgszc; szc++) {
449 char str[32];
450
451 (void) sprintf(str, "segvn_szc_cache%d", szc);
452 segvn_szc_cache[szc] = kmem_cache_create(str,
453 page_get_pagecnt(szc) * sizeof (page_t *), 0,
454 NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
455 }
456
457
458 if (segvn_use_regions && !hat_supported(HAT_SHARED_REGIONS, NULL))
459 segvn_use_regions = 0;
460
461 /*
462 * For now shared regions and text replication segvn support
463 * are mutually exclusive. This is acceptable because
464 	 * significant benefit from text replication has so far been
465 	 * observed only on AMD64 NUMA platforms (due to their relatively
466 	 * small L2$ size) and we don't currently support shared
467 * regions on x86.
468 */
469 if (segvn_use_regions && !segvn_disable_textrepl) {
470 segvn_disable_textrepl = 1;
471 }
472
473 #if defined(_LP64)
474 if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 &&
475 !segvn_disable_textrepl) {
476 ulong_t i;
477 size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t);
478
479 svntr_cache = kmem_cache_create("svntr_cache",
480 sizeof (svntr_t), 0, svntr_cache_constructor, NULL,
481 NULL, NULL, NULL, 0);
482 svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP);
483 for (i = 0; i < svntr_hashtab_sz; i++) {
484 mutex_init(&svntr_hashtab[i].tr_lock, NULL,
485 MUTEX_DEFAULT, NULL);
486 }
487 segvn_textrepl_max_bytes = ptob(physmem) /
488 segvn_textrepl_max_bytes_factor;
489 segvn_textrepl_stats = kmem_zalloc(NCPU *
490 sizeof (svntr_stats_t), KM_SLEEP);
491 sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL);
492 (void) thread_create(NULL, 0, segvn_trasync_thread,
493 NULL, 0, &p0, TS_RUN, minclsyspri);
494 }
495 #endif
496
497 if (!ISP2(segvn_pglock_comb_balign) ||
498 segvn_pglock_comb_balign < PAGESIZE) {
499 segvn_pglock_comb_balign = 1UL << 16; /* 64K */
500 }
501 segvn_pglock_comb_bshift = highbit(segvn_pglock_comb_balign) - 1;
502 segvn_pglock_comb_palign = btop(segvn_pglock_comb_balign);
503 }
504
505 #define SEGVN_PAGEIO ((void *)0x1)
506 #define SEGVN_NOPAGEIO ((void *)0x2)
507
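/*
 * Probe, once per vnode, whether the filesystem supports VOP_PAGEIO() and
 * record the answer in v_mpssdata so the large-page fault path can later
 * tell whether pageio may be used on this vnode.
 */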
508 static void
509 segvn_setvnode_mpss(vnode_t *vp)
510 {
511 int err;
512
513 ASSERT(vp->v_mpssdata == NULL ||
514 vp->v_mpssdata == SEGVN_PAGEIO ||
515 vp->v_mpssdata == SEGVN_NOPAGEIO);
516
517 if (vp->v_mpssdata == NULL) {
518 if (vn_vmpss_usepageio(vp)) {
519 err = VOP_PAGEIO(vp, (page_t *)NULL,
520 (u_offset_t)0, 0, 0, CRED(), NULL);
521 } else {
522 err = ENOSYS;
523 }
524 /*
525 * set v_mpssdata just once per vnode life
526 * so that it never changes.
527 */
528 mutex_enter(&vp->v_lock);
529 if (vp->v_mpssdata == NULL) {
530 if (err == EINVAL) {
531 vp->v_mpssdata = SEGVN_PAGEIO;
532 } else {
533 vp->v_mpssdata = SEGVN_NOPAGEIO;
534 }
535 }
536 mutex_exit(&vp->v_lock);
537 }
538 }
539
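/*
 * Create a segvn segment as described by the segvn_crargs pointed to by
 * argsp.  On success 0 is returned and the resulting segment (possibly an
 * adjacent segvn segment it was concatenated with) is passed back through
 * *segpp; otherwise an errno value is returned.
 */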
540 int
541 segvn_create(struct seg **segpp, void *argsp)
542 {
543 struct seg *seg = *segpp;
544 extern lgrp_mem_policy_t lgrp_mem_default_policy;
545 struct segvn_crargs *a = (struct segvn_crargs *)argsp;
546 struct segvn_data *svd;
547 size_t swresv = 0;
548 struct cred *cred;
549 struct anon_map *amp;
550 int error = 0;
551 size_t pgsz;
552 lgrp_mem_policy_t mpolicy = lgrp_mem_default_policy;
553 int use_rgn = 0;
554 int trok = 0;
555
556 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
557
558 if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) {
559 panic("segvn_create type");
560 /*NOTREACHED*/
561 }
562
563 /*
564 * Check arguments. If a shared anon structure is given then
565 * it is illegal to also specify a vp.
566 */
567 if (a->amp != NULL && a->vp != NULL) {
568 panic("segvn_create anon_map");
569 /*NOTREACHED*/
570 }
571
572 if (a->type == MAP_PRIVATE && (a->flags & MAP_TEXT) &&
573 a->vp != NULL && a->prot == (PROT_USER | PROT_READ | PROT_EXEC) &&
574 segvn_use_regions) {
575 use_rgn = 1;
576 }
577
578 /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */
579 if (a->type == MAP_SHARED)
580 a->flags &= ~MAP_NORESERVE;
581
582 if (a->szc != 0) {
583 if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) ||
584 (a->amp != NULL && a->type == MAP_PRIVATE) ||
585 (a->flags & MAP_NORESERVE) || seg->s_as == &kas) {
586 a->szc = 0;
587 } else {
588 if (a->szc > segvn_maxpgszc)
589 a->szc = segvn_maxpgszc;
590 pgsz = page_get_pagesize(a->szc);
591 if (!IS_P2ALIGNED(seg->s_base, pgsz) ||
592 !IS_P2ALIGNED(seg->s_size, pgsz)) {
593 a->szc = 0;
594 } else if (a->vp != NULL) {
595 if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) {
596 /*
597 * paranoid check.
598 * hat_page_demote() is not supported
599 * on swapfs pages.
600 */
601 a->szc = 0;
602 } else if (map_addr_vacalign_check(seg->s_base,
603 a->offset & PAGEMASK)) {
604 a->szc = 0;
605 }
606 } else if (a->amp != NULL) {
607 pgcnt_t anum = btopr(a->offset);
608 pgcnt_t pgcnt = page_get_pagecnt(a->szc);
609 if (!IS_P2ALIGNED(anum, pgcnt)) {
610 a->szc = 0;
611 }
612 }
613 }
614 }
615
616 /*
617 * If segment may need private pages, reserve them now.
618 */
619 if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) ||
620 (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) {
621 if (anon_resv_zone(seg->s_size,
622 seg->s_as->a_proc->p_zone) == 0)
623 return (EAGAIN);
624 swresv = seg->s_size;
625 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
626 seg, swresv, 1);
627 }
628
629 /*
630 * Reserve any mapping structures that may be required.
631 *
632 * Don't do it for segments that may use regions. It's currently a
633 * noop in the hat implementations anyway.
634 */
635 if (!use_rgn) {
636 hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP);
637 }
638
639 if (a->cred) {
640 cred = a->cred;
641 crhold(cred);
642 } else {
643 crhold(cred = CRED());
644 }
645
646 /* Inform the vnode of the new mapping */
647 if (a->vp != NULL) {
648 error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK,
649 seg->s_as, seg->s_base, seg->s_size, a->prot,
650 a->maxprot, a->type, cred, NULL);
651 if (error) {
652 if (swresv != 0) {
653 anon_unresv_zone(swresv,
654 seg->s_as->a_proc->p_zone);
655 TRACE_3(TR_FAC_VM, TR_ANON_PROC,
656 "anon proc:%p %lu %u", seg, swresv, 0);
657 }
658 crfree(cred);
659 if (!use_rgn) {
660 hat_unload(seg->s_as->a_hat, seg->s_base,
661 seg->s_size, HAT_UNLOAD_UNMAP);
662 }
663 return (error);
664 }
665 /*
666 * svntr_hashtab will be NULL if we support shared regions.
667 */
668 trok = ((a->flags & MAP_TEXT) &&
669 (seg->s_size > textrepl_size_thresh ||
670 (a->flags & _MAP_TEXTREPL)) &&
671 lgrp_optimizations() && svntr_hashtab != NULL &&
672 a->type == MAP_PRIVATE && swresv == 0 &&
673 !(a->flags & MAP_NORESERVE) &&
674 seg->s_as != &kas && a->vp->v_type == VREG);
675
676 ASSERT(!trok || !use_rgn);
677 }
678
679 /*
680 * MAP_NORESERVE mappings don't count towards the VSZ of a process
681 * until we fault the pages in.
682 */
683 if ((a->vp == NULL || a->vp->v_type != VREG) &&
684 a->flags & MAP_NORESERVE) {
685 seg->s_as->a_resvsize -= seg->s_size;
686 }
687
688 /*
689 * If more than one segment in the address space, and they're adjacent
690 * virtually, try to concatenate them. Don't concatenate if an
691 * explicit anon_map structure was supplied (e.g., SystemV shared
692 * memory) or if we'll use text replication for this segment.
693 */
694 if (a->amp == NULL && !use_rgn && !trok) {
695 struct seg *pseg, *nseg;
696 struct segvn_data *psvd, *nsvd;
697 lgrp_mem_policy_t ppolicy, npolicy;
698 uint_t lgrp_mem_policy_flags = 0;
699
700 /*
701 		 * Memory policy flags (lgrp_mem_policy_flags) are valid when
702 * extending stack/heap segments.
703 */
704 if ((a->vp == NULL) && (a->type == MAP_PRIVATE) &&
705 !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) {
706 lgrp_mem_policy_flags = a->lgrp_mem_policy_flags;
707 } else {
708 /*
709 * Get policy when not extending it from another segment
710 */
711 mpolicy = lgrp_mem_policy_default(seg->s_size, a->type);
712 }
713
714 /*
715 * First, try to concatenate the previous and new segments
716 */
717 pseg = AS_SEGPREV(seg->s_as, seg);
718 if (pseg != NULL &&
719 pseg->s_base + pseg->s_size == seg->s_base &&
720 pseg->s_ops == &segvn_ops) {
721 /*
722 * Get memory allocation policy from previous segment.
723 * When extension is specified (e.g. for heap) apply
724 * this policy to the new segment regardless of the
725 * outcome of segment concatenation. Extension occurs
726 			 * for a non-default policy; otherwise the default policy is
727 			 * used, based on the extended segment size.
728 */
729 psvd = (struct segvn_data *)pseg->s_data;
730 ppolicy = psvd->policy_info.mem_policy;
731 if (lgrp_mem_policy_flags ==
732 LGRP_MP_FLAG_EXTEND_UP) {
733 if (ppolicy != lgrp_mem_default_policy) {
734 mpolicy = ppolicy;
735 } else {
736 mpolicy = lgrp_mem_policy_default(
737 pseg->s_size + seg->s_size,
738 a->type);
739 }
740 }
741
742 if (mpolicy == ppolicy &&
743 (pseg->s_size + seg->s_size <=
744 segvn_comb_thrshld || psvd->amp == NULL) &&
745 segvn_extend_prev(pseg, seg, a, swresv) == 0) {
746 /*
747 * success! now try to concatenate
748 * with following seg
749 */
750 crfree(cred);
751 nseg = AS_SEGNEXT(pseg->s_as, pseg);
752 if (nseg != NULL &&
753 nseg != pseg &&
754 nseg->s_ops == &segvn_ops &&
755 pseg->s_base + pseg->s_size ==
756 nseg->s_base)
757 (void) segvn_concat(pseg, nseg, 0);
758 ASSERT(pseg->s_szc == 0 ||
759 (a->szc == pseg->s_szc &&
760 IS_P2ALIGNED(pseg->s_base, pgsz) &&
761 IS_P2ALIGNED(pseg->s_size, pgsz)));
762 /*
763 * Communicate out the newly concatenated
764 * segment as part of the result.
765 */
766 *segpp = pseg;
767 return (0);
768 }
769 }
770
771 /*
772 * Failed, so try to concatenate with following seg
773 */
774 nseg = AS_SEGNEXT(seg->s_as, seg);
775 if (nseg != NULL &&
776 seg->s_base + seg->s_size == nseg->s_base &&
777 nseg->s_ops == &segvn_ops) {
778 /*
779 * Get memory allocation policy from next segment.
780 * When extension is specified (e.g. for stack) apply
781 * this policy to the new segment regardless of the
782 * outcome of segment concatenation. Extension occurs
783 			 * for a non-default policy; otherwise the default policy is
784 			 * used, based on the extended segment size.
785 */
786 nsvd = (struct segvn_data *)nseg->s_data;
787 npolicy = nsvd->policy_info.mem_policy;
788 if (lgrp_mem_policy_flags ==
789 LGRP_MP_FLAG_EXTEND_DOWN) {
790 if (npolicy != lgrp_mem_default_policy) {
791 mpolicy = npolicy;
792 } else {
793 mpolicy = lgrp_mem_policy_default(
794 nseg->s_size + seg->s_size,
795 a->type);
796 }
797 }
798
799 if (mpolicy == npolicy &&
800 segvn_extend_next(seg, nseg, a, swresv) == 0) {
801 crfree(cred);
802 ASSERT(nseg->s_szc == 0 ||
803 (a->szc == nseg->s_szc &&
804 IS_P2ALIGNED(nseg->s_base, pgsz) &&
805 IS_P2ALIGNED(nseg->s_size, pgsz)));
806 /*
807 * Communicate out the newly concatenated
808 * segment as part of the result.
809 */
810 *segpp = nseg;
811 return (0);
812 }
813 }
814 }
815
816 if (a->vp != NULL) {
817 VN_HOLD(a->vp);
818 if (a->type == MAP_SHARED)
819 lgrp_shm_policy_init(NULL, a->vp);
820 }
821 svd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
822
823 seg->s_ops = &segvn_ops;
824 seg->s_data = (void *)svd;
825 seg->s_szc = a->szc;
826
827 svd->seg = seg;
828 svd->vp = a->vp;
829 /*
830 * Anonymous mappings have no backing file so the offset is meaningless.
831 */
832 svd->offset = a->vp ? (a->offset & PAGEMASK) : 0;
833 svd->prot = a->prot;
834 svd->maxprot = a->maxprot;
835 svd->pageprot = 0;
836 svd->type = a->type;
837 svd->vpage = NULL;
838 svd->cred = cred;
839 svd->advice = MADV_NORMAL;
840 svd->pageadvice = 0;
841 svd->flags = (ushort_t)a->flags;
842 svd->softlockcnt = 0;
843 svd->softlockcnt_sbase = 0;
844 svd->softlockcnt_send = 0;
845 svd->svn_inz = 0;
846 svd->rcookie = HAT_INVALID_REGION_COOKIE;
847 svd->pageswap = 0;
848
849 if (a->szc != 0 && a->vp != NULL) {
850 segvn_setvnode_mpss(a->vp);
851 }
852 if (svd->type == MAP_SHARED && svd->vp != NULL &&
853 (svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) {
854 ASSERT(vn_is_mapped(svd->vp, V_WRITE));
855 segvn_inval_trcache(svd->vp);
856 }
857
858 amp = a->amp;
859 if ((svd->amp = amp) == NULL) {
860 svd->anon_index = 0;
861 if (svd->type == MAP_SHARED) {
862 svd->swresv = 0;
863 /*
864 * Shared mappings to a vp need no other setup.
865 * If we have a shared mapping to an anon_map object
866 * which hasn't been allocated yet, allocate the
867 * struct now so that it will be properly shared
868 * by remembering the swap reservation there.
869 */
870 if (a->vp == NULL) {
871 svd->amp = anonmap_alloc(seg->s_size, swresv,
872 ANON_SLEEP);
873 svd->amp->a_szc = seg->s_szc;
874 }
875 } else {
876 /*
877 * Private mapping (with or without a vp).
878 * Allocate anon_map when needed.
879 */
880 svd->swresv = swresv;
881 }
882 } else {
883 pgcnt_t anon_num;
884
885 /*
886 * Mapping to an existing anon_map structure without a vp.
887 		 * For now we will ensure that the segment size isn't larger
888 * than the size - offset gives us. Later on we may wish to
889 * have the anon array dynamically allocated itself so that
890 * we don't always have to allocate all the anon pointer slots.
891 * This of course involves adding extra code to check that we
892 * aren't trying to use an anon pointer slot beyond the end
893 * of the currently allocated anon array.
894 */
895 if ((amp->size - a->offset) < seg->s_size) {
896 panic("segvn_create anon_map size");
897 /*NOTREACHED*/
898 }
899
900 anon_num = btopr(a->offset);
901
902 if (a->type == MAP_SHARED) {
903 /*
904 * SHARED mapping to a given anon_map.
905 */
906 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
907 amp->refcnt++;
908 if (a->szc > amp->a_szc) {
909 amp->a_szc = a->szc;
910 }
911 			ANON_LOCK_EXIT(&amp->a_rwlock);
912 svd->anon_index = anon_num;
913 svd->swresv = 0;
914 } else {
915 /*
916 * PRIVATE mapping to a given anon_map.
917 * Make sure that all the needed anon
918 * structures are created (so that we will
919 * share the underlying pages if nothing
920 * is written by this mapping) and then
921 * duplicate the anon array as is done
922 * when a privately mapped segment is dup'ed.
923 */
924 struct anon *ap;
925 caddr_t addr;
926 caddr_t eaddr;
927 ulong_t anon_idx;
928 int hat_flag = HAT_LOAD;
929
930 if (svd->flags & MAP_TEXT) {
931 hat_flag |= HAT_LOAD_TEXT;
932 }
933
934 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
935 svd->amp->a_szc = seg->s_szc;
936 svd->anon_index = 0;
937 svd->swresv = swresv;
938
939 /*
940 * Prevent 2 threads from allocating anon
941 * slots simultaneously.
942 */
943 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
944 eaddr = seg->s_base + seg->s_size;
945
946 for (anon_idx = anon_num, addr = seg->s_base;
947 addr < eaddr; addr += PAGESIZE, anon_idx++) {
948 page_t *pp;
949
950 if ((ap = anon_get_ptr(amp->ahp,
951 anon_idx)) != NULL)
952 continue;
953
954 /*
955 * Allocate the anon struct now.
956 * Might as well load up translation
957 * to the page while we're at it...
958 */
959 pp = anon_zero(seg, addr, &ap, cred);
960 if (ap == NULL || pp == NULL) {
961 panic("segvn_create anon_zero");
962 /*NOTREACHED*/
963 }
964
965 /*
966 * Re-acquire the anon_map lock and
967 * initialize the anon array entry.
968 */
969 ASSERT(anon_get_ptr(amp->ahp,
970 anon_idx) == NULL);
971 (void) anon_set_ptr(amp->ahp, anon_idx, ap,
972 ANON_SLEEP);
973
974 ASSERT(seg->s_szc == 0);
975 ASSERT(!IS_VMODSORT(pp->p_vnode));
976
977 ASSERT(use_rgn == 0);
978 hat_memload(seg->s_as->a_hat, addr, pp,
979 svd->prot & ~PROT_WRITE, hat_flag);
980
981 page_unlock(pp);
982 }
983 ASSERT(seg->s_szc == 0);
984 anon_dup(amp->ahp, anon_num, svd->amp->ahp,
985 0, seg->s_size);
986 			ANON_LOCK_EXIT(&amp->a_rwlock);
987 }
988 }
989
990 /*
991 * Set default memory allocation policy for segment
992 *
993 * Always set policy for private memory at least for initialization
994 * even if this is a shared memory segment
995 */
996 (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size);
997
998 if (svd->type == MAP_SHARED)
999 (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index,
1000 svd->vp, svd->offset, seg->s_size);
1001
1002 if (use_rgn) {
1003 ASSERT(!trok);
1004 ASSERT(svd->amp == NULL);
1005 svd->rcookie = hat_join_region(seg->s_as->a_hat, seg->s_base,
1006 seg->s_size, (void *)svd->vp, svd->offset, svd->prot,
1007 (uchar_t)seg->s_szc, segvn_hat_rgn_unload_callback,
1008 HAT_REGION_TEXT);
1009 }
1010
1011 ASSERT(!trok || !(svd->prot & PROT_WRITE));
1012 svd->tr_state = trok ? SEGVN_TR_INIT : SEGVN_TR_OFF;
1013
1014 return (0);
1015 }
1016
1017 /*
1018 * Concatenate two existing segments, if possible.
1019 * Return 0 on success, -1 if two segments are not compatible
1020 * or -2 on memory allocation failure.
1021  * If amp_cat == 1 then try to concatenate segments with anon maps
1022 */
1023 static int
1024 segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat)
1025 {
1026 struct segvn_data *svd1 = seg1->s_data;
1027 struct segvn_data *svd2 = seg2->s_data;
1028 struct anon_map *amp1 = svd1->amp;
1029 struct anon_map *amp2 = svd2->amp;
1030 struct vpage *vpage1 = svd1->vpage;
1031 struct vpage *vpage2 = svd2->vpage, *nvpage = NULL;
1032 size_t size, nvpsize;
1033 pgcnt_t npages1, npages2;
1034
1035 ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as);
1036 ASSERT(AS_WRITE_HELD(seg1->s_as));
1037 ASSERT(seg1->s_ops == seg2->s_ops);
1038
1039 if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie) ||
1040 HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
1041 return (-1);
1042 }
1043
1044 /* both segments exist, try to merge them */
1045 #define incompat(x) (svd1->x != svd2->x)
1046 if (incompat(vp) || incompat(maxprot) ||
1047 (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) ||
1048 (!svd1->pageprot && !svd2->pageprot && incompat(prot)) ||
1049 incompat(type) || incompat(cred) || incompat(flags) ||
1050 seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) ||
1051 (svd2->softlockcnt > 0) || svd1->softlockcnt_send > 0)
1052 return (-1);
1053 #undef incompat
1054
1055 /*
1056 * vp == NULL implies zfod, offset doesn't matter
1057 */
1058 if (svd1->vp != NULL &&
1059 svd1->offset + seg1->s_size != svd2->offset) {
1060 return (-1);
1061 }
1062
1063 /*
1064 * Don't concatenate if either segment uses text replication.
1065 */
1066 if (svd1->tr_state != SEGVN_TR_OFF || svd2->tr_state != SEGVN_TR_OFF) {
1067 return (-1);
1068 }
1069
1070 /*
1071 * Fail early if we're not supposed to concatenate
1072 * segments with non NULL amp.
1073 */
1074 if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) {
1075 return (-1);
1076 }
1077
1078 if (svd1->vp == NULL && svd1->type == MAP_SHARED) {
1079 if (amp1 != amp2) {
1080 return (-1);
1081 }
1082 if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) !=
1083 svd2->anon_index) {
1084 return (-1);
1085 }
1086 ASSERT(amp1 == NULL || amp1->refcnt >= 2);
1087 }
1088
1089 /*
1090 * If either seg has vpages, create a new merged vpage array.
1091 */
1092 if (vpage1 != NULL || vpage2 != NULL) {
1093 struct vpage *vp, *evp;
1094
1095 npages1 = seg_pages(seg1);
1096 npages2 = seg_pages(seg2);
1097 nvpsize = vpgtob(npages1 + npages2);
1098
1099 if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) {
1100 return (-2);
1101 }
1102
1103 if (vpage1 != NULL) {
1104 bcopy(vpage1, nvpage, vpgtob(npages1));
1105 } else {
1106 evp = nvpage + npages1;
1107 for (vp = nvpage; vp < evp; vp++) {
1108 VPP_SETPROT(vp, svd1->prot);
1109 VPP_SETADVICE(vp, svd1->advice);
1110 }
1111 }
1112
1113 if (vpage2 != NULL) {
1114 bcopy(vpage2, nvpage + npages1, vpgtob(npages2));
1115 } else {
1116 evp = nvpage + npages1 + npages2;
1117 for (vp = nvpage + npages1; vp < evp; vp++) {
1118 VPP_SETPROT(vp, svd2->prot);
1119 VPP_SETADVICE(vp, svd2->advice);
1120 }
1121 }
1122
1123 if (svd2->pageswap && (!svd1->pageswap && svd1->swresv)) {
1124 ASSERT(svd1->swresv == seg1->s_size);
1125 ASSERT(!(svd1->flags & MAP_NORESERVE));
1126 ASSERT(!(svd2->flags & MAP_NORESERVE));
1127 evp = nvpage + npages1;
1128 for (vp = nvpage; vp < evp; vp++) {
1129 VPP_SETSWAPRES(vp);
1130 }
1131 }
1132
1133 if (svd1->pageswap && (!svd2->pageswap && svd2->swresv)) {
1134 ASSERT(svd2->swresv == seg2->s_size);
1135 ASSERT(!(svd1->flags & MAP_NORESERVE));
1136 ASSERT(!(svd2->flags & MAP_NORESERVE));
1137 vp = nvpage + npages1;
1138 evp = vp + npages2;
1139 for (; vp < evp; vp++) {
1140 VPP_SETSWAPRES(vp);
1141 }
1142 }
1143 }
1144 ASSERT((vpage1 != NULL || vpage2 != NULL) ||
1145 (svd1->pageswap == 0 && svd2->pageswap == 0));
1146
1147 /*
1148 * If either segment has private pages, create a new merged anon
1149 	 * array. If merging shared anon segments, just decrement the anon map's
1150 * refcnt.
1151 */
1152 if (amp1 != NULL && svd1->type == MAP_SHARED) {
1153 ASSERT(amp1 == amp2 && svd1->vp == NULL);
1154 		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
1155 ASSERT(amp1->refcnt >= 2);
1156 amp1->refcnt--;
1157 		ANON_LOCK_EXIT(&amp1->a_rwlock);
1158 svd2->amp = NULL;
1159 } else if (amp1 != NULL || amp2 != NULL) {
1160 struct anon_hdr *nahp;
1161 struct anon_map *namp = NULL;
1162 size_t asize;
1163
1164 ASSERT(svd1->type == MAP_PRIVATE);
1165
1166 asize = seg1->s_size + seg2->s_size;
1167 if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
1168 if (nvpage != NULL) {
1169 kmem_free(nvpage, nvpsize);
1170 }
1171 return (-2);
1172 }
1173 if (amp1 != NULL) {
1174 /*
1175 * XXX anon rwlock is not really needed because
1176 * this is a private segment and we are writers.
1177 */
1178 			ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
1179 ASSERT(amp1->refcnt == 1);
1180 if (anon_copy_ptr(amp1->ahp, svd1->anon_index,
1181 nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) {
1182 anon_release(nahp, btop(asize));
1183 				ANON_LOCK_EXIT(&amp1->a_rwlock);
1184 if (nvpage != NULL) {
1185 kmem_free(nvpage, nvpsize);
1186 }
1187 return (-2);
1188 }
1189 }
1190 if (amp2 != NULL) {
1191 			ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
1192 ASSERT(amp2->refcnt == 1);
1193 if (anon_copy_ptr(amp2->ahp, svd2->anon_index,
1194 nahp, btop(seg1->s_size), btop(seg2->s_size),
1195 ANON_NOSLEEP)) {
1196 anon_release(nahp, btop(asize));
1197 				ANON_LOCK_EXIT(&amp2->a_rwlock);
1198 if (amp1 != NULL) {
1199 					ANON_LOCK_EXIT(&amp1->a_rwlock);
1200 }
1201 if (nvpage != NULL) {
1202 kmem_free(nvpage, nvpsize);
1203 }
1204 return (-2);
1205 }
1206 }
1207 if (amp1 != NULL) {
1208 namp = amp1;
1209 anon_release(amp1->ahp, btop(amp1->size));
1210 }
1211 if (amp2 != NULL) {
1212 if (namp == NULL) {
1213 ASSERT(amp1 == NULL);
1214 namp = amp2;
1215 anon_release(amp2->ahp, btop(amp2->size));
1216 } else {
1217 amp2->refcnt--;
1218 				ANON_LOCK_EXIT(&amp2->a_rwlock);
1219 anonmap_free(amp2);
1220 }
1221 svd2->amp = NULL; /* needed for seg_free */
1222 }
1223 namp->ahp = nahp;
1224 namp->size = asize;
1225 svd1->amp = namp;
1226 svd1->anon_index = 0;
1227 ANON_LOCK_EXIT(&namp->a_rwlock);
1228 }
1229 /*
1230 * Now free the old vpage structures.
1231 */
1232 if (nvpage != NULL) {
1233 if (vpage1 != NULL) {
1234 kmem_free(vpage1, vpgtob(npages1));
1235 }
1236 if (vpage2 != NULL) {
1237 svd2->vpage = NULL;
1238 kmem_free(vpage2, vpgtob(npages2));
1239 }
1240 if (svd2->pageprot) {
1241 svd1->pageprot = 1;
1242 }
1243 if (svd2->pageadvice) {
1244 svd1->pageadvice = 1;
1245 }
1246 if (svd2->pageswap) {
1247 svd1->pageswap = 1;
1248 }
1249 svd1->vpage = nvpage;
1250 }
1251
1252 /* all looks ok, merge segments */
1253 svd1->swresv += svd2->swresv;
1254 svd2->swresv = 0; /* so seg_free doesn't release swap space */
1255 size = seg2->s_size;
1256 seg_free(seg2);
1257 seg1->s_size += size;
1258 return (0);
1259 }
1260
1261 /*
1262 * Extend the previous segment (seg1) to include the
1263 * new segment (seg2 + a), if possible.
1264 * Return 0 on success.
1265 */
1266 static int
1267 segvn_extend_prev(struct seg *seg1, struct seg *seg2, struct segvn_crargs *a,
1268 size_t swresv)
1269 {
1270 struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data;
1271 size_t size;
1272 struct anon_map *amp1;
1273 struct vpage *new_vpage;
1274
1275 /*
1276 * We don't need any segment level locks for "segvn" data
1277 * since the address space is "write" locked.
1278 */
1279 ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as));
1280
1281 if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie)) {
1282 return (-1);
1283 }
1284
1285 /* second segment is new, try to extend first */
1286 /* XXX - should also check cred */
1287 if (svd1->vp != a->vp || svd1->maxprot != a->maxprot ||
1288 (!svd1->pageprot && (svd1->prot != a->prot)) ||
1289 svd1->type != a->type || svd1->flags != a->flags ||
1290 seg1->s_szc != a->szc || svd1->softlockcnt_send > 0)
1291 return (-1);
1292
1293 /* vp == NULL implies zfod, offset doesn't matter */
1294 if (svd1->vp != NULL &&
1295 svd1->offset + seg1->s_size != (a->offset & PAGEMASK))
1296 return (-1);
1297
1298 if (svd1->tr_state != SEGVN_TR_OFF) {
1299 return (-1);
1300 }
1301
1302 amp1 = svd1->amp;
1303 if (amp1) {
1304 pgcnt_t newpgs;
1305
1306 /*
1307 * Segment has private pages, can data structures
1308 * be expanded?
1309 *
1310 * Acquire the anon_map lock to prevent it from changing,
1311 * if it is shared. This ensures that the anon_map
1312 * will not change while a thread which has a read/write
1313 * lock on an address space references it.
1314 * XXX - Don't need the anon_map lock at all if "refcnt"
1315 * is 1.
1316 *
1317 * Can't grow a MAP_SHARED segment with an anonmap because
1318 * there may be existing anon slots where we want to extend
1319 * the segment and we wouldn't know what to do with them
1320 		 * (e.g., for tmpfs the right thing is to just leave them there,
1321 * for /dev/zero they should be cleared out).
1322 */
1323 if (svd1->type == MAP_SHARED)
1324 return (-1);
1325
1326 		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
1327 if (amp1->refcnt > 1) {
1328 			ANON_LOCK_EXIT(&amp1->a_rwlock);
1329 return (-1);
1330 }
1331 newpgs = anon_grow(amp1->ahp, &svd1->anon_index,
1332 btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP);
1333
1334 if (newpgs == 0) {
1335 			ANON_LOCK_EXIT(&amp1->a_rwlock);
1336 return (-1);
1337 }
1338 amp1->size = ptob(newpgs);
1339 		ANON_LOCK_EXIT(&amp1->a_rwlock);
1340 }
1341 if (svd1->vpage != NULL) {
1342 struct vpage *vp, *evp;
1343 new_vpage =
1344 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
1345 KM_NOSLEEP);
1346 if (new_vpage == NULL)
1347 return (-1);
1348 bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1)));
1349 kmem_free(svd1->vpage, vpgtob(seg_pages(seg1)));
1350 svd1->vpage = new_vpage;
1351
1352 vp = new_vpage + seg_pages(seg1);
1353 evp = vp + seg_pages(seg2);
1354 for (; vp < evp; vp++)
1355 VPP_SETPROT(vp, a->prot);
1356 if (svd1->pageswap && swresv) {
1357 ASSERT(!(svd1->flags & MAP_NORESERVE));
1358 ASSERT(swresv == seg2->s_size);
1359 vp = new_vpage + seg_pages(seg1);
1360 for (; vp < evp; vp++) {
1361 VPP_SETSWAPRES(vp);
1362 }
1363 }
1364 }
1365 ASSERT(svd1->vpage != NULL || svd1->pageswap == 0);
1366 size = seg2->s_size;
1367 seg_free(seg2);
1368 seg1->s_size += size;
1369 svd1->swresv += swresv;
1370 if (svd1->pageprot && (a->prot & PROT_WRITE) &&
1371 svd1->type == MAP_SHARED && svd1->vp != NULL &&
1372 (svd1->vp->v_flag & VVMEXEC)) {
1373 ASSERT(vn_is_mapped(svd1->vp, V_WRITE));
1374 segvn_inval_trcache(svd1->vp);
1375 }
1376 return (0);
1377 }
1378
1379 /*
1380 * Extend the next segment (seg2) to include the
1381 * new segment (seg1 + a), if possible.
1382 * Return 0 on success.
1383 */
1384 static int
1385 segvn_extend_next(struct seg *seg1, struct seg *seg2, struct segvn_crargs *a,
1386 size_t swresv)
1387 {
1388 struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data;
1389 size_t size;
1390 struct anon_map *amp2;
1391 struct vpage *new_vpage;
1392
1393 /*
1394 * We don't need any segment level locks for "segvn" data
1395 * since the address space is "write" locked.
1396 */
1397 ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as));
1398
1399 if (HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
1400 return (-1);
1401 }
1402
1403 /* first segment is new, try to extend second */
1404 /* XXX - should also check cred */
1405 if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
1406 (!svd2->pageprot && (svd2->prot != a->prot)) ||
1407 svd2->type != a->type || svd2->flags != a->flags ||
1408 seg2->s_szc != a->szc || svd2->softlockcnt_sbase > 0)
1409 return (-1);
1410 /* vp == NULL implies zfod, offset doesn't matter */
1411 if (svd2->vp != NULL &&
1412 (a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
1413 return (-1);
1414
1415 if (svd2->tr_state != SEGVN_TR_OFF) {
1416 return (-1);
1417 }
1418
1419 amp2 = svd2->amp;
1420 if (amp2) {
1421 pgcnt_t newpgs;
1422
1423 /*
1424 * Segment has private pages, can data structures
1425 * be expanded?
1426 *
1427 * Acquire the anon_map lock to prevent it from changing,
1428 * if it is shared. This ensures that the anon_map
1429 * will not change while a thread which has a read/write
1430 * lock on an address space references it.
1431 *
1432 * XXX - Don't need the anon_map lock at all if "refcnt"
1433 * is 1.
1434 */
1435 if (svd2->type == MAP_SHARED)
1436 return (-1);
1437
1438 		ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
1439 if (amp2->refcnt > 1) {
1440 			ANON_LOCK_EXIT(&amp2->a_rwlock);
1441 return (-1);
1442 }
1443 newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
1444 btop(seg2->s_size), btop(seg1->s_size),
1445 ANON_NOSLEEP | ANON_GROWDOWN);
1446
1447 if (newpgs == 0) {
1448 			ANON_LOCK_EXIT(&amp2->a_rwlock);
1449 return (-1);
1450 }
1451 amp2->size = ptob(newpgs);
1452 		ANON_LOCK_EXIT(&amp2->a_rwlock);
1453 }
1454 if (svd2->vpage != NULL) {
1455 struct vpage *vp, *evp;
1456 new_vpage =
1457 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
1458 KM_NOSLEEP);
1459 if (new_vpage == NULL) {
1460 /* Not merging segments so adjust anon_index back */
1461 if (amp2)
1462 svd2->anon_index += seg_pages(seg1);
1463 return (-1);
1464 }
1465 bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
1466 vpgtob(seg_pages(seg2)));
1467 kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
1468 svd2->vpage = new_vpage;
1469
1470 vp = new_vpage;
1471 evp = vp + seg_pages(seg1);
1472 for (; vp < evp; vp++)
1473 VPP_SETPROT(vp, a->prot);
1474 if (svd2->pageswap && swresv) {
1475 ASSERT(!(svd2->flags & MAP_NORESERVE));
1476 ASSERT(swresv == seg1->s_size);
1477 vp = new_vpage;
1478 for (; vp < evp; vp++) {
1479 VPP_SETSWAPRES(vp);
1480 }
1481 }
1482 }
1483 ASSERT(svd2->vpage != NULL || svd2->pageswap == 0);
1484 size = seg1->s_size;
1485 seg_free(seg1);
1486 seg2->s_size += size;
1487 seg2->s_base -= size;
1488 svd2->offset -= size;
1489 svd2->swresv += swresv;
1490 if (svd2->pageprot && (a->prot & PROT_WRITE) &&
1491 svd2->type == MAP_SHARED && svd2->vp != NULL &&
1492 (svd2->vp->v_flag & VVMEXEC)) {
1493 ASSERT(vn_is_mapped(svd2->vp, V_WRITE));
1494 segvn_inval_trcache(svd2->vp);
1495 }
1496 return (0);
1497 }
1498
1499 /*
1500 * Duplicate all the pages in the segment. This may break COW sharing for a
1501 * given page. If the page is marked with inherit zero set, then instead of
1502 * duplicating the page, we zero the page.
1503 */
1504 static int
1505 segvn_dup_pages(struct seg *seg, struct seg *newseg)
1506 {
1507 int error;
1508 uint_t prot;
1509 page_t *pp;
1510 struct anon *ap, *newap;
1511 size_t i;
1512 caddr_t addr;
1513
1514 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
1515 struct segvn_data *newsvd = (struct segvn_data *)newseg->s_data;
1516 ulong_t old_idx = svd->anon_index;
1517 ulong_t new_idx = 0;
1518
1519 i = btopr(seg->s_size);
1520 addr = seg->s_base;
1521
1522 /*
1523 * XXX break cow sharing using PAGESIZE
1524 * pages. They will be relocated into larger
1525 * pages at fault time.
1526 */
1527 while (i-- > 0) {
1528 if ((ap = anon_get_ptr(svd->amp->ahp, old_idx)) != NULL) {
1529 struct vpage *vpp;
1530
1531 vpp = &svd->vpage[seg_page(seg, addr)];
1532
1533 /*
1534 * prot need not be computed below 'cause anon_private
1535 * is going to ignore it anyway as child doesn't inherit
1536 * pagelock from parent.
1537 */
1538 prot = svd->pageprot ? VPP_PROT(vpp) : svd->prot;
1539
1540 /*
1541 * Check whether we should zero this or dup it.
1542 */
1543 if (svd->svn_inz == SEGVN_INZ_ALL ||
1544 (svd->svn_inz == SEGVN_INZ_VPP &&
1545 VPP_ISINHZERO(vpp))) {
1546 pp = anon_zero(newseg, addr, &newap,
1547 newsvd->cred);
1548 } else {
1549 page_t *anon_pl[1+1];
1550 uint_t vpprot;
1551 error = anon_getpage(&ap, &vpprot, anon_pl,
1552 PAGESIZE, seg, addr, S_READ, svd->cred);
1553 if (error != 0)
1554 return (error);
1555
1556 pp = anon_private(&newap, newseg, addr, prot,
1557 anon_pl[0], 0, newsvd->cred);
1558 }
1559 if (pp == NULL) {
1560 return (ENOMEM);
1561 }
1562 (void) anon_set_ptr(newsvd->amp->ahp, new_idx, newap,
1563 ANON_SLEEP);
1564 page_unlock(pp);
1565 }
1566 addr += PAGESIZE;
1567 old_idx++;
1568 new_idx++;
1569 }
1570
1571 return (0);
1572 }
1573
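/*
 * Duplicate seg into newseg as part of fork().  The new address space is
 * not yet associated with any process and the parent's address space is
 * write locked, so no segment level locking is needed here.
 */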
1574 static int
1575 segvn_dup(struct seg *seg, struct seg *newseg)
1576 {
1577 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
1578 struct segvn_data *newsvd;
1579 pgcnt_t npages = seg_pages(seg);
1580 int error = 0;
1581 size_t len;
1582 struct anon_map *amp;
1583
1584 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1585 ASSERT(newseg->s_as->a_proc->p_parent == curproc);
1586
1587 /*
1588 * If segment has anon reserved, reserve more for the new seg.
1589 * For a MAP_NORESERVE segment swresv will be a count of all the
1590 * allocated anon slots; thus we reserve for the child as many slots
1591 * as the parent has allocated. This semantic prevents the child or
1592  * parent from dying during a copy-on-write fault caused by trying
1593 * to write a shared pre-existing anon page.
1594 */
1595 if ((len = svd->swresv) != 0) {
1596 if (anon_resv(svd->swresv) == 0)
1597 return (ENOMEM);
1598
1599 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
1600 seg, len, 0);
1601 }
1602
1603 newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
1604
1605 newseg->s_ops = &segvn_ops;
1606 newseg->s_data = (void *)newsvd;
1607 newseg->s_szc = seg->s_szc;
1608
1609 newsvd->seg = newseg;
1610 if ((newsvd->vp = svd->vp) != NULL) {
1611 VN_HOLD(svd->vp);
1612 if (svd->type == MAP_SHARED)
1613 lgrp_shm_policy_init(NULL, svd->vp);
1614 }
1615 newsvd->offset = svd->offset;
1616 newsvd->prot = svd->prot;
1617 newsvd->maxprot = svd->maxprot;
1618 newsvd->pageprot = svd->pageprot;
1619 newsvd->type = svd->type;
1620 newsvd->cred = svd->cred;
1621 crhold(newsvd->cred);
1622 newsvd->advice = svd->advice;
1623 newsvd->pageadvice = svd->pageadvice;
1624 newsvd->svn_inz = svd->svn_inz;
1625 newsvd->swresv = svd->swresv;
1626 newsvd->pageswap = svd->pageswap;
1627 newsvd->flags = svd->flags;
1628 newsvd->softlockcnt = 0;
1629 newsvd->softlockcnt_sbase = 0;
1630 newsvd->softlockcnt_send = 0;
1631 newsvd->policy_info = svd->policy_info;
1632 newsvd->rcookie = HAT_INVALID_REGION_COOKIE;
1633
1634 if ((amp = svd->amp) == NULL || svd->tr_state == SEGVN_TR_ON) {
1635 /*
1636 * Not attaching to a shared anon object.
1637 */
1638 ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie) ||
1639 svd->tr_state == SEGVN_TR_OFF);
1640 if (svd->tr_state == SEGVN_TR_ON) {
1641 ASSERT(newsvd->vp != NULL && amp != NULL);
1642 newsvd->tr_state = SEGVN_TR_INIT;
1643 } else {
1644 newsvd->tr_state = svd->tr_state;
1645 }
1646 newsvd->amp = NULL;
1647 newsvd->anon_index = 0;
1648 } else {
1649 /* regions for now are only used on pure vnode segments */
1650 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
1651 ASSERT(svd->tr_state == SEGVN_TR_OFF);
1652 newsvd->tr_state = SEGVN_TR_OFF;
1653 if (svd->type == MAP_SHARED) {
1654 ASSERT(svd->svn_inz == SEGVN_INZ_NONE);
1655 newsvd->amp = amp;
1656 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1657 amp->refcnt++;
1658 			ANON_LOCK_EXIT(&amp->a_rwlock);
1659 newsvd->anon_index = svd->anon_index;
1660 } else {
1661 int reclaim = 1;
1662
1663 /*
1664 * Allocate and initialize new anon_map structure.
1665 */
1666 newsvd->amp = anonmap_alloc(newseg->s_size, 0,
1667 ANON_SLEEP);
1668 newsvd->amp->a_szc = newseg->s_szc;
1669 newsvd->anon_index = 0;
1670 ASSERT(svd->svn_inz == SEGVN_INZ_NONE ||
1671 svd->svn_inz == SEGVN_INZ_ALL ||
1672 svd->svn_inz == SEGVN_INZ_VPP);
1673
1674 /*
1675 * We don't have to acquire the anon_map lock
1676 * for the new segment (since it belongs to an
1677 * address space that is still not associated
1678 * with any process), or the segment in the old
1679 * address space (since all threads in it
1680 * are stopped while duplicating the address space).
1681 */
1682
1683 /*
1684 * The goal of the following code is to make sure that
1685 * softlocked pages do not end up as copy on write
1686 * pages. This would cause problems where one
1687 * thread writes to a page that is COW and a different
1688 * thread in the same process has softlocked it. The
1689 * softlock lock would move away from this process
1690 * because the write would cause this process to get
1691 * a copy (without the softlock).
1692 *
1693 * The strategy here is to just break the
1694 * sharing on pages that could possibly be
1695 * softlocked.
1696 *
1697 * In addition, if any pages have been marked that they
1698 * should be inherited as zero, then we immediately go
1699 * ahead and break COW and zero them. In the case of a
1700 * softlocked page that should be inherited zero, we
1701 * break COW and just get a zero page.
1702 */
1703 retry:
1704 if (svd->softlockcnt ||
1705 svd->svn_inz != SEGVN_INZ_NONE) {
1706 /*
1707 * The softlock count might be non zero
1708 * because some pages are still stuck in the
1709 * cache for lazy reclaim. Flush the cache
1710 * now. This should drop the count to zero.
1711 * [or there is really I/O going on to these
1712 * pages]. Note, we have the writers lock so
1713 * nothing gets inserted during the flush.
1714 */
1715 if (svd->softlockcnt && reclaim == 1) {
1716 segvn_purge(seg);
1717 reclaim = 0;
1718 goto retry;
1719 }
1720
1721 error = segvn_dup_pages(seg, newseg);
1722 if (error != 0) {
1723 newsvd->vpage = NULL;
1724 goto out;
1725 }
1726 } else { /* common case */
1727 if (seg->s_szc != 0) {
1728 /*
1729 * If at least one of anon slots of a
1730 * large page exists then make sure
1731 * all anon slots of a large page
1732 * exist to avoid partial cow sharing
1733 * of a large page in the future.
1734 */
1735 anon_dup_fill_holes(amp->ahp,
1736 svd->anon_index, newsvd->amp->ahp,
1737 0, seg->s_size, seg->s_szc,
1738 svd->vp != NULL);
1739 } else {
1740 anon_dup(amp->ahp, svd->anon_index,
1741 newsvd->amp->ahp, 0, seg->s_size);
1742 }
1743
1744 hat_clrattr(seg->s_as->a_hat, seg->s_base,
1745 seg->s_size, PROT_WRITE);
1746 }
1747 }
1748 }
1749 /*
1750 * If necessary, create a vpage structure for the new segment.
1751 * Do not copy any page lock indications.
1752 */
1753 if (svd->vpage != NULL) {
1754 uint_t i;
1755 struct vpage *ovp = svd->vpage;
1756 struct vpage *nvp;
1757
1758 nvp = newsvd->vpage =
1759 kmem_alloc(vpgtob(npages), KM_SLEEP);
1760 for (i = 0; i < npages; i++) {
1761 *nvp = *ovp++;
1762 VPP_CLRPPLOCK(nvp++);
1763 }
1764 } else
1765 newsvd->vpage = NULL;
1766
1767 /* Inform the vnode of the new mapping */
1768 if (newsvd->vp != NULL) {
1769 error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset,
1770 newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot,
1771 newsvd->maxprot, newsvd->type, newsvd->cred, NULL);
1772 }
1773 out:
1774 if (error == 0 && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
1775 ASSERT(newsvd->amp == NULL);
1776 ASSERT(newsvd->tr_state == SEGVN_TR_OFF);
1777 newsvd->rcookie = svd->rcookie;
1778 hat_dup_region(newseg->s_as->a_hat, newsvd->rcookie);
1779 }
1780 return (error);
1781 }
1782
1783
1784 /*
1785 * callback function to invoke free_vp_pages() for only those pages actually
1786 * processed by the HAT when a shared region is destroyed.
1787 */
1788 extern int free_pages;
1789
1790 static void
1791 segvn_hat_rgn_unload_callback(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr,
1792 size_t r_size, void *r_obj, u_offset_t r_objoff)
1793 {
1794 u_offset_t off;
1795 size_t len;
1796 vnode_t *vp = (vnode_t *)r_obj;
1797
1798 ASSERT(eaddr > saddr);
1799 ASSERT(saddr >= r_saddr);
1800 ASSERT(saddr < r_saddr + r_size);
1801 ASSERT(eaddr > r_saddr);
1802 ASSERT(eaddr <= r_saddr + r_size);
1803 ASSERT(vp != NULL);
1804
1805 if (!free_pages) {
1806 return;
1807 }
1808
1809 len = eaddr - saddr;
1810 off = (saddr - r_saddr) + r_objoff;
1811 free_vp_pages(vp, off, len);
1812 }
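
/*
 * Editor's note: a worked example of the arithmetic above, with hypothetical
 * values.  If the shared region starts at r_saddr = 0x100000 with
 * r_objoff = 0x2000, and the HAT processed the range [0x103000, 0x105000),
 * then len = 0x2000 and off = (0x103000 - 0x100000) + 0x2000 = 0x5000,
 * i.e. free_vp_pages() is asked to drop the two pages of the vnode starting
 * at file offset 0x5000 (assuming 4K pages).
 */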
1813
1814 /*
1815 * callback function used by segvn_unmap to invoke free_vp_pages() for only
1816 * those pages actually processed by the HAT
1817 */
1818 static void
1819 segvn_hat_unload_callback(hat_callback_t *cb)
1820 {
1821 struct seg *seg = cb->hcb_data;
1822 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
1823 size_t len;
1824 u_offset_t off;
1825
1826 ASSERT(svd->vp != NULL);
1827 ASSERT(cb->hcb_end_addr > cb->hcb_start_addr);
1828 ASSERT(cb->hcb_start_addr >= seg->s_base);
1829
1830 len = cb->hcb_end_addr - cb->hcb_start_addr;
1831 off = cb->hcb_start_addr - seg->s_base;
1832 free_vp_pages(svd->vp, svd->offset + off, len);
1833 }
1834
1835 /*
1836 * This function determines the number of bytes of swap reserved by
1837 * a segment for which per-page accounting is present. It is used to
1838 * calculate the correct value of a segvn_data's swresv.
1839 */
1840 static size_t
1841 segvn_count_swap_by_vpages(struct seg *seg)
1842 {
1843 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
1844 struct vpage *vp, *evp;
1845 size_t nswappages = 0;
1846
1847 ASSERT(svd->pageswap);
1848 ASSERT(svd->vpage != NULL);
1849
1850 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)];
1851
1852 for (vp = svd->vpage; vp < evp; vp++) {
1853 if (VPP_ISSWAPRES(vp))
1854 nswappages++;
1855 }
1856
1857 return (nswappages << PAGESHIFT);
1858 }
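
/*
 * Editor's note: a minimal, self-contained sketch of the accounting pattern
 * used above - count the pages that have swap reserved, then convert the
 * page count to bytes with a shift.  The flag array and names below are
 * hypothetical stand-ins for the kernel's vpage structures and
 * VPP_ISSWAPRES(); the sketch is illustrative only, not part of segvn.
 */
#if 0	/* illustrative sketch, not compiled */
static unsigned long
count_swap_bytes(const unsigned char *swapres, unsigned long npages,
    unsigned int pageshift)
{
	unsigned long nswappages = 0;
	unsigned long i;

	for (i = 0; i < npages; i++) {
		if (swapres[i] != 0)		/* swap reserved for page i */
			nswappages++;
	}
	return (nswappages << pageshift);	/* pages -> bytes */
}
#endif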
1859
1860 static int
1861 segvn_unmap(struct seg *seg, caddr_t addr, size_t len)
1862 {
1863 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
1864 struct segvn_data *nsvd;
1865 struct seg *nseg;
1866 struct anon_map *amp;
1867 pgcnt_t opages; /* old segment size in pages */
1868 pgcnt_t npages; /* new segment size in pages */
1869 pgcnt_t dpages; /* pages being deleted (unmapped) */
1870 hat_callback_t callback; /* used for free_vp_pages() */
1871 hat_callback_t *cbp = NULL;
1872 caddr_t nbase;
1873 size_t nsize;
1874 size_t oswresv;
1875 int reclaim = 1;
1876
1877 /*
1878 * We don't need any segment level locks for "segvn" data
1879 * since the address space is "write" locked.
1880 */
1881 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1882
1883 /*
1884 * Fail the unmap if pages are SOFTLOCKed through this mapping.
1885 * softlockcnt is protected from change by the as write lock.
1886 */
1887 retry:
1888 if (svd->softlockcnt > 0) {
1889 ASSERT(svd->tr_state == SEGVN_TR_OFF);
1890
1891 /*
1892 		 * If this is a shared segment, a non-zero softlockcnt
1893 		 * means locked pages are still in use.
1894 */
1895 if (svd->type == MAP_SHARED) {
1896 return (EAGAIN);
1897 }
1898
1899 /*
1900 		 * Since we hold the writer's lock, nobody can fill
1901 		 * the cache during the purge.  The flush either succeeds
1902 		 * or we still have pending I/Os.
1903 */
1904 if (reclaim == 1) {
1905 segvn_purge(seg);
1906 reclaim = 0;
1907 goto retry;
1908 }
1909 return (EAGAIN);
1910 }
1911
1912 /*
1913 * Check for bad sizes
1914 */
1915 if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size ||
1916 (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) {
1917 panic("segvn_unmap");
1918 /*NOTREACHED*/
1919 }
1920
1921 if (seg->s_szc != 0) {
1922 size_t pgsz = page_get_pagesize(seg->s_szc);
1923 int err;
1924 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
1925 ASSERT(seg->s_base != addr || seg->s_size != len);
1926 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
1927 ASSERT(svd->amp == NULL);
1928 ASSERT(svd->tr_state == SEGVN_TR_OFF);
1929 hat_leave_region(seg->s_as->a_hat,
1930 svd->rcookie, HAT_REGION_TEXT);
1931 svd->rcookie = HAT_INVALID_REGION_COOKIE;
1932 /*
1933 * could pass a flag to segvn_demote_range()
1934 * below to tell it not to do any unloads but
1935 * this case is rare enough to not bother for
1936 * now.
1937 */
1938 } else if (svd->tr_state == SEGVN_TR_INIT) {
1939 svd->tr_state = SEGVN_TR_OFF;
1940 } else if (svd->tr_state == SEGVN_TR_ON) {
1941 ASSERT(svd->amp != NULL);
1942 segvn_textunrepl(seg, 1);
1943 ASSERT(svd->amp == NULL);
1944 ASSERT(svd->tr_state == SEGVN_TR_OFF);
1945 }
1946 VM_STAT_ADD(segvnvmstats.demoterange[0]);
1947 err = segvn_demote_range(seg, addr, len, SDR_END, 0);
1948 if (err == 0) {
1949 return (IE_RETRY);
1950 }
1951 return (err);
1952 }
1953 }
1954
1955 /* Inform the vnode of the unmapping. */
1956 if (svd->vp) {
1957 int error;
1958
1959 error = VOP_DELMAP(svd->vp,
1960 (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base),
1961 seg->s_as, addr, len, svd->prot, svd->maxprot,
1962 svd->type, svd->cred, NULL);
1963
1964 if (error == EAGAIN)
1965 return (error);
1966 }
1967
1968 /*
1969 * Remove any page locks set through this mapping.
1970 	 * If text replication is not off, no page locks could have been
1971 	 * established via this mapping.
1972 */
1973 if (svd->tr_state == SEGVN_TR_OFF) {
1974 (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0);
1975 }
1976
1977 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
1978 ASSERT(svd->amp == NULL);
1979 ASSERT(svd->tr_state == SEGVN_TR_OFF);
1980 ASSERT(svd->type == MAP_PRIVATE);
1981 hat_leave_region(seg->s_as->a_hat, svd->rcookie,
1982 HAT_REGION_TEXT);
1983 svd->rcookie = HAT_INVALID_REGION_COOKIE;
1984 } else if (svd->tr_state == SEGVN_TR_ON) {
1985 ASSERT(svd->amp != NULL);
1986 ASSERT(svd->pageprot == 0 && !(svd->prot & PROT_WRITE));
1987 segvn_textunrepl(seg, 1);
1988 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
1989 } else {
1990 if (svd->tr_state != SEGVN_TR_OFF) {
1991 ASSERT(svd->tr_state == SEGVN_TR_INIT);
1992 svd->tr_state = SEGVN_TR_OFF;
1993 }
1994 /*
1995 * Unload any hardware translations in the range to be taken
1996 * out. Use a callback to invoke free_vp_pages() effectively.
1997 */
1998 if (svd->vp != NULL && free_pages != 0) {
1999 callback.hcb_data = seg;
2000 callback.hcb_function = segvn_hat_unload_callback;
2001 cbp = &callback;
2002 }
2003 hat_unload_callback(seg->s_as->a_hat, addr, len,
2004 HAT_UNLOAD_UNMAP, cbp);
2005
2006 if (svd->type == MAP_SHARED && svd->vp != NULL &&
2007 (svd->vp->v_flag & VVMEXEC) &&
2008 ((svd->prot & PROT_WRITE) || svd->pageprot)) {
2009 segvn_inval_trcache(svd->vp);
2010 }
2011 }
2012
2013 /*
2014 * Check for entire segment
2015 */
2016 if (addr == seg->s_base && len == seg->s_size) {
2017 seg_free(seg);
2018 return (0);
2019 }
2020
2021 opages = seg_pages(seg);
2022 dpages = btop(len);
2023 npages = opages - dpages;
2024 amp = svd->amp;
2025 ASSERT(amp == NULL || amp->a_szc >= seg->s_szc);
2026
2027 /*
2028 * Check for beginning of segment
2029 */
2030 if (addr == seg->s_base) {
2031 if (svd->vpage != NULL) {
2032 size_t nbytes;
2033 struct vpage *ovpage;
2034
2035 ovpage = svd->vpage; /* keep pointer to vpage */
2036
2037 nbytes = vpgtob(npages);
2038 svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
2039 bcopy(&ovpage[dpages], svd->vpage, nbytes);
2040
2041 /* free up old vpage */
2042 kmem_free(ovpage, vpgtob(opages));
2043 }
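		/*
		 * Editor's note: the copy above keeps only the trailing
		 * npages vpage entries; e.g. with opages = 16 and dpages = 4
		 * the new array holds old entries [4 .. 15], matching the
		 * pages that remain after the front of the segment is
		 * unmapped.
		 */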
2044 if (amp != NULL) {
2045 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2046 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
2047 /*
2048 * Shared anon map is no longer in use. Before
2049 * freeing its pages purge all entries from
2050 * pcache that belong to this amp.
2051 */
2052 if (svd->type == MAP_SHARED) {
2053 ASSERT(amp->refcnt == 1);
2054 ASSERT(svd->softlockcnt == 0);
2055 anonmap_purge(amp);
2056 }
2057 /*
2058 * Free up now unused parts of anon_map array.
2059 */
2060 if (amp->a_szc == seg->s_szc) {
2061 if (seg->s_szc != 0) {
2062 anon_free_pages(amp->ahp,
2063 svd->anon_index, len,
2064 seg->s_szc);
2065 } else {
2066 anon_free(amp->ahp,
2067 svd->anon_index,
2068 len);
2069 }
2070 } else {
2071 ASSERT(svd->type == MAP_SHARED);
2072 ASSERT(amp->a_szc > seg->s_szc);
2073 anon_shmap_free_pages(amp,
2074 svd->anon_index, len);
2075 }
2076
2077 /*
2078 * Unreserve swap space for the
2079 				 * unmapped chunk of this segment if it is
2080 				 * MAP_SHARED.
2081 */
2082 if (svd->type == MAP_SHARED) {
2083 anon_unresv_zone(len,
2084 seg->s_as->a_proc->p_zone);
2085 amp->swresv -= len;
2086 }
2087 }
2088 			ANON_LOCK_EXIT(&amp->a_rwlock);
2089 svd->anon_index += dpages;
2090 }
2091 if (svd->vp != NULL)
2092 svd->offset += len;
2093
2094 seg->s_base += len;
2095 seg->s_size -= len;
2096
2097 if (svd->swresv) {
2098 if (svd->flags & MAP_NORESERVE) {
2099 ASSERT(amp);
2100 oswresv = svd->swresv;
2101
2102 svd->swresv = ptob(anon_pages(amp->ahp,
2103 svd->anon_index, npages));
2104 anon_unresv_zone(oswresv - svd->swresv,
2105 seg->s_as->a_proc->p_zone);
2106 if (SEG_IS_PARTIAL_RESV(seg))
2107 seg->s_as->a_resvsize -= oswresv -
2108 svd->swresv;
2109 } else {
2110 size_t unlen;
2111
2112 if (svd->pageswap) {
2113 oswresv = svd->swresv;
2114 svd->swresv =
2115 segvn_count_swap_by_vpages(seg);
2116 ASSERT(oswresv >= svd->swresv);
2117 unlen = oswresv - svd->swresv;
2118 } else {
2119 svd->swresv -= len;
2120 ASSERT(svd->swresv == seg->s_size);
2121 unlen = len;
2122 }
2123 anon_unresv_zone(unlen,
2124 seg->s_as->a_proc->p_zone);
2125 }
2126 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
2127 seg, len, 0);
2128 }
2129
2130 return (0);
2131 }
2132
2133 /*
2134 * Check for end of segment
2135 */
2136 if (addr + len == seg->s_base + seg->s_size) {
2137 if (svd->vpage != NULL) {
2138 size_t nbytes;
2139 struct vpage *ovpage;
2140
2141 ovpage = svd->vpage; /* keep pointer to vpage */
2142
2143 nbytes = vpgtob(npages);
2144 svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
2145 bcopy(ovpage, svd->vpage, nbytes);
2146
2147 /* free up old vpage */
2148 kmem_free(ovpage, vpgtob(opages));
2149
2150 }
2151 if (amp != NULL) {
2152 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2153 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
2154 /*
2155 * Free up now unused parts of anon_map array.
2156 */
2157 ulong_t an_idx = svd->anon_index + npages;
2158
2159 /*
2160 * Shared anon map is no longer in use. Before
2161 * freeing its pages purge all entries from
2162 * pcache that belong to this amp.
2163 */
2164 if (svd->type == MAP_SHARED) {
2165 ASSERT(amp->refcnt == 1);
2166 ASSERT(svd->softlockcnt == 0);
2167 anonmap_purge(amp);
2168 }
2169
2170 if (amp->a_szc == seg->s_szc) {
2171 if (seg->s_szc != 0) {
2172 anon_free_pages(amp->ahp,
2173 an_idx, len,
2174 seg->s_szc);
2175 } else {
2176 anon_free(amp->ahp, an_idx,
2177 len);
2178 }
2179 } else {
2180 ASSERT(svd->type == MAP_SHARED);
2181 ASSERT(amp->a_szc > seg->s_szc);
2182 anon_shmap_free_pages(amp,
2183 an_idx, len);
2184 }
2185
2186 /*
2187 * Unreserve swap space for the
2188 				 * unmapped chunk of this segment if it is
2189 				 * MAP_SHARED.
2190 */
2191 if (svd->type == MAP_SHARED) {
2192 anon_unresv_zone(len,
2193 seg->s_as->a_proc->p_zone);
2194 amp->swresv -= len;
2195 }
2196 }
2197 			ANON_LOCK_EXIT(&amp->a_rwlock);
2198 }
2199
2200 seg->s_size -= len;
2201
2202 if (svd->swresv) {
2203 if (svd->flags & MAP_NORESERVE) {
2204 ASSERT(amp);
2205 oswresv = svd->swresv;
2206 svd->swresv = ptob(anon_pages(amp->ahp,
2207 svd->anon_index, npages));
2208 anon_unresv_zone(oswresv - svd->swresv,
2209 seg->s_as->a_proc->p_zone);
2210 if (SEG_IS_PARTIAL_RESV(seg))
2211 seg->s_as->a_resvsize -= oswresv -
2212 svd->swresv;
2213 } else {
2214 size_t unlen;
2215
2216 if (svd->pageswap) {
2217 oswresv = svd->swresv;
2218 svd->swresv =
2219 segvn_count_swap_by_vpages(seg);
2220 ASSERT(oswresv >= svd->swresv);
2221 unlen = oswresv - svd->swresv;
2222 } else {
2223 svd->swresv -= len;
2224 ASSERT(svd->swresv == seg->s_size);
2225 unlen = len;
2226 }
2227 anon_unresv_zone(unlen,
2228 seg->s_as->a_proc->p_zone);
2229 }
2230 TRACE_3(TR_FAC_VM, TR_ANON_PROC,
2231 "anon proc:%p %lu %u", seg, len, 0);
2232 }
2233
2234 return (0);
2235 }
2236
2237 /*
2238 * The section to go is in the middle of the segment,
2239 * have to make it into two segments. nseg is made for
2240 * the high end while seg is cut down at the low end.
2241 */
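	/*
	 * Editor's note: a worked example with hypothetical numbers.  For a
	 * segment with s_base = 0x10000 and s_size = 0x8000, unmapping
	 * [0x12000, 0x14000) gives nbase = 0x14000,
	 * nsize = (0x10000 + 0x8000) - 0x14000 = 0x4000, and the old segment
	 * shrinks to s_size = 0x12000 - 0x10000 = 0x2000.
	 */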
2242 nbase = addr + len; /* new seg base */
2243 nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */
2244 seg->s_size = addr - seg->s_base; /* shrink old seg */
2245 nseg = seg_alloc(seg->s_as, nbase, nsize);
2246 if (nseg == NULL) {
2247 panic("segvn_unmap seg_alloc");
2248 /*NOTREACHED*/
2249 }
2250 nseg->s_ops = seg->s_ops;
2251 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
2252 nseg->s_data = (void *)nsvd;
2253 nseg->s_szc = seg->s_szc;
2254 *nsvd = *svd;
2255 nsvd->seg = nseg;
2256 nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base);
2257 nsvd->swresv = 0;
2258 nsvd->softlockcnt = 0;
2259 nsvd->softlockcnt_sbase = 0;
2260 nsvd->softlockcnt_send = 0;
2261 nsvd->svn_inz = svd->svn_inz;
2262 ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
2263
2264 if (svd->vp != NULL) {
2265 VN_HOLD(nsvd->vp);
2266 if (nsvd->type == MAP_SHARED)
2267 lgrp_shm_policy_init(NULL, nsvd->vp);
2268 }
2269 crhold(svd->cred);
2270
2271 if (svd->vpage == NULL) {
2272 nsvd->vpage = NULL;
2273 } else {
2274 /* need to split vpage into two arrays */
2275 size_t nbytes;
2276 struct vpage *ovpage;
2277
2278 ovpage = svd->vpage; /* keep pointer to vpage */
2279
2280 npages = seg_pages(seg); /* seg has shrunk */
2281 nbytes = vpgtob(npages);
2282 svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
2283
2284 bcopy(ovpage, svd->vpage, nbytes);
2285
2286 npages = seg_pages(nseg);
2287 nbytes = vpgtob(npages);
2288 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);
2289
2290 bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes);
2291
2292 /* free up old vpage */
2293 kmem_free(ovpage, vpgtob(opages));
2294 }
2295
2296 if (amp == NULL) {
2297 nsvd->amp = NULL;
2298 nsvd->anon_index = 0;
2299 } else {
2300 /*
2301 * Need to create a new anon map for the new segment.
2302 * We'll also allocate a new smaller array for the old
2303 * smaller segment to save space.
2304 */
2305 opages = btop((uintptr_t)(addr - seg->s_base));
2306 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2307 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
2308 /*
2309 * Free up now unused parts of anon_map array.
2310 */
2311 ulong_t an_idx = svd->anon_index + opages;
2312
2313 /*
2314 * Shared anon map is no longer in use. Before
2315 * freeing its pages purge all entries from
2316 * pcache that belong to this amp.
2317 */
2318 if (svd->type == MAP_SHARED) {
2319 ASSERT(amp->refcnt == 1);
2320 ASSERT(svd->softlockcnt == 0);
2321 anonmap_purge(amp);
2322 }
2323
2324 if (amp->a_szc == seg->s_szc) {
2325 if (seg->s_szc != 0) {
2326 anon_free_pages(amp->ahp, an_idx, len,
2327 seg->s_szc);
2328 } else {
2329 anon_free(amp->ahp, an_idx,
2330 len);
2331 }
2332 } else {
2333 ASSERT(svd->type == MAP_SHARED);
2334 ASSERT(amp->a_szc > seg->s_szc);
2335 anon_shmap_free_pages(amp, an_idx, len);
2336 }
2337
2338 /*
2339 * Unreserve swap space for the
2340 			 * unmapped chunk of this segment if it is
2341 			 * MAP_SHARED.
2342 */
2343 if (svd->type == MAP_SHARED) {
2344 anon_unresv_zone(len,
2345 seg->s_as->a_proc->p_zone);
2346 amp->swresv -= len;
2347 }
2348 }
2349 nsvd->anon_index = svd->anon_index +
2350 btop((uintptr_t)(nseg->s_base - seg->s_base));
2351 if (svd->type == MAP_SHARED) {
2352 amp->refcnt++;
2353 nsvd->amp = amp;
2354 } else {
2355 struct anon_map *namp;
2356 struct anon_hdr *nahp;
2357
2358 ASSERT(svd->type == MAP_PRIVATE);
2359 nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
2360 namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP);
2361 namp->a_szc = seg->s_szc;
2362 (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp,
2363 0, btop(seg->s_size), ANON_SLEEP);
2364 (void) anon_copy_ptr(amp->ahp, nsvd->anon_index,
2365 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);
2366 anon_release(amp->ahp, btop(amp->size));
2367 svd->anon_index = 0;
2368 nsvd->anon_index = 0;
2369 amp->ahp = nahp;
2370 amp->size = seg->s_size;
2371 nsvd->amp = namp;
2372 }
2373 		ANON_LOCK_EXIT(&amp->a_rwlock);
2374 }
2375 if (svd->swresv) {
2376 if (svd->flags & MAP_NORESERVE) {
2377 ASSERT(amp);
2378 oswresv = svd->swresv;
2379 svd->swresv = ptob(anon_pages(amp->ahp,
2380 svd->anon_index, btop(seg->s_size)));
2381 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
2382 nsvd->anon_index, btop(nseg->s_size)));
2383 ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
2384 anon_unresv_zone(oswresv - (svd->swresv + nsvd->swresv),
2385 seg->s_as->a_proc->p_zone);
2386 if (SEG_IS_PARTIAL_RESV(seg))
2387 seg->s_as->a_resvsize -= oswresv -
2388 (svd->swresv + nsvd->swresv);
2389 } else {
2390 size_t unlen;
2391
2392 if (svd->pageswap) {
2393 oswresv = svd->swresv;
2394 svd->swresv = segvn_count_swap_by_vpages(seg);
2395 nsvd->swresv = segvn_count_swap_by_vpages(nseg);
2396 ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
2397 unlen = oswresv - (svd->swresv + nsvd->swresv);
2398 } else {
2399 if (seg->s_size + nseg->s_size + len !=
2400 svd->swresv) {
2401 panic("segvn_unmap: cannot split "
2402 "swap reservation");
2403 /*NOTREACHED*/
2404 }
2405 svd->swresv = seg->s_size;
2406 nsvd->swresv = nseg->s_size;
2407 unlen = len;
2408 }
2409 anon_unresv_zone(unlen,
2410 seg->s_as->a_proc->p_zone);
2411 }
2412 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
2413 seg, len, 0);
2414 }
2415
2416 return (0); /* I'm glad that's all over with! */
2417 }
2418
2419 static void
2420 segvn_free(struct seg *seg)
2421 {
2422 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
2423 pgcnt_t npages = seg_pages(seg);
2424 struct anon_map *amp;
2425 size_t len;
2426
2427 /*
2428 * We don't need any segment level locks for "segvn" data
2429 * since the address space is "write" locked.
2430 */
2431 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
2432 ASSERT(svd->tr_state == SEGVN_TR_OFF);
2433
2434 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
2435
2436 /*
2437 * Be sure to unlock pages. XXX Why do things get free'ed instead
2438 * of unmapped? XXX
2439 */
2440 (void) segvn_lockop(seg, seg->s_base, seg->s_size,
2441 0, MC_UNLOCK, NULL, 0);
2442
2443 /*
2444 * Deallocate the vpage and anon pointers if necessary and possible.
2445 */
2446 if (svd->vpage != NULL) {
2447 kmem_free(svd->vpage, vpgtob(npages));
2448 svd->vpage = NULL;
2449 }
2450 if ((amp = svd->amp) != NULL) {
2451 /*
2452 * If there are no more references to this anon_map
2453 * structure, then deallocate the structure after freeing
2454 * up all the anon slot pointers that we can.
2455 */
2456 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2457 ASSERT(amp->a_szc >= seg->s_szc);
2458 if (--amp->refcnt == 0) {
2459 if (svd->type == MAP_PRIVATE) {
2460 /*
2461 * Private - we only need to anon_free
2462 * the part that this segment refers to.
2463 */
2464 if (seg->s_szc != 0) {
2465 anon_free_pages(amp->ahp,
2466 svd->anon_index, seg->s_size,
2467 seg->s_szc);
2468 } else {
2469 anon_free(amp->ahp, svd->anon_index,
2470 seg->s_size);
2471 }
2472 } else {
2473
2474 /*
2475 * Shared anon map is no longer in use. Before
2476 * freeing its pages purge all entries from
2477 * pcache that belong to this amp.
2478 */
2479 ASSERT(svd->softlockcnt == 0);
2480 anonmap_purge(amp);
2481
2482 /*
2483 * Shared - anon_free the entire
2484 * anon_map's worth of stuff and
2485 * release any swap reservation.
2486 */
2487 if (amp->a_szc != 0) {
2488 anon_shmap_free_pages(amp, 0,
2489 amp->size);
2490 } else {
2491 anon_free(amp->ahp, 0, amp->size);
2492 }
2493 if ((len = amp->swresv) != 0) {
2494 anon_unresv_zone(len,
2495 seg->s_as->a_proc->p_zone);
2496 TRACE_3(TR_FAC_VM, TR_ANON_PROC,
2497 "anon proc:%p %lu %u", seg, len, 0);
2498 }
2499 }
2500 svd->amp = NULL;
2501 			ANON_LOCK_EXIT(&amp->a_rwlock);
2502 anonmap_free(amp);
2503 } else if (svd->type == MAP_PRIVATE) {
2504 /*
2505 * We had a private mapping which still has
2506 * a held anon_map so just free up all the
2507 * anon slot pointers that we were using.
2508 */
2509 if (seg->s_szc != 0) {
2510 anon_free_pages(amp->ahp, svd->anon_index,
2511 seg->s_size, seg->s_szc);
2512 } else {
2513 anon_free(amp->ahp, svd->anon_index,
2514 seg->s_size);
2515 }
2516 			ANON_LOCK_EXIT(&amp->a_rwlock);
2517 } else {
2518 			ANON_LOCK_EXIT(&amp->a_rwlock);
2519 }
2520 }
2521
2522 /*
2523 * Release swap reservation.
2524 */
2525 if ((len = svd->swresv) != 0) {
2526 anon_unresv_zone(svd->swresv,
2527 seg->s_as->a_proc->p_zone);
2528 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
2529 seg, len, 0);
2530 if (SEG_IS_PARTIAL_RESV(seg))
2531 seg->s_as->a_resvsize -= svd->swresv;
2532 svd->swresv = 0;
2533 }
2534 /*
2535 * Release claim on vnode, credentials, and finally free the
2536 * private data.
2537 */
2538 if (svd->vp != NULL) {
2539 if (svd->type == MAP_SHARED)
2540 lgrp_shm_policy_fini(NULL, svd->vp);
2541 VN_RELE(svd->vp);
2542 svd->vp = NULL;
2543 }
2544 crfree(svd->cred);
2545 svd->pageprot = 0;
2546 svd->pageadvice = 0;
2547 svd->pageswap = 0;
2548 svd->cred = NULL;
2549
2550 /*
2551 * Take segfree_syncmtx lock to let segvn_reclaim() finish if it's
2552 	 * still working with this segment without holding the as lock (in
2553 	 * case it is called by the pcache async thread).
2554 */
2555 ASSERT(svd->softlockcnt == 0);
2556 mutex_enter(&svd->segfree_syncmtx);
2557 mutex_exit(&svd->segfree_syncmtx);
2558
2559 seg->s_data = NULL;
2560 kmem_cache_free(segvn_cache, svd);
2561 }
2562
2563 /*
2564 * Do a F_SOFTUNLOCK call over the range requested. The range must have
2565 * already been F_SOFTLOCK'ed.
2566  * The caller must always match the addr and len of a softunlock exactly
2567  * with those of the previous softlock (see the editor's sketch below).
2568 */
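/*
 * Editor's note: a hedged sketch of the pairing contract described above,
 * as a caller of the as layer typically sees it.  The helper name is
 * hypothetical and error handling is omitted; the as_fault() usage is
 * illustrative, not a prescription.
 */
#if 0	/* illustrative sketch, not part of segvn */
static void
touch_pages_softlocked(struct as *as, caddr_t addr, size_t len)
{
	faultcode_t fc;

	/* Lock the pages underlying [addr, addr + len) for short-term use. */
	fc = as_fault(as->a_hat, as, addr, len, F_SOFTLOCK, S_WRITE);
	if (fc == 0) {
		/* ... access the locked pages here ... */

		/* Undo with exactly the same addr and len. */
		(void) as_fault(as->a_hat, as, addr, len, F_SOFTUNLOCK,
		    S_WRITE);
	}
}
#endif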
2569 static void
2570 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
2571 {
2572 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
2573 page_t *pp;
2574 caddr_t adr;
2575 struct vnode *vp;
2576 u_offset_t offset;
2577 ulong_t anon_index = 0;
2578 struct anon_map *amp;
2579 struct anon *ap = NULL;
2580
2581 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2582 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
2583
2584 if ((amp = svd->amp) != NULL)
2585 anon_index = svd->anon_index + seg_page(seg, addr);
2586
2587 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
2588 ASSERT(svd->tr_state == SEGVN_TR_OFF);
2589 hat_unlock_region(seg->s_as->a_hat, addr, len, svd->rcookie);
2590 } else {
2591 hat_unlock(seg->s_as->a_hat, addr, len);
2592 }
2593 for (adr = addr; adr < addr + len; adr += PAGESIZE) {
2594 if (amp != NULL) {
2595 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2596 if ((ap = anon_get_ptr(amp->ahp, anon_index++))
2597 != NULL) {
2598 swap_xlate(ap, &vp, &offset);
2599 } else {
2600 vp = svd->vp;
2601 offset = svd->offset +
2602 (uintptr_t)(adr - seg->s_base);
2603 }
2604 			ANON_LOCK_EXIT(&amp->a_rwlock);
2605 } else {
2606 vp = svd->vp;
2607 offset = svd->offset +
2608 (uintptr_t)(adr - seg->s_base);
2609 }
2610
2611 /*
2612 * Use page_find() instead of page_lookup() to
2613 * find the page since we know that it is locked.
2614 */
2615 pp = page_find(vp, offset);
2616 if (pp == NULL) {
2617 panic(
2618 "segvn_softunlock: addr %p, ap %p, vp %p, off %llx",
2619 (void *)adr, (void *)ap, (void *)vp, offset);
2620 /*NOTREACHED*/
2621 }
2622
2623 if (rw == S_WRITE) {
2624 hat_setrefmod(pp);
2625 if (seg->s_as->a_vbits)
2626 hat_setstat(seg->s_as, adr, PAGESIZE,
2627 P_REF | P_MOD);
2628 } else if (rw != S_OTHER) {
2629 hat_setref(pp);
2630 if (seg->s_as->a_vbits)
2631 hat_setstat(seg->s_as, adr, PAGESIZE, P_REF);
2632 }
2633 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT,
2634 "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset);
2635 page_unlock(pp);
2636 }
2637 ASSERT(svd->softlockcnt >= btop(len));
2638 if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -btop(len))) {
2639 /*
2640 		 * All SOFTLOCKS are gone.  Wake up any waiting
2641 * unmappers so they can try again to unmap.
2642 * Check for waiters first without the mutex
2643 * held so we don't always grab the mutex on
2644 * softunlocks.
2645 */
2646 if (AS_ISUNMAPWAIT(seg->s_as)) {
2647 mutex_enter(&seg->s_as->a_contents);
2648 if (AS_ISUNMAPWAIT(seg->s_as)) {
2649 AS_CLRUNMAPWAIT(seg->s_as);
2650 cv_broadcast(&seg->s_as->a_cv);
2651 }
2652 mutex_exit(&seg->s_as->a_contents);
2653 }
2654 }
2655 }
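
/*
 * Editor's note: the wakeup at the end of segvn_softunlock() uses a
 * "check the flag without the mutex, then re-check under the mutex before
 * broadcasting" pattern to keep the common path cheap.  Below is a minimal
 * user-space analogue with POSIX threads; all names are hypothetical and
 * the sketch is illustrative only.
 */
#if 0	/* illustrative user-space sketch, not part of segvn */
#include <pthread.h>

static pthread_mutex_t waitlock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waitcv = PTHREAD_COND_INITIALIZER;
static volatile int unmap_waiters;	/* set by would-be unmappers */

static void
wake_unmappers_if_waiting(void)
{
	if (unmap_waiters) {		/* cheap unlocked check first */
		(void) pthread_mutex_lock(&waitlock);
		if (unmap_waiters) {	/* re-check under the lock */
			unmap_waiters = 0;
			(void) pthread_cond_broadcast(&waitcv);
		}
		(void) pthread_mutex_unlock(&waitlock);
	}
}
#endif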
2656
2657 #define PAGE_HANDLED ((page_t *)-1)
2658
2659 /*
2660 * Release all the pages in the NULL terminated ppp list
2661 * which haven't already been converted to PAGE_HANDLED.
2662 */
2663 static void
2664 segvn_pagelist_rele(page_t **ppp)
2665 {
2666 for (; *ppp != NULL; ppp++) {
2667 if (*ppp != PAGE_HANDLED)
2668 page_unlock(*ppp);
2669 }
2670 }
2671
2672 static int stealcow = 1;
2673
2674 /*
2675 * Workaround for viking chip bug. See bug id 1220902.
2676  * To fix this down in pagefault() would require importing so much of
2677  * the as and segvn code as to be unmaintainable.
2678 */
2679 int enable_mbit_wa = 0;
2680
2681 /*
2682 * Handles all the dirty work of getting the right
2683 * anonymous pages and loading up the translations.
2684 * This routine is called only from segvn_fault()
2685 * when looping over the range of addresses requested.
2686 *
2687 * The basic algorithm here is:
2688 * If this is an anon_zero case
2689 * Call anon_zero to allocate page
2690 * Load up translation
2691 * Return
2692 * endif
2693 * If this is an anon page
2694 * Use anon_getpage to get the page
2695 * else
2696 * Find page in pl[] list passed in
2697 * endif
2698 * If not a cow
2699 * Load up the translation to the page
2700 * return
2701 * endif
2702 * Call anon_private to handle cow
2703 * Load up (writable) translation to new page
2704 */
2705 static faultcode_t
2706 segvn_faultpage(
2707 struct hat *hat, /* the hat to use for mapping */
2708 struct seg *seg, /* seg_vn of interest */
2709 caddr_t addr, /* address in as */
2710 u_offset_t off, /* offset in vp */
2711 struct vpage *vpage, /* pointer to vpage for vp, off */
2712 page_t *pl[], /* object source page pointer */
2713 uint_t vpprot, /* access allowed to object pages */
2714 enum fault_type type, /* type of fault */
2715 enum seg_rw rw, /* type of access at fault */
2716 int brkcow) /* we may need to break cow */
2717 {
2718 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
2719 page_t *pp, **ppp;
2720 uint_t pageflags = 0;
2721 page_t *anon_pl[1 + 1];
2722 page_t *opp = NULL; /* original page */
2723 uint_t prot;
2724 int err;
2725 int cow;
2726 int claim;
2727 int steal = 0;
2728 ulong_t anon_index = 0;
2729 struct anon *ap, *oldap;
2730 struct anon_map *amp;
2731 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
2732 int anon_lock = 0;
2733 anon_sync_obj_t cookie;
2734
2735 if (svd->flags & MAP_TEXT) {
2736 hat_flag |= HAT_LOAD_TEXT;
2737 }
2738
2739 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock));
2740 ASSERT(seg->s_szc == 0);
2741 ASSERT(svd->tr_state != SEGVN_TR_INIT);
2742
2743 /*
2744 * Initialize protection value for this page.
2745 * If we have per page protection values check it now.
2746 */
2747 if (svd->pageprot) {
2748 uint_t protchk;
2749
2750 switch (rw) {
2751 case S_READ:
2752 protchk = PROT_READ;
2753 break;
2754 case S_WRITE:
2755 protchk = PROT_WRITE;
2756 break;
2757 case S_EXEC:
2758 protchk = PROT_EXEC;
2759 break;
2760 case S_OTHER:
2761 default:
2762 protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
2763 break;
2764 }
2765
2766 prot = VPP_PROT(vpage);
2767 if ((prot & protchk) == 0)
2768 return (FC_PROT); /* illegal access type */
2769 } else {
2770 prot = svd->prot;
2771 }
2772
2773 if (type == F_SOFTLOCK) {
2774 atomic_inc_ulong((ulong_t *)&svd->softlockcnt);
2775 }
2776
2777 /*
2778 	 * Always acquire the anon array lock to prevent two threads from
2779 	 * allocating separate anon slots for the same "addr".
2780 */
2781
2782 if ((amp = svd->amp) != NULL) {
2783 		ASSERT(RW_READ_HELD(&amp->a_rwlock));
2784 anon_index = svd->anon_index + seg_page(seg, addr);
2785 anon_array_enter(amp, anon_index, &cookie);
2786 anon_lock = 1;
2787 }
2788
2789 if (svd->vp == NULL && amp != NULL) {
2790 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) {
2791 /*
2792 * Allocate a (normally) writable anonymous page of
2793 * zeroes. If no advance reservations, reserve now.
2794 */
2795 if (svd->flags & MAP_NORESERVE) {
2796 if (anon_resv_zone(ptob(1),
2797 seg->s_as->a_proc->p_zone)) {
2798 atomic_add_long(&svd->swresv, ptob(1));
2799 atomic_add_long(&seg->s_as->a_resvsize,
2800 ptob(1));
2801 } else {
2802 err = ENOMEM;
2803 goto out;
2804 }
2805 }
2806 if ((pp = anon_zero(seg, addr, &ap,
2807 svd->cred)) == NULL) {
2808 err = ENOMEM;
2809 goto out; /* out of swap space */
2810 }
2811 /*
2812 * Re-acquire the anon_map lock and
2813 * initialize the anon array entry.
2814 */
2815 (void) anon_set_ptr(amp->ahp, anon_index, ap,
2816 ANON_SLEEP);
2817
2818 ASSERT(pp->p_szc == 0);
2819
2820 /*
2821 * Handle pages that have been marked for migration
2822 */
2823 if (lgrp_optimizations())
2824 page_migrate(seg, addr, &pp, 1);
2825
2826 if (enable_mbit_wa) {
2827 if (rw == S_WRITE)
2828 hat_setmod(pp);
2829 else if (!hat_ismod(pp))
2830 prot &= ~PROT_WRITE;
2831 }
2832 /*
2833 * If AS_PAGLCK is set in a_flags (via memcntl(2)
2834 * with MC_LOCKAS, MCL_FUTURE) and this is a
2835 * MAP_NORESERVE segment, we may need to
2836 * permanently lock the page as it is being faulted
2837 * for the first time. The following text applies
2838 * only to MAP_NORESERVE segments:
2839 *
2840 * As per memcntl(2), if this segment was created
2841 * after MCL_FUTURE was applied (a "future"
2842 * segment), its pages must be locked. If this
2843 * segment existed at MCL_FUTURE application (a
2844 * "past" segment), the interface is unclear.
2845 *
2846 * We decide to lock only if vpage is present:
2847 *
2848 * - "future" segments will have a vpage array (see
2849 * as_map), and so will be locked as required
2850 *
2851 * - "past" segments may not have a vpage array,
2852 * depending on whether events (such as
2853 * mprotect) have occurred. Locking if vpage
2854 * exists will preserve legacy behavior. Not
2855 * locking if vpage is absent, will not break
2856 * the interface or legacy behavior. Note that
2857 * allocating vpage here if it's absent requires
2858 * upgrading the segvn reader lock, the cost of
2859 * which does not seem worthwhile.
2860 *
2861 * Usually testing and setting VPP_ISPPLOCK and
2862 * VPP_SETPPLOCK requires holding the segvn lock as
2863 * writer, but in this case all readers are
2864 * serializing on the anon array lock.
2865 */
2866 if (AS_ISPGLCK(seg->s_as) && vpage != NULL &&
2867 (svd->flags & MAP_NORESERVE) &&
2868 !VPP_ISPPLOCK(vpage)) {
2869 proc_t *p = seg->s_as->a_proc;
2870 ASSERT(svd->type == MAP_PRIVATE);
2871 mutex_enter(&p->p_lock);
2872 if (rctl_incr_locked_mem(p, NULL, PAGESIZE,
2873 1) == 0) {
2874 claim = VPP_PROT(vpage) & PROT_WRITE;
2875 if (page_pp_lock(pp, claim, 0)) {
2876 VPP_SETPPLOCK(vpage);
2877 } else {
2878 rctl_decr_locked_mem(p, NULL,
2879 PAGESIZE, 1);
2880 }
2881 }
2882 mutex_exit(&p->p_lock);
2883 }
2884
2885 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
2886 hat_memload(hat, addr, pp, prot, hat_flag);
2887
2888 if (!(hat_flag & HAT_LOAD_LOCK))
2889 page_unlock(pp);
2890
2891 anon_array_exit(&cookie);
2892 return (0);
2893 }
2894 }
2895
2896 /*
2897 * Obtain the page structure via anon_getpage() if it is
2898 * a private copy of an object (the result of a previous
2899 * copy-on-write).
2900 */
2901 if (amp != NULL) {
2902 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) {
2903 err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE,
2904 seg, addr, rw, svd->cred);
2905 if (err)
2906 goto out;
2907
2908 if (svd->type == MAP_SHARED) {
2909 /*
2910 * If this is a shared mapping to an
2911 * anon_map, then ignore the write
2912 * permissions returned by anon_getpage().
2913 * They apply to the private mappings
2914 * of this anon_map.
2915 */
2916 vpprot |= PROT_WRITE;
2917 }
2918 opp = anon_pl[0];
2919 }
2920 }
2921
2922 /*
2923 * Search the pl[] list passed in if it is from the
2924 * original object (i.e., not a private copy).
2925 */
2926 if (opp == NULL) {
2927 /*
2928 * Find original page. We must be bringing it in
2929 * from the list in pl[].
2930 */
2931 for (ppp = pl; (opp = *ppp) != NULL; ppp++) {
2932 if (opp == PAGE_HANDLED)
2933 continue;
2934 ASSERT(opp->p_vnode == svd->vp); /* XXX */
2935 if (opp->p_offset == off)
2936 break;
2937 }
2938 if (opp == NULL) {
2939 panic("segvn_faultpage not found");
2940 /*NOTREACHED*/
2941 }
2942 *ppp = PAGE_HANDLED;
2943
2944 }
2945
2946 ASSERT(PAGE_LOCKED(opp));
2947
2948 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT,
2949 "segvn_fault:pp %p vp %p offset %llx", opp, NULL, 0);
2950
2951 /*
2952 * The fault is treated as a copy-on-write fault if a
2953 * write occurs on a private segment and the object
2954 * page (i.e., mapping) is write protected. We assume
2955 * that fatal protection checks have already been made.
2956 */
2957
2958 if (brkcow) {
2959 ASSERT(svd->tr_state == SEGVN_TR_OFF);
2960 cow = !(vpprot & PROT_WRITE);
2961 } else if (svd->tr_state == SEGVN_TR_ON) {
2962 /*
2963 		 * If we are doing text replication, COW on first touch.
2964 */
2965 ASSERT(amp != NULL);
2966 ASSERT(svd->vp != NULL);
2967 ASSERT(rw != S_WRITE);
2968 cow = (ap == NULL);
2969 } else {
2970 cow = 0;
2971 }
2972
2973 /*
2974 * If not a copy-on-write case load the translation
2975 * and return.
2976 */
2977 if (cow == 0) {
2978
2979 /*
2980 * Handle pages that have been marked for migration
2981 */
2982 if (lgrp_optimizations())
2983 page_migrate(seg, addr, &opp, 1);
2984
2985 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) {
2986 if (rw == S_WRITE)
2987 hat_setmod(opp);
2988 else if (rw != S_OTHER && !hat_ismod(opp))
2989 prot &= ~PROT_WRITE;
2990 }
2991
2992 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE ||
2993 (!svd->pageprot && svd->prot == (prot & vpprot)));
2994 ASSERT(amp == NULL ||
2995 svd->rcookie == HAT_INVALID_REGION_COOKIE);
2996 hat_memload_region(hat, addr, opp, prot & vpprot, hat_flag,
2997 svd->rcookie);
2998
2999 if (!(hat_flag & HAT_LOAD_LOCK))
3000 page_unlock(opp);
3001
3002 if (anon_lock) {
3003 anon_array_exit(&cookie);
3004 }
3005 return (0);
3006 }
3007
3008 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
3009
3010 hat_setref(opp);
3011
3012 ASSERT(amp != NULL && anon_lock);
3013
3014 /*
3015 * Steal the page only if it isn't a private page
3016 * since stealing a private page is not worth the effort.
3017 */
3018 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL)
3019 steal = 1;
3020
3021 /*
3022 * Steal the original page if the following conditions are true:
3023 *
3024 * We are low on memory, the page is not private, page is not large,
3025 * not shared, not modified, not `locked' or if we have it `locked'
3026 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies
3027 * that the page is not shared) and if it doesn't have any
3028 * translations. page_struct_lock isn't needed to look at p_cowcnt
3029 * and p_lckcnt because we first get exclusive lock on page.
3030 */
3031 (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
3032
3033 if (