xref: /illumos-gate/usr/src/uts/common/vm/seg_spt.c (revision 2570281c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2019 Joyent, Inc.
24  * Copyright (c) 2016 by Delphix. All rights reserved.
25  */
26 
27 #include <sys/param.h>
28 #include <sys/user.h>
29 #include <sys/mman.h>
30 #include <sys/kmem.h>
31 #include <sys/sysmacros.h>
32 #include <sys/cmn_err.h>
33 #include <sys/systm.h>
34 #include <sys/tuneable.h>
35 #include <vm/hat.h>
36 #include <vm/seg.h>
37 #include <vm/as.h>
38 #include <vm/anon.h>
39 #include <vm/page.h>
40 #include <sys/buf.h>
41 #include <sys/swap.h>
42 #include <sys/atomic.h>
43 #include <vm/seg_spt.h>
44 #include <sys/debug.h>
45 #include <sys/vtrace.h>
46 #include <sys/shm.h>
47 #include <sys/shm_impl.h>
48 #include <sys/lgrp.h>
49 #include <sys/vmsystm.h>
50 #include <sys/policy.h>
51 #include <sys/project.h>
52 #include <sys/zone.h>
53 
54 #define	SEGSPTADDR	(caddr_t)0x0
55 
56 /*
57  * # pages used for spt
58  */
59 size_t	spt_used;
60 
61 /*
62  * See spt_setminfree().
63  */
64 pgcnt_t segspt_minfree = 0;
65 size_t segspt_minfree_clamp = (1UL << 30); /* 1GB in bytes */
66 
67 static int segspt_create(struct seg **segpp, void *argsp);
68 static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
69 static void segspt_free(struct seg *seg);
70 static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
71 static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);
72 
73 /* ARGSUSED */
74 __NORETURN static int
75 segspt_badop_dup(struct seg *seg __unused, struct seg *newseg __unused)
76 {
77 	panic("%s called", __func__);
78 }
79 
80 /* ARGSUSED */
81 __NORETURN static faultcode_t
82 segspt_badop_fault(struct hat *hat, struct seg *seg, caddr_t addr,
83     size_t len, enum fault_type type, enum seg_rw rw)
84 {
85 	panic("%s called", __func__);
86 }
87 
88 /* ARGSUSED */
89 __NORETURN static faultcode_t
90 segspt_badop_faulta(struct seg *seg __unused, caddr_t addr __unused)
91 {
92 	panic("%s called", __func__);
93 }
94 
95 /* ARGSUSED */
96 __NORETURN static int
97 segspt_badop_prot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
98 {
99 	panic("%s called", __func__);
100 }
101 
102 /* ARGSUSED */
103 __NORETURN static int
104 segspt_badop_checkprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
105 {
106 	panic("%s called", __func__);
107 }
108 
109 /* ARGSUSED */
110 __NORETURN static int
111 segspt_badop_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
112 {
113 	panic("%s called", __func__);
114 }
115 
116 /* ARGSUSED */
117 __NORETURN static size_t
118 segspt_badop_swapout(struct seg *seg)
119 {
120 	panic("%s called", __func__);
121 }
122 
123 /* ARGSUSED */
124 __NORETURN static int
125 segspt_badop_sync(struct seg *seg, caddr_t addr, size_t len, int attr,
126     uint_t flags)
127 {
128 	panic("%s called", __func__);
129 }
130 
131 /* ARGSUSED */
132 __NORETURN
133 static size_t
134 segspt_badop_incore(struct seg *seg, caddr_t addr, size_t len, char *vec)
135 {
136 	panic("%s called", __func__);
137 }
138 
139 /* ARGSUSED */
140 __NORETURN static int
141 segspt_badop_lockop(struct seg *seg, caddr_t addr, size_t len, int attr,
142     int op, ulong_t *lockmap, size_t pos)
143 {
144 	panic("%s called", __func__);
145 }
146 
147 /* ARGSUSED */
148 __NORETURN static int
149 segspt_badop_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
150 {
151 	panic("%s called", __func__);
152 }
153 
154 /* ARGSUSED */
155 __NORETURN static u_offset_t
156 segspt_badop_getoffset(struct seg *seg, caddr_t addr)
157 {
158 	panic("%s called", __func__);
159 }
160 
161 /* ARGSUSED */
162 __NORETURN static int
163 segspt_badop_gettype(struct seg *seg, caddr_t addr)
164 {
165 	panic("%s called", __func__);
166 }
167 
168 /* ARGSUSED */
169 __NORETURN static int
170 segspt_badop_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
171 {
172 	panic("%s called", __func__);
173 }
174 
175 /* ARGSUSED */
176 __NORETURN static int
177 segspt_badop_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
178 {
179 	panic("%s called", __func__);
180 }
181 
182 /* ARGSUSED */
183 __NORETURN static void
184 segspt_badop_dump(struct seg *seg)
185 {
186 	panic("%s called", __func__);
187 }
188 
189 /* ARGSUSED */
190 __NORETURN static int
191 segspt_badop_pagelock(struct seg *seg, caddr_t addr, size_t len,
192     struct page ***ppp, enum lock_type type, enum seg_rw rw)
193 {
194 	panic("%s called", __func__);
195 }
196 
197 /* ARGSUSED */
198 __NORETURN static int
199 segspt_badop_setpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
200 {
201 	panic("%s called", __func__);
202 }
203 
204 /* ARGSUSED */
205 __NORETURN static int
206 segspt_badop_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
207 {
208 	panic("%s called", __func__);
209 }
210 
211 /* ARGSUSED */
212 __NORETURN static int
213 segspt_badop_capable(struct seg *seg, segcapability_t capability)
214 {
215 	panic("%s called", __func__);
216 }
217 
218 struct seg_ops segspt_ops = {
219 	segspt_badop_dup,		/* dup */
220 	segspt_unmap,
221 	segspt_free,
222 	segspt_badop_fault,		/* fault */
223 	segspt_badop_faulta,		/* faulta */
224 	segspt_badop_prot,		/* setprot */
225 	segspt_badop_checkprot,		/* checkprot */
226 	segspt_badop_kluster,		/* kluster */
227 	segspt_badop_swapout,		/* swapout */
228 	segspt_badop_sync,		/* sync */
229 	segspt_badop_incore,		/* incore */
230 	segspt_badop_lockop,		/* lockop */
231 	segspt_badop_getprot,		/* getprot */
232 	segspt_badop_getoffset,		/* getoffset */
233 	segspt_badop_gettype,		/* gettype */
234 	segspt_badop_getvp,		/* getvp */
235 	segspt_badop_advise,		/* advise */
236 	segspt_badop_dump,		/* dump */
237 	segspt_badop_pagelock,		/* pagelock */
238 	segspt_badop_setpgsz,		/* setpgsz */
239 	segspt_badop_getmemid,		/* getmemid */
240 	segspt_getpolicy,		/* getpolicy */
241 	segspt_badop_capable,		/* capable */
242 	seg_inherit_notsup		/* inherit */
243 };
244 
245 static int segspt_shmdup(struct seg *seg, struct seg *newseg);
246 static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
247 static void segspt_shmfree(struct seg *seg);
248 static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
249 		caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
250 static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
251 static int segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len,
252 		uint_t prot);
253 static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
254 		uint_t prot);
255 static int	segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
256 static size_t	segspt_shmswapout(struct seg *seg);
257 static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
258 		char *vec);
259 static int segspt_shmsync(struct seg *seg, caddr_t addr, size_t len,
260 		int attr, uint_t flags);
261 static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
262 		int attr, int op, ulong_t *lockmap, size_t pos);
263 static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len,
264 		uint_t *protv);
265 static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr);
266 static int segspt_shmgettype(struct seg *seg, caddr_t addr);
267 static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
268 static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len,
269 		uint_t behav);
270 static void segspt_shmdump(struct seg *seg);
271 static int segspt_shmpagelock(struct seg *, caddr_t, size_t,
272 		struct page ***, enum lock_type, enum seg_rw);
273 static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t);
274 static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *);
275 static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t);
276 static int segspt_shmcapable(struct seg *, segcapability_t);
277 
278 struct seg_ops segspt_shmops = {
279 	segspt_shmdup,
280 	segspt_shmunmap,
281 	segspt_shmfree,
282 	segspt_shmfault,
283 	segspt_shmfaulta,
284 	segspt_shmsetprot,
285 	segspt_shmcheckprot,
286 	segspt_shmkluster,
287 	segspt_shmswapout,
288 	segspt_shmsync,
289 	segspt_shmincore,
290 	segspt_shmlockop,
291 	segspt_shmgetprot,
292 	segspt_shmgetoffset,
293 	segspt_shmgettype,
294 	segspt_shmgetvp,
295 	segspt_shmadvise,	/* advise */
296 	segspt_shmdump,
297 	segspt_shmpagelock,
298 	segspt_shmsetpgsz,
299 	segspt_shmgetmemid,
300 	segspt_shmgetpolicy,
301 	segspt_shmcapable,
302 	seg_inherit_notsup
303 };
304 
305 static void segspt_purge(struct seg *seg);
306 static int segspt_reclaim(void *, caddr_t, size_t, struct page **,
307 		enum seg_rw, int);
308 static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
309 		page_t **ppa);
310 
311 /*
312  * This value corresponds to headroom in availrmem that ISM can never allocate
313  * (but others can).  The original intent here was to prevent ISM from locking
314  * all of the remaining availrmem into memory, making forward progress
315  * difficult. It's not clear how much this matters on modern systems.
316  *
317  * The traditional default value of 5% of total memory is used, except on
318  * systems where that quickly gets ridiculous: in that case we clamp at a rather
319  * arbitrary value of 1GB.
320  *
321  * Note that since this is called lazily on the first sptcreate(), in theory,
322  * this could represent a very small value if the system is heavily loaded
323  * already. In practice, the first ISM user is pretty likely to come along
324  * earlier during the system's operation.
325  *
326  * This never gets re-figured.
327  */
328 static void
329 spt_setminfree(void)
330 {
331 	segspt_minfree = availrmem / 20;
332 
333 	if (segspt_minfree_clamp != 0 &&
334 	    segspt_minfree > (segspt_minfree_clamp / PAGESIZE))
335 		segspt_minfree = segspt_minfree_clamp / PAGESIZE;
336 }
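
/*
 * Worked example (hypothetical numbers, assuming 4K base pages): on a system
 * with 64GB of availrmem at the time of the first sptcreate(),
 *
 *	availrmem / 20          = 838860 pages (~3.2GB)
 *	segspt_minfree_clamp    = 1GB / 4K = 262144 pages
 *
 * so the 5% figure exceeds the clamp and segspt_minfree becomes 262144 pages
 * (1GB).  On an 8GB system, 5% is only ~104857 pages (~0.4GB), which is below
 * the clamp and is used as-is.
 */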
337 
338 int
339 sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
340     uint_t prot, uint_t flags, uint_t share_szc)
341 {
342 	int	err;
343 	struct	as	*newas;
344 	struct	segspt_crargs sptcargs;
345 
346 	if (segspt_minfree == 0)
347 		spt_setminfree();
348 
349 	if (!hat_supported(HAT_SHARED_PT, (void *)0))
350 		return (EINVAL);
351 
352 	/*
353 	 * get a new as for this shared memory segment
354 	 */
355 	newas = as_alloc();
356 	newas->a_proc = NULL;
357 	sptcargs.amp = amp;
358 	sptcargs.prot = prot;
359 	sptcargs.flags = flags;
360 	sptcargs.szc = share_szc;
361 	/*
362 	 * create a shared page table (spt) segment
363 	 */
364 
365 	if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) {
366 		as_free(newas);
367 		return (err);
368 	}
369 	*sptseg = sptcargs.seg_spt;
370 	return (0);
371 }
372 
373 void
374 sptdestroy(struct as *as, struct anon_map *amp)
375 {
376 
377 	(void) as_unmap(as, SEGSPTADDR, amp->size);
378 	as_free(as);
379 }
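
/*
 * Usage sketch (illustrative only; the real caller is the SysV shm code):
 * sptcreate() hands back the dummy SPT segment, and sptdestroy() is later
 * given that segment's address space together with the same anon_map.
 *
 *	struct seg *sptseg;
 *
 *	if (sptcreate(size, &sptseg, amp, prot, flags, share_szc) == 0) {
 *		...		(attach users, do work)
 *		sptdestroy(sptseg->s_as, amp);
 *	}
 */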
380 
381 /*
382  * called from seg_free().
383  * free (i.e., unlock, unmap, return to free list)
384  *  all the pages in the given seg.
385  */
386 void
387 segspt_free(struct seg *seg)
388 {
389 	struct spt_data *sptd = (struct spt_data *)seg->s_data;
390 
391 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
392 
393 	if (sptd != NULL) {
394 		if (sptd->spt_realsize)
395 			segspt_free_pages(seg, seg->s_base, sptd->spt_realsize);
396 
397 		if (sptd->spt_ppa_lckcnt) {
398 			kmem_free(sptd->spt_ppa_lckcnt,
399 			    sizeof (*sptd->spt_ppa_lckcnt)
400 			    * btopr(sptd->spt_amp->size));
401 		}
402 		kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp));
403 		cv_destroy(&sptd->spt_cv);
404 		mutex_destroy(&sptd->spt_lock);
405 		kmem_free(sptd, sizeof (*sptd));
406 	}
407 }
408 
409 /*ARGSUSED*/
410 static int
411 segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr,
412     uint_t flags)
413 {
414 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
415 
416 	return (0);
417 }
418 
419 /*ARGSUSED*/
420 static size_t
421 segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec)
422 {
423 	caddr_t	eo_seg;
424 	pgcnt_t	npages;
425 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
426 	struct seg	*sptseg;
427 	struct spt_data *sptd;
428 
429 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
430 #ifdef lint
431 	seg = seg;
432 #endif
433 	sptseg = shmd->shm_sptseg;
434 	sptd = sptseg->s_data;
435 
436 	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
437 		eo_seg = addr + len;
438 		while (addr < eo_seg) {
439 			/* page exists, and it's locked. */
440 			*vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
441 			    SEG_PAGE_ANON;
442 			addr += PAGESIZE;
443 		}
444 		return (len);
445 	} else {
446 		struct  anon_map *amp = shmd->shm_amp;
447 		struct  anon	*ap;
448 		page_t		*pp;
449 		pgcnt_t		anon_index;
450 		struct vnode	*vp;
451 		u_offset_t	off;
452 		ulong_t		i;
453 		int		ret;
454 		anon_sync_obj_t	cookie;
455 
456 		addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
457 		anon_index = seg_page(seg, addr);
458 		npages = btopr(len);
459 		if (anon_index + npages > btopr(shmd->shm_amp->size)) {
460 			return (EINVAL);
461 		}
462 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
463 		for (i = 0; i < npages; i++, anon_index++) {
464 			ret = 0;
465 			anon_array_enter(amp, anon_index, &cookie);
466 			ap = anon_get_ptr(amp->ahp, anon_index);
467 			if (ap != NULL) {
468 				swap_xlate(ap, &vp, &off);
469 				anon_array_exit(&cookie);
470 				pp = page_lookup_nowait(vp, off, SE_SHARED);
471 				if (pp != NULL) {
472 					ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
473 					page_unlock(pp);
474 				}
475 			} else {
476 				anon_array_exit(&cookie);
477 			}
478 			if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
479 				ret |= SEG_PAGE_LOCKED;
480 			}
481 			*vec++ = (char)ret;
482 		}
483 		ANON_LOCK_EXIT(&amp->a_rwlock);
484 		return (len);
485 	}
486 }
487 
488 static int
489 segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
490 {
491 	size_t share_size;
492 
493 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
494 
495 	/*
496 	 * seg.s_size may have been rounded up to the largest page size
497 	 * in shmat().
498 	 * XXX This should be cleaned up. sptdestroy should take a length
499 	 * argument which should be the same as sptcreate. Then
500 	 * this rounding would not be needed (or is done in shm.c)
501 	 * Only the check for full segment will be needed.
502 	 *
503 	 * XXX -- shouldn't raddr == 0 always? These tests don't seem
504 	 * to be useful at all.
505 	 */
506 	share_size = page_get_pagesize(seg->s_szc);
507 	ssize = P2ROUNDUP(ssize, share_size);
508 
509 	if (raddr == seg->s_base && ssize == seg->s_size) {
510 		seg_free(seg);
511 		return (0);
512 	} else
513 		return (EINVAL);
514 }
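
/*
 * Example (hypothetical sizes): with a 4M shared page size and an 8M dummy
 * segment, an unmap request for 5M at seg->s_base is rounded up to 8M and
 * frees the whole segment; any request that does not cover the entire
 * segment after rounding fails with EINVAL.
 */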
515 
516 int
517 segspt_create(struct seg **segpp, void *argsp)
518 {
519 	struct seg	*seg = *segpp;
520 	int		err;
521 	caddr_t		addr = seg->s_base;
522 	struct spt_data *sptd;
523 	struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
524 	struct anon_map *amp = sptcargs->amp;
525 	struct kshmid	*sp = amp->a_sp;
526 	struct	cred	*cred = CRED();
527 	ulong_t		i, j, anon_index = 0;
528 	pgcnt_t		npages = btopr(amp->size);
529 	struct vnode	*vp;
530 	page_t		**ppa;
531 	uint_t		hat_flags;
532 	size_t		pgsz;
533 	pgcnt_t		pgcnt;
534 	caddr_t		a;
535 	pgcnt_t		pidx;
536 	size_t		sz;
537 	proc_t		*procp = curproc;
538 	rctl_qty_t	lockedbytes = 0;
539 	kproject_t	*proj;
540 
541 	/*
542 	 * We are holding the a_lock on the underlying dummy as,
543 	 * so we can make calls to the HAT layer.
544 	 */
545 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
546 	ASSERT(sp != NULL);
547 
548 	if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
549 		if (err = anon_swap_adjust(npages))
550 			return (err);
551 	}
552 	err = ENOMEM;
553 
554 	if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
555 		goto out1;
556 
557 	ppa = NULL;
558 	if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
559 		if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
560 		    KM_NOSLEEP)) == NULL)
561 			goto out2;
562 	}
563 
564 	mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);
565 
566 	if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
567 		goto out3;
568 
569 	seg->s_ops = &segspt_ops;
570 	sptd->spt_vp = vp;
571 	sptd->spt_amp = amp;
572 	sptd->spt_prot = sptcargs->prot;
573 	sptd->spt_flags = sptcargs->flags;
574 	seg->s_data = (caddr_t)sptd;
575 	sptd->spt_ppa = NULL;
576 	sptd->spt_ppa_lckcnt = NULL;
577 	seg->s_szc = sptcargs->szc;
578 	cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL);
579 	sptd->spt_gen = 0;
580 
581 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
582 	if (seg->s_szc > amp->a_szc) {
583 		amp->a_szc = seg->s_szc;
584 	}
585 	ANON_LOCK_EXIT(&amp->a_rwlock);
586 
587 	/*
588 	 * Set policy to affect initial allocation of pages in
589 	 * anon_map_createpages()
590 	 */
591 	(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
592 	    NULL, 0, ptob(npages));
593 
594 	if (sptcargs->flags & SHM_PAGEABLE) {
595 		size_t  share_sz;
596 		pgcnt_t new_npgs, more_pgs;
597 		struct anon_hdr *nahp;
598 		zone_t *zone;
599 
600 		share_sz = page_get_pagesize(seg->s_szc);
601 		if (!IS_P2ALIGNED(amp->size, share_sz)) {
602 			/*
603 			 * We are rounding up the size of the anon array
604 			 * on a 4M boundary because we always create 4M
605 			 * of pages when locking and faulting pages, so we
606 			 * don't have to check for all corner cases, e.g.
607 			 * whether there is enough space to allocate a 4M
608 			 * page.
609 			 */
610 			new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
611 			more_pgs = new_npgs - npages;
612 
613 			/*
614 			 * The zone will never be NULL, as a fully created
615 			 * shm always has an owning zone.
616 			 */
617 			zone = sp->shm_perm.ipc_zone_ref.zref_zone;
618 			ASSERT(zone != NULL);
619 			if (anon_resv_zone(ptob(more_pgs), zone) == 0) {
620 				err = ENOMEM;
621 				goto out4;
622 			}
623 
624 			nahp = anon_create(new_npgs, ANON_SLEEP);
625 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
626 			(void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
627 			    ANON_SLEEP);
628 			anon_release(amp->ahp, npages);
629 			amp->ahp = nahp;
630 			ASSERT(amp->swresv == ptob(npages));
631 			amp->swresv = amp->size = ptob(new_npgs);
632 			ANON_LOCK_EXIT(&amp->a_rwlock);
633 			npages = new_npgs;
634 		}
635 
636 		sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
637 		    sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
638 		sptd->spt_pcachecnt = 0;
639 		sptd->spt_realsize = ptob(npages);
640 		sptcargs->seg_spt = seg;
641 		return (0);
642 	}
643 
644 	/*
645 	 * get array of pages for each anon slot in amp
646 	 */
647 	if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
648 	    seg, addr, S_CREATE, cred)) != 0)
649 		goto out4;
650 
651 	mutex_enter(&sp->shm_mlock);
652 
653 	/* May be partially locked, so, count bytes to charge for locking */
654 	for (i = 0; i < npages; i++)
655 		if (ppa[i]->p_lckcnt == 0)
656 			lockedbytes += PAGESIZE;
657 
658 	proj = sp->shm_perm.ipc_proj;
659 
660 	if (lockedbytes > 0) {
661 		mutex_enter(&procp->p_lock);
662 		if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) {
663 			mutex_exit(&procp->p_lock);
664 			mutex_exit(&sp->shm_mlock);
665 			for (i = 0; i < npages; i++)
666 				page_unlock(ppa[i]);
667 			err = ENOMEM;
668 			goto out4;
669 		}
670 		mutex_exit(&procp->p_lock);
671 	}
672 
673 	/*
674 	 * addr is initial address corresponding to the first page on ppa list
675 	 */
676 	for (i = 0; i < npages; i++) {
677 		/* attempt to lock all pages */
678 		if (page_pp_lock(ppa[i], 0, 1) == 0) {
679 			/*
680 			 * if unable to lock any page, unlock all
681 			 * of them and return error
682 			 */
683 			for (j = 0; j < i; j++)
684 				page_pp_unlock(ppa[j], 0, 1);
685 			for (i = 0; i < npages; i++)
686 				page_unlock(ppa[i]);
687 			rctl_decr_locked_mem(NULL, proj, lockedbytes, 0);
688 			mutex_exit(&sp->shm_mlock);
689 			err = ENOMEM;
690 			goto out4;
691 		}
692 	}
693 	mutex_exit(&sp->shm_mlock);
694 
695 	/*
696 	 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
697 	 * for the entire life of the segment, for example platforms
698 	 * that do not support Dynamic Reconfiguration.
699 	 */
700 	hat_flags = HAT_LOAD_SHARE;
701 	if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
702 		hat_flags |= HAT_LOAD_LOCK;
703 
704 	/*
705 	 * Load translations one large page at a time
706 	 * to make sure we don't create mappings bigger than
707 	 * the segment's size code, in case the underlying pages
708 	 * are shared with a segvn segment that uses a bigger
709 	 * size code than we do.
710 	 */
711 	pgsz = page_get_pagesize(seg->s_szc);
712 	pgcnt = page_get_pagecnt(seg->s_szc);
713 	for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) {
714 		sz = MIN(pgsz, ptob(npages - pidx));
715 		hat_memload_array(seg->s_as->a_hat, a, sz,
716 		    &ppa[pidx], sptd->spt_prot, hat_flags);
717 	}
718 
719 	/*
720 	 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
721 	 * we will leave the pages locked SE_SHARED for the life
722 	 * of the ISM segment. This will prevent any calls to
723 	 * hat_pageunload() on this ISM segment for those platforms.
724 	 */
725 	if (!(hat_flags & HAT_LOAD_LOCK)) {
726 		/*
727 		 * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
728 		 * we no longer need to hold the SE_SHARED lock on the pages,
729 		 * since L_PAGELOCK and F_SOFTLOCK calls will grab the
730 		 * SE_SHARED lock on the pages as necessary.
731 		 */
732 		for (i = 0; i < npages; i++)
733 			page_unlock(ppa[i]);
734 	}
735 	sptd->spt_pcachecnt = 0;
736 	kmem_free(ppa, ((sizeof (page_t *)) * npages));
737 	sptd->spt_realsize = ptob(npages);
738 	atomic_add_long(&spt_used, npages);
739 	sptcargs->seg_spt = seg;
740 	return (0);
741 
742 out4:
743 	seg->s_data = NULL;
744 	kmem_free(vp, sizeof (*vp));
745 	cv_destroy(&sptd->spt_cv);
746 out3:
747 	mutex_destroy(&sptd->spt_lock);
748 	if ((sptcargs->flags & SHM_PAGEABLE) == 0)
749 		kmem_free(ppa, (sizeof (*ppa) * npages));
750 out2:
751 	kmem_free(sptd, sizeof (*sptd));
752 out1:
753 	if ((sptcargs->flags & SHM_PAGEABLE) == 0)
754 		anon_swap_restore(npages);
755 	return (err);
756 }
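
/*
 * Example of the translation-loading loop above (hypothetical numbers,
 * assuming 4K base pages and a 4M shared page size, so pgcnt == 1024):
 * with npages == 1100,
 *
 *	pidx    0: sz = MIN(4M, ptob(1100)) = 4M   -> loads 1024 pages
 *	pidx 1024: sz = MIN(4M, ptob(76))   = 304K -> loads the final 76
 *
 * so the last, partial chunk never maps beyond ptob(npages).
 */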
757 
758 /*ARGSUSED*/
759 void
760 segspt_free_pages(struct seg *seg, caddr_t addr, size_t len)
761 {
762 	struct page	*pp;
763 	struct spt_data *sptd = (struct spt_data *)seg->s_data;
764 	pgcnt_t		npages;
765 	ulong_t		anon_idx;
766 	struct anon_map *amp;
767 	struct anon	*ap;
768 	struct vnode	*vp;
769 	u_offset_t	off;
770 	uint_t		hat_flags;
771 	int		root = 0;
772 	pgcnt_t		pgs, curnpgs = 0;
773 	page_t		*rootpp;
774 	rctl_qty_t	unlocked_bytes = 0;
775 	kproject_t	*proj;
776 	kshmid_t	*sp;
777 
778 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
779 
780 	len = P2ROUNDUP(len, PAGESIZE);
781 
782 	npages = btop(len);
783 
784 	hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP;
785 	if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
786 	    (sptd->spt_flags & SHM_PAGEABLE)) {
787 		hat_flags = HAT_UNLOAD_UNMAP;
788 	}
789 
790 	hat_unload(seg->s_as->a_hat, addr, len, hat_flags);
791 
792 	amp = sptd->spt_amp;
793 	if (sptd->spt_flags & SHM_PAGEABLE)
794 		npages = btop(amp->size);
795 
796 	ASSERT(amp != NULL);
797 
798 	proj = NULL;
799 	rootpp = NULL;
800 	sp = NULL;
801 	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
802 		sp = amp->a_sp;
803 		proj = sp->shm_perm.ipc_proj;
804 		mutex_enter(&sp->shm_mlock);
805 	}
806 	for (anon_idx = 0; anon_idx < npages; anon_idx++) {
807 		if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
808 			if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
809 				panic("segspt_free_pages: null app");
810 				/*NOTREACHED*/
811 			}
812 		} else {
813 			if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx))
814 			    == NULL)
815 				continue;
816 		}
817 		ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0);
818 		swap_xlate(ap, &vp, &off);
819 
820 		/*
821 		 * If this platform supports HAT_DYNAMIC_ISM_UNMAP,
822 		 * the pages won't be having SE_SHARED lock at this
823 		 * point.
824 		 *
825 		 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
826 		 * the pages are still held SE_SHARED locked from the
827 		 * original segspt_create()
828 		 *
829 		 * Our goal is to get SE_EXCL lock on each page, remove
830 		 * permanent lock on it and invalidate the page.
831 		 */
832 		if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
833 			if (hat_flags == HAT_UNLOAD_UNMAP)
834 				pp = page_lookup(vp, off, SE_EXCL);
835 			else {
836 				if ((pp = page_find(vp, off)) == NULL) {
837 					panic("segspt_free_pages: "
838 					    "page not locked");
839 					/*NOTREACHED*/
840 				}
841 				if (!page_tryupgrade(pp)) {
842 					page_unlock(pp);
843 					pp = page_lookup(vp, off, SE_EXCL);
844 				}
845 			}
846 			if (pp == NULL) {
847 				panic("segspt_free_pages: "
848 				    "page not in the system");
849 				/*NOTREACHED*/
850 			}
851 			ASSERT(pp->p_lckcnt > 0);
852 			page_pp_unlock(pp, 0, 1);
853 			if (pp->p_lckcnt == 0)
854 				unlocked_bytes += PAGESIZE;
855 		} else {
856 			if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
857 				continue;
858 		}
859 		/*
860 		 * It's logical to invalidate the pages here as in most cases
861 		 * these were created by segspt.
862 		 */
863 		if (pp->p_szc != 0) {
864 			if (root == 0) {
865 				ASSERT(curnpgs == 0);
866 				root = 1;
867 				rootpp = pp;
868 				pgs = curnpgs = page_get_pagecnt(pp->p_szc);
869 				ASSERT(pgs > 1);
870 				ASSERT(IS_P2ALIGNED(pgs, pgs));
871 				ASSERT(!(page_pptonum(pp) & (pgs - 1)));
872 				curnpgs--;
873 			} else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) {
874 				ASSERT(curnpgs == 1);
875 				ASSERT(page_pptonum(pp) ==
876 				    page_pptonum(rootpp) + (pgs - 1));
877 				page_destroy_pages(rootpp);
878 				root = 0;
879 				curnpgs = 0;
880 			} else {
881 				ASSERT(curnpgs > 1);
882 				ASSERT(page_pptonum(pp) ==
883 				    page_pptonum(rootpp) + (pgs - curnpgs));
884 				curnpgs--;
885 			}
886 		} else {
887 			if (root != 0 || curnpgs != 0) {
888 				panic("segspt_free_pages: bad large page");
889 				/*NOTREACHED*/
890 			}
891 			/*
892 			 * Before destroying the pages, we need to take care
893 			 * of the rctl locked memory accounting. For that
894 	 * we need to calculate the unlocked_bytes.
895 			 */
896 			if (pp->p_lckcnt > 0)
897 				unlocked_bytes += PAGESIZE;
898 			/*LINTED: constant in conditional context */
899 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
900 		}
901 	}
902 	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
903 		if (unlocked_bytes > 0)
904 			rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0);
905 		mutex_exit(&sp->shm_mlock);
906 	}
907 	if (root != 0 || curnpgs != 0) {
908 		panic("segspt_free_pages: bad large page");
909 		/*NOTREACHED*/
910 	}
911 
912 	/*
913 	 * mark that pages have been released
914 	 */
915 	sptd->spt_realsize = 0;
916 
917 	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
918 		atomic_add_long(&spt_used, -npages);
919 		anon_swap_restore(npages);
920 	}
921 }
922 
923 /*
924  * Get memory allocation policy info for specified address in given segment
925  */
926 static lgrp_mem_policy_info_t *
927 segspt_getpolicy(struct seg *seg, caddr_t addr)
928 {
929 	struct anon_map		*amp;
930 	ulong_t			anon_index;
931 	lgrp_mem_policy_info_t	*policy_info;
932 	struct spt_data		*spt_data;
933 
934 	ASSERT(seg != NULL);
935 
936 	/*
937 	 * Get anon_map from segspt
938 	 *
939 	 * Assume that no lock needs to be held on anon_map, since
940 	 * it should be protected by its reference count which must be
941 	 * nonzero for an existing segment
942 	 * Need to grab readers lock on policy tree though
943 	 */
944 	spt_data = (struct spt_data *)seg->s_data;
945 	if (spt_data == NULL)
946 		return (NULL);
947 	amp = spt_data->spt_amp;
948 	ASSERT(amp->refcnt != 0);
949 
950 	/*
951 	 * Get policy info
952 	 *
953 	 * Assume starting anon index of 0
954 	 */
955 	anon_index = seg_page(seg, addr);
956 	policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
957 
958 	return (policy_info);
959 }
960 
961 /*
962  * DISM only.
963  * Return locked pages over a given range.
964  *
965  * We will cache all DISM locked pages and save the pplist for the
966  * entire segment in the ppa field of the underlying DISM segment structure.
967  * Later, during a call to segspt_reclaim() we will use this ppa array
968  * to page_unlock() all of the pages and then we will free this ppa list.
969  */
970 /*ARGSUSED*/
971 static int
972 segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
973     struct page ***ppp, enum lock_type type, enum seg_rw rw)
974 {
975 	struct  shm_data *shmd = (struct shm_data *)seg->s_data;
976 	struct  seg	*sptseg = shmd->shm_sptseg;
977 	struct  spt_data *sptd = sptseg->s_data;
978 	pgcnt_t pg_idx, npages, tot_npages, npgs;
979 	struct  page **pplist, **pl, **ppa, *pp;
980 	struct  anon_map *amp;
981 	spgcnt_t	an_idx;
982 	int	ret = ENOTSUP;
983 	uint_t	pl_built = 0;
984 	struct  anon *ap;
985 	struct  vnode *vp;
986 	u_offset_t off;
987 	pgcnt_t claim_availrmem = 0;
988 	uint_t	szc;
989 
990 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
991 	ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
992 
993 	/*
994 	 * We want to lock/unlock the entire ISM segment. Therefore,
995 	 * we will be using the underlying sptseg and its base address
996 	 * and length for the caching arguments.
997 	 */
998 	ASSERT(sptseg);
999 	ASSERT(sptd);
1000 
1001 	pg_idx = seg_page(seg, addr);
1002 	npages = btopr(len);
1003 
1004 	/*
1005 	 * check if the request is larger than number of pages covered
1006 	 * by amp
1007 	 */
1008 	if (pg_idx + npages > btopr(sptd->spt_amp->size)) {
1009 		*ppp = NULL;
1010 		return (ENOTSUP);
1011 	}
1012 
1013 	if (type == L_PAGEUNLOCK) {
1014 		ASSERT(sptd->spt_ppa != NULL);
1015 
1016 		seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
1017 		    sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1018 
1019 		/*
1020 		 * If someone is blocked while unmapping, we purge
1021 		 * segment page cache and thus reclaim pplist synchronously
1022 		 * without waiting for seg_pasync_thread. This speeds up
1023 		 * unmapping in cases where munmap(2) is called, while
1024 		 * raw async i/o is still in progress or where a thread
1025 		 * exits on data fault in a multithreaded application.
1026 		 */
1027 		if ((sptd->spt_flags & DISM_PPA_CHANGED) ||
1028 		    (AS_ISUNMAPWAIT(seg->s_as) &&
1029 		    shmd->shm_softlockcnt > 0)) {
1030 			segspt_purge(seg);
1031 		}
1032 		return (0);
1033 	}
1034 
1035 	/* The L_PAGELOCK case ... */
1036 
1037 	if (sptd->spt_flags & DISM_PPA_CHANGED) {
1038 		segspt_purge(seg);
1039 		/*
1040 		 * for DISM the ppa needs to be rebuilt since the
1041 		 * number of locked pages could have changed
1042 		 */
1043 		*ppp = NULL;
1044 		return (ENOTSUP);
1045 	}
1046 
1047 	/*
1048 	 * First try to find pages in segment page cache, without
1049 	 * holding the segment lock.
1050 	 */
1051 	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1052 	    S_WRITE, SEGP_FORCE_WIRED);
1053 	if (pplist != NULL) {
1054 		ASSERT(sptd->spt_ppa != NULL);
1055 		ASSERT(sptd->spt_ppa == pplist);
1056 		ppa = sptd->spt_ppa;
1057 		for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1058 			if (ppa[an_idx] == NULL) {
1059 				seg_pinactive(seg, NULL, seg->s_base,
1060 				    sptd->spt_amp->size, ppa,
1061 				    S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1062 				*ppp = NULL;
1063 				return (ENOTSUP);
1064 			}
1065 			if ((szc = ppa[an_idx]->p_szc) != 0) {
1066 				npgs = page_get_pagecnt(szc);
1067 				an_idx = P2ROUNDUP(an_idx + 1, npgs);
1068 			} else {
1069 				an_idx++;
1070 			}
1071 		}
1072 		/*
1073 		 * Since we cache the entire DISM segment, we want to
1074 		 * set ppp to point to the first slot that corresponds
1075 		 * to the requested addr, i.e. pg_idx.
1076 		 */
1077 		*ppp = &(sptd->spt_ppa[pg_idx]);
1078 		return (0);
1079 	}
1080 
1081 	mutex_enter(&sptd->spt_lock);
1082 	/*
1083 	 * try to find pages in segment page cache with mutex
1084 	 */
1085 	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1086 	    S_WRITE, SEGP_FORCE_WIRED);
1087 	if (pplist != NULL) {
1088 		ASSERT(sptd->spt_ppa != NULL);
1089 		ASSERT(sptd->spt_ppa == pplist);
1090 		ppa = sptd->spt_ppa;
1091 		for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1092 			if (ppa[an_idx] == NULL) {
1093 				mutex_exit(&sptd->spt_lock);
1094 				seg_pinactive(seg, NULL, seg->s_base,
1095 				    sptd->spt_amp->size, ppa,
1096 				    S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1097 				*ppp = NULL;
1098 				return (ENOTSUP);
1099 			}
1100 			if ((szc = ppa[an_idx]->p_szc) != 0) {
1101 				npgs = page_get_pagecnt(szc);
1102 				an_idx = P2ROUNDUP(an_idx + 1, npgs);
1103 			} else {
1104 				an_idx++;
1105 			}
1106 		}
1107 		/*
1108 		 * Since we cache the entire DISM segment, we want to
1109 		 * set ppp to point to the first slot that corresponds
1110 		 * to the requested addr, i.e. pg_idx.
1111 		 */
1112 		mutex_exit(&sptd->spt_lock);
1113 		*ppp = &(sptd->spt_ppa[pg_idx]);
1114 		return (0);
1115 	}
1116 	if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
1117 	    SEGP_FORCE_WIRED) == SEGP_FAIL) {
1118 		mutex_exit(&sptd->spt_lock);
1119 		*ppp = NULL;
1120 		return (ENOTSUP);
1121 	}
1122 
1123 	/*
1124 	 * No need to worry about protections because DISM pages are always rw.
1125 	 */
1126 	pl = pplist = NULL;
1127 	amp = sptd->spt_amp;
1128 
1129 	/*
1130 	 * Do we need to build the ppa array?
1131 	 */
1132 	if (sptd->spt_ppa == NULL) {
1133 		pgcnt_t lpg_cnt = 0;
1134 
1135 		pl_built = 1;
1136 		tot_npages = btopr(sptd->spt_amp->size);
1137 
1138 		ASSERT(sptd->spt_pcachecnt == 0);
1139 		pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP);
1140 		pl = pplist;
1141 
1142 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1143 		for (an_idx = 0; an_idx < tot_npages; ) {
1144 			ap = anon_get_ptr(amp->ahp, an_idx);
1145 			/*
1146 			 * Cache only mlocked pages. For large pages,
1147 			 * if one (constituent) page is mlocked,
1148 			 * all pages for that large page
1149 			 * are cached as well. This allows quick
1150 			 * lookups in the ppa array.
1151 			 */
1152 			if ((ap != NULL) && (lpg_cnt != 0 ||
1153 			    (sptd->spt_ppa_lckcnt[an_idx] != 0))) {
1154 
1155 				swap_xlate(ap, &vp, &off);
1156 				pp = page_lookup(vp, off, SE_SHARED);
1157 				ASSERT(pp != NULL);
1158 				if (lpg_cnt == 0) {
1159 					lpg_cnt++;
1160 					/*
1161 					 * For a small page, we are done --
1162 					 * lpg_count is reset to 0 below.
1163 					 *
1164 					 * For a large page, we are guaranteed
1165 					 * to find the anon structures of all
1166 					 * constituent pages and a non-zero
1167 					 * lpg_cnt ensures that we don't test
1168 					 * for mlock for these. We are done
1169 					 * when lpg_count reaches (npgs + 1).
1170 					 * If we are not the first constituent
1171 					 * page, restart at the first one.
1172 					 */
1173 					npgs = page_get_pagecnt(pp->p_szc);
1174 					if (!IS_P2ALIGNED(an_idx, npgs)) {
1175 						an_idx = P2ALIGN(an_idx, npgs);
1176 						page_unlock(pp);
1177 						continue;
1178 					}
1179 				}
1180 				if (++lpg_cnt > npgs)
1181 					lpg_cnt = 0;
1182 
1183 				/*
1184 				 * availrmem is decremented only
1185 				 * for unlocked pages
1186 				 */
1187 				if (sptd->spt_ppa_lckcnt[an_idx] == 0)
1188 					claim_availrmem++;
1189 				pplist[an_idx] = pp;
1190 			}
1191 			an_idx++;
1192 		}
1193 		ANON_LOCK_EXIT(&amp->a_rwlock);
1194 
1195 		if (claim_availrmem) {
1196 			mutex_enter(&freemem_lock);
1197 			if (availrmem < tune.t_minarmem + claim_availrmem) {
1198 				mutex_exit(&freemem_lock);
1199 				ret = ENOTSUP;
1200 				claim_availrmem = 0;
1201 				goto insert_fail;
1202 			} else {
1203 				availrmem -= claim_availrmem;
1204 			}
1205 			mutex_exit(&freemem_lock);
1206 		}
1207 
1208 		sptd->spt_ppa = pl;
1209 	} else {
1210 		/*
1211 		 * We already have a valid ppa[].
1212 		 */
1213 		pl = sptd->spt_ppa;
1214 	}
1215 
1216 	ASSERT(pl != NULL);
1217 
1218 	ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1219 	    sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1220 	    segspt_reclaim);
1221 	if (ret == SEGP_FAIL) {
1222 		/*
1223 		 * seg_pinsert failed. We return
1224 		 * ENOTSUP, so that the as_pagelock() code will
1225 		 * then try the slower F_SOFTLOCK path.
1226 		 */
1227 		if (pl_built) {
1228 			/*
1229 			 * No one else has referenced the ppa[].
1230 			 * We created it and we need to destroy it.
1231 			 */
1232 			sptd->spt_ppa = NULL;
1233 		}
1234 		ret = ENOTSUP;
1235 		goto insert_fail;
1236 	}
1237 
1238 	/*
1239 	 * In either case, we increment softlockcnt on the 'real' segment.
1240 	 */
1241 	sptd->spt_pcachecnt++;
1242 	atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1243 
1244 	ppa = sptd->spt_ppa;
1245 	for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1246 		if (ppa[an_idx] == NULL) {
1247 			mutex_exit(&sptd->spt_lock);
1248 			seg_pinactive(seg, NULL, seg->s_base,
1249 			    sptd->spt_amp->size,
1250 			    pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1251 			*ppp = NULL;
1252 			return (ENOTSUP);
1253 		}
1254 		if ((szc = ppa[an_idx]->p_szc) != 0) {
1255 			npgs = page_get_pagecnt(szc);
1256 			an_idx = P2ROUNDUP(an_idx + 1, npgs);
1257 		} else {
1258 			an_idx++;
1259 		}
1260 	}
1261 	/*
1262 	 * We can now drop the sptd->spt_lock since the ppa[]
1263 	 * exists and we have incremented pcachecnt.
1264 	 */
1265 	mutex_exit(&sptd->spt_lock);
1266 
1267 	/*
1268 	 * Since we cache the entire segment, we want to
1269 	 * set ppp to point to the first slot that corresponds
1270 	 * to the requested addr, i.e. pg_idx.
1271 	 */
1272 	*ppp = &(sptd->spt_ppa[pg_idx]);
1273 	return (0);
1274 
1275 insert_fail:
1276 	/*
1277 	 * We will only reach this code if we tried and failed.
1278 	 *
1279 	 * And we can drop the lock on the dummy seg, once we've failed
1280 	 * to set up a new ppa[].
1281 	 */
1282 	mutex_exit(&sptd->spt_lock);
1283 
1284 	if (pl_built) {
1285 		if (claim_availrmem) {
1286 			mutex_enter(&freemem_lock);
1287 			availrmem += claim_availrmem;
1288 			mutex_exit(&freemem_lock);
1289 		}
1290 
1291 		/*
1292 		 * We created pl and we need to destroy it.
1293 		 */
1294 		pplist = pl;
1295 		for (an_idx = 0; an_idx < tot_npages; an_idx++) {
1296 			if (pplist[an_idx] != NULL)
1297 				page_unlock(pplist[an_idx]);
1298 		}
1299 		kmem_free(pl, sizeof (page_t *) * tot_npages);
1300 	}
1301 
1302 	if (shmd->shm_softlockcnt <= 0) {
1303 		if (AS_ISUNMAPWAIT(seg->s_as)) {
1304 			mutex_enter(&seg->s_as->a_contents);
1305 			if (AS_ISUNMAPWAIT(seg->s_as)) {
1306 				AS_CLRUNMAPWAIT(seg->s_as);
1307 				cv_broadcast(&seg->s_as->a_cv);
1308 			}
1309 			mutex_exit(&seg->s_as->a_contents);
1310 		}
1311 	}
1312 	*ppp = NULL;
1313 	return (ret);
1314 }
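
/*
 * The L_PAGELOCK/L_PAGEUNLOCK pair above is normally driven through the
 * generic as layer, roughly like this (a sketch, not code from this file):
 *
 *	struct page **pplist;
 *
 *	if (as_pagelock(as, &pplist, uaddr, len, S_WRITE) == 0) {
 *		...		(do I/O against the locked-down pages)
 *		as_pageunlock(as, pplist, uaddr, len, S_WRITE);
 *	}
 *
 * When the segment driver returns ENOTSUP here, as_pagelock() falls back to
 * the slower F_SOFTLOCK path mentioned in the comments above.
 */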
1315 
1316 
1317 
1318 /*
1319  * return locked pages over a given range.
1320  *
1321  * We will cache the entire ISM segment and save the pplist for the
1322  * entire segment in the ppa field of the underlying ISM segment structure.
1323  * Later, during a call to segspt_reclaim() we will use this ppa array
1324  * to page_unlock() all of the pages and then we will free this ppa list.
1325  */
1326 /*ARGSUSED*/
1327 static int
1328 segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
1329     struct page ***ppp, enum lock_type type, enum seg_rw rw)
1330 {
1331 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
1332 	struct seg	*sptseg = shmd->shm_sptseg;
1333 	struct spt_data *sptd = sptseg->s_data;
1334 	pgcnt_t np, page_index, npages;
1335 	caddr_t a, spt_base;
1336 	struct page **pplist, **pl, *pp;
1337 	struct anon_map *amp;
1338 	ulong_t anon_index;
1339 	int ret = ENOTSUP;
1340 	uint_t	pl_built = 0;
1341 	struct anon *ap;
1342 	struct vnode *vp;
1343 	u_offset_t off;
1344 
1345 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1346 	ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
1347 
1348 
1349 	/*
1350 	 * We want to lock/unlock the entire ISM segment. Therefore,
1351 	 * we will be using the underlying sptseg and its base address
1352 	 * and length for the caching arguments.
1353 	 */
1354 	ASSERT(sptseg);
1355 	ASSERT(sptd);
1356 
1357 	if (sptd->spt_flags & SHM_PAGEABLE) {
1358 		return (segspt_dismpagelock(seg, addr, len, ppp, type, rw));
1359 	}
1360 
1361 	page_index = seg_page(seg, addr);
1362 	npages = btopr(len);
1363 
1364 	/*
1365 	 * check if the request is larger than number of pages covered
1366 	 * by amp
1367 	 */
1368 	if (page_index + npages > btopr(sptd->spt_amp->size)) {
1369 		*ppp = NULL;
1370 		return (ENOTSUP);
1371 	}
1372 
1373 	if (type == L_PAGEUNLOCK) {
1374 
1375 		ASSERT(sptd->spt_ppa != NULL);
1376 
1377 		seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
1378 		    sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1379 
1380 		/*
1381 		 * If someone is blocked while unmapping, we purge
1382 		 * segment page cache and thus reclaim pplist synchronously
1383 		 * without waiting for seg_pasync_thread. This speeds up
1384 		 * unmapping in cases where munmap(2) is called, while
1385 		 * raw async i/o is still in progress or where a thread
1386 		 * exits on data fault in a multithreaded application.
1387 		 */
1388 		if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
1389 			segspt_purge(seg);
1390 		}
1391 		return (0);
1392 	}
1393 
1394 	/* The L_PAGELOCK case... */
1395 
1396 	/*
1397 	 * First try to find pages in segment page cache, without
1398 	 * holding the segment lock.
1399 	 */
1400 	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1401 	    S_WRITE, SEGP_FORCE_WIRED);
1402 	if (pplist != NULL) {
1403 		ASSERT(sptd->spt_ppa == pplist);
1404 		ASSERT(sptd->spt_ppa[page_index]);
1405 		/*
1406 		 * Since we cache the entire ISM segment, we want to
1407 		 * set ppp to point to the first slot that corresponds
1408 		 * to the requested addr, i.e. page_index.
1409 		 */
1410 		*ppp = &(sptd->spt_ppa[page_index]);
1411 		return (0);
1412 	}
1413 
1414 	mutex_enter(&sptd->spt_lock);
1415 
1416 	/*
1417 	 * try to find pages in segment page cache
1418 	 */
1419 	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1420 	    S_WRITE, SEGP_FORCE_WIRED);
1421 	if (pplist != NULL) {
1422 		ASSERT(sptd->spt_ppa == pplist);
1423 		/*
1424 		 * Since we cache the entire segment, we want to
1425 		 * set ppp to point to the first slot that corresponds
1426 		 * to the requested addr, i.e. page_index.
1427 		 */
1428 		mutex_exit(&sptd->spt_lock);
1429 		*ppp = &(sptd->spt_ppa[page_index]);
1430 		return (0);
1431 	}
1432 
1433 	if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
1434 	    SEGP_FORCE_WIRED) == SEGP_FAIL) {
1435 		mutex_exit(&sptd->spt_lock);
1436 		*ppp = NULL;
1437 		return (ENOTSUP);
1438 	}
1439 
1440 	/*
1441 	 * No need to worry about protections because ISM pages
1442 	 * are always rw.
1443 	 */
1444 	pl = pplist = NULL;
1445 
1446 	/*
1447 	 * Do we need to build the ppa array?
1448 	 */
1449 	if (sptd->spt_ppa == NULL) {
1450 		ASSERT(sptd->spt_ppa == pplist);
1451 
1452 		spt_base = sptseg->s_base;
1453 		pl_built = 1;
1454 
1455 		/*
1456 		 * availrmem is decremented once during anon_swap_adjust()
1457 		 * and is incremented during the anon_unresv(), which is
1458 		 * called from shm_rm_amp() when the segment is destroyed.
1459 		 */
1460 		amp = sptd->spt_amp;
1461 		ASSERT(amp != NULL);
1462 
1463 		/* pcachecnt is protected by sptd->spt_lock */
1464 		ASSERT(sptd->spt_pcachecnt == 0);
1465 		pplist = kmem_zalloc(sizeof (page_t *)
1466 		    * btopr(sptd->spt_amp->size), KM_SLEEP);
1467 		pl = pplist;
1468 
1469 		anon_index = seg_page(sptseg, spt_base);
1470 
1471 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1472 		for (a = spt_base; a < (spt_base + sptd->spt_amp->size);
1473 		    a += PAGESIZE, anon_index++, pplist++) {
1474 			ap = anon_get_ptr(amp->ahp, anon_index);
1475 			ASSERT(ap != NULL);
1476 			swap_xlate(ap, &vp, &off);
1477 			pp = page_lookup(vp, off, SE_SHARED);
1478 			ASSERT(pp != NULL);
1479 			*pplist = pp;
1480 		}
1481 		ANON_LOCK_EXIT(&amp->a_rwlock);
1482 
1483 		if (a < (spt_base + sptd->spt_amp->size)) {
1484 			ret = ENOTSUP;
1485 			goto insert_fail;
1486 		}
1487 		sptd->spt_ppa = pl;
1488 	} else {
1489 		/*
1490 		 * We already have a valid ppa[].
1491 		 */
1492 		pl = sptd->spt_ppa;
1493 	}
1494 
1495 	ASSERT(pl != NULL);
1496 
1497 	ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1498 	    sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1499 	    segspt_reclaim);
1500 	if (ret == SEGP_FAIL) {
1501 		/*
1502 		 * seg_pinsert failed. We return
1503 		 * ENOTSUP, so that the as_pagelock() code will
1504 		 * then try the slower F_SOFTLOCK path.
1505 		 */
1506 		if (pl_built) {
1507 			/*
1508 			 * No one else has referenced the ppa[].
1509 			 * We created it and we need to destroy it.
1510 			 */
1511 			sptd->spt_ppa = NULL;
1512 		}
1513 		ret = ENOTSUP;
1514 		goto insert_fail;
1515 	}
1516 
1517 	/*
1518 	 * In either case, we increment softlockcnt on the 'real' segment.
1519 	 */
1520 	sptd->spt_pcachecnt++;
1521 	atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1522 
1523 	/*
1524 	 * We can now drop the sptd->spt_lock since the ppa[]
1525 	 * exists and we have incremented pcachecnt.
1526 	 */
1527 	mutex_exit(&sptd->spt_lock);
1528 
1529 	/*
1530 	 * Since we cache the entire segment, we want to
1531 	 * set ppp to point to the first slot that corresponds
1532 	 * to the requested addr, i.e. page_index.
1533 	 */
1534 	*ppp = &(sptd->spt_ppa[page_index]);
1535 	return (0);
1536 
1537 insert_fail:
1538 	/*
1539 	 * We will only reach this code if we tried and failed.
1540 	 *
1541 	 * And we can drop the lock on the dummy seg, once we've failed
1542 	 * to set up a new ppa[].
1543 	 */
1544 	mutex_exit(&sptd->spt_lock);
1545 
1546 	if (pl_built) {
1547 		/*
1548 		 * We created pl and we need to destroy it.
1549 		 */
1550 		pplist = pl;
1551 		np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT);
1552 		while (np) {
1553 			page_unlock(*pplist);
1554 			np--;
1555 			pplist++;
1556 		}
1557 		kmem_free(pl, sizeof (page_t *) * btopr(sptd->spt_amp->size));
1558 	}
1559 	if (shmd->shm_softlockcnt <= 0) {
1560 		if (AS_ISUNMAPWAIT(seg->s_as)) {
1561 			mutex_enter(&seg->s_as->a_contents);
1562 			if (AS_ISUNMAPWAIT(seg->s_as)) {
1563 				AS_CLRUNMAPWAIT(seg->s_as);
1564 				cv_broadcast(&seg->s_as->a_cv);
1565 			}
1566 			mutex_exit(&seg->s_as->a_contents);
1567 		}
1568 	}
1569 	*ppp = NULL;
1570 	return (ret);
1571 }
1572 
1573 /*
1574  * purge any cached pages in the I/O page cache
1575  */
1576 static void
1577 segspt_purge(struct seg *seg)
1578 {
1579 	seg_ppurge(seg, NULL, SEGP_FORCE_WIRED);
1580 }
1581 
1582 static int
1583 segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
1584     enum seg_rw rw, int async)
1585 {
1586 	struct seg *seg = (struct seg *)ptag;
1587 	struct	shm_data *shmd = (struct shm_data *)seg->s_data;
1588 	struct	seg	*sptseg;
1589 	struct	spt_data *sptd;
1590 	pgcnt_t npages, i, free_availrmem = 0;
1591 	int	done = 0;
1592 
1593 #ifdef lint
1594 	addr = addr;
1595 #endif
1596 	sptseg = shmd->shm_sptseg;
1597 	sptd = sptseg->s_data;
1598 	npages = (len >> PAGESHIFT);
1599 	ASSERT(npages);
1600 	ASSERT(sptd->spt_pcachecnt != 0);
1601 	ASSERT(sptd->spt_ppa == pplist);
1602 	ASSERT(npages == btopr(sptd->spt_amp->size));
1603 	ASSERT(async || AS_LOCK_HELD(seg->s_as));
1604 
1605 	/*
1606 	 * Acquire the lock on the dummy seg and destroy the
1607 	 * ppa array IF this is the last pcachecnt.
1608 	 */
1609 	mutex_enter(&sptd->spt_lock);
1610 	if (--sptd->spt_pcachecnt == 0) {
1611 		for (i = 0; i < npages; i++) {
1612 			if (pplist[i] == NULL) {
1613 				continue;
1614 			}
1615 			if (rw == S_WRITE) {
1616 				hat_setrefmod(pplist[i]);
1617 			} else {
1618 				hat_setref(pplist[i]);
1619 			}
1620 			if ((sptd->spt_flags & SHM_PAGEABLE) &&
1621 			    (sptd->spt_ppa_lckcnt[i] == 0))
1622 				free_availrmem++;
1623 			page_unlock(pplist[i]);
1624 		}
1625 		if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) {
1626 			mutex_enter(&freemem_lock);
1627 			availrmem += free_availrmem;
1628 			mutex_exit(&freemem_lock);
1629 		}
1630 		/*
1631 	 * Since we want to cache/uncache the entire ISM segment,
1632 	 * we will track the pplist in a segspt-specific field,
1633 	 * ppa, which is initialized at the time we add an entry to
1634 		 * the cache.
1635 		 */
1636 		ASSERT(sptd->spt_pcachecnt == 0);
1637 		kmem_free(pplist, sizeof (page_t *) * npages);
1638 		sptd->spt_ppa = NULL;
1639 		sptd->spt_flags &= ~DISM_PPA_CHANGED;
1640 		sptd->spt_gen++;
1641 		cv_broadcast(&sptd->spt_cv);
1642 		done = 1;
1643 	}
1644 	mutex_exit(&sptd->spt_lock);
1645 
1646 	/*
1647 	 * If we are pcache async thread or called via seg_ppurge_wiredpp() we
1648 	 * may not hold AS lock (in this case async argument is not 0). This
1649 	 * means that if softlockcnt drops to 0 after the decrement below, the address
1650 	 * space may get freed. We can't allow that, since after the softlock
1651 	 * decrement to 0 we still need to access the as structure for possible
1652 	 * wakeup of unmap waiters. To prevent the disappearance of as we take
1653 	 * this segment's shm_segfree_syncmtx. segspt_shmfree() also takes
1654 	 * this mutex as a barrier to make sure this routine completes before
1655 	 * segment is freed.
1656 	 *
1657 	 * The second complication we have to deal with in async case is a
1658 	 * possibility of missed wake up of unmap wait thread. When we don't
1659 	 * hold as lock here we may take a_contents lock before unmap wait
1660 	 * thread that was first to see softlockcnt was still not 0. As a
1661 	 * result we'll fail to wake up an unmap wait thread. To avoid this
1662 	 * race we set the nounmapwait flag in the as structure if we drop
1663 	 * softlockcnt to 0 and async is not 0.  The unmapwait thread
1664 	 * will not block if this flag is set.
1665 	 */
1666 	if (async)
1667 		mutex_enter(&shmd->shm_segfree_syncmtx);
1668 
1669 	/*
1670 	 * Now decrement softlockcnt.
1671 	 */
1672 	ASSERT(shmd->shm_softlockcnt > 0);
1673 	atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1674 
1675 	if (shmd->shm_softlockcnt <= 0) {
1676 		if (async || AS_ISUNMAPWAIT(seg->s_as)) {
1677 			mutex_enter(&seg->s_as->a_contents);
1678 			if (async)
1679 				AS_SETNOUNMAPWAIT(seg->s_as);
1680 			if (AS_ISUNMAPWAIT(seg->s_as)) {
1681 				AS_CLRUNMAPWAIT(seg->s_as);
1682 				cv_broadcast(&seg->s_as->a_cv);
1683 			}
1684 			mutex_exit(&seg->s_as->a_contents);
1685 		}
1686 	}
1687 
1688 	if (async)
1689 		mutex_exit(&shmd->shm_segfree_syncmtx);
1690 
1691 	return (done);
1692 }
1693 
1694 /*
1695  * Do a F_SOFTUNLOCK call over the range requested.
1696  * The range must have already been F_SOFTLOCK'ed.
1697  *
1698  * The calls to acquire and release the anon map lock mutex were
1699  * removed in order to avoid a deadly embrace during a DR
1700  * memory delete operation.  (Eg. DR blocks while waiting for a
1701  * memory delete operation.  (E.g., DR blocks while waiting for an
1702  * thread that will complete the kaio and call segspt_softunlock
1703  * blocks on the anon map lock; another thread holding the anon
1704  * map lock blocks on another page lock via the segspt_shmfault
1705  * -> page_lookup -> page_lookup_create -> page_lock_es code flow.)
1706  *
1707  * The appropriateness of the removal is based upon the following:
1708  * 1. If we are holding a segment's reader lock and the page is held
1709  * shared, then the corresponding element in anonmap which points to
1710  * anon struct cannot change and there is no need to acquire the
1711  * anonymous map lock.
1712  * 2. Threads in segspt_softunlock have a reader lock on the segment
1713  * and already have the shared page lock, so we are guaranteed that
1714  * the anon map slot cannot change and therefore can call anon_get_ptr()
1715  * without grabbing the anonymous map lock.
1716  * 3. Threads that softlock a shared page break copy-on-write, even if
1717  * it's a read.  Thus cow faults can be ignored with respect to soft
1718  * unlocking, since the breaking of cow means that the anon slot(s) will
1719  * not be shared.
1720  */
1721 static void
1722 segspt_softunlock(struct seg *seg, caddr_t sptseg_addr,
1723     size_t len, enum seg_rw rw)
1724 {
1725 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
1726 	struct seg	*sptseg;
1727 	struct spt_data *sptd;
1728 	page_t *pp;
1729 	caddr_t adr;
1730 	struct vnode *vp;
1731 	u_offset_t offset;
1732 	ulong_t anon_index;
1733 	struct anon_map *amp;		/* XXX - for locknest */
1734 	struct anon *ap = NULL;
1735 	pgcnt_t npages;
1736 
1737 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1738 
1739 	sptseg = shmd->shm_sptseg;
1740 	sptd = sptseg->s_data;
1741 
1742 	/*
1743 	 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
1744 	 * and therefore their pages are SE_SHARED locked
1745 	 * for the entire life of the segment.
1746 	 */
1747 	if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) &&
1748 	    ((sptd->spt_flags & SHM_PAGEABLE) == 0)) {
1749 		goto softlock_decrement;
1750 	}
1751 
1752 	/*
1753 	 * Any thread is free to do a page_find and
1754 	 * page_unlock() on the pages within this seg.
1755 	 *
1756 	 * We are already holding the as->a_lock on the user's
1757 	 * real segment, but we need to hold the a_lock on the
1758 	 * underlying dummy as. This is mostly to satisfy the
1759 	 * underlying HAT layer.
1760 	 */
1761 	AS_LOCK_ENTER(sptseg->s_as, RW_READER);
1762 	hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len);
1763 	AS_LOCK_EXIT(sptseg->s_as);
1764 
1765 	amp = sptd->spt_amp;
1766 	ASSERT(amp != NULL);
1767 	anon_index = seg_page(sptseg, sptseg_addr);
1768 
1769 	for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) {
1770 		ap = anon_get_ptr(amp->ahp, anon_index++);
1771 		ASSERT(ap != NULL);
1772 		swap_xlate(ap, &vp, &offset);
1773 
1774 		/*
1775 		 * Use page_find() instead of page_lookup() to
1776 		 * find the page since we know that it has a
1777 		 * "shared" lock.
1778 		 */
1779 		pp = page_find(vp, offset);
1780 		ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1));
1781 		if (pp == NULL) {
1782 			panic("segspt_softunlock: "
1783 			    "addr %p, ap %p, vp %p, off %llx",
1784 			    (void *)adr, (void *)ap, (void *)vp, offset);
1785 			/*NOTREACHED*/
1786 		}
1787 
1788 		if (rw == S_WRITE) {
1789 			hat_setrefmod(pp);
1790 		} else if (rw != S_OTHER) {
1791 			hat_setref(pp);
1792 		}
1793 		page_unlock(pp);
1794 	}
1795 
1796 softlock_decrement:
1797 	npages = btopr(len);
1798 	ASSERT(shmd->shm_softlockcnt >= npages);
1799 	atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages);
1800 	if (shmd->shm_softlockcnt == 0) {
1801 		/*
1802 		 * All SOFTLOCKS are gone. Wake up any waiting
1803 		 * unmappers so they can try again to unmap.
1804 		 * Check for waiters first without the mutex
1805 		 * held so we don't always grab the mutex on
1806 		 * softunlocks.
1807 		 */
1808 		if (AS_ISUNMAPWAIT(seg->s_as)) {
1809 			mutex_enter(&seg->s_as->a_contents);
1810 			if (AS_ISUNMAPWAIT(seg->s_as)) {
1811 				AS_CLRUNMAPWAIT(seg->s_as);
1812 				cv_broadcast(&seg->s_as->a_cv);
1813 			}
1814 			mutex_exit(&seg->s_as->a_contents);
1815 		}
1816 	}
1817 }
1818 
1819 int
1820 segspt_shmattach(struct seg **segpp, void *argsp)
1821 {
1822 	struct seg *seg = *segpp;
1823 	struct shm_data *shmd_arg = (struct shm_data *)argsp;
1824 	struct shm_data *shmd;
1825 	struct anon_map *shm_amp = shmd_arg->shm_amp;
1826 	struct spt_data *sptd;
1827 	int error = 0;
1828 
1829 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1830 
1831 	shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP);
1832 	if (shmd == NULL)
1833 		return (ENOMEM);
1834 
1835 	shmd->shm_sptas = shmd_arg->shm_sptas;
1836 	shmd->shm_amp = shm_amp;
1837 	shmd->shm_sptseg = shmd_arg->shm_sptseg;
1838 
1839 	(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0,
1840 	    NULL, 0, seg->s_size);
1841 
1842 	mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
1843 
1844 	seg->s_data = (void *)shmd;
1845 	seg->s_ops = &segspt_shmops;
1846 	seg->s_szc = shmd->shm_sptseg->s_szc;
1847 	sptd = shmd->shm_sptseg->s_data;
1848 
1849 	if (sptd->spt_flags & SHM_PAGEABLE) {
1850 		if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size),
1851 		    KM_NOSLEEP)) == NULL) {
1852 			seg->s_data = (void *)NULL;
1853 			kmem_free(shmd, (sizeof (*shmd)));
1854 			return (ENOMEM);
1855 		}
1856 		shmd->shm_lckpgs = 0;
1857 		if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
1858 			if ((error = hat_share(seg->s_as->a_hat, seg->s_base,
1859 			    shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1860 			    seg->s_size, seg->s_szc)) != 0) {
1861 				kmem_free(shmd->shm_vpage,
1862 				    btopr(shm_amp->size));
1863 			}
1864 		}
1865 	} else {
1866 		error = hat_share(seg->s_as->a_hat, seg->s_base,
1867 		    shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1868 		    seg->s_size, seg->s_szc);
1869 	}
1870 	if (error) {
1871 		seg->s_szc = 0;
1872 		seg->s_data = (void *)NULL;
1873 		kmem_free(shmd, (sizeof (*shmd)));
1874 	} else {
1875 		ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1876 		shm_amp->refcnt++;
1877 		ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1878 	}
1879 	return (error);
1880 }
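
/*
 * For a pageable (DISM) attach, shm_vpage is one byte of flags per
 * page of the amp, so the allocation above is btopr(shm_amp->size)
 * bytes.  A worked example with assumed numbers (a 1 GB segment and a
 * 4 KB PAGESIZE, neither of which is implied by this file):
 *
 *	btopr(1 GB) = 1 GB / 4 KB = 262144 pages
 *	=> shm_vpage is a 262144-byte (256 KB) zeroed array, in which
 *	   DISM_PG_LOCKED is later set per page by spt_lockpages().
 */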
1881 
1882 int
1883 segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize)
1884 {
1885 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
1886 	int reclaim = 1;
1887 
1888 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1889 retry:
1890 	if (shmd->shm_softlockcnt > 0) {
1891 		if (reclaim == 1) {
1892 			segspt_purge(seg);
1893 			reclaim = 0;
1894 			goto retry;
1895 		}
1896 		return (EAGAIN);
1897 	}
1898 
1899 	if (ssize != seg->s_size) {
1900 #ifdef DEBUG
1901 		cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n",
1902 		    ssize, seg->s_size);
1903 #endif
1904 		return (EINVAL);
1905 	}
1906 
1907 	(void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK,
1908 	    NULL, 0);
1909 	hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc);
1910 
1911 	seg_free(seg);
1912 
1913 	return (0);
1914 }
1915 
1916 void
1917 segspt_shmfree(struct seg *seg)
1918 {
1919 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
1920 	struct anon_map *shm_amp = shmd->shm_amp;
1921 
1922 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1923 
1924 	(void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0,
1925 	    MC_UNLOCK, NULL, 0);
1926 
1927 	/*
1928 	 * Need to increment refcnt when attaching
1929 	 * and decrement when detaching because of dup().
1930 	 */
1931 	ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1932 	shm_amp->refcnt--;
1933 	ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1934 
1935 	if (shmd->shm_vpage) {	/* only for DISM */
1936 		kmem_free(shmd->shm_vpage, btopr(shm_amp->size));
1937 		shmd->shm_vpage = NULL;
1938 	}
1939 
1940 	/*
1941 	 * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's
1942 	 * still working with this segment without holding as lock.
1943 	 */
1944 	ASSERT(shmd->shm_softlockcnt == 0);
1945 	mutex_enter(&shmd->shm_segfree_syncmtx);
1946 	mutex_destroy(&shmd->shm_segfree_syncmtx);
1947 
1948 	kmem_free(shmd, sizeof (*shmd));
1949 }
1950 
1951 /*ARGSUSED*/
1952 int
1953 segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1954 {
1955 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1956 
1957 	/*
1958 	 * A shared page table is more than a shared mapping.
1959 	 *  An individual process sharing the page tables can't change
1960 	 *  protections because there is only one set of page tables.
1961 	 *  This will be allowed once private page tables are
1962 	 *  supported.
1963 	 */
1964 /* need to return a correct error status? */
1965 	return (0);
1966 }
1967 
1968 
1969 faultcode_t
1970 segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
1971     size_t len, enum fault_type type, enum seg_rw rw)
1972 {
1973 	struct  shm_data	*shmd = (struct shm_data *)seg->s_data;
1974 	struct  seg		*sptseg = shmd->shm_sptseg;
1975 	struct  as		*curspt = shmd->shm_sptas;
1976 	struct  spt_data	*sptd = sptseg->s_data;
1977 	pgcnt_t npages;
1978 	size_t  size;
1979 	caddr_t segspt_addr, shm_addr;
1980 	page_t  **ppa;
1981 	int	i;
1982 	ulong_t an_idx = 0;
1983 	int	err = 0;
1984 	int	dyn_ism_unmap = hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0);
1985 	size_t	pgsz;
1986 	pgcnt_t	pgcnt;
1987 	caddr_t	a;
1988 	pgcnt_t	pidx;
1989 
1990 #ifdef lint
1991 	hat = hat;
1992 #endif
1993 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1994 
1995 	/*
1996 	 * Because of the way spt is implemented,
1997 	 * the realsize of the segment does not have to be
1998 	 * equal to the segment size itself. The segment size is
1999 	 * often in multiples of a page size larger than PAGESIZE.
2000 	 * The realsize is rounded up to the nearest PAGESIZE
2001 	 * based on what the user requested. This is a bit of
2002 	 * ugliness that is historical but not easily fixed
2003 	 * without re-designing the higher levels of ISM.
2004 	 */
2005 	ASSERT(addr >= seg->s_base);
2006 	if (((addr + len) - seg->s_base) > sptd->spt_realsize)
2007 		return (FC_NOMAP);
2008 	/*
2009 	 * For all of the following cases except F_PROT, we need to
2010 	 * make any necessary adjustments to addr and len
2011 	 * and get all of the necessary page_t's into an array called ppa[].
2012 	 *
2013 	 * The code in shmat() forces base addr and len of ISM segment
2014 	 * to be aligned to largest page size supported. Therefore,
2015 	 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
2016 	 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
2017 	 * in large pagesize chunks, or else we will screw up the HAT
2018 	 * layer by calling hat_memload_array() with differing page sizes
2019 	 * over a given virtual range.
2020 	 */
2021 	pgsz = page_get_pagesize(sptseg->s_szc);
2022 	pgcnt = page_get_pagecnt(sptseg->s_szc);
2023 	shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
2024 	size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
2025 	npages = btopr(size);
2026 
2027 	/*
2028 	 * Now we need to convert from addr in segshm to addr in segspt.
2029 	 */
2030 	an_idx = seg_page(seg, shm_addr);
2031 	segspt_addr = sptseg->s_base + ptob(an_idx);
2032 
2033 	ASSERT((segspt_addr + ptob(npages)) <=
2034 	    (sptseg->s_base + sptd->spt_realsize));
2035 	ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size));
2036 
2037 	switch (type) {
2038 
2039 	case F_SOFTLOCK:
2040 
2041 		atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
2042 		/*
2043 		 * Fall through to the F_INVAL case to load up the hat layer
2044 		 * entries with the HAT_LOAD_LOCK flag.
2045 		 */
2046 		/* FALLTHRU */
2047 	case F_INVAL:
2048 
2049 		if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
2050 			return (FC_NOMAP);
2051 
2052 		ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
2053 
2054 		err = spt_anon_getpages(sptseg, segspt_addr, size, ppa);
2055 		if (err != 0) {
2056 			if (type == F_SOFTLOCK) {
2057 				atomic_add_long((ulong_t *)(
2058 				    &(shmd->shm_softlockcnt)), -npages);
2059 			}
2060 			goto dism_err;
2061 		}
2062 		AS_LOCK_ENTER(sptseg->s_as, RW_READER);
2063 		a = segspt_addr;
2064 		pidx = 0;
2065 		if (type == F_SOFTLOCK) {
2066 
2067 			/*
2068 			 * Load up the translation keeping it
2069 			 * locked and don't unlock the page.
2070 			 */
2071 			for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2072 				hat_memload_array(sptseg->s_as->a_hat,
2073 				    a, pgsz, &ppa[pidx], sptd->spt_prot,
2074 				    HAT_LOAD_LOCK | HAT_LOAD_SHARE);
2075 			}
2076 		} else {
2077 			/*
2078 			 * Migrate pages marked for migration
2079 			 */
2080 			if (lgrp_optimizations())
2081 				page_migrate(seg, shm_addr, ppa, npages);
2082 
2083 			for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2084 				hat_memload_array(sptseg->s_as->a_hat,
2085 				    a, pgsz, &ppa[pidx],
2086 				    sptd->spt_prot,
2087 				    HAT_LOAD_SHARE);
2088 			}
2089 
2090 			/*
2091 			 * And now drop the SE_SHARED lock(s).
2092 			 */
2093 			if (dyn_ism_unmap) {
2094 				for (i = 0; i < npages; i++) {
2095 					page_unlock(ppa[i]);
2096 				}
2097 			}
2098 		}
2099 
2100 		if (!dyn_ism_unmap) {
2101 			if (hat_share(seg->s_as->a_hat, shm_addr,
2102 			    curspt->a_hat, segspt_addr, ptob(npages),
2103 			    seg->s_szc) != 0) {
2104 				panic("hat_share err in DISM fault");
2105 				/* NOTREACHED */
2106 			}
2107 			if (type == F_INVAL) {
2108 				for (i = 0; i < npages; i++) {
2109 					page_unlock(ppa[i]);
2110 				}
2111 			}
2112 		}
2113 		AS_LOCK_EXIT(sptseg->s_as);
2114 dism_err:
2115 		kmem_free(ppa, npages * sizeof (page_t *));
2116 		return (err);
2117 
2118 	case F_SOFTUNLOCK:
2119 
2120 		/*
2121 		 * This is a bit ugly: we pass in the real seg pointer,
2122 		 * but the segspt_addr is the virtual address within the
2123 		 * dummy seg.
2124 		 */
2125 		segspt_softunlock(seg, segspt_addr, size, rw);
2126 		return (0);
2127 
2128 	case F_PROT:
2129 
2130 		/*
2131 		 * This takes care of the unusual case where a user
2132 		 * allocates a stack in shared memory and a register
2133 		 * window overflow is written to that stack page before
2134 		 * it is otherwise modified.
2135 		 *
2136 		 * We can get away with this because ISM segments are
2137 		 * always rw. Other than this unusual case, there
2138 		 * should be no instances of protection violations.
2139 		 */
2140 		return (0);
2141 
2142 	default:
2143 #ifdef DEBUG
2144 		panic("segspt_dismfault default type?");
2145 #else
2146 		return (FC_NOMAP);
2147 #endif
2148 	}
2149 }
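
/*
 * The alignment done at the top of segspt_dismfault() (and repeated in
 * segspt_shmfault()) can be illustrated with assumed numbers: a 4 MB
 * preferred page size for sptseg->s_szc and a 4 KB PAGESIZE, neither
 * of which is guaranteed by this file.  For a fault at
 * addr = seg->s_base + 0x5f2000 with len = 0x3000:
 *
 *	pgsz     = page_get_pagesize(sptseg->s_szc)       = 0x400000
 *	shm_addr = P2ALIGN(addr, pgsz)                    = s_base + 0x400000
 *	size     = P2ROUNDUP(addr + len - shm_addr, pgsz) = 0x400000
 *	npages   = btopr(size)                            = 1024
 *	an_idx   = seg_page(seg, shm_addr)                = 1024
 *	segspt_addr = sptseg->s_base + ptob(an_idx)
 *
 * so the HAT always sees whole 4 MB chunks even though the caller
 * faulted on only three base pages.
 */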
2150 
2151 
2152 faultcode_t
2153 segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr,
2154     size_t len, enum fault_type type, enum seg_rw rw)
2155 {
2156 	struct shm_data		*shmd = (struct shm_data *)seg->s_data;
2157 	struct seg		*sptseg = shmd->shm_sptseg;
2158 	struct as		*curspt = shmd->shm_sptas;
2159 	struct spt_data		*sptd = sptseg->s_data;
2160 	pgcnt_t npages;
2161 	size_t size;
2162 	caddr_t sptseg_addr, shm_addr;
2163 	page_t *pp, **ppa;
2164 	int	i;
2165 	u_offset_t offset;
2166 	ulong_t anon_index = 0;
2167 	struct vnode *vp;
2168 	struct anon_map *amp;		/* XXX - for locknest */
2169 	struct anon *ap = NULL;
2170 	size_t		pgsz;
2171 	pgcnt_t		pgcnt;
2172 	caddr_t		a;
2173 	pgcnt_t		pidx;
2174 	size_t		sz;
2175 
2176 #ifdef lint
2177 	hat = hat;
2178 #endif
2179 
2180 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2181 
2182 	if (sptd->spt_flags & SHM_PAGEABLE) {
2183 		return (segspt_dismfault(hat, seg, addr, len, type, rw));
2184 	}
2185 
2186 	/*
2187 	 * Because of the way spt is implemented,
2188 	 * the realsize of the segment does not have to be
2189 	 * equal to the segment size itself. The segment size is
2190 	 * often in multiples of a page size larger than PAGESIZE.
2191 	 * The realsize is rounded up to the nearest PAGESIZE
2192 	 * based on what the user requested. This is a bit of
2193 	 * ugliness that is historical but not easily fixed
2194 	 * without re-designing the higher levels of ISM.
2195 	 */
2196 	ASSERT(addr >= seg->s_base);
2197 	if (((addr + len) - seg->s_base) > sptd->spt_realsize)
2198 		return (FC_NOMAP);
2199 	/*
2200 	 * For all of the following cases except F_PROT, we need to
2201 	 * make any necessary adjustments to addr and len
2202 	 * and get all of the necessary page_t's into an array called ppa[].
2203 	 *
2204 	 * The code in shmat() forces base addr and len of ISM segment
2205 	 * to be aligned to largest page size supported. Therefore,
2206 	 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
2207 	 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
2208 	 * in large pagesize chunks, or else we will screw up the HAT
2209 	 * layer by calling hat_memload_array() with differing page sizes
2210 	 * over a given virtual range.
2211 	 */
2212 	pgsz = page_get_pagesize(sptseg->s_szc);
2213 	pgcnt = page_get_pagecnt(sptseg->s_szc);
2214 	shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
2215 	size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
2216 	npages = btopr(size);
2217 
2218 	/*
2219 	 * Now we need to convert from addr in segshm to addr in segspt.
2220 	 */
2221 	anon_index = seg_page(seg, shm_addr);
2222 	sptseg_addr = sptseg->s_base + ptob(anon_index);
2223 
2224 	/*
2225 	 * And now we may have to adjust npages downward if we have
2226 	 * exceeded the realsize of the segment or initial anon
2227 	 * allocations.
2228 	 */
2229 	if ((sptseg_addr + ptob(npages)) >
2230 	    (sptseg->s_base + sptd->spt_realsize))
2231 		size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr;
2232 
2233 	npages = btopr(size);
2234 
2235 	ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size));
2236 	ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0);
2237 
2238 	switch (type) {
2239 
2240 	case F_SOFTLOCK:
2241 
2242 		/*
2243 		 * availrmem is decremented once during anon_swap_adjust()
2244 		 * and is incremented during the anon_unresv(), which is
2245 		 * called from shm_rm_amp() when the segment is destroyed.
2246 		 */
2247 		atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
2248 		/*
2249 		 * Some platforms assume that ISM pages are SE_SHARED
2250 		 * locked for the entire life of the segment.
2251 		 */
2252 		if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0))
2253 			return (0);
2254 		/*
2255 		 * Fall through to the F_INVAL case to load up the hat layer
2256 		 * entries with the HAT_LOAD_LOCK flag.
2257 		 */
2258 
2259 		/* FALLTHRU */
2260 	case F_INVAL:
2261 
2262 		if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
2263 			return (FC_NOMAP);
2264 
2265 		/*
2266 		 * Some platforms that do NOT support DYNAMIC_ISM_UNMAP
2267 		 * may still rely on this call to hat_share(). That
2268 		 * would imply that those hat's can fault on a
2269 		 * would imply that those hats can fault on a
2270 		 * contradictory.
2271 		 */
2272 		if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2273 			if (hat_share(seg->s_as->a_hat, seg->s_base,
2274 			    curspt->a_hat, sptseg->s_base,
2275 			    sptseg->s_size, sptseg->s_szc) != 0) {
2276 				panic("hat_share error in ISM fault");
2277 				/*NOTREACHED*/
2278 			}
2279 			return (0);
2280 		}
2281 		ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP);
2282 
2283 		/*
2284 		 * I see no need to lock the real seg here,
2285 		 * because all of our work will be on the underlying
2286 		 * dummy seg.
2287 		 *
2288 		 * sptseg_addr and npages now account for large pages.
2289 		 */
2290 		amp = sptd->spt_amp;
2291 		ASSERT(amp != NULL);
2292 		anon_index = seg_page(sptseg, sptseg_addr);
2293 
2294 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2295 		for (i = 0; i < npages; i++) {
2296 			ap = anon_get_ptr(amp->ahp, anon_index++);
2297 			ASSERT(ap != NULL);
2298 			swap_xlate(ap, &vp, &offset);
2299 			pp = page_lookup(vp, offset, SE_SHARED);
2300 			ASSERT(pp != NULL);
2301 			ppa[i] = pp;
2302 		}
2303 		ANON_LOCK_EXIT(&amp->a_rwlock);
2304 		ASSERT(i == npages);
2305 
2306 		/*
2307 		 * We are already holding the as->a_lock on the user's
2308 		 * real segment, but we need to hold the a_lock on the
2309 		 * underlying dummy as. This is mostly to satisfy the
2310 		 * underlying HAT layer.
2311 		 */
2312 		AS_LOCK_ENTER(sptseg->s_as, RW_READER);
2313 		a = sptseg_addr;
2314 		pidx = 0;
2315 		if (type == F_SOFTLOCK) {
2316 			/*
2317 			 * Load up the translation keeping it
2318 			 * locked and don't unlock the page.
2319 			 */
2320 			for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2321 				sz = MIN(pgsz, ptob(npages - pidx));
2322 				hat_memload_array(sptseg->s_as->a_hat, a,
2323 				    sz, &ppa[pidx], sptd->spt_prot,
2324 				    HAT_LOAD_LOCK | HAT_LOAD_SHARE);
2325 			}
2326 		} else {
2327 			/*
2328 			 * Migrate pages marked for migration.
2329 			 */
2330 			if (lgrp_optimizations())
2331 				page_migrate(seg, shm_addr, ppa, npages);
2332 
2333 			for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2334 				sz = MIN(pgsz, ptob(npages - pidx));
2335 				hat_memload_array(sptseg->s_as->a_hat,
2336 				    a, sz, &ppa[pidx],
2337 				    sptd->spt_prot, HAT_LOAD_SHARE);
2338 			}
2339 
2340 			/*
2341 			 * And now drop the SE_SHARED lock(s).
2342 			 */
2343 			for (i = 0; i < npages; i++)
2344 				page_unlock(ppa[i]);
2345 		}
2346 		AS_LOCK_EXIT(sptseg->s_as);
2347 
2348 		kmem_free(ppa, sizeof (page_t *) * npages);
2349 		return (0);
2350 	case F_SOFTUNLOCK:
2351 
2352 		/*
2353 		 * This is a bit ugly: we pass in the real seg pointer,
2354 		 * but the sptseg_addr is the virtual address within the
2355 		 * dummy seg.
2356 		 */
2357 		segspt_softunlock(seg, sptseg_addr, ptob(npages), rw);
2358 		return (0);
2359 
2360 	case F_PROT:
2361 
2362 		/*
2363 		 * This takes care of the unusual case where a user
2364 		 * allocates a stack in shared memory and a register
2365 		 * window overflow is written to that stack page before
2366 		 * it is otherwise modified.
2367 		 *
2368 		 * We can get away with this because ISM segments are
2369 		 * always rw. Other than this unusual case, there
2370 		 * should be no instances of protection violations.
2371 		 */
2372 		return (0);
2373 
2374 	default:
2375 #ifdef DEBUG
2376 		cmn_err(CE_WARN, "segspt_shmfault default type?");
2377 #endif
2378 		return (FC_NOMAP);
2379 	}
2380 }
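
/*
 * In the non-DISM fault above, npages may be clamped by spt_realsize
 * and therefore need not be a multiple of pgcnt; that is why each
 * hat_memload_array() call uses sz = MIN(pgsz, ptob(npages - pidx)).
 * With assumed numbers (4 MB large pages, 4 KB PAGESIZE, and a clamp
 * that leaves npages = 1536):
 *
 *	pidx = 0:    sz = MIN(0x400000, ptob(1536)) = 0x400000  (full chunk)
 *	pidx = 1024: sz = MIN(0x400000, ptob(512))  = 0x200000  (tail)
 *
 * so the final call maps only the 512 pages that actually exist past
 * the last full large-page boundary.
 */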
2381 
2382 /*ARGSUSED*/
2383 static faultcode_t
2384 segspt_shmfaulta(struct seg *seg, caddr_t addr)
2385 {
2386 	return (0);
2387 }
2388 
2389 /*ARGSUSED*/
2390 static int
2391 segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta)
2392 {
2393 	return (0);
2394 }
2395 
2396 /*ARGSUSED*/
2397 static size_t
2398 segspt_shmswapout(struct seg *seg)
2399 {
2400 	return (0);
2401 }
2402 
2403 /*
2404  * duplicate the shared page tables
2405  */
2406 int
2407 segspt_shmdup(struct seg *seg, struct seg *newseg)
2408 {
2409 	struct shm_data		*shmd = (struct shm_data *)seg->s_data;
2410 	struct anon_map		*amp = shmd->shm_amp;
2411 	struct shm_data		*shmd_new;
2412 	struct seg		*spt_seg = shmd->shm_sptseg;
2413 	struct spt_data		*sptd = spt_seg->s_data;
2414 	int			error = 0;
2415 
2416 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
2417 
2418 	shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP);
2419 	newseg->s_data = (void *)shmd_new;
2420 	shmd_new->shm_sptas = shmd->shm_sptas;
2421 	shmd_new->shm_amp = amp;
2422 	shmd_new->shm_sptseg = shmd->shm_sptseg;
2423 	newseg->s_ops = &segspt_shmops;
2424 	newseg->s_szc = seg->s_szc;
2425 	ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc);
2426 
2427 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2428 	amp->refcnt++;
2429 	ANON_LOCK_EXIT(&amp->a_rwlock);
2430 
2431 	if (sptd->spt_flags & SHM_PAGEABLE) {
2432 		shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP);
2433 		shmd_new->shm_lckpgs = 0;
2434 		if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2435 			if ((error = hat_share(newseg->s_as->a_hat,
2436 			    newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR,
2437 			    seg->s_size, seg->s_szc)) != 0) {
2438 				kmem_free(shmd_new->shm_vpage,
2439 				    btopr(amp->size));
2440 			}
2441 		}
2442 		return (error);
2443 	} else {
2444 		return (hat_share(newseg->s_as->a_hat, newseg->s_base,
2445 		    shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size,
2446 		    seg->s_szc));
2447 
2448 	}
2449 }
2450 
2451 /*ARGSUSED*/
2452 int
2453 segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
2454 {
2455 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
2456 	struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2457 
2458 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2459 
2460 	/*
2461 	 * ISM segment is always rw.
2462 	 */
2463 	return (((sptd->spt_prot & prot) != prot) ? EACCES : 0);
2464 }
2465 
2466 /*
2467  * Return an array of locked large pages, for empty slots allocate
2468  * private zero-filled anon pages.
2469  */
2470 static int
2471 spt_anon_getpages(
2472 	struct seg *sptseg,
2473 	caddr_t sptaddr,
2474 	size_t len,
2475 	page_t *ppa[])
2476 {
2477 	struct  spt_data *sptd = sptseg->s_data;
2478 	struct  anon_map *amp = sptd->spt_amp;
2479 	enum	seg_rw rw = sptd->spt_prot;
2480 	uint_t	szc = sptseg->s_szc;
2481 	size_t	pg_sz, share_sz = page_get_pagesize(szc);
2482 	pgcnt_t	lp_npgs;
2483 	caddr_t	lp_addr, e_sptaddr;
2484 	uint_t	vpprot, ppa_szc = 0;
2485 	struct  vpage *vpage = NULL;
2486 	ulong_t	j, ppa_idx;
2487 	int	err, ierr = 0;
2488 	pgcnt_t	an_idx;
2489 	anon_sync_obj_t cookie;
2490 	int anon_locked = 0;
2491 	pgcnt_t amp_pgs;
2492 
2493 
2494 	ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz));
2495 	ASSERT(len != 0);
2496 
2497 	pg_sz = share_sz;
2498 	lp_npgs = btop(pg_sz);
2499 	lp_addr = sptaddr;
2500 	e_sptaddr = sptaddr + len;
2501 	an_idx = seg_page(sptseg, sptaddr);
2502 	ppa_idx = 0;
2503 
2504 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2505 
2506 	amp_pgs = page_get_pagecnt(amp->a_szc);
2507 
2508 	/*CONSTCOND*/
2509 	while (1) {
2510 		for (; lp_addr < e_sptaddr;
2511 		    an_idx += lp_npgs, lp_addr += pg_sz, ppa_idx += lp_npgs) {
2512 
2513 			/*
2514 			 * If we're currently locked, and we get to a new
2515 			 * page, unlock our current anon chunk.
2516 			 */
2517 			if (anon_locked && P2PHASE(an_idx, amp_pgs) == 0) {
2518 				anon_array_exit(&cookie);
2519 				anon_locked = 0;
2520 			}
2521 			if (!anon_locked) {
2522 				anon_array_enter(amp, an_idx, &cookie);
2523 				anon_locked = 1;
2524 			}
2525 			ppa_szc = (uint_t)-1;
2526 			ierr = anon_map_getpages(amp, an_idx, szc, sptseg,
2527 			    lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx],
2528 			    &ppa_szc, vpage, rw, 0, segvn_anypgsz, 0, kcred);
2529 
2530 			if (ierr != 0) {
2531 				if (ierr > 0) {
2532 					err = FC_MAKE_ERR(ierr);
2533 					goto lpgs_err;
2534 				}
2535 				break;
2536 			}
2537 		}
2538 		if (lp_addr == e_sptaddr) {
2539 			break;
2540 		}
2541 		ASSERT(lp_addr < e_sptaddr);
2542 
2543 		/*
2544 		 * ierr == -1 means we failed to allocate a large page,
2545 		 * so do a size down operation.
2546 		 *
2547 		 * ierr == -2 means some other process that privately shares
2548 		 * pages with this process has allocated a larger page and we
2549 		 * need to retry with larger pages. So do a size up
2550 		 * operation. This relies on the fact that large pages are
2551 		 * never partially shared i.e. if we share any constituent
2552 		 * page of a large page with another process we must share the
2553 		 * entire large page. Note this cannot happen for SOFTLOCK
2554 		 * case, unless the current address (lp_addr) is at the beginning
2555 		 * of the next page size boundary because the other process
2556 		 * couldn't have relocated locked pages.
2557 		 */
2558 		ASSERT(ierr == -1 || ierr == -2);
2559 		if (segvn_anypgsz) {
2560 			ASSERT(ierr == -2 || szc != 0);
2561 			ASSERT(ierr == -1 || szc < sptseg->s_szc);
2562 			szc = (ierr == -1) ? szc - 1 : szc + 1;
2563 		} else {
2564 			/*
2565 			 * For faults and segvn_anypgsz == 0
2566 			 * we need to be careful not to loop forever
2567 			 * if an existing page is found with a szc other
2568 			 * than 0 or seg->s_szc. This could be due
2569 			 * to page relocations on behalf of DR or
2570 			 * more likely large page creation. For this
2571 			 * case simply re-size to existing page's szc
2572 			 * if returned by anon_map_getpages().
2573 			 */
2574 			if (ppa_szc == (uint_t)-1) {
2575 				szc = (ierr == -1) ? 0 : sptseg->s_szc;
2576 			} else {
2577 				ASSERT(ppa_szc <= sptseg->s_szc);
2578 				ASSERT(ierr == -2 || ppa_szc < szc);
2579 				ASSERT(ierr == -1 || ppa_szc > szc);
2580 				szc = ppa_szc;
2581 			}
2582 		}
2583 		pg_sz = page_get_pagesize(szc);
2584 		lp_npgs = btop(pg_sz);
2585 		ASSERT(IS_P2ALIGNED(lp_addr, pg_sz));
2586 	}
2587 	if (anon_locked) {
2588 		anon_array_exit(&cookie);
2589 	}
2590 	ANON_LOCK_EXIT(&amp->a_rwlock);
2591 	return (0);
2592 
2593 lpgs_err:
2594 	if (anon_locked) {
2595 		anon_array_exit(&cookie);
2596 	}
2597 	ANON_LOCK_EXIT(&amp->a_rwlock);
2598 	for (j = 0; j < ppa_idx; j++)
2599 		page_unlock(ppa[j]);
2600 	return (err);
2601 }
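
/*
 * The retry loop in spt_anon_getpages() adjusts szc based on the
 * negative return from anon_map_getpages(): -1 means the large page
 * could not be allocated (size down), -2 means another process already
 * has a larger page there (size up).  A hypothetical walk-through with
 * segvn_anypgsz != 0 and an initial szc of 3 (the size-to-szc mapping
 * is platform dependent and only assumed here):
 *
 *	szc = 3: anon_map_getpages() comes back -1 (no large page free)
 *	szc = 2: comes back -1 again
 *	szc = 1: succeeds; pg_sz and lp_npgs shrink to match, and the
 *	         outer loop continues from the same lp_addr
 *
 * With segvn_anypgsz == 0 the code instead jumps straight to the szc
 * reported back in ppa_szc (or to 0 / sptseg->s_szc when none was
 * reported), so it never walks through sizes it cannot use.
 */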
2602 
2603 /*
2604  * count the number of bytes in a set of spt pages that are currently not
2605  * locked
2606  */
2607 static rctl_qty_t
2608 spt_unlockedbytes(pgcnt_t npages, page_t **ppa)
2609 {
2610 	ulong_t	i;
2611 	rctl_qty_t unlocked = 0;
2612 
2613 	for (i = 0; i < npages; i++) {
2614 		if (ppa[i]->p_lckcnt == 0)
2615 			unlocked += PAGESIZE;
2616 	}
2617 	return (unlocked);
2618 }
2619 
2620 extern	u_longlong_t randtick(void);
2621 /* number of locks to reserve/skip by spt_lockpages() and spt_unlockpages() */
2622 #define	NLCK	(NCPU_P2)
2623 /* Random number with a range [0, n-1], n must be power of two */
2624 #define	RAND_P2(n)	\
2625 	((((long)curthread >> PTR24_LSB) ^ (long)randtick()) & ((n) - 1))
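
/*
 * RAND_P2() mixes low pointer bits of curthread with randtick() and
 * masks with (n - 1), so it is only meaningful when n is a power of
 * two.  For example, if NCPU_P2 were 8 (an assumed value), then
 * RAND_P2(NLCK) yields a value in [0, 7] and
 * nlck = NLCK + RAND_P2(NLCK) falls in [8, 15], which is what keeps
 * competing threads from refilling their lock reservations in
 * lockstep.
 */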
2626 
2627 int
2628 spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2629     page_t **ppa, ulong_t *lockmap, size_t pos,
2630     rctl_qty_t *locked)
2631 {
2632 	struct	shm_data *shmd = seg->s_data;
2633 	struct	spt_data *sptd = shmd->shm_sptseg->s_data;
2634 	ulong_t	i;
2635 	int	kernel;
2636 	pgcnt_t	nlck = 0;
2637 	int	rv = 0;
2638 	int	use_reserved = 1;
2639 
2640 	/* return the number of bytes actually locked */
2641 	*locked = 0;
2642 
2643 	/*
2644 	 * To avoid contention on freemem_lock, availrmem and pages_locked
2645 	 * global counters are updated only every nlck locked pages instead of
2646 	 * every time.  Reserve nlck locks up front and deduct from this
2647 	 * reservation for each page that requires a lock.  When the reservation
2648 	 * is consumed, reserve again.  nlck is randomized, so the competing
2649 	 * threads do not fall into a cyclic lock contention pattern. When
2650 	 * memory is low, the lock ahead is disabled, and instead page_pp_lock()
2651 	 * is used to lock pages.
2652 	 */
2653 	for (i = 0; i < npages; anon_index++, pos++, i++) {
2654 		if (nlck == 0 && use_reserved == 1) {
2655 			nlck = NLCK + RAND_P2(NLCK);
2656 			/* if fewer loops left, decrease nlck */
2657 			nlck = MIN(nlck, npages - i);
2658 			/*
2659 			 * Reserve nlck locks up front and deduct from this
2660 			 * reservation for each page that requires a lock.  When
2661 			 * the reservation is consumed, reserve again.
2662 			 */
2663 			mutex_enter(&freemem_lock);
2664 			if ((availrmem - nlck) < pages_pp_maximum) {
2665 				/* Do not do advance memory reserves */
2666 				use_reserved = 0;
2667 			} else {
2668 				availrmem	-= nlck;
2669 				pages_locked	+= nlck;
2670 			}
2671 			mutex_exit(&freemem_lock);
2672 		}
2673 		if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) {
2674 			if (sptd->spt_ppa_lckcnt[anon_index] <
2675 			    (ushort_t)DISM_LOCK_MAX) {
2676 				if (++sptd->spt_ppa_lckcnt[anon_index] ==
2677 				    (ushort_t)DISM_LOCK_MAX) {
2678 					cmn_err(CE_WARN,
2679 					    "DISM page lock limit "
2680 					    "reached on DISM offset 0x%lx\n",
2681 					    anon_index << PAGESHIFT);
2682 				}
2683 				kernel = (sptd->spt_ppa &&
2684 				    sptd->spt_ppa[anon_index]);
2685 				if (!page_pp_lock(ppa[i], 0, kernel ||
2686 				    use_reserved)) {
2687 					sptd->spt_ppa_lckcnt[anon_index]--;
2688 					rv = EAGAIN;
2689 					break;
2690 				}
2691 				/* if this is a newly locked page, count it */
2692 				if (ppa[i]->p_lckcnt == 1) {
2693 					if (kernel == 0 && use_reserved == 1)
2694 						nlck--;
2695 					*locked += PAGESIZE;
2696 				}
2697 				shmd->shm_lckpgs++;
2698 				shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED;
2699 				if (lockmap != NULL)
2700 					BT_SET(lockmap, pos);
2701 			}
2702 		}
2703 	}
2704 	/* Return unused lock reservation */
2705 	if (nlck != 0 && use_reserved == 1) {
2706 		mutex_enter(&freemem_lock);
2707 		availrmem	+= nlck;
2708 		pages_locked	-= nlck;
2709 		mutex_exit(&freemem_lock);
2710 	}
2711 
2712 	return (rv);
2713 }
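
/*
 * The reservation handling in spt_lockpages() boils down to the
 * following shape (a condensed sketch, not a second implementation):
 *
 *	for each page to lock:
 *		if (reservation empty && use_reserved)
 *			reserve NLCK + RAND_P2(NLCK) locks under
 *			freemem_lock (or fall back to per-page
 *			accounting when availrmem is low);
 *		lock the page with page_pp_lock();
 *		if the page became newly locked, consume one
 *		reserved lock;
 *	return whatever is left of the reservation under freemem_lock;
 *
 * With NCPU_P2 at, say, 8 (assumed), a thread locking thousands of
 * pages touches freemem_lock roughly once per 8-15 pages instead of
 * once per page.
 */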
2714 
2715 int
2716 spt_unlockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2717     rctl_qty_t *unlocked)
2718 {
2719 	struct shm_data	*shmd = seg->s_data;
2720 	struct spt_data	*sptd = shmd->shm_sptseg->s_data;
2721 	struct anon_map	*amp = sptd->spt_amp;
2722 	struct anon	*ap;
2723 	struct vnode	*vp;
2724 	u_offset_t	off;
2725 	struct page	*pp;
2726 	int		kernel;
2727 	anon_sync_obj_t	cookie;
2728 	ulong_t		i;
2729 	pgcnt_t		nlck = 0;
2730 	pgcnt_t		nlck_limit = NLCK;
2731 
2732 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2733 	for (i = 0; i < npages; i++, anon_index++) {
2734 		if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
2735 			anon_array_enter(amp, anon_index, &cookie);
2736 			ap = anon_get_ptr(amp->ahp, anon_index);
2737 			ASSERT(ap);
2738 
2739 			swap_xlate(ap, &vp, &off);
2740 			anon_array_exit(&cookie);
2741 			pp = page_lookup(vp, off, SE_SHARED);
2742 			ASSERT(pp);
2743 			/*
2744 			 * availrmem is decremented only for pages which are not
2745 			 * in the seg pcache; for pages in the seg pcache,
2746 			 * availrmem was decremented in _dismpagelock().
2747 			 */
2748 			kernel = (sptd->spt_ppa && sptd->spt_ppa[anon_index]);
2749 			ASSERT(pp->p_lckcnt > 0);
2750 
2751 			/*
2752 			 * Unlock the page but do not change availrmem here;
2753 			 * we update it ourselves every nlck loops.
2754 			 */
2755 			page_pp_unlock(pp, 0, 1);
2756 			if (pp->p_lckcnt == 0) {
2757 				if (kernel == 0)
2758 					nlck++;
2759 				*unlocked += PAGESIZE;
2760 			}
2761 			page_unlock(pp);
2762 			shmd->shm_vpage[anon_index] &= ~DISM_PG_LOCKED;
2763 			sptd->spt_ppa_lckcnt[anon_index]--;
2764 			shmd->shm_lckpgs--;
2765 		}
2766 
2767 		/*
2768 		 * To reduce freemem_lock contention, do not update availrmem
2769 		 * until at least NLCK pages have been unlocked.
2770 		 * 1. No need to update if nlck is zero
2771 		 * 2. Always update on the last iteration
2772 		 */
2773 		if (nlck > 0 && (nlck == nlck_limit || i == npages - 1)) {
2774 			mutex_enter(&freemem_lock);
2775 			availrmem	+= nlck;
2776 			pages_locked	-= nlck;
2777 			mutex_exit(&freemem_lock);
2778 			nlck = 0;
2779 			nlck_limit = NLCK + RAND_P2(NLCK);
2780 		}
2781 	}
2782 	ANON_LOCK_EXIT(&amp->a_rwlock);
2783 
2784 	return (0);
2785 }
2786 
2787 /*ARGSUSED*/
2788 static int
2789 segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
2790     int attr, int op, ulong_t *lockmap, size_t pos)
2791 {
2792 	struct shm_data *shmd = seg->s_data;
2793 	struct seg	*sptseg = shmd->shm_sptseg;
2794 	struct spt_data *sptd = sptseg->s_data;
2795 	struct kshmid	*sp = sptd->spt_amp->a_sp;
2796 	pgcnt_t		npages, a_npages;
2797 	page_t		**ppa;
2798 	pgcnt_t		an_idx, a_an_idx, ppa_idx;
2799 	caddr_t		spt_addr, a_addr;	/* spt and aligned address */
2800 	size_t		a_len;			/* aligned len */
2801 	size_t		share_sz;
2802 	ulong_t		i;
2803 	int		sts = 0;
2804 	rctl_qty_t	unlocked = 0;
2805 	rctl_qty_t	locked = 0;
2806 	struct proc	*p = curproc;
2807 	kproject_t	*proj;
2808 
2809 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2810 	ASSERT(sp != NULL);
2811 
2812 	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
2813 		return (0);
2814 	}
2815 
2816 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2817 	an_idx = seg_page(seg, addr);
2818 	npages = btopr(len);
2819 
2820 	if (an_idx + npages > btopr(shmd->shm_amp->size)) {
2821 		return (ENOMEM);
2822 	}
2823 
2824 	/*
2825 	 * A shm's project never changes, so no lock needed.
2826 	 * The shm has a hold on the project, so it will not go away.
2827 	 * Since we have a mapping to shm within this zone, we know
2828 	 * that the zone will not go away.
2829 	 */
2830 	proj = sp->shm_perm.ipc_proj;
2831 
2832 	if (op == MC_LOCK) {
2833 
2834 		/*
2835 		 * Need to align the addr and size request if they are not
2836 		 * aligned, so we can always allocate large page(s); however,
2837 		 * we only lock what was requested in the initial request.
2838 		 */
2839 		share_sz = page_get_pagesize(sptseg->s_szc);
2840 		a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz);
2841 		a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)),
2842 		    share_sz);
2843 		a_npages = btop(a_len);
2844 		a_an_idx = seg_page(seg, a_addr);
2845 		spt_addr = sptseg->s_base + ptob(a_an_idx);
2846 		ppa_idx = an_idx - a_an_idx;
2847 
2848 		if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages),
2849 		    KM_NOSLEEP)) == NULL) {
2850 			return (ENOMEM);
2851 		}
2852 
2853 		/*
2854 		 * Don't cache any new pages for IO and
2855 		 * flush any cached pages.
2856 		 */
2857 		mutex_enter(&sptd->spt_lock);
2858 		if (sptd->spt_ppa != NULL)
2859 			sptd->spt_flags |= DISM_PPA_CHANGED;
2860 
2861 		sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa);
2862 		if (sts != 0) {
2863 			mutex_exit(&sptd->spt_lock);
2864 			kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2865 			return (sts);
2866 		}
2867 
2868 		mutex_enter(&sp->shm_mlock);
2869 		/* enforce locked memory rctl */
2870 		unlocked = spt_unlockedbytes(npages, &ppa[ppa_idx]);
2871 
2872 		mutex_enter(&p->p_lock);
2873 		if (rctl_incr_locked_mem(p, proj, unlocked, 0)) {
2874 			mutex_exit(&p->p_lock);
2875 			sts = EAGAIN;
2876 		} else {
2877 			mutex_exit(&p->p_lock);
2878 			sts = spt_lockpages(seg, an_idx, npages,
2879 			    &ppa[ppa_idx], lockmap, pos, &locked);
2880 
2881 			/*
2882 			 * correct locked count if not all pages could be
2883 			 * locked
2884 			 */
2885 			if ((unlocked - locked) > 0) {
2886 				rctl_decr_locked_mem(NULL, proj,
2887 				    (unlocked - locked), 0);
2888 			}
2889 		}
2890 		/*
2891 		 * unlock pages
2892 		 */
2893 		for (i = 0; i < a_npages; i++)
2894 			page_unlock(ppa[i]);
2895 		if (sptd->spt_ppa != NULL)
2896 			sptd->spt_flags |= DISM_PPA_CHANGED;
2897 		mutex_exit(&sp->shm_mlock);
2898 		mutex_exit(&sptd->spt_lock);
2899 
2900 		kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2901 
2902 	} else if (op == MC_UNLOCK) { /* unlock */
2903 		page_t		**ppa;
2904 
2905 		mutex_enter(&sptd->spt_lock);
2906 		if (shmd->shm_lckpgs == 0) {
2907 			mutex_exit(&sptd->spt_lock);
2908 			return (0);
2909 		}
2910 		/*
2911 		 * Don't cache new IO pages.
2912 		 */
2913 		if (sptd->spt_ppa != NULL)
2914 			sptd->spt_flags |= DISM_PPA_CHANGED;
2915 
2916 		mutex_enter(&sp->shm_mlock);
2917 		sts = spt_unlockpages(seg, an_idx, npages, &unlocked);
2918 		if ((ppa = sptd->spt_ppa) != NULL)
2919 			sptd->spt_flags |= DISM_PPA_CHANGED;
2920 		mutex_exit(&sptd->spt_lock);
2921 
2922 		rctl_decr_locked_mem(NULL, proj, unlocked, 0);
2923 		mutex_exit(&sp->shm_mlock);
2924 
2925 		if (ppa != NULL)
2926 			seg_ppurge_wiredpp(ppa);
2927 	}
2928 	return (sts);
2929 }
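
/*
 * The MC_LOCK accounting above charges the project rctl before locking
 * and then refunds the part that could not be locked.  With assumed
 * numbers: the request covers 100 pages, 40 of which are already
 * locked, and spt_lockpages() manages to lock only 50 more before
 * hitting EAGAIN:
 *
 *	unlocked = spt_unlockedbytes(...)  = 60 * PAGESIZE   (charged)
 *	locked   (from spt_lockpages())    = 50 * PAGESIZE
 *	refund   = unlocked - locked       = 10 * PAGESIZE
 *	          -> rctl_decr_locked_mem(NULL, proj, refund, 0)
 *
 * so the project's locked-memory usage ends up reflecting only the
 * pages that are actually locked.
 */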
2930 
2931 /*ARGSUSED*/
2932 int
2933 segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
2934 {
2935 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
2936 	struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2937 	spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1;
2938 
2939 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2940 
2941 	/*
2942 	 * ISM segment is always rw.
2943 	 */
2944 	while (--pgno >= 0)
2945 		*protv++ = sptd->spt_prot;
2946 	return (0);
2947 }
2948 
2949 /*ARGSUSED*/
2950 u_offset_t
2951 segspt_shmgetoffset(struct seg *seg, caddr_t addr)
2952 {
2953 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2954 
2955 	/* Offset does not matter in ISM memory */
2956 
2957 	return ((u_offset_t)0);
2958 }
2959 
2960 /* ARGSUSED */
2961 int
2962 segspt_shmgettype(struct seg *seg, caddr_t addr)
2963 {
2964 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
2965 	struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2966 
2967 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2968 
2969 	/*
2970 	 * The shared memory mapping is always MAP_SHARED; swap is only
2971 	 * reserved for DISM.
2972 	 */
2973 	return (MAP_SHARED |
2974 	    ((sptd->spt_flags & SHM_PAGEABLE) ? 0 : MAP_NORESERVE));
2975 }
2976 
2977 /*ARGSUSED*/
2978 int
2979 segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
2980 {
2981 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
2982 	struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2983 
2984 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2985 
2986 	*vpp = sptd->spt_vp;
2987 	return (0);
2988 }
2989 
2990 /*
2991  * We need to wait for pending IO to complete to a DISM segment in order for
2992  * pages to get kicked out of the seg_pcache.  120 seconds should be more
2993  * than enough time to wait.
2994  */
2995 static clock_t spt_pcache_wait = 120;
2996 
2997 /*ARGSUSED*/
2998 static int
2999 segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
3000 {
3001 	struct shm_data	*shmd = (struct shm_data *)seg->s_data;
3002 	struct spt_data	*sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
3003 	struct anon_map	*amp;
3004 	pgcnt_t pg_idx;
3005 	ushort_t gen;
3006 	clock_t	end_lbolt;
3007 	int writer;
3008 	page_t **ppa;
3009 
3010 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
3011 
3012 	if (behav == MADV_FREE || behav == MADV_PURGE) {
3013 		if ((sptd->spt_flags & SHM_PAGEABLE) == 0)
3014 			return (0);
3015 
3016 		amp = sptd->spt_amp;
3017 		pg_idx = seg_page(seg, addr);
3018 
3019 		mutex_enter(&sptd->spt_lock);
3020 		if ((ppa = sptd->spt_ppa) == NULL) {
3021 			mutex_exit(&sptd->spt_lock);
3022 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3023 			(void) anon_disclaim(amp, pg_idx, len, behav, NULL);
3024 			ANON_LOCK_EXIT(&amp->a_rwlock);
3025 			return (0);
3026 		}
3027 
3028 		sptd->spt_flags |= DISM_PPA_CHANGED;
3029 		gen = sptd->spt_gen;
3030 
3031 		mutex_exit(&sptd->spt_lock);
3032 
3033 		/*
3034 		 * Purge all DISM cached pages
3035 		 */
3036 		seg_ppurge_wiredpp(ppa);
3037 
3038 		/*
3039 		 * Drop the AS_LOCK so that other threads can grab it
3040 		 * in the as_pageunlock path and hopefully get the segment
3041 		 * kicked out of the seg_pcache.  We bump the shm_softlockcnt
3042 		 * to keep this segment resident.
3043 		 */
3044 		writer = AS_WRITE_HELD(seg->s_as);
3045 		atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
3046 		AS_LOCK_EXIT(seg->s_as);
3047 
3048 		mutex_enter(&sptd->spt_lock);
3049 
3050 		end_lbolt = ddi_get_lbolt() + (hz * spt_pcache_wait);
3051 
3052 		/*
3053 		 * Try to wait for pages to get kicked out of the seg_pcache.
3054 		 */
3055 		while (sptd->spt_gen == gen &&
3056 		    (sptd->spt_flags & DISM_PPA_CHANGED) &&
3057 		    ddi_get_lbolt() < end_lbolt) {
3058 			if (!cv_timedwait_sig(&sptd->spt_cv,
3059 			    &sptd->spt_lock, end_lbolt)) {
3060 				break;
3061 			}
3062 		}
3063 
3064 		mutex_exit(&sptd->spt_lock);
3065 
3066 		/* Regrab the AS_LOCK and release our hold on the segment */
3067 		AS_LOCK_ENTER(seg->s_as, writer ? RW_WRITER : RW_READER);
3068 		atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
3069 		if (shmd->shm_softlockcnt <= 0) {
3070 			if (AS_ISUNMAPWAIT(seg->s_as)) {
3071 				mutex_enter(&seg->s_as->a_contents);
3072 				if (AS_ISUNMAPWAIT(seg->s_as)) {
3073 					AS_CLRUNMAPWAIT(seg->s_as);
3074 					cv_broadcast(&seg->s_as->a_cv);
3075 				}
3076 				mutex_exit(&seg->s_as->a_contents);
3077 			}
3078 		}
3079 
3080 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3081 		(void) anon_disclaim(amp, pg_idx, len, behav, NULL);
3082 		ANON_LOCK_EXIT(&amp->a_rwlock);
3083 	} else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP ||
3084 	    behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) {
3085 		int			already_set;
3086 		ulong_t			anon_index;
3087 		lgrp_mem_policy_t	policy;
3088 		caddr_t			shm_addr;
3089 		size_t			share_size;
3090 		size_t			size;
3091 		struct seg		*sptseg = shmd->shm_sptseg;
3092 		caddr_t			sptseg_addr;
3093 
3094 		/*
3095 		 * Align address and length to page size of underlying segment
3096 		 */
3097 		share_size = page_get_pagesize(shmd->shm_sptseg->s_szc);
3098 		shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size);
3099 		size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)),
3100 		    share_size);
3101 
3102 		amp = shmd->shm_amp;
3103 		anon_index = seg_page(seg, shm_addr);
3104 
3105 		/*
3106 		 * And now we may have to adjust size downward if we have
3107 		 * exceeded the realsize of the segment or initial anon
3108 		 * allocations.
3109 		 */
3110 		sptseg_addr = sptseg->s_base + ptob(anon_index);
3111 		if ((sptseg_addr + size) >
3112 		    (sptseg->s_base + sptd->spt_realsize))
3113 			size = (sptseg->s_base + sptd->spt_realsize) -
3114 			    sptseg_addr;
3115 
3116 		/*
3117 		 * Set memory allocation policy for this segment
3118 		 */
3119 		policy = lgrp_madv_to_policy(behav, len, MAP_SHARED);
3120 		already_set = lgrp_shm_policy_set(policy, amp, anon_index,
3121 		    NULL, 0, len);
3122 
3123 		/*
3124 		 * If random memory allocation policy set already,
3125 		 * don't bother reapplying it.
3126 		 */
3127 		if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy))
3128 			return (0);
3129 
3130 		/*
3131 		 * Mark any existing pages in the given range for
3132 		 * migration, flushing the I/O page cache and using the
3133 		 * underlying segment to calculate the anon index and to
3134 		 * get the anonmap and vnode pointer.
3135 		 */
3136 		if (shmd->shm_softlockcnt > 0)
3137 			segspt_purge(seg);
3138 
3139 		page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0);
3140 	}
3141 
3142 	return (0);
3143 }
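
/*
 * The MADV_FREE/MADV_PURGE path above has to drop the AS lock while it
 * waits for the pcache to let go of the segment, so it pins the
 * segment with shm_softlockcnt for the duration.  The shape of that
 * dance, condensed (a sketch, not a second implementation):
 *
 *	writer = AS_WRITE_HELD(seg->s_as);
 *	atomic_inc_ulong(&shmd->shm_softlockcnt);	(pin the segment)
 *	AS_LOCK_EXIT(seg->s_as);
 *	... cv_timedwait_sig() on spt_cv until DISM_PPA_CHANGED clears,
 *	    spt_gen moves on, or the deadline passes ...
 *	AS_LOCK_ENTER(seg->s_as, writer ? RW_WRITER : RW_READER);
 *	atomic_dec_ulong(&shmd->shm_softlockcnt);
 *
 * The deadline is ddi_get_lbolt() + hz * spt_pcache_wait; with an
 * assumed hz of 100 that is 12000 ticks beyond the current lbolt,
 * i.e. the 120 seconds noted above spt_pcache_wait.
 */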
3144 
3145 /*ARGSUSED*/
3146 void
3147 segspt_shmdump(struct seg *seg)
3148 {
3149 	/* no-op for ISM segment */
3150 }
3151 
3152 /*ARGSUSED*/
3153 static int
3154 segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
3155 {
3156 	return (ENOTSUP);
3157 }
3158 
3159 /*
3160  * get a memory ID for an addr in a given segment
3161  */
3162 static int
3163 segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
3164 {
3165 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
3166 	struct anon	*ap;
3167 	size_t		anon_index;
3168 	struct anon_map	*amp = shmd->shm_amp;
3169 	struct spt_data	*sptd = shmd->shm_sptseg->s_data;
3170 	struct seg	*sptseg = shmd->shm_sptseg;
3171 	anon_sync_obj_t	cookie;
3172 
3173 	anon_index = seg_page(seg, addr);
3174 
3175 	if (addr > (seg->s_base + sptd->spt_realsize)) {
3176 		return (EFAULT);
3177 	}
3178 
3179 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3180 	anon_array_enter(amp, anon_index, &cookie);
3181 	ap = anon_get_ptr(amp->ahp, anon_index);
3182 	if (ap == NULL) {
3183 		struct page *pp;
3184 		caddr_t spt_addr = sptseg->s_base + ptob(anon_index);
3185 
3186 		pp = anon_zero(sptseg, spt_addr, &ap, kcred);
3187 		if (pp == NULL) {
3188 			anon_array_exit(&cookie);
3189 			ANON_LOCK_EXIT(&amp->a_rwlock);
3190 			return (ENOMEM);
3191 		}
3192 		(void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
3193 		page_unlock(pp);
3194 	}
3195 	anon_array_exit(&cookie);
3196 	ANON_LOCK_EXIT(&amp->a_rwlock);
3197 	memidp->val[0] = (uintptr_t)ap;
3198 	memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
3199 	return (0);
3200 }
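
/*
 * The memid handed back above is simply the anon pointer plus the
 * byte offset within the page:
 *
 *	memidp->val[0] = (uintptr_t)ap;
 *	memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
 *
 * For example (assuming a 4 KB PAGESIZE), addr = seg->s_base + 0x12345
 * yields val[1] = 0x345, while val[0] identifies the anon slot that
 * anon_zero() created on demand if it did not already exist.
 */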
3201 
3202 /*
3203  * Get memory allocation policy info for specified address in given segment
3204  */
3205 static lgrp_mem_policy_info_t *
3206 segspt_shmgetpolicy(struct seg *seg, caddr_t addr)
3207 {
3208 	struct anon_map		*amp;
3209 	ulong_t			anon_index;
3210 	lgrp_mem_policy_info_t	*policy_info;
3211 	struct shm_data		*shm_data;
3212 
3213 	ASSERT(seg != NULL);
3214 
3215 	/*
3216 	 * Get anon_map from segshm
3217 	 *
3218 	 * Assume that no lock needs to be held on anon_map, since
3219 	 * it should be protected by its reference count, which must be
3220 	 * nonzero for an existing segment.
3221 	 * Need to grab the readers lock on the policy tree, though.
3222 	 */
3223 	shm_data = (struct shm_data *)seg->s_data;
3224 	if (shm_data == NULL)
3225 		return (NULL);
3226 	amp = shm_data->shm_amp;
3227 	ASSERT(amp->refcnt != 0);
3228 
3229 	/*
3230 	 * Get policy info
3231 	 *
3232 	 * Assume starting anon index of 0
3233 	 */
3234 	anon_index = seg_page(seg, addr);
3235 	policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
3236 
3237 	return (policy_info);
3238 }
3239 
3240 /*ARGSUSED*/
3241 static int
3242 segspt_shmcapable(struct seg *seg, segcapability_t capability)
3243 {
3244 	return (0);
3245 }
3246