1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2018 Joyent, Inc.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
29/*	  All Rights Reserved  	*/
30
31/*
32 * University Copyright- Copyright (c) 1982, 1986, 1988
33 * The Regents of the University of California
34 * All Rights Reserved
35 *
36 * University Acknowledgment- Portions of this document are derived from
37 * software developed by the University of California, Berkeley, and its
38 * contributors.
39 */
40
41/*
42 * VM - address spaces.
43 */
44
45#include <sys/types.h>
46#include <sys/t_lock.h>
47#include <sys/param.h>
48#include <sys/errno.h>
49#include <sys/systm.h>
50#include <sys/mman.h>
51#include <sys/sysmacros.h>
52#include <sys/cpuvar.h>
53#include <sys/sysinfo.h>
54#include <sys/kmem.h>
55#include <sys/vnode.h>
56#include <sys/vmsystm.h>
57#include <sys/cmn_err.h>
58#include <sys/debug.h>
59#include <sys/tnf_probe.h>
60#include <sys/vtrace.h>
61
62#include <vm/hat.h>
63#include <vm/as.h>
64#include <vm/seg.h>
65#include <vm/seg_vn.h>
66#include <vm/seg_dev.h>
67#include <vm/seg_kmem.h>
68#include <vm/seg_map.h>
69#include <vm/seg_spt.h>
70#include <vm/seg_hole.h>
71#include <vm/page.h>
72
73clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
74
75static struct kmem_cache *as_cache;
76
77static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
78static void as_clearwatchprot(struct as *, caddr_t, size_t);
79
80
81/*
82 * Verifying the segment lists is very time-consuming; it may not be
83 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
84 */
85#ifdef DEBUG
86#define	VERIFY_SEGLIST
87int do_as_verify = 0;
88#endif
89
90/*
91 * Allocate a new callback data structure entry and fill in the events of
92 * interest, the address range of interest, and the callback argument.
93 * Link the entry on the as->a_callbacks list. A callback entry for the
94 * entire address space may be specified with vaddr = 0 and size = -1.
95 *
96 * CALLERS RESPONSIBILITY: If not calling from within the process context for
97 * the specified as, the caller must guarantee persistence of the specified as
98 * for the duration of this function (eg. pages being locked within the as
99 * will guarantee persistence).
100 */
101int
102as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
103    caddr_t vaddr, size_t size, int sleepflag)
104{
105	struct as_callback 	*current_head, *cb;
106	caddr_t 		saddr;
107	size_t 			rsize;
108
109	/* callback function and an event are mandatory */
110	if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
111		return (EINVAL);
112
113	/* Adding a callback after as_free has been called is not allowed */
114	if (as == &kas)
115		return (ENOMEM);
116
117	/*
118	 * vaddr = 0 and size = -1 is used to indicate that the callback range
119	 * is the entire address space so no rounding is done in that case.
120	 */
121	if (size != -1) {
122		saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
123		rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
124		    (size_t)saddr;
125		/* check for wraparound */
126		if (saddr + rsize < saddr)
127			return (ENOMEM);
128	} else {
129		if (vaddr != 0)
130			return (EINVAL);
131		saddr = vaddr;
132		rsize = size;
133	}
134
135	/* Allocate and initialize a callback entry */
136	cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
137	if (cb == NULL)
138		return (EAGAIN);
139
140	cb->ascb_func = cb_func;
141	cb->ascb_arg = arg;
142	cb->ascb_events = events;
143	cb->ascb_saddr = saddr;
144	cb->ascb_len = rsize;
145
146	/* Add the entry to the list */
147	mutex_enter(&as->a_contents);
148	current_head = as->a_callbacks;
149	as->a_callbacks = cb;
150	cb->ascb_next = current_head;
151
152	/*
153	 * The call to this function may lose in a race with
154	 * a pertinent event - eg. a thread does long term memory locking
155	 * but before the callback is added another thread executes as_unmap.
156	 * A broadcast here resolves that.
157	 */
158	if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
159		AS_CLRUNMAPWAIT(as);
160		cv_broadcast(&as->a_cv);
161	}
162
163	mutex_exit(&as->a_contents);
164	return (0);
165}
166
167/*
168 * Search the callback list for an entry which pertains to arg.
169 *
170 * This is called from within the client upon completion of the callback.
171 * RETURN VALUES:
172 *	AS_CALLBACK_DELETED  (callback entry found and deleted)
173 *	AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
174 *	AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
175 *			entry will be made in as_do_callbacks)
176 *
177 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
178 * set, it indicates that as_do_callbacks is processing this entry.  The
179 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
180 * to unblock as_do_callbacks, in case it is blocked.
181 *
182 * CALLERS RESPONSIBILITY: If not calling from within the process context for
183 * the specified as, the caller must guarantee persistence of the specified as
184 * for the duration of this function (eg. pages being locked within the as
185 * will guarantee persistence).
186 */
187uint_t
188as_delete_callback(struct as *as, void *arg)
189{
190	struct as_callback **prevcb = &as->a_callbacks;
191	struct as_callback *cb;
192	uint_t rc = AS_CALLBACK_NOTFOUND;
193
194	mutex_enter(&as->a_contents);
195	for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
196		if (cb->ascb_arg != arg)
197			continue;
198
199		/*
200		 * If the events indicate AS_CALLBACK_CALLED, just clear
201		 * AS_ALL_EVENT in the events field and wakeup the thread
202		 * that may be waiting in as_do_callbacks.  as_do_callbacks
203		 * will take care of removing this entry from the list.  In
204		 * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
205		 * (AS_CALLBACK_CALLED not set), just remove it from the
206		 * list, return the memory and return AS_CALLBACK_DELETED.
207		 */
208		if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
209			/* leave AS_CALLBACK_CALLED */
210			cb->ascb_events &= ~AS_ALL_EVENT;
211			rc = AS_CALLBACK_DELETE_DEFERRED;
212			cv_broadcast(&as->a_cv);
213		} else {
214			*prevcb = cb->ascb_next;
215			kmem_free(cb, sizeof (struct as_callback));
216			rc = AS_CALLBACK_DELETED;
217		}
218		break;
219	}
220	mutex_exit(&as->a_contents);
221	return (rc);
222}
223
224/*
225 * Searches the as callback list for a matching entry.
226 * Returns a pointer to the first matching callback, or NULL if
227 * nothing is found.
228 * This function never sleeps so it is ok to call it with more
229 * locks held but the (required) a_contents mutex.
230 *
231 * See also comment on as_do_callbacks below.
232 */
233static struct as_callback *
234as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
235    size_t event_len)
236{
237	struct as_callback	*cb;
238
239	ASSERT(MUTEX_HELD(&as->a_contents));
240	for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
241		/*
242		 * If the callback has not already been called, then
243		 * check if events or address range pertains.  An event_len
244		 * of zero means do an unconditional callback.
245		 */
246		if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
247		    ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
248		    (event_addr + event_len < cb->ascb_saddr) ||
249		    (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
250			continue;
251		}
252		break;
253	}
254	return (cb);
255}
256
257/*
258 * Executes a given callback and removes it from the callback list for
259 * this address space.
260 * This function may sleep so the caller must drop all locks except
261 * a_contents before calling this func.
262 *
263 * See also comments on as_do_callbacks below.
264 */
265static void
266as_execute_callback(struct as *as, struct as_callback *cb,
267    uint_t events)
268{
269	struct as_callback **prevcb;
270	void	*cb_arg;
271
272	ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
273	cb->ascb_events |= AS_CALLBACK_CALLED;
274	mutex_exit(&as->a_contents);
275	(*cb->ascb_func)(as, cb->ascb_arg, events);
276	mutex_enter(&as->a_contents);
277	/*
278	 * the callback function is required to delete the callback
279	 * when the callback function determines it is OK for
280	 * this thread to continue. as_delete_callback will clear
281	 * the AS_ALL_EVENT in the events field when it is deleted.
282	 * If the callback function called as_delete_callback,
283	 * events will already be cleared and there will be no blocking.
284	 */
285	while ((cb->ascb_events & events) != 0) {
286		cv_wait(&as->a_cv, &as->a_contents);
287	}
288	/*
289	 * This entry needs to be taken off the list. Normally, the
290	 * callback func itself does that, but unfortunately the list
291	 * may have changed while the callback was running because the
292	 * a_contents mutex was dropped and someone else other than the
293	 * callback func itself could have called as_delete_callback,
294	 * so we have to search to find this entry again.  The entry
295	 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
296	 */
297	cb_arg = cb->ascb_arg;
298	prevcb = &as->a_callbacks;
299	for (cb = as->a_callbacks; cb != NULL;
300	    prevcb = &cb->ascb_next, cb = *prevcb) {
301		if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
302		    (cb_arg != cb->ascb_arg)) {
303			continue;
304		}
305		*prevcb = cb->ascb_next;
306		kmem_free(cb, sizeof (struct as_callback));
307		break;
308	}
309}
310
311/*
312 * Check the callback list for a matching event and intersection of
313 * address range. If there is a match invoke the callback.  Skip an entry if:
314 *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
315 *    - not event of interest
316 *    - not address range of interest
317 *
318 * An event_len of zero indicates a request for an unconditional callback
319 * (regardless of event), only the AS_CALLBACK_CALLED is checked.  The
320 * a_contents lock must be dropped before a callback, so only one callback
321 * can be done before returning. Return -1 (true) if a callback was
322 * executed and removed from the list, else return 0 (false).
323 *
324 * The logically separate parts, i.e. finding a matching callback and
325 * executing a given callback have been separated into two functions
326 * so that they can be called with different sets of locks held beyond
327 * the always-required a_contents. as_find_callback does not sleep so
328 * it is ok to call it if more locks than a_contents (i.e. the a_lock
329 * rwlock) are held. as_execute_callback on the other hand may sleep
330 * so all locks beyond a_contents must be dropped by the caller if one
331 * does not want to end comatose.
332 */
333static int
334as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
335    size_t event_len)
336{
337	struct as_callback *cb;
338
339	if ((cb = as_find_callback(as, events, event_addr, event_len))) {
340		as_execute_callback(as, cb, events);
341		return (-1);
342	}
343	return (0);
344}
345
346/*
347 * Search for the segment containing addr. If a segment containing addr
348 * exists, that segment is returned.  If no such segment exists, and
349 * the list spans addresses greater than addr, then the first segment
350 * whose base is greater than addr is returned; otherwise, NULL is
351 * returned unless tail is true, in which case the last element of the
352 * list is returned.
353 *
354 * a_seglast is used to cache the last found segment for repeated
355 * searches to the same addr (which happens frequently).
356 */
357struct seg *
358as_findseg(struct as *as, caddr_t addr, int tail)
359{
360	struct seg *seg = as->a_seglast;
361	avl_index_t where;
362
363	ASSERT(AS_LOCK_HELD(as));
364
365	if (seg != NULL &&
366	    seg->s_base <= addr &&
367	    addr < seg->s_base + seg->s_size)
368		return (seg);
369
370	seg = avl_find(&as->a_segtree, &addr, &where);
371	if (seg != NULL)
372		return (as->a_seglast = seg);
373
374	seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
375	if (seg == NULL && tail)
376		seg = avl_last(&as->a_segtree);
377	return (as->a_seglast = seg);
378}
379
380#ifdef VERIFY_SEGLIST
381/*
382 * verify that the linked list is coherent
383 */
384static void
385as_verify(struct as *as)
386{
387	struct seg *seg, *seglast, *p, *n;
388	uint_t nsegs = 0;
389
390	if (do_as_verify == 0)
391		return;
392
393	seglast = as->a_seglast;
394
395	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
396		ASSERT(seg->s_as == as);
397		p = AS_SEGPREV(as, seg);
398		n = AS_SEGNEXT(as, seg);
399		ASSERT(p == NULL || p->s_as == as);
400		ASSERT(p == NULL || p->s_base < seg->s_base);
401		ASSERT(n == NULL || n->s_base > seg->s_base);
402		ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
403		if (seg == seglast)
404			seglast = NULL;
405		nsegs++;
406	}
407	ASSERT(seglast == NULL);
408	ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
409}
410#endif /* VERIFY_SEGLIST */
411
412/*
413 * Add a new segment to the address space. The avl_find()
414 * may be expensive so we attempt to use last segment accessed
415 * in as_gap() as an insertion point.
416 */
417int
418as_addseg(struct as  *as, struct seg *newseg)
419{
420	struct seg *seg;
421	caddr_t addr;
422	caddr_t eaddr;
423	avl_index_t where;
424
425	ASSERT(AS_WRITE_HELD(as));
426
427	as->a_updatedir = 1;	/* inform /proc */
428	gethrestime(&as->a_updatetime);
429
430	if (as->a_lastgaphl != NULL) {
431		struct seg *hseg = NULL;
432		struct seg *lseg = NULL;
433
434		if (as->a_lastgaphl->s_base > newseg->s_base) {
435			hseg = as->a_lastgaphl;
436			lseg = AVL_PREV(&as->a_segtree, hseg);
437		} else {
438			lseg = as->a_lastgaphl;
439			hseg = AVL_NEXT(&as->a_segtree, lseg);
440		}
441
442		if (hseg && lseg && lseg->s_base < newseg->s_base &&
443		    hseg->s_base > newseg->s_base) {
444			avl_insert_here(&as->a_segtree, newseg, lseg,
445			    AVL_AFTER);
446			as->a_lastgaphl = NULL;
447			as->a_seglast = newseg;
448			return (0);
449		}
450		as->a_lastgaphl = NULL;
451	}
452
453	addr = newseg->s_base;
454	eaddr = addr + newseg->s_size;
455again:
456
457	seg = avl_find(&as->a_segtree, &addr, &where);
458
459	if (seg == NULL)
460		seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
461
462	if (seg == NULL)
463		seg = avl_last(&as->a_segtree);
464
465	if (seg != NULL) {
466		caddr_t base = seg->s_base;
467
468		/*
469		 * If top of seg is below the requested address, then
470		 * the insertion point is at the end of the linked list,
471		 * and seg points to the tail of the list.  Otherwise,
472		 * the insertion point is immediately before seg.
473		 */
474		if (base + seg->s_size > addr) {
475			if (addr >= base || eaddr > base) {
476#ifdef __sparc
477				extern struct seg_ops segnf_ops;
478
479				/*
480				 * no-fault segs must disappear if overlaid.
481				 * XXX need new segment type so
482				 * we don't have to check s_ops
483				 */
484				if (seg->s_ops == &segnf_ops) {
485					seg_unmap(seg);
486					goto again;
487				}
488#endif
489				return (-1);	/* overlapping segment */
490			}
491		}
492	}
493	as->a_seglast = newseg;
494	avl_insert(&as->a_segtree, newseg, where);
495
496#ifdef VERIFY_SEGLIST
497	as_verify(as);
498#endif
499	return (0);
500}
501
502struct seg *
503as_removeseg(struct as *as, struct seg *seg)
504{
505	avl_tree_t *t;
506
507	ASSERT(AS_WRITE_HELD(as));
508
509	as->a_updatedir = 1;	/* inform /proc */
510	gethrestime(&as->a_updatetime);
511
512	if (seg == NULL)
513		return (NULL);
514
515	t = &as->a_segtree;
516	if (as->a_seglast == seg)
517		as->a_seglast = NULL;
518	as->a_lastgaphl = NULL;
519
520	/*
521	 * if this segment is at an address higher than
522	 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
523	 */
524	if (as->a_lastgap &&
525	    (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
526		as->a_lastgap = AVL_NEXT(t, seg);
527
528	/*
529	 * remove the segment from the seg tree
530	 */
531	avl_remove(t, seg);
532
533#ifdef VERIFY_SEGLIST
534	as_verify(as);
535#endif
536	return (seg);
537}
538
539/*
540 * Find a segment containing addr.
541 */
542struct seg *
543as_segat(struct as *as, caddr_t addr)
544{
545	struct seg *seg = as->a_seglast;
546
547	ASSERT(AS_LOCK_HELD(as));
548
549	if (seg != NULL && seg->s_base <= addr &&
550	    addr < seg->s_base + seg->s_size)
551		return (seg);
552
553	seg = avl_find(&as->a_segtree, &addr, NULL);
554	return (seg);
555}
556
557/*
558 * Serialize all searches for holes in an address space to
559 * prevent two or more threads from allocating the same virtual
560 * address range.  The address space must not be "read/write"
561 * locked by the caller since we may block.
562 */
563void
564as_rangelock(struct as *as)
565{
566	mutex_enter(&as->a_contents);
567	while (AS_ISCLAIMGAP(as))
568		cv_wait(&as->a_cv, &as->a_contents);
569	AS_SETCLAIMGAP(as);
570	mutex_exit(&as->a_contents);
571}
572
573/*
574 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
575 */
576void
577as_rangeunlock(struct as *as)
578{
579	mutex_enter(&as->a_contents);
580	AS_CLRCLAIMGAP(as);
581	cv_signal(&as->a_cv);
582	mutex_exit(&as->a_contents);
583}
584
585/*
586 * compar segments (or just an address) by segment address range
587 */
588static int
589as_segcompar(const void *x, const void *y)
590{
591	struct seg *a = (struct seg *)x;
592	struct seg *b = (struct seg *)y;
593
594	if (a->s_base < b->s_base)
595		return (-1);
596	if (a->s_base >= b->s_base + b->s_size)
597		return (1);
598	return (0);
599}
600
601
602void
603as_avlinit(struct as *as)
604{
605	avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
606	    offsetof(struct seg, s_tree));
607	avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
608	    offsetof(struct watched_page, wp_link));
609}
610
611/*ARGSUSED*/
612static int
613as_constructor(void *buf, void *cdrarg, int kmflags)
614{
615	struct as *as = buf;
616
617	mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
618	cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
619	rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
620	as_avlinit(as);
621	return (0);
622}
623
624/*ARGSUSED1*/
625static void
626as_destructor(void *buf, void *cdrarg)
627{
628	struct as *as = buf;
629
630	avl_destroy(&as->a_segtree);
631	mutex_destroy(&as->a_contents);
632	cv_destroy(&as->a_cv);
633	rw_destroy(&as->a_lock);
634}
635
636void
637as_init(void)
638{
639	as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
640	    as_constructor, as_destructor, NULL, NULL, NULL, 0);
641}
642
643/*
644 * Allocate and initialize an address space data structure.
645 * We call hat_alloc to allow any machine dependent
646 * information in the hat structure to be initialized.
647 */
648struct as *
649as_alloc(void)
650{
651	struct as *as;
652
653	as = kmem_cache_alloc(as_cache, KM_SLEEP);
654
655	as->a_flags		= 0;
656	as->a_vbits		= 0;
657	as->a_hrm		= NULL;
658	as->a_seglast		= NULL;
659	as->a_size		= 0;
660	as->a_resvsize		= 0;
661	as->a_updatedir		= 0;
662	gethrestime(&as->a_updatetime);
663	as->a_objectdir		= NULL;
664	as->a_sizedir		= 0;
665	as->a_userlimit		= (caddr_t)USERLIMIT;
666	as->a_lastgap		= NULL;
667	as->a_lastgaphl		= NULL;
668	as->a_callbacks		= NULL;
669	as->a_proc		= NULL;
670
671	AS_LOCK_ENTER(as, RW_WRITER);
672	as->a_hat = hat_alloc(as);	/* create hat for default system mmu */
673	AS_LOCK_EXIT(as);
674
675	return (as);
676}
677
678/*
679 * Free an address space data structure.
680 * Need to free the hat first and then
681 * all the segments on this as and finally
682 * the space for the as struct itself.
683 */
684void
685as_free(struct as *as)
686{
687	struct hat *hat = as->a_hat;
688	struct seg *seg, *next;
689	boolean_t free_started = B_FALSE;
690
691top:
692	/*
693	 * Invoke ALL callbacks. as_do_callbacks will do one callback
694	 * per call, and not return (-1) until the callback has completed.
695	 * When as_do_callbacks returns zero, all callbacks have completed.
696	 */
697	mutex_enter(&as->a_contents);
698	while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
699		;
700
701	mutex_exit(&as->a_contents);
702	AS_LOCK_ENTER(as, RW_WRITER);
703
704	if (!free_started) {
705		free_started = B_TRUE;
706		hat_free_start(hat);
707	}
708	for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
709		int err;
710
711		next = AS_SEGNEXT(as, seg);
712retry:
713		err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
714		if (err == EAGAIN) {
715			mutex_enter(&as->a_contents);
716			if (as->a_callbacks) {
717				AS_LOCK_EXIT(as);
718			} else if (!AS_ISNOUNMAPWAIT(as)) {
719				/*
720				 * Memory is currently locked. Wait for a
721				 * cv_signal that it has been unlocked, then
722				 * try the operation again.
723				 */
724				if (AS_ISUNMAPWAIT(as) == 0)
725					cv_broadcast(&as->a_cv);
726				AS_SETUNMAPWAIT(as);
727				AS_LOCK_EXIT(as);
728				while (AS_ISUNMAPWAIT(as))
729					cv_wait(&as->a_cv, &as->a_contents);
730			} else {
731				/*
732				 * We may have raced with
733				 * segvn_reclaim()/segspt_reclaim(). In this
734				 * case clean nounmapwait flag and retry since
735				 * softlockcnt in this segment may be already
736				 * 0.  We don't drop as writer lock so our
737				 * number of retries without sleeping should
738				 * be very small. See segvn_reclaim() for
739				 * more comments.
740				 */
741				AS_CLRNOUNMAPWAIT(as);
742				mutex_exit(&as->a_contents);
743				goto retry;
744			}
745			mutex_exit(&as->a_contents);
746			goto top;
747		} else {
748			/*
749			 * We do not expect any other error return at this
750			 * time. This is similar to an ASSERT in seg_unmap()
751			 */
752			ASSERT(err == 0);
753		}
754	}
755	hat_free_end(hat);
756	AS_LOCK_EXIT(as);
757
758	/* /proc stuff */
759	ASSERT(avl_numnodes(&as->a_wpage) == 0);
760	if (as->a_objectdir) {
761		kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
762		as->a_objectdir = NULL;
763		as->a_sizedir = 0;
764	}
765
766	/*
767	 * Free the struct as back to kmem.  Assert it has no segments.
768	 */
769	ASSERT(avl_numnodes(&as->a_segtree) == 0);
770	kmem_cache_free(as_cache, as);
771}
772
773int
774as_dup(struct as *as, struct proc *forkedproc)
775{
776	struct as *newas;
777	struct seg *seg, *newseg;
778	size_t	purgesize = 0;
779	int error;
780
781	AS_LOCK_ENTER(as, RW_WRITER);
782	as_clearwatch(as);
783	newas = as_alloc();
784	newas->a_userlimit = as->a_userlimit;
785	newas->a_proc = forkedproc;
786
787	AS_LOCK_ENTER(newas, RW_WRITER);
788
789	(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
790
791	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
792
793		if (seg->s_flags & S_PURGE) {
794			purgesize += seg->s_size;
795			continue;
796		}
797
798		newseg = seg_alloc(newas, seg->s_base, seg->s_size);
799		if (newseg == NULL) {
800			AS_LOCK_EXIT(newas);
801			as_setwatch(as);
802			AS_LOCK_EXIT(as);
803			as_free(newas);
804			return (-1);
805		}
806		if ((error = SEGOP_DUP(seg, newseg)) != 0) {
807			/*
808			 * We call seg_free() on the new seg
809			 * because the segment is not set up
810			 * completely; i.e. it has no ops.
811			 */
812			as_setwatch(as);
813			AS_LOCK_EXIT(as);
814			seg_free(newseg);
815			AS_LOCK_EXIT(newas);
816			as_free(newas);
817			return (error);
818		}
819		if ((newseg->s_flags & S_HOLE) == 0) {
820			newas->a_size += seg->s_size;
821		}
822	}
823	newas->a_resvsize = as->a_resvsize - purgesize;
824
825	error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
826
827	AS_LOCK_EXIT(newas);
828
829	as_setwatch(as);
830	AS_LOCK_EXIT(as);
831	if (error != 0) {
832		as_free(newas);
833		return (error);
834	}
835	forkedproc->p_as = newas;
836	return (0);
837}
838
839/*
840 * Handle a ``fault'' at addr for size bytes.
841 */
842faultcode_t
843as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
844    enum fault_type type, enum seg_rw rw)
845{
846	struct seg *seg;
847	caddr_t raddr;			/* rounded down addr */
848	size_t rsize;			/* rounded up size */
849	size_t ssize;
850	faultcode_t res = 0;
851	caddr_t addrsav;
852	struct seg *segsav;
853	int as_lock_held;
854	klwp_t *lwp = ttolwp(curthread);
855
856
857
858retry:
859	/*
860	 * Indicate that the lwp is not to be stopped while waiting for a
861	 * pagefault.  This is to avoid deadlock while debugging a process
862	 * via /proc over NFS (in particular).
863	 */
864	if (lwp != NULL)
865		lwp->lwp_nostop++;
866
867	/*
868	 * same length must be used when we softlock and softunlock.  We
869	 * don't support softunlocking lengths less than the original length
870	 * when there is largepage support.  See seg_dev.c for more
871	 * comments.
872	 */
873	switch (type) {
874
875	case F_SOFTLOCK:
876		CPU_STATS_ADD_K(vm, softlock, 1);
877		break;
878
879	case F_SOFTUNLOCK:
880		break;
881
882	case F_PROT:
883		CPU_STATS_ADD_K(vm, prot_fault, 1);
884		break;
885
886	case F_INVAL:
887		CPU_STATS_ENTER_K();
888		CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
889		if (as == &kas)
890			CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
891		CPU_STATS_EXIT_K();
892		break;
893	}
894
895	/* Kernel probe */
896	TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
897	    tnf_opaque,	address,	addr,
898	    tnf_fault_type,	fault_type,	type,
899	    tnf_seg_access,	access,		rw);
900
901	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
902	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
903	    (size_t)raddr;
904
905	/*
906	 * XXX -- Don't grab the as lock for segkmap. We should grab it for
907	 * correctness, but then we could be stuck holding this lock for
908	 * a LONG time if the fault needs to be resolved on a slow
909	 * filesystem, and then no-one will be able to exec new commands,
910	 * as exec'ing requires the write lock on the as.
911	 */
912	if (as == &kas && segkmap && segkmap->s_base <= raddr &&
913	    raddr + size < segkmap->s_base + segkmap->s_size) {
914		seg = segkmap;
915		as_lock_held = 0;
916	} else {
917		AS_LOCK_ENTER(as, RW_READER);
918
919		seg = as_segat(as, raddr);
920		if (seg == NULL) {
921			AS_LOCK_EXIT(as);
922			if (lwp != NULL)
923				lwp->lwp_nostop--;
924			return (FC_NOMAP);
925		}
926
927		as_lock_held = 1;
928	}
929
930	addrsav = raddr;
931	segsav = seg;
932
933	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
934		if (raddr >= seg->s_base + seg->s_size) {
935			seg = AS_SEGNEXT(as, seg);
936			if (seg == NULL || raddr != seg->s_base) {
937				res = FC_NOMAP;
938				break;
939			}
940		}
941		if (raddr + rsize > seg->s_base + seg->s_size)
942			ssize = seg->s_base + seg->s_size - raddr;
943		else
944			ssize = rsize;
945
946		res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
947		if (res != 0)
948			break;
949	}
950
951	/*
952	 * If we were SOFTLOCKing and encountered a failure,
953	 * we must SOFTUNLOCK the range we already did. (Maybe we
954	 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
955	 * right here...)
956	 */
957	if (res != 0 && type == F_SOFTLOCK) {
958		for (seg = segsav; addrsav < raddr; addrsav += ssize) {
959			if (addrsav >= seg->s_base + seg->s_size)
960				seg = AS_SEGNEXT(as, seg);
961			ASSERT(seg != NULL);
962			/*
963			 * Now call the fault routine again to perform the
964			 * unlock using S_OTHER instead of the rw variable
965			 * since we never got a chance to touch the pages.
966			 */
967			if (raddr > seg->s_base + seg->s_size)
968				ssize = seg->s_base + seg->s_size - addrsav;
969			else
970				ssize = raddr - addrsav;
971			(void) SEGOP_FAULT(hat, seg, addrsav, ssize,
972			    F_SOFTUNLOCK, S_OTHER);
973		}
974	}
975	if (as_lock_held)
976		AS_LOCK_EXIT(as);
977	if (lwp != NULL)
978		lwp->lwp_nostop--;
979
980	/*
981	 * If the lower levels returned EDEADLK for a fault,
982	 * It means that we should retry the fault.  Let's wait
983	 * a bit also to let the deadlock causing condition clear.
984	 * This is part of a gross hack to work around a design flaw
985	 * in the ufs/sds logging code and should go away when the
986	 * logging code is re-designed to fix the problem. See bug
987	 * 4125102 for details of the problem.
988	 */
989	if (FC_ERRNO(res) == EDEADLK) {
990		delay(deadlk_wait);
991		res = 0;
992		goto retry;
993	}
994	return (res);
995}
996
997
998
999/*
1000 * Asynchronous ``fault'' at addr for size bytes.
1001 */
1002faultcode_t
1003as_faulta(struct as *as, caddr_t addr, size_t size)
1004{
1005	struct seg *seg;
1006	caddr_t raddr;			/* rounded down addr */
1007	size_t rsize;			/* rounded up size */
1008	faultcode_t res = 0;
1009	klwp_t *lwp = ttolwp(curthread);
1010
1011retry:
1012	/*
1013	 * Indicate that the lwp is not to be stopped while waiting
1014	 * for a pagefault.  This is to avoid deadlock while debugging
1015	 * a process via /proc over NFS (in particular).
1016	 */
1017	if (lwp != NULL)
1018		lwp->lwp_nostop++;
1019
1020	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1021	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1022	    (size_t)raddr;
1023
1024	AS_LOCK_ENTER(as, RW_READER);
1025	seg = as_segat(as, raddr);
1026	if (seg == NULL) {
1027		AS_LOCK_EXIT(as);
1028		if (lwp != NULL)
1029			lwp->lwp_nostop--;
1030		return (FC_NOMAP);
1031	}
1032
1033	for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1034		if (raddr >= seg->s_base + seg->s_size) {
1035			seg = AS_SEGNEXT(as, seg);
1036			if (seg == NULL || raddr != seg->s_base) {
1037				res = FC_NOMAP;
1038				break;
1039			}
1040		}
1041		res = SEGOP_FAULTA(seg, raddr);
1042		if (res != 0)
1043			break;
1044	}
1045	AS_LOCK_EXIT(as);
1046	if (lwp != NULL)
1047		lwp->lwp_nostop--;
1048	/*
1049	 * If the lower levels returned EDEADLK for a fault,
1050	 * It means that we should retry the fault.  Let's wait
1051	 * a bit also to let the deadlock causing condition clear.
1052	 * This is part of a gross hack to work around a design flaw
1053	 * in the ufs/sds logging code and should go away when the
1054	 * logging code is re-designed to fix the problem. See bug
1055	 * 4125102 for details of the problem.
1056	 */
1057	if (FC_ERRNO(res) == EDEADLK) {
1058		delay(deadlk_wait);
1059		res = 0;
1060		goto retry;
1061	}
1062	return (res);
1063}
1064
1065/*
1066 * Set the virtual mapping for the interval from [addr : addr + size)
1067 * in address space `as' to have the specified protection.
1068 * It is ok for the range to cross over several segments,
1069 * as long as they are contiguous.
1070 */
1071int
1072as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1073{
1074	struct seg *seg;
1075	struct as_callback *cb;
1076	size_t ssize;
1077	caddr_t raddr;			/* rounded down addr */
1078	size_t rsize;			/* rounded up size */
1079	int error = 0, writer = 0;
1080	caddr_t saveraddr;
1081	size_t saversize;
1082
1083setprot_top:
1084	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1085	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1086	    (size_t)raddr;
1087
1088	if (raddr + rsize < raddr)		/* check for wraparound */
1089		return (ENOMEM);
1090
1091	saveraddr = raddr;
1092	saversize = rsize;
1093
1094	/*
1095	 * Normally we only lock the as as a reader. But
1096	 * if due to setprot the segment driver needs to split
1097	 * a segment it will return IE_RETRY. Therefore we re-acquire
1098	 * the as lock as a writer so the segment driver can change
1099	 * the seg list. Also the segment driver will return IE_RETRY
1100	 * after it has changed the segment list so we therefore keep
1101	 * locking as a writer. Since these opeartions should be rare
1102	 * want to only lock as a writer when necessary.
1103	 */
1104	if (writer || avl_numnodes(&as->a_wpage) != 0) {
1105		AS_LOCK_ENTER(as, RW_WRITER);
1106	} else {
1107		AS_LOCK_ENTER(as, RW_READER);
1108	}
1109
1110	as_clearwatchprot(as, raddr, rsize);
1111	seg = as_segat(as, raddr);
1112	if (seg == NULL) {
1113		as_setwatch(as);
1114		AS_LOCK_EXIT(as);
1115		return (ENOMEM);
1116	}
1117
1118	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1119		if (raddr >= seg->s_base + seg->s_size) {
1120			seg = AS_SEGNEXT(as, seg);
1121			if (seg == NULL || raddr != seg->s_base) {
1122				error = ENOMEM;
1123				break;
1124			}
1125		}
1126		if ((raddr + rsize) > (seg->s_base + seg->s_size))
1127			ssize = seg->s_base + seg->s_size - raddr;
1128		else
1129			ssize = rsize;
1130retry:
1131		error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1132
1133		if (error == IE_NOMEM) {
1134			error = EAGAIN;
1135			break;
1136		}
1137
1138		if (error == IE_RETRY) {
1139			AS_LOCK_EXIT(as);
1140			writer = 1;
1141			goto setprot_top;
1142		}
1143
1144		if (error == EAGAIN) {
1145			/*
1146			 * Make sure we have a_lock as writer.
1147			 */
1148			if (writer == 0) {
1149				AS_LOCK_EXIT(as);
1150				writer = 1;
1151				goto setprot_top;
1152			}
1153
1154			/*
1155			 * Memory is currently locked.  It must be unlocked
1156			 * before this operation can succeed through a retry.
1157			 * The possible reasons for locked memory and
1158			 * corresponding strategies for unlocking are:
1159			 * (1) Normal I/O
1160			 *	wait for a signal that the I/O operation
1161			 *	has completed and the memory is unlocked.
1162			 * (2) Asynchronous I/O
1163			 *	The aio subsystem does not unlock pages when
1164			 *	the I/O is completed. Those pages are unlocked
1165			 *	when the application calls aiowait/aioerror.
1166			 *	So, to prevent blocking forever, cv_broadcast()
1167			 *	is done to wake up aio_cleanup_thread.
1168			 *	Subsequently, segvn_reclaim will be called, and
1169			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
1170			 * (3) Long term page locking:
1171			 *	Drivers intending to have pages locked for a
1172			 *	period considerably longer than for normal I/O
1173			 *	(essentially forever) may have registered for a
1174			 *	callback so they may unlock these pages on
1175			 *	request. This is needed to allow this operation
1176			 *	to succeed. Each entry on the callback list is
1177			 *	examined. If the event or address range pertains
1178			 *	the callback is invoked (unless it already is in
1179			 *	progress). The a_contents lock must be dropped
1180			 *	before the callback, so only one callback can
1181			 *	be done at a time. Go to the top and do more
1182			 *	until zero is returned. If zero is returned,
1183			 *	either there were no callbacks for this event
1184			 *	or they were already in progress.
1185			 */
1186			mutex_enter(&as->a_contents);
1187			if (as->a_callbacks &&
1188			    (cb = as_find_callback(as, AS_SETPROT_EVENT,
1189			    seg->s_base, seg->s_size))) {
1190				AS_LOCK_EXIT(as);
1191				as_execute_callback(as, cb, AS_SETPROT_EVENT);
1192			} else if (!AS_ISNOUNMAPWAIT(as)) {
1193				if (AS_ISUNMAPWAIT(as) == 0)
1194					cv_broadcast(&as->a_cv);
1195				AS_SETUNMAPWAIT(as);
1196				AS_LOCK_EXIT(as);
1197				while (AS_ISUNMAPWAIT(as))
1198					cv_wait(&as->a_cv, &as->a_contents);
1199			} else {
1200				/*
1201				 * We may have raced with
1202				 * segvn_reclaim()/segspt_reclaim(). In this
1203				 * case clean nounmapwait flag and retry since
1204				 * softlockcnt in this segment may be already
1205				 * 0.  We don't drop as writer lock so our
1206				 * number of retries without sleeping should
1207				 * be very small. See segvn_reclaim() for
1208				 * more comments.
1209				 */
1210				AS_CLRNOUNMAPWAIT(as);
1211				mutex_exit(&as->a_contents);
1212				goto retry;
1213			}
1214			mutex_exit(&as->a_contents);
1215			goto setprot_top;
1216		} else if (error != 0)
1217			break;
1218	}
1219	if (error != 0) {
1220		as_setwatch(as);
1221	} else {
1222		as_setwatchprot(as, saveraddr, saversize, prot);
1223	}
1224	AS_LOCK_EXIT(as);
1225	return (error);
1226}
1227
1228/*
1229 * Check to make sure that the interval [addr, addr + size)
1230 * in address space `as' has at least the specified protection.
1231 * It is ok for the range to cross over several segments, as long
1232 * as they are contiguous.
1233 */
1234int
1235as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1236{
1237	struct seg *seg;
1238	size_t ssize;
1239	caddr_t raddr;			/* rounded down addr */
1240	size_t rsize;			/* rounded up size */
1241	int error = 0;
1242
1243	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1244	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1245	    (size_t)raddr;
1246
1247	if (raddr + rsize < raddr)		/* check for wraparound */
1248		return (ENOMEM);
1249
1250	/*
1251	 * This is ugly as sin...
1252	 * Normally, we only acquire the address space readers lock.
1253	 * However, if the address space has watchpoints present,
1254	 * we must acquire the writer lock on the address space for
1255	 * the benefit of as_clearwatchprot() and as_setwatchprot().
1256	 */
1257	if (avl_numnodes(&as->a_wpage) != 0)
1258		AS_LOCK_ENTER(as, RW_WRITER);
1259	else
1260		AS_LOCK_ENTER(as, RW_READER);
1261	as_clearwatchprot(as, raddr, rsize);
1262	seg = as_segat(as, raddr);
1263	if (seg == NULL) {
1264		as_setwatch(as);
1265		AS_LOCK_EXIT(as);
1266		return (ENOMEM);
1267	}
1268
1269	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1270		if (raddr >= seg->s_base + seg->s_size) {
1271			seg = AS_SEGNEXT(as, seg);
1272			if (seg == NULL || raddr != seg->s_base) {
1273				error = ENOMEM;
1274				break;
1275			}
1276		}
1277		if ((raddr + rsize) > (seg->s_base + seg->s_size))
1278			ssize = seg->s_base + seg->s_size - raddr;
1279		else
1280			ssize = rsize;
1281
1282		error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1283		if (error != 0)
1284			break;
1285	}
1286	as_setwatch(as);
1287	AS_LOCK_EXIT(as);
1288	return (error);
1289}
1290
1291int
1292as_unmap(struct as *as, caddr_t addr, size_t size)
1293{
1294	struct seg *seg, *seg_next;
1295	struct as_callback *cb;
1296	caddr_t raddr, eaddr;
1297	size_t ssize, rsize = 0;
1298	int err;
1299
1300top:
1301	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1302	eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1303	    (uintptr_t)PAGEMASK);
1304
1305	AS_LOCK_ENTER(as, RW_WRITER);
1306
1307	as->a_updatedir = 1;	/* inform /proc */
1308	gethrestime(&as->a_updatetime);
1309
1310	/*
1311	 * Use as_findseg to find the first segment in the range, then
1312	 * step through the segments in order, following s_next.
1313	 */
1314	as_clearwatchprot(as, raddr, eaddr - raddr);
1315
1316	for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1317		const boolean_t is_hole = ((seg->s_flags & S_HOLE) != 0);
1318
1319		if (eaddr <= seg->s_base)
1320			break;		/* eaddr was in a gap; all done */
1321
1322		/* this is implied by the test above */
1323		ASSERT(raddr < eaddr);
1324
1325		if (raddr < seg->s_base)
1326			raddr = seg->s_base; 	/* raddr was in a gap */
1327
1328		if (eaddr > (seg->s_base + seg->s_size))
1329			ssize = seg->s_base + seg->s_size - raddr;
1330		else
1331			ssize = eaddr - raddr;
1332
1333		/*
1334		 * Save next segment pointer since seg can be
1335		 * destroyed during the segment unmap operation.
1336		 */
1337		seg_next = AS_SEGNEXT(as, seg);
1338
1339		/*
1340		 * We didn't count /dev/null mappings, so ignore them here.
1341		 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1342		 * we have to do this check here while we have seg.)
1343		 */
1344		rsize = 0;
1345		if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1346		    !SEG_IS_PARTIAL_RESV(seg))
1347			rsize = ssize;
1348
1349retry:
1350		err = SEGOP_UNMAP(seg, raddr, ssize);
1351		if (err == EAGAIN) {
1352			/*
1353			 * Memory is currently locked.  It must be unlocked
1354			 * before this operation can succeed through a retry.
1355			 * The possible reasons for locked memory and
1356			 * corresponding strategies for unlocking are:
1357			 * (1) Normal I/O
1358			 *	wait for a signal that the I/O operation
1359			 *	has completed and the memory is unlocked.
1360			 * (2) Asynchronous I/O
1361			 *	The aio subsystem does not unlock pages when
1362			 *	the I/O is completed. Those pages are unlocked
1363			 *	when the application calls aiowait/aioerror.
1364			 *	So, to prevent blocking forever, cv_broadcast()
1365			 *	is done to wake up aio_cleanup_thread.
1366			 *	Subsequently, segvn_reclaim will be called, and
1367			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
1368			 * (3) Long term page locking:
1369			 *	Drivers intending to have pages locked for a
1370			 *	period considerably longer than for normal I/O
1371			 *	(essentially forever) may have registered for a
1372			 *	callback so they may unlock these pages on
1373			 *	request. This is needed to allow this operation
1374			 *	to succeed. Each entry on the callback list is
1375			 *	examined. If the event or address range pertains
1376			 *	the callback is invoked (unless it already is in
1377			 *	progress). The a_contents lock must be dropped
1378			 *	before the callback, so only one callback can
1379			 *	be done at a time. Go to the top and do more
1380			 *	until zero is returned. If zero is returned,
1381			 *	either there were no callbacks for this event
1382			 *	or they were already in progress.
1383			 */
1384			mutex_enter(&as->a_contents);
1385			if (as->a_callbacks &&
1386			    (cb = as_find_callback(as, AS_UNMAP_EVENT,
1387			    seg->s_base, seg->s_size))) {
1388				AS_LOCK_EXIT(as);
1389				as_execute_callback(as, cb, AS_UNMAP_EVENT);
1390			} else if (!AS_ISNOUNMAPWAIT(as)) {
1391				if (AS_ISUNMAPWAIT(as) == 0)
1392					cv_broadcast(&as->a_cv);
1393				AS_SETUNMAPWAIT(as);
1394				AS_LOCK_EXIT(as);
1395				while (AS_ISUNMAPWAIT(as))
1396					cv_wait(&as->a_cv, &as->a_contents);
1397			} else {
1398				/*
1399				 * We may have raced with
1400				 * segvn_reclaim()/segspt_reclaim(). In this
1401				 * case clean nounmapwait flag and retry since
1402				 * softlockcnt in this segment may be already
1403				 * 0.  We don't drop as writer lock so our
1404				 * number of retries without sleeping should
1405				 * be very small. See segvn_reclaim() for
1406				 * more comments.
1407				 */
1408				AS_CLRNOUNMAPWAIT(as);
1409				mutex_exit(&as->a_contents);
1410				goto retry;
1411			}
1412			mutex_exit(&as->a_contents);
1413			goto top;
1414		} else if (err == IE_RETRY) {
1415			AS_LOCK_EXIT(as);
1416			goto top;
1417		} else if (err) {
1418			as_setwatch(as);
1419			AS_LOCK_EXIT(as);
1420			return (-1);
1421		}
1422
1423		if (!is_hole) {
1424			as->a_size -= ssize;
1425			if (rsize)
1426				as->a_resvsize -= rsize;
1427		}
1428		raddr += ssize;
1429	}
1430	AS_LOCK_EXIT(as);
1431	return (0);
1432}
1433
1434static int
1435as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1436    segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
1437{
1438	uint_t szc, nszc, save_szcvec;
1439	int error;
1440	caddr_t a, eaddr;
1441	size_t pgsz;
1442	const boolean_t do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1443
1444	ASSERT(AS_WRITE_HELD(as));
1445	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1446	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1447	ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1448
1449	if (!do_off) {
1450		vn_a->offset = 0;
1451	}
1452
1453	if (szcvec <= 1) {
1454		struct seg *seg, *segref;
1455
1456		seg = segref = seg_alloc(as, addr, size);
1457		if (seg == NULL) {
1458			return (ENOMEM);
1459		}
1460		vn_a->szc = 0;
1461		error = (*crfp)(&seg, vn_a);
1462		if (error != 0) {
1463			VERIFY3P(seg, ==, segref);
1464			seg_free(seg);
1465		} else {
1466			as->a_size += size;
1467			as->a_resvsize += size;
1468		}
1469		return (error);
1470	}
1471
1472	eaddr = addr + size;
1473	save_szcvec = szcvec;
1474	szcvec >>= 1;
1475	szc = 0;
1476	nszc = 0;
1477	while (szcvec) {
1478		if ((szcvec & 0x1) == 0) {
1479			nszc++;
1480			szcvec >>= 1;
1481			continue;
1482		}
1483		nszc++;
1484		pgsz = page_get_pagesize(nszc);
1485		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1486		if (a != addr) {
1487			struct seg *seg, *segref;
1488			size_t segsize;
1489
1490			ASSERT(a < eaddr);
1491
1492			segsize = a - addr;
1493			seg = segref = seg_alloc(as, addr, segsize);
1494			if (seg == NULL) {
1495				return (ENOMEM);
1496			}
1497			vn_a->szc = szc;
1498			error = (*crfp)(&seg, vn_a);
1499			if (error != 0) {
1500				VERIFY3P(seg, ==, segref);
1501				seg_free(seg);
1502				return (error);
1503			}
1504			as->a_size += segsize;
1505			as->a_resvsize += segsize;
1506			*segcreated = B_TRUE;
1507			if (do_off) {
1508				vn_a->offset += segsize;
1509			}
1510			addr = a;
1511		}
1512		szc = nszc;
1513		szcvec >>= 1;
1514	}
1515
1516	ASSERT(addr < eaddr);
1517	szcvec = save_szcvec | 1; /* add 8K pages */
1518	while (szcvec) {
1519		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1520		ASSERT(a >= addr);
1521		if (a != addr) {
1522			struct seg *seg, *segref;
1523			size_t segsize;
1524
1525			segsize = a - addr;
1526			seg = segref = seg_alloc(as, addr, segsize);
1527			if (seg == NULL) {
1528				return (ENOMEM);
1529			}
1530			vn_a->szc = szc;
1531			error = (*crfp)(&seg, vn_a);
1532			if (error != 0) {
1533				VERIFY3P(seg, ==, segref);
1534				seg_free(seg);
1535				return (error);
1536			}
1537			as->a_size += segsize;
1538			as->a_resvsize += segsize;
1539			*segcreated = B_TRUE;
1540			if (do_off) {
1541				vn_a->offset += segsize;
1542			}
1543			addr = a;
1544		}
1545		szcvec &= ~(1 << szc);
1546		if (szcvec) {
1547			szc = highbit(szcvec) - 1;
1548			pgsz = page_get_pagesize(szc);
1549		}
1550	}
1551	ASSERT(addr == eaddr);
1552
1553	return (0);
1554}
1555
1556static int
1557as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1558    segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
1559{
1560	uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1561	int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1562	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1563	    type, 0);
1564	int error;
1565	struct vattr va;
1566	u_offset_t eoff;
1567	size_t save_size = 0;
1568	extern size_t textrepl_size_thresh;
1569
1570	ASSERT(AS_WRITE_HELD(as));
1571	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1572	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1573	ASSERT(vn_a->vp != NULL);
1574	ASSERT(vn_a->amp == NULL);
1575
1576again:
1577	if (szcvec <= 1) {
1578		struct seg *seg, *segref;
1579
1580		seg = segref = seg_alloc(as, addr, size);
1581		if (seg == NULL) {
1582			return (ENOMEM);
1583		}
1584		vn_a->szc = 0;
1585		error = (*crfp)(&seg, vn_a);
1586		if (error != 0) {
1587			VERIFY3P(seg, ==, segref);
1588			seg_free(seg);
1589		} else {
1590			as->a_size += size;
1591			as->a_resvsize += size;
1592		}
1593		return (error);
1594	}
1595
1596	va.va_mask = AT_SIZE;
1597	if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1598		szcvec = 0;
1599		goto again;
1600	}
1601	eoff = vn_a->offset & PAGEMASK;
1602	if (eoff >= va.va_size) {
1603		szcvec = 0;
1604		goto again;
1605	}
1606	eoff += size;
1607	if (btopr(va.va_size) < btopr(eoff)) {
1608		save_size = size;
1609		size = va.va_size - (vn_a->offset & PAGEMASK);
1610		size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1611		szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1612		    type, 0);
1613		if (szcvec <= 1) {
1614			size = save_size;
1615			goto again;
1616		}
1617	}
1618
1619	if (size > textrepl_size_thresh) {
1620		vn_a->flags |= _MAP_TEXTREPL;
1621	}
1622	error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1623	    segcreated);
1624	if (error != 0) {
1625		return (error);
1626	}
1627	if (save_size) {
1628		addr += size;
1629		size = save_size - size;
1630		szcvec = 0;
1631		goto again;
1632	}
1633	return (0);
1634}
1635
1636/*
1637 * as_map_ansegs: shared or private anonymous memory.  Note that the flags
1638 * passed to map_pgszvec cannot be MAP_INITDATA, for anon.
1639 */
1640static int
1641as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1642    segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
1643{
1644	uint_t szcvec;
1645	uchar_t type;
1646
1647	ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1648	if (vn_a->type == MAP_SHARED) {
1649		type = MAPPGSZC_SHM;
1650	} else if (vn_a->type == MAP_PRIVATE) {
1651		if (vn_a->szc == AS_MAP_HEAP) {
1652			type = MAPPGSZC_HEAP;
1653		} else if (vn_a->szc == AS_MAP_STACK) {
1654			type = MAPPGSZC_STACK;
1655		} else {
1656			type = MAPPGSZC_PRIVM;
1657		}
1658	}
1659	szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1660	    (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1661	    (vn_a->flags & MAP_TEXT), type, 0);
1662	ASSERT(AS_WRITE_HELD(as));
1663	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1664	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1665	ASSERT(vn_a->vp == NULL);
1666
1667	return (as_map_segvn_segs(as, addr, size, szcvec,
1668	    crfp, vn_a, segcreated));
1669}
1670
1671int
1672as_map(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
1673    void *argsp)
1674{
1675	AS_LOCK_ENTER(as, RW_WRITER);
1676	return (as_map_locked(as, addr, size, crfp, argsp));
1677}
1678
1679int
1680as_map_locked(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
1681    void *argsp)
1682{
1683	caddr_t raddr;			/* rounded down addr */
1684	size_t rsize;			/* rounded up size */
1685	int error;
1686	boolean_t is_hole = B_FALSE;
1687	/*
1688	 * The use of a_proc is preferred to handle the case where curproc is
1689	 * a door_call server and is allocating memory in the client's (a_proc)
1690	 * address space.
1691	 * When creating a shared memory segment a_proc will be NULL so we
1692	 * fallback to curproc in that case.
1693	 */
1694	struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
1695	struct segvn_crargs crargs;
1696
1697	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1698	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1699	    (size_t)raddr;
1700
1701	/*
1702	 * check for wrap around
1703	 */
1704	if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1705		AS_LOCK_EXIT(as);
1706		return (ENOMEM);
1707	}
1708
1709	as->a_updatedir = 1;	/* inform /proc */
1710	gethrestime(&as->a_updatetime);
1711
1712	if (as != &kas) {
1713		/*
1714		 * Ensure that the virtual size of the process will not exceed
1715		 * the configured limit.  Since seg_hole segments will later
1716		 * set the S_HOLE flag indicating their status as a hole in the
1717		 * AS, they are excluded from this check.
1718		 */
1719		if (as->a_size + rsize > (size_t)p->p_vmem_ctl &&
1720		    !AS_MAP_CHECK_SEGHOLE(crfp)) {
1721			AS_LOCK_EXIT(as);
1722
1723			(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM],
1724			    p->p_rctls, p, RCA_UNSAFE_ALL);
1725			return (ENOMEM);
1726		}
1727	}
1728
1729	if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1730		boolean_t do_unmap = B_FALSE;
1731
1732		crargs = *(struct segvn_crargs *)argsp;
1733		error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs,
1734		    &do_unmap);
1735		if (error != 0) {
1736			AS_LOCK_EXIT(as);
1737			if (do_unmap) {
1738				(void) as_unmap(as, addr, size);
1739			}
1740			return (error);
1741		}
1742	} else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1743		boolean_t do_unmap = B_FALSE;
1744
1745		crargs = *(struct segvn_crargs *)argsp;
1746		error = as_map_ansegs(as, raddr, rsize, crfp, &crargs,
1747		    &do_unmap);
1748		if (error != 0) {
1749			AS_LOCK_EXIT(as);
1750			if (do_unmap) {
1751				(void) as_unmap(as, addr, size);
1752			}
1753			return (error);
1754		}
1755	} else {
1756		struct seg *seg, *segref;
1757
1758		seg = segref = seg_alloc(as, addr, size);
1759		if (seg == NULL) {
1760			AS_LOCK_EXIT(as);
1761			return (ENOMEM);
1762		}
1763
1764		/*
1765		 * It is possible that the segment creation routine will free
1766		 * 'seg' as part of a more advanced operation, such as when
1767		 * segvn concatenates adjacent segments together.  When this
1768		 * occurs, the seg*_create routine must communicate the
1769		 * resulting segment out via the 'struct seg **' parameter.
1770		 *
1771		 * If segment creation fails, it must not free the passed-in
1772		 * segment, nor alter the argument pointer.
1773		 */
1774		error = (*crfp)(&seg, argsp);
1775		if (error != 0) {
1776			VERIFY3P(seg, ==, segref);
1777			seg_free(seg);
1778			AS_LOCK_EXIT(as);
1779			return (error);
1780		}
1781
1782		/*
1783		 * Check if the resulting segment represents a hole in the
1784		 * address space, rather than contributing to the AS size.
1785		 */
1786		is_hole = ((seg->s_flags & S_HOLE) != 0);
1787
1788		/* Add size now so as_unmap will work if as_ctl fails. */
1789		if (!is_hole) {
1790			as->a_size += rsize;
1791			as->a_resvsize += rsize;
1792		}
1793	}
1794
1795	as_setwatch(as);
1796
1797	/*
1798	 * Establish memory locks for the segment if the address space is
1799	 * locked, provided it's not an explicit hole in the AS.
1800	 */
1801	mutex_enter(&as->a_contents);
1802	if (AS_ISPGLCK(as) && !is_hole) {
1803		mutex_exit(&as->a_contents);
1804		AS_LOCK_EXIT(as);
1805		error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1806		if (error != 0)
1807			(void) as_unmap(as, addr, size);
1808	} else {
1809		mutex_exit(&as->a_contents);
1810		AS_LOCK_EXIT(as);
1811	}
1812	return (error);
1813}
1814
1815
1816/*
1817 * Delete all segments in the address space marked with S_PURGE.
1818 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1819 * These segments are deleted as a first step before calls to as_gap(), so
1820 * that they don't affect mmap() or shmat().
1821 */
1822void
1823as_purge(struct as *as)
1824{
1825	struct seg *seg;
1826	struct seg *next_seg;
1827
1828	/*
1829	 * the setting of NEEDSPURGE is protect by as_rangelock(), so
1830	 * no need to grab a_contents mutex for this check
1831	 */
1832	if ((as->a_flags & AS_NEEDSPURGE) == 0)
1833		return;
1834
1835	AS_LOCK_ENTER(as, RW_WRITER);
1836	next_seg = NULL;
1837	seg = AS_SEGFIRST(as);
1838	while (seg != NULL) {
1839		next_seg = AS_SEGNEXT(as, seg);
1840		if (seg->s_flags & S_PURGE)
1841			SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1842		seg = next_seg;
1843	}
1844	AS_LOCK_EXIT(as);
1845
1846	mutex_enter(&as->a_contents);
1847	as->a_flags &= ~AS_NEEDSPURGE;
1848	mutex_exit(&as->a_contents);
1849}
1850
1851/*
1852 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1853 * range of addresses at least "minlen" long, where the base of the range is
1854 * at "off" phase from an "align" boundary and there is space for a
1855 * "redzone"-sized redzone on eithe rside of the range.  Thus,
1856 * if align was 4M and off was 16k, the user wants a hole which will start
1857 * 16k into a 4M page.
1858 *
1859 * If flags specifies AH_HI, the hole will have the highest possible address
1860 * in the range.  We use the as->a_lastgap field to figure out where to
1861 * start looking for a gap.
1862 *
1863 * Otherwise, the gap will have the lowest possible address.
1864 *
1865 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1866 *
1867 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1868 * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1869 *
1870 * NOTE: This routine is not correct when base+len overflows caddr_t.
1871 */
1872int
1873as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1874    uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1875{
1876	caddr_t lobound = *basep;
1877	caddr_t hibound = lobound + *lenp;
1878	struct seg *lseg, *hseg;
1879	caddr_t lo, hi;
1880	int forward;
1881	caddr_t save_base;
1882	size_t save_len;
1883	size_t save_minlen;
1884	size_t save_redzone;
1885	int fast_path = 1;
1886
1887	save_base = *basep;
1888	save_len = *lenp;
1889	save_minlen = minlen;
1890	save_redzone = redzone;
1891
1892	/*
1893	 * For the first pass/fast_path, just add align and redzone into
1894	 * minlen since if we get an allocation, we can guarantee that it
1895	 * will fit the alignment and redzone requested.
1896	 * This increases the chance that hibound will be adjusted to
1897	 * a_lastgap->s_base which will likely allow us to find an
1898	 * acceptable hole in the address space quicker.
1899	 * If we can't find a hole with this fast_path, then we look for
1900	 * smaller holes in which the alignment and offset may allow
1901	 * the allocation to fit.
1902	 */
1903	minlen += align;
1904	minlen += 2 * redzone;
1905	redzone = 0;
1906
1907	AS_LOCK_ENTER(as, RW_READER);
1908	if (AS_SEGFIRST(as) == NULL) {
1909		if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1910		    align, redzone, off)) {
1911			AS_LOCK_EXIT(as);
1912			return (0);
1913		} else {
1914			AS_LOCK_EXIT(as);
1915			*basep = save_base;
1916			*lenp = save_len;
1917			return (-1);
1918		}
1919	}
1920
1921retry:
1922	/*
1923	 * Set up to iterate over all the inter-segment holes in the given
1924	 * direction.  lseg is NULL for the lowest-addressed hole and hseg is
1925	 * NULL for the highest-addressed hole.  If moving backwards, we reset
1926	 * sseg to denote the highest-addressed segment.
1927	 */
1928	forward = (flags & AH_DIR) == AH_LO;
1929	if (forward) {
1930		hseg = as_findseg(as, lobound, 1);
1931		lseg = AS_SEGPREV(as, hseg);
1932	} else {
1933
1934		/*
1935		 * If allocating at least as much as the last allocation,
1936		 * use a_lastgap's base as a better estimate of hibound.
1937		 */
1938		if (as->a_lastgap &&
1939		    minlen >= as->a_lastgap->s_size &&
1940		    hibound >= as->a_lastgap->s_base)
1941			hibound = as->a_lastgap->s_base;
1942
1943		hseg = as_findseg(as, hibound, 1);
1944		if (hseg->s_base + hseg->s_size < hibound) {
1945			lseg = hseg;
1946			hseg = NULL;
1947		} else {
1948			lseg = AS_SEGPREV(as, hseg);
1949		}
1950	}
1951
1952	for (;;) {
1953		/*
1954		 * Set lo and hi to the hole's boundaries.  (We should really
1955		 * use MAXADDR in place of hibound in the expression below,
1956		 * but can't express it easily; using hibound in its place is
1957		 * harmless.)
1958		 */
1959		lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1960		hi = (hseg == NULL) ? hibound : hseg->s_base;
1961		/*
1962		 * If the iteration has moved past the interval from lobound
1963		 * to hibound it's pointless to continue.
1964		 */
1965		if ((forward && lo > hibound) || (!forward && hi < lobound))
1966			break;
1967		else if (lo > hibound || hi < lobound)
1968			goto cont;
1969		/*
1970		 * Candidate hole lies at least partially within the allowable
1971		 * range.  Restrict it to fall completely within that range,
1972		 * i.e., to [max(lo, lobound), min(hi, hibound)].
1973		 */
1974		if (lo < lobound)
1975			lo = lobound;
1976		if (hi > hibound)
1977			hi = hibound;
1978		/*
1979		 * Verify that the candidate hole is big enough and meets
1980		 * hardware constraints.  If the hole is too small, no need
1981		 * to do the further checks since they will fail.
1982		 */
1983		*basep = lo;
1984		*lenp = hi - lo;
1985		if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1986		    minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1987		    ((flags & AH_CONTAIN) == 0 ||
1988		    (*basep <= addr && *basep + *lenp > addr))) {
1989			if (!forward)
1990				as->a_lastgap = hseg;
1991			if (hseg != NULL)
1992				as->a_lastgaphl = hseg;
1993			else
1994				as->a_lastgaphl = lseg;
1995			AS_LOCK_EXIT(as);
1996			return (0);
1997		}
1998	cont:
1999		/*
2000		 * Move to the next hole.
2001		 */
2002		if (forward) {
2003			lseg = hseg;
2004			if (lseg == NULL)
2005				break;
2006			hseg = AS_SEGNEXT(as, hseg);
2007		} else {
2008			hseg = lseg;
2009			if (hseg == NULL)
2010				break;
2011			lseg = AS_SEGPREV(as, lseg);
2012		}
2013	}
2014	if (fast_path && (align != 0 || save_redzone != 0)) {
2015		fast_path = 0;
2016		minlen = save_minlen;
2017		redzone = save_redzone;
2018		goto retry;
2019	}
2020	*basep = save_base;
2021	*lenp = save_len;
2022	AS_LOCK_EXIT(as);
2023	return (-1);
2024}
2025
2026/*
2027 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
2028 *
2029 * If flags specifies AH_HI, the hole will have the highest possible address
2030 * in the range.  We use the as->a_lastgap field to figure out where to
2031 * start looking for a gap.
2032 *
2033 * Otherwise, the gap will have the lowest possible address.
2034 *
2035 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
2036 *
2037 * If an adequate hole is found, base and len are set to reflect the part of
2038 * the hole that is within range, and 0 is returned, otherwise,
2039 * -1 is returned.
2040 *
2041 * NOTE: This routine is not correct when base+len overflows caddr_t.
2042 */
2043int
2044as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
2045    caddr_t addr)
2046{
2047
2048	return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
2049}
2050
2051/*
2052 * Return the next range within [base, base + len) that is backed
2053 * with "real memory".  Skip holes and non-seg_vn segments.
2054 * We're lazy and only return one segment at a time.
2055 */
2056int
2057as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2058{
2059	extern struct seg_ops segspt_shmops;	/* needs a header file */
2060	struct seg *seg;
2061	caddr_t addr, eaddr;
2062	caddr_t segend;
2063
2064	AS_LOCK_ENTER(as, RW_READER);
2065
2066	addr = *basep;
2067	eaddr = addr + *lenp;
2068
2069	seg = as_findseg(as, addr, 0);
2070	if (seg != NULL)
2071		addr = MAX(seg->s_base, addr);
2072
2073	for (;;) {
2074		if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2075			AS_LOCK_EXIT(as);
2076			return (EINVAL);
2077		}
2078
2079		if (seg->s_ops == &segvn_ops) {
2080			segend = seg->s_base + seg->s_size;
2081			break;
2082		}
2083
2084		/*
2085		 * We do ISM by looking into the private data
2086		 * to determine the real size of the segment.
2087		 */
2088		if (seg->s_ops == &segspt_shmops) {
2089			segend = seg->s_base + spt_realsize(seg);
2090			if (addr < segend)
2091				break;
2092		}
2093
2094		seg = AS_SEGNEXT(as, seg);
2095
2096		if (seg != NULL)
2097			addr = seg->s_base;
2098	}
2099
2100	*basep = addr;
2101
2102	if (segend > eaddr)
2103		*lenp = eaddr - addr;
2104	else
2105		*lenp = segend - addr;
2106
2107	AS_LOCK_EXIT(as);
2108	return (0);
2109}
2110
2111/*
2112 * Swap the pages associated with the address space as out to
2113 * secondary storage, returning the number of bytes actually
2114 * swapped.
2115 *
2116 * The value returned is intended to correlate well with the process's
2117 * memory requirements.  Its usefulness for this purpose depends on
2118 * how well the segment-level routines do at returning accurate
2119 * information.
2120 */
2121size_t
2122as_swapout(struct as *as)
2123{
2124	struct seg *seg;
2125	size_t swpcnt = 0;
2126
2127	/*
2128	 * Kernel-only processes have given up their address
2129	 * spaces.  Of course, we shouldn't be attempting to
2130	 * swap out such processes in the first place...
2131	 */
2132	if (as == NULL)
2133		return (0);
2134
2135	AS_LOCK_ENTER(as, RW_READER);
2136
2137	/*
2138	 * Free all mapping resources associated with the address
2139	 * space.  The segment-level swapout routines capitalize
2140	 * on this unmapping by scavanging pages that have become
2141	 * unmapped here.
2142	 */
2143	hat_swapout(as->a_hat);
2144
2145	/*
2146	 * Call the swapout routines of all segments in the address
2147	 * space to do the actual work, accumulating the amount of
2148	 * space reclaimed.
2149	 */
2150	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2151		struct seg_ops *ov = seg->s_ops;
2152
2153		/*
2154		 * We have to check to see if the seg has
2155		 * an ops vector because the seg may have
2156		 * been in the middle of being set up when
2157		 * the process was picked for swapout.
2158		 */
2159		if ((ov != NULL) && (ov->swapout != NULL))
2160			swpcnt += SEGOP_SWAPOUT(seg);
2161	}
2162	AS_LOCK_EXIT(as);
2163	return (swpcnt);
2164}
2165
2166/*
2167 * Determine whether data from the mappings in interval [addr, addr + size)
2168 * are in the primary memory (core) cache.
2169 */
2170int
2171as_incore(struct as *as, caddr_t addr,
2172    size_t size, char *vec, size_t *sizep)
2173{
2174	struct seg *seg;
2175	size_t ssize;
2176	caddr_t raddr;		/* rounded down addr */
2177	size_t rsize;		/* rounded up size */
2178	size_t isize;			/* iteration size */
2179	int error = 0;		/* result, assume success */
2180
2181	*sizep = 0;
2182	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2183	rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2184	    (size_t)raddr;
2185
2186	if (raddr + rsize < raddr)		/* check for wraparound */
2187		return (ENOMEM);
2188
2189	AS_LOCK_ENTER(as, RW_READER);
2190	seg = as_segat(as, raddr);
2191	if (seg == NULL) {
2192		AS_LOCK_EXIT(as);
2193		return (-1);
2194	}
2195
2196	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2197		if (raddr >= seg->s_base + seg->s_size) {
2198			seg = AS_SEGNEXT(as, seg);
2199			if (seg == NULL || raddr != seg->s_base) {
2200				error = -1;
2201				break;
2202			}
2203		}
2204		if ((raddr + rsize) > (seg->s_base + seg->s_size))
2205			ssize = seg->s_base + seg->s_size - raddr;
2206		else
2207			ssize = rsize;
2208		*sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2209		if (isize != ssize) {
2210			error = -1;
2211			break;
2212		}
2213		vec += btopr(ssize);
2214	}
2215	AS_LOCK_EXIT(as);
2216	return (error);
2217}
2218
2219static void
2220as_segunlock(struct seg *seg, caddr_t addr, int attr,
2221    ulong_t *bitmap, size_t position, size_t npages)
2222{
2223	caddr_t	range_start;
2224	size_t	pos1 = position;
2225	size_t	pos2;
2226	size_t	size;
2227	size_t  end_pos = npages + position;
2228
2229	while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2230		size = ptob((pos2 - pos1));
2231		range_start = (caddr_t)((uintptr_t)addr +
2232		    ptob(pos1 - position));
2233
2234		(void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2235		    (ulong_t *)NULL, (size_t)NULL);
2236		pos1 = pos2;
2237	}
2238}
2239
2240static void
2241as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2242    caddr_t raddr, size_t rsize)
2243{
2244	struct seg *seg = as_segat(as, raddr);
2245	size_t ssize;
2246
2247	while (rsize != 0) {
2248		if (raddr >= seg->s_base + seg->s_size)
2249			seg = AS_SEGNEXT(as, seg);
2250
2251		if ((raddr + rsize) > (seg->s_base + seg->s_size))
2252			ssize = seg->s_base + seg->s_size - raddr;
2253		else
2254			ssize = rsize;
2255
2256		as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2257
2258		rsize -= ssize;
2259		raddr += ssize;
2260	}
2261}
2262
2263/*
2264 * Cache control operations over the interval [addr, addr + size) in
2265 * address space "as".
2266 */
2267/*ARGSUSED*/
2268int
2269as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2270    uintptr_t arg, ulong_t *lock_map, size_t pos)
2271{
2272	struct seg *seg;	/* working segment */
2273	caddr_t raddr;		/* rounded down addr */
2274	caddr_t initraddr;	/* saved initial rounded down addr */
2275	size_t rsize;		/* rounded up size */
2276	size_t initrsize;	/* saved initial rounded up size */
2277	size_t ssize;		/* size of seg */
2278	int error = 0;			/* result */
2279	size_t mlock_size;	/* size of bitmap */
2280	ulong_t *mlock_map;	/* pointer to bitmap used */
2281				/* to represent the locked */
2282				/* pages. */
2283retry:
2284	if (error == IE_RETRY)
2285		AS_LOCK_ENTER(as, RW_WRITER);
2286	else
2287		AS_LOCK_ENTER(as, RW_READER);
2288
2289	/*
2290	 * If these are address space lock/unlock operations, loop over
2291	 * all segments in the address space, as appropriate.
2292	 */
2293	if (func == MC_LOCKAS) {
2294		size_t npages, idx;
2295		size_t rlen = 0;	/* rounded as length */
2296
2297		idx = pos;
2298
2299		if (arg & MCL_FUTURE) {
2300			mutex_enter(&as->a_contents);
2301			AS_SETPGLCK(as);
2302			mutex_exit(&as->a_contents);
2303		}
2304		if ((arg & MCL_CURRENT) == 0) {
2305			AS_LOCK_EXIT(as);
2306			return (0);
2307		}
2308
2309		seg = AS_SEGFIRST(as);
2310		if (seg == NULL) {
2311			AS_LOCK_EXIT(as);
2312			return (0);
2313		}
2314
2315		do {
2316			raddr = (caddr_t)((uintptr_t)seg->s_base &
2317			    (uintptr_t)PAGEMASK);
2318			rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2319			    PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2320		} while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2321
2322		mlock_size = BT_BITOUL(btopr(rlen));
2323		if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2324		    sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2325				AS_LOCK_EXIT(as);
2326				return (EAGAIN);
2327		}
2328
2329		for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2330			if ((seg->s_flags & S_HOLE) != 0) {
2331				continue;
2332			}
2333			error = SEGOP_LOCKOP(seg, seg->s_base,
2334			    seg->s_size, attr, MC_LOCK, mlock_map, pos);
2335			if (error != 0)
2336				break;
2337			pos += seg_pages(seg);
2338		}
2339
2340		if (error) {
2341			for (seg = AS_SEGFIRST(as); seg != NULL;
2342			    seg = AS_SEGNEXT(as, seg)) {
2343
2344				raddr = (caddr_t)((uintptr_t)seg->s_base &
2345				    (uintptr_t)PAGEMASK);
2346				npages = seg_pages(seg);
2347				as_segunlock(seg, raddr, attr, mlock_map,
2348				    idx, npages);
2349				idx += npages;
2350			}
2351		}
2352
2353		kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2354		AS_LOCK_EXIT(as);
2355		goto lockerr;
2356	} else if (func == MC_UNLOCKAS) {
2357		mutex_enter(&as->a_contents);
2358		AS_CLRPGLCK(as);
2359		mutex_exit(&as->a_contents);
2360
2361		for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2362			if ((seg->s_flags & S_HOLE) != 0) {
2363				continue;
2364			}
2365			error = SEGOP_LOCKOP(seg, seg->s_base,
2366			    seg->s_size, attr, MC_UNLOCK, NULL, 0);
2367			if (error != 0)
2368				break;
2369		}
2370
2371		AS_LOCK_EXIT(as);
2372		goto lockerr;
2373	}
2374
2375	/*
2376	 * Normalize addresses and sizes.
2377	 */
2378	initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2379	initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2380	    (size_t)raddr;
2381
2382	if (raddr + rsize < raddr) {		/* check for wraparound */
2383		AS_LOCK_EXIT(as);
2384		return (ENOMEM);
2385	}
2386
2387	/*
2388	 * Get initial segment.
2389	 */
2390	if ((seg = as_segat(as, raddr)) == NULL) {
2391		AS_LOCK_EXIT(as);
2392		return (ENOMEM);
2393	}
2394
2395	if (func == MC_LOCK) {
2396		mlock_size = BT_BITOUL(btopr(rsize));
2397		if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2398		    sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2399				AS_LOCK_EXIT(as);
2400				return (EAGAIN);
2401		}
2402	}
2403
2404	/*
2405	 * Loop over all segments.  If a hole in the address range is
2406	 * discovered, then fail.  For each segment, perform the appropriate
2407	 * control operation.
2408	 */
2409	while (rsize != 0) {
2410
2411		/*
2412		 * Make sure there's no hole, calculate the portion
2413		 * of the next segment to be operated over.
2414		 */
2415		if (raddr >= seg->s_base + seg->s_size) {
2416			seg = AS_SEGNEXT(as, seg);
2417			if (seg == NULL || raddr != seg->s_base) {
2418				if (func == MC_LOCK) {
2419					as_unlockerr(as, attr, mlock_map,
2420					    initraddr, initrsize - rsize);
2421					kmem_free(mlock_map,
2422					    mlock_size * sizeof (ulong_t));
2423				}
2424				AS_LOCK_EXIT(as);
2425				return (ENOMEM);
2426			}
2427		}
2428		if ((raddr + rsize) > (seg->s_base + seg->s_size))
2429			ssize = seg->s_base + seg->s_size - raddr;
2430		else
2431			ssize = rsize;
2432
2433		/*
2434		 * Dispatch on specific function.
2435		 */
2436		switch (func) {
2437
2438		/*
2439		 * Synchronize cached data from mappings with backing
2440		 * objects.
2441		 */
2442		case MC_SYNC:
2443			if (error = SEGOP_SYNC(seg, raddr, ssize,
2444			    attr, (uint_t)arg)) {
2445				AS_LOCK_EXIT(as);
2446				return (error);
2447			}
2448			break;
2449
2450		/*
2451		 * Lock pages in memory.
2452		 */
2453		case MC_LOCK:
2454			if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2455			    attr, func, mlock_map, pos)) {
2456				as_unlockerr(as, attr, mlock_map, initraddr,
2457				    initrsize - rsize + ssize);
2458				kmem_free(mlock_map, mlock_size *
2459				    sizeof (ulong_t));
2460				AS_LOCK_EXIT(as);
2461				goto lockerr;
2462			}
2463			break;
2464
2465		/*
2466		 * Unlock mapped pages.
2467		 */
2468		case MC_UNLOCK:
2469			(void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2470			    (ulong_t *)NULL, (size_t)NULL);
2471			break;
2472
2473		/*
2474		 * Store VM advise for mapped pages in segment layer.
2475		 */
2476		case MC_ADVISE:
2477			error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2478
2479			/*
2480			 * Check for regular errors and special retry error
2481			 */
2482			if (error) {
2483				if (error == IE_RETRY) {
2484					/*
2485					 * Need to acquire writers lock, so
2486					 * have to drop readers lock and start
2487					 * all over again
2488					 */
2489					AS_LOCK_EXIT(as);
2490					goto retry;
2491				} else if (error == IE_REATTACH) {
2492					/*
2493					 * Find segment for current address
2494					 * because current segment just got
2495					 * split or concatenated
2496					 */
2497					seg = as_segat(as, raddr);
2498					if (seg == NULL) {
2499						AS_LOCK_EXIT(as);
2500						return (ENOMEM);
2501					}
2502				} else {
2503					/*
2504					 * Regular error
2505					 */
2506					AS_LOCK_EXIT(as);
2507					return (error);
2508				}
2509			}
2510			break;
2511
2512		case MC_INHERIT_ZERO:
2513			if (seg->s_ops->inherit == NULL) {
2514				error = ENOTSUP;
2515			} else {
2516				error = SEGOP_INHERIT(seg, raddr, ssize,
2517				    SEGP_INH_ZERO);
2518			}
2519			if (error != 0) {
2520				AS_LOCK_EXIT(as);
2521				return (error);
2522			}
2523			break;
2524
2525		/*
2526		 * Can't happen.
2527		 */
2528		default:
2529			panic("as_ctl: bad operation %d", func);
2530			/*NOTREACHED*/
2531		}
2532
2533		rsize -= ssize;
2534		raddr += ssize;
2535	}
2536
2537	if (func == MC_LOCK)
2538		kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2539	AS_LOCK_EXIT(as);
2540	return (0);
2541lockerr:
2542
2543	/*
2544	 * If the lower levels returned EDEADLK for a segment lockop,
2545	 * it means that we should retry the operation.  Let's wait
2546	 * a bit also to let the deadlock causing condition clear.
2547	 * This is part of a gross hack to work around a design flaw
2548	 * in the ufs/sds logging code and should go away when the
2549	 * logging code is re-designed to fix the problem. See bug
2550	 * 4125102 for details of the problem.
2551	 */
2552	if (error == EDEADLK) {
2553		delay(deadlk_wait);
2554		error = 0;
2555		goto retry;
2556	}
2557	return (error);
2558}
2559
2560int
2561fc_decode(faultcode_t fault_err)
2562{
2563	int error = 0;
2564
2565	switch (FC_CODE(fault_err)) {
2566	case FC_OBJERR:
2567		error = FC_ERRNO(fault_err);
2568		break;
2569	case FC_PROT:
2570		error = EACCES;
2571		break;
2572	default:
2573		error = EFAULT;
2574		break;
2575	}
2576	return (error);
2577}
2578
2579/*
2580 * Pagelock pages from a range that spans more than 1 segment.  Obtain shadow
2581 * lists from each segment and copy them to one contiguous shadow list (plist)
2582 * as expected by the caller.  Save pointers to per segment shadow lists at
2583 * the tail of plist so that they can be used during as_pageunlock().
2584 */
2585static int
2586as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2587    caddr_t addr, size_t size, enum seg_rw rw)
2588{
2589	caddr_t sv_addr = addr;
2590	size_t sv_size = size;
2591	struct seg *sv_seg = seg;
2592	ulong_t segcnt = 1;
2593	ulong_t cnt;
2594	size_t ssize;
2595	pgcnt_t npages = btop(size);
2596	page_t **plist;
2597	page_t **pl;
2598	int error;
2599	caddr_t eaddr;
2600	faultcode_t fault_err = 0;
2601	pgcnt_t pl_off;
2602	extern struct seg_ops segspt_shmops;
2603
2604	ASSERT(AS_LOCK_HELD(as));
2605	ASSERT(seg != NULL);
2606	ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2607	ASSERT(addr + size > seg->s_base + seg->s_size);
2608	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2609	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2610
2611	/*
2612	 * Count the number of segments covered by the range we are about to
2613	 * lock. The segment count is used to size the shadow list we return
2614	 * back to the caller.
2615	 */
2616	for (; size != 0; size -= ssize, addr += ssize) {
2617		if (addr >= seg->s_base + seg->s_size) {
2618
2619			seg = AS_SEGNEXT(as, seg);
2620			if (seg == NULL || addr != seg->s_base) {
2621				AS_LOCK_EXIT(as);
2622				return (EFAULT);
2623			}
2624			/*
2625			 * Do a quick check if subsequent segments
2626			 * will most likely support pagelock.
2627			 */
2628			if (seg->s_ops == &segvn_ops) {
2629				vnode_t *vp;
2630
2631				if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2632				    vp != NULL) {
2633					AS_LOCK_EXIT(as);
2634					goto slow;
2635				}
2636			} else if (seg->s_ops != &segspt_shmops) {
2637				AS_LOCK_EXIT(as);
2638				goto slow;
2639			}
2640			segcnt++;
2641		}
2642		if (addr + size > seg->s_base + seg->s_size) {
2643			ssize = seg->s_base + seg->s_size - addr;
2644		} else {
2645			ssize = size;
2646		}
2647	}
2648	ASSERT(segcnt > 1);
2649
2650	plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2651
2652	addr = sv_addr;
2653	size = sv_size;
2654	seg = sv_seg;
2655
2656	for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2657		if (addr >= seg->s_base + seg->s_size) {
2658			seg = AS_SEGNEXT(as, seg);
2659			ASSERT(seg != NULL && addr == seg->s_base);
2660			cnt++;
2661			ASSERT(cnt < segcnt);
2662		}
2663		if (addr + size > seg->s_base + seg->s_size) {
2664			ssize = seg->s_base + seg->s_size - addr;
2665		} else {
2666			ssize = size;
2667		}
2668		pl = &plist[npages + cnt];
2669		error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2670		    L_PAGELOCK, rw);
2671		if (error) {
2672			break;
2673		}
2674		ASSERT(plist[npages + cnt] != NULL);
2675		ASSERT(pl_off + btop(ssize) <= npages);
2676		bcopy(plist[npages + cnt], &plist[pl_off],
2677		    btop(ssize) * sizeof (page_t *));
2678		pl_off += btop(ssize);
2679	}
2680
2681	if (size == 0) {
2682		AS_LOCK_EXIT(as);
2683		ASSERT(cnt == segcnt - 1);
2684		*ppp = plist;
2685		return (0);
2686	}
2687
2688	/*
2689	 * one of pagelock calls failed. The error type is in error variable.
2690	 * Unlock what we've locked so far and retry with F_SOFTLOCK if error
2691	 * type is either EFAULT or ENOTSUP. Otherwise just return the error
2692	 * back to the caller.
2693	 */
2694
2695	eaddr = addr;
2696	seg = sv_seg;
2697
2698	for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2699		if (addr >= seg->s_base + seg->s_size) {
2700			seg = AS_SEGNEXT(as, seg);
2701			ASSERT(seg != NULL && addr == seg->s_base);
2702			cnt++;
2703			ASSERT(cnt < segcnt);
2704		}
2705		if (eaddr > seg->s_base + seg->s_size) {
2706			ssize = seg->s_base + seg->s_size - addr;
2707		} else {
2708			ssize = eaddr - addr;
2709		}
2710		pl = &plist[npages + cnt];
2711		ASSERT(*pl != NULL);
2712		(void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2713		    L_PAGEUNLOCK, rw);
2714	}
2715
2716	AS_LOCK_EXIT(as);
2717
2718	kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2719
2720	if (error != ENOTSUP && error != EFAULT) {
2721		return (error);
2722	}
2723
2724slow:
2725	/*
2726	 * If we are here because pagelock failed due to the need to cow fault
2727	 * in the pages we want to lock F_SOFTLOCK will do this job and in
2728	 * next as_pagelock() call for this address range pagelock will
2729	 * hopefully succeed.
2730	 */
2731	fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2732	if (fault_err != 0) {
2733		return (fc_decode(fault_err));
2734	}
2735	*ppp = NULL;
2736
2737	return (0);
2738}
2739
2740/*
2741 * lock pages in a given address space. Return shadow list. If
2742 * the list is NULL, the MMU mapping is also locked.
2743 */
2744int
2745as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2746    size_t size, enum seg_rw rw)
2747{
2748	size_t rsize;
2749	caddr_t raddr;
2750	faultcode_t fault_err;
2751	struct seg *seg;
2752	int err;
2753
2754	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2755	    "as_pagelock_start: addr %p size %ld", addr, size);
2756
2757	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2758	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2759	    (size_t)raddr;
2760
2761	/*
2762	 * if the request crosses two segments let
2763	 * as_fault handle it.
2764	 */
2765	AS_LOCK_ENTER(as, RW_READER);
2766
2767	seg = as_segat(as, raddr);
2768	if (seg == NULL) {
2769		AS_LOCK_EXIT(as);
2770		return (EFAULT);
2771	}
2772	ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2773	if (raddr + rsize > seg->s_base + seg->s_size) {
2774		return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2775	}
2776	if (raddr + rsize <= raddr) {
2777		AS_LOCK_EXIT(as);
2778		return (EFAULT);
2779	}
2780
2781	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2782	    "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2783
2784	/*
2785	 * try to lock pages and pass back shadow list
2786	 */
2787	err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2788
2789	TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2790
2791	AS_LOCK_EXIT(as);
2792
2793	if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2794		return (err);
2795	}
2796
2797	/*
2798	 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2799	 * to no pagelock support for this segment or pages need to be cow
2800	 * faulted in. If fault is needed F_SOFTLOCK will do this job for
2801	 * this as_pagelock() call and in the next as_pagelock() call for the
2802	 * same address range pagelock call will hopefull succeed.
2803	 */
2804	fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2805	if (fault_err != 0) {
2806		return (fc_decode(fault_err));
2807	}
2808	*ppp = NULL;
2809
2810	TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2811	return (0);
2812}
2813
2814/*
2815 * unlock pages locked by as_pagelock_segs().  Retrieve per segment shadow
2816 * lists from the end of plist and call pageunlock interface for each segment.
2817 * Drop as lock and free plist.
2818 */
2819static void
2820as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2821    struct page **plist, enum seg_rw rw)
2822{
2823	ulong_t cnt;
2824	caddr_t eaddr = addr + size;
2825	pgcnt_t npages = btop(size);
2826	size_t ssize;
2827	page_t **pl;
2828
2829	ASSERT(AS_LOCK_HELD(as));
2830	ASSERT(seg != NULL);
2831	ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2832	ASSERT(addr + size > seg->s_base + seg->s_size);
2833	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2834	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2835	ASSERT(plist != NULL);
2836
2837	for (cnt = 0; addr < eaddr; addr += ssize) {
2838		if (addr >= seg->s_base + seg->s_size) {
2839			seg = AS_SEGNEXT(as, seg);
2840			ASSERT(seg != NULL && addr == seg->s_base);
2841			cnt++;
2842		}
2843		if (eaddr > seg->s_base + seg->s_size) {
2844			ssize = seg->s_base + seg->s_size - addr;
2845		} else {
2846			ssize = eaddr - addr;
2847		}
2848		pl = &plist[npages + cnt];
2849		ASSERT(*pl != NULL);
2850		(void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2851		    L_PAGEUNLOCK, rw);
2852	}
2853	ASSERT(cnt > 0);
2854	AS_LOCK_EXIT(as);
2855
2856	cnt++;
2857	kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2858}
2859
2860/*
2861 * unlock pages in a given address range
2862 */
2863void
2864as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2865    enum seg_rw rw)
2866{
2867	struct seg *seg;
2868	size_t rsize;
2869	caddr_t raddr;
2870
2871	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2872	    "as_pageunlock_start: addr %p size %ld", addr, size);
2873
2874	/*
2875	 * if the shadow list is NULL, as_pagelock was
2876	 * falling back to as_fault
2877	 */
2878	if (pp == NULL) {
2879		(void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2880		return;
2881	}
2882
2883	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2884	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2885	    (size_t)raddr;
2886
2887	AS_LOCK_ENTER(as, RW_READER);
2888	seg = as_segat(as, raddr);
2889	ASSERT(seg != NULL);
2890
2891	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2892	    "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2893
2894	ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2895	if (raddr + rsize <= seg->s_base + seg->s_size) {
2896		SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2897	} else {
2898		as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2899		return;
2900	}
2901	AS_LOCK_EXIT(as);
2902	TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2903}
2904
2905int
2906as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2907    boolean_t wait)
2908{
2909	struct seg *seg;
2910	size_t ssize;
2911	caddr_t raddr;			/* rounded down addr */
2912	size_t rsize;			/* rounded up size */
2913	int error = 0;
2914	size_t pgsz = page_get_pagesize(szc);
2915
2916setpgsz_top:
2917	if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2918		return (EINVAL);
2919	}
2920
2921	raddr = addr;
2922	rsize = size;
2923
2924	if (raddr + rsize < raddr)		/* check for wraparound */
2925		return (ENOMEM);
2926
2927	AS_LOCK_ENTER(as, RW_WRITER);
2928	as_clearwatchprot(as, raddr, rsize);
2929	seg = as_segat(as, raddr);
2930	if (seg == NULL) {
2931		as_setwatch(as);
2932		AS_LOCK_EXIT(as);
2933		return (ENOMEM);
2934	}
2935
2936	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2937		if (raddr >= seg->s_base + seg->s_size) {
2938			seg = AS_SEGNEXT(as, seg);
2939			if (seg == NULL || raddr != seg->s_base) {
2940				error = ENOMEM;
2941				break;
2942			}
2943		}
2944		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2945			ssize = seg->s_base + seg->s_size - raddr;
2946		} else {
2947			ssize = rsize;
2948		}
2949
2950retry:
2951		error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2952
2953		if (error == IE_NOMEM) {
2954			error = EAGAIN;
2955			break;
2956		}
2957
2958		if (error == IE_RETRY) {
2959			AS_LOCK_EXIT(as);
2960			goto setpgsz_top;
2961		}
2962
2963		if (error == ENOTSUP) {
2964			error = EINVAL;
2965			break;
2966		}
2967
2968		if (wait && (error == EAGAIN)) {
2969			/*
2970			 * Memory is currently locked.  It must be unlocked
2971			 * before this operation can succeed through a retry.
2972			 * The possible reasons for locked memory and
2973			 * corresponding strategies for unlocking are:
2974			 * (1) Normal I/O
2975			 *	wait for a signal that the I/O operation
2976			 *	has completed and the memory is unlocked.
2977			 * (2) Asynchronous I/O
2978			 *	The aio subsystem does not unlock pages when
2979			 *	the I/O is completed. Those pages are unlocked
2980			 *	when the application calls aiowait/aioerror.
2981			 *	So, to prevent blocking forever, cv_broadcast()
2982			 *	is done to wake up aio_cleanup_thread.
2983			 *	Subsequently, segvn_reclaim will be called, and
2984			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
2985			 * (3) Long term page locking:
2986			 *	This is not relevant for as_setpagesize()
2987			 *	because we cannot change the page size for
2988			 *	driver memory. The attempt to do so will
2989			 *	fail with a different error than EAGAIN so
2990			 *	there's no need to trigger as callbacks like
2991			 *	as_unmap, as_setprot or as_free would do.
2992			 */
2993			mutex_enter(&as->a_contents);
2994			if (!AS_ISNOUNMAPWAIT(as)) {
2995				if (AS_ISUNMAPWAIT(as) == 0) {
2996					cv_broadcast(&as->a_cv);
2997				}
2998				AS_SETUNMAPWAIT(as);
2999				AS_LOCK_EXIT(as);
3000				while (AS_ISUNMAPWAIT(as)) {
3001					cv_wait(&as->a_cv, &as->a_contents);
3002				}
3003			} else {
3004				/*
3005				 * We may have raced with
3006				 * segvn_reclaim()/segspt_reclaim(). In this
3007				 * case clean nounmapwait flag and retry since
3008				 * softlockcnt in this segment may be already
3009				 * 0.  We don't drop as writer lock so our
3010				 * number of retries without sleeping should
3011				 * be very small. See segvn_reclaim() for
3012				 * more comments.
3013				 */
3014				AS_CLRNOUNMAPWAIT(as);
3015				mutex_exit(&as->a_contents);
3016				goto retry;
3017			}
3018			mutex_exit(&as->a_contents);
3019			goto setpgsz_top;
3020		} else if (error != 0) {
3021			break;
3022		}
3023	}
3024	as_setwatch(as);
3025	AS_LOCK_EXIT(as);
3026	return (error);
3027}
3028
3029/*
3030 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
3031 * in its chunk where s_szc is less than the szc we want to set.
3032 */
3033static int
3034as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3035    int *retry)
3036{
3037	struct seg *seg;
3038	size_t ssize;
3039	int error;
3040
3041	ASSERT(AS_WRITE_HELD(as));
3042
3043	seg = as_segat(as, raddr);
3044	if (seg == NULL) {
3045		panic("as_iset3_default_lpsize: no seg");
3046	}
3047
3048	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
3049		if (raddr >= seg->s_base + seg->s_size) {
3050			seg = AS_SEGNEXT(as, seg);
3051			if (seg == NULL || raddr != seg->s_base) {
3052				panic("as_iset3_default_lpsize: as changed");
3053			}
3054		}
3055		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3056			ssize = seg->s_base + seg->s_size - raddr;
3057		} else {
3058			ssize = rsize;
3059		}
3060
3061		if (szc > seg->s_szc) {
3062			error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
3063			/* Only retry on EINVAL segments that have no vnode. */
3064			if (error == EINVAL) {
3065				vnode_t *vp = NULL;
3066				if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3067				    (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3068				    vp == NULL)) {
3069					*retry = 1;
3070				} else {
3071					*retry = 0;
3072				}
3073			}
3074			if (error) {
3075				return (error);
3076			}
3077		}
3078	}
3079	return (0);
3080}
3081
3082/*
3083 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3084 * pagesize on each segment in its range, but if any fails with EINVAL,
3085 * then it reduces the pagesizes to the next size in the bitmap and
3086 * retries as_iset3_default_lpsize(). The reason why the code retries
3087 * smaller allowed sizes on EINVAL is because (a) the anon offset may not
3088 * match the bigger sizes, and (b) it's hard to get this offset (to begin
3089 * with) to pass to map_pgszcvec().
3090 */
3091static int
3092as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3093    uint_t szcvec)
3094{
3095	int error;
3096	int retry;
3097
3098	ASSERT(AS_WRITE_HELD(as));
3099
3100	for (;;) {
3101		error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3102		if (error == EINVAL && retry) {
3103			szcvec &= ~(1 << szc);
3104			if (szcvec <= 1) {
3105				return (EINVAL);
3106			}
3107			szc = highbit(szcvec) - 1;
3108		} else {
3109			return (error);
3110		}
3111	}
3112}
3113
3114/*
3115 * as_iset1_default_lpsize() breaks its chunk into areas where existing
3116 * segments have a smaller szc than we want to set. For each such area,
3117 * it calls as_iset2_default_lpsize()
3118 */
3119static int
3120as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3121    uint_t szcvec)
3122{
3123	struct seg *seg;
3124	size_t ssize;
3125	caddr_t setaddr = raddr;
3126	size_t setsize = 0;
3127	int set;
3128	int error;
3129
3130	ASSERT(AS_WRITE_HELD(as));
3131
3132	seg = as_segat(as, raddr);
3133	if (seg == NULL) {
3134		panic("as_iset1_default_lpsize: no seg");
3135	}
3136	if (seg->s_szc < szc) {
3137		set = 1;
3138	} else {
3139		set = 0;
3140	}
3141
3142	for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3143		if (raddr >= seg->s_base + seg->s_size) {
3144			seg = AS_SEGNEXT(as, seg);
3145			if (seg == NULL || raddr != seg->s_base) {
3146				panic("as_iset1_default_lpsize: as changed");
3147			}
3148			if (seg->s_szc >= szc && set) {
3149				ASSERT(setsize != 0);
3150				error = as_iset2_default_lpsize(as,
3151				    setaddr, setsize, szc, szcvec);
3152				if (error) {
3153					return (error);
3154				}
3155				set = 0;
3156			} else if (seg->s_szc < szc && !set) {
3157				setaddr = raddr;
3158				setsize = 0;
3159				set = 1;
3160			}
3161		}
3162		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3163			ssize = seg->s_base + seg->s_size - raddr;
3164		} else {
3165			ssize = rsize;
3166		}
3167	}
3168	error = 0;
3169	if (set) {
3170		ASSERT(setsize != 0);
3171		error = as_iset2_default_lpsize(as, setaddr, setsize,
3172		    szc, szcvec);
3173	}
3174	return (error);
3175}
3176
3177/*
3178 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3179 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3180 * chunk to as_iset1_default_lpsize().
3181 */
3182static int
3183as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3184    int type)
3185{
3186	int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3187	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3188	    flags, rtype, 1);
3189	uint_t szc;
3190	uint_t nszc;
3191	int error;
3192	caddr_t a;
3193	caddr_t eaddr;
3194	size_t segsize;
3195	size_t pgsz;
3196	uint_t save_szcvec;
3197
3198	ASSERT(AS_WRITE_HELD(as));
3199	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3200	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3201
3202	szcvec &= ~1;
3203	if (szcvec <= 1) {	/* skip if base page size */
3204		return (0);
3205	}
3206
3207	/* Get the pagesize of the first larger page size. */
3208	szc = lowbit(szcvec) - 1;
3209	pgsz = page_get_pagesize(szc);
3210	eaddr = addr + size;
3211	addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3212	eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3213
3214	save_szcvec = szcvec;
3215	szcvec >>= (szc + 1);
3216	nszc = szc;
3217	while (szcvec) {
3218		if ((szcvec & 0x1) == 0) {
3219			nszc++;
3220			szcvec >>= 1;
3221			continue;
3222		}
3223		nszc++;
3224		pgsz = page_get_pagesize(nszc);
3225		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3226		if (a != addr) {
3227			ASSERT(szc > 0);
3228			ASSERT(a < eaddr);
3229			segsize = a - addr;
3230			error = as_iset1_default_lpsize(as, addr, segsize, szc,
3231			    save_szcvec);
3232			if (error) {
3233				return (error);
3234			}
3235			addr = a;
3236		}
3237		szc = nszc;
3238		szcvec >>= 1;
3239	}
3240
3241	ASSERT(addr < eaddr);
3242	szcvec = save_szcvec;
3243	while (szcvec) {
3244		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3245		ASSERT(a >= addr);
3246		if (a != addr) {
3247			ASSERT(szc > 0);
3248			segsize = a - addr;
3249			error = as_iset1_default_lpsize(as, addr, segsize, szc,
3250			    save_szcvec);
3251			if (error) {
3252				return (error);
3253			}
3254			addr = a;
3255		}
3256		szcvec &= ~(1 << szc);
3257		if (szcvec) {
3258			szc = highbit(szcvec) - 1;
3259			pgsz = page_get_pagesize(szc);
3260		}
3261	}
3262	ASSERT(addr == eaddr);
3263
3264	return (0);
3265}
3266
3267/*
3268 * Set the default large page size for the range. Called via memcntl with
3269 * page size set to 0. as_set_default_lpsize breaks the range down into
3270 * chunks with the same type/flags, ignores-non segvn segments, and passes
3271 * each chunk to as_iset_default_lpsize().
3272 */
3273int
3274as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3275{
3276	struct seg *seg;
3277	caddr_t raddr;
3278	size_t rsize;
3279	size_t ssize;
3280	int rtype, rflags;
3281	int stype, sflags;
3282	int error;
3283	caddr_t	setaddr;
3284	size_t setsize;
3285	int segvn;
3286
3287	if (size == 0)
3288		return (0);
3289
3290	AS_LOCK_ENTER(as, RW_WRITER);
3291again:
3292	error = 0;
3293
3294	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3295	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3296	    (size_t)raddr;
3297
3298	if (raddr + rsize < raddr) {		/* check for wraparound */
3299		AS_LOCK_EXIT(as);
3300		return (ENOMEM);
3301	}
3302	as_clearwatchprot(as, raddr, rsize);
3303	seg = as_segat(as, raddr);
3304	if (seg == NULL) {
3305		as_setwatch(as);
3306		AS_LOCK_EXIT(as);
3307		return (ENOMEM);
3308	}
3309	if (seg->s_ops == &segvn_ops) {
3310		rtype = SEGOP_GETTYPE(seg, addr);
3311		rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3312		rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3313		segvn = 1;
3314	} else {
3315		segvn = 0;
3316	}
3317	setaddr = raddr;
3318	setsize = 0;
3319
3320	for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3321		if (raddr >= (seg->s_base + seg->s_size)) {
3322			seg = AS_SEGNEXT(as, seg);
3323			if (seg == NULL || raddr != seg->s_base) {
3324				error = ENOMEM;
3325				break;
3326			}
3327			if (seg->s_ops == &segvn_ops) {
3328				stype = SEGOP_GETTYPE(seg, raddr);
3329				sflags = stype & (MAP_TEXT | MAP_INITDATA);
3330				stype &= (MAP_SHARED | MAP_PRIVATE);
3331				if (segvn && (rflags != sflags ||
3332				    rtype != stype)) {
3333					/*
3334					 * The next segment is also segvn but
3335					 * has different flags and/or type.
3336					 */
3337					ASSERT(setsize != 0);
3338					error = as_iset_default_lpsize(as,
3339					    setaddr, setsize, rflags, rtype);
3340					if (error) {
3341						break;
3342					}
3343					rflags = sflags;
3344					rtype = stype;
3345					setaddr = raddr;
3346					setsize = 0;
3347				} else if (!segvn) {
3348					rflags = sflags;
3349					rtype = stype;
3350					setaddr = raddr;
3351					setsize = 0;
3352					segvn = 1;
3353				}
3354			} else if (segvn) {
3355				/* The next segment is not segvn. */
3356				ASSERT(setsize != 0);
3357				error = as_iset_default_lpsize(as,
3358				    setaddr, setsize, rflags, rtype);
3359				if (error) {
3360					break;
3361				}
3362				segvn = 0;
3363			}
3364		}
3365		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3366			ssize = seg->s_base + seg->s_size - raddr;
3367		} else {
3368			ssize = rsize;
3369		}
3370	}
3371	if (error == 0 && segvn) {
3372		/* The last chunk when rsize == 0. */
3373		ASSERT(setsize != 0);
3374		error = as_iset_default_lpsize(as, setaddr, setsize,
3375		    rflags, rtype);
3376	}
3377
3378	if (error == IE_RETRY) {
3379		goto again;
3380	} else if (error == IE_NOMEM) {
3381		error = EAGAIN;
3382	} else if (error == ENOTSUP) {
3383		error = EINVAL;
3384	} else if (error == EAGAIN) {
3385		mutex_enter(&as->a_contents);
3386		if (!AS_ISNOUNMAPWAIT(as)) {
3387			if (AS_ISUNMAPWAIT(as) == 0) {
3388				cv_broadcast(&as->a_cv);
3389			}
3390			AS_SETUNMAPWAIT(as);
3391			AS_LOCK_EXIT(as);
3392			while (AS_ISUNMAPWAIT(as)) {
3393				cv_wait(&as->a_cv, &as->a_contents);
3394			}
3395			mutex_exit(&as->a_contents);
3396			AS_LOCK_ENTER(as, RW_WRITER);
3397		} else {
3398			/*
3399			 * We may have raced with
3400			 * segvn_reclaim()/segspt_reclaim(). In this case
3401			 * clean nounmapwait flag and retry since softlockcnt
3402			 * in this segment may be already 0.  We don't drop as
3403			 * writer lock so our number of retries without
3404			 * sleeping should be very small. See segvn_reclaim()
3405			 * for more comments.
3406			 */
3407			AS_CLRNOUNMAPWAIT(as);
3408			mutex_exit(&as->a_contents);
3409		}
3410		goto again;
3411	}
3412
3413	as_setwatch(as);
3414	AS_LOCK_EXIT(as);
3415	return (error);
3416}
3417
3418/*
3419 * Setup all of the uninitialized watched pages that we can.
3420 */
3421void
3422as_setwatch(struct as *as)
3423{
3424	struct watched_page *pwp;
3425	struct seg *seg;
3426	caddr_t vaddr;
3427	uint_t prot;
3428	int  err, retrycnt;
3429
3430	if (avl_numnodes(&as->a_wpage) == 0)
3431		return;
3432
3433	ASSERT(AS_WRITE_HELD(as));
3434
3435	for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3436	    pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3437		retrycnt = 0;
3438	retry:
3439		vaddr = pwp->wp_vaddr;
3440		if (pwp->wp_oprot != 0 ||	/* already set up */
3441		    (seg = as_segat(as, vaddr)) == NULL ||
3442		    SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3443			continue;
3444
3445		pwp->wp_oprot = prot;
3446		if (pwp->wp_read)
3447			prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3448		if (pwp->wp_write)
3449			prot &= ~PROT_WRITE;
3450		if (pwp->wp_exec)
3451			prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3452		if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3453			err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3454			if (err == IE_RETRY) {
3455				pwp->wp_oprot = 0;
3456				ASSERT(retrycnt == 0);
3457				retrycnt++;
3458				goto retry;
3459			}
3460		}
3461		pwp->wp_prot = prot;
3462	}
3463}
3464
3465/*
3466 * Clear all of the watched pages in the address space.
3467 */
3468void
3469as_clearwatch(struct as *as)
3470{
3471	struct watched_page *pwp;
3472	struct seg *seg;
3473	caddr_t vaddr;
3474	uint_t prot;
3475	int err, retrycnt;
3476
3477	if (avl_numnodes(&as->a_wpage) == 0)
3478		return;
3479
3480	ASSERT(AS_WRITE_HELD(as));
3481
3482	for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3483	    pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3484		retrycnt = 0;
3485	retry:
3486		vaddr = pwp->wp_vaddr;
3487		if (pwp->wp_oprot == 0 ||	/* not set up */
3488		    (seg = as_segat(as, vaddr)) == NULL)
3489			continue;
3490
3491		if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3492			err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3493			if (err == IE_RETRY) {
3494				ASSERT(retrycnt == 0);
3495				retrycnt++;
3496				goto retry;
3497			}
3498		}
3499		pwp->wp_oprot = 0;
3500		pwp->wp_prot = 0;
3501	}
3502}
3503
3504/*
3505 * Force a new setup for all the watched pages in the range.
3506 */
3507static void
3508as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3509{
3510	struct watched_page *pwp;
3511	struct watched_page tpw;
3512	caddr_t eaddr = addr + size;
3513	caddr_t vaddr;
3514	struct seg *seg;
3515	int err, retrycnt;
3516	uint_t	wprot;
3517	avl_index_t where;
3518
3519	if (avl_numnodes(&as->a_wpage) == 0)
3520		return;
3521
3522	ASSERT(AS_WRITE_HELD(as));
3523
3524	tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3525	if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3526		pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3527
3528	while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3529		retrycnt = 0;
3530		vaddr = pwp->wp_vaddr;
3531
3532		wprot = prot;
3533		if (pwp->wp_read)
3534			wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3535		if (pwp->wp_write)
3536			wprot &= ~PROT_WRITE;
3537		if (pwp->wp_exec)
3538			wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3539		if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3540		retry:
3541			seg = as_segat(as, vaddr);
3542			if (seg == NULL) {
3543				panic("as_setwatchprot: no seg");
3544				/*NOTREACHED*/
3545			}
3546			err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3547			if (err == IE_RETRY) {
3548				ASSERT(retrycnt == 0);
3549				retrycnt++;
3550				goto retry;
3551			}
3552		}
3553		pwp->wp_oprot = prot;
3554		pwp->wp_prot = wprot;
3555
3556		pwp = AVL_NEXT(&as->a_wpage, pwp);
3557	}
3558}
3559
3560/*
3561 * Clear all of the watched pages in the range.
3562 */
3563static void
3564as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3565{
3566	caddr_t eaddr = addr + size;
3567	struct watched_page *pwp;
3568	struct watched_page tpw;
3569	uint_t prot;
3570	struct seg *seg;
3571	int err, retrycnt;
3572	avl_index_t where;
3573
3574	if (avl_numnodes(&as->a_wpage) == 0)
3575		return;
3576
3577	tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3578	if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3579		pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3580
3581	ASSERT(AS_WRITE_HELD(as));
3582
3583	while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3584
3585		if ((prot = pwp->wp_oprot) != 0) {
3586			retrycnt = 0;
3587
3588			if (prot != pwp->wp_prot) {
3589			retry:
3590				seg = as_segat(as, pwp->wp_vaddr);
3591				if (seg == NULL)
3592					continue;
3593				err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3594				    PAGESIZE, prot);
3595				if (err == IE_RETRY) {
3596					ASSERT(retrycnt == 0);
3597					retrycnt++;
3598					goto retry;
3599
3600				}
3601			}
3602			pwp->wp_oprot = 0;
3603			pwp->wp_prot = 0;
3604		}
3605
3606		pwp = AVL_NEXT(&as->a_wpage, pwp);
3607	}
3608}
3609
3610void
3611as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3612{
3613	struct proc *p;
3614
3615	mutex_enter(&pidlock);
3616	for (p = practive; p; p = p->p_next) {
3617		if (p->p_as == as) {
3618			mutex_enter(&p->p_lock);
3619			if (p->p_as == as)
3620				sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3621			mutex_exit(&p->p_lock);
3622		}
3623	}
3624	mutex_exit(&pidlock);
3625}
3626
3627/*
3628 * return memory object ID
3629 */
3630int
3631as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3632{
3633	struct seg	*seg;
3634	int		sts;
3635
3636	AS_LOCK_ENTER(as, RW_READER);
3637	seg = as_segat(as, addr);
3638	if (seg == NULL) {
3639		AS_LOCK_EXIT(as);
3640		return (EFAULT);
3641	}
3642	/*
3643	 * catch old drivers which may not support getmemid
3644	 */
3645	if (seg->s_ops->getmemid == NULL) {
3646		AS_LOCK_EXIT(as);
3647		return (ENODEV);
3648	}
3649
3650	sts = SEGOP_GETMEMID(seg, addr, memidp);
3651
3652	AS_LOCK_EXIT(as);
3653	return (sts);
3654}
3655