/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 Oxide Computer Company
 */

#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/list.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/vmsystm.h>
#include <sys/malloc.h>
#include <sys/x86_archext.h>
#include <vm/as.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vmm.h>

#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include "vm/vm_glue.h"

#define	PMAP_TO_VMMAP(pm)	((vm_map_t)		\
	((caddr_t)(pm) - offsetof(struct vmspace, vms_pmap)))
#define	VMMAP_TO_VMSPACE(vmmap)	((struct vmspace *)		\
	((caddr_t)(vmmap) - offsetof(struct vmspace, vm_map)))


struct vmspace_mapping {
	list_node_t	vmsm_node;
	vm_object_t	vmsm_object;
	uintptr_t	vmsm_addr;
	size_t		vmsm_len;
	off_t		vmsm_offset;
	uint_t		vmsm_prot;
};
typedef struct vmspace_mapping vmspace_mapping_t;

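/*
 * Convert an address within a vmspace mapping into the corresponding offset
 * in its backing vm_object.
 */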
#define	VMSM_OFFSET(vmsm, addr)	(			\
	    (vmsm)->vmsm_offset +			\
	    ((addr) - (uintptr_t)(vmsm)->vmsm_addr))


/* Private glue interfaces */
static void pmap_free(pmap_t);
static vmspace_mapping_t *vm_mapping_find(struct vmspace *, uintptr_t, size_t,
    boolean_t);
static void vm_mapping_remove(struct vmspace *, vmspace_mapping_t *);

static vmem_t *vmm_alloc_arena = NULL;

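/*
 * Arena used to back OBJT_DEFAULT vm_object allocations.  Pages are imported
 * from the kvmm arena and associated with the KV_VVP kernel vnode.
 */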
static void *
vmm_arena_alloc(vmem_t *vmp, size_t size, int vmflag)
{
	return (segkmem_xalloc(vmp, NULL, size, vmflag, 0,
	    segkmem_page_create, &kvps[KV_VVP]));
}

static void
vmm_arena_free(vmem_t *vmp, void *inaddr, size_t size)
{
	segkmem_xfree(vmp, inaddr, size, &kvps[KV_VVP], NULL);
}

void
vmm_arena_init(void)
{
	vmm_alloc_arena = vmem_create("vmm_alloc_arena", NULL, 0, 1024 * 1024,
	    vmm_arena_alloc, vmm_arena_free, kvmm_arena, 0, VM_SLEEP);

	ASSERT(vmm_alloc_arena != NULL);
}

void
vmm_arena_fini(void)
{
	VERIFY(vmem_size(vmm_alloc_arena, VMEM_ALLOC) == 0);
	vmem_destroy(vmm_alloc_arena);
	vmm_alloc_arena = NULL;
}

struct vmspace *
vmspace_alloc(vm_offset_t start, vm_offset_t end, pmap_pinit_t pinit)
{
	struct vmspace *vms;
	const uintptr_t size = end + 1;

	/*
	 * This whole mess is built on the assumption that a 64-bit address
	 * space is available to work with for the various pagetable tricks.
	 */
	VERIFY(ttoproc(curthread)->p_model == DATAMODEL_LP64);
	VERIFY(start == 0 && size > 0 && (size & PAGEOFFSET) == 0 &&
	    size <= (uintptr_t)USERLIMIT);

	vms = kmem_zalloc(sizeof (*vms), KM_SLEEP);
	vms->vms_size = size;
	list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t),
	    offsetof(vmspace_mapping_t, vmsm_node));

	if (pinit(&vms->vms_pmap) == 0) {
		kmem_free(vms, sizeof (*vms));
		return (NULL);
	}

	return (vms);
}

void
vmspace_free(struct vmspace *vms)
{
	VERIFY(list_is_empty(&vms->vms_maplist));

	pmap_free(&vms->vms_pmap);
	kmem_free(vms, sizeof (*vms));
}

pmap_t
vmspace_pmap(struct vmspace *vms)
{
	return (&vms->vms_pmap);
}

long
vmspace_resident_count(struct vmspace *vms)
{
	/* XXXJOY: finish */
	return (0);
}

void *
vmspace_find_kva(struct vmspace *vms, uintptr_t addr, size_t size)
{
	vmspace_mapping_t *vmsm;
	void *result = NULL;

	/*
	 * Since vmspace_find_kva is provided so that vmm_drv consumers can do
	 * GPA2KVA translations, it is expected to be called when there is a
	 * read lock preventing vmspace alterations.  As such, it can do the
	 * lockless vm_mapping_find() lookup.
	 */
	vmsm = vm_mapping_find(vms, addr, size, B_TRUE);
	if (vmsm != NULL) {
		struct vm_object *vmo = vmsm->vmsm_object;

		switch (vmo->vmo_type) {
		case OBJT_DEFAULT:
			result = (void *)((uintptr_t)vmo->vmo_data +
			    VMSM_OFFSET(vmsm, addr));
			break;
		default:
			break;
		}
	}

	return (result);
}

static int
vmspace_pmap_iswired(struct vmspace *vms, uintptr_t addr, uint_t *prot)
{
	pmap_t pmap = &vms->vms_pmap;
	int rv;

	ASSERT(MUTEX_HELD(&vms->vms_lock));

	rv = pmap->pm_ops->vpo_is_wired(pmap->pm_impl, addr, prot);
	return (rv);
}

static void
pmap_free(pmap_t pmap)
{
	void *pmi = pmap->pm_impl;
	struct vmm_pt_ops *ops = pmap->pm_ops;

	pmap->pm_pml4 = NULL;
	pmap->pm_impl = NULL;
	pmap->pm_ops = NULL;

	ops->vpo_free(pmi);
}

int
pmap_pinit_type(pmap_t pmap, enum pmap_type type, int flags)
{
	/* For use in vmm only */
	pmap->pm_type = type;
	switch (type) {
	case PT_EPT: {
		struct vmm_pt_ops *ops = &ept_ops;
		void *pml4, *pmi;

		pmi = ops->vpo_init((uintptr_t *)&pml4);

		pmap->pm_ops = ops;
		pmap->pm_impl = pmi;
		pmap->pm_pml4 = pml4;
		return (1);
	}
	case PT_RVI: {
		struct vmm_pt_ops *ops = &rvi_ops;
		void *pml4, *pmi;

		pmi = ops->vpo_init((uintptr_t *)&pml4);

		pmap->pm_ops = ops;
		pmap->pm_impl = pmi;
		pmap->pm_pml4 = pml4;
		return (1);
	}
	default:
		panic("unsupported pmap type: %x", type);
		break;
	}

	return (1);
}

long
pmap_wired_count(pmap_t pmap)
{
	long val;

	val = pmap->pm_ops->vpo_wired_cnt(pmap->pm_impl);
	VERIFY3S(val, >=, 0);

	return (val);
}

int
pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
{
	/* Allow the fallback to vm_fault to handle this */
	return (-1);
}


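/*
 * Minimal scatter-gather list implementation: a refcounted array of physical
 * address ranges, used as the backing handle for OBJT_SG objects.
 */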
struct sglist_ent {
	vm_paddr_t	sge_pa;
	size_t		sge_len;
};
struct sglist {
	kmutex_t		sg_lock;
	uint_t			sg_refcnt;
	uint_t			sg_len;
	uint_t			sg_next;
	struct sglist_ent	sg_entries[];
};

#define	SG_SIZE(cnt)	(sizeof (struct sglist) + \
	(sizeof (struct sglist_ent) * (cnt)))

struct sglist *
sglist_alloc(int nseg, int flags)
{
	const size_t sz = SG_SIZE(nseg);
	const int flag = (flags & M_WAITOK) ? KM_SLEEP : KM_NOSLEEP;
	struct sglist *sg;

	ASSERT(nseg > 0);

	sg = kmem_zalloc(sz, flag);
	if (sg != NULL) {
		sg->sg_len = nseg;
		sg->sg_refcnt = 1;
	}
	return (sg);
}

void
sglist_free(struct sglist *sg)
{
	size_t sz;

	mutex_enter(&sg->sg_lock);
	if (sg->sg_refcnt > 1) {
		sg->sg_refcnt--;
		mutex_exit(&sg->sg_lock);
		return;
	}

	VERIFY(sg->sg_refcnt == 1);
	sg->sg_refcnt = 0;
	sz = SG_SIZE(sg->sg_len);
	mutex_exit(&sg->sg_lock);
	kmem_free(sg, sz);
}

int
sglist_append_phys(struct sglist *sg, vm_paddr_t pa, size_t len)
{
	uint_t idx;
	struct sglist_ent *ent;

	/* Restrict to page-aligned entries */
	if ((pa & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0 || len == 0) {
		return (EINVAL);
	}

	mutex_enter(&sg->sg_lock);
	idx = sg->sg_next;
	if (idx >= sg->sg_len) {
		mutex_exit(&sg->sg_lock);
		return (ENOSPC);
	}

	ent = &sg->sg_entries[idx];
	ASSERT(ent->sge_pa == 0 && ent->sge_len == 0);
	ent->sge_pa = pa;
	ent->sge_len = len;
	sg->sg_next++;

	mutex_exit(&sg->sg_lock);
	return (0);
}


static pfn_t
vm_object_pager_none(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl)
{
	panic("bad vm_object pager");
	return (PFN_INVALID);
}

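/*
 * Pager for OBJT_DEFAULT (kmem-backed) objects: walk the kernel HAT to find
 * the PFN backing the given offset, also reporting the base PFN and level of
 * the (possibly large) page which maps it.
 */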
static pfn_t
vm_object_pager_heap(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl)
{
	const uintptr_t kaddr = ALIGN2PAGE((uintptr_t)vmo->vmo_data + off);
	uint_t idx, level;
	htable_t *ht;
	x86pte_t pte;
	pfn_t top_pfn, pfn;

	ASSERT(vmo->vmo_type == OBJT_DEFAULT);
	ASSERT(off < vmo->vmo_size);

	ht = htable_getpage(kas.a_hat, kaddr, &idx);
	if (ht == NULL) {
		return (PFN_INVALID);
	}
	pte = x86pte_get(ht, idx);
	if (!PTE_ISPAGE(pte, ht->ht_level)) {
		htable_release(ht);
		return (PFN_INVALID);
	}

	pfn = top_pfn = PTE2PFN(pte, ht->ht_level);
	level = ht->ht_level;
	if (ht->ht_level > 0) {
		pfn += mmu_btop(kaddr & LEVEL_OFFSET((uint_t)ht->ht_level));
	}
	htable_release(ht);

	if (lpfn != NULL) {
		*lpfn = top_pfn;
	}
	if (lvl != NULL) {
		*lvl = level;
	}
	return (pfn);
}

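/*
 * Pager for OBJT_SG objects: walk the sglist entries to locate the physical
 * page backing the given offset.  Large pages are not yet attempted, so the
 * reported level is always 0.
 */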
static pfn_t
vm_object_pager_sg(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl)
{
	const uintptr_t aoff = ALIGN2PAGE(off);
	uint_t level = 0;
	uintptr_t pos = 0;
	struct sglist *sg;
	struct sglist_ent *ent;
	pfn_t pfn = PFN_INVALID;

	ASSERT(vmo->vmo_type == OBJT_SG);
	ASSERT(off < vmo->vmo_size);

	sg = vmo->vmo_data;
	if (sg == NULL) {
		return (PFN_INVALID);
	}

	ent = &sg->sg_entries[0];
	for (uint_t i = 0; i < sg->sg_next; i++, ent++) {
		if (aoff >= pos && aoff < (pos + ent->sge_len)) {
			/* XXXJOY: Punt on large pages for now */
			level = 0;
			pfn = mmu_btop(ent->sge_pa + (aoff - pos));
			break;
		}
		pos += ent->sge_len;
	}

	if (lpfn != NULL) {
		*lpfn = pfn;
	}
	if (lvl != NULL) {
		*lvl = level;
	}
	return (pfn);
}

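/*
 * Check that enough memory is available for an allocation of `npages`.  If
 * the system is short, prod the reclaim machinery (including the ZFS ARC)
 * via page_needfree() and wait, interruptibly and for up to a minute, for
 * availrmem to recover.
 */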
static void
vm_reserve_pages(size_t npages)
{
	uint_t retries = 60;
	int rc;

	mutex_enter(&freemem_lock);
	if (availrmem < npages) {
		mutex_exit(&freemem_lock);

		/*
		 * Set needfree and wait for the ZFS ARC reap thread to free up
		 * some memory.
		 */
		page_needfree(npages);

		mutex_enter(&freemem_lock);
		while ((availrmem < npages) && retries-- > 0) {
			mutex_exit(&freemem_lock);
			rc = delay_sig(drv_usectohz(1 * MICROSEC));
			mutex_enter(&freemem_lock);

			if (rc == EINTR)
				break;
		}
		mutex_exit(&freemem_lock);

		page_needfree(-npages);
	} else {
		mutex_exit(&freemem_lock);
	}
}

void
vm_object_clear(vm_object_t vmo)
{
	ASSERT(vmo->vmo_type == OBJT_DEFAULT);

	/* XXXJOY: Better zeroing approach? */
	bzero(vmo->vmo_data, vmo->vmo_size);
}

vm_object_t
vm_object_allocate(objtype_t type, vm_pindex_t psize)
{
	vm_object_t vmo;
	const size_t size = ptob((size_t)psize);

	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
	mutex_init(&vmo->vmo_lock, NULL, MUTEX_DEFAULT, NULL);

	/* For now, these are to stay fixed after allocation */
	vmo->vmo_type = type;
	vmo->vmo_size = size;
	vmo->vmo_attr = VM_MEMATTR_DEFAULT;

	switch (type) {
	case OBJT_DEFAULT: {
		vm_reserve_pages(psize);

		/* XXXJOY: opt-in to larger pages? */
		vmo->vmo_data = vmem_alloc(vmm_alloc_arena, size, VM_NOSLEEP);
		if (vmo->vmo_data == NULL) {
			mutex_destroy(&vmo->vmo_lock);
			kmem_free(vmo, sizeof (*vmo));
			return (NULL);
		}
		vm_object_clear(vmo);
		vmo->vmo_pager = vm_object_pager_heap;
	}
		break;
	case OBJT_SG:
		vmo->vmo_data = NULL;
		vmo->vmo_pager = vm_object_pager_sg;
		break;
	default:
		panic("Unsupported vm_object type");
		break;
	}

	vmo->vmo_refcnt = 1;
	return (vmo);
}

vm_object_t
vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size,
    vm_prot_t prot, vm_ooffset_t off, void *cred)
{
	struct vm_object *vmo;
	struct sglist *sg = (struct sglist *)handle;

	/* XXXJOY: be very restrictive for now */
	VERIFY(type == OBJT_SG);
	VERIFY(off == 0);

	vmo = vm_object_allocate(type, size);
	vmo->vmo_data = sg;

	mutex_enter(&sg->sg_lock);
	VERIFY(sg->sg_refcnt++ >= 1);
	mutex_exit(&sg->sg_lock);

	return (vmo);
}

void
vm_object_deallocate(vm_object_t vmo)
{
	ASSERT(vmo != NULL);

	uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
	/* underflow would be a deadly serious mistake */
	VERIFY3U(ref, !=, UINT_MAX);
	if (ref != 0) {
		return;
	}

	switch (vmo->vmo_type) {
	case OBJT_DEFAULT:
		vmem_free(vmm_alloc_arena, vmo->vmo_data, vmo->vmo_size);
		break;
	case OBJT_SG:
		sglist_free((struct sglist *)vmo->vmo_data);
		break;
	default:
		panic("Unsupported vm_object type");
		break;
	}

	vmo->vmo_pager = vm_object_pager_none;
	vmo->vmo_data = NULL;
	vmo->vmo_size = 0;
	mutex_destroy(&vmo->vmo_lock);
	kmem_free(vmo, sizeof (*vmo));
}

CTASSERT(VM_MEMATTR_UNCACHEABLE == MTRR_TYPE_UC);
CTASSERT(VM_MEMATTR_WRITE_BACK == MTRR_TYPE_WB);
int
vm_object_set_memattr(vm_object_t vmo, vm_memattr_t attr)
{
	ASSERT(MUTEX_HELD(&vmo->vmo_lock));

	switch (attr) {
	case VM_MEMATTR_UNCACHEABLE:
	case VM_MEMATTR_WRITE_BACK:
		vmo->vmo_attr = attr;
		return (0);
	default:
		break;
	}
	return (EINVAL);
}

void
vm_object_reference(vm_object_t vmo)
{
	ASSERT(vmo != NULL);

	uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
	/* overflow would be a deadly serious mistake */
	VERIFY3U(ref, !=, 0);
}

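/*
 * Find the vmspace mapping which wholly contains [addr, addr + size).
 * Returns NULL if the range is unmapped or extends past the end of the
 * mapping which contains its start.  Callers passing no_lock must otherwise
 * guarantee that the map is not changing (see vms_map_changing).
 */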
static vmspace_mapping_t *
vm_mapping_find(struct vmspace *vms, uintptr_t addr, size_t size,
    boolean_t no_lock)
{
	vmspace_mapping_t *vmsm;
	list_t *ml = &vms->vms_maplist;
	const uintptr_t range_end = addr + size;

	ASSERT(addr <= range_end);

	if (no_lock) {
		/*
		 * This check should be superfluous given the protections
		 * promised by the bhyve logic which calls into the VM shim.
		 * All the same, it is cheap to be paranoid.
		 */
		VERIFY(!vms->vms_map_changing);
	} else {
		VERIFY(MUTEX_HELD(&vms->vms_lock));
	}

	if (addr >= vms->vms_size) {
		return (NULL);
	}
	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len;

		if (addr >= vmsm->vmsm_addr && addr < seg_end) {
			if (range_end <= seg_end) {
				return (vmsm);
			} else {
				return (NULL);
			}
		}
	}
	return (NULL);
}

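/*
 * Check whether [addr, addr + size) overlaps any existing mapping, returning
 * B_TRUE only if the range is entirely clear.
 */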
static boolean_t
vm_mapping_gap(struct vmspace *vms, uintptr_t addr, size_t size)
{
	vmspace_mapping_t *vmsm;
	list_t *ml = &vms->vms_maplist;
	const uintptr_t range_end = addr + size;

	ASSERT(MUTEX_HELD(&vms->vms_lock));

	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len;

		/*
		 * Two ranges overlap unless one of them begins at or
		 * beyond the end of the other.
		 */
		if (vmsm->vmsm_addr < range_end && addr < seg_end) {
			return (B_FALSE);
		}
	}
	return (B_TRUE);
}

static void
vm_mapping_remove(struct vmspace *vms, vmspace_mapping_t *vmsm)
{
	list_t *ml = &vms->vms_maplist;

	ASSERT(MUTEX_HELD(&vms->vms_lock));
	ASSERT(vms->vms_map_changing);

	list_remove(ml, vmsm);
	vm_object_deallocate(vmsm->vmsm_object);
	kmem_free(vmsm, sizeof (*vmsm));
}

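/*
 * Handle a fault against the vmspace: look up the mapping covering the
 * faulting address, resolve it to a PFN via the backing object's pager, and
 * install the translation in the nested page tables.
 */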
int
vm_fault(vm_map_t map, vm_offset_t off, vm_prot_t type, int flag)
{
	struct vmspace *vms = VMMAP_TO_VMSPACE(map);
	pmap_t pmap = &vms->vms_pmap;
	void *pmi = pmap->pm_impl;
	const uintptr_t addr = off;
	vmspace_mapping_t *vmsm;
	struct vm_object *vmo;
	uint_t prot, map_lvl;
	pfn_t pfn;
	uintptr_t map_addr;

	mutex_enter(&vms->vms_lock);
	if (vmspace_pmap_iswired(vms, addr, &prot) == 0) {
		int err = 0;

		/*
		 * It is possible that multiple vCPUs will race to fault-in a
		 * given address.  In such cases, the race loser(s) will
		 * encounter the already-mapped page, needing to do nothing
		 * more than consider it a success.
		 *
		 * If the fault exceeds protection, it is an obvious error.
		 */
		if ((prot & type) != type) {
			err = FC_PROT;
		}

		mutex_exit(&vms->vms_lock);
		return (err);
	}

	/* Try to wire up the address */
	if ((vmsm = vm_mapping_find(vms, addr, 0, B_FALSE)) == NULL) {
		mutex_exit(&vms->vms_lock);
		return (FC_NOMAP);
	}
	vmo = vmsm->vmsm_object;
	prot = vmsm->vmsm_prot;

	/* XXXJOY: punt on large pages for now */
	pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, addr), NULL, NULL);
	map_lvl = 0;
	map_addr = P2ALIGN((uintptr_t)addr, LEVEL_SIZE(map_lvl));
	VERIFY(pfn != PFN_INVALID);

	/*
	 * If pmap failure is to be handled, the previously acquired page locks
	 * would need to be released.
	 */
	VERIFY0(pmap->pm_ops->vpo_map(pmi, map_addr, pfn, map_lvl, prot,
	    vmo->vmo_attr));
	pmap->pm_eptgen++;

	mutex_exit(&vms->vms_lock);
	return (0);
}

int
vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
    vm_prot_t prot, vm_page_t *ma, int max_count)
{
	struct vmspace *vms = VMMAP_TO_VMSPACE(map);
	const uintptr_t vaddr = addr;
	vmspace_mapping_t *vmsm;
	struct vm_object *vmo;
	vm_page_t vmp;

	ASSERT0(addr & PAGEOFFSET);
	ASSERT(len == PAGESIZE);
	ASSERT(max_count == 1);

	/*
	 * Unlike practically all of the other logic that queries or
	 * manipulates vmspace objects, vm_fault_quick_hold_pages() does so
	 * without holding vms_lock.  This is safe because bhyve ensures that
	 * changes to the vmspace map occur only when all other threads have
	 * been excluded from running.
	 *
	 * Since this task can count on vms_maplist remaining static and does
	 * not need to modify the pmap (like vm_fault might), it can proceed
	 * without the lock.  The vm_object has independent refcount and lock
	 * protection, while the vmo_pager methods do not rely on vms_lock for
	 * safety.
	 *
	 * Performing this work without locks is critical in cases where
	 * multiple vCPUs require simultaneous instruction emulation, such as
	 * for frequent guest APIC accesses on a host that lacks hardware
	 * acceleration for that behavior.
	 */
	if ((vmsm = vm_mapping_find(vms, vaddr, PAGESIZE, B_TRUE)) == NULL ||
	    (prot & ~vmsm->vmsm_prot) != 0) {
		return (-1);
	}

	vmp = kmem_zalloc(sizeof (struct vm_page), KM_SLEEP);

	vmo = vmsm->vmsm_object;
	vm_object_reference(vmo);
	vmp->vmp_obj_held = vmo;
	vmp->vmp_pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, vaddr), NULL,
	    NULL);

	*ma = vmp;
	return (1);
}

/*
 * Find a suitable location for a mapping (and install it).
 */
int
vm_map_find(vm_map_t map, vm_object_t vmo, vm_ooffset_t off, vm_offset_t *addr,
    vm_size_t len, vm_offset_t max_addr, int find_flags, vm_prot_t prot,
    vm_prot_t prot_max, int cow)
{
	struct vmspace *vms = VMMAP_TO_VMSPACE(map);
	const size_t size = (size_t)len;
	const uintptr_t uoff = (uintptr_t)off;
	uintptr_t base = *addr;
	vmspace_mapping_t *vmsm;
	int res = 0;

	/* For use in vmm only */
	VERIFY(find_flags == VMFS_NO_SPACE); /* essentially MAP_FIXED */
	VERIFY(max_addr == 0);

	if (size == 0 || off < 0 ||
	    uoff >= (uoff + size) || vmo->vmo_size < (uoff + size)) {
		return (EINVAL);
	}

	if (*addr >= vms->vms_size) {
		return (ENOMEM);
	}

	vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP);

	mutex_enter(&vms->vms_lock);
	vms->vms_map_changing = B_TRUE;
	if (!vm_mapping_gap(vms, base, size)) {
		res = ENOMEM;
		goto out;
	}

	if (res == 0) {
		vmsm->vmsm_object = vmo;
		vmsm->vmsm_addr = base;
		vmsm->vmsm_len = len;
		vmsm->vmsm_offset = (off_t)uoff;
		vmsm->vmsm_prot = prot;
		list_insert_tail(&vms->vms_maplist, vmsm);

		/* Communicate out the chosen address. */
		*addr = (vm_offset_t)base;
	}
out:
	vms->vms_map_changing = B_FALSE;
	mutex_exit(&vms->vms_lock);
	if (res != 0) {
		kmem_free(vmsm, sizeof (*vmsm));
	}
	return (res);
}

int
vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	struct vmspace *vms = VMMAP_TO_VMSPACE(map);
	pmap_t pmap = &vms->vms_pmap;
	void *pmi = pmap->pm_impl;
	const uintptr_t addr = start;
	const size_t size = (size_t)(end - start);
	vmspace_mapping_t *vmsm;

	ASSERT(start < end);

	mutex_enter(&vms->vms_lock);
	vms->vms_map_changing = B_TRUE;
	/* expect to match existing mapping exactly */
	if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL ||
	    vmsm->vmsm_addr != addr || vmsm->vmsm_len != size) {
		vms->vms_map_changing = B_FALSE;
		mutex_exit(&vms->vms_lock);
		return (ENOENT);
	}

	(void) pmap->pm_ops->vpo_unmap(pmi, addr, end);
	pmap->pm_eptgen++;

	vm_mapping_remove(vms, vmsm);
	vms->vms_map_changing = B_FALSE;
	mutex_exit(&vms->vms_lock);
	return (0);
}

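/*
 * Pre-map (wire) a range of an existing mapping into the nested page tables,
 * one base page at a time.  The [start, end) range must fall within a single
 * mapping in the vmspace.
 */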
int
vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
{
	struct vmspace *vms = VMMAP_TO_VMSPACE(map);
	pmap_t pmap = &vms->vms_pmap;
	void *pmi = pmap->pm_impl;
	const uintptr_t addr = start;
	const size_t size = end - start;
	vmspace_mapping_t *vmsm;
	struct vm_object *vmo;
	uint_t prot;

	mutex_enter(&vms->vms_lock);

	/* For the time being, only exact-match mappings are expected */
	if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL) {
		mutex_exit(&vms->vms_lock);
		return (FC_NOMAP);
	}
	vmo = vmsm->vmsm_object;
	prot = vmsm->vmsm_prot;

	for (uintptr_t pos = addr; pos < end; ) {
		pfn_t pfn;
		uintptr_t pg_size, map_addr;
		uint_t map_lvl = 0;

		/* XXXJOY: punt on large pages for now */
		pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, pos), NULL, NULL);
		pg_size = LEVEL_SIZE(map_lvl);
		map_addr = P2ALIGN(pos, pg_size);
		VERIFY(pfn != PFN_INVALID);

		VERIFY0(pmap->pm_ops->vpo_map(pmi, map_addr, pfn, map_lvl,
		    prot, vmo->vmo_attr));
		vms->vms_pmap.pm_eptgen++;

		pos += pg_size;
	}

	mutex_exit(&vms->vms_lock);

	return (0);
}

/* Provided custom for bhyve 'devmem' segment mapping */
int
vm_segmap_obj(vm_object_t vmo, off_t map_off, size_t size, struct as *as,
    caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags)
{
	int err;

	VERIFY(map_off >= 0);
	VERIFY(size <= vmo->vmo_size);
	VERIFY((size + map_off) <= vmo->vmo_size);

	if (vmo->vmo_type != OBJT_DEFAULT) {
		/* Only support default objects for now */
		return (ENOTSUP);
	}

	as_rangelock(as);

	err = choose_addr(as, addrp, size, 0, ADDR_VACALIGN, flags);
	if (err == 0) {
		segvmm_crargs_t svma;

		svma.kaddr = (caddr_t)vmo->vmo_data + map_off;
		svma.prot = prot;
		svma.cookie = vmo;
		svma.hold = (segvmm_holdfn_t)vm_object_reference;
		svma.rele = (segvmm_relefn_t)vm_object_deallocate;

		err = as_map(as, *addrp, size, segvmm_create, &svma);
	}

	as_rangeunlock(as);
	return (err);
}

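/*
 * Map a page-aligned range of the vmspace into a host process address space
 * via seg_vmm, holding a reference on the backing vm_object for the lifetime
 * of the segment.
 */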
int
vm_segmap_space(struct vmspace *vms, off_t off, struct as *as, caddr_t *addrp,
    off_t len, uint_t prot, uint_t maxprot, uint_t flags)
{
	const uintptr_t addr = (uintptr_t)off;
	const size_t size = (uintptr_t)len;
	vmspace_mapping_t *vmsm;
	vm_object_t vmo;
	int err;

	if (off < 0 || len <= 0 ||
	    (addr & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) {
		return (EINVAL);
	}

	mutex_enter(&vms->vms_lock);
	if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL) {
		mutex_exit(&vms->vms_lock);
		return (ENXIO);
	}
	if ((prot & ~(vmsm->vmsm_prot | PROT_USER)) != 0) {
		mutex_exit(&vms->vms_lock);
		return (EACCES);
	}
	vmo = vmsm->vmsm_object;
	if (vmo->vmo_type != OBJT_DEFAULT) {
		/* Only support default objects for now */
		mutex_exit(&vms->vms_lock);
		return (ENOTSUP);
	}

	as_rangelock(as);

	err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags);
	if (err == 0) {
		segvmm_crargs_t svma;
		const uintptr_t addroff = addr - vmsm->vmsm_addr;
		const uintptr_t mapoff = addroff + vmsm->vmsm_offset;

		VERIFY(addroff < vmsm->vmsm_len);
		VERIFY((vmsm->vmsm_len - addroff) >= size);
		VERIFY(mapoff < vmo->vmo_size);
		VERIFY((mapoff + size) <= vmo->vmo_size);

		svma.kaddr = (void *)((uintptr_t)vmo->vmo_data + mapoff);
		svma.prot = prot;
		svma.cookie = vmo;
		svma.hold = (segvmm_holdfn_t)vm_object_reference;
		svma.rele = (segvmm_relefn_t)vm_object_deallocate;

		err = as_map(as, *addrp, len, segvmm_create, &svma);
	}

	as_rangeunlock(as);
	mutex_exit(&vms->vms_lock);
	return (err);
}

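/*
 * Release a page held by vm_fault_quick_hold_pages(): drop the object
 * reference taken there and free the vm_page token.
 */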
void
vm_page_unwire(vm_page_t vmp, uint8_t nqueue __unused)
{
	ASSERT(!MUTEX_HELD(&vmp->vmp_lock));
	mutex_enter(&vmp->vmp_lock);

	VERIFY(vmp->vmp_pfn != PFN_INVALID);

	vm_object_deallocate(vmp->vmp_obj_held);
	vmp->vmp_obj_held = NULL;
	vmp->vmp_pfn = PFN_INVALID;

	mutex_exit(&vmp->vmp_lock);

	mutex_destroy(&vmp->vmp_lock);
	kmem_free(vmp, sizeof (*vmp));
}