xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_gpt.c (revision b9b43e84)
1b4ceea05SDan Cross /*
2b4ceea05SDan Cross  * This file and its contents are supplied under the terms of the
3b4ceea05SDan Cross  * Common Development and Distribution License ("CDDL"), version 1.0.
4b4ceea05SDan Cross  * You may only use this file in accordance with the terms of version
5b4ceea05SDan Cross  * 1.0 of the CDDL.
6b4ceea05SDan Cross  *
7b4ceea05SDan Cross  * A full copy of the text of the CDDL should have accompanied this
8b4ceea05SDan Cross  * source.  A copy of the CDDL is also available via the Internet at
9b4ceea05SDan Cross  * http://www.illumos.org/license/CDDL.
10b4ceea05SDan Cross  */
11b4ceea05SDan Cross 
12b4ceea05SDan Cross /*
13b4ceea05SDan Cross  * Copyright 2019 Joyent, Inc.
143a0fa64cSPatrick Mooney  * Copyright 2023 Oxide Computer Company
15b4ceea05SDan Cross  */
16b4ceea05SDan Cross 
17b4ceea05SDan Cross #include <sys/types.h>
18b4ceea05SDan Cross #include <sys/atomic.h>
19b4ceea05SDan Cross #include <sys/kmem.h>
20b4ceea05SDan Cross #include <sys/sysmacros.h>
21b4ceea05SDan Cross #include <sys/sunddi.h>
22b4ceea05SDan Cross #include <sys/panic.h>
23b4ceea05SDan Cross #include <vm/hat.h>
24b4ceea05SDan Cross #include <vm/as.h>
25b4ceea05SDan Cross #include <vm/hat_i86.h>
26b4ceea05SDan Cross 
27b4ceea05SDan Cross #include <sys/vmm_gpt.h>
28b4ceea05SDan Cross 
/*
 * VMM Generic Page Tables
 *
 * Bhyve runs on AMD and Intel hosts and both support nested page tables
 * describing the guest's physical address space.  But the two use different and
 * mutually incompatible page table formats: Intel uses the EPT, which is based
 * on the Itanium page table format, while AMD uses the nPT, which is based on
 * the x86_64 page table format.
 *
 * The GPT abstracts these format differences, and provides a single interface
 * for interacting with either kind of table structure.
 *
 * At a high-level, the GPT is a tree that mirrors the paging table radix tree.
 * It is parameterized with operations on PTEs that are specific to the table
 * type (EPT or nPT) and also keeps track of how many pages the table maps, as
 * well as a pointer to the root node in the tree.
 *
 * A node in the GPT keeps pointers to its parent (NULL for the root), its
 * left-most child, and its siblings.  The node understands its position in the
 * tree in terms of the level it appears at and the index it occupies at its
 * parent's level, as well as how many children it has.  It also owns the
 * physical memory page for the hardware page table entries that map its
 * children.  Thus, for a node at any given level in the tree, the nested PTE
 * for that node's child at index i is the i'th uint64_t in that node's entry
 * page and the entry page is part of the paging structure consumed by hardware.
 *
 * The GPT interface provides functions for populating and vacating the tree for
 * regions in the guest physical address space, and for mapping and unmapping
 * pages in populated regions.  Users must populate a region before mapping
 * pages into it, and must unmap pages before vacating the region.
 *
 * The interface also exposes a function for walking the table from the root to
 * a leaf entry, populating an array of pointers to PTEs.  This walk uses the
 * hardware page structure itself, and is thus fast, though as a result it
 * potentially aliases entries; caveat emptor.  The walk primitive is used for
 * mapping, unmapping, and lookups.
 *
 * Format-specific differences are abstracted by parameterizing the GPT with a
 * set of PTE operations specific to the platform.  The GPT code makes use of
 * these when mapping or populating entries, resetting accessed and dirty bits
 * on entries, and similar operations.
 */
71b4ceea05SDan Cross 
/*
 * A GPT node.
 *
 * Each node contains pointers to its parent, its left-most child, and its
 * previous and next siblings, as well as its level and its index in its
 * parent's table.  Each node also records the host PFN of the page that it
 * links into the page table, a kernel pointer to that table page, and the
 * base GPA of the region which the node covers.
 *
 * On interior nodes, the reference count tracks the number of child nodes
 * linked beneath it (see vmm_gpt_node_add() and vmm_gpt_node_remove()).
 *
 * On leaf nodes, the reference count tracks how many entries in the table are
 * covered by mapping from the containing vmspace.  This is maintained during
 * calls to vmm_gpt_populate_region() and vmm_gpt_vacate_region() as part of
 * vmspace map/unmap operations, rather than in the data path of faults
 * populating the PTEs themselves.
 *
 * Note, this is carefully sized to fit exactly into a 64-byte cache line.
 */
typedef struct vmm_gpt_node vmm_gpt_node_t;
struct vmm_gpt_node {
	uint64_t	vgn_host_pfn;	/* host PFN of the entries page */
	uint16_t	vgn_level;	/* level of this table in the tree */
	uint16_t	vgn_index;	/* index within the parent's table */
	uint32_t	vgn_ref_cnt;	/* children (or covered leaf PTEs) */
	vmm_gpt_node_t	*vgn_parent;	/* NULL when not linked to a parent */
	vmm_gpt_node_t	*vgn_children;	/* left-most (lowest GPA) child */
	vmm_gpt_node_t	*vgn_sib_next;	/* next sibling by GPA, or NULL */
	vmm_gpt_node_t	*vgn_sib_prev;	/* previous sibling by GPA, or NULL */
	uint64_t	*vgn_entries;	/* kernel VA of the entries page */
	uint64_t	vgn_gpa;	/* base GPA covered by this node */
};
102b4ceea05SDan Cross 
/*
 * Each page table holds 512 entries, so the largest index a node can occupy
 * within its parent's table is one less than that.
 */
#define	PTE_PER_TABLE	512
#define	MAX_NODE_IDX	(PTE_PER_TABLE - 1)
1063a0fa64cSPatrick Mooney 
107b4ceea05SDan Cross /*
108b4ceea05SDan Cross  * A VMM Generic Page Table.
109b4ceea05SDan Cross  *
110b4ceea05SDan Cross  * The generic page table is a format-agnostic, 4-level paging structure
111b4ceea05SDan Cross  * modeling a second-level page table (EPT on Intel, nPT on AMD).  It
112b4ceea05SDan Cross  * contains a counter of pages the table maps, a pointer to the root node
113b4ceea05SDan Cross  * in the table, and is parameterized with a set of PTE operations specific
114b4ceea05SDan Cross  * to the table type.
115b4ceea05SDan Cross  */
116b4ceea05SDan Cross struct vmm_gpt {
117b4ceea05SDan Cross 	vmm_gpt_node_t	*vgpt_root;
118b4ceea05SDan Cross 	vmm_pte_ops_t	*vgpt_pte_ops;
119b4ceea05SDan Cross };
120b4ceea05SDan Cross 
121b4ceea05SDan Cross /*
122b4ceea05SDan Cross  * Allocates a vmm_gpt_node_t structure with corresponding page of memory to
123b4ceea05SDan Cross  * hold the PTEs it contains.
124b4ceea05SDan Cross  */
125b4ceea05SDan Cross static vmm_gpt_node_t *
vmm_gpt_node_alloc(void)126b4ceea05SDan Cross vmm_gpt_node_alloc(void)
127b4ceea05SDan Cross {
128b4ceea05SDan Cross 	vmm_gpt_node_t *node;
129b4ceea05SDan Cross 	caddr_t page;
130b4ceea05SDan Cross 
131b4ceea05SDan Cross 	node = kmem_zalloc(sizeof (*node), KM_SLEEP);
132b4ceea05SDan Cross 	/*
133b4ceea05SDan Cross 	 * Note: despite the man page, allocating PAGESIZE bytes is
134b4ceea05SDan Cross 	 * guaranteed to be page-aligned.
135b4ceea05SDan Cross 	 */
136b4ceea05SDan Cross 	page = kmem_zalloc(PAGESIZE, KM_SLEEP);
137b4ceea05SDan Cross 	node->vgn_entries = (uint64_t *)page;
138b4ceea05SDan Cross 	node->vgn_host_pfn = hat_getpfnum(kas.a_hat, page);
139b4ceea05SDan Cross 
140b4ceea05SDan Cross 	return (node);
141b4ceea05SDan Cross }
142b4ceea05SDan Cross 
143b4ceea05SDan Cross /*
144b4ceea05SDan Cross  * Allocates and initializes a vmm_gpt_t.
145b4ceea05SDan Cross  */
146b4ceea05SDan Cross vmm_gpt_t *
vmm_gpt_alloc(vmm_pte_ops_t * pte_ops)147b4ceea05SDan Cross vmm_gpt_alloc(vmm_pte_ops_t *pte_ops)
148b4ceea05SDan Cross {
149b4ceea05SDan Cross 	vmm_gpt_t *gpt;
150b4ceea05SDan Cross 
151b4ceea05SDan Cross 	VERIFY(pte_ops != NULL);
152b4ceea05SDan Cross 	gpt = kmem_zalloc(sizeof (*gpt), KM_SLEEP);
153b4ceea05SDan Cross 	gpt->vgpt_pte_ops = pte_ops;
154b4ceea05SDan Cross 	gpt->vgpt_root = vmm_gpt_node_alloc();
155b4ceea05SDan Cross 
156b4ceea05SDan Cross 	return (gpt);
157b4ceea05SDan Cross }
158b4ceea05SDan Cross 
159b4ceea05SDan Cross /*
1603a0fa64cSPatrick Mooney  * Frees a given node.  The node is expected to have no familial (parent,
1613a0fa64cSPatrick Mooney  * children, siblings) associations at this point.  Accordingly, its reference
1623a0fa64cSPatrick Mooney  * count should be zero.
163b4ceea05SDan Cross  */
164b4ceea05SDan Cross static void
vmm_gpt_node_free(vmm_gpt_node_t * node)165b4ceea05SDan Cross vmm_gpt_node_free(vmm_gpt_node_t *node)
166b4ceea05SDan Cross {
167b4ceea05SDan Cross 	ASSERT(node != NULL);
168b4ceea05SDan Cross 	ASSERT3U(node->vgn_ref_cnt, ==, 0);
169b4ceea05SDan Cross 	ASSERT(node->vgn_host_pfn != PFN_INVALID);
170b4ceea05SDan Cross 	ASSERT(node->vgn_entries != NULL);
1713a0fa64cSPatrick Mooney 	ASSERT(node->vgn_parent == NULL);
1723a0fa64cSPatrick Mooney 
173b4ceea05SDan Cross 	kmem_free(node->vgn_entries, PAGESIZE);
174b4ceea05SDan Cross 	kmem_free(node, sizeof (*node));
175b4ceea05SDan Cross }
176b4ceea05SDan Cross 
177b4ceea05SDan Cross /*
1783a0fa64cSPatrick Mooney  * Frees a vmm_gpt_t.  Any lingering nodes in the GPT will be freed too.
179b4ceea05SDan Cross  */
1803a0fa64cSPatrick Mooney void
vmm_gpt_free(vmm_gpt_t * gpt)1813a0fa64cSPatrick Mooney vmm_gpt_free(vmm_gpt_t *gpt)
182b4ceea05SDan Cross {
1833a0fa64cSPatrick Mooney 	/* Empty anything remaining in the tree */
1843a0fa64cSPatrick Mooney 	vmm_gpt_vacate_region(gpt, 0, UINT64_MAX & PAGEMASK);
185b4ceea05SDan Cross 
1863a0fa64cSPatrick Mooney 	VERIFY(gpt->vgpt_root != NULL);
1873a0fa64cSPatrick Mooney 	VERIFY3U(gpt->vgpt_root->vgn_ref_cnt, ==, 0);
1883a0fa64cSPatrick Mooney 
1893a0fa64cSPatrick Mooney 	vmm_gpt_node_free(gpt->vgpt_root);
1903a0fa64cSPatrick Mooney 	kmem_free(gpt, sizeof (*gpt));
191b4ceea05SDan Cross }
192b4ceea05SDan Cross 
193b4ceea05SDan Cross /*
1943a0fa64cSPatrick Mooney  * Given a GPA, return its corresponding index in a paging structure at the
1953a0fa64cSPatrick Mooney  * provided level.
196b4ceea05SDan Cross  */
1973a0fa64cSPatrick Mooney static inline uint16_t
vmm_gpt_lvl_index(vmm_gpt_node_level_t level,uint64_t gpa)1983a0fa64cSPatrick Mooney vmm_gpt_lvl_index(vmm_gpt_node_level_t level, uint64_t gpa)
199b4ceea05SDan Cross {
2003a0fa64cSPatrick Mooney 	ASSERT(level < MAX_GPT_LEVEL);
2013a0fa64cSPatrick Mooney 
2023a0fa64cSPatrick Mooney 	const uint_t shifts[] = {
2033a0fa64cSPatrick Mooney 		[LEVEL4] = 39,
2043a0fa64cSPatrick Mooney 		[LEVEL3] = 30,
2053a0fa64cSPatrick Mooney 		[LEVEL2] = 21,
2063a0fa64cSPatrick Mooney 		[LEVEL1] = 12,
2073a0fa64cSPatrick Mooney 	};
2083a0fa64cSPatrick Mooney 	const uint16_t mask = (1U << 9) - 1;
2093a0fa64cSPatrick Mooney 	return ((gpa >> shifts[level]) & mask);
2103a0fa64cSPatrick Mooney }
2113a0fa64cSPatrick Mooney 
2123a0fa64cSPatrick Mooney /* Get mask for addresses of entries at a given table level. */
2133a0fa64cSPatrick Mooney static inline uint64_t
vmm_gpt_lvl_mask(vmm_gpt_node_level_t level)2143a0fa64cSPatrick Mooney vmm_gpt_lvl_mask(vmm_gpt_node_level_t level)
2153a0fa64cSPatrick Mooney {
2163a0fa64cSPatrick Mooney 	ASSERT(level < MAX_GPT_LEVEL);
2173a0fa64cSPatrick Mooney 
2183a0fa64cSPatrick Mooney 	const uint64_t gpa_mask[] = {
2193a0fa64cSPatrick Mooney 		[LEVEL4] = 0xffffff8000000000ul, /* entries cover 512G */
2203a0fa64cSPatrick Mooney 		[LEVEL3] = 0xffffffffc0000000ul, /* entries cover 1G */
2213a0fa64cSPatrick Mooney 		[LEVEL2] = 0xffffffffffe00000ul, /* entries cover 2M */
2223a0fa64cSPatrick Mooney 		[LEVEL1] = 0xfffffffffffff000ul, /* entries cover 4K */
2233a0fa64cSPatrick Mooney 	};
2243a0fa64cSPatrick Mooney 	return (gpa_mask[level]);
2253a0fa64cSPatrick Mooney }
2263a0fa64cSPatrick Mooney 
2273a0fa64cSPatrick Mooney /* Get length of GPA covered by entries at a given table level. */
2283a0fa64cSPatrick Mooney static inline uint64_t
vmm_gpt_lvl_len(vmm_gpt_node_level_t level)2293a0fa64cSPatrick Mooney vmm_gpt_lvl_len(vmm_gpt_node_level_t level)
2303a0fa64cSPatrick Mooney {
2313a0fa64cSPatrick Mooney 	ASSERT(level < MAX_GPT_LEVEL);
2323a0fa64cSPatrick Mooney 
2333a0fa64cSPatrick Mooney 	const uint64_t gpa_len[] = {
2343a0fa64cSPatrick Mooney 		[LEVEL4] = 0x8000000000ul,	/* entries cover 512G */
2353a0fa64cSPatrick Mooney 		[LEVEL3] = 0x40000000ul,	/* entries cover 1G */
2363a0fa64cSPatrick Mooney 		[LEVEL2] = 0x200000ul,		/* entries cover 2M */
2373a0fa64cSPatrick Mooney 		[LEVEL1] = 0x1000ul,		/* entries cover 4K */
2383a0fa64cSPatrick Mooney 	};
2393a0fa64cSPatrick Mooney 	return (gpa_len[level]);
240b4ceea05SDan Cross }
241b4ceea05SDan Cross 
242b4ceea05SDan Cross /*
2433a0fa64cSPatrick Mooney  * Get the ending GPA which this node could possibly cover given its base
2443a0fa64cSPatrick Mooney  * address and level.
245b4ceea05SDan Cross  */
2463a0fa64cSPatrick Mooney static inline uint64_t
vmm_gpt_node_end(vmm_gpt_node_t * node)2473a0fa64cSPatrick Mooney vmm_gpt_node_end(vmm_gpt_node_t *node)
248b4ceea05SDan Cross {
2493a0fa64cSPatrick Mooney 	ASSERT(node->vgn_level > LEVEL4);
2503a0fa64cSPatrick Mooney 	return (node->vgn_gpa + vmm_gpt_lvl_len(node->vgn_level - 1));
2513a0fa64cSPatrick Mooney }
2523a0fa64cSPatrick Mooney 
2533a0fa64cSPatrick Mooney /*
2543a0fa64cSPatrick Mooney  * Is this node the last entry in its parent node, based solely by its GPA?
2553a0fa64cSPatrick Mooney  */
2563a0fa64cSPatrick Mooney static inline bool
vmm_gpt_node_is_last(vmm_gpt_node_t * node)2573a0fa64cSPatrick Mooney vmm_gpt_node_is_last(vmm_gpt_node_t *node)
2583a0fa64cSPatrick Mooney {
2593a0fa64cSPatrick Mooney 	return (node->vgn_index == MAX_NODE_IDX);
2603a0fa64cSPatrick Mooney }
2613a0fa64cSPatrick Mooney 
2623a0fa64cSPatrick Mooney /*
2633a0fa64cSPatrick Mooney  * How many table entries (if any) in this node are covered by the range of
2643a0fa64cSPatrick Mooney  * [start, end).
2653a0fa64cSPatrick Mooney  */
2663a0fa64cSPatrick Mooney static uint16_t
vmm_gpt_node_entries_covered(vmm_gpt_node_t * node,uint64_t start,uint64_t end)2673a0fa64cSPatrick Mooney vmm_gpt_node_entries_covered(vmm_gpt_node_t *node, uint64_t start, uint64_t end)
2683a0fa64cSPatrick Mooney {
2693a0fa64cSPatrick Mooney 	const uint64_t node_end = vmm_gpt_node_end(node);
2703a0fa64cSPatrick Mooney 
2713a0fa64cSPatrick Mooney 	/* Is this node covered at all by the region? */
2723a0fa64cSPatrick Mooney 	if (start >= node_end || end <= node->vgn_gpa) {
2733a0fa64cSPatrick Mooney 		return (0);
2743a0fa64cSPatrick Mooney 	}
2753a0fa64cSPatrick Mooney 
2763a0fa64cSPatrick Mooney 	const uint64_t mask = vmm_gpt_lvl_mask(node->vgn_level);
2773a0fa64cSPatrick Mooney 	const uint64_t covered_start = MAX(node->vgn_gpa, start & mask);
2783a0fa64cSPatrick Mooney 	const uint64_t covered_end = MIN(node_end, end & mask);
2793a0fa64cSPatrick Mooney 	const uint64_t per_entry = vmm_gpt_lvl_len(node->vgn_level);
2803a0fa64cSPatrick Mooney 
2813a0fa64cSPatrick Mooney 	return ((covered_end - covered_start) / per_entry);
282b4ceea05SDan Cross }
283b4ceea05SDan Cross 
2843a0fa64cSPatrick Mooney /*
2853a0fa64cSPatrick Mooney  * Find the next node (by address) in the tree at the same level.
2863a0fa64cSPatrick Mooney  *
2873a0fa64cSPatrick Mooney  * Returns NULL if this is the last node in the tree or if `only_seq` was true
2883a0fa64cSPatrick Mooney  * and there is an address gap between this node and the next.
2893a0fa64cSPatrick Mooney  */
2903a0fa64cSPatrick Mooney static vmm_gpt_node_t *
vmm_gpt_node_next(vmm_gpt_node_t * node,bool only_seq)2913a0fa64cSPatrick Mooney vmm_gpt_node_next(vmm_gpt_node_t *node, bool only_seq)
2923a0fa64cSPatrick Mooney {
2933a0fa64cSPatrick Mooney 	ASSERT3P(node->vgn_parent, !=, NULL);
2943a0fa64cSPatrick Mooney 	ASSERT3U(node->vgn_level, >, LEVEL4);
2953a0fa64cSPatrick Mooney 
2963a0fa64cSPatrick Mooney 	/*
2973a0fa64cSPatrick Mooney 	 * Next node sequentially would be the one at the address starting at
2983a0fa64cSPatrick Mooney 	 * the end of what is covered by this node.
2993a0fa64cSPatrick Mooney 	 */
3003a0fa64cSPatrick Mooney 	const uint64_t gpa_match = vmm_gpt_node_end(node);
3013a0fa64cSPatrick Mooney 
3023a0fa64cSPatrick Mooney 	/* Try our next sibling */
3033a0fa64cSPatrick Mooney 	vmm_gpt_node_t *next = node->vgn_sib_next;
3043a0fa64cSPatrick Mooney 	if (next != NULL) {
3053a0fa64cSPatrick Mooney 		if (next->vgn_gpa == gpa_match || !only_seq) {
3063a0fa64cSPatrick Mooney 			return (next);
3073a0fa64cSPatrick Mooney 		}
3083a0fa64cSPatrick Mooney 	} else {
3093a0fa64cSPatrick Mooney 		/*
3103a0fa64cSPatrick Mooney 		 * If the next-sibling pointer is NULL on the node, it can mean
3113a0fa64cSPatrick Mooney 		 * one of two things:
3123a0fa64cSPatrick Mooney 		 *
3133a0fa64cSPatrick Mooney 		 * 1. This entry represents the space leading up to the trailing
3143a0fa64cSPatrick Mooney 		 *    boundary of what this node covers.
3153a0fa64cSPatrick Mooney 		 *
3163a0fa64cSPatrick Mooney 		 * 2. The node is not entirely populated, and there is a gap
3173a0fa64cSPatrick Mooney 		 *    between the last populated entry, and the trailing
3183a0fa64cSPatrick Mooney 		 *    boundary of the node.
3193a0fa64cSPatrick Mooney 		 *
3203a0fa64cSPatrick Mooney 		 * Either way, the proper course of action is to check the first
3213a0fa64cSPatrick Mooney 		 * child of our parent's next sibling.
3223a0fa64cSPatrick Mooney 		 */
3233a0fa64cSPatrick Mooney 		vmm_gpt_node_t *pibling = node->vgn_parent->vgn_sib_next;
3243a0fa64cSPatrick Mooney 		if (pibling != NULL) {
3253a0fa64cSPatrick Mooney 			next = pibling->vgn_children;
3263a0fa64cSPatrick Mooney 			if (next != NULL) {
3273a0fa64cSPatrick Mooney 				if (next->vgn_gpa == gpa_match || !only_seq) {
3283a0fa64cSPatrick Mooney 					return (next);
3293a0fa64cSPatrick Mooney 				}
3303a0fa64cSPatrick Mooney 			}
3313a0fa64cSPatrick Mooney 		}
3323a0fa64cSPatrick Mooney 	}
3333a0fa64cSPatrick Mooney 
3343a0fa64cSPatrick Mooney 	return (NULL);
3353a0fa64cSPatrick Mooney }
3363a0fa64cSPatrick Mooney 
3373a0fa64cSPatrick Mooney 
338b4ceea05SDan Cross /*
339b4ceea05SDan Cross  * Finds the child for the given GPA in the given parent node.
340b4ceea05SDan Cross  * Returns a pointer to node, or NULL if it is not found.
341b4ceea05SDan Cross  */
342b4ceea05SDan Cross static vmm_gpt_node_t *
vmm_gpt_node_find_child(vmm_gpt_node_t * parent,uint64_t gpa)343b4ceea05SDan Cross vmm_gpt_node_find_child(vmm_gpt_node_t *parent, uint64_t gpa)
344b4ceea05SDan Cross {
3453a0fa64cSPatrick Mooney 	const uint16_t index = vmm_gpt_lvl_index(parent->vgn_level, gpa);
346b4ceea05SDan Cross 	for (vmm_gpt_node_t *child = parent->vgn_children;
347b4ceea05SDan Cross 	    child != NULL && child->vgn_index <= index;
3483a0fa64cSPatrick Mooney 	    child = child->vgn_sib_next) {
349b4ceea05SDan Cross 		if (child->vgn_index == index)
350b4ceea05SDan Cross 			return (child);
351b4ceea05SDan Cross 	}
352b4ceea05SDan Cross 
353b4ceea05SDan Cross 	return (NULL);
354b4ceea05SDan Cross }
355b4ceea05SDan Cross 
3563a0fa64cSPatrick Mooney /*
3573a0fa64cSPatrick Mooney  * Add a child node to the GPT at a position determined by GPA, parent, and (if
3583a0fa64cSPatrick Mooney  * present) preceding sibling.
3593a0fa64cSPatrick Mooney  *
3603a0fa64cSPatrick Mooney  * If `parent` node contains any children, `prev_sibling` must be populated with
3613a0fa64cSPatrick Mooney  * a pointer to the node preceding (by GPA) the to-be-added child node.
3623a0fa64cSPatrick Mooney  */
3633a0fa64cSPatrick Mooney static void
vmm_gpt_node_add(vmm_gpt_t * gpt,vmm_gpt_node_t * parent,vmm_gpt_node_t * child,uint64_t gpa,vmm_gpt_node_t * prev_sibling)3643a0fa64cSPatrick Mooney vmm_gpt_node_add(vmm_gpt_t *gpt, vmm_gpt_node_t *parent,
3653a0fa64cSPatrick Mooney     vmm_gpt_node_t *child, uint64_t gpa, vmm_gpt_node_t *prev_sibling)
3663a0fa64cSPatrick Mooney {
3673a0fa64cSPatrick Mooney 	ASSERT3U(parent->vgn_level, <, LEVEL1);
3683a0fa64cSPatrick Mooney 	ASSERT3U(child->vgn_parent, ==, NULL);
3693a0fa64cSPatrick Mooney 
3703a0fa64cSPatrick Mooney 	const uint16_t idx = vmm_gpt_lvl_index(parent->vgn_level, gpa);
3713a0fa64cSPatrick Mooney 	child->vgn_index = idx;
3723a0fa64cSPatrick Mooney 	child->vgn_level = parent->vgn_level + 1;
3733a0fa64cSPatrick Mooney 	child->vgn_gpa = gpa & vmm_gpt_lvl_mask(parent->vgn_level);
3743a0fa64cSPatrick Mooney 
3753a0fa64cSPatrick Mooney 	/* Establish familial connections */
3763a0fa64cSPatrick Mooney 	child->vgn_parent = parent;
3773a0fa64cSPatrick Mooney 	if (prev_sibling != NULL) {
3783a0fa64cSPatrick Mooney 		ASSERT3U(prev_sibling->vgn_gpa, <, child->vgn_gpa);
3793a0fa64cSPatrick Mooney 
3803a0fa64cSPatrick Mooney 		child->vgn_sib_next = prev_sibling->vgn_sib_next;
3813a0fa64cSPatrick Mooney 		if (child->vgn_sib_next != NULL) {
3823a0fa64cSPatrick Mooney 			child->vgn_sib_next->vgn_sib_prev = child;
3833a0fa64cSPatrick Mooney 		}
3843a0fa64cSPatrick Mooney 		child->vgn_sib_prev = prev_sibling;
3853a0fa64cSPatrick Mooney 		prev_sibling->vgn_sib_next = child;
3863a0fa64cSPatrick Mooney 	} else if (parent->vgn_children != NULL) {
3873a0fa64cSPatrick Mooney 		vmm_gpt_node_t *next_sibling = parent->vgn_children;
3883a0fa64cSPatrick Mooney 
3893a0fa64cSPatrick Mooney 		ASSERT3U(next_sibling->vgn_gpa, >, child->vgn_gpa);
3903a0fa64cSPatrick Mooney 		ASSERT3U(next_sibling->vgn_sib_prev, ==, NULL);
3913a0fa64cSPatrick Mooney 
3923a0fa64cSPatrick Mooney 		child->vgn_sib_next = next_sibling;
3933a0fa64cSPatrick Mooney 		child->vgn_sib_prev = NULL;
3943a0fa64cSPatrick Mooney 		next_sibling->vgn_sib_prev = child;
3953a0fa64cSPatrick Mooney 		parent->vgn_children = child;
3963a0fa64cSPatrick Mooney 	} else {
3973a0fa64cSPatrick Mooney 		parent->vgn_children = child;
3983a0fa64cSPatrick Mooney 		child->vgn_sib_next = NULL;
3993a0fa64cSPatrick Mooney 		child->vgn_sib_prev = NULL;
4003a0fa64cSPatrick Mooney 	}
4013a0fa64cSPatrick Mooney 
4023a0fa64cSPatrick Mooney 	/* Configure PTE for child table */
4033a0fa64cSPatrick Mooney 	parent->vgn_entries[idx] =
4043a0fa64cSPatrick Mooney 	    gpt->vgpt_pte_ops->vpeo_map_table(child->vgn_host_pfn);
4053a0fa64cSPatrick Mooney 	parent->vgn_ref_cnt++;
4063a0fa64cSPatrick Mooney }
4073a0fa64cSPatrick Mooney 
4083a0fa64cSPatrick Mooney /*
4093a0fa64cSPatrick Mooney  * Remove a child node from its relatives (parent, siblings) and free it.
4103a0fa64cSPatrick Mooney  */
4113a0fa64cSPatrick Mooney static void
vmm_gpt_node_remove(vmm_gpt_node_t * child)4123a0fa64cSPatrick Mooney vmm_gpt_node_remove(vmm_gpt_node_t *child)
4133a0fa64cSPatrick Mooney {
4143a0fa64cSPatrick Mooney 	ASSERT3P(child->vgn_children, ==, NULL);
4153a0fa64cSPatrick Mooney 	ASSERT3U(child->vgn_ref_cnt, ==, 0);
4163a0fa64cSPatrick Mooney 	ASSERT3P(child->vgn_parent, !=, NULL);
4173a0fa64cSPatrick Mooney 
4183a0fa64cSPatrick Mooney 	/* Unlink child from its siblings and parent */
4193a0fa64cSPatrick Mooney 	vmm_gpt_node_t *parent = child->vgn_parent;
4203a0fa64cSPatrick Mooney 	vmm_gpt_node_t *prev = child->vgn_sib_prev;
4213a0fa64cSPatrick Mooney 	vmm_gpt_node_t *next = child->vgn_sib_next;
4223a0fa64cSPatrick Mooney 	if (prev != NULL) {
4233a0fa64cSPatrick Mooney 		ASSERT3P(prev->vgn_sib_next, ==, child);
4243a0fa64cSPatrick Mooney 		prev->vgn_sib_next = next;
4253a0fa64cSPatrick Mooney 	}
4263a0fa64cSPatrick Mooney 	if (next != NULL) {
4273a0fa64cSPatrick Mooney 		ASSERT3P(next->vgn_sib_prev, ==, child);
4283a0fa64cSPatrick Mooney 		next->vgn_sib_prev = prev;
4293a0fa64cSPatrick Mooney 	}
4303a0fa64cSPatrick Mooney 	if (prev == NULL) {
4313a0fa64cSPatrick Mooney 		ASSERT3P(parent->vgn_children, ==, child);
4323a0fa64cSPatrick Mooney 		parent->vgn_children = next;
4333a0fa64cSPatrick Mooney 	}
4343a0fa64cSPatrick Mooney 	child->vgn_parent = NULL;
4353a0fa64cSPatrick Mooney 	child->vgn_sib_next = NULL;
4363a0fa64cSPatrick Mooney 	child->vgn_sib_prev = NULL;
4373a0fa64cSPatrick Mooney 	parent->vgn_entries[child->vgn_index] = 0;
4383a0fa64cSPatrick Mooney 	parent->vgn_ref_cnt--;
4393a0fa64cSPatrick Mooney 
4403a0fa64cSPatrick Mooney 	vmm_gpt_node_free(child);
4413a0fa64cSPatrick Mooney }
4423a0fa64cSPatrick Mooney 
443b4ceea05SDan Cross /*
444b4ceea05SDan Cross  * Walks the GPT for the given GPA, accumulating entries to the given depth.  If
445b4ceea05SDan Cross  * the walk terminates before the depth is reached, the remaining entries are
446b4ceea05SDan Cross  * written with NULLs.
447b4ceea05SDan Cross  */
448b4ceea05SDan Cross void
vmm_gpt_walk(vmm_gpt_t * gpt,uint64_t gpa,uint64_t ** entries,vmm_gpt_node_level_t depth)449b4ceea05SDan Cross vmm_gpt_walk(vmm_gpt_t *gpt, uint64_t gpa, uint64_t **entries,
4503a0fa64cSPatrick Mooney     vmm_gpt_node_level_t depth)
451b4ceea05SDan Cross {
452b4ceea05SDan Cross 	uint64_t *current_entries, entry;
453b4ceea05SDan Cross 	pfn_t pfn;
454b4ceea05SDan Cross 
455b4ceea05SDan Cross 	ASSERT(gpt != NULL);
456b4ceea05SDan Cross 	current_entries = gpt->vgpt_root->vgn_entries;
457b4ceea05SDan Cross 	for (uint_t i = 0; i < depth; i++) {
458b4ceea05SDan Cross 		if (current_entries == NULL) {
459b4ceea05SDan Cross 			entries[i] = NULL;
460b4ceea05SDan Cross 			continue;
461b4ceea05SDan Cross 		}
4623a0fa64cSPatrick Mooney 		entries[i] = &current_entries[vmm_gpt_lvl_index(i, gpa)];
463b4ceea05SDan Cross 		entry = *entries[i];
464b4ceea05SDan Cross 		if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
465b4ceea05SDan Cross 			current_entries = NULL;
466b4ceea05SDan Cross 			continue;
467b4ceea05SDan Cross 		}
468b4ceea05SDan Cross 		pfn = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
469b4ceea05SDan Cross 		current_entries = (uint64_t *)hat_kpm_pfn2va(pfn);
470b4ceea05SDan Cross 	}
471b4ceea05SDan Cross }
472b4ceea05SDan Cross 
473b4ceea05SDan Cross /*
474b4ceea05SDan Cross  * Looks up an entry given GPA.
475b4ceea05SDan Cross  */
476b4ceea05SDan Cross uint64_t *
vmm_gpt_lookup(vmm_gpt_t * gpt,uint64_t gpa)477b4ceea05SDan Cross vmm_gpt_lookup(vmm_gpt_t *gpt, uint64_t gpa)
478b4ceea05SDan Cross {
479b4ceea05SDan Cross 	uint64_t *entries[MAX_GPT_LEVEL];
480b4ceea05SDan Cross 
481b4ceea05SDan Cross 	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
482b4ceea05SDan Cross 
483b4ceea05SDan Cross 	return (entries[LEVEL1]);
484b4ceea05SDan Cross }
485b4ceea05SDan Cross 
486b4ceea05SDan Cross /*
4873a0fa64cSPatrick Mooney  * Populate child table nodes for a given level between the provided interval
4883a0fa64cSPatrick Mooney  * of [addr, addr + len).  Caller is expected to provide a pointer to the parent
4893a0fa64cSPatrick Mooney  * node which would contain the child node for GPA at `addr`.  A pointer to said
4903a0fa64cSPatrick Mooney  * child node will be returned when the operation is complete.
491b4ceea05SDan Cross  */
4923a0fa64cSPatrick Mooney static vmm_gpt_node_t *
vmm_gpt_populate_region_lvl(vmm_gpt_t * gpt,uint64_t addr,uint64_t len,vmm_gpt_node_t * node_start)4933a0fa64cSPatrick Mooney vmm_gpt_populate_region_lvl(vmm_gpt_t *gpt, uint64_t addr, uint64_t len,
4943a0fa64cSPatrick Mooney     vmm_gpt_node_t *node_start)
495b4ceea05SDan Cross {
4963a0fa64cSPatrick Mooney 	const vmm_gpt_node_level_t lvl = node_start->vgn_level;
4973a0fa64cSPatrick Mooney 	const uint64_t end = addr + len;
4983a0fa64cSPatrick Mooney 	const uint64_t incr = vmm_gpt_lvl_len(lvl);
4993a0fa64cSPatrick Mooney 	uint64_t gpa = addr & vmm_gpt_lvl_mask(lvl);
5003a0fa64cSPatrick Mooney 	vmm_gpt_node_t *parent = node_start;
5013a0fa64cSPatrick Mooney 
5023a0fa64cSPatrick Mooney 	/* Try to locate node at starting address */
5033a0fa64cSPatrick Mooney 	vmm_gpt_node_t *prev = NULL, *node = parent->vgn_children;
5043a0fa64cSPatrick Mooney 	while (node != NULL && node->vgn_gpa < gpa) {
5053a0fa64cSPatrick Mooney 		prev = node;
5063a0fa64cSPatrick Mooney 		node = node->vgn_sib_next;
5073a0fa64cSPatrick Mooney 	}
508b4ceea05SDan Cross 
5093a0fa64cSPatrick Mooney 	/*
5103a0fa64cSPatrick Mooney 	 * If no node exists at the starting address, create one and link it
5113a0fa64cSPatrick Mooney 	 * into the parent.
5123a0fa64cSPatrick Mooney 	 */
5133a0fa64cSPatrick Mooney 	if (node == NULL || node->vgn_gpa > gpa) {
5143a0fa64cSPatrick Mooney 		/* Need to insert node for starting GPA */
5153a0fa64cSPatrick Mooney 		node = vmm_gpt_node_alloc();
5163a0fa64cSPatrick Mooney 		vmm_gpt_node_add(gpt, parent, node, gpa, prev);
5173a0fa64cSPatrick Mooney 	}
518b4ceea05SDan Cross 
5193a0fa64cSPatrick Mooney 	vmm_gpt_node_t *front_node = node;
5203a0fa64cSPatrick Mooney 	prev = node;
5213a0fa64cSPatrick Mooney 	gpa += incr;
5223a0fa64cSPatrick Mooney 
5233a0fa64cSPatrick Mooney 	/*
5243a0fa64cSPatrick Mooney 	 * With a node at the starting address, walk forward creating nodes in
5253a0fa64cSPatrick Mooney 	 * any of the gaps.
5263a0fa64cSPatrick Mooney 	 */
5273a0fa64cSPatrick Mooney 	for (; gpa < end; gpa += incr, prev = node) {
5283a0fa64cSPatrick Mooney 		node = vmm_gpt_node_next(prev, true);
5293a0fa64cSPatrick Mooney 		if (node != NULL) {
5303a0fa64cSPatrick Mooney 			ASSERT3U(node->vgn_gpa, ==, gpa);
5313a0fa64cSPatrick Mooney 
5323a0fa64cSPatrick Mooney 			/* We may have crossed into a new parent */
5333a0fa64cSPatrick Mooney 			parent = node->vgn_parent;
5343a0fa64cSPatrick Mooney 			continue;
535b4ceea05SDan Cross 		}
536b4ceea05SDan Cross 
5373a0fa64cSPatrick Mooney 		if (vmm_gpt_node_is_last(prev)) {
5383a0fa64cSPatrick Mooney 			/*
5393a0fa64cSPatrick Mooney 			 * The node preceding this was the last one in its
5403a0fa64cSPatrick Mooney 			 * containing parent, so move on to that parent's
5413a0fa64cSPatrick Mooney 			 * sibling.  We expect (demand) that it exist already.
5423a0fa64cSPatrick Mooney 			 */
5433a0fa64cSPatrick Mooney 			parent = vmm_gpt_node_next(parent, true);
5443a0fa64cSPatrick Mooney 			ASSERT(parent != NULL);
545b4ceea05SDan Cross 
5463a0fa64cSPatrick Mooney 			/*
5473a0fa64cSPatrick Mooney 			 * Forget our previous sibling, since it is of no use
5483a0fa64cSPatrick Mooney 			 * for assigning the new node to the a now-different
5493a0fa64cSPatrick Mooney 			 * parent.
5503a0fa64cSPatrick Mooney 			 */
5513a0fa64cSPatrick Mooney 			prev = NULL;
5520153d828SPatrick Mooney 
553b4ceea05SDan Cross 		}
5543a0fa64cSPatrick Mooney 		node = vmm_gpt_node_alloc();
5553a0fa64cSPatrick Mooney 		vmm_gpt_node_add(gpt, parent, node, gpa, prev);
556b4ceea05SDan Cross 	}
5577daa5405SPatrick Mooney 
5583a0fa64cSPatrick Mooney 	return (front_node);
559b4ceea05SDan Cross }
560b4ceea05SDan Cross 
561b4ceea05SDan Cross /*
562b4ceea05SDan Cross  * Ensures that PTEs for the region of address space bounded by
5633a0fa64cSPatrick Mooney  * [addr, addr + len) exist in the tree.
564b4ceea05SDan Cross  */
565b4ceea05SDan Cross void
vmm_gpt_populate_region(vmm_gpt_t * gpt,uint64_t addr,uint64_t len)5663a0fa64cSPatrick Mooney vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
567b4ceea05SDan Cross {
5683a0fa64cSPatrick Mooney 	ASSERT0(addr & PAGEOFFSET);
5693a0fa64cSPatrick Mooney 	ASSERT0(len & PAGEOFFSET);
5700153d828SPatrick Mooney 
5713a0fa64cSPatrick Mooney 	/*
5723a0fa64cSPatrick Mooney 	 * Starting at the top of the tree, ensure that tables covering the
5733a0fa64cSPatrick Mooney 	 * requested region exist at each level.
5743a0fa64cSPatrick Mooney 	 */
5753a0fa64cSPatrick Mooney 	vmm_gpt_node_t *node = gpt->vgpt_root;
5763a0fa64cSPatrick Mooney 	for (uint_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
5773a0fa64cSPatrick Mooney 		ASSERT3U(node->vgn_level, ==, lvl);
5783a0fa64cSPatrick Mooney 
5793a0fa64cSPatrick Mooney 		node = vmm_gpt_populate_region_lvl(gpt, addr, len, node);
5803a0fa64cSPatrick Mooney 	}
5813a0fa64cSPatrick Mooney 
5823a0fa64cSPatrick Mooney 
5833a0fa64cSPatrick Mooney 	/*
5843a0fa64cSPatrick Mooney 	 * Establish reference counts for the soon-to-be memory PTEs which will
5853a0fa64cSPatrick Mooney 	 * be filling these LEVEL1 tables.
5863a0fa64cSPatrick Mooney 	 */
5873a0fa64cSPatrick Mooney 	uint64_t gpa = addr;
5883a0fa64cSPatrick Mooney 	const uint64_t end = addr + len;
5893a0fa64cSPatrick Mooney 	while (gpa < end) {
5903a0fa64cSPatrick Mooney 		ASSERT(node != NULL);
5913a0fa64cSPatrick Mooney 		ASSERT3U(node->vgn_level, ==, LEVEL1);
5923a0fa64cSPatrick Mooney 
5933a0fa64cSPatrick Mooney 		const uint16_t covered =
5943a0fa64cSPatrick Mooney 		    vmm_gpt_node_entries_covered(node, addr, end);
5953a0fa64cSPatrick Mooney 
5963a0fa64cSPatrick Mooney 		ASSERT(covered != 0);
5973a0fa64cSPatrick Mooney 		ASSERT3U(node->vgn_ref_cnt, <, PTE_PER_TABLE);
5983a0fa64cSPatrick Mooney 		ASSERT3U(node->vgn_ref_cnt + covered, <=, PTE_PER_TABLE);
5993a0fa64cSPatrick Mooney 
6003a0fa64cSPatrick Mooney 		node->vgn_ref_cnt += covered;
6013a0fa64cSPatrick Mooney 
6023a0fa64cSPatrick Mooney 		vmm_gpt_node_t *next = vmm_gpt_node_next(node, true);
6033a0fa64cSPatrick Mooney 		if (next != NULL) {
6043a0fa64cSPatrick Mooney 			gpa = next->vgn_gpa;
6053a0fa64cSPatrick Mooney 			node = next;
6063a0fa64cSPatrick Mooney 		} else {
6073a0fa64cSPatrick Mooney 			/*
6083a0fa64cSPatrick Mooney 			 * We do not expect to find a subsequent node after
6093a0fa64cSPatrick Mooney 			 * filling the last node in the table, completing PTE
6103a0fa64cSPatrick Mooney 			 * accounting for the specified range.
6113a0fa64cSPatrick Mooney 			 */
6123a0fa64cSPatrick Mooney 			VERIFY3U(end, <=, vmm_gpt_node_end(node));
6133a0fa64cSPatrick Mooney 			break;
6143a0fa64cSPatrick Mooney 		}
615b4ceea05SDan Cross 	}
616b4ceea05SDan Cross }
617b4ceea05SDan Cross 
618b4ceea05SDan Cross /*
6190153d828SPatrick Mooney  * Format a PTE and install it in the provided PTE-pointer.
620b4ceea05SDan Cross  */
621b4ceea05SDan Cross bool
vmm_gpt_map_at(vmm_gpt_t * gpt,uint64_t * ptep,pfn_t pfn,uint_t prot,uint8_t attr)6220153d828SPatrick Mooney vmm_gpt_map_at(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t pfn, uint_t prot,
6230153d828SPatrick Mooney     uint8_t attr)
624b4ceea05SDan Cross {
6250153d828SPatrick Mooney 	uint64_t entry, old_entry;
626b4ceea05SDan Cross 
627b4ceea05SDan Cross 	entry = gpt->vgpt_pte_ops->vpeo_map_page(pfn, prot, attr);
6280153d828SPatrick Mooney 	old_entry = atomic_cas_64(ptep, 0, entry);
629b4ceea05SDan Cross 	if (old_entry != 0) {
6300153d828SPatrick Mooney 		ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(entry), ==,
631b4ceea05SDan Cross 		    gpt->vgpt_pte_ops->vpeo_pte_pfn(old_entry));
632b4ceea05SDan Cross 		return (false);
633b4ceea05SDan Cross 	}
634b4ceea05SDan Cross 
635b4ceea05SDan Cross 	return (true);
636b4ceea05SDan Cross }
637b4ceea05SDan Cross 
6380153d828SPatrick Mooney /*
6390153d828SPatrick Mooney  * Inserts an entry for a given GPA into the table.  The caller must
6400153d828SPatrick Mooney  * ensure that a conflicting PFN is not mapped at the requested location.
6410153d828SPatrick Mooney  * Racing operations to map the same PFN at one location is acceptable and
6420153d828SPatrick Mooney  * properly handled.
6430153d828SPatrick Mooney  */
6440153d828SPatrick Mooney bool
vmm_gpt_map(vmm_gpt_t * gpt,uint64_t gpa,pfn_t pfn,uint_t prot,uint8_t attr)6450153d828SPatrick Mooney vmm_gpt_map(vmm_gpt_t *gpt, uint64_t gpa, pfn_t pfn, uint_t prot, uint8_t attr)
6460153d828SPatrick Mooney {
6470153d828SPatrick Mooney 	uint64_t *entries[MAX_GPT_LEVEL];
6480153d828SPatrick Mooney 
6490153d828SPatrick Mooney 	ASSERT(gpt != NULL);
6500153d828SPatrick Mooney 	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
6510153d828SPatrick Mooney 	ASSERT(entries[LEVEL1] != NULL);
6520153d828SPatrick Mooney 
6530153d828SPatrick Mooney 	return (vmm_gpt_map_at(gpt, entries[LEVEL1], pfn, prot, attr));
6540153d828SPatrick Mooney }
6550153d828SPatrick Mooney 
656b4ceea05SDan Cross /*
6573a0fa64cSPatrick Mooney  * Cleans up the unused inner nodes in the GPT for a region of guest physical
6583a0fa64cSPatrick Mooney  * address space of [addr, addr + len).  The region must map no pages.
659b4ceea05SDan Cross  */
6603a0fa64cSPatrick Mooney void
vmm_gpt_vacate_region(vmm_gpt_t * gpt,uint64_t addr,uint64_t len)6613a0fa64cSPatrick Mooney vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
662b4ceea05SDan Cross {
6633a0fa64cSPatrick Mooney 	ASSERT0(addr & PAGEOFFSET);
6643a0fa64cSPatrick Mooney 	ASSERT0(len & PAGEOFFSET);
665b4ceea05SDan Cross 
6663a0fa64cSPatrick Mooney 	const uint64_t end = addr + len;
6673a0fa64cSPatrick Mooney 	vmm_gpt_node_t *node, *starts[MAX_GPT_LEVEL] = {
6683a0fa64cSPatrick Mooney 		[LEVEL4] = gpt->vgpt_root,
6693a0fa64cSPatrick Mooney 	};
6703a0fa64cSPatrick Mooney 
6713a0fa64cSPatrick Mooney 	for (vmm_gpt_node_level_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
6723a0fa64cSPatrick Mooney 		node = vmm_gpt_node_find_child(starts[lvl], addr);
6733a0fa64cSPatrick Mooney 		if (node == NULL) {
674b4ceea05SDan Cross 			break;
675b4ceea05SDan Cross 		}
6763a0fa64cSPatrick Mooney 		starts[lvl + 1] = node;
677b4ceea05SDan Cross 	}
678b4ceea05SDan Cross 
6793a0fa64cSPatrick Mooney 	/*
6803a0fa64cSPatrick Mooney 	 * Starting at the bottom of the tree, ensure that PTEs for pages have
6813a0fa64cSPatrick Mooney 	 * been cleared for the region, and remove the corresponding reference
6823a0fa64cSPatrick Mooney 	 * counts from the containing LEVEL1 tables.
6833a0fa64cSPatrick Mooney 	 */
6843a0fa64cSPatrick Mooney 	uint64_t gpa = addr;
6853a0fa64cSPatrick Mooney 	node = starts[LEVEL1];
6863a0fa64cSPatrick Mooney 	while (gpa < end && node != NULL) {
6873a0fa64cSPatrick Mooney 		const uint16_t covered =
6883a0fa64cSPatrick Mooney 		    vmm_gpt_node_entries_covered(node, addr, end);
6893a0fa64cSPatrick Mooney 
6903a0fa64cSPatrick Mooney 		ASSERT3U(node->vgn_ref_cnt, >=, covered);
6913a0fa64cSPatrick Mooney 		node->vgn_ref_cnt -= covered;
6923a0fa64cSPatrick Mooney 
6933a0fa64cSPatrick Mooney 		node = vmm_gpt_node_next(node, false);
6943a0fa64cSPatrick Mooney 		if (node != NULL) {
6953a0fa64cSPatrick Mooney 			gpa = node->vgn_gpa;
6967daa5405SPatrick Mooney 		}
697b4ceea05SDan Cross 	}
698b4ceea05SDan Cross 
6993a0fa64cSPatrick Mooney 	/*
7003a0fa64cSPatrick Mooney 	 * With the page PTE references eliminated, work up from the bottom of
7013a0fa64cSPatrick Mooney 	 * the table, removing nodes which have no remaining references.
7023a0fa64cSPatrick Mooney 	 *
7033a0fa64cSPatrick Mooney 	 * This stops short of LEVEL4, which is the root table of the GPT.  It
7043a0fa64cSPatrick Mooney 	 * is left standing to be cleaned up when the vmm_gpt_t is destroyed.
7053a0fa64cSPatrick Mooney 	 */
7063a0fa64cSPatrick Mooney 	for (vmm_gpt_node_level_t lvl = LEVEL1; lvl > LEVEL4; lvl--) {
7073a0fa64cSPatrick Mooney 		gpa = addr;
7083a0fa64cSPatrick Mooney 		node = starts[lvl];
7093a0fa64cSPatrick Mooney 
7103a0fa64cSPatrick Mooney 		while (gpa < end && node != NULL) {
7113a0fa64cSPatrick Mooney 			vmm_gpt_node_t *next = vmm_gpt_node_next(node, false);
7123a0fa64cSPatrick Mooney 
7133a0fa64cSPatrick Mooney 			if (node->vgn_ref_cnt == 0) {
7143a0fa64cSPatrick Mooney 				vmm_gpt_node_remove(node);
7153a0fa64cSPatrick Mooney 			}
7163a0fa64cSPatrick Mooney 			if (next != NULL) {
7173a0fa64cSPatrick Mooney 				gpa = next->vgn_gpa;
7183a0fa64cSPatrick Mooney 			}
7193a0fa64cSPatrick Mooney 			node = next;
7203a0fa64cSPatrick Mooney 		}
721b4ceea05SDan Cross 	}
722b4ceea05SDan Cross }
723b4ceea05SDan Cross 
724b4ceea05SDan Cross /*
7250153d828SPatrick Mooney  * Remove a mapping from the table.  Returns false if the page was not mapped,
7260153d828SPatrick Mooney  * otherwise returns true.
727b4ceea05SDan Cross  */
728b4ceea05SDan Cross bool
vmm_gpt_unmap(vmm_gpt_t * gpt,uint64_t gpa)729b4ceea05SDan Cross vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
730b4ceea05SDan Cross {
731b4ceea05SDan Cross 	uint64_t *entries[MAX_GPT_LEVEL], entry;
732b4ceea05SDan Cross 
733b4ceea05SDan Cross 	ASSERT(gpt != NULL);
734b4ceea05SDan Cross 	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
735b4ceea05SDan Cross 	if (entries[LEVEL1] == NULL)
736b4ceea05SDan Cross 		return (false);
737b4ceea05SDan Cross 
738b4ceea05SDan Cross 	entry = *entries[LEVEL1];
739b4ceea05SDan Cross 	*entries[LEVEL1] = 0;
7400153d828SPatrick Mooney 	return (gpt->vgpt_pte_ops->vpeo_pte_is_present(entry));
741b4ceea05SDan Cross }
742b4ceea05SDan Cross 
743b4ceea05SDan Cross /*
7440153d828SPatrick Mooney  * Un-maps the region of guest physical address space bounded by [start..end).
7450153d828SPatrick Mooney  * Returns the number of pages that are unmapped.
746b4ceea05SDan Cross  */
747b4ceea05SDan Cross size_t
vmm_gpt_unmap_region(vmm_gpt_t * gpt,uint64_t addr,uint64_t len)7483a0fa64cSPatrick Mooney vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
749b4ceea05SDan Cross {
7503a0fa64cSPatrick Mooney 	ASSERT0(addr & PAGEOFFSET);
7513a0fa64cSPatrick Mooney 	ASSERT0(len & PAGEOFFSET);
752b4ceea05SDan Cross 
7533a0fa64cSPatrick Mooney 	const uint64_t end = addr + len;
7540153d828SPatrick Mooney 	size_t num_unmapped = 0;
7553a0fa64cSPatrick Mooney 	for (uint64_t gpa = addr; gpa < end; gpa += PAGESIZE) {
7563a0fa64cSPatrick Mooney 		if (vmm_gpt_unmap(gpt, gpa) != 0) {
7570153d828SPatrick Mooney 			num_unmapped++;
7580153d828SPatrick Mooney 		}
759b4ceea05SDan Cross 	}
760b4ceea05SDan Cross 
7610153d828SPatrick Mooney 	return (num_unmapped);
762b4ceea05SDan Cross }
763b4ceea05SDan Cross 
764b4ceea05SDan Cross /*
765b4ceea05SDan Cross  * Returns a value indicating whether or not this GPT maps the given
766b4ceea05SDan Cross  * GPA.  If the GPA is mapped, *protp will be filled with the protection
767b4ceea05SDan Cross  * bits of the entry.  Otherwise, it will be ignored.
768b4ceea05SDan Cross  */
769b4ceea05SDan Cross bool
vmm_gpt_is_mapped(vmm_gpt_t * gpt,uint64_t * ptep,pfn_t * pfnp,uint_t * protp)7700153d828SPatrick Mooney vmm_gpt_is_mapped(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t *pfnp, uint_t *protp)
771b4ceea05SDan Cross {
7720153d828SPatrick Mooney 	uint64_t entry;
773b4ceea05SDan Cross 
774*b9b43e84SPatrick Mooney 	ASSERT(pfnp != NULL);
775*b9b43e84SPatrick Mooney 	ASSERT(protp != NULL);
776*b9b43e84SPatrick Mooney 
7770153d828SPatrick Mooney 	if (ptep == NULL) {
778b4ceea05SDan Cross 		return (false);
7790153d828SPatrick Mooney 	}
7800153d828SPatrick Mooney 	entry = *ptep;
7810153d828SPatrick Mooney 	if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
782b4ceea05SDan Cross 		return (false);
7830153d828SPatrick Mooney 	}
7840153d828SPatrick Mooney 	*pfnp = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
785b4ceea05SDan Cross 	*protp = gpt->vgpt_pte_ops->vpeo_pte_prot(entry);
786b4ceea05SDan Cross 	return (true);
787b4ceea05SDan Cross }
788b4ceea05SDan Cross 
789b4ceea05SDan Cross /*
790b4ceea05SDan Cross  * Resets the accessed bit on the page table entry pointed to be `entry`.
791b4ceea05SDan Cross  * If `on` is true, the bit will be set, otherwise it will be cleared.
792b4ceea05SDan Cross  * The old value of the bit is returned.
793b4ceea05SDan Cross  */
794b4ceea05SDan Cross uint_t
vmm_gpt_reset_accessed(vmm_gpt_t * gpt,uint64_t * entry,bool on)795b4ceea05SDan Cross vmm_gpt_reset_accessed(vmm_gpt_t *gpt, uint64_t *entry, bool on)
796b4ceea05SDan Cross {
797b4ceea05SDan Cross 	ASSERT(entry != NULL);
798b4ceea05SDan Cross 	return (gpt->vgpt_pte_ops->vpeo_reset_accessed(entry, on));
799b4ceea05SDan Cross }
800b4ceea05SDan Cross 
801b4ceea05SDan Cross /*
802b4ceea05SDan Cross  * Resets the dirty bit on the page table entry pointed to be `entry`.
803b4ceea05SDan Cross  * If `on` is true, the bit will be set, otherwise it will be cleared.
804b4ceea05SDan Cross  * The old value of the bit is returned.
805b4ceea05SDan Cross  */
806b4ceea05SDan Cross uint_t
vmm_gpt_reset_dirty(vmm_gpt_t * gpt,uint64_t * entry,bool on)807b4ceea05SDan Cross vmm_gpt_reset_dirty(vmm_gpt_t *gpt, uint64_t *entry, bool on)
808b4ceea05SDan Cross {
809b4ceea05SDan Cross 	ASSERT(entry != NULL);
810b4ceea05SDan Cross 	return (gpt->vgpt_pte_ops->vpeo_reset_dirty(entry, on));
811b4ceea05SDan Cross }
8120153d828SPatrick Mooney 
813*b9b43e84SPatrick Mooney /*
814*b9b43e84SPatrick Mooney  * Query state from PTE pointed to by `entry`.
815*b9b43e84SPatrick Mooney  */
816*b9b43e84SPatrick Mooney bool
vmm_gpt_query(vmm_gpt_t * gpt,uint64_t * entry,vmm_gpt_query_t query)817*b9b43e84SPatrick Mooney vmm_gpt_query(vmm_gpt_t *gpt, uint64_t *entry, vmm_gpt_query_t query)
818*b9b43e84SPatrick Mooney {
819*b9b43e84SPatrick Mooney 	ASSERT(entry != NULL);
820*b9b43e84SPatrick Mooney 	return (gpt->vgpt_pte_ops->vpeo_query(entry, query));
821*b9b43e84SPatrick Mooney }
822*b9b43e84SPatrick Mooney 
8230153d828SPatrick Mooney /*
8240153d828SPatrick Mooney  * Get properly formatted PML4 (EPTP/nCR3) for GPT.
8250153d828SPatrick Mooney  */
8260153d828SPatrick Mooney uint64_t
vmm_gpt_get_pmtp(vmm_gpt_t * gpt,bool track_dirty)8274ac713daSLuqman Aden vmm_gpt_get_pmtp(vmm_gpt_t *gpt, bool track_dirty)
8280153d828SPatrick Mooney {
8294ac713daSLuqman Aden 	const pfn_t root_pfn = gpt->vgpt_root->vgn_host_pfn;
8304ac713daSLuqman Aden 	return (gpt->vgpt_pte_ops->vpeo_get_pmtp(root_pfn, track_dirty));
8310153d828SPatrick Mooney }
832*b9b43e84SPatrick Mooney 
833*b9b43e84SPatrick Mooney /*
834*b9b43e84SPatrick Mooney  * Does the GPT hardware support dirty-page-tracking?
835*b9b43e84SPatrick Mooney  */
836*b9b43e84SPatrick Mooney bool
vmm_gpt_can_track_dirty(vmm_gpt_t * gpt)837*b9b43e84SPatrick Mooney vmm_gpt_can_track_dirty(vmm_gpt_t *gpt)
838*b9b43e84SPatrick Mooney {
839*b9b43e84SPatrick Mooney 	return (gpt->vgpt_pte_ops->vpeo_hw_ad_supported());
840*b9b43e84SPatrick Mooney }
841