/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2018 Joyent, Inc.
 */

#include <sys/t_lock.h>
#include <sys/memlist.h>
#include <sys/cpuvar.h>
#include <sys/vmem.h>
#include <sys/mman.h>
#include <sys/vm.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vm_machparam.h>
#include <sys/tss.h>
#include <sys/vnode.h>
#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/hat_i86.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/sunddi.h>
#include <sys/ddidmareq.h>
#include <sys/controlregs.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/bootconf.h>
#include <sys/bootsvcs.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#endif

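/*
 * True when this CPU is currently running on a user process's HAT rather
 * than the kernel HAT (kas.a_hat).
 */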
#define	ON_USER_HAT(cpu) \
	((cpu)->cpu_m.mcpu_current_hat != NULL && \
	(cpu)->cpu_m.mcpu_current_hat != kas.a_hat)

/*
 * Flag is not set early in boot. Once it is set we are no longer
 * using boot's page tables.
 */
uint_t khat_running = 0;

/*
 * This procedure is callable only while the boot loader is in charge of the
 * MMU. It assumes that PA == VA for page table pointers.  It doesn't live in
 * kboot_mmu.c since it's used from common code.
 */
pfn_t
va_to_pfn(void *vaddr)
{
	uintptr_t	des_va = ALIGN2PAGE(vaddr);
	uintptr_t	va = des_va;
	size_t		len;
	uint_t		prot;
	pfn_t		pfn;

	if (khat_running)
		panic("va_to_pfn(): called too late\n");

	if (kbm_probe(&va, &len, &pfn, &prot) == 0)
		return (PFN_INVALID);
	if (va > des_va)
		return (PFN_INVALID);
	if (va < des_va)
		pfn += mmu_btop(des_va - va);
	return (pfn);
}

/*
 * Initialize a special area in the kernel that always holds some PTEs for
 * faster performance. This always holds segmap's PTEs.
 * In the 32 bit kernel this maps the kernel heap too.
 */
void
hat_kmap_init(uintptr_t base, size_t len)
{
	uintptr_t map_addr;	/* base rounded down to large page size */
	uintptr_t map_eaddr;	/* base + len rounded up */
	size_t map_len;
	caddr_t ptes;		/* mapping area in kernel for kmap ptes */
	size_t window_size;	/* size of mapping area for ptes */
	ulong_t htable_cnt;	/* # of page tables to cover map_len */
	ulong_t i;
	htable_t *ht;
	uintptr_t va;

	/*
	 * We have to map in an area that matches an entire page table.
	 * The PTEs are large page aligned to avoid spurious pagefaults
	 * on the hypervisor.
	 */
	map_addr = base & LEVEL_MASK(1);
	map_eaddr = (base + len + LEVEL_SIZE(1) - 1) & LEVEL_MASK(1);
	map_len = map_eaddr - map_addr;
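	/*
	 * window_size is the VA needed to hold the PTEs that map
	 * [map_addr, map_eaddr), padded up to a large page multiple.
	 */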
	window_size = mmu_btop(map_len) * mmu.pte_size;
	window_size = (window_size + LEVEL_SIZE(1)) & LEVEL_MASK(1);
	htable_cnt = map_len >> LEVEL_SHIFT(1);

	/*
	 * allocate vmem for the kmap_ptes
	 */
	ptes = vmem_xalloc(heap_arena, window_size, LEVEL_SIZE(1), 0,
	    0, NULL, NULL, VM_SLEEP);
	mmu.kmap_htables =
	    kmem_alloc(htable_cnt * sizeof (htable_t *), KM_SLEEP);

	/*
	 * Map the page tables that cover kmap into the allocated range.
	 * Note we don't ever htable_release() the kmap page tables - they
	 * can't ever be stolen, freed, etc.
	 */
	for (va = map_addr, i = 0; i < htable_cnt; va += LEVEL_SIZE(1), ++i) {
		ht = htable_create(kas.a_hat, va, 0, NULL);
		if (ht == NULL)
			panic("hat_kmap_init: ht == NULL");
		mmu.kmap_htables[i] = ht;

		hat_devload(kas.a_hat, ptes + i * MMU_PAGESIZE,
		    MMU_PAGESIZE, ht->ht_pfn,
#ifdef __xpv
		    PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK,
#else
		    PROT_READ | PROT_WRITE | HAT_NOSYNC | HAT_UNORDERED_OK,
#endif
		    HAT_LOAD | HAT_LOAD_NOCONSIST);
	}

	/*
	 * set information in mmu to activate handling of kmap
	 */
	mmu.kmap_addr = map_addr;
	mmu.kmap_eaddr = map_eaddr;
	mmu.kmap_ptes = (x86pte_t *)ptes;
}

extern caddr_t	kpm_vbase;
extern size_t	kpm_size;

#ifdef __xpv
/*
 * Create the initial segkpm mappings for the hypervisor. To avoid having
 * to deal with page tables being read only, we make all mappings
 * read only at first.
 */
static void
xen_kpm_create(paddr_t paddr, level_t lvl)
{
	ulong_t pg_off;

	for (pg_off = 0; pg_off < LEVEL_SIZE(lvl); pg_off += MMU_PAGESIZE) {
		kbm_map((uintptr_t)kpm_vbase + paddr, (paddr_t)0, 0, 1);
		kbm_read_only((uintptr_t)kpm_vbase + paddr + pg_off,
		    paddr + pg_off);
	}
}

/*
 * Try to make all kpm mappings writable. Failures are ok, as those
 * are just pagetable, GDT, etc. pages.
 */
static void
xen_kpm_finish_init(void)
{
	pfn_t gdtpfn = mmu_btop(CPU->cpu_m.mcpu_gdtpa);
	pfn_t pfn;
	page_t *pp;

	for (pfn = 0; pfn < mfn_count; ++pfn) {
		/*
		 * skip gdt
		 */
		if (pfn == gdtpfn)
			continue;

		/*
		 * p_index is a hint that this is a pagetable
		 */
		pp = page_numtopp_nolock(pfn);
		if (pp && pp->p_index) {
			pp->p_index = 0;
			continue;
		}
		(void) xen_kpm_page(pfn, PT_VALID | PT_WRITABLE);
	}
}
#endif

/*
 * Routine to pre-allocate data structures for hat_kern_setup(). It computes
 * how many pagetables it needs by walking the boot loader's page tables.
 */
/*ARGSUSED*/
void
hat_kern_alloc(
	caddr_t	segmap_base,
	size_t	segmap_size,
	caddr_t	ekernelheap)
{
	uintptr_t	last_va = (uintptr_t)-1;	/* catch 1st time */
	uintptr_t	va = 0;
	size_t		size;
	pfn_t		pfn;
	uint_t		prot;
	uint_t		table_cnt = 1;
	uint_t		mapping_cnt;
	level_t		start_level;
	level_t		l;
	struct memlist	*pmem;
	level_t		lpagel = mmu.max_page_level;
	uint64_t	paddr;
	int64_t		psize;
	int		nwindows;

	if (kpm_size > 0) {
		/*
		 * Create the kpm page tables.  When running on the
		 * hypervisor these are made read/only at first.
		 * Later we'll add write permission where possible.
		 */
		for (pmem = phys_install; pmem; pmem = pmem->ml_next) {
			paddr = pmem->ml_address;
			psize = pmem->ml_size;
			while (psize >= MMU_PAGESIZE) {
				/* find the largest page size */
				for (l = lpagel; l > 0; l--) {
					if ((paddr & LEVEL_OFFSET(l)) == 0 &&
					    psize > LEVEL_SIZE(l))
						break;
				}

#if defined(__xpv)
				/*
				 * Create read/only mappings to avoid
				 * conflicting with pagetable usage
				 */
				xen_kpm_create(paddr, l);
#else
				kbm_map((uintptr_t)kpm_vbase + paddr, paddr,
				    l, 1);
#endif
				paddr += LEVEL_SIZE(l);
				psize -= LEVEL_SIZE(l);
			}
		}
	}

	/*
	 * If this machine doesn't have a kpm segment, we need to allocate
	 * a small number of 'windows' which can be used to map pagetables.
	 */
	nwindows = (kpm_size == 0) ? 2 * NCPU : 0;

#if defined(__xpv)
	/*
	 * On a hypervisor, these windows are also used by the xpv_panic
	 * code, where we need one window for each level of the pagetable
	 * hierarchy.
	 */
	nwindows = MAX(nwindows, mmu.max_level);
#endif

	if (nwindows != 0) {
		/*
		 * Create the page windows and 1 page of VA in
		 * which we map the PTEs of those windows.
		 */
		mmu.pwin_base = vmem_xalloc(heap_arena, nwindows * MMU_PAGESIZE,
		    LEVEL_SIZE(1), 0, 0, NULL, NULL, VM_SLEEP);
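		/*
		 * Every window needs one PTE, and all of them must fit in
		 * the single page of PTE mappings allocated next.
		 */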
		ASSERT(nwindows <= MMU_PAGESIZE / mmu.pte_size);
		mmu.pwin_pte_va = vmem_xalloc(heap_arena, MMU_PAGESIZE,
		    MMU_PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);

		/*
		 * Find/Create the page table window mappings.
		 */
		paddr = 0;
		(void) find_pte((uintptr_t)mmu.pwin_base, &paddr, 0, 0);
		ASSERT(paddr != 0);
		ASSERT((paddr & MMU_PAGEOFFSET) == 0);
		mmu.pwin_pte_pa = paddr;
#ifdef __xpv
		(void) find_pte((uintptr_t)mmu.pwin_pte_va, NULL, 0, 0);
		kbm_read_only((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa);
#else
		kbm_map((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa, 0, 1);
#endif
	}

	/*
	 * Walk the boot loader's page tables and figure out
	 * how many tables and page mappings there will be.
	 */
	while (kbm_probe(&va, &size, &pfn, &prot) != 0) {
		/*
		 * At each level, if the last_va falls into a new htable,
		 * increment table_cnt. We can stop at the 1st level where
		 * they are in the same htable.
		 */
		start_level = 0;
		while (start_level <= mmu.max_page_level) {
			if (size == LEVEL_SIZE(start_level))
				break;
			start_level++;
		}

		for (l = start_level; l < mmu.max_level; ++l) {
			if (va >> LEVEL_SHIFT(l + 1) ==
			    last_va >> LEVEL_SHIFT(l + 1))
				break;
			++table_cnt;
		}
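		/*
		 * Remember this mapping and step va to the next mapping
		 * boundary.  Small (level 0) mappings advance by a full
		 * level 1 chunk, since everything under one level 1 entry
		 * shares a single page table anyway.
		 */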
		last_va = va;
		l = (start_level == 0) ? 1 : start_level;
		va = (va & LEVEL_MASK(l)) + LEVEL_SIZE(l);
	}

	/*
	 * Besides the boot loader mappings, we're going to fill in
	 * the entire top level page table for the kernel. Make sure there's
	 * enough reserve for that too.
	 */
	table_cnt += mmu.top_level_count - ((kernelbase >>
	    LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1));

	/*
	 * Add 1/4 more into table_cnt for extra slop.  The unused
	 * slop is freed back when we htable_adjust_reserve() later.
	 */
	table_cnt += table_cnt >> 2;

	/*
	 * We only need mapping entries (hments) for shared pages.
	 * This should be far, far fewer than the total possible;
	 * we'll allocate enough for 1/16 of all possible PTEs.
	 */
	mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4;

	/*
	 * Now create the initial htable/hment reserves
	 */
	htable_initial_reserve(table_cnt);
	hment_reserve(mapping_cnt);
	x86pte_cpu_init(CPU);
}


/*
 * This routine handles the work of creating the kernel's initial mappings
 * by deciphering the mappings in the page tables created by the boot program.
 *
 * We maintain large page mappings, but only to a level 1 pagesize.
 * The boot loader can only add new mappings once this function starts.
 * In particular it can not change the pagesize used for any existing
 * mappings or this code breaks!
 */

void
hat_kern_setup(void)
{
	/*
	 * Attach htables to the existing pagetables
	 */
	/* BEGIN CSTYLED */
	htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
#ifdef __xpv
	    mmu_btop(xen_info->pt_base - ONE_GIG));
#else
	    mmu_btop(getcr3_pa()));
#endif
	/* END CSTYLED */

#if defined(__xpv)
	/*
	 * Try to make the kpm mappings r/w. Failures here are OK, as
	 * it's probably just a pagetable
	 */
	xen_kpm_finish_init();
#endif

	/*
	 * The kernel HAT is now officially open for business.
	 */
	khat_running = 1;

	CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
	CPU->cpu_current_hat = kas.a_hat;
}

#ifndef __xpv

/*
 * Note that the INVPCID_ALL* variants can be used even in the !PCIDE case, but
 * INVPCID_ADDR isn't.
 */
static void
invpcid(uint64_t type, uint64_t pcid, uintptr_t addr)
{
	ulong_t	flag;
	uint64_t cr4;

	if (x86_use_invpcid == 1) {
		ASSERT(is_x86_feature(x86_featureset, X86FSET_INVPCID));
		invpcid_insn(type, pcid, addr);
		return;
	}

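	/*
	 * No usable invpcid instruction, so emulate.  Toggling CR4.PGE off
	 * and back on flushes the entire TLB, including global entries.
	 * With PCIDE enabled, a plain cr3 reload only flushes the current
	 * PCID, so the non-global case must also fall back to the CR4.PGE
	 * toggle.
	 */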
	switch (type) {
	case INVPCID_ALL_GLOBAL:
		flag = intr_clear();
		cr4 = getcr4();
		setcr4(cr4 & ~(ulong_t)CR4_PGE);
		setcr4(cr4 | CR4_PGE);
		intr_restore(flag);
		break;

	case INVPCID_ALL_NONGLOBAL:
		if (!(getcr4() & CR4_PCIDE)) {
			reload_cr3();
		} else {
			flag = intr_clear();
			cr4 = getcr4();
			setcr4(cr4 & ~(ulong_t)CR4_PGE);
			setcr4(cr4 | CR4_PGE);
			intr_restore(flag);
		}
		break;

	case INVPCID_ADDR:
		if (pcid == PCID_USER) {
			flag = intr_clear();
			ASSERT(addr < kernelbase);
			ASSERT(ON_USER_HAT(CPU));
			ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
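			/*
			 * Flush the stale PCID_USER entry via the KPTI
			 * trampoline, which performs the invalidation
			 * under the user cr3.
			 */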
			tr_mmu_flush_user_range(addr, MMU_PAGESIZE,
			    MMU_PAGESIZE, CPU->cpu_m.mcpu_kpti.kf_user_cr3);
			intr_restore(flag);
		} else {
			mmu_invlpg((caddr_t)addr);
		}
		break;

	default:
		panic("unsupported invpcid(%lu)", type);
		break;
	}
}

/*
 * Flush one kernel mapping.
 *
 * We want to assert on kernel space here mainly for reasoning about the PCIDE
 * case: namely, this flush should never need to flush a non-current PCID
 * mapping.  This presumes we never have reason to flush the kernel regions
 * available to PCID_USER (the trampolines and so on).  It also relies on
 * PCID_KERNEL == PCID_NONE.
 */
void
mmu_flush_tlb_kpage(uintptr_t va)
{
	ASSERT(va >= kernelbase);
	ASSERT(getpcid() == PCID_KERNEL);
	mmu_invlpg((caddr_t)va);
}

/*
 * Flush one mapping: local CPU version of hat_tlb_inval().
 *
 * If this is a userspace address in the PCIDE case, we need two invalidations,
 * one for any potentially stale PCID_USER mapping, as well as any established
 * while in the kernel.
 */
void
mmu_flush_tlb_page(uintptr_t va)
{
	ASSERT(getpcid() == PCID_KERNEL);

	if (va >= kernelbase) {
		mmu_flush_tlb_kpage(va);
		return;
	}

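	/*
	 * Without PCIDs there is only one translation context, so a single
	 * invlpg covers it.
	 */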
	if (!(getcr4() & CR4_PCIDE)) {
		mmu_invlpg((caddr_t)va);
		return;
	}

	/*
	 * Yes, kas will need to flush below kernelspace, at least during boot.
	 * But there's no PCID_USER context.
	 */
	if (ON_USER_HAT(CPU))
		invpcid(INVPCID_ADDR, PCID_USER, va);
	invpcid(INVPCID_ADDR, PCID_KERNEL, va);
}

static void
mmu_flush_tlb_range(uintptr_t addr, size_t len, size_t pgsz)
{
	EQUIV(addr < kernelbase, (addr + len - 1) < kernelbase);
	ASSERT(len > 0);
	ASSERT(pgsz != 0);

	if (!(getcr4() & CR4_PCIDE) || x86_use_invpcid == 1) {
		for (uintptr_t va = addr; va < (addr + len); va += pgsz)
			mmu_flush_tlb_page(va);
		return;
	}

	/*
	 * As an emulated invpcid() in the PCIDE case requires jumping
	 * cr3s, we batch the invalidations.  We should only need to flush the
	 * user range if we're on a user-space HAT.
	 */
	if (addr < kernelbase && ON_USER_HAT(CPU)) {
		ulong_t flag = intr_clear();
		ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
		tr_mmu_flush_user_range(addr, len, pgsz,
		    CPU->cpu_m.mcpu_kpti.kf_user_cr3);
		intr_restore(flag);
	}

	for (uintptr_t va = addr; va < (addr + len); va += pgsz)
		mmu_invlpg((caddr_t)va);
}

/*
 * MMU TLB (and PT cache) flushing on this CPU.
 *
 * FLUSH_TLB_ALL: invalidate everything, all PCIDs, all PT_GLOBAL.
 * FLUSH_TLB_NONGLOBAL: invalidate all PCIDs, excluding PT_GLOBAL
 * FLUSH_TLB_RANGE: invalidate the given range, including PCID_USER
 * mappings as appropriate.  If using invpcid, PT_GLOBAL mappings are not
 * invalidated.
 */
void
mmu_flush_tlb(flush_tlb_type_t type, tlb_range_t *range)
{
	ASSERT(getpcid() == PCID_KERNEL);

	switch (type) {
	case FLUSH_TLB_ALL:
		ASSERT(range == NULL);
		invpcid(INVPCID_ALL_GLOBAL, 0, 0);
		break;

	case FLUSH_TLB_NONGLOBAL:
		ASSERT(range == NULL);
		invpcid(INVPCID_ALL_NONGLOBAL, 0, 0);
		break;

	case FLUSH_TLB_RANGE: {
		mmu_flush_tlb_range(range->tr_va, TLB_RANGE_LEN(range),
		    LEVEL_SIZE(range->tr_level));
		break;
	}

	default:
		panic("invalid call mmu_flush_tlb(%d)", type);
		break;
	}
}

#endif /* ! __xpv */