/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2018 Joyent, Inc.
 */

#include <sys/t_lock.h>
#include <sys/memlist.h>
#include <sys/cpuvar.h>
#include <sys/vmem.h>
#include <sys/mman.h>
#include <sys/vm.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vm_machparam.h>
#include <sys/tss.h>
#include <sys/vnode.h>
#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/hat_i86.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/sunddi.h>
#include <sys/ddidmareq.h>
#include <sys/controlregs.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/bootconf.h>
#include <sys/bootsvcs.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#endif

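/*
 * ON_USER_HAT() is true when the given CPU is currently running with a
 * user process's hat loaded rather than the kernel's (kas), i.e. user
 * mappings may be live in its TLB.
 */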
#define	ON_USER_HAT(cpu) \
	((cpu)->cpu_m.mcpu_current_hat != NULL && \
	(cpu)->cpu_m.mcpu_current_hat != kas.a_hat)

/*
 * Flag is not set early in boot. Once it is set we are no longer
 * using boot's page tables.
 */
uint_t khat_running = 0;

/*
 * This procedure is callable only while the boot loader is in charge of the
 * MMU. It assumes that PA == VA for page table pointers.  It doesn't live in
 * kboot_mmu.c since it's used from common code.
 */
pfn_t
va_to_pfn(void *vaddr)
{
	uintptr_t	des_va = ALIGN2PAGE(vaddr);
	uintptr_t	va = des_va;
	size_t		len;
	uint_t		prot;
	pfn_t		pfn;

	if (khat_running)
		panic("va_to_pfn(): called too late\n");

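	/*
	 * kbm_probe() returns the boot mapping at or above 'va', updating
	 * va/len/pfn/prot.  If the returned mapping starts beyond the page
	 * we asked about, that page is unmapped.  If it starts below it,
	 * the page lies inside a larger mapping, so step the pfn forward
	 * by the offset into that mapping.
	 */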
	if (kbm_probe(&va, &len, &pfn, &prot) == 0)
		return (PFN_INVALID);
	if (va > des_va)
		return (PFN_INVALID);
	if (va < des_va)
		pfn += mmu_btop(des_va - va);
	return (pfn);
}

/*
 * Initialize a special area in the kernel that always holds some PTEs for
 * faster performance. This always holds segmap's PTEs.
 * In the 32-bit kernel this maps the kernel heap too.
 */
void
hat_kmap_init(uintptr_t base, size_t len)
{
	uintptr_t map_addr;	/* base rounded down to large page size */
	uintptr_t map_eaddr;	/* base + len rounded up */
	size_t map_len;
	caddr_t ptes;		/* mapping area in kernel for kmap ptes */
	size_t window_size;	/* size of mapping area for ptes */
	ulong_t htable_cnt;	/* # of page tables to cover map_len */
	ulong_t i;
	htable_t *ht;
	uintptr_t va;

	/*
	 * We have to map in an area that matches an entire page table.
	 * The PTEs are large page aligned to avoid spurious pagefaults
	 * on the hypervisor.
	 */
	map_addr = base & LEVEL_MASK(1);
	map_eaddr = (base + len + LEVEL_SIZE(1) - 1) & LEVEL_MASK(1);
	map_len = map_eaddr - map_addr;
	window_size = mmu_btop(map_len) * mmu.pte_size;
	window_size = (window_size + LEVEL_SIZE(1)) & LEVEL_MASK(1);
	htable_cnt = map_len >> LEVEL_SHIFT(1);
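	/*
	 * For example, on the 64-bit kernel a level 1 page is 2MB and a
	 * PTE is 8 bytes, so each 2MB of kmap VA needs one 4K page of
	 * PTEs; window_size is thus map_len / 512, rounded up to a large
	 * page boundary so the window itself can be large page aligned.
	 */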

	/*
	 * allocate vmem for the kmap_ptes
	 */
	ptes = vmem_xalloc(heap_arena, window_size, LEVEL_SIZE(1), 0,
	    0, NULL, NULL, VM_SLEEP);
	mmu.kmap_htables =
	    kmem_alloc(htable_cnt * sizeof (htable_t *), KM_SLEEP);

	/*
	 * Map the page tables that cover kmap into the allocated range.
	 * Note we don't ever htable_release() the kmap page tables - they
	 * can't ever be stolen, freed, etc.
	 */
	for (va = map_addr, i = 0; i < htable_cnt; va += LEVEL_SIZE(1), ++i) {
		ht = htable_create(kas.a_hat, va, 0, NULL);
		if (ht == NULL)
			panic("hat_kmap_init: ht == NULL");
		mmu.kmap_htables[i] = ht;

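		/*
		 * Map each pagetable page itself into the ptes window so
		 * its kmap PTEs can be reached through mmu.kmap_ptes.  On
		 * the hypervisor, pagetable pages must remain read-only to
		 * the kernel (updates go through hypercalls), hence the
		 * PROT_READ-only mapping there.
		 */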
		hat_devload(kas.a_hat, ptes + i * MMU_PAGESIZE,
		    MMU_PAGESIZE, ht->ht_pfn,
#ifdef __xpv
		    PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK,
#else
		    PROT_READ | PROT_WRITE | HAT_NOSYNC | HAT_UNORDERED_OK,
#endif
		    HAT_LOAD | HAT_LOAD_NOCONSIST);
	}

	/*
	 * set information in mmu to activate handling of kmap
	 */
	mmu.kmap_addr = map_addr;
	mmu.kmap_eaddr = map_eaddr;
	mmu.kmap_ptes = (x86pte_t *)ptes;
}

extern caddr_t	kpm_vbase;
extern size_t	kpm_size;

#ifdef __xpv
/*
 * Create the initial segkpm mappings for the hypervisor. To avoid having
 * to deal with page tables being read-only, we make all mappings
 * read-only at first.
 */
static void
xen_kpm_create(paddr_t paddr, level_t lvl)
{
	ulong_t pg_off;

	for (pg_off = 0; pg_off < LEVEL_SIZE(lvl); pg_off += MMU_PAGESIZE) {
		kbm_map((uintptr_t)kpm_vbase + paddr, (paddr_t)0, 0, 1);
		kbm_read_only((uintptr_t)kpm_vbase + paddr + pg_off,
		    paddr + pg_off);
	}
}

/*
 * Try to make all kpm mappings writable. Failures are OK, as those
 * are just pagetable, GDT, etc. pages.
 */
static void
xen_kpm_finish_init(void)
{
	pfn_t gdtpfn = mmu_btop(CPU->cpu_m.mcpu_gdtpa);
	pfn_t pfn;
	page_t *pp;

	for (pfn = 0; pfn < mfn_count; ++pfn) {
		/*
		 * skip gdt
		 */
		if (pfn == gdtpfn)
			continue;

		/*
		 * p_index is a hint that this is a pagetable
		 */
		pp = page_numtopp_nolock(pfn);
		if (pp && pp->p_index) {
			pp->p_index = 0;
			continue;
		}
		(void) xen_kpm_page(pfn, PT_VALID | PT_WRITABLE);
	}
}
#endif

/*
 * Routine to pre-allocate data structures for hat_kern_setup(). It computes
 * how many pagetables it needs by walking the boot loader's page tables.
 */
/*ARGSUSED*/
void
hat_kern_alloc(
	caddr_t	segmap_base,
	size_t	segmap_size,
	caddr_t	ekernelheap)
{
	uintptr_t	last_va = (uintptr_t)-1;	/* catch 1st time */
	uintptr_t	va = 0;
	size_t		size;
	pfn_t		pfn;
	uint_t		prot;
	uint_t		table_cnt = 1;
	uint_t		mapping_cnt;
	level_t		start_level;
	level_t		l;
	struct memlist	*pmem;
	level_t		lpagel = mmu.max_page_level;
	uint64_t	paddr;
	int64_t		psize;
	int		nwindows;

	if (kpm_size > 0) {
		/*
		 * Create the kpm page tables.  When running on the
		 * hypervisor these are made read-only at first.
		 * Later we'll add write permission where possible.
		 */
		for (pmem = phys_install; pmem; pmem = pmem->ml_next) {
			paddr = pmem->ml_address;
			psize = pmem->ml_size;
			while (psize >= MMU_PAGESIZE) {
				/* find the largest page size */
				for (l = lpagel; l > 0; l--) {
					if ((paddr & LEVEL_OFFSET(l)) == 0 &&
					    psize > LEVEL_SIZE(l))
						break;
				}

#if defined(__xpv)
				/*
				 * Create read-only mappings to avoid
				 * conflicting with pagetable usage
				 */
				xen_kpm_create(paddr, l);
#else
				kbm_map((uintptr_t)kpm_vbase + paddr, paddr,
				    l, 1);
#endif
				paddr += LEVEL_SIZE(l);
				psize -= LEVEL_SIZE(l);
			}
		}
	}

	/*
	 * If this machine doesn't have a kpm segment, we need to allocate
	 * a small number of 'windows' which can be used to map pagetables.
	 */
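	/*
	 * Two windows per CPU are intended to cover the hat's per-CPU
	 * pagetable access windows: one for the pagetable currently being
	 * read or written and one for the source of a pagetable copy.
	 */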
	nwindows = (kpm_size == 0) ? 2 * NCPU : 0;

#if defined(__xpv)
	/*
	 * On a hypervisor, these windows are also used by the xpv_panic
	 * code, where we need one window for each level of the pagetable
	 * hierarchy.
	 */
	nwindows = MAX(nwindows, mmu.max_level);
#endif

	if (nwindows != 0) {
		/*
		 * Create the page windows and 1 page of VA in
		 * which we map the PTEs of those windows.
		 */
		mmu.pwin_base = vmem_xalloc(heap_arena, nwindows * MMU_PAGESIZE,
		    LEVEL_SIZE(1), 0, 0, NULL, NULL, VM_SLEEP);
		ASSERT(nwindows <= MMU_PAGESIZE / mmu.pte_size);
		mmu.pwin_pte_va = vmem_xalloc(heap_arena, MMU_PAGESIZE,
		    MMU_PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);

		/*
		 * Find/Create the page table window mappings.
		 */
		paddr = 0;
		(void) find_pte((uintptr_t)mmu.pwin_base, &paddr, 0, 0);
		ASSERT(paddr != 0);
		ASSERT((paddr & MMU_PAGEOFFSET) == 0);
		mmu.pwin_pte_pa = paddr;
#ifdef __xpv
		(void) find_pte((uintptr_t)mmu.pwin_pte_va, NULL, 0, 0);
		kbm_read_only((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa);
#else
		kbm_map((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa, 0, 1);
#endif
	}

	/*
	 * Walk the boot loader's page tables and figure out
	 * how many tables and page mappings there will be.
	 */
	while (kbm_probe(&va, &size, &pfn, &prot) != 0) {
		/*
		 * At each level, if the last_va falls into a new htable,
		 * increment table_cnt. We can stop at the 1st level where
		 * they are in the same htable.
		 */
		start_level = 0;
		while (start_level <= mmu.max_page_level) {
			if (size == LEVEL_SIZE(start_level))
				break;
			start_level++;
		}

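		/*
		 * An htable at level l maps pages of LEVEL_SIZE(l) and
		 * covers LEVEL_SIZE(l + 1) of VA, so comparing va and
		 * last_va shifted by LEVEL_SHIFT(l + 1) tells us whether
		 * this mapping falls into a new pagetable at level l.
		 */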
		for (l = start_level; l < mmu.max_level; ++l) {
			if (va >> LEVEL_SHIFT(l + 1) ==
			    last_va >> LEVEL_SHIFT(l + 1))
				break;
			++table_cnt;
		}
		last_va = va;
		l = (start_level == 0) ? 1 : start_level;
		va = (va & LEVEL_MASK(l)) + LEVEL_SIZE(l);
	}

	/*
	 * Besides the boot loader mappings, we're going to fill in
	 * the entire top level page table for the kernel. Make sure there's
	 * enough reserve for that too.
	 */
	table_cnt += mmu.top_level_count - ((kernelbase >>
	    LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1));
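	/*
	 * The subtracted term is kernelbase's index into the top level
	 * table, so this reserves one pagetable for every top level entry
	 * from kernelbase to the end of the table.
	 */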

	/*
	 * Add 1/4 more to table_cnt for extra slop.  The unused slop is
	 * freed back when htable_adjust_reserve() runs later.
	 */
	table_cnt += table_cnt >> 2;

	/*
	 * We only need mapping entries (hments) for shared pages.
	 * This should be far, far fewer than the total possible;
	 * we'll allocate enough for 1/16 of all possible PTEs.
	 */
	mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4;

	/*
	 * Now create the initial htable/hment reserves
	 */
	htable_initial_reserve(table_cnt);
	hment_reserve(mapping_cnt);
	x86pte_cpu_init(CPU);
}


/*
 * This routine handles the work of creating the kernel's initial mappings
 * by deciphering the mappings in the page tables created by the boot program.
 *
 * We maintain large page mappings, but only to a level 1 pagesize.
 * The boot loader can only add new mappings once this function starts.
 * In particular it cannot change the pagesize used for any existing
 * mappings, or this code breaks!
 */

void
hat_kern_setup(void)
{
	/*
	 * Attach htables to the existing pagetables
	 */
	/* BEGIN CSTYLED */
	htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
#ifdef __xpv
	    mmu_btop(xen_info->pt_base - ONE_GIG));
#else
	    mmu_btop(getcr3_pa()));
#endif
	/* END CSTYLED */

#if defined(__xpv)
	/*
	 * Try to make the kpm mappings r/w. Failures here are OK, as
	 * it's probably just a pagetable.
	 */
	xen_kpm_finish_init();
#endif

	/*
	 * The kernel HAT is now officially open for business.
	 */
	khat_running = 1;

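	/*
	 * Record this CPU as actively using the kernel hat so that TLB
	 * invalidation cross-calls for kernel mappings will target it.
	 */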
	CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
	CPU->cpu_current_hat = kas.a_hat;
}

#ifndef __xpv

/*
 * Note that the INVPCID_ALL* variants can be used even in the !PCIDE case,
 * but INVPCID_ADDR cannot.
 */
static void
invpcid(uint64_t type, uint64_t pcid, uintptr_t addr)
{
	ulong_t	flag;
	uint64_t cr4;

	if (x86_use_invpcid == 1) {
		ASSERT(is_x86_feature(x86_featureset, X86FSET_INVPCID));
		invpcid_insn(type, pcid, addr);
		return;
	}

	switch (type) {
	case INVPCID_ALL_GLOBAL:
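		/*
		 * Without the invpcid instruction, toggling CR4.PGE flushes
		 * the entire TLB, including global entries and entries for
		 * all PCIDs.  Interrupts are disabled so nothing runs
		 * between the two CR4 writes with global mappings disabled.
		 */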
		flag = intr_clear();
		cr4 = getcr4();
		setcr4(cr4 & ~(ulong_t)CR4_PGE);
		setcr4(cr4 | CR4_PGE);
		intr_restore(flag);
		break;

	case INVPCID_ALL_NONGLOBAL:
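		/*
		 * Without PCIDs, reloading %cr3 drops all non-global TLB
		 * entries.  With PCIDs enabled, a %cr3 write only affects
		 * the current PCID, so fall back to the CR4.PGE toggle,
		 * which also (harmlessly) drops global entries.
		 */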
		if (!(getcr4() & CR4_PCIDE)) {
			reload_cr3();
		} else {
			flag = intr_clear();
			cr4 = getcr4();
			setcr4(cr4 & ~(ulong_t)CR4_PGE);
			setcr4(cr4 | CR4_PGE);
			intr_restore(flag);
		}
		break;

	case INVPCID_ADDR:
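		/*
		 * A single user address can't be targeted in the PCID_USER
		 * context without the invpcid instruction, so
		 * tr_mmu_flush_user_range() does the invalidation from the
		 * user %cr3; interrupts must stay off while it runs.
		 */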
		if (pcid == PCID_USER) {
			flag = intr_clear();
			ASSERT(addr < kernelbase);
			ASSERT(ON_USER_HAT(CPU));
			ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
			tr_mmu_flush_user_range(addr, MMU_PAGESIZE,
			    MMU_PAGESIZE, CPU->cpu_m.mcpu_kpti.kf_user_cr3);
			intr_restore(flag);
		} else {
			mmu_invlpg((caddr_t)addr);
		}
		break;

	default:
		panic("unsupported invpcid(%lu)", type);
		break;
	}
}

/*
 * Flush one kernel mapping.
 *
 * We want to assert on kernel space here mainly for reasoning about the PCIDE
 * case: namely, this flush should never need to flush a non-current PCID
 * mapping.  This presumes we never have reason to flush the kernel regions
 * available to PCID_USER (the trampolines and so on).  It also relies on
 * PCID_KERNEL == PCID_NONE.
 */
void
mmu_flush_tlb_kpage(uintptr_t va)
{
	ASSERT(va >= kernelbase);
	ASSERT(getpcid() == PCID_KERNEL);
	mmu_invlpg((caddr_t)va);
}

/*
 * Flush one mapping: local CPU version of hat_tlb_inval().
 *
 * If this is a userspace address in the PCIDE case, we need two invalidations:
 * one for any potentially stale PCID_USER mapping, and one for any mapping
 * established while in the kernel.
 */
void
mmu_flush_tlb_page(uintptr_t va)
{
	ASSERT(getpcid() == PCID_KERNEL);

	if (va >= kernelbase) {
		mmu_flush_tlb_kpage(va);
		return;
	}

	if (!(getcr4() & CR4_PCIDE)) {
		mmu_invlpg((caddr_t)va);
		return;
	}

	/*
	 * Yes, kas will need to flush below kernelspace, at least during boot.
	 * But there's no PCID_USER context.
	 */
	if (ON_USER_HAT(CPU))
		invpcid(INVPCID_ADDR, PCID_USER, va);
	invpcid(INVPCID_ADDR, PCID_KERNEL, va);
}

static void
mmu_flush_tlb_range(uintptr_t addr, size_t len, size_t pgsz)
{
	EQUIV(addr < kernelbase, (addr + len - 1) < kernelbase);
	ASSERT(len > 0);
	ASSERT(pgsz != 0);

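	/*
	 * With a real invpcid instruction, or with no PCIDs at all,
	 * per-page invalidation via mmu_flush_tlb_page() handles both the
	 * kernel and user PCID cases cheaply.
	 */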
	if (!(getcr4() & CR4_PCIDE) || x86_use_invpcid == 1) {
		for (uintptr_t va = addr; va < (addr + len); va += pgsz)
			mmu_flush_tlb_page(va);
		return;
	}

	/*
	 * As an emulated invpcid() in the PCIDE case requires jumping
	 * cr3s, we batch the invalidations.  We should only need to flush the
	 * user range if we're on a user-space HAT.
	 */
	if (addr < kernelbase && ON_USER_HAT(CPU)) {
		ulong_t flag = intr_clear();
		ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
		tr_mmu_flush_user_range(addr, len, pgsz,
		    CPU->cpu_m.mcpu_kpti.kf_user_cr3);
		intr_restore(flag);
	}

	for (uintptr_t va = addr; va < (addr + len); va += pgsz)
		mmu_invlpg((caddr_t)va);
}

/*
 * MMU TLB (and PT cache) flushing on this CPU.
 *
 * FLUSH_TLB_ALL: invalidate everything, all PCIDs, all PT_GLOBAL.
 * FLUSH_TLB_NONGLOBAL: invalidate all PCIDs, excluding PT_GLOBAL.
 * FLUSH_TLB_RANGE: invalidate the given range, including PCID_USER
 * mappings as appropriate.  If using invpcid, PT_GLOBAL mappings are not
 * invalidated.
 */
void
mmu_flush_tlb(flush_tlb_type_t type, tlb_range_t *range)
{
	ASSERT(getpcid() == PCID_KERNEL);

	switch (type) {
	case FLUSH_TLB_ALL:
		ASSERT(range == NULL);
		invpcid(INVPCID_ALL_GLOBAL, 0, 0);
		break;

	case FLUSH_TLB_NONGLOBAL:
		ASSERT(range == NULL);
		invpcid(INVPCID_ALL_NONGLOBAL, 0, 0);
		break;

	case FLUSH_TLB_RANGE: {
		mmu_flush_tlb_range(range->tr_va, TLB_RANGE_LEN(range),
		    LEVEL_SIZE(range->tr_level));
		break;
	}

	default:
		panic("invalid call mmu_flush_tlb(%d)", type);
		break;
	}
}

#endif /* ! __xpv */