/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright 2018 Joyent, Inc.  All rights reserved.
 * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
 */

/*
 * VM - Hardware Address Translation management for i386 and amd64
 *
 * Implementation of the interfaces described in <common/vm/hat.h>
 *
 * Nearly all the details of how the hardware is managed should not be
 * visible outside this layer except for misc. machine specific functions
 * that work in conjunction with this code.
 *
 * Routines used only inside of i86pc/vm start with hati_ for HAT Internal.
 */

/*
 * amd64 HAT Design
 *
 * ----------
 * Background
 * ----------
 *
 * On x86, the address space is shared between a user process and the kernel.
 * This is different from SPARC. Conventionally, the kernel lives at the top of
 * the address space and the user process gets to enjoy the rest of it. If you
 * look at the image of the address map in uts/i86pc/os/startup.c, you'll get a
 * rough sense of how the address space is laid out and used.
 *
 * Every unique address space is represented by an instance of a HAT structure
 * called a 'hat_t'. In addition to a hat_t structure for each process, there is
 * also one that is used for the kernel (kas.a_hat), and each CPU ultimately
 * also has a HAT.
 *
 * Each HAT contains a pointer to its root page table. This root page table is
 * what we call an L3 page table in illumos and what Intel calls the PML4. It
 * is the physical address of the L3 table that we place in the %cr3 register
 * which the processor uses.
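 *
 * In other words, making a HAT's mappings active on a CPU ultimately boils
 * down to loading its root table's physical address into %cr3, conceptually
 * (an illustrative sketch only; hat_switch() below is the real logic, which
 * also has to handle the PCP and KPTI machinery described later):
 *
 *	setcr3(pfn_to_pa(hat->hat_htable->ht_pfn));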
 *
 * Each of the many layers of the page table is represented by a structure
 * called an htable_t. The htable_t manages a set of 512 8-byte entries. The
 * number of entries in a given page table is constant across all different
 * level page tables. Note, this is only true on amd64. This has not always been
 * the case on x86.
 *
 * Each entry in a page table, generally referred to as a PTE, may refer to
 * another page table or a memory location, depending on the level of the page
 * table and the use of large pages. Importantly, the top-level L3 page table
 * (PML4) only supports linking to further page tables. This is also true on
 * systems which support a 5th level page table (which we do not currently
 * support).
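 *
 * As a quick reference, the illumos level numbering used throughout this file
 * maps onto the Intel names as follows (amd64, 4-level paging):
 *
 *	level	Intel name	one entry maps
 *	-----	----------	-------------------------------
 *	0	PT		a 4K page
 *	1	PD		a 2M page or an L0 page table
 *	2	PDPT		a 1G page or an L1 page table
 *	3	PML4		an L2 page table (never a page)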
 *
 * Historically, on x86, when a process was running on a CPU, the root of its
 * page table was inserted into %cr3 on each CPU on which it was currently
 * running. When processes switched (by calling hat_switch()), the value in
 * %cr3 on that CPU would change to that of the new HAT. While this behavior
 * is still maintained in the xpv kernel, this is not what is done today.
 *
 * -------------------
 * Per-CPU Page Tables
 * -------------------
 *
 * Throughout the system the 64-bit kernel has a notion of what it calls a
 * per-CPU page table or PCP. The notion of a per-CPU page table was originally
 * introduced as part of the work to support x86 PAE. On the 64-bit kernel, it
 * was first used for 32-bit processes running on the 64-bit kernel. The
 * rationale behind this was that each 32-bit process could have all of its
 * memory represented in a single L2 page table, as each L2 page table entry
 * represents 1 GB of memory.
 *
 * Following on from this, the idea was that given that all of the L3 page table
 * entries for 32-bit processes are basically going to be identical with the
 * exception of the first entry in the page table, why not share those page
 * table entries. This gave rise to the idea of a per-CPU page table.
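 *
 * To make the arithmetic concrete: a 32-bit process can address at most 2^32
 * bytes, and one L2 page table entry maps 2^30 bytes, so the whole 32-bit
 * address space fits in the first 4 of the 512 entries of a single L2 page
 * table. The L3 tables of all 32-bit processes therefore differ only in that
 * first entry.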
 *
 * The way this works is that we have a member in the machcpu_t called the
 * mcpu_hat_info. That structure contains two different 4k pages: one that
 * represents the L3 page table and one that represents an L2 page table. When
 * the CPU starts up, the L3 page table entries are copied in from the kernel's
 * page table. The L3 kernel entries do not change throughout the lifetime of
 * the kernel. The kernel portion of these L3 pages for each CPU has the same
 * entries, meaning that they point to the same L2 page tables and thus see a
 * consistent view of the world.
 *
 * When a 32-bit process is loaded into this world, we copy the 32-bit process's
 * four top-level page table entries into the CPU's L2 page table and then set
 * the CPU's first L3 page table entry to point to the CPU's L2 page.
 * Specifically, in hat_pcp_update(), we're copying from the process's
 * HAT_COPIED_32 HAT into the page tables specific to this CPU.
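 *
 * The heart of that update is small. Conceptually (an illustrative
 * simplification only; the real hat_pcp_update() must also handle the 64-bit
 * KPTI case described below and runs with preemption disabled):
 *
 *	hci = cpu->cpu_m.mcpu_hat_info;
 *	bcopy(hat->hat_copied_ptes, hci->hci_pcp_l2ptes,
 *	    hat->hat_num_copied * sizeof (x86pte_t));
 *	hci->hci_pcp_l3ptes[0] = <PTP entry for the pfn of the L2 page>;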
 *
 * As part of the implementation of kernel page table isolation, this was also
 * extended to 64-bit processes. When a 64-bit process runs, we'll copy their L3
 * PTEs across into the current CPU's L3 page table. (As we can't do the
 * first-L3-entry trick for 64-bit processes, ->hci_pcp_l2ptes is unused in this
 * case.)
 *
 * The use of per-CPU page tables has a lot of implementation ramifications. A
 * HAT that runs a user process will be flagged with the HAT_COPIED flag to
 * indicate that it is using the per-CPU page table functionality. In tandem
 * with the HAT, the top-level htable_t will be flagged with the HTABLE_COPIED
 * flag. If the HAT represents a 32-bit process, then we will also set the
 * HAT_COPIED_32 flag on that hat_t.
 *
 * These two flags work together. The top-level htable_t when using per-CPU page
 * tables is 'virtual'. We never allocate a ptable for this htable_t (i.e.
 * ht->ht_pfn is PFN_INVALID).  Instead, when we need to modify a PTE in an
 * HTABLE_COPIED ptable, x86pte_access_pagetable() will redirect any accesses to
 * ht_hat->hat_copied_ptes.
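 *
 * Conceptually that redirection is just the following (a simplified sketch,
 * with 'entry' being the PTE index; the real function must also handle
 * mapping in ordinary, physical page tables):
 *
 *	if (ht->ht_flags & HTABLE_COPIED)
 *		return (&ht->ht_hat->hat_copied_ptes[entry]);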
 *
 * Of course, such a modification won't actually modify the HAT_PCP page tables
 * that were copied from the HAT_COPIED htable. When we change the top level
 * page table entries (L2 PTEs for a 32-bit process and L3 PTEs for a 64-bit
 * process), we need to make sure to trigger hat_pcp_update() on all CPUs that
 * are currently tied to this HAT (including the current CPU).
 *
 * To do this, PCP piggy-backs on TLB invalidation, specifically via the
 * hat_tlb_inval() path from link_ptp() and unlink_ptp().
 *
 * (Importantly, in all such cases, when this is in operation, the top-level
 * entry should not be able to refer to an actual page table entry that can be
 * changed and consolidated into a large page. If large page consolidation is
 * required here, then there will be much that needs to be reconsidered.)
 *
 * -----------------------------------------------
 * Kernel Page Table Isolation and the Per-CPU HAT
 * -----------------------------------------------
 *
 * All Intel CPUs that support speculative execution and paging are subject to a
 * series of bugs that have been termed 'Meltdown'. These exploits allow a user
 * process to read kernel memory through cache side channels and speculative
 * execution. To mitigate this on vulnerable CPUs, we need to use a technique
 * called kernel page table isolation. What this requires is that we have two
 * different page table roots. When executing in kernel mode, we will use a %cr3
 * value that has both the user and kernel pages. However, when executing in
 * user mode, we will use a %cr3 value that has all of the user pages but only
 * the subset of kernel pages required to operate.
 *
 * The kernel pages that we need mapped are:
 *
 *   o The kernel text that allows us to switch between the %cr3 values.
 *   o The current global descriptor table (GDT)
 *   o The current interrupt descriptor table (IDT)
 *   o The current task switching state (TSS)
 *   o The current local descriptor table (LDT)
 *   o Stacks and scratch space used by the interrupt handlers
 *
 * For more information on the stack switching techniques, construction of the
 * trampolines, and more, please see i86pc/ml/kpti_trampolines.s. The most
 * important properties of these mappings are the following two constraints:
 *
 *   o The mappings are all per-CPU (except for read-only text)
 *   o The mappings are static. They are all established before the CPU is
 *     started (with the exception of the boot CPU).
 *
 * To facilitate kernel page table isolation we employ the per-CPU page tables
 * discussed in the previous section, so that each CPU fundamentally has a
 * second page table root. There is both a kernel L3 page table
 * (hci_pcp_l3ptes) and a user L3 page table (hci_user_l3ptes). Both will have
 * the user page table entries copied into them, the same way that we
 * discussed in the section 'Per-CPU Page Tables'.
 *
 * The complex part of this is how we construct the set of kernel mappings that
 * should be present when running with the user page table. To answer that, we
 * add the notion of a per-CPU HAT. This HAT functions like a normal HAT,
 * except that it's not really associated with an address space the same way
 * that other HATs are.
 *
 * This HAT lives off of the 'struct hat_cpu_info' which is a member of the
 * machcpu in the member hci_user_hat. We use this per-CPU HAT to create the set
 * of kernel mappings that should be present on this CPU. The kernel mappings
 * are added to the per-CPU HAT through the function hati_cpu_punchin(). Once a
 * mapping has been punched in, it may not be punched out. The reason that we
 * opt to leverage a HAT structure is that it knows how to allocate and manage
 * all of the lower level page tables as required.
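 *
 * For example, bringing a CPU online punches its static KPTI mappings into
 * that CPU's per-CPU HAT with calls along the lines of the following, where
 * gdt_va stands in for the virtual address of this CPU's GDT (illustrative
 * only; see hat_pcp_setup() for the real set of punched-in mappings):
 *
 *	hati_cpu_punchin(cpu, gdt_va, PROT_READ);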
 *
 * Because all of the mappings are present at the beginning of time for this CPU
 * and none of the mappings are in the kernel pageable segment, we don't have to
 * worry about faulting on these HAT structures; thus the notion of the current
 * HAT that we're using is always the appropriate HAT for the process (usually a
 * user HAT or the kernel's HAT).
 *
 * A further constraint we place on the system with these per-CPU HATs is that
 * they are not subject to htable_steal(). Because each CPU will have a rather
 * fixed number of page tables, the same way that we don't steal from the
 * kernel's HAT, it was determined that we should not steal from this HAT due to
 * the complications involved and somewhat criminal nature of htable_steal().
 *
 * The per-CPU HAT is initialized in hat_pcp_setup() which is called as part of
 * onlining the CPU, but before the CPU is actually started. The per-CPU HAT is
 * removed in hat_pcp_teardown() which is called when a CPU is being offlined to
 * be removed from the system (which is different from what psradm usually
 * does).
 *
 * Finally, once the CPU has been onlined, the set of mappings in the per-CPU
 * HAT must not change. The HAT-related functions that we call are not meant to
 * be called when we're switching between processes. For example, it is quite
 * possible that if they were, they would try to grab an htable mutex which
 * another thread might hold. One needs to treat hat_switch() as though it were
 * running above LOCK_LEVEL and therefore it _must not_ block under any
 * circumstance.
 */

#include <sys/machparam.h>
#include <sys/machsystm.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/disp.h>
#include <sys/shm.h>
#include <sys/sysmacros.h>
#include <sys/vmem.h>
#include <sys/vmsystm.h>
#include <sys/promif.h>
#include <sys/var.h>
#include <sys/x86_archext.h>
#include <sys/atomic.h>
#include <sys/bitmap.h>
#include <sys/controlregs.h>
#include <sys/bootconf.h>
#include <sys/bootsvcs.h>
#include <sys/bootinfo.h>
#include <sys/archsystm.h>

#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>
#ifdef __xpv
#include <sys/hypervisor.h>
#endif
#include <vm/kboot_mmu.h>
#include <vm/seg_spt.h>

#include <sys/cmn_err.h>

/*
 * Basic parameters for hat operation.
 */
struct hat_mmu_info mmu;

/*
 * The page that is the kernel's top level pagetable.
 *
 * For 32 bit PAE support on i86pc, the kernel hat will use the first 4 entries
 * on this 4K page for its top level page table. The remaining groups of
 * 4 entries are used for per processor copies of user PCP pagetables for
 * running threads.  See hat_switch() and reload_pae32() for details.
 *
 * pcp_page[0..3]  - level==2 PTEs for kernel HAT
 * pcp_page[4..7]  - level==2 PTEs for user thread on cpu 0
 * pcp_page[8..11] - level==2 PTEs for user thread on cpu 1
 * etc...
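 *
 * That is, the four level==2 PTEs for the user thread on cpu n begin at
 * pcp_page[4 * (n + 1)].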
 *
 * On the 64-bit kernel, this is the normal root of the page table and there is
 * nothing special about it when used for other CPUs.
 */
static x86pte_t *pcp_page;

/*
 * forward declaration of internal utility routines
 */
static x86pte_t hati_update_pte(htable_t *ht, uint_t entry, x86pte_t expected,
	x86pte_t new);

/*
 * The kernel address space exists in all non-HAT_COPIED HATs. To implement
 * this, the kernel reserves a fixed number of entries in the topmost level(s)
 * of page tables. The values are set up during startup and then copied to
 * every user hat created by hat_alloc(). This means that kernelbase must be:
 *
 *	  4Meg aligned for 32 bit kernels
 *	512Gig aligned for x86_64 64 bit kernel
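 *
 * (The 512Gig figure falls out of the geometry described earlier: a single
 * top level entry maps 512 entries * 1G each, so reserving whole top level
 * entries for the kernel forces kernelbase to a 512Gig boundary.)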
 *
 * The hat_kernel_range_ts describe what needs to be copied from the kernel hat
 * to each user hat.
 */
typedef struct hat_kernel_range {
	level_t		hkr_level;
	uintptr_t	hkr_start_va;
	uintptr_t	hkr_end_va;	/* zero means to end of memory */
} hat_kernel_range_t;
#define	NUM_KERNEL_RANGE 2
static hat_kernel_range_t kernel_ranges[NUM_KERNEL_RANGE];
static int num_kernel_ranges;

uint_t use_boot_reserve = 1;	/* cleared after early boot process */
uint_t can_steal_post_boot = 0;	/* set late in boot to enable stealing */

/*
 * enable_1gpg: controls 1g page support for user applications.
 * By default, 1g pages are exported to user applications. enable_1gpg can
 * be set to 0 to disable that.
 */
int	enable_1gpg = 1;

/*
 * AMD shanghai processors provide better management of 1gb ptes in their TLBs.
 * By default, 1g page support will be disabled for pre-shanghai AMD
 * processors that don't have optimal tlb support for the 1g page size.
 * chk_optimal_1gtlb can be set to 0 to force 1g page support on sub-optimal
 * processors.
 */
int	chk_optimal_1gtlb = 1;

#ifdef DEBUG
uint_t	map1gcnt;
#endif

/*
 * A cpuset for all cpus. This is used for kernel address cross calls, since
 * the kernel addresses apply to all cpus.
 */
cpuset_t khat_cpuset;

/*
 * management stuff for hat structures
 */
kmutex_t	hat_list_lock;
kcondvar_t	hat_list_cv;
kmem_cache_t	*hat_cache;
kmem_cache_t	*hat_hash_cache;
kmem_cache_t	*hat32_hash_cache;

/*
 * Simple statistics
 */
struct hatstats hatstat;

/*
 * Some earlier hypervisor versions do not emulate cmpxchg of PTEs
 * correctly.  For such hypervisors we must set PT_USER for kernel
 * entries ourselves (normally the emulation would set PT_USER for
 * kernel entries and PT_USER|PT_GLOBAL for user entries).  pt_kern is
 * thus set appropriately.  Note that dboot/kbm is OK, as only the full
 * HAT uses cmpxchg() and the other paths (hypercall etc.) were never
 * incorrect.
 */
int pt_kern;

#ifndef __xpv
extern pfn_t memseg_get_start(struct memseg *);
#endif

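/*
 * These macros maintain the software copy of a page's hardware referenced,
 * modified and read-only state in the page_t's p_nrm field. The updates use
 * atomic byte operations since mappings on multiple CPUs may fold their PTE
 * bits into the same page_t concurrently.
 */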
#define	PP_GETRM(pp, rmmask)	(pp->p_nrm & rmmask)
#define	PP_ISMOD(pp)		PP_GETRM(pp, P_MOD)
#define	PP_ISREF(pp)		PP_GETRM(pp, P_REF)
#define	PP_ISRO(pp)		PP_GETRM(pp, P_RO)

#define	PP_SETRM(pp, rm)	atomic_orb(&(pp->p_nrm), rm)
#define	PP_SETMOD(pp)		PP_SETRM(pp, P_MOD)
#define	PP_SETREF(pp)		PP_SETRM(pp, P_REF)
#define	PP_SETRO(pp)		PP_SETRM(pp, P_RO)

#define	PP_CLRRM(pp, rm)	atomic_andb(&(pp->p_nrm), ~(rm))
#define	PP_CLRMOD(pp)		PP_CLRRM(pp, P_MOD)
#define	PP_CLRREF(pp)		PP_CLRRM(pp, P_REF)
#define	PP_CLRRO(pp)		PP_CLRRM(pp, P_RO)
#define	PP_CLRALL(pp)		PP_CLRRM(pp, P_MOD | P_REF | P_RO)

/*
 * kmem cache constructor for struct hat
 */
/*ARGSUSED*/
static int
hati_constructor(void *buf, void *handle, int kmflags)
{
	hat_t	*hat = buf;

	mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
	bzero(hat->hat_pages_mapped,
	    sizeof (pgcnt_t) * (mmu.max_page_level + 1));
	hat->hat_ism_pgcnt = 0;
	hat->hat_stats = 0;
	hat->hat_flags = 0;
	CPUSET_ZERO(hat->hat_cpus);
	hat->hat_htable = NULL;
	hat->hat_ht_hash = NULL;
	return (0);
}

/*
 * Put a hat at the start of the global list of all hats (used by stealing).
 *
 * kas.a_hat is not in the list but is instead used to find the
 * first and last items in the list.
 *
 * - kas.a_hat->hat_next points to the start of the user hats.
 *   The list ends where hat->hat_next == NULL
 *
 * - kas.a_hat->hat_prev points to the last of the user hats.
 *   The list begins where hat->hat_prev == NULL
 */
static void
hat_list_append(hat_t *hat)
{
	mutex_enter(&hat_list_lock);
	hat->hat_prev = NULL;
	hat->hat_next = kas.a_hat->hat_next;
	if (hat->hat_next)
		hat->hat_next->hat_prev = hat;
	else
		kas.a_hat->hat_prev = hat;
	kas.a_hat->hat_next = hat;
	mutex_exit(&hat_list_lock);
}
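
/*
 * With hat_list_lock held, walking every user hat (as the htable stealing
 * code does) then looks like this sketch:
 *
 *	for (hat = kas.a_hat->hat_next; hat != NULL; hat = hat->hat_next)
 *		...
 */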

/*
 * Allocate a hat structure for as. We also create the top level
 * htable and initialize it to contain the kernel hat entries.
 */
hat_t *
hat_alloc(struct as *as)
{
	hat_t			*hat;
	htable_t		*ht;	/* top level htable */
	uint_t			use_copied;
	uint_t			r;
	hat_kernel_range_t	*rp;
	uintptr_t		va;
	uintptr_t		eva;
	uint_t			start;
	uint_t			cnt;
	htable_t		*src;
	boolean_t		use_hat32_cache;

	/*
	 * Once we start creating user process HATs we can enable
	 * the htable_steal() code.
	 */
	if (can_steal_post_boot == 0)
		can_steal_post_boot = 1;

	ASSERT(AS_WRITE_HELD(as));
	hat = kmem_cache_alloc(hat_cache, KM_SLEEP);
	hat->hat_as = as;
	mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
	ASSERT(hat->hat_flags == 0);

#if defined(__xpv)
	/*
	 * No PCP stuff on the hypervisor due to the 64-bit split top level
	 * page tables.  On 32-bit it's not needed as the hypervisor takes
	 * care of copying the top level PTEs to a below 4Gig page.
	 */
	use_copied = 0;
	use_hat32_cache = B_FALSE;
	hat->hat_max_level = mmu.max_level;
	hat->hat_num_copied = 0;
	hat->hat_flags = 0;
#else	/* __xpv */

	/*
	 * All processes use HAT_COPIED on the 64-bit kernel if KPTI is
	 * turned on.
	 */
	if (ttoproc(curthread)->p_model == DATAMODEL_ILP32) {
		use_copied = 1;
		hat->hat_max_level = mmu.max_level32;
		hat->hat_num_copied = mmu.num_copied_ents32;
		use_hat32_cache = B_TRUE;
		hat->hat_flags |= HAT_COPIED_32;
		HATSTAT_INC(hs_hat_copied32);
	} else if (kpti_enable == 1) {
		use_copied = 1;
		hat->hat_max_level = mmu.max_level;
		hat->hat_num_copied = mmu.num_copied_ents;
		use_hat32_cache = B_FALSE;
		HATSTAT_INC(hs_hat_copied64);
	} else {
		use_copied = 0;
		use_hat32_cache = B_FALSE;
		hat->hat_max_level = mmu.max_level;
		hat->hat_num_copied = 0;
		hat->hat_flags = 0;
		HATSTAT_INC(hs_hat_normal64);
	}
#endif	/* __xpv */
	if (use_copied) {
		hat->hat_flags |= HAT_COPIED;
		bzero(hat->hat_copied_ptes, sizeof (hat->hat_copied_ptes));
	}

	/*
	 * Allocate the htable hash. For 32-bit PCP processes we use the
	 * hat32_hash_cache. However, for 64-bit PCP processes we do not as the
	 * number of entries that they have to handle is closer to
	 * hat_hash_cache in count (though there will be more wastage when we
	 * have more DRAM in the system and thus push down the user address
	 * range).
	 */
	if (use_hat32_cache) {
		hat->hat_num_hash = mmu.hat32_hash_cnt;
		hat->hat_ht_hash = kmem_cache_alloc(hat32_hash_cache, KM_SLEEP);
	} else {
		hat->hat_num_hash = mmu.hash_cnt;
		hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_SLEEP);
	}
	bzero(hat->hat_ht_hash, hat->hat_num_hash * sizeof (htable_t *));

	/*
	 * Initialize Kernel HAT entries at the top of the top level page
	 * tables for the new hat.
	 */
	hat->hat_htable = NULL;
	hat->hat_ht_cached = NULL;
	XPV_DISALLOW_MIGRATE();
	ht = htable_create(hat, (uintptr_t)0, TOP_LEVEL(hat), NULL);
	hat->hat_htable = ht;

#if defined(__amd64)
	if (hat->hat_flags & HAT_COPIED)
		goto init_done;
#endif

	for (r = 0; r < num_kernel_ranges; ++r) {
		rp = &kernel_ranges[r];
		for (va = rp->hkr_start_va; va != rp->hkr_end_va;
		    va += cnt * LEVEL_SIZE(rp->hkr_level)) {

			if (rp->hkr_level == TOP_LEVEL(hat))
				ht = hat->hat_htable;
			else
				ht = htable_create(hat, va, rp->hkr_level,
				    NULL);

			start = htable_va2entry(va, ht);
			cnt = HTABLE_NUM_PTES(ht) - start;
			eva = va +
			    ((uintptr_t)cnt << LEVEL_SHIFT(rp->hkr_level));
			if (rp->hkr_end_va != 0 &&
			    (eva > rp->hkr_end_va || eva == 0))
				cnt = htable_va2entry(rp->hkr_end_va, ht) -
				    start;

#if defined(__i386) && !defined(__xpv)
			if (ht->ht_flags & HTABLE_COPIED) {
				bcopy(&pcp_page[start],
				    &hat->hat_copied_ptes[start],
				    cnt * sizeof (x86pte_t));
				continue;
			}
#endif
			src = htable_lookup(kas.a_hat, va, rp->hkr_level);
			ASSERT(src != NULL);
			x86pte_copy(src, ht, start, cnt);
			htable_release(src);
		}
	}

init_done:

#if defined(__xpv)
	/*
	 * Pin top level page tables after initializing them
	 */
	xen_pin(hat->hat_htable->ht_pfn, mmu.max_level);
#if defined(__amd64)
	xen_pin(hat->hat_user_ptable, mmu.max_level);
#endif
#endif
	XPV_ALLOW_MIGRATE();

	hat_list_append(hat);

	return (hat);
}

#if !defined(__xpv)
/*
 * Cons up a HAT for a CPU. This represents the user mappings. This will have
 * various kernel pages punched into it manually. Importantly, this hat is
 * ineligible for stealing. We really don't want to deal with this ever
 * faulting and figuring out that this is happening, much like we don't with
 * kas.
 */
static hat_t *
hat_cpu_alloc(cpu_t *cpu)
{
	hat_t *hat;
	htable_t *ht;

	hat = kmem_cache_alloc(hat_cache, KM_SLEEP);
	hat->hat_as = NULL;
	mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
	hat->hat_max_level = mmu.max_level;
	hat->hat_num_copied = 0;
	hat->hat_flags = HAT_PCP;

	hat->hat_num_hash = mmu.hash_cnt;
	hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_SLEEP);
	bzero(hat->hat_ht_hash, hat->hat_num_hash * sizeof (htable_t *));

	hat->hat_next = hat->hat_prev = NULL;

	/*
	 * Because this HAT will only ever be used by the current CPU, we'll go
	 * ahead and set the CPUSET up to only point to the CPU in question.
	 */
	CPUSET_ADD(hat->hat_cpus, cpu->cpu_id);

	hat->hat_htable = NULL;
	hat->hat_ht_cached = NULL;
	ht = htable_create(hat, (uintptr_t)0, TOP_LEVEL(hat), NULL);
	hat->hat_htable = ht;

	hat_list_append(hat);

	return (hat);
}
#endif /* !__xpv */

/*
 * The process has finished executing but its 'as' has not been cleaned up yet.
 */
/*ARGSUSED*/
void
hat_free_start(hat_t *hat)
{
	ASSERT(AS_WRITE_HELD(hat->hat_as));

	/*
	 * If the hat is currently a stealing victim, wait for the stealing
	 * to finish.  Once we mark it as HAT_FREEING, htable_steal()
	 * won't look at its pagetables anymore.
	 */