xref: /illumos-gate/usr/src/uts/intel/os/desctbls.c (revision a0955b86)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright 2018 Joyent, Inc. All rights reserved.
 */

/*
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/tss.h>
#include <sys/segments.h>
#include <sys/trap.h>
#include <sys/cpuvar.h>
#include <sys/bootconf.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kobj.h>
#include <sys/cmn_err.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/mach_mmu.h>
#include <sys/systm.h>
#include <sys/note.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#include <vm/as.h>
#endif

#include <sys/promif.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>
#include <vm/hat_pte.h>

/*
 * cpu0 and default tables and structures.
 */
user_desc_t	*gdt0;
#if !defined(__xpv)
desctbr_t	gdt0_default_r;
#endif

gate_desc_t	*idt0;			/* interrupt descriptor table */
#if defined(__i386)
desctbr_t	idt0_default_r;		/* describes idt0 in IDTR format */
#endif

tss_t		*ktss0;			/* kernel task state structure */

#if defined(__i386)
tss_t		*dftss0;		/* #DF double-fault exception */
#endif	/* __i386 */

user_desc_t	zero_udesc;		/* base zero user desc native procs */
user_desc_t	null_udesc;		/* null user descriptor */
system_desc_t	null_sdesc;		/* null system descriptor */

#if defined(__amd64)
user_desc_t	zero_u32desc;		/* 32-bit compatibility procs */
#endif	/* __amd64 */

#if defined(__amd64)
user_desc_t	ucs_on;
user_desc_t	ucs_off;
user_desc_t	ucs32_on;
user_desc_t	ucs32_off;
#endif	/* __amd64 */

/*
 * If the size of this is changed, you must update hat_pcp_setup() and the
 * definitions in exception.s
 */
extern char dblfault_stack0[DEFAULTSTKSZ];
extern char nmi_stack0[DEFAULTSTKSZ];
extern char mce_stack0[DEFAULTSTKSZ];

extern void	fast_null(void);
extern hrtime_t	get_hrtime(void);
extern hrtime_t	gethrvtime(void);
extern hrtime_t	get_hrestime(void);
extern uint64_t	getlgrp(void);

void (*(fasttable[]))(void) = {
	fast_null,			/* T_FNULL routine */
	fast_null,			/* T_FGETFP routine (initially null) */
	fast_null,			/* T_FSETFP routine (initially null) */
	(void (*)())get_hrtime,		/* T_GETHRTIME */
	(void (*)())gethrvtime,		/* T_GETHRVTIME */
	(void (*)())get_hrestime,	/* T_GETHRESTIME */
	(void (*)())getlgrp		/* T_GETLGRP */
};
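
/*
 * Illustrative note (a hedged sketch, not part of the original file):
 * userland reaches fasttable[] through the gate installed at vector
 * T_FASTTRAP (210) by init_idt_common() below. Assuming the convention
 * of passing the table index in %eax, a caller would look roughly like:
 *
 *	movl	$T_GETHRTIME, %eax	/ index into fasttable[]
 *	int	$T_FASTTRAP		/ enter via the fast trap gate
 */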

/*
 * Structure containing pre-computed descriptors to allow us to temporarily
 * interpose on a standard handler.
 */
struct interposing_handler {
	int ih_inum;
	gate_desc_t ih_interp_desc;
	gate_desc_t ih_default_desc;
};

/*
 * The brand infrastructure interposes on two handlers, and we use one as a
 * NULL signpost.
 */
static struct interposing_handler brand_tbl[2];

/*
 * software prototypes for default local descriptor table
 */

/*
 * Routines for loading segment descriptors in a format the hardware
 * can understand.
 */

/*
 * In long mode we have the new L or long mode attribute bit
 * for code segments. Only the conforming bit in type is used along
 * with the descriptor privilege level (DPL) and present bits. Default
 * operand size must be zero when in long mode. In 32-bit compatibility
 * mode all fields are treated as in legacy mode. For data segments
 * while in long mode only the present bit is loaded.
 */
void
set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size,
    uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
{
	ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);
	/* This should never be a "system" segment. */
	ASSERT3U(type & SDT_S, !=, 0);

	/*
	 * 64-bit long mode.
	 */
	if (lmode == SDP_LONG)
		dp->usd_def32 = 0;		/* 32-bit operands only */
	else
		/*
		 * 32-bit compatibility mode.
		 */
		dp->usd_def32 = defopsz;	/* 0 = 16, 1 = 32-bit ops */

	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the GDT whenever we change segment registers around.
	 * With KPTI on, the GDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	ASSERT3U(type & SDT_A, !=, 0);

	dp->usd_long = lmode;	/* 64-bit mode */
	dp->usd_type = type;
	dp->usd_dpl = dpl;
	dp->usd_p = 1;
	dp->usd_gran = gran;		/* 0 = bytes, 1 = pages */

	dp->usd_lobase = (uintptr_t)base;
	dp->usd_midbase = (uintptr_t)base >> 16;
	dp->usd_hibase = (uintptr_t)base >> (16 + 8);
	dp->usd_lolimit = size;
	dp->usd_hilimit = (uintptr_t)size >> 16;
}
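
/*
 * Example (a minimal sketch mirroring init_gdt_common() below): build a
 * 64-bit user code segment descriptor. In long mode the base and limit
 * are ignored by hardware for code segments, so zero values are passed.
 *
 *	user_desc_t ucode;
 *
 *	set_usegd(&ucode, SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
 *	    SDP_PAGES, SDP_OP32);
 */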

/*
 * Install a system segment descriptor for LDT and TSS segments.
 */

void
set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl)
{
	dp->ssd_lolimit = size;
	dp->ssd_hilimit = (uintptr_t)size >> 16;

	dp->ssd_lobase = (uintptr_t)base;
	dp->ssd_midbase = (uintptr_t)base >> 16;
	dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
	dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);

	dp->ssd_type = type;
	dp->ssd_zero1 = 0;	/* must be zero */
	dp->ssd_zero2 = 0;
	dp->ssd_dpl = dpl;
	dp->ssd_p = 1;
	dp->ssd_gran = 0;	/* force byte units */
}

void *
get_ssd_base(system_desc_t *dp)
{
	uintptr_t	base;

	base = (uintptr_t)dp->ssd_lobase |
	    (uintptr_t)dp->ssd_midbase << 16 |
	    (uintptr_t)dp->ssd_hibase << (16 + 8) |
	    (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
	return ((void *)base);
}
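
/*
 * Example (a sketch based on the Kernel TSS setup in init_gdt_common()
 * below): set_syssegd() and get_ssd_base() round-trip the 64-bit base.
 *
 *	system_desc_t tssd;
 *
 *	set_syssegd(&tssd, ktss0, sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);
 *	ASSERT(get_ssd_base(&tssd) == (void *)ktss0);
 */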

/*
 * Install a gate segment descriptor for interrupt, trap, call and task gates.
 *
 * For 64-bit native if we have KPTI enabled, we use the IST stack mechanism on
 * all interrupts.  We have different ISTs for each class of exceptions that are
 * most likely to occur while handling an existing exception; while many of
 * these are just going to panic, it's nice not to trample on the existing
 * exception state for debugging purposes.
 *
 * Normal interrupts are all redirected unconditionally to the KPTI trampoline
 * stack space. This unifies the trampoline handling between user and kernel
 * space (and avoids the need to touch %gs).
 *
 * The KDI IDT entries *all* use the DBG IST: consider single-stepping
 * tr_pftrap, when we do a read from KMDB that causes another #PF.  Without
 * its own IST, this would stomp on the kernel's mcpu_kpti_flt frame.
 */
uint_t
idt_vector_to_ist(uint_t vector)
{
#if defined(__xpv)
	_NOTE(ARGUNUSED(vector));
	return (IST_NONE);
#else
	switch (vector) {
	/* These should always use IST even without KPTI enabled. */
	case T_DBLFLT:
		return (IST_DF);
	case T_NMIFLT:
		return (IST_NMI);
	case T_MCE:
		return (IST_MCE);

	case T_BPTFLT:
	case T_SGLSTP:
		if (kpti_enable == 1) {
			return (IST_DBG);
		}
		return (IST_NONE);
	case T_STKFLT:
	case T_GPFLT:
	case T_PGFLT:
		if (kpti_enable == 1) {
			return (IST_NESTABLE);
		}
		return (IST_NONE);
	default:
		if (kpti_enable == 1) {
			return (IST_DEFAULT);
		}
		return (IST_NONE);
	}
#endif
}

void
set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
    uint_t type, uint_t dpl, uint_t ist)
{
	dp->sgd_looffset = (uintptr_t)func;
	dp->sgd_hioffset = (uintptr_t)func >> 16;
	dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
	dp->sgd_selector = (uint16_t)sel;
	dp->sgd_ist = ist;
	dp->sgd_type = type;
	dp->sgd_dpl = dpl;
	dp->sgd_p = 1;
}
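
/*
 * Example (a sketch following the pattern used throughout
 * init_idt_common() below): install an interrupt gate for a vector,
 * selecting its stack with idt_vector_to_ist(). "handler" stands in for
 * whichever tr_-prefixed or plain handler is chosen on kpti_enable.
 *
 *	set_gatesegd(&idt0[T_PGFLT], handler, KCS_SEL, SDT_SYSIGT,
 *	    TRP_KPL, idt_vector_to_ist(T_PGFLT));
 */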

/*
 * Updates a single user descriptor in the GDT of the current cpu.
 * Caller is responsible for preventing cpu migration.
 */

void
gdt_update_usegd(uint_t sidx, user_desc_t *udp)
{
#if defined(DEBUG)
	/* This should never be a "system" segment, but it might be null. */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_S, !=, 0);
	}
	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the GDT whenever we change segment registers around.
	 * With KPTI on, the GDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_A, !=, 0);
	}
#endif

#if defined(__xpv)
	uint64_t dpa = CPU->cpu_m.mcpu_gdtpa + sizeof (*udp) * sidx;

	if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp))
		panic("gdt_update_usegd: HYPERVISOR_update_descriptor");

#else	/* __xpv */
	CPU->cpu_gdt[sidx] = *udp;
#endif	/* __xpv */
}
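
/*
 * Usage sketch (hedged; modeled on how lwp-private descriptors are
 * managed elsewhere in the kernel): pin to the current CPU before
 * touching its GDT. "new_desc" is a hypothetical, fully-formed user
 * descriptor built with set_usegd().
 *
 *	kpreempt_disable();
 *	gdt_update_usegd(GDT_LWPFS, &new_desc);
 *	kpreempt_enable();
 */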

/*
 * Writes a single descriptor pointed to by udp into a process's
 * LDT entry pointed to by ldp.
 */
int
ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
{
#if defined(DEBUG)
	/* This should never be a "system" segment, but it might be null. */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_S, !=, 0);
	}
	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the LDT whenever we change segment registers around.
	 * With KPTI on, the LDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_A, !=, 0);
	}
#endif

#if defined(__xpv)
	uint64_t dpa;

	dpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)ldp)) |
	    ((uintptr_t)ldp & PAGEOFFSET);

	/*
	 * The hypervisor is a little more restrictive about what it
	 * supports in the LDT.
	 */
	if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp) != 0)
		return (EINVAL);

#else	/* __xpv */
	*ldp = *udp;

#endif	/* __xpv */
	return (0);
}
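
/*
 * Usage sketch (hedged; the real callers live in the LDT management code
 * elsewhere in the kernel): build a user descriptor and copy it into a
 * process's LDT slot, propagating any hypervisor rejection. "proc_ldt"
 * and "slot" are hypothetical names.
 *
 *	user_desc_t nd;
 *
 *	set_usegd(&nd, SDP_SHORT, NULL, -1, SDT_MEMRWA, SEL_UPL,
 *	    SDP_PAGES, SDP_OP32);
 *	if (ldt_update_segd(&proc_ldt[slot], &nd) != 0)
 *		return (EINVAL);
 */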

#if defined(__xpv)

/*
 * Converts a hw format gate descriptor into pseudo-IDT format for the
 * hypervisor. Returns true if a valid entry was written.
 */
int
xen_idt_to_trap_info(uint_t vec, gate_desc_t *sgd, void *ti_arg)
{
	trap_info_t *ti = ti_arg;	/* XXPV	Aargh - segments.h comment */

	/*
	 * skip holes in the IDT
	 */
	if (GATESEG_GETOFFSET(sgd) == 0)
		return (0);

	ASSERT(sgd->sgd_type == SDT_SYSIGT);
	ti->vector = vec;
	TI_SET_DPL(ti, sgd->sgd_dpl);

	/*
	 * Is this an interrupt gate?
	 */
	if (sgd->sgd_type == SDT_SYSIGT) {
		/* LINTED */
		TI_SET_IF(ti, 1);
	}
	ti->cs = sgd->sgd_selector;
#if defined(__amd64)
	ti->cs |= SEL_KPL;	/* force into ring 3. see KCS_SEL */
#endif
	ti->address = GATESEG_GETOFFSET(sgd);
	return (1);
}

/*
 * Convert a single hw format gate descriptor and write it into our virtual
 * IDT.
 */
void
xen_idt_write(gate_desc_t *sgd, uint_t vec)
{
	trap_info_t trapinfo[2];

	bzero(trapinfo, sizeof (trapinfo));
	if (xen_idt_to_trap_info(vec, sgd, &trapinfo[0]) == 0)
		return;
	if (xen_set_trap_table(trapinfo) != 0)
		panic("xen_idt_write: xen_set_trap_table() failed");
}

#endif	/* __xpv */

#if defined(__amd64)

/*
 * Build kernel GDT.
 */

static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * 64-bit kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 64-bit kernel data segment. The limit attribute is ignored in 64-bit
	 * mode, but we set it here to 0xFFFF so that we can use the SYSRET
	 * instruction to return from system calls back to 32-bit applications.
	 * SYSRET doesn't update the base, limit, or attributes of %ss or %ds
	 * descriptors. We therefore must ensure that the descriptor the kernel
	 * uses, though ignored by hardware in 64-bit mode, is compatible with
	 * 32-bit apps. For the same reason we must set the default op size of
	 * this descriptor to 32-bit operands.
	 */
	set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
	    SEL_KPL, SDP_PAGES, SDP_OP32);
	gdt[GDT_KDATA].usd_def32 = 1;

	/*
	 * 64-bit user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 32-bit user code segment.
	 */
	set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, -1, SDT_MEMERA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * See gdt_ucode32() and gdt_ucode_native().
	 */
	ucs_on = ucs_off = gdt[GDT_UCODE];
	ucs_off.usd_p = 0;	/* forces #np fault */

	ucs32_on = ucs32_off = gdt[GDT_U32CODE];
	ucs32_off.usd_p = 0;	/* forces #np fault */

	/*
	 * 32 and 64 bit data segments can actually share the same descriptor.
	 * In long mode only the present bit is checked but all other fields
	 * are loaded. But in compatibility mode all fields are interpreted
	 * as in legacy mode so they must be set correctly for a 32-bit data
	 * segment.
	 */
	set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

#if !defined(__xpv)

	/*
	 * The 64-bit kernel has no default LDT. By default, the LDT descriptor
	 * in the GDT is 0.
	 */

	/*
	 * Kernel TSS
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif	/* !__xpv */

	/*
	 * Initialize fs and gs descriptors for 32 bit processes.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt[i], SDP_SHORT, NULL, -1, SDT_MEMRWA,
		    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize convenient zero base user descriptors for clearing
	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
	 * an example.
	 */
	set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
	set_usegd(&zero_u32desc, SDP_SHORT, 0, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
}

#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */
	ulong_t addr;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT.
	 * On 64-bit, fixup KCS_SEL to be in ring 3.
	 * See KCS_SEL in segments.h.
	 */
	load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 * setup %gs for kernel
	 */
	xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase".  So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	addr = 0x200000000ul;
	xen_set_segment_base(SEGBASE_FS, addr);
	xen_set_segment_base(SEGBASE_GS_USER, addr);
	xen_set_segment_base(SEGBASE_GS_USER_SEL, 0);

	return (gdt0);
}

#else	/* __xpv */

static user_desc_t *
init_gdt(void)
{
	desctbr_t	r_bgdt, r_gdt;
	user_desc_t	*bgdt;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt.
	 * Entry 0 is the null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
	gdt0[GDT_B64CODE] = bgdt[GDT_B64CODE];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 * setup %gs for kernel
	 */
	wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase".  So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	wrmsr(MSR_AMD_FSBASE, 0x200000000ul);
	wrmsr(MSR_AMD_KGSBASE, 0x200000000ul);
	return (gdt0);
}

#endif	/* __xpv */

#elif defined(__i386)

static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * Text and data for both kernel and user span entire 32 bit
	 * address space.
	 */

	/*
	 * kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], NULL, -1, SDT_MEMERA, SEL_KPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * kernel data segment.
	 */
	set_usegd(&gdt[GDT_KDATA], NULL, -1, SDT_MEMRWA, SEL_KPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], NULL, -1, SDT_MEMERA, SEL_UPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * user data segment.
	 */
	set_usegd(&gdt[GDT_UDATA], NULL, -1, SDT_MEMRWA, SEL_UPL, SDP_PAGES,
	    SDP_OP32);

#if !defined(__xpv)

	/*
	 * TSS for T_DBLFLT (double fault) handler
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_DBFLT], dftss0,
	    sizeof (*dftss0) - 1, SDT_SYSTSS, SEL_KPL);

	/*
	 * TSS for kernel
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif	/* !__xpv */

	/*
	 * %gs selector for kernel
	 */
	set_usegd(&gdt[GDT_GS], &cpus[0], sizeof (struct cpu) - 1, SDT_MEMRWA,
	    SEL_KPL, SDP_BYTES, SDP_OP32);

	/*
	 * Initialize lwp private descriptors.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt[i], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
		    SDP_PAGES, SDP_OP32);
	/*
	 * Initialize convenient zero base user descriptor for clearing
	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
	 * an example.
	 */
	set_usegd(&zero_udesc, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
}

#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
		    SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
		    SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(
	    KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	return (gdt0);
}

#else	/* __xpv */

static user_desc_t *
init_gdt(void)
{
	desctbr_t	r_bgdt, r_gdt;
	user_desc_t	*bgdt;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	/*
	 * XXX this allocation belongs in our caller, not here.
	 */
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt entries.
	 * Entry 0 is the null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(
	    KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	return (gdt0);
}

#endif	/* __xpv */
#endif	/* __i386 */

/*
 * Build kernel IDT.
 *
 * Note that for amd64 we pretty much require every gate to be an interrupt
 * gate which blocks interrupts atomically on entry; that's because of our
 * dependency on using 'swapgs' every time we come into the kernel to find
 * the cpu structure. If we get interrupted just before doing that, %cs could
 * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
 * %gsbase is really still pointing at something in userland. Bad things will
 * ensue. We use interrupt gates on i386 as well, even though this is not
 * required for some traps.
 *
 * Perhaps they should have invented a trap gate that does an atomic swapgs?
 */
static void
init_idt_common(gate_desc_t *idt)
{
	set_gatesegd(&idt[T_ZERODIV],
	    (kpti_enable == 1) ? &tr_div0trap : &div0trap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
	set_gatesegd(&idt[T_SGLSTP],
	    (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP));
	set_gatesegd(&idt[T_NMIFLT],
	    (kpti_enable == 1) ? &tr_nmiint : &nmiint,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT));
	set_gatesegd(&idt[T_BPTFLT],
	    (kpti_enable == 1) ? &tr_brktrap : &brktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT));
	set_gatesegd(&idt[T_OVFLW],
	    (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW));
	set_gatesegd(&idt[T_BOUNDFLT],
	    (kpti_enable == 1) ? &tr_boundstrap : &boundstrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT));
	set_gatesegd(&idt[T_ILLINST],
	    (kpti_enable == 1) ? &tr_invoptrap : &invoptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST));
	set_gatesegd(&idt[T_NOEXTFLT],
	    (kpti_enable == 1) ? &tr_ndptrap : &ndptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT));

	/*
	 * double fault handler.
	 *
	 * Note that on the hypervisor a guest does not receive #df faults.
	 * Instead a failsafe event is injected into the guest if its selectors
	 * and/or stack is in a broken state. See xen_failsafe_callback.
	 */
#if !defined(__xpv)
	set_gatesegd(&idt[T_DBLFLT],
	    (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT));
#endif	/* !__xpv */

	/*
	 * T_EXTOVRFLT coprocessor-segment-overrun not supported.
	 */
	set_gatesegd(&idt[T_TSSFLT],
	    (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT));
	set_gatesegd(&idt[T_SEGFLT],
	    (kpti_enable == 1) ? &tr_segnptrap : &segnptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT));
	set_gatesegd(&idt[T_STKFLT],
	    (kpti_enable == 1) ? &tr_stktrap : &stktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT));
	set_gatesegd(&idt[T_GPFLT],
	    (kpti_enable == 1) ? &tr_gptrap : &gptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT));
	set_gatesegd(&idt[T_PGFLT],
	    (kpti_enable == 1) ? &tr_pftrap : &pftrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
	set_gatesegd(&idt[T_EXTERRFLT],
	    (kpti_enable == 1) ? &tr_ndperr : &ndperr,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT));
	set_gatesegd(&idt[T_ALIGNMENT],
	    (kpti_enable == 1) ? &tr_achktrap : &achktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT));
	set_gatesegd(&idt[T_MCE],
	    (kpti_enable == 1) ? &tr_mcetrap : &mcetrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE));
	set_gatesegd(&idt[T_SIMDFPE],
	    (kpti_enable == 1) ? &tr_xmtrap : &xmtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE));

	/*
	 * install fast trap handler at 210.
	 */
	set_gatesegd(&idt[T_FASTTRAP],
	    (kpti_enable == 1) ? &tr_fasttrap : &fasttrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP));

	/*
	 * System call handler.
	 */
	set_gatesegd(&idt[T_SYSCALLINT],
	    (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT));

	/*
	 * Install the DTrace interrupt handler for the pid provider.
	 */
	set_gatesegd(&idt[T_DTRACE_RET],
	    (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET));

	/*
	 * Prepare the interposing descriptor for the syscall handler
	 * and cache a copy of the default descriptor.
	 */
	brand_tbl[0].ih_inum = T_SYSCALLINT;
	brand_tbl[0].ih_default_desc = idt[T_SYSCALLINT];

	set_gatesegd(&(brand_tbl[0].ih_interp_desc),
	    (kpti_enable == 1) ? &tr_brand_sys_syscall_int :
	    &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL,
	    idt_vector_to_ist(T_SYSCALLINT));

	brand_tbl[1].ih_inum = 0;
}

#if defined(__xpv)

static void
init_idt(gate_desc_t *idt)
{
	init_idt_common(idt);
}

#else	/* __xpv */

static void
init_idt(gate_desc_t *idt)
{
	char	ivctname[80];
	void	(*ivctptr)(void);
	int	i;

	/*
	 * Initialize the entire table with the 'reserved' trap and then
	 * overwrite specific entries. T_EXTOVRFLT (9) is unsupported and
	 * reserved since it can only be generated on a 386 processor. 15
	 * is also unsupported and reserved.
	 */
#if !defined(__xpv)
	for (i = 0; i < NIDT; i++) {
		set_gatesegd(&idt[i],
		    (kpti_enable == 1) ? &tr_resvtrap : &resvtrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(T_RESVTRAP));
	}
#else
	for (i = 0; i < NIDT; i++) {
		set_gatesegd(&idt[i], &resvtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    IST_NONE);
	}
#endif

	/*
	 * 20-31 reserved
	 */
#if !defined(__xpv)
	for (i = 20; i < 32; i++) {
		set_gatesegd(&idt[i],
		    (kpti_enable == 1) ? &tr_invaltrap : &invaltrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(T_INVALTRAP));
	}
#else
	for (i = 20; i < 32; i++) {
		set_gatesegd(&idt[i], &invaltrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    IST_NONE);
	}
#endif

	/*
	 * interrupts 32 - 255
	 */
	for (i = 32; i < 256; i++) {
#if !defined(__xpv)
		(void) snprintf(ivctname, sizeof (ivctname),
		    (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i);
#else
		(void) snprintf(ivctname, sizeof (ivctname), "ivct%d", i);
#endif
		ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
		if (ivctptr == NULL)
			panic("kobj_getsymvalue(%s) failed", ivctname);

		set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(i));
	}

	/*
	 * Now install the common ones. Note that it will overlay some
	 * entries installed above like T_SYSCALLINT, T_FASTTRAP etc.
	 */
	init_idt_common(idt);
}

#endif	/* __xpv */

/*
 * The kernel does not deal with LDTs unless a user explicitly creates
 * one. Under normal circumstances, the LDTR contains 0. Any process attempting
 * to reference the LDT will therefore cause a #gp. System calls made via the
 * obsolete lcall mechanism are emulated by the #gp fault handler.
 */
static void
init_ldt(void)
{
#if defined(__xpv)
	xen_set_ldt(NULL, 0);
#else
	wr_ldtr(0);
#endif
}

#if !defined(__xpv)

static void
init_tss(void)
{
	extern struct cpu cpus[];

	/*
	 * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each
	 * context switch but it'll be overwritten with this same value anyway.
	 */
	if (kpti_enable == 1) {
		ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
	}

	/* Set up the IST stacks for double fault, NMI, MCE. */
	ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)];
	ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)];
	ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)];

	/*
	 * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is
	 * enabled), and also for KDI (always).
	 */
	ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;

	if (kpti_enable == 1) {
		/* This IST stack is used for #GP,#PF,#SS (fault) interrupts. */
		ktss0->tss_ist5 =
		    (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp;

		/* This IST stack is used for all other intrs (for KPTI). */
		ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
	}

	/*
	 * Set the I/O bit map offset equal to the size of the TSS segment
	 * limit for no I/O permission map. This will force all user I/O
	 * instructions to generate a #gp fault.
	 */
	ktss0->tss_bitmapbase = sizeof (*ktss0);

	/*
	 * Point %tr to the descriptor for ktss0 in the gdt.
	 */
	wr_tsr(KTSS_SEL);
}

#endif	/* !__xpv */

#if defined(__xpv)

void
init_desctbls(void)
{
	uint_t vec;
	user_desc_t *gdt;

	/*
	 * Setup and install our GDT.
	 */
	gdt = init_gdt();

	/*
	 * Store the static pa of the gdt to speed up pa_to_ma() translations
	 * on lwp context switches.
	 */
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;
	CPU->cpu_m.mcpu_gdtpa = pfn_to_pa(va_to_pfn(gdt));

	/*
	 * Setup and install our IDT.
	 */
#if !defined(__lint)
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
	init_idt(idt0);
	for (vec = 0; vec < NIDT; vec++)
		xen_idt_write(&idt0[vec], vec);

	CPU->cpu_idt = idt0;

	/*
	 * set default kernel stack
	 */
	xen_stack_switch(KDS_SEL,
	    (ulong_t)&dblfault_stack0[sizeof (dblfault_stack0)]);

	xen_init_callbacks();

	init_ldt();
}

#else	/* __xpv */

void
init_desctbls(void)
{
	user_desc_t *gdt;
	desctbr_t idtr;

	/*
	 * Allocate IDT and TSS structures on unique pages for better
	 * performance in virtual machines.
	 */
#if !defined(__lint)
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
#if !defined(__lint)
	ASSERT(sizeof (*ktss0) <= PAGESIZE);
#endif
	ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(ktss0, PAGESIZE);

#if defined(__i386)
#if !defined(__lint)
	ASSERT(sizeof (*dftss0) <= PAGESIZE);
#endif
	dftss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)DFTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(dftss0, PAGESIZE);
#endif

	/*
	 * Setup and install our GDT.
	 */
	gdt = init_gdt();
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;

	/*
	 * Initialize this CPU's LDT.
	 */
	CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA,
	    LDT_CPU_SIZE, PAGESIZE);
	bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
	CPU->cpu_m.mcpu_ldt_len = 0;

	/*
	 * Setup and install our IDT.
	 */
	init_idt(idt0);

	idtr.dtr_base = (uintptr_t)idt0;
	idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
	wr_idtr(&idtr);
	CPU->cpu_idt = idt0;

#if defined(__i386)
	/*
	 * We maintain a description of idt0 in convenient IDTR format
	 * for #pf's on some older pentium processors. See pentium_pftrap().
	 */
	idt0_default_r = idtr;
#endif	/* __i386 */

	init_tss();
	CPU->cpu_tss = ktss0;
	init_ldt();

	/* Stash this so that the NMI, MCE, #DF and KDI handlers can use it. */
	kpti_safe_cr3 = (uint64_t)getcr3();
}

#endif	/* __xpv */

/*
 * In the early kernel, we need to set up a simple GDT to run on.
 *
 * XXPV	Can dboot use this too?  See dboot_gdt.s
 */
void
init_boot_gdt(user_desc_t *bgdt)
{
#if defined(__amd64)
	set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
#elif defined(__i386)
	set_usegd(&bgdt[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
#endif	/* __i386 */
}

/*
 * Enable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the branded entry points.
 */
void
brand_interpositioning_enable(void)
{
	gate_desc_t	*idt = CPU->cpu_idt;
	int		i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc;
#if defined(__xpv)
		xen_idt_write(&idt[brand_tbl[i].ih_inum],
		    brand_tbl[i].ih_inum);
#endif
	}

#if defined(__amd64)
#if defined(__xpv)

	/*
	 * Currently the hypervisor only supports 64-bit syscalls via the
	 * syscall instruction. The 32-bit syscalls are handled by the
	 * interrupt gate above.
	 */
	xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32);
		} else {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
		}
	}

#endif
#endif	/* __amd64 */

	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_INTC_SEP_EIP,
			    (uintptr_t)tr_brand_sys_sysenter);
		} else {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
		}
	}
}

/*
 * Disable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the standard entry points, which bypass the interpositioning hooks.
 */
void
brand_interpositioning_disable(void)
{
	gate_desc_t	*idt = CPU->cpu_idt;
	int		i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc;
#if defined(__xpv)
		xen_idt_write(&idt[brand_tbl[i].ih_inum],
		    brand_tbl[i].ih_inum);
#endif
	}

#if defined(__amd64)
#if defined(__xpv)

	/*
	 * See the comment above in brand_interpositioning_enable.
	 */
	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32);
		} else {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
		}
	}

#endif
#endif	/* __amd64 */

	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter);
		} else {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
		}
	}
}