/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright 2018 Joyent, Inc. All rights reserved.
 */

/*
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/tss.h>
#include <sys/segments.h>
#include <sys/trap.h>
#include <sys/cpuvar.h>
#include <sys/bootconf.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kobj.h>
#include <sys/cmn_err.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/mach_mmu.h>
#include <sys/systm.h>
#include <sys/note.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#include <vm/as.h>
#endif

#include <sys/promif.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>
#include <vm/hat_pte.h>

/*
 * cpu0 and default tables and structures.
 */
user_desc_t	*gdt0;
#if !defined(__xpv)
desctbr_t	gdt0_default_r;
#endif

gate_desc_t	*idt0;			/* interrupt descriptor table */
#if defined(__i386)
desctbr_t	idt0_default_r;		/* describes idt0 in IDTR format */
#endif

tss_t		*ktss0;			/* kernel task state structure */

#if defined(__i386)
tss_t		*dftss0;		/* #DF double-fault exception */
#endif	/* __i386 */

user_desc_t	zero_udesc;		/* base zero user desc native procs */
user_desc_t	null_udesc;		/* null user descriptor */
system_desc_t	null_sdesc;		/* null system descriptor */

#if defined(__amd64)
user_desc_t	zero_u32desc;		/* 32-bit compatibility procs */
#endif	/* __amd64 */

#if defined(__amd64)
user_desc_t	ucs_on;
user_desc_t	ucs_off;
user_desc_t	ucs32_on;
user_desc_t	ucs32_off;
#endif	/* __amd64 */

/*
 * If the size of this is changed, you must update hat_pcp_setup() and the
 * definitions in exception.s
 */
extern char dblfault_stack0[DEFAULTSTKSZ];
extern char nmi_stack0[DEFAULTSTKSZ];
extern char mce_stack0[DEFAULTSTKSZ];

extern void	fast_null(void);
extern hrtime_t	get_hrtime(void);
extern hrtime_t	gethrvtime(void);
extern hrtime_t	get_hrestime(void);
extern uint64_t	getlgrp(void);

void (*(fasttable[]))(void) = {
	fast_null,			/* T_FNULL routine */
	fast_null,			/* T_FGETFP routine (initially null) */
	fast_null,			/* T_FSETFP routine (initially null) */
	(void (*)())get_hrtime,		/* T_GETHRTIME */
	(void (*)())gethrvtime,		/* T_GETHRVTIME */
	(void (*)())get_hrestime,	/* T_GETHRESTIME */
	(void (*)())getlgrp		/* T_GETLGRP */
};
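
/*
 * Illustrative note: the gate installed at T_FASTTRAP by init_idt_common()
 * below dispatches fast traps through this table, indexed by the fast trap
 * number (T_FNULL, T_GETHRTIME, etc.) requested by userland.
 */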

/*
 * Structure containing pre-computed descriptors to allow us to temporarily
 * interpose on a standard handler.
 */
struct interposing_handler {
	int ih_inum;
	gate_desc_t ih_interp_desc;
	gate_desc_t ih_default_desc;
};

/*
 * The brand infrastructure interposes on two handlers, and we use one as a
 * NULL signpost.
 */
static struct interposing_handler brand_tbl[2];

/*
 * software prototypes for default local descriptor table
 */

/*
 * Routines for loading segment descriptors in a format the hardware
 * can understand.
 */

#if defined(__amd64)

/*
 * In long mode we have the new L or long mode attribute bit
 * for code segments. Only the conforming bit in the type field is used,
 * along with the descriptor privilege level (DPL) and present bits.
 * The default operand size must be zero when in long mode. In 32-bit
 * compatibility mode all fields are treated as in legacy mode. For data
 * segments, while in long mode only the present bit is loaded.
 */
void
set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size,
    uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
{
	ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);

	/*
	 * 64-bit long mode.
	 */
	if (lmode == SDP_LONG)
		dp->usd_def32 = 0;		/* 32-bit operands only */
	else
		/*
		 * 32-bit compatibility mode.
		 */
		dp->usd_def32 = defopsz;	/* 0 = 16, 1 = 32-bit ops */

	dp->usd_long = lmode;	/* 64-bit mode */
	dp->usd_type = type;
	dp->usd_dpl = dpl;
	dp->usd_p = 1;
	dp->usd_gran = gran;		/* 0 = bytes, 1 = pages */

	dp->usd_lobase = (uintptr_t)base;
	dp->usd_midbase = (uintptr_t)base >> 16;
	dp->usd_hibase = (uintptr_t)base >> (16 + 8);
	dp->usd_lolimit = size;
	dp->usd_hilimit = (uintptr_t)size >> 16;
}
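
/*
 * Illustrative only: the 64-bit user code segment descriptor built for
 * GDT_UCODE in init_gdt_common() below is constructed as:
 *
 *	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
 *	    SDP_PAGES, SDP_OP32);
 */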

#elif defined(__i386)

/*
 * Install user segment descriptor for code and data.
 */
void
set_usegd(user_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl, uint_t gran, uint_t defopsz)
{
	dp->usd_lolimit = size;
	dp->usd_hilimit = (uintptr_t)size >> 16;

	dp->usd_lobase = (uintptr_t)base;
	dp->usd_midbase = (uintptr_t)base >> 16;
	dp->usd_hibase = (uintptr_t)base >> (16 + 8);

	dp->usd_type = type;
	dp->usd_dpl = dpl;
	dp->usd_p = 1;
	dp->usd_def32 = defopsz;	/* 0 = 16, 1 = 32 bit operands */
	dp->usd_gran = gran;		/* 0 = bytes, 1 = pages */
}

#endif	/* __i386 */

/*
 * Install system segment descriptor for LDT and TSS segments.
 */

#if defined(__amd64)

void
set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl)
{
	dp->ssd_lolimit = size;
	dp->ssd_hilimit = (uintptr_t)size >> 16;

	dp->ssd_lobase = (uintptr_t)base;
	dp->ssd_midbase = (uintptr_t)base >> 16;
	dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
	dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);

	dp->ssd_type = type;
	dp->ssd_zero1 = 0;	/* must be zero */
	dp->ssd_zero2 = 0;
	dp->ssd_dpl = dpl;
	dp->ssd_p = 1;
	dp->ssd_gran = 0;	/* force byte units */
}

void *
get_ssd_base(system_desc_t *dp)
{
	uintptr_t	base;

	base = (uintptr_t)dp->ssd_lobase |
	    (uintptr_t)dp->ssd_midbase << 16 |
	    (uintptr_t)dp->ssd_hibase << (16 + 8) |
	    (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
	return ((void *)base);
}
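
/*
 * Illustrative only: the kernel TSS descriptor is built in
 * init_gdt_common() below with
 *
 *	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
 *	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);
 *
 * after which get_ssd_base() on that descriptor yields ktss0 again.
 */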

#elif defined(__i386)

void
set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl)
{
	dp->ssd_lolimit = size;
	dp->ssd_hilimit = (uintptr_t)size >> 16;

	dp->ssd_lobase = (uintptr_t)base;
	dp->ssd_midbase = (uintptr_t)base >> 16;
	dp->ssd_hibase = (uintptr_t)base >> (16 + 8);

	dp->ssd_type = type;
	dp->ssd_zero = 0;	/* must be zero */
	dp->ssd_dpl = dpl;
	dp->ssd_p = 1;
	dp->ssd_gran = 0;	/* force byte units */
}

void *
get_ssd_base(system_desc_t *dp)
{
	uintptr_t	base;

	base = (uintptr_t)dp->ssd_lobase |
	    (uintptr_t)dp->ssd_midbase << 16 |
	    (uintptr_t)dp->ssd_hibase << (16 + 8);
	return ((void *)base);
}

#endif	/* __i386 */

/*
 * Install gate segment descriptor for interrupt, trap, call and task gates.
 *
 * For 64-bit native, if we have KPTI enabled, we use the IST stack mechanism
 * on all interrupts.  We have different ISTs for each class of exceptions
 * that are most likely to occur while handling an existing exception; while
 * many of these are just going to panic, it's nice not to trample on the
 * existing exception state for debugging purposes.
 *
 * Normal interrupts are all redirected unconditionally to the KPTI trampoline
 * stack space. This unifies the trampoline handling between user and kernel
 * space (and avoids the need to touch %gs).
 *
 * The KDI IDT uses the DBG IST for *all* vectors: consider single-stepping
 * tr_pftrap, when we do a read from KMDB that causes another #PF.  Without
 * its own IST, this would stomp on the kernel's mcpu_kpti_flt frame.
 */
uint_t
idt_vector_to_ist(uint_t vector)
{
#if defined(__xpv)
	_NOTE(ARGUNUSED(vector));
	return (IST_NONE);
#else
	switch (vector) {
	/* These should always use IST even without KPTI enabled. */
	case T_DBLFLT:
		return (IST_DF);
	case T_NMIFLT:
		return (IST_NMI);
	case T_MCE:
		return (IST_MCE);

	case T_BPTFLT:
	case T_SGLSTP:
		if (kpti_enable == 1) {
			return (IST_DBG);
		}
		return (IST_NONE);
	case T_STKFLT:
	case T_GPFLT:
	case T_PGFLT:
		if (kpti_enable == 1) {
			return (IST_NESTABLE);
		}
		return (IST_NONE);
	default:
		if (kpti_enable == 1) {
			return (IST_DEFAULT);
		}
		return (IST_NONE);
	}
#endif
}

void
set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
    uint_t type, uint_t dpl, uint_t ist)
{
	dp->sgd_looffset = (uintptr_t)func;
	dp->sgd_hioffset = (uintptr_t)func >> 16;
	dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
	dp->sgd_selector = (uint16_t)sel;
	dp->sgd_ist = ist;
	dp->sgd_type = type;
	dp->sgd_dpl = dpl;
	dp->sgd_p = 1;
}
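
/*
 * Illustrative only: init_idt_common() below installs the page fault
 * gate with
 *
 *	set_gatesegd(&idt[T_PGFLT],
 *	    (kpti_enable == 1) ? &tr_pftrap : &pftrap,
 *	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
 */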

/*
 * Updates a single user descriptor in the GDT of the current cpu.
 * Caller is responsible for preventing cpu migration.
 */

void
gdt_update_usegd(uint_t sidx, user_desc_t *udp)
{
#if defined(__xpv)

	uint64_t dpa = CPU->cpu_m.mcpu_gdtpa + sizeof (*udp) * sidx;

	if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp))
		panic("gdt_update_usegd: HYPERVISOR_update_descriptor");

#else	/* __xpv */

	CPU->cpu_gdt[sidx] = *udp;

#endif	/* __xpv */
}
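
/*
 * Illustrative only (udesc is hypothetical): a caller would typically
 * bracket the update with kpreempt_disable()/kpreempt_enable() to satisfy
 * the no-migration requirement:
 *
 *	kpreempt_disable();
 *	gdt_update_usegd(GDT_LWPFS, &udesc);
 *	kpreempt_enable();
 */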

/*
 * Writes the single descriptor pointed to by udp into the process's
 * LDT entry pointed to by ldp.
 */
int
ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
{
#if defined(__xpv)

	uint64_t dpa;

	dpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)ldp)) |
	    ((uintptr_t)ldp & PAGEOFFSET);

	/*
	 * The hypervisor is a little more restrictive about what it
	 * supports in the LDT.
	 */
	if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp) != 0)
		return (EINVAL);

#else	/* __xpv */

	*ldp = *udp;

#endif	/* __xpv */
	return (0);
}

#if defined(__xpv)

/*
 * Converts hw format gate descriptor into pseudo-IDT format for the hypervisor.
 * Returns true if a valid entry was written.
 */
int
xen_idt_to_trap_info(uint_t vec, gate_desc_t *sgd, void *ti_arg)
{
	trap_info_t *ti = ti_arg;	/* XXPV	Aargh - segments.h comment */

	/*
	 * skip holes in the IDT
	 */
	if (GATESEG_GETOFFSET(sgd) == 0)
		return (0);

	ASSERT(sgd->sgd_type == SDT_SYSIGT);
	ti->vector = vec;
	TI_SET_DPL(ti, sgd->sgd_dpl);

	/*
	 * Is this an interrupt gate?
	 */
	if (sgd->sgd_type == SDT_SYSIGT) {
		/* LINTED */
		TI_SET_IF(ti, 1);
	}
	ti->cs = sgd->sgd_selector;
#if defined(__amd64)
	ti->cs |= SEL_KPL;	/* force into ring 3. see KCS_SEL  */
#endif
	ti->address = GATESEG_GETOFFSET(sgd);
	return (1);
}

/*
 * Convert a single hw format gate descriptor and write it into our virtual IDT.
 */
void
xen_idt_write(gate_desc_t *sgd, uint_t vec)
{
	trap_info_t trapinfo[2];

	bzero(trapinfo, sizeof (trapinfo));
	if (xen_idt_to_trap_info(vec, sgd, &trapinfo[0]) == 0)
		return;
	if (xen_set_trap_table(trapinfo) != 0)
		panic("xen_idt_write: xen_set_trap_table() failed");
}

#endif	/* __xpv */

#if defined(__amd64)

/*
 * Build kernel GDT.
 */

static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * 64-bit kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 64-bit kernel data segment. The limit attribute is ignored in 64-bit
	 * mode, but we set it here to 0xFFFF so that we can use the SYSRET
	 * instruction to return from system calls back to 32-bit applications.
	 * SYSRET doesn't update the base, limit, or attributes of %ss or %ds
	 * descriptors. We therefore must ensure that the kernel uses something,
	 * though it will be ignored by hardware, that is compatible with 32-bit
	 * apps. For the same reason we must set the default op size of this
	 * descriptor to 32-bit operands.
	 */
	set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
	    SEL_KPL, SDP_PAGES, SDP_OP32);
	gdt[GDT_KDATA].usd_def32 = 1;

	/*
	 * 64-bit user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 32-bit user code segment.
	 */
	set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, -1, SDT_MEMERA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * See gdt_ucode32() and gdt_ucode_native().
	 */
	ucs_on = ucs_off = gdt[GDT_UCODE];
	ucs_off.usd_p = 0;	/* forces #np fault */

	ucs32_on = ucs32_off = gdt[GDT_U32CODE];
	ucs32_off.usd_p = 0;	/* forces #np fault */

	/*
	 * 32 and 64 bit data segments can actually share the same descriptor.
	 * In long mode only the present bit is checked but all other fields
	 * are loaded. But in compatibility mode all fields are interpreted
	 * as in legacy mode so they must be set correctly for a 32-bit data
	 * segment.
	 */
	set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

#if !defined(__xpv)

	/*
	 * The 64-bit kernel has no default LDT. By default, the LDT descriptor
	 * in the GDT is 0.
	 */

	/*
	 * Kernel TSS
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif	/* !__xpv */

	/*
	 * Initialize fs and gs descriptors for 32 bit processes.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt0[i], SDP_SHORT, NULL, -1, SDT_MEMRWA,
		    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize convenient zero base user descriptors for clearing
	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
	 * an example.
	 */
	set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
	set_usegd(&zero_u32desc, SDP_SHORT, 0, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
}

#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */
	ulong_t addr;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT.
	 * On 64-bit, fixup KCS_SEL to be in ring 3.
	 * See KCS_SEL in segments.h.
	 */
	load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 *  setup %gs for kernel
	 */
	xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase".  So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	addr = 0x200000000ul;
	xen_set_segment_base(SEGBASE_FS, addr);
	xen_set_segment_base(SEGBASE_GS_USER, addr);
	xen_set_segment_base(SEGBASE_GS_USER_SEL, 0);

	return (gdt0);
}

#else	/* __xpv */

static user_desc_t *
init_gdt(void)
{
	desctbr_t	r_bgdt, r_gdt;
	user_desc_t	*bgdt;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt.
	 * Entry 0 is the null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
	gdt0[GDT_B64CODE] = bgdt[GDT_B64CODE];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 *  setup %gs for kernel
	 */
	wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase".  So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	wrmsr(MSR_AMD_FSBASE, 0x200000000ul);
	wrmsr(MSR_AMD_KGSBASE, 0x200000000ul);
	return (gdt0);
}

#endif	/* __xpv */

#elif defined(__i386)

static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * Text and data for both kernel and user span entire 32 bit
	 * address space.
	 */

	/*
	 * kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], NULL, -1, SDT_MEMERA, SEL_KPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * kernel data segment.
	 */
	set_usegd(&gdt[GDT_KDATA], NULL, -1, SDT_MEMRWA, SEL_KPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], NULL, -1, SDT_MEMERA, SEL_UPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * user data segment.
	 */
	set_usegd(&gdt[GDT_UDATA], NULL, -1, SDT_MEMRWA, SEL_UPL, SDP_PAGES,
	    SDP_OP32);

#if !defined(__xpv)

	/*
	 * TSS for T_DBLFLT (double fault) handler
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_DBFLT], dftss0,
	    sizeof (*dftss0) - 1, SDT_SYSTSS, SEL_KPL);

	/*
	 * TSS for kernel
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif	/* !__xpv */

	/*
	 * %gs selector for kernel
	 */
	set_usegd(&gdt[GDT_GS], &cpus[0], sizeof (struct cpu) - 1, SDT_MEMRWA,
	    SEL_KPL, SDP_BYTES, SDP_OP32);

	/*
	 * Initialize lwp private descriptors.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt0[i], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
		    SDP_PAGES, SDP_OP32);

	/*
	 * Initialize convenient zero base user descriptor for clearing
	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
	 * an example.
	 */
	set_usegd(&zero_udesc, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
}

#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
		    SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
		    SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(
	    KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	return (gdt0);
}

#else	/* __xpv */

static user_desc_t *
init_gdt(void)
{
	desctbr_t	r_bgdt, r_gdt;
	user_desc_t	*bgdt;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	/*
	 * XXX this allocation belongs in our caller, not here.
	 */
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt entries.
	 * Entry 0 is the null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(
	    KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	return (gdt0);
}

#endif	/* __xpv */
#endif	/* __i386 */

/*
 * Build kernel IDT.
 *
 * Note that for amd64 we pretty much require every gate to be an interrupt
 * gate which blocks interrupts atomically on entry; that's because of our
 * dependency on using 'swapgs' every time we come into the kernel to find
 * the cpu structure. If we get interrupted just before doing that, %cs could
 * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
 * %gsbase is really still pointing at something in userland. Bad things will
 * ensue. We also use interrupt gates for i386 even though they are not
 * required for some traps.
 *
 * Perhaps they should have invented a trap gate that does an atomic swapgs?
 */
static void
init_idt_common(gate_desc_t *idt)
{
	set_gatesegd(&idt[T_ZERODIV],
	    (kpti_enable == 1) ? &tr_div0trap : &div0trap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
	set_gatesegd(&idt[T_SGLSTP],
	    (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP));
	set_gatesegd(&idt[T_NMIFLT],
	    (kpti_enable == 1) ? &tr_nmiint : &nmiint,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT));
	set_gatesegd(&idt[T_BPTFLT],
	    (kpti_enable == 1) ? &tr_brktrap : &brktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT));
	set_gatesegd(&idt[T_OVFLW],
	    (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW));
	set_gatesegd(&idt[T_BOUNDFLT],
	    (kpti_enable == 1) ? &tr_boundstrap : &boundstrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT));
	set_gatesegd(&idt[T_ILLINST],
	    (kpti_enable == 1) ? &tr_invoptrap : &invoptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST));
	set_gatesegd(&idt[T_NOEXTFLT],
	    (kpti_enable == 1) ? &tr_ndptrap : &ndptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT));

	/*
	 * double fault handler.
	 *
	 * Note that on the hypervisor a guest does not receive #df faults.
	 * Instead a failsafe event is injected into the guest if its selectors
	 * and/or stack is in a broken state. See xen_failsafe_callback.
	 */
#if !defined(__xpv)
	set_gatesegd(&idt[T_DBLFLT],
	    (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT));
#endif	/* !__xpv */

	/*
	 * T_EXTOVRFLT coprocessor-segment-overrun not supported.
	 */
	set_gatesegd(&idt[T_TSSFLT],
	    (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT));
	set_gatesegd(&idt[T_SEGFLT],
	    (kpti_enable == 1) ? &tr_segnptrap : &segnptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT));
	set_gatesegd(&idt[T_STKFLT],
	    (kpti_enable == 1) ? &tr_stktrap : &stktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT));
	set_gatesegd(&idt[T_GPFLT],
	    (kpti_enable == 1) ? &tr_gptrap : &gptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT));
	set_gatesegd(&idt[T_PGFLT],
	    (kpti_enable == 1) ? &tr_pftrap : &pftrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
	set_gatesegd(&idt[T_EXTERRFLT],
	    (kpti_enable == 1) ? &tr_ndperr : &ndperr,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT));
	set_gatesegd(&idt[T_ALIGNMENT],
	    (kpti_enable == 1) ? &tr_achktrap : &achktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT));
	set_gatesegd(&idt[T_MCE],
	    (kpti_enable == 1) ? &tr_mcetrap : &mcetrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE));
	set_gatesegd(&idt[T_SIMDFPE],
	    (kpti_enable == 1) ? &tr_xmtrap : &xmtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE));

	/*
	 * install fast trap handler at 210.
	 */
	set_gatesegd(&idt[T_FASTTRAP],
	    (kpti_enable == 1) ? &tr_fasttrap : &fasttrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP));

	/*
	 * System call handler.
	 */
	set_gatesegd(&idt[T_SYSCALLINT],
	    (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT));

	/*
	 * Install the DTrace interrupt handler for the pid provider.
	 */
	set_gatesegd(&idt[T_DTRACE_RET],
	    (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET));

	/*
	 * Prepare interposing descriptor for the syscall handler
	 * and cache copy of the default descriptor.
	 */
	brand_tbl[0].ih_inum = T_SYSCALLINT;
	brand_tbl[0].ih_default_desc = idt0[T_SYSCALLINT];

	set_gatesegd(&(brand_tbl[0].ih_interp_desc),
	    (kpti_enable == 1) ? &tr_brand_sys_syscall_int :
	    &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL,
	    idt_vector_to_ist(T_SYSCALLINT));

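	/*
	 * Terminate the table: brand_interpositioning_{enable,disable}()
	 * below iterate over brand_tbl until they reach an entry whose
	 * ih_inum is 0.
	 */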
	brand_tbl[1].ih_inum = 0;
}

#if defined(__xpv)

static void
init_idt(gate_desc_t *idt)
{
	init_idt_common(idt);
}

#else	/* __xpv */

static void
init_idt(gate_desc_t *idt)
{
	char	ivctname[80];
	void	(*ivctptr)(void);
	int	i;

	/*
	 * Initialize entire table with 'reserved' trap and then overwrite
	 * specific entries. T_EXTOVRFLT (9) is unsupported and reserved
	 * since it can only be generated on a 386 processor. 15 is also
	 * unsupported and reserved.
	 */
#if !defined(__xpv)
	for (i = 0; i < NIDT; i++) {
		set_gatesegd(&idt[i],
		    (kpti_enable == 1) ? &tr_resvtrap : &resvtrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(T_RESVTRAP));
	}
#else
	for (i = 0; i < NIDT; i++) {
		set_gatesegd(&idt[i], &resvtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    IST_NONE);
	}
#endif

	/*
	 * 20-31 reserved
	 */
#if !defined(__xpv)
	for (i = 20; i < 32; i++) {
		set_gatesegd(&idt[i],
		    (kpti_enable == 1) ? &tr_invaltrap : &invaltrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(T_INVALTRAP));
	}
#else
	for (i = 20; i < 32; i++) {
		set_gatesegd(&idt[i], &invaltrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    IST_NONE);
	}
#endif

	/*
	 * interrupts 32 - 255
	 */
	for (i = 32; i < 256; i++) {
#if !defined(__xpv)
		(void) snprintf(ivctname, sizeof (ivctname),
		    (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i);
#else
		(void) snprintf(ivctname, sizeof (ivctname), "ivct%d", i);
#endif
		ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
		if (ivctptr == NULL)
			panic("kobj_getsymvalue(%s) failed", ivctname);

		set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(i));
	}

	/*
	 * Now install the common ones. Note that it will overlay some
	 * entries installed above like T_SYSCALLINT, T_FASTTRAP etc.
	 */
	init_idt_common(idt);
}

#endif	/* __xpv */

/*
 * The kernel does not deal with LDTs unless a user explicitly creates
 * one. Under normal circumstances, the LDTR contains 0. Any process attempting
 * to reference the LDT will therefore cause a #gp. System calls made via the
 * obsolete lcall mechanism are emulated by the #gp fault handler.
 */
static void
init_ldt(void)
{
#if defined(__xpv)
	xen_set_ldt(NULL, 0);
#else
	wr_ldtr(0);
#endif
}

#if !defined(__xpv)

static void
init_tss(void)
{
	extern struct cpu cpus[];

	/*
	 * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each
	 * context switch but it'll be overwritten with this same value anyway.
	 */
	if (kpti_enable == 1) {
		ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
	}

	/* Set up the IST stacks for double fault, NMI, MCE. */
	ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)];
	ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)];
	ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)];

	/*
	 * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is
	 * enabled), and also for KDI (always).
	 */
	ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;

	if (kpti_enable == 1) {
		/* This IST stack is used for #GP,#PF,#SS (fault) interrupts. */
		ktss0->tss_ist5 =
		    (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp;

		/* This IST stack is used for all other intrs (for KPTI). */
		ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
	}

	/*
	 * Set I/O bit map offset equal to size of TSS segment limit
	 * for no I/O permission map. This will force all user I/O
	 * instructions to generate #gp fault.
	 */
	ktss0->tss_bitmapbase = sizeof (*ktss0);

	/*
	 * Point %tr to descriptor for ktss0 in gdt.
	 */
	wr_tsr(KTSS_SEL);
}

#endif	/* !__xpv */

#if defined(__xpv)

void
init_desctbls(void)
{
	uint_t vec;
	user_desc_t *gdt;

	/*
	 * Setup and install our GDT.
	 */
	gdt = init_gdt();

	/*
	 * Store static pa of gdt to speed up pa_to_ma() translations
	 * on lwp context switches.
	 */
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;
	CPU->cpu_m.mcpu_gdtpa = pfn_to_pa(va_to_pfn(gdt));

	/*
	 * Setup and install our IDT.
	 */
#if !defined(__lint)
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
	init_idt(idt0);
	for (vec = 0; vec < NIDT; vec++)
		xen_idt_write(&idt0[vec], vec);

	CPU->cpu_idt = idt0;

	/*
	 * set default kernel stack
	 */
	xen_stack_switch(KDS_SEL,
	    (ulong_t)&dblfault_stack0[sizeof (dblfault_stack0)]);

	xen_init_callbacks();

	init_ldt();
}

#else	/* __xpv */

void
init_desctbls(void)
{
	user_desc_t *gdt;
	desctbr_t idtr;

	/*
	 * Allocate IDT and TSS structures on unique pages for better
	 * performance in virtual machines.
	 */
#if !defined(__lint)
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
#if !defined(__lint)
	ASSERT(sizeof (*ktss0) <= PAGESIZE);
#endif
	ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(ktss0, PAGESIZE);

#if defined(__i386)
#if !defined(__lint)
	ASSERT(sizeof (*dftss0) <= PAGESIZE);
#endif
	dftss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)DFTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(dftss0, PAGESIZE);
#endif

	/*
	 * Setup and install our GDT.
	 */
	gdt = init_gdt();
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;

	/*
	 * Initialize this CPU's LDT.
	 */
	CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA,
	    LDT_CPU_SIZE, PAGESIZE);
	bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
	CPU->cpu_m.mcpu_ldt_len = 0;

	/*
	 * Setup and install our IDT.
	 */
	init_idt(idt0);

	idtr.dtr_base = (uintptr_t)idt0;
	idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
	wr_idtr(&idtr);
	CPU->cpu_idt = idt0;

#if defined(__i386)
	/*
	 * We maintain a description of idt0 in convenient IDTR format
	 * for #pf's on some older pentium processors. See pentium_pftrap().
	 */
	idt0_default_r = idtr;
#endif	/* __i386 */

	init_tss();
	CPU->cpu_tss = ktss0;
	init_ldt();

	/* Stash this so that the NMI,MCE,#DF and KDI handlers can use it. */
	kpti_safe_cr3 = (uint64_t)getcr3();
}

#endif	/* __xpv */

/*
 * In the early kernel, we need to set up a simple GDT to run on.
 *
 * XXPV	Can dboot use this too?  See dboot_gdt.s
 */
void
init_boot_gdt(user_desc_t *bgdt)
{
#if defined(__amd64)
	set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
#elif defined(__i386)
	set_usegd(&bgdt[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
#endif	/* __i386 */
}

/*
 * Enable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the branded entry points.
 */
void
brand_interpositioning_enable(void)
{
	gate_desc_t	*idt = CPU->cpu_idt;
	int		i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc;
#if defined(__xpv)
		xen_idt_write(&idt[brand_tbl[i].ih_inum],
		    brand_tbl[i].ih_inum);
#endif
	}

#if defined(__amd64)
#if defined(__xpv)

	/*
	 * Currently the hypervisor only supports 64-bit system calls via the
	 * syscall instruction. The 32-bit system calls are handled by the
	 * interrupt gate installed above.
	 */
	xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32);
		} else {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
		}
	}

#endif
#endif	/* __amd64 */

	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_INTC_SEP_EIP,
			    (uintptr_t)tr_brand_sys_sysenter);
		} else {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
		}
	}
}

/*
 * Disable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the standard entry points, which bypass the interpositioning hooks.
 */
void
brand_interpositioning_disable(void)
{
	gate_desc_t	*idt = CPU->cpu_idt;
	int		i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc;
#if defined(__xpv)
		xen_idt_write(&idt[brand_tbl[i].ih_inum],
		    brand_tbl[i].ih_inum);
#endif
	}

#if defined(__amd64)
#if defined(__xpv)

	/*
	 * See comment above in brand_interpositioning_enable.
	 */
	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32);
		} else {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
		}
	}

#endif
#endif	/* __amd64 */

	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter);
		} else {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
		}
	}
}
1449