xref: /illumos-gate/usr/src/uts/intel/os/desctbls.c (revision 0ea62e6f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * Copyright 2018 Joyent, Inc. All rights reserved.
 * Copyright 2022 Oxide Computer Company
29  */
30 
31 /*
32  * Copyright (c) 1992 Terrence R. Lambert.
33  * Copyright (c) 1990 The Regents of the University of California.
34  * All rights reserved.
35  *
36  * This code is derived from software contributed to Berkeley by
37  * William Jolitz.
38  *
39  * Redistribution and use in source and binary forms, with or without
40  * modification, are permitted provided that the following conditions
41  * are met:
42  * 1. Redistributions of source code must retain the above copyright
43  *    notice, this list of conditions and the following disclaimer.
44  * 2. Redistributions in binary form must reproduce the above copyright
45  *    notice, this list of conditions and the following disclaimer in the
46  *    documentation and/or other materials provided with the distribution.
47  * 3. All advertising materials mentioning features or use of this software
48  *    must display the following acknowledgement:
49  *	This product includes software developed by the University of
50  *	California, Berkeley and its contributors.
51  * 4. Neither the name of the University nor the names of its contributors
52  *    may be used to endorse or promote products derived from this software
53  *    without specific prior written permission.
54  *
55  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65  * SUCH DAMAGE.
66  *
67  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
68  */
69 
70 #include <sys/types.h>
71 #include <sys/sysmacros.h>
72 #include <sys/tss.h>
73 #include <sys/segments.h>
74 #include <sys/trap.h>
75 #include <sys/cpuvar.h>
76 #include <sys/bootconf.h>
77 #include <sys/x86_archext.h>
78 #include <sys/controlregs.h>
79 #include <sys/archsystm.h>
80 #include <sys/machsystm.h>
81 #include <sys/kobj.h>
82 #include <sys/cmn_err.h>
83 #include <sys/reboot.h>
84 #include <sys/kdi.h>
85 #include <sys/mach_mmu.h>
86 #include <sys/systm.h>
87 #include <sys/note.h>
88 
89 #ifdef __xpv
90 #include <sys/hypervisor.h>
91 #include <vm/as.h>
92 #endif
93 
94 #include <sys/promif.h>
95 #include <sys/bootinfo.h>
96 #include <vm/kboot_mmu.h>
97 #include <vm/hat_pte.h>
98 
99 /*
100  * cpu0 and default tables and structures.
101  */
user_desc_t	*gdt0;			/* cpu0's global descriptor table */
#if !defined(__xpv)
/* NOTE(review): presumably the default GDTR for gdt0; not set in this view. */
desctbr_t	gdt0_default_r;
#endif

gate_desc_t	*idt0;		/* interrupt descriptor table */

tss_t		*ktss0;			/* kernel task state structure */


user_desc_t	zero_udesc;		/* base zero user desc native procs */
user_desc_t	null_udesc;		/* null user descriptor */
system_desc_t	null_sdesc;		/* null system descriptor */

user_desc_t	zero_u32desc;		/* 32-bit compatibility procs */

/*
 * Pre-built user code segment descriptors, filled in by init_gdt_common().
 * The "on" variants are copies of the GDT_UCODE (64-bit) and GDT_U32CODE
 * (32-bit) entries; the "off" variants are the same descriptors with the
 * present bit cleared, so that using them forces a #np fault.
 */
user_desc_t	ucs_on;
user_desc_t	ucs_off;
user_desc_t	ucs32_on;
user_desc_t	ucs32_off;
122 
123 /*
124  * If the size of this is changed, you must update hat_pcp_setup() and the
125  * definitions in exception.s
126  */
/*
 * cpu0's dedicated stacks for double faults, NMIs and machine checks.
 * init_tss() points the first three IST slots at the tops of these.
 */
extern char dblfault_stack0[DEFAULTSTKSZ];
extern char nmi_stack0[DEFAULTSTKSZ];
extern char mce_stack0[DEFAULTSTKSZ];

/* Routines installed in fasttable[] below. */
extern void	fast_null(void);
extern hrtime_t	get_hrtime(void);
extern hrtime_t	gethrvtime(void);
extern hrtime_t	get_hrestime(void);
extern uint64_t	getlgrp(void);
136 
/*
 * Table of fast trap routines; each slot is labelled with the fast trap code
 * it serves (presumably the table is indexed by that code -- confirm against
 * the fast trap entry point).  The routines that return a value are cast
 * through uintptr_t so that they fit the table's void-returning element type.
 */
void (*(fasttable[]))(void) = {
	fast_null,			/* T_FNULL routine */
	fast_null,			/* T_FGETFP routine (initially null) */
	fast_null,			/* T_FSETFP routine (initially null) */
	(void (*)())(uintptr_t)get_hrtime,	/* T_GETHRTIME */
	(void (*)())(uintptr_t)gethrvtime,	/* T_GETHRVTIME */
	(void (*)())(uintptr_t)get_hrestime,	/* T_GETHRESTIME */
	(void (*)())(uintptr_t)getlgrp		/* T_GETLGRP */
};
146 
147 /*
148  * Structure containing pre-computed descriptors to allow us to temporarily
149  * interpose on a standard handler.
150  */
struct interposing_handler {
	int ih_inum;			/* IDT vector being interposed on */
	gate_desc_t ih_interp_desc;	/* gate for the interposing handler */
	gate_desc_t ih_default_desc;	/* saved copy of the standard gate */
};
156 
/*
 * The brand infrastructure interposes on a single handler (T_SYSCALLINT, see
 * init_idt_common()); the second entry is left with ih_inum == 0 and is used
 * as a NULL signpost.
 */
static struct interposing_handler brand_tbl[2];
162 
163 /*
164  * software prototypes for default local descriptor table
165  */
166 
167 /*
168  * Routines for loading segment descriptors in format the hardware
169  * can understand.
170  */
171 
172 /*
173  * In long mode we have the new L or long mode attribute bit
174  * for code segments. Only the conforming bit in type is used along
175  * with descriptor priority and present bits. Default operand size must
176  * be zero when in long mode. In 32-bit compatibility mode all fields
177  * are treated as in legacy mode. For data segments while in long mode
178  * only the present bit is loaded.
179  */
180 void
181 set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size,
182     uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
183 {
184 	ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);
185 	/* This should never be a "system" segment. */
186 	ASSERT3U(type & SDT_S, !=, 0);
187 
188 	/*
189 	 * 64-bit long mode.
190 	 */
191 	if (lmode == SDP_LONG)
192 		dp->usd_def32 = 0;		/* 32-bit operands only */
193 	else
194 		/*
195 		 * 32-bit compatibility mode.
196 		 */
197 		dp->usd_def32 = defopsz;	/* 0 = 16, 1 = 32-bit ops */
198 
199 	/*
200 	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
201 	 * will write to the GDT whenever we change segment registers around.
202 	 * With KPTI on, the GDT is read-only in the user page table, which
203 	 * causes crashes if we don't set this.
204 	 */
205 	ASSERT3U(type & SDT_A, !=, 0);
206 
207 	dp->usd_long = lmode;	/* 64-bit mode */
208 	dp->usd_type = type;
209 	dp->usd_dpl = dpl;
210 	dp->usd_p = 1;
211 	dp->usd_gran = gran;		/* 0 = bytes, 1 = pages */
212 
213 	dp->usd_lobase = (uintptr_t)base;
214 	dp->usd_midbase = (uintptr_t)base >> 16;
215 	dp->usd_hibase = (uintptr_t)base >> (16 + 8);
216 	dp->usd_lolimit = size;
217 	dp->usd_hilimit = (uintptr_t)size >> 16;
218 }
219 
220 /*
221  * Install system segment descriptor for LDT and TSS segments.
222  */
223 
224 void
225 set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
226     uint_t dpl)
227 {
228 	dp->ssd_lolimit = size;
229 	dp->ssd_hilimit = (uintptr_t)size >> 16;
230 
231 	dp->ssd_lobase = (uintptr_t)base;
232 	dp->ssd_midbase = (uintptr_t)base >> 16;
233 	dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
234 	dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);
235 
236 	dp->ssd_type = type;
237 	dp->ssd_zero1 = 0;	/* must be zero */
238 	dp->ssd_zero2 = 0;
239 	dp->ssd_dpl = dpl;
240 	dp->ssd_p = 1;
241 	dp->ssd_gran = 0;	/* force byte units */
242 }
243 
244 void *
245 get_ssd_base(system_desc_t *dp)
246 {
247 	uintptr_t	base;
248 
249 	base = (uintptr_t)dp->ssd_lobase |
250 	    (uintptr_t)dp->ssd_midbase << 16 |
251 	    (uintptr_t)dp->ssd_hibase << (16 + 8) |
252 	    (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
253 	return ((void *)base);
254 }
255 
256 /*
257  * Install gate segment descriptor for interrupt, trap, call and task gates.
258  *
259  * For 64 bit native if we have KPTI enabled, we use the IST stack mechanism on
260  * all interrupts.  We have different ISTs for each class of exceptions that are
261  * most likely to occur while handling an existing exception; while many of
262  * these are just going to panic, it's nice not to trample on the existing
263  * exception state for debugging purposes.
264  *
265  * Normal interrupts are all redirected unconditionally to the KPTI trampoline
266  * stack space. This unifies the trampoline handling between user and kernel
267  * space (and avoids the need to touch %gs).
268  *
 * Every entry in the KDI IDT uses the DBG IST: consider single stepping
 * tr_pftrap, when we do a read from KMDB that causes another #PF.  Without its
 * own IST, this would stomp on the kernel's mcpu_kpti_flt frame.
272  */
273 uint_t
274 idt_vector_to_ist(uint_t vector)
275 {
276 #if defined(__xpv)
277 	_NOTE(ARGUNUSED(vector));
278 	return (IST_NONE);
279 #else
280 	switch (vector) {
281 	/* These should always use IST even without KPTI enabled. */
282 	case T_DBLFLT:
283 		return (IST_DF);
284 	case T_NMIFLT:
285 		return (IST_NMI);
286 	case T_MCE:
287 		return (IST_MCE);
288 
289 	case T_BPTFLT:
290 	case T_SGLSTP:
291 		if (kpti_enable == 1) {
292 			return (IST_DBG);
293 		}
294 		return (IST_NONE);
295 	case T_STKFLT:
296 	case T_GPFLT:
297 	case T_PGFLT:
298 		if (kpti_enable == 1) {
299 			return (IST_NESTABLE);
300 		}
301 		return (IST_NONE);
302 	default:
303 		if (kpti_enable == 1) {
304 			return (IST_DEFAULT);
305 		}
306 		return (IST_NONE);
307 	}
308 #endif
309 }
310 
311 void
312 set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
313     uint_t type, uint_t dpl, uint_t ist)
314 {
315 	dp->sgd_looffset = (uintptr_t)func;
316 	dp->sgd_hioffset = (uintptr_t)func >> 16;
317 	dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
318 	dp->sgd_selector =  (uint16_t)sel;
319 	dp->sgd_ist = ist;
320 	dp->sgd_type = type;
321 	dp->sgd_dpl = dpl;
322 	dp->sgd_p = 1;
323 }
324 
325 /*
 * Updates a single user descriptor in the GDT of the current cpu.
327  * Caller is responsible for preventing cpu migration.
328  */
329 
330 void
331 gdt_update_usegd(uint_t sidx, user_desc_t *udp)
332 {
333 #if defined(DEBUG)
334 	/* This should never be a "system" segment, but it might be null. */
335 	if (udp->usd_p != 0 || udp->usd_type != 0) {
336 		ASSERT3U(udp->usd_type & SDT_S, !=, 0);
337 	}
338 	/*
339 	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
340 	 * will write to the GDT whenever we change segment registers around.
341 	 * With KPTI on, the GDT is read-only in the user page table, which
342 	 * causes crashes if we don't set this.
343 	 */
344 	if (udp->usd_p != 0 || udp->usd_type != 0) {
345 		ASSERT3U(udp->usd_type & SDT_A, !=, 0);
346 	}
347 #endif
348 
349 #if defined(__xpv)
350 	uint64_t dpa = CPU->cpu_m.mcpu_gdtpa + sizeof (*udp) * sidx;
351 
352 	if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp))
353 		panic("gdt_update_usegd: HYPERVISOR_update_descriptor");
354 
355 #else	/* __xpv */
356 	CPU->cpu_gdt[sidx] = *udp;
357 #endif	/* __xpv */
358 }
359 
360 /*
361  * Writes single descriptor pointed to by udp into a processes
362  * LDT entry pointed to by ldp.
363  */
364 int
365 ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
366 {
367 #if defined(DEBUG)
368 	/* This should never be a "system" segment, but it might be null. */
369 	if (udp->usd_p != 0 || udp->usd_type != 0) {
370 		ASSERT3U(udp->usd_type & SDT_S, !=, 0);
371 	}
372 	/*
373 	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
374 	 * will write to the LDT whenever we change segment registers around.
375 	 * With KPTI on, the LDT is read-only in the user page table, which
376 	 * causes crashes if we don't set this.
377 	 */
378 	if (udp->usd_p != 0 || udp->usd_type != 0) {
379 		ASSERT3U(udp->usd_type & SDT_A, !=, 0);
380 	}
381 #endif
382 
383 #if defined(__xpv)
384 	uint64_t dpa;
385 
386 	dpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)ldp)) |
387 	    ((uintptr_t)ldp & PAGEOFFSET);
388 
389 	/*
390 	 * The hypervisor is a little more restrictive about what it
391 	 * supports in the LDT.
392 	 */
393 	if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp) != 0)
394 		return (EINVAL);
395 
396 #else	/* __xpv */
397 	*ldp = *udp;
398 
399 #endif	/* __xpv */
400 	return (0);
401 }
402 
403 #if defined(__xpv)
404 
405 /*
406  * Converts hw format gate descriptor into pseudo-IDT format for the hypervisor.
407  * Returns true if a valid entry was written.
408  */
409 int
410 xen_idt_to_trap_info(uint_t vec, gate_desc_t *sgd, void *ti_arg)
411 {
412 	trap_info_t *ti = ti_arg;	/* XXPV	Aargh - segments.h comment */
413 
414 	/*
415 	 * skip holes in the IDT
416 	 */
417 	if (GATESEG_GETOFFSET(sgd) == 0)
418 		return (0);
419 
420 	ASSERT(sgd->sgd_type == SDT_SYSIGT);
421 	ti->vector = vec;
422 	TI_SET_DPL(ti, sgd->sgd_dpl);
423 
424 	/*
425 	 * Is this an interrupt gate?
426 	 */
427 	if (sgd->sgd_type == SDT_SYSIGT) {
428 		/* LINTED */
429 		TI_SET_IF(ti, 1);
430 	}
431 	ti->cs = sgd->sgd_selector;
432 	ti->cs |= SEL_KPL;	/* force into ring 3. see KCS_SEL  */
433 	ti->address = GATESEG_GETOFFSET(sgd);
434 	return (1);
435 }
436 
437 /*
438  * Convert a single hw format gate descriptor and write it into our virtual IDT.
439  */
440 void
441 xen_idt_write(gate_desc_t *sgd, uint_t vec)
442 {
443 	trap_info_t trapinfo[2];
444 
445 	bzero(trapinfo, sizeof (trapinfo));
446 	if (xen_idt_to_trap_info(vec, sgd, &trapinfo[0]) == 0)
447 		return;
448 	if (xen_set_trap_table(trapinfo) != 0)
449 		panic("xen_idt_write: xen_set_trap_table() failed");
450 }
451 
452 #endif	/* __xpv */
453 
454 
455 /*
456  * Build kernel GDT.
457  */
458 
459 static void
460 init_gdt_common(user_desc_t *gdt)
461 {
462 	int i;
463 
464 	/*
465 	 * 64-bit kernel code segment.
466 	 */
467 	set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
468 	    SDP_PAGES, SDP_OP32);
469 
470 	/*
471 	 * 64-bit kernel data segment. The limit attribute is ignored in 64-bit
472 	 * mode, but we set it here to 0xFFFF so that we can use the SYSRET
473 	 * instruction to return from system calls back to 32-bit applications.
474 	 * SYSRET doesn't update the base, limit, or attributes of %ss or %ds
475 	 * descriptors. We therefore must ensure that the kernel uses something,
476 	 * though it will be ignored by hardware, that is compatible with 32-bit
477 	 * apps. For the same reason we must set the default op size of this
478 	 * descriptor to 32-bit operands.
479 	 */
480 	set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
481 	    SEL_KPL, SDP_PAGES, SDP_OP32);
482 	gdt[GDT_KDATA].usd_def32 = 1;
483 
484 	/*
485 	 * 64-bit user code segment.
486 	 */
487 	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
488 	    SDP_PAGES, SDP_OP32);
489 
490 	/*
491 	 * 32-bit user code segment.
492 	 */
493 	set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, -1, SDT_MEMERA,
494 	    SEL_UPL, SDP_PAGES, SDP_OP32);
495 
496 	/*
497 	 * See gdt_ucode32() and gdt_ucode_native().
498 	 */
499 	ucs_on = ucs_off = gdt[GDT_UCODE];
500 	ucs_off.usd_p = 0;	/* forces #np fault */
501 
502 	ucs32_on = ucs32_off = gdt[GDT_U32CODE];
503 	ucs32_off.usd_p = 0;	/* forces #np fault */
504 
505 	/*
506 	 * 32 and 64 bit data segments can actually share the same descriptor.
507 	 * In long mode only the present bit is checked but all other fields
508 	 * are loaded. But in compatibility mode all fields are interpreted
509 	 * as in legacy mode so they must be set correctly for a 32-bit data
510 	 * segment.
511 	 */
512 	set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, -1, SDT_MEMRWA, SEL_UPL,
513 	    SDP_PAGES, SDP_OP32);
514 
515 #if !defined(__xpv)
516 
517 	/*
518 	 * The 64-bit kernel has no default LDT. By default, the LDT descriptor
519 	 * in the GDT is 0.
520 	 */
521 
522 	/*
523 	 * Kernel TSS
524 	 */
525 	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
526 	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);
527 
528 #endif	/* !__xpv */
529 
530 	/*
531 	 * Initialize fs and gs descriptors for 32 bit processes.
532 	 * Only attributes and limits are initialized, the effective
533 	 * base address is programmed via fsbase/gsbase.
534 	 */
535 	set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
536 	    SEL_UPL, SDP_PAGES, SDP_OP32);
537 	set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
538 	    SEL_UPL, SDP_PAGES, SDP_OP32);
539 
540 	/*
541 	 * Initialize the descriptors set aside for brand usage.
542 	 * Only attributes and limits are initialized.
543 	 */
544 	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
545 		set_usegd(&gdt0[i], SDP_SHORT, NULL, -1, SDT_MEMRWA,
546 		    SEL_UPL, SDP_PAGES, SDP_OP32);
547 
548 	/*
549 	 * Initialize convenient zero base user descriptors for clearing
550 	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
551 	 * an example.
552 	 */
553 	set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL,
554 	    SDP_BYTES, SDP_OP32);
555 	set_usegd(&zero_u32desc, SDP_SHORT, 0, -1, SDT_MEMRWA, SEL_UPL,
556 	    SDP_PAGES, SDP_OP32);
557 }
558 
559 #if defined(__xpv)
560 
/*
 * Hypervisor (__xpv) variant: allocate and populate cpu0's GDT, make its page
 * read-only, register it with the hypervisor, reload the segment registers
 * and set the initial segment bases.  Returns the new GDT (gdt0).
 */
static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */
	ulong_t addr;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	/* One zeroed page for the GDT, requested at GDT_VA. */
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 * The hypervisor is handed the machine frame number of the page.
	 */
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT.
	 * On 64-bit, fixup KCS_SEL to be in ring 3.
	 * See KCS_SEL in segments.h.
	 */
	load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 *  setup %gs for kernel
	 */
	xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase".  So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	addr = 0x200000000ul;
	xen_set_segment_base(SEGBASE_FS, addr);
	xen_set_segment_base(SEGBASE_GS_USER, addr);
	xen_set_segment_base(SEGBASE_GS_USER_SEL, 0);

	return (gdt0);
}
629 
630 #else	/* __xpv */
631 
/*
 * Bare metal variant: allocate and populate cpu0's GDT, carry over the boot
 * loader's descriptors, load the new table into GDTR, reload the segment
 * registers and set the initial segment base MSRs.  Returns the new GDT
 * (gdt0).
 */
static user_desc_t *
init_gdt(void)
{
	desctbr_t	r_bgdt, r_gdt;
	user_desc_t	*bgdt;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	/* One zeroed page for the GDT, requested at GDT_VA. */
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt.
	 * Entry 0 is the null descriptor by definition.
	 * The boot GDT is located by reading the current GDTR.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
	gdt0[GDT_B64CODE] = bgdt[GDT_B64CODE];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 *  setup %gs for kernel
	 */
	wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase".  So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	wrmsr(MSR_AMD_FSBASE, 0x200000000ul);
	wrmsr(MSR_AMD_KGSBASE, 0x200000000ul);
	return (gdt0);
}
696 
697 #endif	/* __xpv */
698 
699 
700 /*
701  * Build kernel IDT.
702  *
703  * Note that for amd64 we pretty much require every gate to be an interrupt
704  * gate which blocks interrupts atomically on entry; that's because of our
705  * dependency on using 'swapgs' every time we come into the kernel to find
706  * the cpu structure. If we get interrupted just before doing that, %cs could
707  * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
708  * %gsbase is really still pointing at something in userland. Bad things will
709  * ensue. We also use interrupt gates for i386 as well even though this is not
710  * required for some traps.
711  *
712  * Perhaps they should have invented a trap gate that does an atomic swapgs?
713  */
714 static void
715 init_idt_common(gate_desc_t *idt)
716 {
717 	set_gatesegd(&idt[T_ZERODIV],
718 	    (kpti_enable == 1) ? &tr_div0trap : &div0trap,
719 	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
720 	set_gatesegd(&idt[T_SGLSTP],
721 	    (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap,
722 	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP));
723 	set_gatesegd(&idt[T_NMIFLT],
724 	    (kpti_enable == 1) ? &tr_nmiint : &nmiint,
725 	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT));
726 	set_gatesegd(&idt[T_BPTFLT],
727 	    (kpti_enable == 1) ? &tr_brktrap : &brktrap,
728 	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT));
729 	set_gatesegd(&idt[T_OVFLW],
730 	    (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap,
731 	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW));
732 	set_gatesegd(&idt[T_BOUNDFLT],
733 	    (kpti_enable == 1) ? &tr_boundstrap : &boundstrap,
734 	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT));
735 	set_gatesegd(&idt[T_ILLINST],
736 	    (kpti_enable == 1) ? &tr_invoptrap : &invoptrap,
737 	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST));
738 	set_gatesegd(&idt[T_NOEXTFLT],
739 	    (kpti_enable == 1) ? &tr_ndptrap : &ndptrap,
740 	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT));
741 
742 	/*
743 	 * double fault handler.
744 	 *
745 	 * Note that on the hypervisor a guest does not receive #df faults.
746 	 * Instead a failsafe event is injected into the guest if its selectors
747 	 * and/or stack is in a broken state. See xen_failsafe_callback.
748 	 */
749 #if !defined(__xpv)
750 	set_gatesegd(&idt[T_DBLFLT],
751 	    (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap,
752 	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT));
753 #endif	/* !__xpv */
754 
755 	/*
756 	 * T_EXTOVRFLT coprocessor-segment-overrun not supported.
757 	 */
758 	set_gatesegd(&idt[T_TSSFLT],
759 	    (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap,
760 	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT));
761 	set_gatesegd(&idt[T_SEGFLT],
762 	    (kpti_enable == 1) ? &tr_segnptrap : &segnptrap,
763 	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT));
764 	set_gatesegd(&idt[T_STKFLT],
765 	    (kpti_enable == 1) ? &tr_stktrap : &stktrap,
766 	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT));
767 	set_gatesegd(&idt[T_GPFLT],
768 	    (kpti_enable == 1) ? &tr_gptrap : &gptrap,
769 	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT));
770 	set_gatesegd(&idt[T_PGFLT],
771 	    (kpti_enable == 1) ? &tr_pftrap : &pftrap,
772 	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
773 	set_gatesegd(&idt[T_EXTERRFLT],
774 	    (kpti_enable == 1) ? &tr_ndperr : &ndperr,
775 	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT));
776 	set_gatesegd(&idt[T_ALIGNMENT],
777 	    (kpti_enable == 1) ? &tr_achktrap : &achktrap,
778 	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT));
779 	set_gatesegd(&idt[T_MCE],
780 	    (kpti_enable == 1) ? &tr_mcetrap : &mcetrap,
781 	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE));
782 	set_gatesegd(&idt[T_SIMDFPE],
783 	    (kpti_enable == 1) ? &tr_xmtrap : &xmtrap,
784 	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE));
785 
786 	/*
787 	 * install fast trap handler at 210.
788 	 */
789 	set_gatesegd(&idt[T_FASTTRAP],
790 	    (kpti_enable == 1) ? &tr_fasttrap : &fasttrap,
791 	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP));
792 
793 	/*
794 	 * System call handler.
795 	 */
796 	set_gatesegd(&idt[T_SYSCALLINT],
797 	    (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int,
798 	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT));
799 
800 	/*
801 	 * Install the DTrace interrupt handler for the pid provider.
802 	 */
803 	set_gatesegd(&idt[T_DTRACE_RET],
804 	    (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret,
805 	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET));
806 
807 	/*
808 	 * Prepare interposing descriptor for the syscall handler
809 	 * and cache copy of the default descriptor.
810 	 */
811 	brand_tbl[0].ih_inum = T_SYSCALLINT;
812 	brand_tbl[0].ih_default_desc = idt0[T_SYSCALLINT];
813 
814 	set_gatesegd(&(brand_tbl[0].ih_interp_desc),
815 	    (kpti_enable == 1) ? &tr_brand_sys_syscall_int :
816 	    &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL,
817 	    idt_vector_to_ist(T_SYSCALLINT));
818 
819 	brand_tbl[1].ih_inum = 0;
820 }
821 
822 #if defined(__xpv)
823 
/*
 * Hypervisor (__xpv) variant: only the common gates are installed, so any
 * other entries of `idt' are left exactly as the caller provided them.
 */
static void
init_idt(gate_desc_t *idt)
{
	init_idt_common(idt);
}
829 
830 #else	/* __xpv */
831 
832 static void
833 init_idt(gate_desc_t *idt)
834 {
835 	char	ivctname[80];
836 	void	(*ivctptr)(void);
837 	int	i;
838 
839 	/*
840 	 * Initialize entire table with 'reserved' trap and then overwrite
841 	 * specific entries. T_EXTOVRFLT (9) is unsupported and reserved
842 	 * since it can only be generated on a 386 processor. 15 is also
843 	 * unsupported and reserved.
844 	 */
845 	for (i = 0; i < NIDT; i++) {
846 		set_gatesegd(&idt[i],
847 		    (kpti_enable == 1) ? &tr_resvtrap : &resvtrap,
848 		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
849 		    idt_vector_to_ist(T_RESVTRAP));
850 	}
851 
852 	/*
853 	 * 20-31 reserved
854 	 */
855 	for (i = 20; i < 32; i++) {
856 		set_gatesegd(&idt[i],
857 		    (kpti_enable == 1) ? &tr_invaltrap : &invaltrap,
858 		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
859 		    idt_vector_to_ist(T_INVALTRAP));
860 	}
861 
862 	/*
863 	 * interrupts 32 - 255
864 	 */
865 	for (i = 32; i < 256; i++) {
866 		(void) snprintf(ivctname, sizeof (ivctname),
867 		    (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i);
868 		ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
869 		if (ivctptr == NULL)
870 			panic("kobj_getsymvalue(%s) failed", ivctname);
871 
872 		set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
873 		    idt_vector_to_ist(i));
874 	}
875 
876 	/*
877 	 * Now install the common ones. Note that it will overlay some
878 	 * entries installed above like T_SYSCALLINT, T_FASTTRAP etc.
879 	 */
880 	init_idt_common(idt);
881 }
882 
883 #endif	/* __xpv */
884 
885 /*
886  * The kernel does not deal with LDTs unless a user explicitly creates
887  * one. Under normal circumstances, the LDTR contains 0. Any process attempting
888  * to reference the LDT will therefore cause a #gp. System calls made via the
889  * obsolete lcall mechanism are emulated by the #gp fault handler.
890  */
/* Start out with no LDT: a null LDT on the hypervisor, LDTR = 0 otherwise. */
static void
init_ldt(void)
{
#if defined(__xpv)
	xen_set_ldt(NULL, 0);
#else
	wr_ldtr(0);
#endif
}
900 
901 #if !defined(__xpv)
902 
903 static void
904 init_tss(void)
905 {
906 	extern struct cpu cpus[];
907 
908 	/*
909 	 * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each
910 	 * context switch but it'll be overwritten with this same value anyway.
911 	 */
912 	if (kpti_enable == 1) {
913 		ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
914 	}
915 
916 	/* Set up the IST stacks for double fault, NMI, MCE. */
917 	ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)];
918 	ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)];
919 	ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)];
920 
921 	/*
922 	 * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is
923 	 * enabled), and also for KDI (always).
924 	 */
925 	ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;
926 
927 	if (kpti_enable == 1) {
928 		/* This IST stack is used for #GP,#PF,#SS (fault) interrupts. */
929 		ktss0->tss_ist5 =
930 		    (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp;
931 
932 		/* This IST stack is used for all other intrs (for KPTI). */
933 		ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
934 	}
935 
936 	/*
937 	 * Set I/O bit map offset equal to size of TSS segment limit
938 	 * for no I/O permission map. This will force all user I/O
939 	 * instructions to generate #gp fault.
940 	 */
941 	ktss0->tss_bitmapbase = sizeof (*ktss0);
942 
943 	/*
944 	 * Point %tr to descriptor for ktss0 in gdt.
945 	 */
946 	wr_tsr(KTSS_SEL);
947 }
948 
949 #endif	/* !__xpv */
950 
951 #if defined(__xpv)
952 
/*
 * Hypervisor (__xpv) variant: build cpu0's GDT and IDT, hand each IDT entry
 * to the hypervisor, register the default kernel stack and the callbacks, and
 * start out with no LDT.
 */
void
init_desctbls(void)
{
	uint_t vec;
	user_desc_t *gdt;

	/*
	 * Setup and install our GDT.
	 */
	gdt = init_gdt();

	/*
	 * Store static pa of gdt to speed up pa_to_ma() translations
	 * on lwp context switches.
	 */
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;
	CPU->cpu_m.mcpu_gdtpa = pfn_to_pa(va_to_pfn(gdt));

	/*
	 * Setup and install our IDT.
	 */
#if !defined(__lint)
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
	init_idt(idt0);
	/* Entries that are still zero are skipped by xen_idt_write(). */
	for (vec = 0; vec < NIDT; vec++)
		xen_idt_write(&idt0[vec], vec);

	CPU->cpu_idt = idt0;

	/*
	 * set default kernel stack
	 */
	xen_stack_switch(KDS_SEL,
	    (ulong_t)&dblfault_stack0[sizeof (dblfault_stack0)]);

	xen_init_callbacks();

	init_ldt();
}
997 
998 #else	/* __xpv */
999 
1000 void
1001 init_desctbls(void)
1002 {
1003 	user_desc_t *gdt;
1004 	desctbr_t idtr;
1005 
1006 	/*
1007 	 * Allocate IDT and TSS structures on unique pages for better
1008 	 * performance in virtual machines.
1009 	 */
1010 #if !defined(__lint)
1011 	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
1012 #endif
1013 	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
1014 	    PAGESIZE, PAGESIZE);
1015 	bzero(idt0, PAGESIZE);
1016 #if !defined(__lint)
1017 	ASSERT(sizeof (*ktss0) <= PAGESIZE);
1018 #endif
1019 	ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA,
1020 	    PAGESIZE, PAGESIZE);
1021 	bzero(ktss0, PAGESIZE);
1022 
1023 
1024 	/*
1025 	 * Setup and install our GDT.
1026 	 */
1027 	gdt = init_gdt();
1028 	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
1029 	CPU->cpu_gdt = gdt;
1030 
1031 	/*
1032 	 * Initialize this CPU's LDT.
1033 	 */
1034 	CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA,
1035 	    LDT_CPU_SIZE, PAGESIZE);
1036 	bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
1037 	CPU->cpu_m.mcpu_ldt_len = 0;
1038 
1039 	/*
1040 	 * Setup and install our IDT.
1041 	 */
1042 	init_idt(idt0);
1043 
1044 	idtr.dtr_base = (uintptr_t)idt0;
1045 	idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
1046 	wr_idtr(&idtr);
1047 	CPU->cpu_idt = idt0;
1048 
1049 
1050 	init_tss();
1051 	CPU->cpu_tss = ktss0;
1052 	init_ldt();
1053 
1054 	/* Stash this so that the NMI,MCE,#DF and KDI handlers can use it. */
1055 	kpti_safe_cr3 = (uint64_t)getcr3();
1056 }
1057 
1058 #endif	/* __xpv */
1059 
1060 #ifndef __xpv
1061 /*
1062  * As per Intel Vol 3 27.5.2, the GDTR limit is reset to 64Kb on a VM exit, so
1063  * we have to manually fix it up ourselves.
1064  *
1065  * The caller may still need to make sure that it can't go off-CPU with the
1066  * incorrect limit, before calling this (such as disabling pre-emption).
1067  */
1068 void
1069 reset_gdtr_limit(void)
1070 {
1071 	ulong_t flags = intr_clear();
1072 	desctbr_t gdtr;
1073 
1074 	rd_gdtr(&gdtr);
1075 	gdtr.dtr_limit = (sizeof (user_desc_t) * NGDT) - 1;
1076 	wr_gdtr(&gdtr);
1077 
1078 	intr_restore(flags);
1079 }
1080 #endif /* __xpv */
1081 
1082 /*
1083  * In the early kernel, we need to set up a simple GDT to run on.
1084  *
1085  * XXPV	Can dboot use this too?  See dboot_gdt.s
1086  */
1087 void
1088 init_boot_gdt(user_desc_t *bgdt)
1089 {
1090 	set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA, SEL_KPL,
1091 	    SDP_PAGES, SDP_OP32);
1092 	set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA, SEL_KPL,
1093 	    SDP_PAGES, SDP_OP32);
1094 }
1095 
1096 /*
1097  * Enable interpositioning on the system call path by rewriting the
1098  * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
1099  * the branded entry points.
1100  */
1101 void
1102 brand_interpositioning_enable(void *arg __unused)
1103 {
1104 	gate_desc_t	*idt = CPU->cpu_idt;
1105 	int		i;
1106 
1107 	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);
1108 
1109 	for (i = 0; brand_tbl[i].ih_inum; i++) {
1110 		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc;
1111 #if defined(__xpv)
1112 		xen_idt_write(&idt[brand_tbl[i].ih_inum],
1113 		    brand_tbl[i].ih_inum);
1114 #endif
1115 	}
1116 
1117 #if defined(__xpv)
1118 
1119 	/*
1120 	 * Currently the hypervisor only supports 64-bit syscalls via
1121 	 * syscall instruction. The 32-bit syscalls are handled by
1122 	 * interrupt gate above.
1123 	 */
1124 	xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
1125 	    CALLBACKF_mask_events);
1126 
1127 #else
1128 
1129 	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
1130 		if (kpti_enable == 1) {
1131 			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall);
1132 			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32);
1133 		} else {
1134 			wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
1135 			wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
1136 		}
1137 	}
1138 
1139 #endif
1140 
1141 	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
1142 		if (kpti_enable == 1) {
1143 			wrmsr(MSR_INTC_SEP_EIP,
1144 			    (uintptr_t)tr_brand_sys_sysenter);
1145 		} else {
1146 			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
1147 		}
1148 	}
1149 }
1150 
1151 /*
1152  * Disable interpositioning on the system call path by rewriting the
1153  * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
1154  * the standard entry points, which bypass the interpositioning hooks.
1155  */
1156 void
1157 brand_interpositioning_disable(void *arg __unused)
1158 {
1159 	gate_desc_t	*idt = CPU->cpu_idt;
1160 	int i;
1161 
1162 	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);
1163 
1164 	for (i = 0; brand_tbl[i].ih_inum; i++) {
1165 		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc;
1166 #if defined(__xpv)
1167 		xen_idt_write(&idt[brand_tbl[i].ih_inum],
1168 		    brand_tbl[i].ih_inum);
1169 #endif
1170 	}
1171 
1172 #if defined(__xpv)
1173 
1174 	/*
1175 	 * See comment above in brand_interpositioning_enable.
1176 	 */
1177 	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
1178 	    CALLBACKF_mask_events);
1179 
1180 #else
1181 
1182 	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
1183 		if (kpti_enable == 1) {
1184 			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall);
1185 			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32);
1186 		} else {
1187 			wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
1188 			wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
1189 		}
1190 	}
1191 
1192 #endif
1193 
1194 	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
1195 		if (kpti_enable == 1) {
1196 			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter);
1197 		} else {
1198 			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
1199 		}
1200 	}
1201 }
1202