/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright 2018 Joyent, Inc. All rights reserved.
 * Copyright 2022 Oxide Computer Company
 */

/*
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/tss.h>
#include <sys/segments.h>
#include <sys/trap.h>
#include <sys/cpuvar.h>
#include <sys/bootconf.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kobj.h>
#include <sys/cmn_err.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/mach_mmu.h>
#include <sys/systm.h>
#include <sys/note.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#include <vm/as.h>
#endif

#include <sys/promif.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>
#include <vm/hat_pte.h>

/*
 * cpu0 and default tables and structures.
 */
user_desc_t	*gdt0;
#if !defined(__xpv)
desctbr_t	gdt0_default_r;
#endif

gate_desc_t	*idt0;		/* interrupt descriptor table */

tss_t		*ktss0;			/* kernel task state structure */


user_desc_t	zero_udesc;		/* base zero user desc native procs */
user_desc_t	null_udesc;		/* null user descriptor */
system_desc_t	null_sdesc;		/* null system descriptor */

user_desc_t	zero_u32desc;		/* 32-bit compatibility procs */

user_desc_t	ucs_on;
user_desc_t	ucs_off;
user_desc_t	ucs32_on;
user_desc_t	ucs32_off;

/*
 * If the size of this is changed, you must update hat_pcp_setup() and the
 * definitions in exception.s
 */
extern char dblfault_stack0[DEFAULTSTKSZ];
extern char nmi_stack0[DEFAULTSTKSZ];
extern char mce_stack0[DEFAULTSTKSZ];

extern void	fast_null(void);
extern hrtime_t	get_hrtime(void);
extern hrtime_t	gethrvtime(void);
extern hrtime_t	get_hrestime(void);
extern uint64_t	getlgrp(void);

void (*(fasttable[]))(void) = {
	fast_null,			/* T_FNULL routine */
	fast_null,			/* T_FGETFP routine (initially null) */
	fast_null,			/* T_FSETFP routine (initially null) */
	(void (*)())(uintptr_t)get_hrtime,	/* T_GETHRTIME */
	(void (*)())(uintptr_t)gethrvtime,	/* T_GETHRVTIME */
	(void (*)())(uintptr_t)get_hrestime,	/* T_GETHRESTIME */
	(void (*)())(uintptr_t)getlgrp		/* T_GETLGRP */
};
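
/*
 * Illustrative note (not from the original file): a user program reaches
 * one of these routines by executing "int $T_FASTTRAP" with the fast trap
 * number (e.g. T_GETHRTIME) in %eax; the dispatch through fasttable[] is
 * performed by the fast trap entry code in exception.s, which installs
 * itself at vector T_FASTTRAP in init_idt_common() below.
 */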

/*
 * Structure containing pre-computed descriptors to allow us to temporarily
 * interpose on a standard handler.
 */
struct interposing_handler {
	int ih_inum;
	gate_desc_t ih_interp_desc;
	gate_desc_t ih_default_desc;
};

/*
 * The brand infrastructure interposes on two handlers, and we use one as a
 * NULL signpost.
 */
static struct interposing_handler brand_tbl[2];

/*
 * software prototypes for default local descriptor table
 */

/*
 * Routines for loading segment descriptors in a format the hardware
 * can understand.
 */

/*
 * In long mode we have the new L or long mode attribute bit
 * for code segments. Only the conforming bit in type is used along
 * with the descriptor privilege level and present bits. The default
 * operand size must be zero when in long mode. In 32-bit compatibility
 * mode all fields are treated as in legacy mode. For data segments
 * while in long mode only the present bit is loaded.
 */
void
set_usegd(user_desc_t *dp, uint_t lmode, void *base, uint32_t size,
    uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
{
	ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);
	/* This should never be a "system" segment. */
	ASSERT3U(type & SDT_S, !=, 0);
	ASSERT3P(dp, !=, NULL);

	/*
	 * 64-bit long mode.
	 */
	if (lmode == SDP_LONG)
		dp->usd_def32 = 0;	/* D bit must be 0 in long mode */
	else
		/*
		 * 32-bit compatibility mode.
		 */
		dp->usd_def32 = defopsz;	/* 0 = 16, 1 = 32-bit ops */

	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the GDT whenever we change segment registers around.
	 * With KPTI on, the GDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	ASSERT3U(type & SDT_A, !=, 0);

	dp->usd_long = lmode;	/* 64-bit mode */
	dp->usd_type = type;
	dp->usd_dpl = dpl;
	dp->usd_p = 1;
	dp->usd_gran = gran;		/* 0 = bytes, 1 = pages */

	dp->usd_lobase = (uintptr_t)base;
	dp->usd_midbase = (uintptr_t)base >> 16;
	dp->usd_hibase = (uintptr_t)base >> (16 + 8);
	dp->usd_lolimit = size;
	dp->usd_hilimit = (uintptr_t)size >> 16;
}
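
/*
 * Example (illustrative; mirrors init_gdt_common() below): build the
 * 64-bit user code segment descriptor.  In long mode the base and limit
 * are ignored for code segments, so NULL/0 suffice:
 *
 *	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
 *	    SDP_PAGES, SDP_OP32);
 */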

/*
 * Install system segment descriptor for LDT and TSS segments.
 */

void
set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl)
{
	dp->ssd_lolimit = size;
	dp->ssd_hilimit = (uintptr_t)size >> 16;

	dp->ssd_lobase = (uintptr_t)base;
	dp->ssd_midbase = (uintptr_t)base >> 16;
	dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
	dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);

	dp->ssd_type = type;
	dp->ssd_zero1 = 0;	/* must be zero */
	dp->ssd_zero2 = 0;
	dp->ssd_dpl = dpl;
	dp->ssd_p = 1;
	dp->ssd_gran = 0;	/* force byte units */
}

void *
get_ssd_base(system_desc_t *dp)
{
	uintptr_t	base;

	base = (uintptr_t)dp->ssd_lobase |
	    (uintptr_t)dp->ssd_midbase << 16 |
	    (uintptr_t)dp->ssd_hibase << (16 + 8) |
	    (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
	return ((void *)base);
}
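
/*
 * For reference (derived directly from the shifts above): a system segment
 * descriptor carries its 64-bit base scattered across four fields, which
 * get_ssd_base() simply reassembles:
 *
 *	ssd_lobase	base bits  0..15
 *	ssd_midbase	base bits 16..23
 *	ssd_hibase	base bits 24..31
 *	ssd_hi64base	base bits 32..63
 */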

/*
 * Install gate segment descriptor for interrupt, trap, call and task gates.
 *
 * For 64-bit native, if we have KPTI enabled, we use the IST stack mechanism
 * on all interrupts.  We have different ISTs for each class of exceptions
 * that are most likely to occur while handling an existing exception; while
 * many of these are just going to panic, it's nice not to trample on the
 * existing exception state for debugging purposes.
 *
 * Normal interrupts are all redirected unconditionally to the KPTI trampoline
 * stack space. This unifies the trampoline handling between user and kernel
 * space (and avoids the need to touch %gs).
 *
 * The KDI IDT uses the DBG IST for everything: consider single-stepping
 * tr_pftrap, when we do a read from KMDB that causes another #PF.  Without
 * its own IST, this would stomp on the kernel's mcpu_kpti_flt frame.
 */
uint_t
idt_vector_to_ist(uint_t vector)
{
#if defined(__xpv)
	_NOTE(ARGUNUSED(vector));
	return (IST_NONE);
#else
	switch (vector) {
	/* These should always use IST even without KPTI enabled. */
	case T_DBLFLT:
		return (IST_DF);
	case T_NMIFLT:
		return (IST_NMI);
	case T_MCE:
		return (IST_MCE);

	case T_BPTFLT:
	case T_SGLSTP:
		if (kpti_enable == 1) {
			return (IST_DBG);
		}
		return (IST_NONE);
	case T_STKFLT:
	case T_GPFLT:
	case T_PGFLT:
		if (kpti_enable == 1) {
			return (IST_NESTABLE);
		}
		return (IST_NONE);
	default:
		if (kpti_enable == 1) {
			return (IST_DEFAULT);
		}
		return (IST_NONE);
	}
#endif
}

void
set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
    uint_t type, uint_t dpl, uint_t ist)
{
	dp->sgd_looffset = (uintptr_t)func;
	dp->sgd_hioffset = (uintptr_t)func >> 16;
	dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
	dp->sgd_selector = (uint16_t)sel;
	dp->sgd_ist = ist;
	dp->sgd_type = type;
	dp->sgd_dpl = dpl;
	dp->sgd_p = 1;
}
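
/*
 * Example (illustrative; taken from init_idt_common() below): install an
 * interrupt gate for the page fault vector, on the IST stack chosen by
 * idt_vector_to_ist():
 *
 *	set_gatesegd(&idt[T_PGFLT], &pftrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
 *	    idt_vector_to_ist(T_PGFLT));
 */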

/*
 * Updates a single user descriptor in the GDT of the current cpu.
 * Caller is responsible for preventing cpu migration.
 */

void
gdt_update_usegd(uint_t sidx, user_desc_t *udp)
{
#if defined(DEBUG)
	/* This should never be a "system" segment, but it might be null. */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_S, !=, 0);
	}
	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the GDT whenever we change segment registers around.
	 * With KPTI on, the GDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_A, !=, 0);
	}
#endif

#if defined(__xpv)
	uint64_t dpa = CPU->cpu_m.mcpu_gdtpa + sizeof (*udp) * sidx;

	if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp))
		panic("gdt_update_usegd: HYPERVISOR_update_descriptor");

#else	/* __xpv */
	CPU->cpu_gdt[sidx] = *udp;
#endif	/* __xpv */
}
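
/*
 * Usage sketch (illustrative only; "desc" is a hypothetical user_desc_t
 * prepared with set_usegd()): a caller satisfies the no-migration
 * requirement by disabling preemption around the update:
 *
 *	kpreempt_disable();
 *	gdt_update_usegd(GDT_LWPFS, &desc);
 *	kpreempt_enable();
 */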

/*
 * Writes the single descriptor pointed to by udp into a process's
 * LDT entry pointed to by ldp.
 */
int
ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
{
#if defined(DEBUG)
	/* This should never be a "system" segment, but it might be null. */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_S, !=, 0);
	}
	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the LDT whenever we change segment registers around.
	 * With KPTI on, the LDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_A, !=, 0);
	}
#endif

#if defined(__xpv)
	uint64_t dpa;

	dpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)ldp)) |
	    ((uintptr_t)ldp & PAGEOFFSET);

	/*
	 * The hypervisor is a little more restrictive about what it
	 * supports in the LDT.
	 */
	if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp) != 0)
		return (EINVAL);

#else	/* __xpv */
	*ldp = *udp;

#endif	/* __xpv */
	return (0);
}

#if defined(__xpv)

/*
 * Converts hw format gate descriptor into pseudo-IDT format for the
 * hypervisor.  Returns true if a valid entry was written.
 */
int
xen_idt_to_trap_info(uint_t vec, gate_desc_t *sgd, void *ti_arg)
{
	trap_info_t *ti = ti_arg;	/* XXPV	Aargh - segments.h comment */

	/*
	 * skip holes in the IDT
	 */
	if (GATESEG_GETOFFSET(sgd) == 0)
		return (0);

	ASSERT(sgd->sgd_type == SDT_SYSIGT);
	ti->vector = vec;
	TI_SET_DPL(ti, sgd->sgd_dpl);

	/*
	 * Is this an interrupt gate?
	 */
	if (sgd->sgd_type == SDT_SYSIGT) {
		/* LINTED */
		TI_SET_IF(ti, 1);
	}
	ti->cs = sgd->sgd_selector;
	ti->cs |= SEL_KPL;	/* force into ring 3. see KCS_SEL  */
	ti->address = GATESEG_GETOFFSET(sgd);
	return (1);
}

/*
 * Convert a single hw format gate descriptor and write it into our virtual
 * IDT.
 */
void
xen_idt_write(gate_desc_t *sgd, uint_t vec)
{
	trap_info_t trapinfo[2];

	bzero(trapinfo, sizeof (trapinfo));
	if (xen_idt_to_trap_info(vec, sgd, &trapinfo[0]) == 0)
		return;
	if (xen_set_trap_table(trapinfo) != 0)
		panic("xen_idt_write: xen_set_trap_table() failed");
}

#endif	/* __xpv */


/*
 * Build kernel GDT.
 */

static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	ASSERT3P(gdt, !=, NULL);

	init_boot_gdt(gdt);

	/*
	 * 64-bit kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 64-bit kernel data segment. The limit attribute is ignored in
	 * 64-bit mode, but we set it here to SDP_LIMIT_MAX so that we can
	 * use the SYSRET instruction to return from system calls back to
	 * 32-bit applications.  SYSRET doesn't update the base, limit, or
	 * attributes of %ss or %ds descriptors. We therefore must ensure
	 * that the kernel uses something, though it will be ignored by
	 * hardware, that is compatible with 32-bit apps. For the same
	 * reason we must set the default op size of this descriptor to
	 * 32-bit operands.
	 */
	set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, SDP_LIMIT_MAX, SDT_MEMRWA,
	    SEL_KPL, SDP_PAGES, SDP_OP32);
	gdt[GDT_KDATA].usd_def32 = 1;

	/*
	 * 64-bit user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 32-bit user code segment.
	 */
	set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, SDP_LIMIT_MAX, SDT_MEMERA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * See gdt_ucode32() and gdt_ucode_native().
	 */
	ucs_on = ucs_off = gdt[GDT_UCODE];
	ucs_off.usd_p = 0;	/* forces #np fault */

	ucs32_on = ucs32_off = gdt[GDT_U32CODE];
	ucs32_off.usd_p = 0;	/* forces #np fault */

	/*
	 * 32 and 64 bit data segments can actually share the same descriptor.
	 * In long mode only the present bit is checked but all other fields
	 * are loaded. But in compatibility mode all fields are interpreted
	 * as in legacy mode so they must be set correctly for a 32-bit data
	 * segment.
	 */
	set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, SDP_LIMIT_MAX, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

#if !defined(__xpv)

	/*
	 * The 64-bit kernel has no default LDT. By default, the LDT descriptor
	 * in the GDT is 0.
	 */

	/*
	 * Kernel TSS
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif	/* !__xpv */

	/*
	 * Initialize fs and gs descriptors for 32 bit processes.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, SDP_LIMIT_MAX, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, SDP_LIMIT_MAX, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt[i], SDP_SHORT, NULL, SDP_LIMIT_MAX, SDT_MEMRWA,
		    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize convenient zero base user descriptors for clearing
	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
	 * an example.
	 */
	set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
	set_usegd(&zero_u32desc, SDP_SHORT, 0, SDP_LIMIT_MAX, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);
}

#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */
	ulong_t addr;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	ASSERT3P(gdt0, !=, NULL);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, SDP_LIMIT_MAX,
		    SDT_MEMRWA, SEL_KPL, SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, SDP_LIMIT_MAX,
		    SDT_MEMERA, SEL_KPL, SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT.
	 * On 64-bit, fixup KCS_SEL to be in ring 3.
	 * See KCS_SEL in segments.h.
	 */
	load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 *  setup %gs for kernel
	 */
	xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase".  So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	addr = 0x200000000ul;
	xen_set_segment_base(SEGBASE_FS, addr);
	xen_set_segment_base(SEGBASE_GS_USER, addr);
	xen_set_segment_base(SEGBASE_GS_USER_SEL, 0);

	return (gdt0);
}

#else	/* __xpv */

static user_desc_t *
init_gdt(void)
{
	desctbr_t	r_gdt;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 *  setup %gs for kernel
	 */
	wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase".  So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	wrmsr(MSR_AMD_FSBASE, 0x200000000ul);
	wrmsr(MSR_AMD_KGSBASE, 0x200000000ul);
	return (gdt0);
}

#endif	/* __xpv */


/*
 * Build kernel IDT.
 *
 * Note that for amd64 we pretty much require every gate to be an interrupt
 * gate which blocks interrupts atomically on entry; that's because of our
 * dependency on using 'swapgs' every time we come into the kernel to find
 * the cpu structure. If we get interrupted just before doing that, %cs could
 * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
 * %gsbase is really still pointing at something in userland. Bad things will
 * ensue. We also use interrupt gates for i386 even though this is not
 * required for some traps.
 *
 * Perhaps they should have invented a trap gate that does an atomic swapgs?
 */
static void
init_idt_common(gate_desc_t *idt)
{
	set_gatesegd(&idt[T_ZERODIV],
	    (kpti_enable == 1) ? &tr_div0trap : &div0trap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
	set_gatesegd(&idt[T_SGLSTP],
	    (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP));
	set_gatesegd(&idt[T_NMIFLT],
	    (kpti_enable == 1) ? &tr_nmiint : &nmiint,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT));
	set_gatesegd(&idt[T_BPTFLT],
	    (kpti_enable == 1) ? &tr_brktrap : &brktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT));
	set_gatesegd(&idt[T_OVFLW],
	    (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW));
	set_gatesegd(&idt[T_BOUNDFLT],
	    (kpti_enable == 1) ? &tr_boundstrap : &boundstrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT));
	set_gatesegd(&idt[T_ILLINST],
	    (kpti_enable == 1) ? &tr_invoptrap : &invoptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST));
	set_gatesegd(&idt[T_NOEXTFLT],
	    (kpti_enable == 1) ? &tr_ndptrap : &ndptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT));

	/*
	 * double fault handler.
	 *
	 * Note that on the hypervisor a guest does not receive #df faults.
	 * Instead a failsafe event is injected into the guest if its selectors
	 * and/or stack is in a broken state. See xen_failsafe_callback.
	 */
#if !defined(__xpv)
	set_gatesegd(&idt[T_DBLFLT],
	    (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT));
#endif	/* !__xpv */

	/*
	 * T_EXTOVRFLT coprocessor-segment-overrun not supported.
	 */
	set_gatesegd(&idt[T_TSSFLT],
	    (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT));
	set_gatesegd(&idt[T_SEGFLT],
	    (kpti_enable == 1) ? &tr_segnptrap : &segnptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT));
	set_gatesegd(&idt[T_STKFLT],
	    (kpti_enable == 1) ? &tr_stktrap : &stktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT));
	set_gatesegd(&idt[T_GPFLT],
	    (kpti_enable == 1) ? &tr_gptrap : &gptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT));
	set_gatesegd(&idt[T_PGFLT],
	    (kpti_enable == 1) ? &tr_pftrap : &pftrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
	set_gatesegd(&idt[T_EXTERRFLT],
	    (kpti_enable == 1) ? &tr_ndperr : &ndperr,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT));
	set_gatesegd(&idt[T_ALIGNMENT],
	    (kpti_enable == 1) ? &tr_achktrap : &achktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT));
	set_gatesegd(&idt[T_MCE],
	    (kpti_enable == 1) ? &tr_mcetrap : &mcetrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE));
	set_gatesegd(&idt[T_SIMDFPE],
	    (kpti_enable == 1) ? &tr_xmtrap : &xmtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE));

	/*
	 * install fast trap handler at 210.
	 */
	set_gatesegd(&idt[T_FASTTRAP],
	    (kpti_enable == 1) ? &tr_fasttrap : &fasttrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP));

	/*
	 * System call handler.
	 */
	set_gatesegd(&idt[T_SYSCALLINT],
	    (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT));

	/*
	 * Install the DTrace interrupt handler for the pid provider.
	 */
	set_gatesegd(&idt[T_DTRACE_RET],
	    (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET));

	/*
	 * Prepare interposing descriptor for the syscall handler
	 * and cache copy of the default descriptor.
	 */
	brand_tbl[0].ih_inum = T_SYSCALLINT;
	brand_tbl[0].ih_default_desc = idt[T_SYSCALLINT];

	set_gatesegd(&(brand_tbl[0].ih_interp_desc),
	    (kpti_enable == 1) ? &tr_brand_sys_syscall_int :
	    &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL,
	    idt_vector_to_ist(T_SYSCALLINT));

	brand_tbl[1].ih_inum = 0;
}

#if defined(__xpv)

static void
init_idt(gate_desc_t *idt)
{
	init_idt_common(idt);
}

#else	/* __xpv */

static void
init_idt(gate_desc_t *idt)
{
	char	ivctname[80];
	void	(*ivctptr)(void);
	int	i;

	/*
	 * Initialize entire table with 'reserved' trap and then overwrite
	 * specific entries. T_EXTOVRFLT (9) is unsupported and reserved
	 * since it can only be generated on a 386 processor. 15 is also
	 * unsupported and reserved.
	 */
	for (i = 0; i < NIDT; i++) {
		set_gatesegd(&idt[i],
		    (kpti_enable == 1) ? &tr_resvtrap : &resvtrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(T_RESVTRAP));
	}

	/*
	 * 20-31 reserved
	 */
	for (i = 20; i < 32; i++) {
		set_gatesegd(&idt[i],
		    (kpti_enable == 1) ? &tr_invaltrap : &invaltrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(T_INVALTRAP));
	}

	/*
	 * interrupts 32 - 255
	 */
	for (i = 32; i < 256; i++) {
		(void) snprintf(ivctname, sizeof (ivctname),
		    (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i);
		ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
		if (ivctptr == NULL)
			panic("kobj_getsymvalue(%s) failed", ivctname);

		set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(i));
	}

	/*
	 * Now install the common ones. Note that it will overlay some
	 * entries installed above like T_SYSCALLINT, T_FASTTRAP etc.
	 */
	init_idt_common(idt);
}

#endif	/* __xpv */

/*
 * The kernel does not deal with LDTs unless a user explicitly creates
 * one. Under normal circumstances, the LDTR contains 0. Any process attempting
 * to reference the LDT will therefore cause a #gp. System calls made via the
 * obsolete lcall mechanism are emulated by the #gp fault handler.
 */
static void
init_ldt(void)
{
#if defined(__xpv)
	xen_set_ldt(NULL, 0);
#else
	wr_ldtr(0);
#endif
}

#if !defined(__xpv)

static void
init_tss(void)
{
	extern struct cpu cpus[];

	/*
	 * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each
	 * context switch but it'll be overwritten with this same value anyway.
	 */
	if (kpti_enable == 1) {
		ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
	}

	/* Set up the IST stacks for double fault, NMI, MCE. */
	ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)];
	ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)];
	ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)];

	/*
	 * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is
	 * enabled), and also for KDI (always).
	 */
	ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;

	if (kpti_enable == 1) {
		/* This IST stack is used for #GP,#PF,#SS (fault) interrupts. */
		ktss0->tss_ist5 =
		    (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp;

		/* This IST stack is used for all other intrs (for KPTI). */
		ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
	}

	/*
	 * Set I/O bit map offset equal to size of TSS segment limit
	 * for no I/O permission map. This will force all user I/O
	 * instructions to generate #gp fault.
	 */
	ktss0->tss_bitmapbase = sizeof (*ktss0);

	/*
	 * Point %tr to descriptor for ktss0 in gdt.
	 */
	wr_tsr(KTSS_SEL);
}

#endif	/* !__xpv */

#if defined(__xpv)

void
init_desctbls(void)
{
	uint_t vec;
	user_desc_t *gdt;

	/*
	 * Setup and install our GDT.
	 */
	gdt = init_gdt();

	/*
	 * Store static pa of gdt to speed up pa_to_ma() translations
	 * on lwp context switches.
	 */
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;
	CPU->cpu_m.mcpu_gdtpa = pfn_to_pa(va_to_pfn(gdt));

	/*
	 * Setup and install our IDT.
	 */
#if !defined(__lint)
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
	init_idt(idt0);
	for (vec = 0; vec < NIDT; vec++)
		xen_idt_write(&idt0[vec], vec);

	CPU->cpu_idt = idt0;

	/*
	 * set default kernel stack
	 */
	xen_stack_switch(KDS_SEL,
	    (ulong_t)&dblfault_stack0[sizeof (dblfault_stack0)]);

	xen_init_callbacks();

	init_ldt();
}

#else	/* __xpv */

void
init_desctbls(void)
{
	user_desc_t *gdt;
	desctbr_t idtr;

	/*
	 * Allocate IDT and TSS structures on unique pages for better
	 * performance in virtual machines.
	 */
#if !defined(__lint)
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
#if !defined(__lint)
	ASSERT(sizeof (*ktss0) <= PAGESIZE);
#endif
	ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(ktss0, PAGESIZE);

	/*
	 * Setup and install our GDT.
	 */
	gdt = init_gdt();
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;

	/*
	 * Initialize this CPU's LDT.
	 */
	CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA,
	    LDT_CPU_SIZE, PAGESIZE);
	bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
	CPU->cpu_m.mcpu_ldt_len = 0;

	/*
	 * Setup and install our IDT.
	 */
	init_idt(idt0);

	idtr.dtr_base = (uintptr_t)idt0;
	idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
	wr_idtr(&idtr);
	CPU->cpu_idt = idt0;

	init_tss();
	CPU->cpu_tss = ktss0;
	init_ldt();

	/* Stash this so that the NMI,MCE,#DF and KDI handlers can use it. */
	kpti_safe_cr3 = (uint64_t)getcr3();
}

#endif	/* __xpv */

#ifndef __xpv
/*
 * As per Intel Vol 3 27.5.2, the GDTR limit is reset to 0xffff on a VM exit,
 * so we have to manually fix it up ourselves.
 *
 * Before calling this, the caller may still need to ensure that it can't go
 * off-CPU with the incorrect limit (for example, by disabling preemption).
 */
void
reset_gdtr_limit(void)
{
	ulong_t flags = intr_clear();
	desctbr_t gdtr;

	rd_gdtr(&gdtr);
	gdtr.dtr_limit = (sizeof (user_desc_t) * NGDT) - 1;
	wr_gdtr(&gdtr);

	intr_restore(flags);
}
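
/*
 * Usage sketch (illustrative only):
 *
 *	kpreempt_disable();
 *	reset_gdtr_limit();
 *	kpreempt_enable();
 */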
#endif /* __xpv */

/*
 * We need a GDT owned by the kernel and not the bootstrap relatively
 * early in kernel initialization (e.g., to have segments we can reliably
 * catch an exception on).
 *
 * Initializes a GDT with segments normally defined in the boot loader.
 */
void
init_boot_gdt(user_desc_t *bgdt)
{
	ASSERT3P(bgdt, !=, NULL);

#ifdef	__xpv
	/* XXX: It is unclear why this 32-bit data segment is marked long. */
	set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, SDP_LIMIT_MAX, SDT_MEMRWA,
	    SEL_KPL, SDP_PAGES, SDP_OP32);
#else
	/*
	 * Reset boot segments.  These ostensibly come from the boot loader,
	 * but we reset them to match our expectations, particularly if we
	 * are not using that loader.
	 */
	set_usegd(&bgdt[GDT_B32DATA], SDP_SHORT, NULL, SDP_LIMIT_MAX,
	    SDT_MEMRWA, SEL_KPL, SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B32CODE], SDP_SHORT, NULL, SDP_LIMIT_MAX,
	    SDT_MEMERA, SEL_KPL, SDP_PAGES, SDP_OP32);

	/*
	 * 16-bit segments for making BIOS calls (not applicable on all
	 * architectures).
	 */
	set_usegd(&bgdt[GDT_B16CODE], SDP_SHORT, NULL, SDP_LIMIT_MAX,
	    SDT_MEMERA, SEL_KPL, 0, 0);
	/*
	 * XXX: SDP_OP32 makes this a 32-bit segment, which seems wrong
	 * here, but that's what boot_gdt.s used.
	 */
	set_usegd(&bgdt[GDT_B16DATA], SDP_SHORT, NULL, SDP_LIMIT_MAX,
	    SDT_MEMRWA, SEL_KPL, 0, SDP_OP32);
#endif	/* __xpv */

	/*
	 * A 64-bit code segment used in early boot.  Early IDTs refer to this.
	 */
	set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, SDP_LIMIT_MAX, SDT_MEMERA,
	    SEL_KPL, SDP_PAGES, SDP_OP32);
}

/*
 * Enable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the branded entry points.
 */
void
brand_interpositioning_enable(void *arg __unused)
{
	gate_desc_t	*idt = CPU->cpu_idt;
	int		i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc;
#if defined(__xpv)
		xen_idt_write(&idt[brand_tbl[i].ih_inum],
		    brand_tbl[i].ih_inum);
#endif
	}

#if defined(__xpv)

	/*
	 * Currently the hypervisor only supports 64-bit syscalls via the
	 * syscall instruction. The 32-bit syscalls are handled by the
	 * interrupt gate above.
	 */
	xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32);
		} else {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
		}
	}

#endif

	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_INTC_SEP_EIP,
			    (uintptr_t)tr_brand_sys_sysenter);
		} else {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
		}
	}
}

/*
 * Disable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the standard entry points, which bypass the interpositioning hooks.
 */
void
brand_interpositioning_disable(void *arg __unused)
{
	gate_desc_t	*idt = CPU->cpu_idt;
	int i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc;
#if defined(__xpv)
		xen_idt_write(&idt[brand_tbl[i].ih_inum],
		    brand_tbl[i].ih_inum);
#endif
	}

#if defined(__xpv)

	/*
	 * See comment above in brand_interpositioning_enable.
	 */
	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32);
		} else {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
		}
	}

#endif

	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter);
		} else {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
		}
	}
}
1226