xref: /illumos-gate/usr/src/uts/intel/os/sysi86.c (revision 5a469116)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2021 Joyent, Inc.
24  */
25 
26 /*	Copyright (c) 1990, 1991 UNIX System Laboratories, Inc.	*/
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T	*/
28 /*	  All Rights Reserved	*/
29 
30 /*	Copyright (c) 1987, 1988 Microsoft Corporation	*/
31 /*	  All Rights Reserved	*/
32 
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/sysmacros.h>
36 #include <sys/systm.h>
37 #include <sys/signal.h>
38 #include <sys/errno.h>
39 #include <sys/fault.h>
40 #include <sys/syscall.h>
41 #include <sys/cpuvar.h>
42 #include <sys/sysi86.h>
43 #include <sys/psw.h>
44 #include <sys/cred.h>
45 #include <sys/policy.h>
46 #include <sys/thread.h>
47 #include <sys/debug.h>
48 #include <sys/ontrap.h>
49 #include <sys/privregs.h>
50 #include <sys/x86_archext.h>
51 #include <sys/vmem.h>
52 #include <sys/kmem.h>
53 #include <sys/mman.h>
54 #include <sys/archsystm.h>
55 #include <vm/hat.h>
56 #include <vm/as.h>
57 #include <vm/seg.h>
58 #include <vm/seg_kmem.h>
59 #include <vm/faultcode.h>
60 #include <sys/fp.h>
61 #include <sys/cmn_err.h>
62 #include <sys/segments.h>
63 #include <sys/clock.h>
64 #include <vm/hat_i86.h>
65 #if defined(__xpv)
66 #include <sys/hypervisor.h>
67 #include <sys/note.h>
68 #endif
69 
70 static void ldt_alloc(proc_t *, uint_t);
71 static void ldt_free(proc_t *);
72 static void ldt_dup(proc_t *, proc_t *);
73 static void ldt_grow(proc_t *, uint_t);
74 
75 /*
76  * sysi86 System Call
77  */
78 
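/*
 * Illustrative usage sketch (assumptions: userland reaches this handler
 * through the sysi86(2) wrapper declared in <sys/sysi86.h>, with a
 * prototype along the lines of "int sysi86(int cmd, ...)"):
 *
 *	#include <sys/sysi86.h>
 *
 *	int fphw;
 *
 *	if (sysi86(SI86FPHW, &fphw) == -1)
 *		perror("sysi86");	(e.g. EFAULT if &fphw is a bad address)
 *
 * Each case below validates its arguments, does the privileged work, and
 * the call returns 0 on success or fails with an errno set via set_errno().
 */
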
79 /* ARGSUSED */
80 int
81 sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
82 {
83 	struct ssd ssd;
84 	int error = 0;
85 	int c;
86 	proc_t *pp = curproc;
87 
88 	switch (cmd) {
89 
90 	/*
91 	 * The SI86V86 subsystem call of the SYSI86 system call
92 	 * supports only one subcode -- V86SC_IOPL.
93 	 */
94 	case SI86V86:
95 		if (arg1 == V86SC_IOPL) {
96 			struct regs *rp = lwptoregs(ttolwp(curthread));
97 			greg_t oldpl = rp->r_ps & PS_IOPL;
98 			greg_t newpl = arg2 & PS_IOPL;
99 
100 			/*
101 			 * Must be privileged to run this system call
102 			 * if giving more io privilege.
103 			 */
104 			if (newpl > oldpl && (error =
105 			    secpolicy_sys_config(CRED(), B_FALSE)) != 0)
106 				return (set_errno(error));
107 #if defined(__xpv)
108 			const struct ctxop_template xen_tpl = {
109 				.ct_rev		= CTXOP_TPL_REV,
110 				.ct_save	= xen_disable_user_iopl,
111 				.ct_restore	= xen_enable_user_iopl,
112 				.ct_exit	= xen_disable_user_iopl,
113 			};
114 			struct ctxop *ctx;
115 
116 			ctx = ctxop_allocate(&xen_tpl, NULL);
117 			kpreempt_disable();
118 			ctxop_attach(curthread, ctx);
119 			xen_enable_user_iopl(NULL);
120 			kpreempt_enable();
121 #else
122 			rp->r_ps ^= oldpl ^ newpl;
123 #endif
124 		} else
125 			error = EINVAL;
126 		break;
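	/*
	 * A hedged example of the V86SC_IOPL subcode above: a sufficiently
	 * privileged process that wants to issue in/out instructions
	 * directly might raise its IOPL to 3 with something like
	 *
	 *	sysi86(SI86V86, V86SC_IOPL, PS_IOPL);
	 *
	 * where PS_IOPL (<sys/psw.h>) is the IOPL field mask of the flags
	 * register.  Lowering the IOPL again requires no privilege, since
	 * the secpolicy_sys_config() check only applies when newpl > oldpl.
	 */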
127 
128 	/*
129 	 * Set a segment descriptor
130 	 */
131 	case SI86DSCR:
132 		/*
133 		 * There are considerable problems here manipulating
134 		 * resources shared by many running lwps.  Get everyone
135 		 * into a safe state before changing the LDT.
136 		 */
137 		if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) {
138 			error = EINTR;
139 			break;
140 		}
141 
142 		if (get_udatamodel() == DATAMODEL_LP64) {
143 			error = EINVAL;
144 			break;
145 		}
146 
147 		if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) {
148 			error = EFAULT;
149 			break;
150 		}
151 
152 		error = setdscr(&ssd);
153 
154 		mutex_enter(&pp->p_lock);
155 		if (curthread != pp->p_agenttp)
156 			continuelwps(pp);
157 		mutex_exit(&pp->p_lock);
158 		break;
159 
160 	case SI86FPHW:
161 		c = fp_kind & 0xff;
162 		if (suword32((void *)arg1, c) == -1)
163 			error = EFAULT;
164 		break;
165 
166 	case SI86FPSTART:
167 		/*
168 		 * arg1 is the address of _fp_hw
169 		 * arg2 is the desired x87 FCW value
170 		 * arg3 is the desired SSE MXCSR value
171 		 * a return value of one means SSE hardware, else none.
172 		 */
173 		c = fp_kind & 0xff;
174 		if (suword32((void *)arg1, c) == -1) {
175 			error = EFAULT;
176 			break;
177 		}
178 		fpsetcw((uint16_t)arg2, (uint32_t)arg3);
179 		return ((fp_kind & __FP_SSE) ? 1 : 0);
180 
181 	/* real time clock management commands */
182 
183 	case WTODC:
184 		if ((error = secpolicy_settime(CRED())) == 0) {
185 			timestruc_t ts;
186 			mutex_enter(&tod_lock);
187 			gethrestime(&ts);
188 			tod_set(ts);
189 			mutex_exit(&tod_lock);
190 		}
191 		break;
192 
193 /* Give some timezone playing room */
194 #define	ONEWEEK	(7 * 24 * 60 * 60)
195 
196 	case SGMTL:
197 		/*
198 		 * Called from 32 bit land, negative values
199 		 * are not sign extended, so we do that here
200 		 * by casting it to an int and back.  We also
201 		 * clamp the value to within reason and detect
202 		 * when a 64 bit call overflows an int.
203 		 */
204 		if ((error = secpolicy_settime(CRED())) == 0) {
205 			int newlag = (int)arg1;
206 
207 #ifdef _SYSCALL32_IMPL
208 			if (get_udatamodel() == DATAMODEL_NATIVE &&
209 			    (long)newlag != (long)arg1) {
210 				error = EOVERFLOW;
211 			} else
212 #endif
213 			if (newlag >= -ONEWEEK && newlag <= ONEWEEK)
214 				sgmtl(newlag);
215 			else
216 				error = EOVERFLOW;
217 		}
218 		break;
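	/*
	 * Worked example of the overflow check above (a sketch): a 64-bit
	 * caller passing arg1 = 0x100000000 truncates to newlag == 0, so
	 * (long)newlag != (long)arg1 and the call fails with EOVERFLOW
	 * rather than silently setting a zero lag.  A value such as -3600
	 * survives the cast and is well within +/- ONEWEEK, so sgmtl() is
	 * called with it.
	 */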
219 
220 	case GGMTL:
221 		if (get_udatamodel() == DATAMODEL_NATIVE) {
222 			if (sulword((void *)arg1, ggmtl()) == -1)
223 				error = EFAULT;
224 #ifdef _SYSCALL32_IMPL
225 		} else {
226 			time_t gmtl;
227 
228 			if ((gmtl = ggmtl()) > INT32_MAX) {
229 				/*
230 				 * Since gmt_lag can at most be
231 				 * +/- 12 hours, something is
232 				 * *seriously* messed up here.
233 				 */
234 				error = EOVERFLOW;
235 			} else if (suword32((void *)arg1, (int32_t)gmtl) == -1)
236 				error = EFAULT;
237 #endif
238 		}
239 		break;
240 
241 	case RTCSYNC:
242 		if ((error = secpolicy_settime(CRED())) == 0)
243 			rtcsync();
244 		break;
245 
246 	/* END OF real time clock management commands */
247 
248 	default:
249 		error = EINVAL;
250 		break;
251 	}
252 	return (error == 0 ? 0 : set_errno(error));
253 }
254 
255 void
256 usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel)
257 {
258 	ssd->bo = USEGD_GETBASE(usd);
259 	ssd->ls = USEGD_GETLIMIT(usd);
260 	ssd->sel = sel;
261 
262 	/*
263 	 * set type, dpl and present bits.
264 	 */
265 	ssd->acc1 = usd->usd_type;
266 	ssd->acc1 |= usd->usd_dpl << 5;
267 	ssd->acc1 |= usd->usd_p << (5 + 2);
268 
269 	/*
270 	 * set avl, DB and granularity bits.
271 	 */
272 	ssd->acc2 = usd->usd_avl;
273 
274 	ssd->acc2 |= usd->usd_long << 1;
275 
276 	ssd->acc2 |= usd->usd_def32 << (1 + 1);
277 	ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
278 }
279 
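/*
 * For reference, the acc1/acc2 packing implied by usd_to_ssd() above and
 * ssd_to_usd() below (bit 0 is the least significant bit):
 *
 *	acc1:	bits 0-4	segment type (incl. S bit)	usd_type
 *		bits 5-6	descriptor privilege level	usd_dpl
 *		bit  7		present				usd_p
 *
 *	acc2:	bit  0		available to software		usd_avl
 *		bit  1		64-bit code segment (L)		usd_long
 *		bit  2		default operand size (DB)	usd_def32
 *		bit  3		granularity			usd_gran
 */
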
280 static void
281 ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
282 {
283 
284 	ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);
285 
286 	USEGD_SETBASE(usd, ssd->bo);
287 	USEGD_SETLIMIT(usd, ssd->ls);
288 
289 	/*
290 	 * Set type, dpl and present bits.
291 	 *
292 	 * Force the "accessed" bit to on so that we don't run afoul of
293 	 * KPTI.
294 	 */
295 	usd->usd_type = ssd->acc1 | SDT_A;
296 	usd->usd_dpl = ssd->acc1 >> 5;
297 	usd->usd_p = ssd->acc1 >> (5 + 2);
298 
299 	ASSERT(usd->usd_type >= SDT_MEMRO);
300 	ASSERT(usd->usd_dpl == SEL_UPL);
301 
302 	/*
303 	 * 64-bit code selectors are never allowed in the LDT.
304 	 * Reserved bit is always 0 on 32-bit systems.
305 	 */
306 	usd->usd_long = 0;
307 
308 	/*
309 	 * set avl, DB and granularity bits.
310 	 */
311 	usd->usd_avl = ssd->acc2;
312 	usd->usd_def32 = ssd->acc2 >> (1 + 1);
313 	usd->usd_gran = ssd->acc2 >> (1 + 1 + 1);
314 }
315 
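/*
 * A hedged userland sketch tying the converters together: a 32-bit process
 * installing a read/write data descriptor in its LDT via SI86DSCR might
 * fill in a struct ssd roughly as follows.  The selector construction and
 * the SDT_MEMRWA/SEL_UPL spellings are assumed from <sys/segments.h>; the
 * acc1/acc2 packing follows ssd_to_usd() above.
 *
 *	struct ssd ssd;
 *
 *	bzero(&ssd, sizeof (ssd));
 *	ssd.sel  = (LDT_UDBASE << 3) | 0x4 | SEL_UPL;	(TI=1: LDT, RPL=3)
 *	ssd.bo   = (uint32_t)base;			(segment base)
 *	ssd.ls   = limit;				(segment limit)
 *	ssd.acc1 = SDT_MEMRWA | (SEL_UPL << 5) | (1 << 7);
 *	ssd.acc2 = 1 << 2;				(DB=1: 32-bit default)
 *
 *	if (sysi86(SI86DSCR, &ssd) == -1)
 *		... EINVAL, EBUSY or EFAULT, as decided by setdscr() below.
 */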
316 
317 
318 /*
319  * Load LDT register with the current process's LDT.
320  */
321 static void
322 ldt_load(void)
323 {
324 #if defined(__xpv)
325 	xen_set_ldt(curproc->p_ldt, curproc->p_ldtlimit + 1);
326 #else
327 	size_t len;
328 	system_desc_t desc;
329 
330 	/*
331 	 * Before we can use the LDT on this CPU, we must install the LDT in the
332 	 * user mapping table.
333 	 */
334 	len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t);
335 	bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len);
336 	CPU->cpu_m.mcpu_ldt_len = len;
337 	set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL);
338 	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc;
339 
340 	wr_ldtr(ULDT_SEL);
341 #endif
342 }
343 
344 /*
345  * Store a NULL selector in the LDTR. All subsequent illegal references to
346  * the LDT will result in a #gp.
347  */
348 void
349 ldt_unload(void)
350 {
351 #if defined(__xpv)
352 	xen_set_ldt(NULL, 0);
353 #else
354 	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc;
355 	wr_ldtr(0);
356 
357 	bzero(CPU->cpu_m.mcpu_ldt, CPU->cpu_m.mcpu_ldt_len);
358 	CPU->cpu_m.mcpu_ldt_len = 0;
359 #endif
360 }
361 
362 /*ARGSUSED*/
363 static void
364 ldt_savectx(proc_t *p)
365 {
366 	ASSERT(p->p_ldt != NULL);
367 	ASSERT(p == curproc);
368 
369 	/*
370 	 * The 64-bit kernel must be sure to clear any stale ldt
371 	 * selectors when context switching away from a process that
372 	 * has a private ldt. Consider the following example:
373 	 *
374 	 *	Wine creates an ldt descriptor and points a segment register
375 	 *	to it.
376 	 *
377 	 *	We then context switch away from the wine lwp to a kernel
378 	 *	thread and hit a breakpoint in the kernel with kmdb.
379 	 *
380 	 *	When we continue and resume from kmdb we will #gp
381 	 *	fault since kmdb will have saved the stale ldt selector
382 	 *	from wine and will try to restore it but we are no longer in
383 	 *	the context of the wine process and do not have our
384 	 *	ldtr register pointing to the private ldt.
385 	 */
386 	reset_sregs();
387 
388 	ldt_unload();
389 	cpu_fast_syscall_enable();
390 }
391 
392 static void
393 ldt_restorectx(proc_t *p)
394 {
395 	ASSERT(p->p_ldt != NULL);
396 	ASSERT(p == curproc);
397 
398 	ldt_load();
399 	cpu_fast_syscall_disable();
400 }
401 
402 /*
403  * At exec time, we need to clear up our LDT context and re-enable fast syscalls
404  * for the new process image.
405  *
406  * The same is true for the other case, where we have:
407  *
408  * proc_exit()
409  *  ->exitpctx()->ldt_savectx()
410  *  ->freepctx()->ldt_freectx()
411  *
412  * Because pre-emption is not prevented between the two callbacks, we could have
413  * come off CPU, and brought back LDT context when coming back on CPU via
414  * ldt_restorectx().
415  */
416 /* ARGSUSED */
417 static void
418 ldt_freectx(proc_t *p, int isexec)
419 {
420 	ASSERT(p->p_ldt != NULL);
421 	ASSERT(p == curproc);
422 
423 	kpreempt_disable();
424 	ldt_free(p);
425 	cpu_fast_syscall_enable();
426 	kpreempt_enable();
427 }
428 
429 /*
430  * Install ctx op that ensures syscall/sysenter are disabled.
431  * See comments below.
432  *
433  * When a thread with a private LDT forks, the new process
434  * must have the LDT context ops installed.
435  */
436 /* ARGSUSED */
437 static void
438 ldt_installctx(proc_t *p, proc_t *cp)
439 {
440 	proc_t		*targ = p;
441 	kthread_t	*t;
442 
443 	/*
444 	 * If this is a fork, operate on the child process.
445 	 */
446 	if (cp != NULL) {
447 		targ = cp;
448 		ldt_dup(p, cp);
449 	}
450 
451 	/*
452 	 * The process context ops expect the target process as their argument.
453 	 */
454 	ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx,
455 	    ldt_installctx, ldt_savectx, ldt_freectx) == 0);
456 
457 	installpctx(targ, targ, ldt_savectx, ldt_restorectx,
458 	    ldt_installctx, ldt_savectx, ldt_freectx);
459 
460 	/*
461 	 * We've just disabled fast system call and return instructions; take
462 	 * the slow path out to make sure we don't try to use one to return
463 	 * back to user. We must set t_post_sys for every thread in the
464 	 * process to make sure none of them escape out via fast return.
465 	 */
466 
467 	mutex_enter(&targ->p_lock);
468 	t = targ->p_tlist;
469 	do {
470 		t->t_post_sys = 1;
471 	} while ((t = t->t_forw) != targ->p_tlist);
472 	mutex_exit(&targ->p_lock);
473 }
474 
475 int
476 setdscr(struct ssd *ssd)
477 {
478 	ushort_t seli;		/* selector index */
479 	user_desc_t *ldp;	/* descriptor pointer */
480 	user_desc_t ndesc;	/* new descriptor */
481 	proc_t	*pp = curproc;
482 	int	rc = 0;
483 
484 	/*
485 	 * LDT segments: executable and data at DPL 3 only.
486 	 */
487 	if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
488 		return (EINVAL);
489 
490 	/*
491 	 * check the selector index.
492 	 */
493 	seli = SELTOIDX(ssd->sel);
494 	if (seli >= MAXNLDT || seli < LDT_UDBASE)
495 		return (EINVAL);
496 
497 	ndesc = null_udesc;
498 	mutex_enter(&pp->p_ldtlock);
499 
500 	/*
501 	 * If this is the first time for this process then setup a
502 	 * private LDT for it.
503 	 */
504 	if (pp->p_ldt == NULL) {
505 		ldt_alloc(pp, seli);
506 
507 		/*
508 		 * Now that this process has a private LDT, the use of
509 		 * the syscall/sysret and sysenter/sysexit instructions
510 		 * is forbidden for this process because they destroy
511 		 * the contents of %cs and %ss segment registers.
512 		 *
513 		 * Explicitly disable them here and add a context handler
514 		 * to the process. Note that disabling
515 		 * them here means we can't use sysret or sysexit on
516 		 * the way out of this system call - so we force this
517 		 * thread to take the slow path (which doesn't make use
518 		 * of sysenter or sysexit) back out.
519 		 */
520 		kpreempt_disable();
521 		ldt_installctx(pp, NULL);
522 		cpu_fast_syscall_disable();
523 		ASSERT(curthread->t_post_sys != 0);
524 		kpreempt_enable();
525 
526 	} else if (seli > pp->p_ldtlimit) {
527 		ASSERT(pp->p_pctx != NULL);
528 
529 		/*
530 		 * Increase size of ldt to include seli.
531 		 */
532 		ldt_grow(pp, seli);
533 	}
534 
535 	ASSERT(seli <= pp->p_ldtlimit);
536 	ldp = &pp->p_ldt[seli];
537 
538 	/*
539 	 * On the 64-bit kernel, this is where things get more subtle.
540 	 * Recall that in the 64-bit kernel, when we enter the kernel we
541 	 * deliberately -don't- reload the segment selectors we came in on
542 	 * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
543 	 * and the underlying descriptors are essentially ignored by the
544 	 * hardware in long mode - except for the base that we override with
545 	 * the gsbase MSRs.
546 	 *
547 	 * However, there's one unfortunate issue with this rosy picture --
548 	 * a descriptor that's not marked as 'present' will still generate
549 	 * an #np when loading a segment register.
550 	 *
551 	 * Consider this case.  An lwp creates a harmless LDT entry, points
552 	 * one of its segment registers at it, then tells the kernel (here)
553 	 * to delete it.  In the 32-bit kernel, the #np will happen on the
554 	 * way back to userland where we reload the segment registers, and be
555 	 * handled in kern_gpfault().  In the 64-bit kernel, the same thing
556 	 * will happen in the normal case too.  However, if we're trying to
557 	 * use a debugger that wants to save and restore the segment registers,
558 	 * and the debugger thinks that we have valid segment registers, we
559 	 * have the problem that the debugger will try and restore the
560 	 * segment register that points at the now 'not present' descriptor
561 	 * and will take a #np right there.
562 	 *
563 	 * We should obviously fix the debugger to be paranoid about
564 	 * -not- restoring segment registers that point to bad descriptors;
565 	 * however we can prevent the problem here if we check to see if any
566 	 * of the segment registers are still pointing at the thing we're
567 	 * destroying; if they are, return an error instead. (That is also
568 	 * a much better failure mode than SIGKILL and a core file
569 	 * from kern_gpfault().)
570 	 */
571 	if (SI86SSD_PRES(ssd) == 0) {
572 		kthread_t *t;
573 		int bad = 0;
574 
575 		/*
576 		 * Look carefully at the segment registers of every lwp
577 		 * in the process (they're all stopped by our caller).
578 		 * If we're about to invalidate a descriptor that's still
579 		 * being referenced by *any* of them, return an error,
580 		 * rather than having them #gp on their way out of the kernel.
581 		 */
582 		ASSERT(pp->p_lwprcnt == 1);
583 
584 		mutex_enter(&pp->p_lock);
585 		t = pp->p_tlist;
586 		do {
587 			klwp_t *lwp = ttolwp(t);
588 			struct regs *rp = lwp->lwp_regs;
589 			pcb_t *pcb = &lwp->lwp_pcb;
590 
591 			if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) {
592 				bad = 1;
593 				break;
594 			}
595 
596 			if (PCB_NEED_UPDATE_SEGS(pcb)) {
597 				if (ssd->sel == pcb->pcb_ds ||
598 				    ssd->sel == pcb->pcb_es ||
599 				    ssd->sel == pcb->pcb_fs ||
600 				    ssd->sel == pcb->pcb_gs) {
601 					bad = 1;
602 					break;
603 				}
604 			} else {
605 				if (ssd->sel == rp->r_ds ||
606 				    ssd->sel == rp->r_es ||
607 				    ssd->sel == rp->r_fs ||
608 				    ssd->sel == rp->r_gs) {
609 					bad = 1;
610 					break;
611 				}
612 			}
613 
614 		} while ((t = t->t_forw) != pp->p_tlist);
615 		mutex_exit(&pp->p_lock);
616 
617 		if (bad) {
618 			mutex_exit(&pp->p_ldtlock);
619 			return (EBUSY);
620 		}
621 	}
622 
623 	/*
624 	 * If acc1 is zero, clear the descriptor (including the 'present' bit).
625 	 * Make sure we update the CPU-private copy of the LDT.
626 	 */
627 	if (ssd->acc1 == 0) {
628 		rc  = ldt_update_segd(ldp, &null_udesc);
629 		kpreempt_disable();
630 		ldt_load();
631 		kpreempt_enable();
632 		mutex_exit(&pp->p_ldtlock);
633 		return (rc);
634 	}
635 
636 	/*
637 	 * Check segment type, allow segment not present and
638 	 * only user DPL (3).
639 	 */
640 	if (SI86SSD_DPL(ssd) != SEL_UPL) {
641 		mutex_exit(&pp->p_ldtlock);
642 		return (EINVAL);
643 	}
644 
645 	/*
646 	 * Do not allow 32-bit applications to create 64-bit mode code
647 	 * segments.
648 	 */
649 	if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
650 	    SI86SSD_ISLONG(ssd)) {
651 		mutex_exit(&pp->p_ldtlock);
652 		return (EINVAL);
653 	}
654 
655 	/*
656 	 * Set up a code or data user segment descriptor, making sure to update
657 	 * the CPU-private copy of the LDT.
658 	 */
659 	if (SI86SSD_ISUSEG(ssd)) {
660 		ssd_to_usd(ssd, &ndesc);
661 		rc = ldt_update_segd(ldp, &ndesc);
662 		kpreempt_disable();
663 		ldt_load();
664 		kpreempt_enable();
665 		mutex_exit(&pp->p_ldtlock);
666 		return (rc);
667 	}
668 
669 	mutex_exit(&pp->p_ldtlock);
670 	return (EINVAL);
671 }
672 
673 /*
674  * Allocate new LDT for process just large enough to contain seli.  Note we
675  * allocate and grow LDT in PAGESIZE chunks. We do this to simplify the
676  * implementation and because on the hypervisor it's required, since the LDT
677  * must live on pages that have PROT_WRITE removed and which are given to the
678  * hypervisor.
679  *
680  * Note that we don't actually load the LDT into the current CPU here: it's done
681  * later by our caller.
682  */
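/*
 * Sizing example (assuming 8-byte user_desc_t entries and 4K pages): a
 * request for a small seli rounds P2ROUNDUP((seli + 1) * 8, 4096) up to a
 * single page, giving nsels == 512 and p_ldtlimit == 511; only a request
 * for a selector index above 511 makes the LDT span a second page.
 */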
683 static void
684 ldt_alloc(proc_t *pp, uint_t seli)
685 {
686 	user_desc_t	*ldt;
687 	size_t		ldtsz;
688 	uint_t		nsels;
689 
690 	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
691 	ASSERT(pp->p_ldt == NULL);
692 	ASSERT(pp->p_ldtlimit == 0);
693 
694 	/*
695 	 * Allocate new LDT just large enough to contain seli. The LDT must
696 	 * always be allocated in units of pages for KPTI.
697 	 */
698 	ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
699 	nsels = ldtsz / sizeof (user_desc_t);
700 	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
701 
702 	ldt = kmem_zalloc(ldtsz, KM_SLEEP);
703 	ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));
704 
705 #if defined(__xpv)
706 	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
707 		panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
708 #endif
709 
710 	pp->p_ldt = ldt;
711 	pp->p_ldtlimit = nsels - 1;
712 }
713 
714 static void
715 ldt_free(proc_t *pp)
716 {
717 	user_desc_t	*ldt;
718 	size_t		ldtsz;
719 
720 	ASSERT(pp->p_ldt != NULL);
721 
722 	mutex_enter(&pp->p_ldtlock);
723 	ldt = pp->p_ldt;
724 	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
725 
726 	ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));
727 
728 	pp->p_ldt = NULL;
729 	pp->p_ldtlimit = 0;
730 	mutex_exit(&pp->p_ldtlock);
731 
732 	if (pp == curproc) {
733 		kpreempt_disable();
734 		ldt_unload();
735 		kpreempt_enable();
736 	}
737 
738 #if defined(__xpv)
739 	/*
740 	 * We are not allowed to make the ldt writable until after
741 	 * we tell the hypervisor to unload it.
742 	 */
743 	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
744 		panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
745 #endif
746 
747 	kmem_free(ldt, ldtsz);
748 }
749 
750 /*
751  * On fork copy new ldt for child.
752  */
753 static void
754 ldt_dup(proc_t *pp, proc_t *cp)
755 {
756 	size_t	ldtsz;
757 
758 	ASSERT(pp->p_ldt != NULL);
759 	ASSERT(cp != curproc);
760 
761 	/*
762 	 * I assume the parent's ldt can't increase since we're in a fork.
763 	 */
764 	mutex_enter(&pp->p_ldtlock);
765 	mutex_enter(&cp->p_ldtlock);
766 
767 	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
768 
769 	ldt_alloc(cp, pp->p_ldtlimit);
770 
771 #if defined(__xpv)
772 	/*
773 	 * Make child's ldt writable so it can be copied into from
774 	 * parent's ldt. This works since ldt_alloc above did not load
775 	 * the ldt since it's for the child process. If we tried to make
776 	 * an LDT writable that is loaded in hw the setprot operation
777 	 * would fail.
778 	 */
779 	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
780 		panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
781 #endif
782 
783 	bcopy(pp->p_ldt, cp->p_ldt, ldtsz);
784 
785 #if defined(__xpv)
786 	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
787 		panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
788 #endif
789 	mutex_exit(&cp->p_ldtlock);
790 	mutex_exit(&pp->p_ldtlock);
791 
792 }
793 
794 /*
795  * Note that we don't actually load the LDT into the current CPU here: it's done
796  * later by our caller - unless we take an error.  This works out because
797  * ldt_load() does a copy of ->p_ldt instead of directly loading it into the GDT
798  * (and therefore can't be using the freed old LDT), and by definition if the
799  * new entry didn't pass validation, then the proc shouldn't be referencing an
800  * entry in the extended region.
801  */
802 static void
803 ldt_grow(proc_t *pp, uint_t seli)
804 {
805 	user_desc_t	*oldt, *nldt;
806 	uint_t		nsels;
807 	size_t		oldtsz, nldtsz;
808 
809 	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
810 	ASSERT(pp->p_ldt != NULL);
811 	ASSERT(pp->p_ldtlimit != 0);
812 
813 	/*
814 	 * Allocate larger LDT just large enough to contain seli. The LDT must
815 	 * always be allocated in units of pages for KPTI.
816 	 */
817 	nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
818 	nsels = nldtsz / sizeof (user_desc_t);
819 	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
820 	ASSERT(nsels > pp->p_ldtlimit);
821 
822 	oldt = pp->p_ldt;
823 	oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
824 
825 	nldt = kmem_zalloc(nldtsz, KM_SLEEP);
826 	ASSERT(IS_P2ALIGNED(nldt, PAGESIZE));
827 
828 	bcopy(oldt, nldt, oldtsz);
829 
830 	/*
831 	 * unload old ldt.
832 	 */
833 	kpreempt_disable();
834 	ldt_unload();
835 	kpreempt_enable();
836 
837 #if defined(__xpv)
838 
839 	/*
840 	 * Make old ldt writable and new ldt read only.
841 	 */
842 	if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
843 		panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
844 
845 	if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
846 		panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
847 #endif
848 
849 	pp->p_ldt = nldt;
850 	pp->p_ldtlimit = nsels - 1;
851 
852 	kmem_free(oldt, oldtsz);
853 }
854