1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/errno.h>
27 #include <sys/exec.h>
28 #include <sys/file.h>
29 #include <sys/kmem.h>
30 #include <sys/modctl.h>
31 #include <sys/model.h>
32 #include <sys/proc.h>
33 #include <sys/syscall.h>
34 #include <sys/systm.h>
35 #include <sys/thread.h>
36 #include <sys/cmn_err.h>
37 #include <sys/archsystm.h>
38 #include <sys/pathname.h>
39 #include <sys/sunddi.h>
40 
41 #include <sys/machbrand.h>
42 #include <sys/brand.h>
43 #include "s10_brand.h"
44 
/*
 * Table of per-system-call flags, indexed by system call number.  It is
 * allocated (NSYSCALL zeroed bytes) and populated in _init(); an entry set
 * to 1 marks a system call the brand wants to interpose on.  _fini() hands
 * the table's address to brand_solaris_fini().
 */
char *s10_emulation_table = NULL;
46 
47 void	s10_init_brand_data(zone_t *);
48 void	s10_free_brand_data(zone_t *);
49 void	s10_setbrand(proc_t *);
50 int	s10_getattr(zone_t *, int, void *, size_t *);
51 int	s10_setattr(zone_t *, int, void *, size_t);
52 int	s10_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
53 		uintptr_t, uintptr_t, uintptr_t);
54 void	s10_copy_procdata(proc_t *, proc_t *);
55 void	s10_proc_exit(struct proc *, klwp_t *);
56 void	s10_exec();
57 int	s10_initlwp(klwp_t *);
58 void	s10_forklwp(klwp_t *, klwp_t *);
59 void	s10_freelwp(klwp_t *);
60 void	s10_lwpexit(klwp_t *);
61 int	s10_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
62 	long *, int, caddr_t, cred_t *, int);
63 void	s10_sigset_native_to_s10(sigset_t *);
64 void	s10_sigset_s10_to_native(sigset_t *);
65 
/*
 * s10 brand
 *
 * Operations vector for the solaris10 brand; s10_brand points at it.  The
 * initializer is positional, so every entry has to stay in the member order
 * of struct brand_ops, which is declared outside this file (presumably in
 * <sys/brand.h> -- confirm before reordering or inserting entries).  All
 * entries are s10_* functions from this file except lwp_setrval, and the
 * last entry is the constant S10_NSIG rather than a function.
 */
struct brand_ops s10_brops = {
	s10_init_brand_data,
	s10_free_brand_data,
	s10_brandsys,
	s10_setbrand,
	s10_getattr,
	s10_setattr,
	s10_copy_procdata,
	s10_proc_exit,
	s10_exec,
	lwp_setrval,
	s10_initlwp,
	s10_forklwp,
	s10_freelwp,
	s10_lwpexit,
	s10_elfexec,
	s10_sigset_native_to_s10,
	s10_sigset_s10_to_native,
	S10_NSIG,
};
87 
/*
 * Machine-dependent callback vector for the brand; s10_brand points at it.
 * Exactly one of the three definitions below is compiled:
 *   - sparc:       two callbacks (syscall and syscall32);
 *   - amd64:       four callbacks (sysenter, int91, syscall, syscall32);
 *   - otherwise:   the same four slots with the int91 and syscall32 slots
 *                  left NULL (presumably the 32-bit x86 kernel).
 * The callbacks themselves are defined outside this file.
 */
#ifdef	sparc

struct brand_mach_ops s10_mops = {
	s10_brand_syscall_callback,
	s10_brand_syscall32_callback
};

#else	/* sparc */

#ifdef	__amd64

struct brand_mach_ops s10_mops = {
	s10_brand_sysenter_callback,
	s10_brand_int91_callback,
	s10_brand_syscall_callback,
	s10_brand_syscall32_callback
};

#else	/* ! __amd64 */

struct brand_mach_ops s10_mops = {
	s10_brand_sysenter_callback,
	NULL,
	s10_brand_syscall_callback,
	NULL
};
#endif	/* __amd64 */

#endif	/* sparc */
117 
/*
 * Brand descriptor for the "solaris10" brand.  It ties together the brand
 * interface version (BRAND_VER_1), the brand name, the generic operations
 * vector (s10_brops) and the machine-dependent callbacks (s10_mops).  Its
 * address is passed to every brand_solaris_*() helper called from this file
 * and is what zone->zone_brand is compared against in the ASSERTs below.
 */
struct brand	s10_brand = {
	BRAND_VER_1,
	"solaris10",
	&s10_brops,
	&s10_mops
};
124 
/*
 * Brand-module linkage record: identifies this module as a brand module
 * (mod_brandops) and names the brand descriptor it provides.
 */
static struct modlbrand modlbrand = {
	&mod_brandops,		/* type of module */
	"Solaris 10 Brand",	/* description of module */
	&s10_brand		/* brand descriptor provided by this module */
};
130 
/*
 * Module linkage handed to mod_install() in _init(), mod_info() in _info()
 * and brand_solaris_fini() in _fini().  The list of linkage records holds
 * only modlbrand and is NULL-terminated.
 */
static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlbrand, NULL
};
134 
/*
 * Brand hook (s10_brops): delegates to brand_solaris_setbrand() for process
 * p, passing the solaris10 brand descriptor.
 */
void
s10_setbrand(proc_t *p)
{
	brand_solaris_setbrand(p, &s10_brand);
}
140 
141 /*ARGSUSED*/
142 int
143 s10_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize)
144 {
145 	ASSERT(zone->zone_brand == &s10_brand);
146 	if (attr == S10_EMUL_BITMAP) {
147 		if (buf == NULL || *bufsize != sizeof (s10_emul_bitmap_t))
148 			return (EINVAL);
149 		if (copyout(((s10_zone_data_t *)zone->zone_brand_data)->
150 		    emul_bitmap, buf, sizeof (s10_emul_bitmap_t)) != 0)
151 			return (EFAULT);
152 		return (0);
153 	}
154 
155 	return (EINVAL);
156 }
157 
158 int
159 s10_setattr(zone_t *zone, int attr, void *buf, size_t bufsize)
160 {
161 	ASSERT(zone->zone_brand == &s10_brand);
162 	if (attr == S10_EMUL_BITMAP) {
163 		if (buf == NULL || bufsize != sizeof (s10_emul_bitmap_t))
164 			return (EINVAL);
165 		if (copyin(buf, ((s10_zone_data_t *)zone->zone_brand_data)->
166 		    emul_bitmap, sizeof (s10_emul_bitmap_t)) != 0)
167 			return (EFAULT);
168 		return (0);
169 	}
170 
171 	return (EINVAL);
172 }
173 
174 #ifdef	__amd64
175 /*
176  * The Nevada kernel clears %fs for threads in 64-bit x86 processes but S10's
177  * libc expects %fs to be nonzero.  This causes some committed
178  * libc/libthread interfaces (e.g., thr_main()) to fail, which impacts several
179  * libraries, including libdoor.  This function sets the specified LWP's %fs
180  * register to the legacy S10 selector value (LWPFS_SEL).
181  *
 * The best solution to the aforementioned problem is backporting CRs
 * 6467491 and 6501650 to Solaris 10 so that 64-bit x86 Solaris 10 processes
 * would accept zero for %fs.  Backporting the CRs is a requirement for running
185  * S10 Containers in PV domUs because 64-bit Xen clears %fsbase when %fs is
186  * nonzero.  Such behavior breaks 64-bit processes because Xen has to fetch the
187  * FS segments' base addresses from the LWPs' GDTs, which are only capable of
188  * 32-bit addressing.
189  */
190 /*ARGSUSED*/
191 static void
192 s10_amd64_correct_fsreg(klwp_t *l)
193 {
194 	if (lwp_getdatamodel(l) == DATAMODEL_NATIVE) {
195 		kpreempt_disable();
196 		l->lwp_pcb.pcb_fs = LWPFS_SEL;
197 		l->lwp_pcb.pcb_rupdate = 1;
198 		lwptot(l)->t_post_sys = 1;	/* Guarantee update_sregs() */
199 		kpreempt_enable();
200 	}
201 }
202 #endif	/* __amd64 */
203 
204 int
205 s10_native()
206 {
207 	struct user	*up = PTOU(curproc);
208 	char		*args_new, *comm_new, *p;
209 	int		len;
210 
211 	len = sizeof (BRAND_NATIVE_LINKER32 " ") - 1;
212 
213 	/*
214 	 * Make sure that the process' interpreter is the native dynamic linker.
215 	 * Convention dictates that native processes executing within solaris10-
216 	 * branded zones are interpreted by the native dynamic linker (the
217 	 * process and its arguments are specified as arguments to the dynamic
218 	 * linker).  If this convention is violated (i.e.,
219 	 * brandsys(B_S10_NATIVE, ...) is invoked by a process that shouldn't be
220 	 * native), then do nothing and silently indicate success.
221 	 */
222 	if (strcmp(up->u_comm, S10_LINKER_NAME) != 0)
223 		return (0);
224 	if (strncmp(up->u_psargs, BRAND_NATIVE_LINKER64 " /", len + 4) == 0)
225 		len += 3;		/* to account for "/64" in the path */
226 	else if (strncmp(up->u_psargs, BRAND_NATIVE_LINKER32 " /", len + 1)
227 	    != 0)
228 		return (0);
229 
230 	args_new = strdup(&up->u_psargs[len]);
231 	if ((p = strchr(args_new, ' ')) != NULL)
232 		*p = '\0';
233 	if ((comm_new = strrchr(args_new, '/')) != NULL)
234 		comm_new = strdup(comm_new + 1);
235 	else
236 		comm_new = strdup(args_new);
237 	if (p != NULL)
238 		*p = ' ';
239 
240 	if ((strlen(args_new) != 0) && (strlen(comm_new) != 0)) {
241 		mutex_enter(&curproc->p_lock);
242 		(void) strlcpy(up->u_comm, comm_new, MAXCOMLEN+1);
243 		(void) strlcpy(up->u_psargs, args_new, PSARGSZ);
244 		mutex_exit(&curproc->p_lock);
245 	}
246 
247 	strfree(args_new);
248 	strfree(comm_new);
249 	return (0);
250 }
251 
252 /*ARGSUSED*/
253 int
254 s10_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
255     uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6)
256 {
257 	proc_t	*p = curproc;
258 	int	res;
259 
260 	*rval = 0;
261 
262 	if (cmd == B_S10_NATIVE)
263 		return (s10_native());
264 
265 	res = brand_solaris_cmd(cmd, arg1, arg2, arg3, &s10_brand, S10_VERSION);
266 	if (res >= 0)
267 		return (res);
268 
269 	switch ((cmd)) {
270 	case B_S10_PIDINFO:
271 		/*
272 		 * The s10 brand needs to be able to get the pid of the
273 		 * current process and the pid of the zone's init, and it
274 		 * needs to do this on every process startup.  Early in
275 		 * brand startup, we can't call getpid() because calls to
276 		 * getpid() represent a magical signal to some old-skool
277 		 * debuggers.  By merging all of this into one call, we
278 		 * make this quite a bit cheaper and easier to handle in
279 		 * the brand module.
280 		 */
281 		if (copyout(&p->p_pid, (void *)arg1, sizeof (pid_t)) != 0)
282 			return (EFAULT);
283 		if (copyout(&p->p_zone->zone_proc_initpid, (void *)arg2,
284 		    sizeof (pid_t)) != 0)
285 			return (EFAULT);
286 		return (0);
287 
288 	case B_S10_ISFDXATTRDIR: {
289 		/*
290 		 * This subcommand enables the userland brand emulation library
291 		 * to determine whether a file descriptor refers to an extended
292 		 * file attributes directory.  There is no standard syscall or
293 		 * libc function that can make such a determination.
294 		 */
295 		file_t *dir_filep;
296 
297 		dir_filep = getf((int)arg1);
298 		if (dir_filep == NULL)
299 			return (EBADF);
300 		ASSERT(dir_filep->f_vnode != NULL);
301 		*rval = IS_XATTRDIR(dir_filep->f_vnode);
302 		releasef((int)arg1);
303 		return (0);
304 	}
305 
306 #ifdef	__amd64
307 	case B_S10_FSREGCORRECTION:
308 		/*
309 		 * This subcommand exists so that the SYS_lwp_private and
310 		 * SYS_lwp_create syscalls can manually set the current thread's
311 		 * %fs register to the legacy S10 selector value for 64-bit x86
312 		 * processes.
313 		 */
314 		s10_amd64_correct_fsreg(ttolwp(curthread));
315 		return (0);
316 #endif	/* __amd64 */
317 	}
318 
319 	return (EINVAL);
320 }
321 
/*
 * Brand hook (s10_brops): delegates to brand_solaris_copy_procdata() for
 * the given child and parent processes, passing the solaris10 brand
 * descriptor.
 */
void
s10_copy_procdata(proc_t *child, proc_t *parent)
{
	brand_solaris_copy_procdata(child, parent, &s10_brand);
}
327 
/*
 * Brand hook (s10_brops): delegates to brand_solaris_proc_exit() for
 * process p and LWP l, passing the solaris10 brand descriptor.
 */
void
s10_proc_exit(struct proc *p, klwp_t *l)
{
	brand_solaris_proc_exit(p, l, &s10_brand);
}
333 
334 void
335 s10_exec()
336 {
337 	brand_solaris_exec(&s10_brand);
338 }
339 
/*
 * Brand hook (s10_brops): delegates to brand_solaris_initlwp() for LWP l,
 * passing the solaris10 brand descriptor, and returns that helper's result
 * unchanged.
 */
int
s10_initlwp(klwp_t *l)
{
	return (brand_solaris_initlwp(l, &s10_brand));
}
345 
/*
 * Brand hook (s10_brops): delegates to brand_solaris_forklwp() for parent
 * LWP p and child LWP c.  On amd64 the child's %fs selector is then
 * corrected when the parent's saved %fs is LWPFS_SEL.
 */
void
s10_forklwp(klwp_t *p, klwp_t *c)
{
	brand_solaris_forklwp(p, c, &s10_brand);

#ifdef	__amd64
	/*
	 * Only correct the child's %fs register if the parent's %fs register
	 * is LWPFS_SEL.  If the parent's %fs register is zero, then the Solaris
	 * 10 environment that we're emulating uses a version of libc that
	 * works when %fs is zero (i.e., it contains backports of CRs 6467491
	 * and 6501650).
	 */
	if (p->lwp_pcb.pcb_fs == LWPFS_SEL)
		s10_amd64_correct_fsreg(c);
#endif	/* __amd64 */
}
363 
/*
 * Brand hook (s10_brops): delegates to brand_solaris_freelwp() for LWP l,
 * passing the solaris10 brand descriptor.
 */
void
s10_freelwp(klwp_t *l)
{
	brand_solaris_freelwp(l, &s10_brand);
}
369 
/*
 * Brand hook (s10_brops): delegates to brand_solaris_lwpexit() for LWP l,
 * passing the solaris10 brand descriptor.
 */
void
s10_lwpexit(klwp_t *l)
{
	brand_solaris_lwpexit(l, &s10_brand);
}
375 
376 void
377 s10_free_brand_data(zone_t *zone)
378 {
379 	kmem_free(zone->zone_brand_data, sizeof (s10_zone_data_t));
380 }
381 
/*
 * Brand hook (s10_brops): allocate the per-zone brand data.
 *
 * The zone must already be marked as a solaris10 zone and must not yet have
 * brand data attached (both are asserted).  A zero-filled s10_zone_data_t
 * is allocated with KM_SLEEP and stored in zone->zone_brand_data; it is
 * released by s10_free_brand_data().
 */
void
s10_init_brand_data(zone_t *zone)
{
	ASSERT(zone->zone_brand == &s10_brand);
	ASSERT(zone->zone_brand_data == NULL);
	zone->zone_brand_data = kmem_zalloc(sizeof (s10_zone_data_t), KM_SLEEP);
}
389 
/*
 * Brand hook (s10_brops): delegates to brand_solaris_elfexec().  All ten
 * arguments are forwarded unchanged, followed by the solaris10 brand
 * descriptor, the brand name (S10_BRANDNAME), the brand library paths
 * (S10_LIB, S10_LIB32) and the brand linker paths (S10_LINKER,
 * S10_LINKER32).  The helper's return value is returned unchanged.
 */
int
s10_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
	int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
	int brand_action)
{
	return (brand_solaris_elfexec(vp, uap, args, idatap, level, execsz,
	    setid, exec_file, cred, brand_action, &s10_brand, S10_BRANDNAME,
	    S10_LIB, S10_LIB32, S10_LINKER, S10_LINKER32));
}
399 
400 void
401 s10_sigset_native_to_s10(sigset_t *set)
402 {
403 	int nativesig;
404 	int s10sig;
405 	sigset_t s10set;
406 
407 	/*
408 	 * Shortcut: we know the first 32 signals are the same in both
409 	 * s10 and native Solaris.  Just assign the first word.
410 	 */
411 	s10set.__sigbits[0] = set->__sigbits[0];
412 	s10set.__sigbits[1] = 0;
413 	s10set.__sigbits[2] = 0;
414 	s10set.__sigbits[3] = 0;
415 
416 	/*
417 	 * Copy the remainder of the initial set of common signals.
418 	 */
419 	for (nativesig = 33; nativesig < S10_SIGRTMIN; nativesig++)
420 		if (sigismember(set, nativesig))
421 			sigaddset(&s10set, nativesig);
422 
423 	/*
424 	 * Convert any native RT signals to their S10 values.
425 	 */
426 	for (nativesig = _SIGRTMIN, s10sig = S10_SIGRTMIN;
427 	    nativesig <= _SIGRTMAX && s10sig <= S10_SIGRTMAX;
428 	    nativesig++, s10sig++) {
429 		if (sigismember(set, nativesig))
430 			sigaddset(&s10set, s10sig);
431 	}
432 
433 	*set = s10set;
434 }
435 
436 void
437 s10_sigset_s10_to_native(sigset_t *set)
438 {
439 	int s10sig;
440 	int nativesig;
441 	sigset_t nativeset;
442 
443 	/*
444 	 * Shortcut: we know the first 32 signals are the same in both
445 	 * s10 and native Solaris.  Just assign the first word.
446 	 */
447 	nativeset.__sigbits[0] = set->__sigbits[0];
448 	nativeset.__sigbits[1] = 0;
449 	nativeset.__sigbits[2] = 0;
450 	nativeset.__sigbits[3] = 0;
451 
452 	/*
453 	 * Copy the remainder of the initial set of common signals.
454 	 */
455 	for (s10sig = 33; s10sig < S10_SIGRTMIN; s10sig++)
456 		if (sigismember(set, s10sig))
457 			sigaddset(&nativeset, s10sig);
458 
459 	/*
460 	 * Convert any S10 RT signals to their native values.
461 	 */
462 	for (s10sig = S10_SIGRTMIN, nativesig = _SIGRTMIN;
463 	    s10sig <= S10_SIGRTMAX && nativesig <= _SIGRTMAX;
464 	    s10sig++, nativesig++) {
465 		if (sigismember(set, s10sig))
466 			sigaddset(&nativeset, nativesig);
467 	}
468 
469 	*set = nativeset;
470 }
471 
472 int
473 _init(void)
474 {
475 	int err;
476 
477 	/*
478 	 * Set up the table indicating which system calls we want to
479 	 * interpose on.  We should probably build this automatically from
480 	 * a list of system calls that is shared with the user-space
481 	 * library.
482 	 */
483 	s10_emulation_table = kmem_zalloc(NSYSCALL, KM_SLEEP);
484 	s10_emulation_table[S10_SYS_forkall] = 1;		/*   2 */
485 	s10_emulation_table[S10_SYS_open] = 1;			/*   5 */
486 	s10_emulation_table[S10_SYS_wait] = 1;			/*   7 */
487 	s10_emulation_table[S10_SYS_creat] = 1;			/*   8 */
488 	s10_emulation_table[S10_SYS_unlink] = 1;		/*  10 */
489 	s10_emulation_table[S10_SYS_exec] = 1;			/*  11 */
490 	s10_emulation_table[S10_SYS_chown] = 1;			/*  16 */
491 	s10_emulation_table[S10_SYS_stat] = 1;			/*  18 */
492 	s10_emulation_table[S10_SYS_umount] = 1;		/*  22 */
493 	s10_emulation_table[S10_SYS_fstat] = 1;			/*  28 */
494 	s10_emulation_table[S10_SYS_utime] = 1;			/*  30 */
495 	s10_emulation_table[S10_SYS_access] = 1;		/*  33 */
496 	s10_emulation_table[SYS_kill] = 1;			/*  37 */
497 	s10_emulation_table[S10_SYS_dup] = 1;			/*  41 */
498 	s10_emulation_table[SYS_ioctl] = 1;			/*  54 */
499 	s10_emulation_table[SYS_execve] = 1;			/*  59 */
500 	s10_emulation_table[SYS_acctctl] = 1;			/*  71 */
501 	s10_emulation_table[S10_SYS_issetugid] = 1;		/*  75 */
502 	s10_emulation_table[S10_SYS_fsat] = 1;			/*  76 */
503 	s10_emulation_table[S10_SYS_rmdir] = 1;			/*  79 */
504 	s10_emulation_table[SYS_getdents] = 1;			/*  81 */
505 	s10_emulation_table[S10_SYS_poll] = 1;			/*  87 */
506 	s10_emulation_table[S10_SYS_lstat] = 1;			/*  88 */
507 	s10_emulation_table[S10_SYS_fchown] = 1;		/*  94 */
508 	s10_emulation_table[SYS_sigprocmask] = 1;		/*  95 */
509 	s10_emulation_table[SYS_sigsuspend] = 1;		/*  96 */
510 	s10_emulation_table[SYS_sigaction] = 1;			/*  98 */
511 	s10_emulation_table[SYS_sigpending] = 1;		/*  99 */
512 	s10_emulation_table[SYS_waitid] = 1;			/* 107 */
513 	s10_emulation_table[SYS_sigsendsys] = 1;		/* 108 */
514 #if defined(__x86)
515 	s10_emulation_table[S10_SYS_xstat] = 1;			/* 123 */
516 	s10_emulation_table[S10_SYS_lxstat] = 1;		/* 124 */
517 	s10_emulation_table[S10_SYS_fxstat] = 1;		/* 125 */
518 	s10_emulation_table[S10_SYS_xmknod] = 1;		/* 126 */
519 #endif
520 	s10_emulation_table[S10_SYS_lchown] = 1;		/* 130 */
521 	s10_emulation_table[S10_SYS_rename] = 1;		/* 134 */
522 	s10_emulation_table[SYS_uname] = 1;			/* 135 */
523 	s10_emulation_table[SYS_sysconfig] = 1;			/* 137 */
524 	s10_emulation_table[SYS_systeminfo] = 1;		/* 139 */
525 	s10_emulation_table[S10_SYS_fork1] = 1;			/* 143 */
526 	s10_emulation_table[SYS_sigtimedwait] = 1;		/* 144 */
527 	s10_emulation_table[S10_SYS_lwp_sema_wait] = 1;		/* 147 */
528 	s10_emulation_table[S10_SYS_utimes] = 1;		/* 154 */
529 	s10_emulation_table[SYS_lwp_create] = 1;		/* 159 */
530 	s10_emulation_table[SYS_lwp_kill] = 1;			/* 163 */
531 	s10_emulation_table[SYS_lwp_sigmask] = 1;		/* 165 */
532 #if defined(__amd64)
533 	s10_emulation_table[SYS_lwp_private] = 1;		/* 166 */
534 #endif	/* __amd64 */
535 	s10_emulation_table[S10_SYS_lwp_mutex_lock] = 1;	/* 169 */
536 	s10_emulation_table[SYS_pwrite] = 1;			/* 174 */
537 	s10_emulation_table[SYS_acl] = 1;			/* 185 */
538 	s10_emulation_table[SYS_auditsys] = 1;			/* 186 */
539 	s10_emulation_table[SYS_sigqueue] = 1;			/* 190 */
540 	s10_emulation_table[SYS_facl] = 1;			/* 200 */
541 	s10_emulation_table[SYS_signotify] = 1;			/* 205 */
542 	s10_emulation_table[SYS_lwp_mutex_timedlock] = 1;	/* 210 */
543 	s10_emulation_table[SYS_getdents64] = 1;		/* 213 */
544 	s10_emulation_table[S10_SYS_stat64] = 1;		/* 215 */
545 	s10_emulation_table[S10_SYS_lstat64] = 1;		/* 216 */
546 	s10_emulation_table[S10_SYS_fstat64] = 1;		/* 217 */
547 	s10_emulation_table[SYS_pwrite64] = 1;			/* 223 */
548 	s10_emulation_table[S10_SYS_creat64] = 1;		/* 224 */
549 	s10_emulation_table[S10_SYS_open64] = 1;		/* 225 */
550 	s10_emulation_table[SYS_zone] = 1;			/* 227 */
551 	s10_emulation_table[SYS_lwp_mutex_trylock] = 1;		/* 251 */
552 
553 	err = mod_install(&modlinkage);
554 	if (err) {
555 		cmn_err(CE_WARN, "Couldn't install brand module");
556 		kmem_free(s10_emulation_table, NSYSCALL);
557 	}
558 
559 	return (err);
560 }
561 
/*
 * Module information entry point: passes this module's linkage and the
 * caller's modinfo buffer to mod_info() and returns its result.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
567 
/*
 * Module unload entry point: hands the address of the emulation table
 * pointer, the module linkage and the brand descriptor to
 * brand_solaris_fini() and returns its result.  Presumably that helper
 * removes the module and frees the table -- confirm against its definition.
 */
int
_fini(void)
{
	return (brand_solaris_fini(&s10_emulation_table, &modlinkage,
	    &s10_brand));
}
574