1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include <sys/errno.h> 27 #include <sys/exec.h> 28 #include <sys/file.h> 29 #include <sys/kmem.h> 30 #include <sys/modctl.h> 31 #include <sys/model.h> 32 #include <sys/proc.h> 33 #include <sys/syscall.h> 34 #include <sys/systm.h> 35 #include <sys/thread.h> 36 #include <sys/cmn_err.h> 37 #include <sys/archsystm.h> 38 #include <sys/pathname.h> 39 #include <sys/sunddi.h> 40 41 #include <sys/machbrand.h> 42 #include <sys/brand.h> 43 #include "s10_brand.h" 44 45 char *s10_emulation_table = NULL; 46 47 void s10_init_brand_data(zone_t *); 48 void s10_free_brand_data(zone_t *); 49 void s10_setbrand(proc_t *); 50 int s10_getattr(zone_t *, int, void *, size_t *); 51 int s10_setattr(zone_t *, int, void *, size_t); 52 int s10_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, 53 uintptr_t, uintptr_t, uintptr_t); 54 void s10_copy_procdata(proc_t *, proc_t *); 55 void s10_proc_exit(struct proc *, klwp_t *); 56 void s10_exec(); 57 int s10_initlwp(klwp_t *); 58 void s10_forklwp(klwp_t *, klwp_t *); 59 void s10_freelwp(klwp_t *); 60 void s10_lwpexit(klwp_t *); 61 int s10_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, 62 long *, int, caddr_t, cred_t *, int); 63 void s10_sigset_native_to_s10(sigset_t *); 64 void s10_sigset_s10_to_native(sigset_t *); 65 66 /* s10 brand */ 67 struct brand_ops s10_brops = { 68 s10_init_brand_data, 69 s10_free_brand_data, 70 s10_brandsys, 71 s10_setbrand, 72 s10_getattr, 73 s10_setattr, 74 s10_copy_procdata, 75 s10_proc_exit, 76 s10_exec, 77 lwp_setrval, 78 s10_initlwp, 79 s10_forklwp, 80 s10_freelwp, 81 s10_lwpexit, 82 s10_elfexec, 83 s10_sigset_native_to_s10, 84 s10_sigset_s10_to_native, 85 S10_NSIG, 86 }; 87 88 #ifdef sparc 89 90 struct brand_mach_ops s10_mops = { 91 s10_brand_syscall_callback, 92 s10_brand_syscall32_callback 93 }; 94 95 #else /* sparc */ 96 97 #ifdef __amd64 98 99 struct brand_mach_ops s10_mops = { 100 s10_brand_sysenter_callback, 101 s10_brand_int91_callback, 102 s10_brand_syscall_callback, 103 s10_brand_syscall32_callback 104 }; 105 106 #else /* ! __amd64 */ 107 108 struct brand_mach_ops s10_mops = { 109 s10_brand_sysenter_callback, 110 NULL, 111 s10_brand_syscall_callback, 112 NULL 113 }; 114 #endif /* __amd64 */ 115 116 #endif /* _sparc */ 117 118 struct brand s10_brand = { 119 BRAND_VER_1, 120 "solaris10", 121 &s10_brops, 122 &s10_mops 123 }; 124 125 static struct modlbrand modlbrand = { 126 &mod_brandops, /* type of module */ 127 "Solaris 10 Brand", /* description of module */ 128 &s10_brand /* driver ops */ 129 }; 130 131 static struct modlinkage modlinkage = { 132 MODREV_1, (void *)&modlbrand, NULL 133 }; 134 135 void 136 s10_setbrand(proc_t *p) 137 { 138 brand_solaris_setbrand(p, &s10_brand); 139 } 140 141 /*ARGSUSED*/ 142 int 143 s10_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize) 144 { 145 ASSERT(zone->zone_brand == &s10_brand); 146 if (attr == S10_EMUL_BITMAP) { 147 if (buf == NULL || *bufsize != sizeof (s10_emul_bitmap_t)) 148 return (EINVAL); 149 if (copyout(((s10_zone_data_t *)zone->zone_brand_data)-> 150 emul_bitmap, buf, sizeof (s10_emul_bitmap_t)) != 0) 151 return (EFAULT); 152 return (0); 153 } 154 155 return (EINVAL); 156 } 157 158 int 159 s10_setattr(zone_t *zone, int attr, void *buf, size_t bufsize) 160 { 161 ASSERT(zone->zone_brand == &s10_brand); 162 if (attr == S10_EMUL_BITMAP) { 163 if (buf == NULL || bufsize != sizeof (s10_emul_bitmap_t)) 164 return (EINVAL); 165 if (copyin(buf, ((s10_zone_data_t *)zone->zone_brand_data)-> 166 emul_bitmap, sizeof (s10_emul_bitmap_t)) != 0) 167 return (EFAULT); 168 return (0); 169 } 170 171 return (EINVAL); 172 } 173 174 #ifdef __amd64 175 /* 176 * The Nevada kernel clears %fs for threads in 64-bit x86 processes but S10's 177 * libc expects %fs to be nonzero. This causes some committed 178 * libc/libthread interfaces (e.g., thr_main()) to fail, which impacts several 179 * libraries, including libdoor. This function sets the specified LWP's %fs 180 * register to the legacy S10 selector value (LWPFS_SEL). 181 * 182 * The best solution to the aforementioned problem is backporting CRs 183 * 6467491 to Solaris 10 so that 64-bit x86 Solaris 10 processes 184 * would accept zero for %fs. Backporting the CRs is a requirement for running 185 * S10 Containers in PV domUs because 64-bit Xen clears %fsbase when %fs is 186 * nonzero. Such behavior breaks 64-bit processes because Xen has to fetch the 187 * FS segments' base addresses from the LWPs' GDTs, which are only capable of 188 * 32-bit addressing. 189 */ 190 /*ARGSUSED*/ 191 static void 192 s10_amd64_correct_fsreg(klwp_t *l) 193 { 194 if (lwp_getdatamodel(l) == DATAMODEL_NATIVE) { 195 kpreempt_disable(); 196 l->lwp_pcb.pcb_fs = LWPFS_SEL; 197 l->lwp_pcb.pcb_rupdate = 1; 198 lwptot(l)->t_post_sys = 1; /* Guarantee update_sregs() */ 199 kpreempt_enable(); 200 } 201 } 202 #endif /* __amd64 */ 203 204 /* 205 * The native ld.so.1 is invoked with a set of -e options which we also want to 206 * strip off. This function assumes the set of -e options immediately follows 207 * the native ld.so.1 command and is contiguous. This is OK, since we control 208 * the code in s10_isaexec_wrapper. We do it this way so we don't accidently 209 * strip a -e option from the native command itself. The format of an ld.so.1 210 * -e option looks like: 211 * -e LD_NOENVIRON=1 212 */ 213 char * 214 rm_e_options(char *args) 215 { 216 char *p; 217 218 while (strncmp(args, "-e ", 3) == 0) { 219 args += 3; 220 if ((p = strchr(args, ' ')) != NULL) 221 args = p + 1; 222 } 223 224 return (args); 225 } 226 227 int 228 s10_native() 229 { 230 struct user *up = PTOU(curproc); 231 char *args_new, *comm_new, *p; 232 int len; 233 234 /* 235 * len has an extra value for the trailing '\0' so this covers the 236 * appended " " in the following strcmps. 237 */ 238 len = sizeof (BRAND_NATIVE_LINKER32); 239 240 /* 241 * Make sure that the process' interpreter is the native dynamic linker. 242 * Convention dictates that native processes executing within solaris10- 243 * branded zones are interpreted by the native dynamic linker (the 244 * process and its arguments are specified as arguments to the dynamic 245 * linker). If this convention is violated (i.e., 246 * brandsys(B_S10_NATIVE, ...) is invoked by a process that shouldn't be 247 * native), then do nothing and silently indicate success. 248 */ 249 if (strcmp(up->u_comm, S10_LINKER_NAME) != 0) 250 return (0); 251 if (strncmp(up->u_psargs, BRAND_NATIVE_LINKER64 " ", len + 3) == 0) 252 len += 3; /* to account for "/64" in the path */ 253 else if (strncmp(up->u_psargs, BRAND_NATIVE_LINKER32 " ", len) != 0) 254 return (0); 255 256 args_new = strdup(rm_e_options(&up->u_psargs[len])); 257 if ((p = strchr(args_new, ' ')) != NULL) 258 *p = '\0'; 259 if ((comm_new = strrchr(args_new, '/')) != NULL) 260 comm_new = strdup(comm_new + 1); 261 else 262 comm_new = strdup(args_new); 263 if (p != NULL) 264 *p = ' '; 265 266 if ((strlen(args_new) != 0) && (strlen(comm_new) != 0)) { 267 mutex_enter(&curproc->p_lock); 268 (void) strlcpy(up->u_comm, comm_new, MAXCOMLEN+1); 269 (void) strlcpy(up->u_psargs, args_new, PSARGSZ); 270 mutex_exit(&curproc->p_lock); 271 } 272 273 strfree(args_new); 274 strfree(comm_new); 275 return (0); 276 } 277 278 /*ARGSUSED*/ 279 int 280 s10_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, 281 uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) 282 { 283 proc_t *p = curproc; 284 int res; 285 286 *rval = 0; 287 288 if (cmd == B_S10_NATIVE) 289 return (s10_native()); 290 291 res = brand_solaris_cmd(cmd, arg1, arg2, arg3, &s10_brand, S10_VERSION); 292 if (res >= 0) 293 return (res); 294 295 switch ((cmd)) { 296 case B_S10_PIDINFO: 297 /* 298 * The s10 brand needs to be able to get the pid of the 299 * current process and the pid of the zone's init, and it 300 * needs to do this on every process startup. Early in 301 * brand startup, we can't call getpid() because calls to 302 * getpid() represent a magical signal to some old-skool 303 * debuggers. By merging all of this into one call, we 304 * make this quite a bit cheaper and easier to handle in 305 * the brand module. 306 */ 307 if (copyout(&p->p_pid, (void *)arg1, sizeof (pid_t)) != 0) 308 return (EFAULT); 309 if (copyout(&p->p_zone->zone_proc_initpid, (void *)arg2, 310 sizeof (pid_t)) != 0) 311 return (EFAULT); 312 return (0); 313 314 case B_S10_ISFDXATTRDIR: { 315 /* 316 * This subcommand enables the userland brand emulation library 317 * to determine whether a file descriptor refers to an extended 318 * file attributes directory. There is no standard syscall or 319 * libc function that can make such a determination. 320 */ 321 file_t *dir_filep; 322 323 dir_filep = getf((int)arg1); 324 if (dir_filep == NULL) 325 return (EBADF); 326 ASSERT(dir_filep->f_vnode != NULL); 327 *rval = IS_XATTRDIR(dir_filep->f_vnode); 328 releasef((int)arg1); 329 return (0); 330 } 331 332 #ifdef __amd64 333 case B_S10_FSREGCORRECTION: 334 /* 335 * This subcommand exists so that the SYS_lwp_private and 336 * SYS_lwp_create syscalls can manually set the current thread's 337 * %fs register to the legacy S10 selector value for 64-bit x86 338 * processes. 339 */ 340 s10_amd64_correct_fsreg(ttolwp(curthread)); 341 return (0); 342 #endif /* __amd64 */ 343 } 344 345 return (EINVAL); 346 } 347 348 void 349 s10_copy_procdata(proc_t *child, proc_t *parent) 350 { 351 brand_solaris_copy_procdata(child, parent, &s10_brand); 352 } 353 354 void 355 s10_proc_exit(struct proc *p, klwp_t *l) 356 { 357 brand_solaris_proc_exit(p, l, &s10_brand); 358 } 359 360 void 361 s10_exec() 362 { 363 brand_solaris_exec(&s10_brand); 364 } 365 366 int 367 s10_initlwp(klwp_t *l) 368 { 369 return (brand_solaris_initlwp(l, &s10_brand)); 370 } 371 372 void 373 s10_forklwp(klwp_t *p, klwp_t *c) 374 { 375 brand_solaris_forklwp(p, c, &s10_brand); 376 377 #ifdef __amd64 378 /* 379 * Only correct the child's %fs register if the parent's %fs register 380 * is LWPFS_SEL. If the parent's %fs register is zero, then the Solaris 381 * 10 environment that we're emulating uses a version of libc that 382 * works when %fs is zero (i.e., it contains backports of CRs 6467491 383 * and 6501650). 384 */ 385 if (p->lwp_pcb.pcb_fs == LWPFS_SEL) 386 s10_amd64_correct_fsreg(c); 387 #endif /* __amd64 */ 388 } 389 390 void 391 s10_freelwp(klwp_t *l) 392 { 393 brand_solaris_freelwp(l, &s10_brand); 394 } 395 396 void 397 s10_lwpexit(klwp_t *l) 398 { 399 brand_solaris_lwpexit(l, &s10_brand); 400 } 401 402 void 403 s10_free_brand_data(zone_t *zone) 404 { 405 kmem_free(zone->zone_brand_data, sizeof (s10_zone_data_t)); 406 } 407 408 void 409 s10_init_brand_data(zone_t *zone) 410 { 411 ASSERT(zone->zone_brand == &s10_brand); 412 ASSERT(zone->zone_brand_data == NULL); 413 zone->zone_brand_data = kmem_zalloc(sizeof (s10_zone_data_t), KM_SLEEP); 414 } 415 416 int 417 s10_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, 418 int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred, 419 int brand_action) 420 { 421 return (brand_solaris_elfexec(vp, uap, args, idatap, level, execsz, 422 setid, exec_file, cred, brand_action, &s10_brand, S10_BRANDNAME, 423 S10_LIB, S10_LIB32, S10_LINKER, S10_LINKER32)); 424 } 425 426 void 427 s10_sigset_native_to_s10(sigset_t *set) 428 { 429 int nativesig; 430 int s10sig; 431 sigset_t s10set; 432 433 /* 434 * Shortcut: we know the first 32 signals are the same in both 435 * s10 and native Solaris. Just assign the first word. 436 */ 437 s10set.__sigbits[0] = set->__sigbits[0]; 438 s10set.__sigbits[1] = 0; 439 s10set.__sigbits[2] = 0; 440 s10set.__sigbits[3] = 0; 441 442 /* 443 * Copy the remainder of the initial set of common signals. 444 */ 445 for (nativesig = 33; nativesig < S10_SIGRTMIN; nativesig++) 446 if (sigismember(set, nativesig)) 447 sigaddset(&s10set, nativesig); 448 449 /* 450 * Convert any native RT signals to their S10 values. 451 */ 452 for (nativesig = _SIGRTMIN, s10sig = S10_SIGRTMIN; 453 nativesig <= _SIGRTMAX && s10sig <= S10_SIGRTMAX; 454 nativesig++, s10sig++) { 455 if (sigismember(set, nativesig)) 456 sigaddset(&s10set, s10sig); 457 } 458 459 *set = s10set; 460 } 461 462 void 463 s10_sigset_s10_to_native(sigset_t *set) 464 { 465 int s10sig; 466 int nativesig; 467 sigset_t nativeset; 468 469 /* 470 * Shortcut: we know the first 32 signals are the same in both 471 * s10 and native Solaris. Just assign the first word. 472 */ 473 nativeset.__sigbits[0] = set->__sigbits[0]; 474 nativeset.__sigbits[1] = 0; 475 nativeset.__sigbits[2] = 0; 476 nativeset.__sigbits[3] = 0; 477 478 /* 479 * Copy the remainder of the initial set of common signals. 480 */ 481 for (s10sig = 33; s10sig < S10_SIGRTMIN; s10sig++) 482 if (sigismember(set, s10sig)) 483 sigaddset(&nativeset, s10sig); 484 485 /* 486 * Convert any S10 RT signals to their native values. 487 */ 488 for (s10sig = S10_SIGRTMIN, nativesig = _SIGRTMIN; 489 s10sig <= S10_SIGRTMAX && nativesig <= _SIGRTMAX; 490 s10sig++, nativesig++) { 491 if (sigismember(set, s10sig)) 492 sigaddset(&nativeset, nativesig); 493 } 494 495 *set = nativeset; 496 } 497 498 int 499 _init(void) 500 { 501 int err; 502 503 /* 504 * Set up the table indicating which system calls we want to 505 * interpose on. We should probably build this automatically from 506 * a list of system calls that is shared with the user-space 507 * library. 508 */ 509 s10_emulation_table = kmem_zalloc(NSYSCALL, KM_SLEEP); 510 s10_emulation_table[S10_SYS_forkall] = 1; /* 2 */ 511 s10_emulation_table[S10_SYS_open] = 1; /* 5 */ 512 s10_emulation_table[S10_SYS_wait] = 1; /* 7 */ 513 s10_emulation_table[S10_SYS_creat] = 1; /* 8 */ 514 s10_emulation_table[S10_SYS_unlink] = 1; /* 10 */ 515 s10_emulation_table[S10_SYS_exec] = 1; /* 11 */ 516 s10_emulation_table[S10_SYS_chown] = 1; /* 16 */ 517 s10_emulation_table[S10_SYS_stat] = 1; /* 18 */ 518 s10_emulation_table[S10_SYS_umount] = 1; /* 22 */ 519 s10_emulation_table[S10_SYS_fstat] = 1; /* 28 */ 520 s10_emulation_table[S10_SYS_utime] = 1; /* 30 */ 521 s10_emulation_table[S10_SYS_access] = 1; /* 33 */ 522 s10_emulation_table[SYS_kill] = 1; /* 37 */ 523 s10_emulation_table[S10_SYS_dup] = 1; /* 41 */ 524 s10_emulation_table[SYS_ioctl] = 1; /* 54 */ 525 s10_emulation_table[SYS_execve] = 1; /* 59 */ 526 s10_emulation_table[SYS_acctctl] = 1; /* 71 */ 527 s10_emulation_table[S10_SYS_issetugid] = 1; /* 75 */ 528 s10_emulation_table[S10_SYS_fsat] = 1; /* 76 */ 529 s10_emulation_table[S10_SYS_rmdir] = 1; /* 79 */ 530 s10_emulation_table[SYS_getdents] = 1; /* 81 */ 531 s10_emulation_table[S10_SYS_poll] = 1; /* 87 */ 532 s10_emulation_table[S10_SYS_lstat] = 1; /* 88 */ 533 s10_emulation_table[S10_SYS_fchown] = 1; /* 94 */ 534 s10_emulation_table[SYS_sigprocmask] = 1; /* 95 */ 535 s10_emulation_table[SYS_sigsuspend] = 1; /* 96 */ 536 s10_emulation_table[SYS_sigaction] = 1; /* 98 */ 537 s10_emulation_table[SYS_sigpending] = 1; /* 99 */ 538 s10_emulation_table[SYS_waitid] = 1; /* 107 */ 539 s10_emulation_table[SYS_sigsendsys] = 1; /* 108 */ 540 #if defined(__x86) 541 s10_emulation_table[S10_SYS_xstat] = 1; /* 123 */ 542 s10_emulation_table[S10_SYS_lxstat] = 1; /* 124 */ 543 s10_emulation_table[S10_SYS_fxstat] = 1; /* 125 */ 544 s10_emulation_table[S10_SYS_xmknod] = 1; /* 126 */ 545 #endif 546 s10_emulation_table[S10_SYS_lchown] = 1; /* 130 */ 547 s10_emulation_table[S10_SYS_rename] = 1; /* 134 */ 548 s10_emulation_table[SYS_uname] = 1; /* 135 */ 549 s10_emulation_table[SYS_sysconfig] = 1; /* 137 */ 550 s10_emulation_table[SYS_systeminfo] = 1; /* 139 */ 551 s10_emulation_table[S10_SYS_fork1] = 1; /* 143 */ 552 s10_emulation_table[SYS_sigtimedwait] = 1; /* 144 */ 553 s10_emulation_table[S10_SYS_lwp_sema_wait] = 1; /* 147 */ 554 s10_emulation_table[S10_SYS_utimes] = 1; /* 154 */ 555 s10_emulation_table[SYS_lwp_create] = 1; /* 159 */ 556 s10_emulation_table[SYS_lwp_kill] = 1; /* 163 */ 557 s10_emulation_table[SYS_lwp_sigmask] = 1; /* 165 */ 558 #if defined(__amd64) 559 s10_emulation_table[SYS_lwp_private] = 1; /* 166 */ 560 #endif /* __amd64 */ 561 s10_emulation_table[S10_SYS_lwp_mutex_lock] = 1; /* 169 */ 562 s10_emulation_table[SYS_pwrite] = 1; /* 174 */ 563 s10_emulation_table[SYS_acl] = 1; /* 185 */ 564 s10_emulation_table[SYS_auditsys] = 1; /* 186 */ 565 s10_emulation_table[SYS_sigqueue] = 1; /* 190 */ 566 s10_emulation_table[SYS_facl] = 1; /* 200 */ 567 s10_emulation_table[SYS_signotify] = 1; /* 205 */ 568 s10_emulation_table[SYS_lwp_mutex_timedlock] = 1; /* 210 */ 569 s10_emulation_table[SYS_getdents64] = 1; /* 213 */ 570 s10_emulation_table[S10_SYS_stat64] = 1; /* 215 */ 571 s10_emulation_table[S10_SYS_lstat64] = 1; /* 216 */ 572 s10_emulation_table[S10_SYS_fstat64] = 1; /* 217 */ 573 s10_emulation_table[SYS_pwrite64] = 1; /* 223 */ 574 s10_emulation_table[S10_SYS_creat64] = 1; /* 224 */ 575 s10_emulation_table[S10_SYS_open64] = 1; /* 225 */ 576 s10_emulation_table[SYS_zone] = 1; /* 227 */ 577 s10_emulation_table[SYS_lwp_mutex_trylock] = 1; /* 251 */ 578 579 err = mod_install(&modlinkage); 580 if (err) { 581 cmn_err(CE_WARN, "Couldn't install brand module"); 582 kmem_free(s10_emulation_table, NSYSCALL); 583 } 584 585 return (err); 586 } 587 588 int 589 _info(struct modinfo *modinfop) 590 { 591 return (mod_info(&modlinkage, modinfop)); 592 } 593 594 int 595 _fini(void) 596 { 597 return (brand_solaris_fini(&s10_emulation_table, &modlinkage, 598 &s10_brand)); 599 } 600