1*80e2ca85S /* 2*80e2ca85S * CDDL HEADER START 3*80e2ca85S * 4*80e2ca85S * The contents of this file are subject to the terms of the 5*80e2ca85S * Common Development and Distribution License (the "License"). 6*80e2ca85S * You may not use this file except in compliance with the License. 7*80e2ca85S * 8*80e2ca85S * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*80e2ca85S * or http://www.opensolaris.org/os/licensing. 10*80e2ca85S * See the License for the specific language governing permissions 11*80e2ca85S * and limitations under the License. 12*80e2ca85S * 13*80e2ca85S * When distributing Covered Code, include this CDDL HEADER in each 14*80e2ca85S * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*80e2ca85S * If applicable, add the following below this CDDL HEADER, with the 16*80e2ca85S * fields enclosed by brackets "[]" replaced with your own identifying 17*80e2ca85S * information: Portions Copyright [yyyy] [name of copyright owner] 18*80e2ca85S * 19*80e2ca85S * CDDL HEADER END 20*80e2ca85S */ 21*80e2ca85S 22*80e2ca85S /* 23*80e2ca85S * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24*80e2ca85S */ 25*80e2ca85S 26*80e2ca85S #include <errno.h> 27*80e2ca85S #include <stdio.h> 28*80e2ca85S #include <stdlib.h> 29*80e2ca85S #include <strings.h> 30*80e2ca85S #include <unistd.h> 31*80e2ca85S #include <sys/auxv.h> 32*80e2ca85S #include <sys/bitmap.h> 33*80e2ca85S #include <sys/brand.h> 34*80e2ca85S #include <sys/inttypes.h> 35*80e2ca85S #include <sys/lwp.h> 36*80e2ca85S #include <sys/syscall.h> 37*80e2ca85S #include <sys/systm.h> 38*80e2ca85S #include <sys/utsname.h> 39*80e2ca85S #include <fcntl.h> 40*80e2ca85S #include <brand_misc.h> 41*80e2ca85S #include <sys/brand.h> 42*80e2ca85S 43*80e2ca85S extern brand_sysent_table_t brand_sysent_table[]; 44*80e2ca85S 45*80e2ca85S /*LINTED: static unused*/ 46*80e2ca85S static volatile int brand_abort_err; 47*80e2ca85S /*LINTED: static unused*/ 48*80e2ca85S static volatile const char *brand_abort_msg; 49*80e2ca85S /*LINTED: static unused*/ 50*80e2ca85S static volatile const char *brand_abort_file; 51*80e2ca85S /*LINTED: static unused*/ 52*80e2ca85S static volatile int brand_abort_line; 53*80e2ca85S 54*80e2ca85S /* 55*80e2ca85S * Principles of emulation 101. 56*80e2ca85S * 57*80e2ca85S * 58*80e2ca85S * *** Setting errno 59*80e2ca85S * 60*80e2ca85S * Just don't do it. This emulation library is loaded onto a 61*80e2ca85S * seperate link map from the application who's address space we're 62*80e2ca85S * running in. We have our own private copy of libc, so there for, 63*80e2ca85S * the errno value accessible from here is is also private and changing 64*80e2ca85S * it will not affect any errno value that the processes who's address 65*80e2ca85S * space we are running in will see. To return an error condition we 66*80e2ca85S * should return the errno value we'd like the system to return. 67*80e2ca85S * For more information about this see the comments in brand_misc.h. 68*80e2ca85S * Basically, when we return to the caller that initiated the system 69*80e2ca85S * call it's their responsibility to set errno. 70*80e2ca85S * 71*80e2ca85S * 72*80e2ca85S * *** Recursion Considerations 73*80e2ca85S * 74*80e2ca85S * When emulating system calls we need to be very careful about what 75*80e2ca85S * library calls we invoke. Library calls should be kept to a minimum. 76*80e2ca85S * One issue is that library calls can invoke system calls, so if we're 77*80e2ca85S * emulating a system call and we invoke a library call that depends on 78*80e2ca85S * that system call we will probably enter a recursive loop, which would 79*80e2ca85S * be bad. 80*80e2ca85S * 81*80e2ca85S * 82*80e2ca85S * *** Return Values. 83*80e2ca85S * 84*80e2ca85S * See brand_misc.h. 85*80e2ca85S * 86*80e2ca85S * *** Agent lwp considerations 87*80e2ca85S * 88*80e2ca85S * It is currently impossible to do any emulation for these system call 89*80e2ca85S * when they are being invoked on behalf of an agent lwp. To understand why 90*80e2ca85S * it's impossible you have to understand how agent lwp syscalls work. 91*80e2ca85S * 92*80e2ca85S * The agent lwp syscall process works as follows: 93*80e2ca85S * 1 The controlling process stops the target. 94*80e2ca85S * 2 The controlling process injects an agent lwp which is also stopped. 95*80e2ca85S * This agent lwp assumes the userland stack and register values 96*80e2ca85S * of another stopped lwp in the current process. 97*80e2ca85S * 3 The controlling process configures the agent lwp to start 98*80e2ca85S * executing the requested system call. 99*80e2ca85S * 4 The controlling process configure /proc to stop the agent lwp when 100*80e2ca85S * it enters the requested system call. 101*80e2ca85S * 5 The controlling processes allows the agent lwp to start executing. 102*80e2ca85S * 6 The agent lwp traps into the kernel to perform the requested system 103*80e2ca85S * call and immediately stop. 104*80e2ca85S * 7 The controlling process copies all the arguments for the requested 105*80e2ca85S * system call onto the agent lwp's stack. 106*80e2ca85S * 8 The controlling process configures /proc to stop the agent lwp 107*80e2ca85S * when it completes the requested system call. 108*80e2ca85S * 9 The controlling processes allows the agent lwp to start executing. 109*80e2ca85S * 10 The agent lwp executes the system call and then stop before returning 110*80e2ca85S * to userland. 111*80e2ca85S * 11 The controlling process copies the return value and return arguments 112*80e2ca85S * back from the agent lwps stack. 113*80e2ca85S * 12 The controlling process destroys the agent lwp and restarts 114*80e2ca85S * the target process. 115*80e2ca85S * 116*80e2ca85S * The fundamental problem is that when the agent executes the request 117*80e2ca85S * system call in step 5, if we're emulating that system call then the 118*80e2ca85S * lwp is redirected back to our emulation layer without blocking 119*80e2ca85S * in the kernel. But our emulation layer can't access the arguments 120*80e2ca85S * for the system call because they haven't been copied to the stack 121*80e2ca85S * yet and they still only exist in the controlling processes address 122*80e2ca85S * space. This prevents us from being able to do any emulation of 123*80e2ca85S * agent lwp system calls. Hence, currently our brand trap interposition 124*80e2ca85S * callback (XXX_brand_syscall_callback_common) will detect if a system 125*80e2ca85S * call is being made by an agent lwp, and if this is the case it will 126*80e2ca85S * never redirect the system call to this emulation library. 127*80e2ca85S * 128*80e2ca85S * In the future, if this proves to be a problem the the easiest solution 129*80e2ca85S * would probably be to replace the branded versions of these application 130*80e2ca85S * with their native counterparts. Ie, truss, plimit, and pfiles could be 131*80e2ca85S * replace with wrapper scripts that execute the native versions of these 132*80e2ca85S * applications. In the case of plimit and pfiles this should be pretty 133*80e2ca85S * strait forward. Truss would probably be more tricky since it can 134*80e2ca85S * execute applications which would be branded applications, so in that 135*80e2ca85S * case it might be necessary to create a loadable library which could 136*80e2ca85S * be LD_PRELOADed into truss and this library would interpose on the 137*80e2ca85S * exec() system call to allow truss to correctly execute branded 138*80e2ca85S * processes. It should be pointed out that this solution could work 139*80e2ca85S * because "native agent lwps" (ie, agent lwps created by native 140*80e2ca85S * processes) can be treated differently from "branded aged lwps" (ie, 141*80e2ca85S * agent lwps created by branded processes), since native agent lwps 142*80e2ca85S * would presumably be making native system calls and hence not need 143*80e2ca85S * any interposition. 144*80e2ca85S * 145*80e2ca85S * *** General considerations 146*80e2ca85S * 147*80e2ca85S * One of the differences between the lx brand and the s10 148*80e2ca85S * brand, is that the s10 brand only interposes on syscalls 149*80e2ca85S * that need some kind of emulation, whereas the lx brand interposes 150*80e2ca85S * on _all_ system calls. Lx branded system calls that don't need 151*80e2ca85S * any emulation are then redirected back to the kernel from the 152*80e2ca85S * userland library via the IN_KERNEL_SYSCALL macro. The lx-syscall 153*80e2ca85S * dtrace provider depends on this behavior. 154*80e2ca85S * 155*80e2ca85S */ 156*80e2ca85S 157*80e2ca85S /*ARGSUSED*/ 158*80e2ca85S void 159*80e2ca85S _brand_abort(int err, const char *msg, const char *file, int line) 160*80e2ca85S { 161*80e2ca85S sysret_t rval; 162*80e2ca85S 163*80e2ca85S /* Save the error message into convenient globals */ 164*80e2ca85S brand_abort_err = err; 165*80e2ca85S brand_abort_msg = msg; 166*80e2ca85S brand_abort_file = file; 167*80e2ca85S brand_abort_line = line; 168*80e2ca85S 169*80e2ca85S /* kill ourselves */ 170*80e2ca85S abort(); 171*80e2ca85S 172*80e2ca85S /* If abort() didn't work, try something stronger. */ 173*80e2ca85S (void) __systemcall(&rval, SYS_lwp_kill + 1024, _lwp_self(), SIGKILL); 174*80e2ca85S } 175*80e2ca85S 176*80e2ca85S int 177*80e2ca85S brand_uucopy(const void *from, void *to, size_t size) 178*80e2ca85S { 179*80e2ca85S sysret_t rval; 180*80e2ca85S 181*80e2ca85S if (__systemcall(&rval, SYS_uucopy + 1024, from, to, size) != 0) 182*80e2ca85S return (EFAULT); 183*80e2ca85S return (0); 184*80e2ca85S } 185*80e2ca85S 186*80e2ca85S /* 187*80e2ca85S * ATTENTION: uucopystr() does NOT ensure that string are null terminated! 188*80e2ca85S */ 189*80e2ca85S int 190*80e2ca85S brand_uucopystr(const void *from, void *to, size_t size) 191*80e2ca85S { 192*80e2ca85S sysret_t rval; 193*80e2ca85S 194*80e2ca85S if (__systemcall(&rval, SYS_uucopystr + 1024, from, to, size) != 0) 195*80e2ca85S return (EFAULT); 196*80e2ca85S return (0); 197*80e2ca85S } 198*80e2ca85S 199*80e2ca85S /* 200*80e2ca85S * This function is defined to be NOSYS but it won't be called from the 201*80e2ca85S * the kernel since the NOSYS system calls are not enabled in the kernel. 202*80e2ca85S * Thus, the only time this function is called is directly from within the 203*80e2ca85S * indirect system call path. 204*80e2ca85S */ 205*80e2ca85S /*ARGSUSED*/ 206*80e2ca85S long 207*80e2ca85S brand_unimpl(sysret_t *rv, uintptr_t p1) 208*80e2ca85S { 209*80e2ca85S sysret_t rval; 210*80e2ca85S 211*80e2ca85S /* 212*80e2ca85S * We'd like to print out some kind of error message here like 213*80e2ca85S * "unsupported syscall", but we can't because it's not safe to 214*80e2ca85S * assume that stderr or STDERR_FILENO actually points to something 215*80e2ca85S * that is a terminal, and if we wrote to those files we could 216*80e2ca85S * inadvertantly write to some applications open files, which would 217*80e2ca85S * be bad. 218*80e2ca85S * 219*80e2ca85S * Normally, if an application calls an invalid system call 220*80e2ca85S * it get a SIGSYS sent to it. So we'll just go ahead and send 221*80e2ca85S * ourselves a signal here. Note that this is far from ideal since 222*80e2ca85S * if the application has registered a signal handler, that signal 223*80e2ca85S * handler may recieve a ucontext_t as the third parameter to 224*80e2ca85S * indicate the context of the process when the signal was 225*80e2ca85S * generated, and in this case that context will not be what the 226*80e2ca85S * application is expecting. Hence, we should probably create a 227*80e2ca85S * brandsys() kernel function that can deliver the signal to us 228*80e2ca85S * with the correct ucontext_t. 229*80e2ca85S */ 230*80e2ca85S (void) __systemcall(&rval, SYS_lwp_kill + 1024, _lwp_self(), SIGSYS); 231*80e2ca85S return (ENOSYS); 232*80e2ca85S } 233*80e2ca85S 234*80e2ca85S #if defined(__sparc) && !defined(__sparcv9) 235*80e2ca85S /* 236*80e2ca85S * Yuck. For 32-bit sparc applications, handle indirect system calls. 237*80e2ca85S * Note that we declare this interface to use the maximum number of 238*80e2ca85S * system call arguments. If we recieve a system call that uses less 239*80e2ca85S * arguments, then the additional arguments will be garbage, but they 240*80e2ca85S * will also be ignored so that should be ok. 241*80e2ca85S */ 242*80e2ca85S long 243*80e2ca85S brand_indir(sysret_t *rv, int code, 244*80e2ca85S uintptr_t a0, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4, 245*80e2ca85S uintptr_t a5, uintptr_t a6, uintptr_t a7) 246*80e2ca85S { 247*80e2ca85S brand_sysent_table_t *sst = &(brand_sysent_table[code]); 248*80e2ca85S 249*80e2ca85S brand_assert(code < NSYSCALL); 250*80e2ca85S switch (sst->st_args & NARGS_MASK) { 251*80e2ca85S case 0: 252*80e2ca85S return ((sst->st_callc)(rv)); 253*80e2ca85S case 1: 254*80e2ca85S return ((sst->st_callc)(rv, a0)); 255*80e2ca85S case 2: 256*80e2ca85S return ((sst->st_callc)(rv, a0, a1)); 257*80e2ca85S case 3: 258*80e2ca85S return ((sst->st_callc)(rv, a0, a1, a2)); 259*80e2ca85S case 4: 260*80e2ca85S return ((sst->st_callc)(rv, a0, a1, a2, a3)); 261*80e2ca85S case 5: 262*80e2ca85S return ((sst->st_callc)(rv, a0, a1, a2, a3, a4)); 263*80e2ca85S case 6: 264*80e2ca85S return ((sst->st_callc)(rv, a0, a1, a2, a3, a4, a5)); 265*80e2ca85S case 7: 266*80e2ca85S return ((sst->st_callc)(rv, a0, a1, a2, a3, a4, a5, a6)); 267*80e2ca85S case 8: 268*80e2ca85S return ((sst->st_callc)(rv, a0, a1, a2, a3, a4, a5, a6, a7)); 269*80e2ca85S } 270*80e2ca85S brand_abort(0, "invalid entry in brand_sysent_table"); 271*80e2ca85S return (EINVAL); 272*80e2ca85S } 273*80e2ca85S #endif /* __sparc && !__sparcv9 */ 274*80e2ca85S 275*80e2ca85S /* 276*80e2ca85S * Close a libc file handle, but don't actually close the underlying 277*80e2ca85S * file descriptor. 278*80e2ca85S */ 279*80e2ca85S static void 280*80e2ca85S brand_close_fh(FILE *file) 281*80e2ca85S { 282*80e2ca85S int fd, fd_new; 283*80e2ca85S 284*80e2ca85S if (file == NULL) 285*80e2ca85S return; 286*80e2ca85S 287*80e2ca85S if ((fd = fileno(file)) < 0) 288*80e2ca85S return; 289*80e2ca85S 290*80e2ca85S /* 291*80e2ca85S * We're a branded process but our handler isn't installed yet. We 292*80e2ca85S * can't use the dup() syscall since it no longer exists. 293*80e2ca85S */ 294*80e2ca85S fd_new = fcntl(fd, F_DUPFD, 0); 295*80e2ca85S if (fd_new == -1) 296*80e2ca85S return; 297*80e2ca85S 298*80e2ca85S (void) fclose(file); 299*80e2ca85S (void) dup2(fd_new, fd); 300*80e2ca85S (void) close(fd_new); 301*80e2ca85S } 302*80e2ca85S 303*80e2ca85S /*ARGSUSED*/ 304*80e2ca85S void 305*80e2ca85S brand_pre_init() 306*80e2ca85S { 307*80e2ca85S int i; 308*80e2ca85S 309*80e2ca85S /* Sanity check our translation table return value codes */ 310*80e2ca85S for (i = 0; i < NSYSCALL; i++) { 311*80e2ca85S brand_sysent_table_t *est = &(brand_sysent_table[i]); 312*80e2ca85S brand_assert(BIT_ONLYONESET(est->st_args & RV_MASK)); 313*80e2ca85S } 314*80e2ca85S 315*80e2ca85S /* 316*80e2ca85S * We need to shutdown all libc stdio. libc stdio normally goes to 317*80e2ca85S * file descriptors, but since we're actually part of a another 318*80e2ca85S * process we don't own these file descriptors and we can't make 319*80e2ca85S * any assumptions about their state. 320*80e2ca85S */ 321*80e2ca85S brand_close_fh(stdin); 322*80e2ca85S brand_close_fh(stdout); 323*80e2ca85S brand_close_fh(stderr); 324*80e2ca85S } 325*80e2ca85S 326*80e2ca85S /*ARGSUSED*/ 327*80e2ca85S ulong_t 328*80e2ca85S brand_post_init(int version, int argc, char *argv[], char *envp[]) 329*80e2ca85S { 330*80e2ca85S sysret_t rval; 331*80e2ca85S brand_proc_reg_t reg; 332*80e2ca85S brand_elf_data_t sed; 333*80e2ca85S auxv_t *ap; 334*80e2ca85S uintptr_t *p; 335*80e2ca85S int err; 336*80e2ca85S 337*80e2ca85S /* 338*80e2ca85S * Register our syscall emulation table with the kernel. 339*80e2ca85S * Note that we don't have to do invoke (syscall_number + 1024) 340*80e2ca85S * until we've actually establised a syscall emulation callback 341*80e2ca85S * handler address, which is what we're doing with this brand 342*80e2ca85S * syscall. 343*80e2ca85S */ 344*80e2ca85S reg.sbr_version = version; 345*80e2ca85S #ifdef __x86 346*80e2ca85S reg.sbr_handler = (caddr_t)brand_handler_table; 347*80e2ca85S #else /* !__x86 */ 348*80e2ca85S reg.sbr_handler = (caddr_t)brand_handler; 349*80e2ca85S #endif /* !__x86 */ 350*80e2ca85S 351*80e2ca85S if ((err = __systemcall(&rval, SYS_brand, B_REGISTER, ®)) != 0) { 352*80e2ca85S brand_abort(err, "Failed to brand current process"); 353*80e2ca85S 354*80e2ca85S /*NOTREACHED*/ 355*80e2ca85S } 356*80e2ca85S 357*80e2ca85S /* Get data about the executable we're running from the kernel. */ 358*80e2ca85S if ((err = __systemcall(&rval, SYS_brand + 1024, 359*80e2ca85S B_ELFDATA, (void *)&sed)) != 0) { 360*80e2ca85S brand_abort(err, 361*80e2ca85S "Failed to get required brand ELF data from the kernel"); 362*80e2ca85S /*NOTREACHED*/ 363*80e2ca85S } 364*80e2ca85S 365*80e2ca85S /* 366*80e2ca85S * Find the aux vector on the stack. 367*80e2ca85S */ 368*80e2ca85S p = (uintptr_t *)envp; 369*80e2ca85S while (*p != NULL) 370*80e2ca85S p++; 371*80e2ca85S 372*80e2ca85S /* 373*80e2ca85S * p is now pointing at the 0 word after the environ pointers. 374*80e2ca85S * After that is the aux vectors. 375*80e2ca85S * 376*80e2ca85S * The aux vectors are currently pointing to the brand emulation 377*80e2ca85S * library and associated linker. We're going to change them to 378*80e2ca85S * point to the brand executable and associated linker (or to no 379*80e2ca85S * linker for static binaries). This matches the process data 380*80e2ca85S * stored within the kernel and visible from /proc, which was 381*80e2ca85S * all setup in sn1_elfexec(). We do this so that when a debugger 382*80e2ca85S * attaches to the process it sees the process as a normal solaris 383*80e2ca85S * process, this brand emulation library and everything on it's 384*80e2ca85S * link map will not be visible, unless our librtld_db plugin 385*80e2ca85S * is used. Note that this is very different from how Linux 386*80e2ca85S * branded processes are implemented within lx branded zones. 387*80e2ca85S * In that situation, the primary linkmap of the process is the 388*80e2ca85S * brand emulation libraries linkmap, not the Linux applications 389*80e2ca85S * linkmap. 390*80e2ca85S * 391*80e2ca85S * We also need to clear the AF_SUN_NOPLM flag from the AT_SUN_AUXFLAGS 392*80e2ca85S * aux vector. This flag told our linker that we don't have a 393*80e2ca85S * primary link map. Now that our linker is done initializing, we 394*80e2ca85S * want to clear this flag before we transfer control to the 395*80e2ca85S * applications copy of the linker, since we want that linker to have 396*80e2ca85S * a primary link map which will be the link map for the application 397*80e2ca85S * we're running. 398*80e2ca85S */ 399*80e2ca85S p++; 400*80e2ca85S for (ap = (auxv_t *)p; ap->a_type != AT_NULL; ap++) { 401*80e2ca85S switch (ap->a_type) { 402*80e2ca85S case AT_BASE: 403*80e2ca85S /* Hide AT_BASE if static binary */ 404*80e2ca85S if (sed.sed_base == NULL) { 405*80e2ca85S ap->a_type = AT_IGNORE; 406*80e2ca85S ap->a_un.a_val = NULL; 407*80e2ca85S } else { 408*80e2ca85S ap->a_un.a_val = sed.sed_base; 409*80e2ca85S } 410*80e2ca85S break; 411*80e2ca85S case AT_ENTRY: 412*80e2ca85S ap->a_un.a_val = sed.sed_entry; 413*80e2ca85S break; 414*80e2ca85S case AT_PHDR: 415*80e2ca85S ap->a_un.a_val = sed.sed_phdr; 416*80e2ca85S break; 417*80e2ca85S case AT_PHENT: 418*80e2ca85S ap->a_un.a_val = sed.sed_phent; 419*80e2ca85S break; 420*80e2ca85S case AT_PHNUM: 421*80e2ca85S ap->a_un.a_val = sed.sed_phnum; 422*80e2ca85S break; 423*80e2ca85S case AT_SUN_AUXFLAGS: 424*80e2ca85S ap->a_un.a_val &= ~AF_SUN_NOPLM; 425*80e2ca85S break; 426*80e2ca85S case AT_SUN_EMULATOR: 427*80e2ca85S /* 428*80e2ca85S * ld.so.1 inspects AT_SUN_EMULATOR to see if 429*80e2ca85S * if it is the linker for the brand emulation 430*80e2ca85S * library. Hide AT_SUN_EMULATOR, as the 431*80e2ca85S * linker we are about to jump to is the linker 432*80e2ca85S * for the binary. 433*80e2ca85S */ 434*80e2ca85S ap->a_type = AT_IGNORE; 435*80e2ca85S ap->a_un.a_val = NULL; 436*80e2ca85S break; 437*80e2ca85S case AT_SUN_LDDATA: 438*80e2ca85S /* Hide AT_SUN_LDDATA if static binary */ 439*80e2ca85S if (sed.sed_lddata == NULL) { 440*80e2ca85S ap->a_type = AT_IGNORE; 441*80e2ca85S ap->a_un.a_val = NULL; 442*80e2ca85S } else { 443*80e2ca85S ap->a_un.a_val = sed.sed_lddata; 444*80e2ca85S } 445*80e2ca85S break; 446*80e2ca85S default: 447*80e2ca85S break; 448*80e2ca85S } 449*80e2ca85S } 450*80e2ca85S 451*80e2ca85S return (sed.sed_ldentry); 452*80e2ca85S } 453