/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/param.h>
#include <sys/vmparam.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/stack.h>
#include <sys/cred.h>
#include <sys/cmn_err.h>
#include <sys/user.h>
#include <sys/privregs.h>
#include <sys/psw.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/proc.h>
#include <sys/modctl.h>
#include <sys/var.h>
#include <sys/inline.h>
#include <sys/syscall.h>
#include <sys/ucontext.h>
#include <sys/cpuvar.h>
#include <sys/siginfo.h>
#include <sys/trap.h>
#include <sys/vtrace.h>
#include <sys/sysinfo.h>
#include <sys/procfs.h>
#include <sys/prsystm.h>
#include <c2/audit.h>
#include <sys/aio_impl.h>
#include <sys/tnf.h>
#include <sys/tnf_probe.h>
#include <sys/copyops.h>
#include <sys/priv.h>
#include <sys/msacct.h>

#ifdef SYSCALLTRACE
int syscalltrace = 0;
static kmutex_t systrace_lock;		/* syscall tracing lock */
#else
#define	syscalltrace 0
#endif /* SYSCALLTRACE */

typedef	int64_t (*llfcn_t)();	/* function returning long long */

int pre_syscall(void);
void post_syscall(long rval1, long rval2);
static krwlock_t *lock_syscall(struct sysent *, uint_t);
void deferred_singlestep_trap(caddr_t);

#ifdef _SYSCALL32_IMPL
#define	LWP_GETSYSENT(lwp)	\
	(lwp_getdatamodel(lwp) == DATAMODEL_NATIVE ? sysent : sysent32)
#else
#define	LWP_GETSYSENT(lwp)	(sysent)
#endif
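
/*
 * Usage note: callers index the table selected here by system call
 * number, e.g. (as syscall_entry() and syscall_ap() below do):
 *
 *	struct sysent *callp = LWP_GETSYSENT(lwp) + code;
 */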

/*
 * If watchpoints are active, don't make copying in of
 * system call arguments take a read watchpoint trap.
 */
static int
copyin_args(struct regs *rp, long *ap, uint_t nargs)
{
	greg_t *sp = 1 + (greg_t *)rp->r_sp;		/* skip ret addr */

	ASSERT(nargs <= MAXSYSARGS);

	return (copyin_nowatch(sp, ap, nargs * sizeof (*sp)));
}

#if defined(_SYSCALL32_IMPL)
static int
copyin_args32(struct regs *rp, long *ap, uint_t nargs)
{
	greg32_t *sp = 1 + (greg32_t *)rp->r_sp;	/* skip ret addr */
	uint32_t a32[MAXSYSARGS];
	int rc;

	ASSERT(nargs <= MAXSYSARGS);

	if ((rc = copyin_nowatch(sp, a32, nargs * sizeof (*sp))) == 0) {
		uint32_t *a32p = &a32[0];

		while (nargs--)
			*ap++ = (ulong_t)*a32p++;
	}
	return (rc);
}
#define	COPYIN_ARGS32	copyin_args32
#else
#define	COPYIN_ARGS32	copyin_args
#endif
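
/*
 * Note that copyin_args32() widens each 32-bit argument to a long with
 * zero extension (the uint32_t to ulong_t cast above), so a 32-bit -1
 * arrives in the argument array as 0xffffffff; handlers wanting sign
 * extension must perform it themselves.
 */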

/*
 * Error handler for system calls where arg copy gets fault.
 */
static longlong_t
syscall_err()
{
	return (0);
}

/*
 * Corresponding sysent entry to allow syscall_entry caller
 * to invoke syscall_err.
 */
static struct sysent sysent_err =  {
	0, SE_32RVAL1, NULL, NULL, (llfcn_t)syscall_err
};
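
/*
 * Note: syscall_entry() hands back &sysent_err when pre-syscall
 * processing or the argument copyin fails; the caller invokes
 * syscall_err() like any other handler and it simply returns 0.
 * The real error has already been recorded in lwp_errno via
 * set_errno(), so post_syscall() will deliver it to the user.
 */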

/*
 * Called from syscall() when a non-trivial 32-bit system call occurs.
 *	Sets up the args and returns a pointer to the handler.
 */
struct sysent *
syscall_entry(kthread_t *t, long *argp)
{
	klwp_t *lwp = ttolwp(t);
	struct regs *rp = lwptoregs(lwp);
	unsigned int code;
	struct sysent *callp;
	struct sysent *se = LWP_GETSYSENT(lwp);
	int error = 0;
	uint_t nargs;

	ASSERT(t == curthread && curthread->t_schedflag & TS_DONT_SWAP);

	lwp->lwp_ru.sysc++;
	lwp->lwp_eosys = NORMALRETURN;	/* assume this will be normal */

	/*
	 * Set lwp_ap to point to the args, even if none are needed for this
	 * system call.  This is for the loadable-syscall case where the
	 * number of args won't be known until the system call is loaded, and
	 * also maintains a non-NULL lwp_ap setup for get_syscall_args(). Note
	 * that lwp_ap MUST be set to a non-NULL value _BEFORE_ t_sysnum is
	 * set to non-zero; otherwise get_syscall_args(), seeing a non-zero
	 * t_sysnum for this thread, will charge ahead and dereference lwp_ap.
	 */
	lwp->lwp_ap = argp;		/* for get_syscall_args */

	code = rp->r_r0;
	t->t_sysnum = (short)code;
	callp = code >= NSYSCALL ? &nosys_ent : se + code;

	if ((t->t_pre_sys | syscalltrace) != 0) {
		error = pre_syscall();

		/*
		 * pre_syscall() has taken care so that lwp_ap is current;
		 * it either points to syscall-entry-saved amd64 regs,
		 * or it points to lwp_arg[], which has been re-copied from
		 * the ia32 ustack, but either way, it's a current copy after
		 * /proc has possibly mucked with the syscall args.
		 */

		if (error)
			return (&sysent_err);	/* use dummy handler */
	}

	/*
	 * Fetch the system call arguments to the kernel stack copy used
	 * for syscall handling.
	 * Note: for loadable system calls the number of arguments required
	 * may not be known at this point, and will be zero if the system call
	 * was never loaded.  Once the system call has been loaded, the number
	 * of args is not allowed to be changed.
	 */
	if ((nargs = (uint_t)callp->sy_narg) != 0 &&
	    COPYIN_ARGS32(rp, argp, nargs)) {
		(void) set_errno(EFAULT);
		return (&sysent_err);	/* use dummy handler */
	}

	return (callp);		/* return sysent entry for caller */
}
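
/*
 * A minimal sketch of the expected calling sequence (dosyscall() below
 * is the in-kernel example of this pattern):
 *
 *	long args[MAXSYSARGS];
 *	struct sysent *se = syscall_entry(curthread, args);
 *	int64_t ret = se->sy_callc(args[0], ..., args[7]);
 *	syscall_exit(curthread, (int)ret & 0xffffffffu, (int)(ret >> 32));
 */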

void
syscall_exit(kthread_t *t, long rval1, long rval2)
{
	/*
	 * Handle signals and other post-call events if necessary.
	 */
	if ((t->t_post_sys_ast | syscalltrace) == 0) {
		klwp_t *lwp = ttolwp(t);
		struct regs *rp = lwptoregs(lwp);

		/*
		 * Normal return.
		 * Clear error indication and set return values.
		 */
		rp->r_ps &= ~PS_C;	/* reset carry bit */
		rp->r_r0 = rval1;
		rp->r_r1 = rval2;
		lwp->lwp_state = LWP_USER;
	} else {
		post_syscall(rval1, rval2);
	}
	t->t_sysnum = 0;		/* invalidate args */
}
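
/*
 * Note the split above: the fast path handles only the ordinary
 * success case (clear the carry bit, store both return values);
 * anything unusual (signals, /proc activity, errors) is deferred to
 * post_syscall(), which redoes the return-value setup itself.
 */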

/*
 * Perform pre-system-call processing, including stopping for tracing,
 * auditing, etc.
 *
 * This routine is called only if the t_pre_sys flag is set. Any condition
 * requiring pre-syscall handling must set the t_pre_sys flag. If the
 * condition is persistent, this routine will repost t_pre_sys.
 */
int
pre_syscall()
{
	kthread_t *t = curthread;
	unsigned code = t->t_sysnum;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	int	repost;

	t->t_pre_sys = repost = 0;	/* clear pre-syscall processing flag */

	ASSERT(t->t_schedflag & TS_DONT_SWAP);

#if defined(DEBUG)
	/*
	 * On the i386 kernel, lwp_ap points at the piece of the thread
	 * stack that we copy the user's arguments into.
	 *
	 * On the amd64 kernel, the syscall arguments in the rdi..r9
	 * registers should be pointed at by lwp_ap.  If the args need to
	 * be copied so that those registers can be changed without losing
	 * the ability to get the args for /proc, they can be saved by
	 * save_syscall_args(), and lwp_ap will be restored by post_syscall().
	 */
	if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
#if defined(_LP64)
		ASSERT(lwp->lwp_ap == (long *)&lwptoregs(lwp)->r_rdi);
	} else {
#endif
		ASSERT((caddr_t)lwp->lwp_ap > t->t_stkbase &&
		    (caddr_t)lwp->lwp_ap < t->t_stk);
	}
#endif	/* DEBUG */

	/*
	 * Make sure the thread is holding the latest credentials for the
	 * process.  The credentials in the process right now apply to this
	 * thread for the entire system call.
	 */
	if (t->t_cred != p->p_cred) {
		cred_t *oldcred = t->t_cred;
		/*
		 * DTrace accesses t_cred in probe context.  t_cred must
		 * always be either NULL, or point to a valid, allocated cred
		 * structure.
		 */
		t->t_cred = crgetcred();
		crfree(oldcred);
	}

	/*
	 * From the proc(4) manual page:
	 * When entry to a system call is being traced, the traced process
	 * stops after having begun the call to the system but before the
	 * system call arguments have been fetched from the process.
	 */
	if (PTOU(p)->u_systrap) {
		if (prismember(&PTOU(p)->u_entrymask, code)) {
			mutex_enter(&p->p_lock);
			/*
			 * Recheck stop condition, now that lock is held.
			 */
			if (PTOU(p)->u_systrap &&
			    prismember(&PTOU(p)->u_entrymask, code)) {
				stop(PR_SYSENTRY, code);

				/*
				 * /proc may have modified syscall args,
				 * either in regs for amd64 or on ustack
				 * for ia32.  Either way, arrange to
				 * copy them again, both for the syscall
				 * handler and for other consumers in
				 * post_syscall (like audit).  Here, we
				 * only do amd64, and just set lwp_ap
				 * back to the kernel-entry stack copy;
				 * the syscall ml code redoes
				 * move-from-regs to set up for the
				 * syscall handler after we return.  For
				 * ia32, save_syscall_args() below makes
				 * an lwp_ap-accessible copy.
				 */
#if defined(_LP64)
				if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
					lwp->lwp_argsaved = 0;
					lwp->lwp_ap =
					    (long *)&lwptoregs(lwp)->r_rdi;
				}
#endif
			}
			mutex_exit(&p->p_lock);
		}
		repost = 1;
	}

	/*
	 * ia32 kernel, or ia32 proc on amd64 kernel: keep args in
	 * lwp_arg for post-syscall processing, regardless of whether
	 * they might have been changed in /proc above.
	 */
#if defined(_LP64)
	if (lwp_getdatamodel(lwp) != DATAMODEL_NATIVE)
#endif
		(void) save_syscall_args();

	if (lwp->lwp_sysabort) {
		/*
		 * lwp_sysabort may have been set via /proc while the process
		 * was stopped on PR_SYSENTRY.  If so, abort the system call.
		 * Override any error from the copyin() of the arguments.
		 */
		lwp->lwp_sysabort = 0;
		(void) set_errno(EINTR);	/* forces post_sys */
		t->t_pre_sys = 1;	/* repost anyway */
		return (1);		/* don't do system call, return EINTR */
	}

	/*
	 * begin auditing for this syscall if the c2audit module is loaded
	 * and auditing is enabled
	 */
	if (audit_active == C2AUDIT_LOADED) {
		uint32_t auditing = au_zone_getstate(NULL);

		if (auditing & AU_AUDIT_MASK) {
			int error;
			if (error = audit_start(T_SYSCALL, code, auditing,
			    0, lwp)) {
				t->t_pre_sys = 1;	/* repost anyway */
				(void) set_errno(error);
				return (1);
			}
			repost = 1;
		}
	}

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active) {
		TNF_PROBE_1(syscall_start, "syscall thread", /* CSTYLED */,
			tnf_sysnum,	sysnum,		t->t_sysnum);
		t->t_post_sys = 1;	/* make sure post_syscall runs */
		repost = 1;
	}
#endif /* NPROBE */

#ifdef SYSCALLTRACE
	if (syscalltrace) {
		int i;
		long *ap;
		char *cp;
		char *sysname;
		struct sysent *callp;

		if (code >= NSYSCALL)
			callp = &nosys_ent;	/* nosys has no args */
		else
			callp = LWP_GETSYSENT(lwp) + code;
		(void) save_syscall_args();
		mutex_enter(&systrace_lock);
		printf("%d: ", p->p_pid);
		if (code >= NSYSCALL) {
			printf("0x%x", code);
		} else {
			sysname = mod_getsysname(code);
			printf("%s[0x%x/0x%p]", sysname == NULL ? "NULL" :
			    sysname, code, callp->sy_callc);
		}
		cp = "(";
		for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) {
			printf("%s%lx", cp, *ap);
			cp = ", ";
		}
		if (i)
			printf(")");
		printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread);
		mutex_exit(&systrace_lock);
	}
#endif /* SYSCALLTRACE */

	/*
	 * If there was a continuing reason for pre-syscall processing,
	 * set the t_pre_sys flag for the next system call.
	 */
	if (repost)
		t->t_pre_sys = 1;
	lwp->lwp_error = 0;	/* for old drivers */
	lwp->lwp_badpriv = PRIV_NONE;
	return (0);
}


/*
 * Post-syscall processing.  Perform abnormal system call completion
 * actions such as /proc tracing, profiling, signals, preemption, etc.
 *
 * This routine is called only if t_post_sys, t_sig_check, or t_astflag is set.
 * Any condition requiring post-syscall handling must set one of these.
 * If the condition is persistent, this routine will repost t_post_sys.
 */
void
post_syscall(long rval1, long rval2)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	struct regs *rp = lwptoregs(lwp);
	uint_t	error;
	uint_t	code = t->t_sysnum;
	int	repost = 0;
	int	proc_stop = 0;		/* non-zero if stopping */
	int	sigprof = 0;		/* non-zero if sending SIGPROF */

	t->t_post_sys = 0;

	error = lwp->lwp_errno;

	/*
	 * Code can be zero if this is a new LWP returning after a forkall(),
	 * other than the LWP that matches the one in the parent which called
	 * forkall().  In these LWPs, skip most of post-syscall activity.
	 */
	if (code == 0)
		goto sig_check;
	/*
	 * If the trace flag is set, mark the lwp to take a single-step trap
	 * on return to user level (below).  The x86 lcall and sysenter
	 * interfaces have already done this and turned off the flag, but
	 * the amd64 syscall interface has not.
	 */
	if (rp->r_ps & PS_T) {
		lwp->lwp_pcb.pcb_flags |= DEBUG_PENDING;
		rp->r_ps &= ~PS_T;
		aston(curthread);
	}

	/* put out audit record for this syscall */
	if (AU_AUDITING()) {
		rval_t	rval;

		/* XX64 -- truncation of 64-bit return values? */
		rval.r_val1 = (int)rval1;
		rval.r_val2 = (int)rval2;
		audit_finish(T_SYSCALL, code, error, &rval);
		repost = 1;
	}

	if (curthread->t_pdmsg != NULL) {
		char *m = curthread->t_pdmsg;

		uprintf("%s", m);
		kmem_free(m, strlen(m) + 1);
		curthread->t_pdmsg = NULL;
	}

	/*
	 * If we're going to stop for /proc tracing, set the flag and
	 * save the arguments so that the return values don't smash them.
	 */
	if (PTOU(p)->u_systrap) {
		if (prismember(&PTOU(p)->u_exitmask, code)) {
			if (lwp_getdatamodel(lwp) == DATAMODEL_LP64)
				(void) save_syscall_args();
			proc_stop = 1;
		}
		repost = 1;
	}

	/*
	 * Similarly check to see if SIGPROF might be sent.
	 */
	if (curthread->t_rprof != NULL &&
	    curthread->t_rprof->rp_anystate != 0) {
		if (lwp_getdatamodel(lwp) == DATAMODEL_LP64)
			(void) save_syscall_args();
		sigprof = 1;
	}

	if (lwp->lwp_eosys == NORMALRETURN) {
		if (error == 0) {
#ifdef SYSCALLTRACE
			if (syscalltrace) {
				mutex_enter(&systrace_lock);
				printf(
				    "%d: r_val1=0x%lx, r_val2=0x%lx, id 0x%p\n",
				    p->p_pid, rval1, rval2, curthread);
				mutex_exit(&systrace_lock);
			}
#endif /* SYSCALLTRACE */
			rp->r_ps &= ~PS_C;
			rp->r_r0 = rval1;
			rp->r_r1 = rval2;
		} else {
			int sig;
#ifdef SYSCALLTRACE
			if (syscalltrace) {
				mutex_enter(&systrace_lock);
				printf("%d: error=%d, id 0x%p\n",
				    p->p_pid, error, curthread);
				mutex_exit(&systrace_lock);
			}
#endif /* SYSCALLTRACE */
			if (error == EINTR && t->t_activefd.a_stale)
				error = EBADF;
			if (error == EINTR &&
			    (sig = lwp->lwp_cursig) != 0 &&
			    sigismember(&PTOU(p)->u_sigrestart, sig) &&
			    PTOU(p)->u_signal[sig - 1] != SIG_DFL &&
			    PTOU(p)->u_signal[sig - 1] != SIG_IGN)
				error = ERESTART;
			rp->r_r0 = error;
			rp->r_ps |= PS_C;
		}
	}
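
	/*
	 * Note: the carry bit is the user-visible success/failure
	 * indicator here: clear with r_r0/r_r1 holding return values on
	 * success, set with r_r0 holding the error number on failure.
	 * Userland system call stubs are expected to test it and turn
	 * the error number into errno; that logic lives in libc, not
	 * here.
	 */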

	/*
	 * From the proc(4) manual page:
	 * When exit from a system call is being traced, the traced process
	 * stops on completion of the system call just prior to checking for
	 * signals and returning to user level.  At this point all return
	 * values have been stored into the traced process's saved registers.
	 */
	if (proc_stop) {
		mutex_enter(&p->p_lock);
		if (PTOU(p)->u_systrap &&
		    prismember(&PTOU(p)->u_exitmask, code))
			stop(PR_SYSEXIT, code);
		mutex_exit(&p->p_lock);
	}

	/*
	 * If we are the parent returning from a successful
	 * vfork, wait for the child to exec or exit.
	 * This code must be here and not in the bowels of the system
	 * so that /proc can intercept exit from vfork in a timely way.
	 */
	if (t->t_flag & T_VFPARENT) {
		ASSERT(code == SYS_vfork || code == SYS_forksys);
		ASSERT(rp->r_r1 == 0 && error == 0);
		vfwait((pid_t)rval1);
		t->t_flag &= ~T_VFPARENT;
	}

	/*
	 * If profiling is active, bill the current PC in user-land
	 * and keep reposting until profiling is disabled.
	 */
	if (p->p_prof.pr_scale) {
		if (lwp->lwp_oweupc)
			profil_tick(rp->r_pc);
		repost = 1;
	}

sig_check:
	/*
	 * Reset flag for next time.
	 * We must do this after stopping on PR_SYSEXIT
	 * because /proc uses the information in lwp_eosys.
	 */
	lwp->lwp_eosys = NORMALRETURN;
	clear_stale_fd();
	t->t_flag &= ~T_FORKALL;

	if (t->t_astflag | t->t_sig_check) {
		/*
		 * Turn off the AST flag before checking all the conditions that
		 * may have caused an AST.  This flag is on whenever a signal or
		 * unusual condition should be handled after the next trap or
		 * syscall.
		 */
		astoff(t);
		/*
		 * If a single-step trap occurred on a syscall (see trap())
		 * recognize it now.  Do this before checking for signals
		 * because deferred_singlestep_trap() may generate a SIGTRAP to
		 * the LWP or may otherwise mark the LWP to call issig(FORREAL).
		 */
		if (lwp->lwp_pcb.pcb_flags & DEBUG_PENDING)
			deferred_singlestep_trap((caddr_t)rp->r_pc);

		t->t_sig_check = 0;

		/*
		 * The following check is legal for the following reasons:
		 *	1) The thread we are checking, is ourselves, so there is
		 *	   no way the proc can go away.
		 *	2) The only time we need to be protected by the
		 *	   lock is if the binding is changed.
		 *
		 *	Note we will still take the lock and check the binding
		 *	if the condition was true without the lock held.  This
		 *	prevents lock contention among threads owned by the
		 *	same proc.
		 */

		if (curthread->t_proc_flag & TP_CHANGEBIND) {
			mutex_enter(&p->p_lock);
			if (curthread->t_proc_flag & TP_CHANGEBIND) {
				timer_lwpbind();
				curthread->t_proc_flag &= ~TP_CHANGEBIND;
			}
			mutex_exit(&p->p_lock);
		}

		/*
		 * for kaio requests on the special kaio poll queue,
		 * copyout their results to user memory.
		 */
		if (p->p_aio)
			aio_cleanup(0);
		/*
		 * If this LWP was asked to hold, call holdlwp(), which will
		 * stop.  holdlwps() sets this up and calls pokelwps() which
		 * sets the AST flag.
		 *
		 * Also check TP_EXITLWP, since this is used by fresh new LWPs
		 * through lwp_rtt().  That flag is set if the lwp_create(2)
		 * syscall failed after creating the LWP.
		 */
		if (ISHOLD(p) || (t->t_proc_flag & TP_EXITLWP))
			holdlwp();

		/*
		 * All code that sets signals and makes ISSIG_PENDING
		 * evaluate true must set t_sig_check afterwards.
		 */
		if (ISSIG_PENDING(t, lwp, p)) {
			if (issig(FORREAL))
				psig();
			t->t_sig_check = 1;	/* recheck next time */
		}

		if (sigprof) {
			int nargs = (code > 0 && code < NSYSCALL)?
			    LWP_GETSYSENT(lwp)[code].sy_narg : 0;
			realsigprof(code, nargs, error);
			t->t_sig_check = 1;	/* recheck next time */
		}

		/*
		 * If a performance counter overflow interrupt was
		 * delivered *during* the syscall, then re-enable the
		 * AST so that we take a trip through trap() to cause
		 * the SIGEMT to be delivered.
		 */
		if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW)
			aston(t);

		/*
		 * /proc can't enable/disable the trace bit itself
		 * because that could race with the call gate used by
		 * system calls via "lcall". If that happened, an
		 * invalid EFLAGS would result. prstep()/prnostep()
		 * therefore schedule an AST for the purpose.
		 */
		if (lwp->lwp_pcb.pcb_flags & REQUEST_STEP) {
			lwp->lwp_pcb.pcb_flags &= ~REQUEST_STEP;
			rp->r_ps |= PS_T;
		}
		if (lwp->lwp_pcb.pcb_flags & REQUEST_NOSTEP) {
			lwp->lwp_pcb.pcb_flags &= ~REQUEST_NOSTEP;
			rp->r_ps &= ~PS_T;
		}
	}

	lwp->lwp_errno = 0;		/* clear error for next time */

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active) {
		TNF_PROBE_3(syscall_end, "syscall thread", /* CSTYLED */,
		    tnf_long,	rval1,		rval1,
		    tnf_long,	rval2,		rval2,
		    tnf_long,	errno,		(long)error);
		repost = 1;
	}
#endif /* NPROBE */

	/*
	 * Set state to LWP_USER here so preempt won't give us a kernel
	 * priority if it occurs after this point.  Call CL_TRAPRET() to
	 * restore the user-level priority.
	 *
	 * It is important that no locks (other than spinlocks) be entered
	 * after this point before returning to user mode (unless lwp_state
	 * is set back to LWP_SYS).
	 *
	 * XXX Sampled times past this point are charged to the user.
	 */
	lwp->lwp_state = LWP_USER;

	if (t->t_trapret) {
		t->t_trapret = 0;
		thread_lock(t);
		CL_TRAPRET(t);
		thread_unlock(t);
	}
	if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ)
		preempt();
	prunstop();

	lwp->lwp_errno = 0;		/* clear error for next time */

	/*
	 * The thread lock must be held in order to clear sysnum and reset
	 * lwp_ap atomically with respect to other threads in the system that
	 * may be looking at the args via lwp_ap from get_syscall_args().
	 */

	thread_lock(t);
	t->t_sysnum = 0;		/* no longer in a system call */

	if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
#if defined(_LP64)
		/*
		 * In case the args were copied to the lwp, reset the
		 * pointer so the next syscall will have the right
		 * lwp_ap pointer.
		 */
		lwp->lwp_ap = (long *)&rp->r_rdi;
	} else {
#endif
		lwp->lwp_ap = NULL;	/* reset on every syscall entry */
	}
	thread_unlock(t);

	lwp->lwp_argsaved = 0;

	/*
	 * If there was a continuing reason for post-syscall processing,
	 * set the t_post_sys flag for the next system call.
	 */
	if (repost)
		t->t_post_sys = 1;

	/*
	 * If there is a ustack registered for this lwp, and the stack rlimit
	 * has been altered, read in the ustack. If the saved stack rlimit
	 * matches the bounds of the ustack, update the ustack to reflect
	 * the new rlimit. If the new stack rlimit is RLIM_INFINITY, disable
	 * stack checking by setting the size to 0.
	 */
	if (lwp->lwp_ustack != 0 && lwp->lwp_old_stk_ctl != 0) {
		rlim64_t new_size;
		caddr_t top;
		stack_t stk;
		struct rlimit64 rl;

		mutex_enter(&p->p_lock);
		new_size = p->p_stk_ctl;
		top = p->p_usrstack;
		(void) rctl_rlimit_get(rctlproc_legacy[RLIMIT_STACK], p, &rl);
		mutex_exit(&p->p_lock);

		if (rl.rlim_cur == RLIM64_INFINITY)
			new_size = 0;

		if (copyin((stack_t *)lwp->lwp_ustack, &stk,
		    sizeof (stack_t)) == 0 &&
		    (stk.ss_size == lwp->lwp_old_stk_ctl ||
		    stk.ss_size == 0) &&
		    stk.ss_sp == top - stk.ss_size) {
			stk.ss_sp = (void *)((uintptr_t)stk.ss_sp +
			    stk.ss_size - (uintptr_t)new_size);
			stk.ss_size = new_size;

			(void) copyout(&stk, (stack_t *)lwp->lwp_ustack,
			    sizeof (stack_t));
		}

		lwp->lwp_old_stk_ctl = 0;
	}
}

/*
 * Called from post_syscall() when a deferred singlestep is to be taken.
 */
void
deferred_singlestep_trap(caddr_t pc)
{
	proc_t *p = ttoproc(curthread);
	klwp_t *lwp = ttolwp(curthread);
	pcb_t *pcb = &lwp->lwp_pcb;
	uint_t fault = 0;
	k_siginfo_t siginfo;

	bzero(&siginfo, sizeof (siginfo));

	/*
	 * If both NORMAL_STEP and WATCH_STEP are in
	 * effect, give precedence to WATCH_STEP.
	 * If neither is set, user must have set the
	 * PS_T bit in %efl; treat this as NORMAL_STEP.
	 */
	if ((fault = undo_watch_step(&siginfo)) == 0 &&
	    ((pcb->pcb_flags & NORMAL_STEP) ||
	    !(pcb->pcb_flags & WATCH_STEP))) {
		siginfo.si_signo = SIGTRAP;
		siginfo.si_code = TRAP_TRACE;
		siginfo.si_addr = pc;
		fault = FLTTRACE;
	}
	pcb->pcb_flags &= ~(DEBUG_PENDING|NORMAL_STEP|WATCH_STEP);

	if (fault) {
		/*
		 * Remember the fault and fault address
		 * for real-time (SIGPROF) profiling.
		 */
		lwp->lwp_lastfault = fault;
		lwp->lwp_lastfaddr = siginfo.si_addr;
		/*
		 * If a debugger has declared this fault to be an
		 * event of interest, stop the lwp.  Otherwise just
		 * deliver the associated signal.
		 */
		if (prismember(&p->p_fltmask, fault) &&
		    stop_on_fault(fault, &siginfo) == 0)
			siginfo.si_signo = 0;
	}

	if (siginfo.si_signo)
		trapsig(&siginfo, 1);
}

/*
 * nonexistent system call -- signal lwp (may want to handle it)
 * flag error if lwp won't see signal immediately
 */
int64_t
nosys(void)
{
	tsignal(curthread, SIGSYS);
	return (set_errno(ENOSYS));
}

int
nosys32(void)
{
	return (nosys());
}

/*
 * Execute a 32-bit system call on behalf of the current thread.
 */
void
dosyscall(void)
{
	/*
	 * Need space on the stack to store syscall arguments.
	 */
	long		syscall_args[MAXSYSARGS];
	struct sysent	*se;
	int64_t		ret;

	syscall_mstate(LMS_TRAP, LMS_SYSTEM);

	ASSERT(curproc->p_model == DATAMODEL_ILP32);

	CPU_STATS_ENTER_K();
	CPU_STATS_ADDQ(CPU, sys, syscall, 1);
	CPU_STATS_EXIT_K();

	se = syscall_entry(curthread, syscall_args);

	/*
	 * syscall_entry() copied all 8 arguments into syscall_args.
	 */
	ret = se->sy_callc(syscall_args[0], syscall_args[1], syscall_args[2],
	    syscall_args[3], syscall_args[4], syscall_args[5], syscall_args[6],
	    syscall_args[7]);

	syscall_exit(curthread, (int)ret & 0xffffffffu, (int)(ret >> 32));
	syscall_mstate(LMS_SYSTEM, LMS_TRAP);
}

/*
 * Get the arguments to the current system call. See comment atop
 * save_syscall_args() regarding lwp_ap usage.
 */

uint_t
get_syscall_args(klwp_t *lwp, long *argp, int *nargsp)
{
	kthread_t	*t = lwptot(lwp);
	ulong_t	mask = 0xfffffffful;
	uint_t	code;
	long	*ap;
	int	nargs;

#if defined(_LP64)
	if (lwp_getdatamodel(lwp) == DATAMODEL_LP64)
		mask = 0xfffffffffffffffful;
#endif

	/*
	 * The thread lock must be held while looking at the arguments to
	 * ensure they don't go away via post_syscall().
	 * get_syscall_args() is the only routine to read them which is
	 * callable outside the LWP in question and hence the only one that
	 * must be synchronized in this manner.
	 */
	thread_lock(t);

	code = t->t_sysnum;
	ap = lwp->lwp_ap;

	thread_unlock(t);

	if (code != 0 && code < NSYSCALL) {
		nargs = LWP_GETSYSENT(lwp)[code].sy_narg;

		ASSERT(nargs <= MAXSYSARGS);

		*nargsp = nargs;
		while (nargs-- > 0)
			*argp++ = *ap++ & mask;
	} else {
		*nargsp = 0;
	}

	return (code);
}
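
/*
 * A minimal usage sketch (get_syscall32_args() below does exactly
 * this): callers supply room for MAXSYSARGS arguments and get back the
 * current syscall number, or 0 if the LWP is not in a system call.
 *
 *	long args[MAXSYSARGS];
 *	int nargs;
 *	uint_t code = get_syscall_args(lwp, args, &nargs);
 */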

#ifdef _SYSCALL32_IMPL
/*
 * Get the arguments to the current 32-bit system call.
 */
uint_t
get_syscall32_args(klwp_t *lwp, int *argp, int *nargsp)
{
	long args[MAXSYSARGS];
	uint_t i, code;

	code = get_syscall_args(lwp, args, nargsp);

	for (i = 0; i != *nargsp; i++)
		*argp++ = (int)args[i];
	return (code);
}
#endif

/*
 * Save the system call arguments in a safe place.
 *
 * On the i386 kernel:
 *
 *	Copy the user's args prior to changing the stack or stack pointer.
 *	This is so /proc will be able to get a valid copy of the
 *	args from the user stack even after the user stack has been changed.
 *	Note that the kernel stack copy of the args may also have been
 *	changed by a system call handler which takes C-style arguments.
 *
 *	Note that this may be called by stop() from trap().  In that case
 *	t_sysnum will be zero (syscall_exit clears it), so no args will be
 *	copied.
 *
 * On the amd64 kernel:
 *
 *	For 64-bit applications, lwp->lwp_ap normally points to %rdi..%r9
 *	in the reg structure. If the user is going to change the argument
 *	registers, rax, or the stack and might want to get the args (for
 *	/proc tracing), it must copy the args elsewhere via save_syscall_args().
 *
 *	For 32-bit applications, lwp->lwp_ap normally points to a copy of
 *	the system call arguments on the kernel stack made from the user
 *	stack.  Copy the args prior to changing the stack or stack pointer.
 *	This is so /proc will be able to get a valid copy of the args
 *	from the user stack even after that stack has been changed.
 *
 *	This may be called from stop() even when we're not in a system call.
 *	Since there's no easy way to tell, this must be safe (not panic).
 *	If the copyins get data faults, return non-zero.
 */
int
save_syscall_args()
{
	kthread_t	*t = curthread;
	klwp_t		*lwp = ttolwp(t);
	uint_t		code = t->t_sysnum;
	uint_t		nargs;

	if (lwp->lwp_argsaved || code == 0)
		return (0);		/* args already saved or not needed */

	if (code >= NSYSCALL) {
		nargs = 0;		/* illegal syscall */
	} else {
		struct sysent *se = LWP_GETSYSENT(lwp);
		struct sysent *callp = se + code;

		nargs = callp->sy_narg;
		if (LOADABLE_SYSCALL(callp) && nargs == 0) {
			krwlock_t	*module_lock;

			/*
			 * Find out how many arguments the system
			 * call uses.
			 *
			 * We have the property that loaded syscalls
			 * never change the number of arguments they
			 * use after they've been loaded once.  This
			 * allows us to stop for /proc tracing without
			 * holding the module lock.
			 * /proc is assured that sy_narg is valid.
			 */
			module_lock = lock_syscall(se, code);
			nargs = callp->sy_narg;
			rw_exit(module_lock);
		}
	}

	/*
	 * Fetch the system call arguments.
	 */
	if (nargs == 0)
		goto out;

	ASSERT(nargs <= MAXSYSARGS);

	if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
#if defined(_LP64)
		struct regs *rp = lwptoregs(lwp);

		lwp->lwp_arg[0] = rp->r_rdi;
		lwp->lwp_arg[1] = rp->r_rsi;
		lwp->lwp_arg[2] = rp->r_rdx;
		lwp->lwp_arg[3] = rp->r_rcx;
		lwp->lwp_arg[4] = rp->r_r8;
		lwp->lwp_arg[5] = rp->r_r9;
		if (nargs > 6 && copyin_args(rp, &lwp->lwp_arg[6], nargs - 6))
			return (-1);
	} else {
#endif
		if (COPYIN_ARGS32(lwptoregs(lwp), lwp->lwp_arg, nargs))
			return (-1);
	}
out:
	lwp->lwp_ap = lwp->lwp_arg;
	lwp->lwp_argsaved = 1;
	t->t_post_sys = 1;	/* so lwp_ap will be reset */
	return (0);
}
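
/*
 * Note: callers (pre_syscall() and post_syscall() above) invoke
 * save_syscall_args() before anything such as /proc or SIGPROF
 * profiling might clobber the argument registers or the user stack,
 * so that lwp_ap points at a stable copy in lwp_arg[] for the rest of
 * the system call.
 */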

void
reset_syscall_args(void)
{
	ttolwp(curthread)->lwp_argsaved = 0;
}

/*
 * Call a system call which takes a pointer to the user args struct and
 * a pointer to the return values.  This is a bit slower than the standard
 * C arg-passing method in some cases.
 */
int64_t
syscall_ap(void)
{
	uint_t	error;
	struct sysent *callp;
	rval_t	rval;
	kthread_t *t = curthread;
	klwp_t	*lwp = ttolwp(t);
	struct regs *rp = lwptoregs(lwp);

	callp = LWP_GETSYSENT(lwp) + t->t_sysnum;

#if defined(__amd64)
	/*
	 * If the arguments don't fit in registers %rdi-%r9, make sure they
	 * have been copied to the lwp_arg array.
	 */
	if (callp->sy_narg > 6 && save_syscall_args())
		return ((int64_t)set_errno(EFAULT));
#endif

	rval.r_val1 = 0;
	rval.r_val2 = rp->r_r1;
	lwp->lwp_error = 0;	/* for old drivers */
	error = (*(callp->sy_call))(lwp->lwp_ap, &rval);
	if (error)
		return ((longlong_t)set_errno(error));
	return (rval.r_vals);
}

/*
 * Load system call module.
 *	Returns with pointer to held read lock for module.
 */
static krwlock_t *
lock_syscall(struct sysent *table, uint_t code)
{
	krwlock_t	*module_lock;
	struct modctl	*modp;
	int		id;
	struct sysent   *callp;

	callp = table + code;
	module_lock = callp->sy_lock;

	/*
	 * Optimization to only call modload if we don't have a loaded
	 * syscall.
	 */
	rw_enter(module_lock, RW_READER);
	if (LOADED_SYSCALL(callp))
		return (module_lock);
	rw_exit(module_lock);

	for (;;) {
		if ((id = modload("sys", syscallnames[code])) == -1)
			break;

		/*
		 * If we loaded successfully at least once, the modctl
		 * will still be valid, so we try to grab it by filename.
		 * If this call fails, it's because the mod_filename
		 * was changed after the call to modload() (mod_hold_by_name()
		 * is the likely culprit).  We can safely just take
		 * another lap if this is the case; the modload() will
		 * change the mod_filename back to one by which we can
		 * find the modctl.
		 */
		modp = mod_find_by_filename("sys", syscallnames[code]);

		if (modp == NULL)
			continue;

		mutex_enter(&mod_lock);

		if (!modp->mod_installed) {
			mutex_exit(&mod_lock);
			continue;
		}
		break;
	}
	rw_enter(module_lock, RW_READER);

	if (id != -1)
		mutex_exit(&mod_lock);

	return (module_lock);
}
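
/*
 * Lock pairing note: lock_syscall() always returns with the sysent's
 * sy_lock held as reader, even if the modload() fails (the entry then
 * still names nosys).  Every caller (save_syscall_args() above,
 * loadable_syscall() below) must drop the lock with rw_exit().
 */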

/*
 * Loadable syscall support.
 *	If needed, load the module, then reserve it by holding a read
 *	lock for the duration of the call.
 *	Later, if the syscall is not unloadable, it could patch the vector.
 */
/*ARGSUSED*/
int64_t
loadable_syscall(
    long a0, long a1, long a2, long a3,
    long a4, long a5, long a6, long a7)
{
	klwp_t *lwp = ttolwp(curthread);
	int64_t	rval;
	struct sysent *callp;
	struct sysent *se = LWP_GETSYSENT(lwp);
	krwlock_t *module_lock;
	int code, error = 0;

	code = curthread->t_sysnum;
	callp = se + code;

	/*
	 * Try to autoload the system call if necessary
	 */
	module_lock = lock_syscall(se, code);
	THREAD_KPRI_RELEASE();	/* drop priority given by rw_enter */

	/*
	 * we've locked either the loaded syscall or nosys
	 */

	if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
#if defined(_LP64)
		if (callp->sy_flags & SE_ARGC) {
			rval = (int64_t)(*callp->sy_call)(a0, a1, a2, a3,
			    a4, a5);
		} else {
			rval = syscall_ap();
		}
	} else {
#endif
		/*
		 * Now that it's loaded, make sure enough args were copied.
		 */
		if (COPYIN_ARGS32(lwptoregs(lwp), lwp->lwp_ap, callp->sy_narg))
			error = EFAULT;
		if (error) {
			rval = set_errno(error);
		} else if (callp->sy_flags & SE_ARGC) {
			rval = (int64_t)(*callp->sy_call)(lwp->lwp_ap[0],
			    lwp->lwp_ap[1], lwp->lwp_ap[2], lwp->lwp_ap[3],
			    lwp->lwp_ap[4], lwp->lwp_ap[5]);
		} else {
			rval = syscall_ap();
		}
	}

	THREAD_KPRI_REQUEST();	/* regain priority from read lock */
	rw_exit(module_lock);
	return (rval);
}

/*
 * Indirect syscall handled in libc on x86 architectures
 */
int64_t
indir()
{
	return (nosys());
}

/*
 * set_errno - set an error return from the current system call.
 *	This could be a macro.
 *	This returns the value it is passed, so that the caller can
 *	use tail-recursion-elimination and do return (set_errno(ERRNO));
 */
uint_t
set_errno(uint_t error)
{
	ASSERT(error != 0);		/* must not be used to clear errno */

	curthread->t_post_sys = 1;	/* have post_syscall do error return */
	return (ttolwp(curthread)->lwp_errno = error);
}
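
/*
 * Typical use, relying on the pass-through return value (as nosys()
 * above does):
 *
 *	return (set_errno(ENOSYS));
 */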

/*
 * set_proc_pre_sys - Set pre-syscall processing for entire process.
 */
void
set_proc_pre_sys(proc_t *p)
{
	kthread_t	*t;
	kthread_t	*first;

	ASSERT(MUTEX_HELD(&p->p_lock));

	t = first = p->p_tlist;
	do {
		t->t_pre_sys = 1;
	} while ((t = t->t_forw) != first);
}

/*
 * set_proc_post_sys - Set post-syscall processing for entire process.
 */
void
set_proc_post_sys(proc_t *p)
{
	kthread_t	*t;
	kthread_t	*first;

	ASSERT(MUTEX_HELD(&p->p_lock));

	t = first = p->p_tlist;
	do {
		t->t_post_sys = 1;
	} while ((t = t->t_forw) != first);
}

/*
 * set_proc_sys - Set pre- and post-syscall processing for entire process.
 */
void
set_proc_sys(proc_t *p)
{
	kthread_t	*t;
	kthread_t	*first;

	ASSERT(MUTEX_HELD(&p->p_lock));

	t = first = p->p_tlist;
	do {
		t->t_pre_sys = 1;
		t->t_post_sys = 1;
	} while ((t = t->t_forw) != first);
}

/*
 * set_all_proc_sys - set pre- and post-syscall processing flags for all
 * user processes.
 *
 * This is needed when auditing, tracing, or other facilities which affect
 * all processes are turned on.
 */
void
set_all_proc_sys()
{
	kthread_t	*t;
	kthread_t	*first;

	mutex_enter(&pidlock);
	t = first = curthread;
	do {
		t->t_pre_sys = 1;
		t->t_post_sys = 1;
	} while ((t = t->t_next) != first);
	mutex_exit(&pidlock);
}

/*
 * set_all_zone_usr_proc_sys - set pre- and post-syscall processing flags for
 * all user processes running in the zone of the current process
 *
 * This is needed when auditing, tracing, or other facilities which affect
 * all processes are turned on.
 */
void
set_all_zone_usr_proc_sys(zoneid_t zoneid)
{
	proc_t	    *p;
	kthread_t   *t;

	mutex_enter(&pidlock);
	for (p = practive; p != NULL; p = p->p_next) {
		/* skip kernel and incomplete processes */
		if (p->p_exec == NULLVP || p->p_as == &kas ||
		    p->p_stat == SIDL || p->p_stat == SZOMB ||
		    (p->p_flag & (SSYS | SEXITING | SEXITLWPS)))
			continue;
		/*
		 * Only processes in the given zone (eventually in
		 * all zones) are taken into account
		 */
		if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) {
			mutex_enter(&p->p_lock);
			if ((t = p->p_tlist) == NULL) {
				mutex_exit(&p->p_lock);
				continue;
			}
			/*
			 * Set pre- and post-syscall processing flags
			 * for all threads of the process
			 */
			do {
				t->t_pre_sys = 1;
				t->t_post_sys = 1;
			} while (p->p_tlist != (t = t->t_forw));
			mutex_exit(&p->p_lock);
		}
	}
	mutex_exit(&pidlock);
}

/*
 * set_proc_ast - Set asynchronous service trap (AST) flag for all
 * threads in process.
 */
void
set_proc_ast(proc_t *p)
{
	kthread_t	*t;
	kthread_t	*first;

	ASSERT(MUTEX_HELD(&p->p_lock));

	t = first = p->p_tlist;
	do {
		aston(t);
	} while ((t = t->t_forw) != first);
}