1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26#include <sys/param.h>
27#include <sys/vmparam.h>
28#include <sys/types.h>
29#include <sys/sysmacros.h>
30#include <sys/systm.h>
31#include <sys/cmn_err.h>
32#include <sys/signal.h>
33#include <sys/stack.h>
34#include <sys/cred.h>
35#include <sys/user.h>
36#include <sys/debug.h>
37#include <sys/errno.h>
38#include <sys/proc.h>
39#include <sys/var.h>
40#include <sys/inline.h>
41#include <sys/syscall.h>
42#include <sys/ucontext.h>
43#include <sys/cpuvar.h>
44#include <sys/siginfo.h>
45#include <sys/trap.h>
46#include <sys/machtrap.h>
47#include <sys/sysinfo.h>
48#include <sys/procfs.h>
49#include <sys/prsystm.h>
50#include <sys/fpu/fpusystm.h>
51#include <sys/modctl.h>
52#include <sys/aio_impl.h>
53#include <c2/audit.h>
54#include <sys/tnf.h>
55#include <sys/tnf_probe.h>
56#include <sys/machpcb.h>
57#include <sys/privregs.h>
58#include <sys/copyops.h>
59#include <sys/timer.h>
60#include <sys/priv.h>
61#include <sys/msacct.h>
62
/*
 * Run-time switch for the console tracing that is compiled in under
 * SYSCALLTRACE; pre_syscall() and post_syscall() test it before printing.
 */
int syscalltrace = 0;
#ifdef SYSCALLTRACE
static kmutex_t	systrace_lock;		/* syscall tracing lock */
#endif /* SYSCALLTRACE */

/* Forward declaration; the definition appears later in this file. */
static krwlock_t *lock_syscall(struct sysent *, uint_t);
69
#ifdef _SYSCALL32_IMPL
/*
 * Pick the system call table that matches the lwp's data model:
 * the native table for native lwps, the 32-bit table for all others.
 */
static struct sysent *
lwp_getsysent(klwp_t *lwp)
{
	return ((lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) ?
	    sysent : sysent32);
}
#define	LWP_GETSYSENT(lwp)	(lwp_getsysent(lwp))
#else
#define	LWP_GETSYSENT(lwp)	(sysent)
#endif
82
83/*
84 * Called to restore the lwp's register window just before
85 * returning to user level (only if the registers have been
86 * fetched or modified through /proc).
87 */
88/*ARGSUSED1*/
89void
90xregrestore(klwp_t *lwp, int shared)
91{
92	/*
93	 * If locals+ins were modified by /proc copy them out.
94	 * Also copy to the shared window, if necessary.
95	 */
96	if (lwp->lwp_pcb.pcb_xregstat == XREGMODIFIED) {
97		struct machpcb *mpcb = lwptompcb(lwp);
98		caddr_t sp = (caddr_t)lwptoregs(lwp)->r_sp;
99
100		size_t rwinsize;
101		caddr_t rwp;
102		int is64;
103
104		if (lwp_getdatamodel(lwp) == DATAMODEL_LP64) {
105			rwinsize = sizeof (struct rwindow);
106			rwp = sp + STACK_BIAS;
107			is64 = 1;
108		} else {
109			rwinsize = sizeof (struct rwindow32);
110			sp = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)sp;
111			rwp = sp;
112			is64 = 0;
113		}
114
115		if (is64)
116			(void) copyout_nowatch(&lwp->lwp_pcb.pcb_xregs,
117			    rwp, rwinsize);
118		else {
119			struct rwindow32 rwindow32;
120			int watched;
121
122			watched = watch_disable_addr(rwp, rwinsize, S_WRITE);
123			rwindow_nto32(&lwp->lwp_pcb.pcb_xregs, &rwindow32);
124			(void) copyout(&rwindow32, rwp, rwinsize);
125			if (watched)
126				watch_enable_addr(rwp, rwinsize, S_WRITE);
127		}
128
129		/* also copy to the user return window */
130		mpcb->mpcb_rsp[0] = sp;
131		mpcb->mpcb_rsp[1] = NULL;
132		bcopy(&lwp->lwp_pcb.pcb_xregs, &mpcb->mpcb_rwin[0],
133		    sizeof (lwp->lwp_pcb.pcb_xregs));
134	}
135	lwp->lwp_pcb.pcb_xregstat = XREGNONE;
136}
137
138
139/*
140 * Get the arguments to the current system call.
141 *	lwp->lwp_ap normally points to the out regs in the reg structure.
142 *	If the user is going to change the out registers and might want to
143 *	get the args (for /proc tracing), it must copy the args elsewhere
144 *	via save_syscall_args().
145 */
146uint_t
147get_syscall_args(klwp_t *lwp, long *argp, int *nargsp)
148{
149	kthread_t	*t = lwptot(lwp);
150	uint_t	code = t->t_sysnum;
151	long	mask;
152	long	*ap;
153	int	nargs;
154
155	if (lwptoproc(lwp)->p_model == DATAMODEL_ILP32)
156		mask = (uint32_t)0xffffffffU;
157	else
158		mask = 0xffffffffffffffff;
159
160	if (code != 0 && code < NSYSCALL) {
161
162		nargs = LWP_GETSYSENT(lwp)[code].sy_narg;
163
164		ASSERT(nargs <= MAXSYSARGS);
165
166		*nargsp = nargs;
167		ap = lwp->lwp_ap;
168		while (nargs-- > 0)
169			*argp++ = *ap++ & mask;
170	} else {
171		*nargsp = 0;
172	}
173	return (code);
174}
175
#ifdef _SYSCALL32_IMPL
/*
 * Get the arguments to the current 32-bit system call.
 *	Fetches the arguments with get_syscall_args() and narrows each
 *	one to an int.  Returns the system call number.
 */
uint_t
get_syscall32_args(klwp_t *lwp, int *argp, int *nargsp)
{
	long	wide[MAXSYSARGS];
	uint_t	sysnum;
	uint_t	idx = 0;

	sysnum = get_syscall_args(lwp, wide, nargsp);
	while (idx != *nargsp) {
		argp[idx] = (int)wide[idx];
		idx++;
	}
	return (sysnum);
}
#endif
192
193/*
194 * 	Save the system call arguments in a safe place.
195 *	lwp->lwp_ap normally points to the out regs in the reg structure.
196 *	If the user is going to change the out registers, g1, or the stack,
197 *	and might want to get the args (for /proc tracing), it must copy
198 *	the args elsewhere via save_syscall_args().
199 *
200 *	This may be called from stop() even when we're not in a system call.
201 *	Since there's no easy way to tell, this must be safe (not panic).
202 *	If the copyins get data faults, return non-zero.
203 */
204int
205save_syscall_args()
206{
207	kthread_t	*t = curthread;
208	klwp_t		*lwp = ttolwp(t);
209	struct regs	*rp = lwptoregs(lwp);
210	uint_t		code = t->t_sysnum;
211	uint_t		nargs;
212	int		i;
213	caddr_t		ua;
214	model_t		datamodel;
215
216	if (lwp->lwp_argsaved || code == 0)
217		return (0);		/* args already saved or not needed */
218
219	if (code >= NSYSCALL) {
220		nargs = 0;		/* illegal syscall */
221	} else {
222		struct sysent *se = LWP_GETSYSENT(lwp);
223		struct sysent *callp = se + code;
224
225		nargs = callp->sy_narg;
226		if (LOADABLE_SYSCALL(callp) && nargs == 0) {
227			krwlock_t	*module_lock;
228
229			/*
230			 * Find out how many arguments the system
231			 * call uses.
232			 *
233			 * We have the property that loaded syscalls
234			 * never change the number of arguments they
235			 * use after they've been loaded once.  This
236			 * allows us to stop for /proc tracing without
237			 * holding the module lock.
238			 * /proc is assured that sy_narg is valid.
239			 */
240			module_lock = lock_syscall(se, code);
241			nargs = callp->sy_narg;
242			rw_exit(module_lock);
243		}
244	}
245
246	/*
247	 * Fetch the system call arguments.
248	 */
249	if (nargs == 0)
250		goto out;
251
252
253	ASSERT(nargs <= MAXSYSARGS);
254
255	if ((datamodel = lwp_getdatamodel(lwp)) == DATAMODEL_ILP32) {
256
257		if (rp->r_g1 == 0) {	/* indirect syscall */
258
259			lwp->lwp_arg[0] = (uint32_t)rp->r_o1;
260			lwp->lwp_arg[1] = (uint32_t)rp->r_o2;
261			lwp->lwp_arg[2] = (uint32_t)rp->r_o3;
262			lwp->lwp_arg[3] = (uint32_t)rp->r_o4;
263			lwp->lwp_arg[4] = (uint32_t)rp->r_o5;
264			if (nargs > 5) {
265				ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
266				    (rp->r_sp + MINFRAME32);
267				for (i = 5; i < nargs; i++) {
268					uint32_t a;
269					if (fuword32(ua, &a) != 0)
270						return (-1);
271					lwp->lwp_arg[i] = a;
272					ua += sizeof (a);
273				}
274			}
275		} else {
276			lwp->lwp_arg[0] = (uint32_t)rp->r_o0;
277			lwp->lwp_arg[1] = (uint32_t)rp->r_o1;
278			lwp->lwp_arg[2] = (uint32_t)rp->r_o2;
279			lwp->lwp_arg[3] = (uint32_t)rp->r_o3;
280			lwp->lwp_arg[4] = (uint32_t)rp->r_o4;
281			lwp->lwp_arg[5] = (uint32_t)rp->r_o5;
282			if (nargs > 6) {
283				ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
284				    (rp->r_sp + MINFRAME32);
285				for (i = 6; i < nargs; i++) {
286					uint32_t a;
287					if (fuword32(ua, &a) != 0)
288						return (-1);
289					lwp->lwp_arg[i] = a;
290					ua += sizeof (a);
291				}
292			}
293		}
294	} else {
295		ASSERT(datamodel == DATAMODEL_LP64);
296		lwp->lwp_arg[0] = rp->r_o0;
297		lwp->lwp_arg[1] = rp->r_o1;
298		lwp->lwp_arg[2] = rp->r_o2;
299		lwp->lwp_arg[3] = rp->r_o3;
300		lwp->lwp_arg[4] = rp->r_o4;
301		lwp->lwp_arg[5] = rp->r_o5;
302		if (nargs > 6) {
303			ua = (caddr_t)rp->r_sp + MINFRAME + STACK_BIAS;
304			for (i = 6; i < nargs; i++) {
305				unsigned long a;
306				if (fulword(ua, &a) != 0)
307					return (-1);
308				lwp->lwp_arg[i] = a;
309				ua += sizeof (a);
310			}
311		}
312	}
313
314out:
315	lwp->lwp_ap = lwp->lwp_arg;
316	lwp->lwp_argsaved = 1;
317	t->t_post_sys = 1;	/* so lwp_ap will be reset */
318	return (0);
319}
320
321void
322reset_syscall_args(void)
323{
324	klwp_t *lwp = ttolwp(curthread);
325
326	lwp->lwp_ap = (long *)&lwptoregs(lwp)->r_o0;
327	lwp->lwp_argsaved = 0;
328}
329
330/*
331 * nonexistent system call-- signal lwp (may want to handle it)
332 * flag error if lwp won't see signal immediately
333 * This works for old or new calling sequence.
334 */
335int64_t
336nosys()
337{
338	tsignal(curthread, SIGSYS);
339	return ((int64_t)set_errno(ENOSYS));
340}
341
/*
 * Perform pre-system-call processing, including stopping for tracing,
 * auditing, microstate-accounting, etc.
 *
 * This routine is called only if the t_pre_sys flag is set.  Any condition
 * requiring pre-syscall handling must set the t_pre_sys flag.  If the
 * condition is persistent, this routine will repost t_pre_sys.
 *
 * arg0 is used as the system call number when t_sysnum is zero and arg0
 * is non-zero (the indirect system call case).
 *
 * Returns 0 if the system call should proceed.  Returns 1 (with EINTR
 * set) if the call was aborted through lwp_sysabort, or the non-zero
 * value returned by audit_start() if auditing rejected the call.
 */
int
pre_syscall(int arg0)
{
	unsigned int code;
	kthread_t *t = curthread;
	proc_t *p = ttoproc(t);
	klwp_t *lwp = ttolwp(t);
	struct regs *rp = lwptoregs(lwp);
	int	repost;		/* non-zero if t_pre_sys must be set again */

	t->t_pre_sys = repost = 0;	/* clear pre-syscall processing flag */

	ASSERT(t->t_schedflag & TS_DONT_SWAP);

	syscall_mstate(LMS_USER, LMS_SYSTEM);

	/*
	 * The syscall arguments in the out registers should be pointed to
	 * by lwp_ap.  If the args need to be copied so that the outs can
	 * be changed without losing the ability to get the args for /proc,
	 * they can be saved by save_syscall_args(), and lwp_ap will be
	 * restored by post_syscall().
	 */
	ASSERT(lwp->lwp_ap == (long *)&rp->r_o0);

	/*
	 * Make sure the thread is holding the latest credentials for the
	 * process.  The credentials in the process right now apply to this
	 * thread for the entire system call.
	 */
	if (t->t_cred != p->p_cred) {
		cred_t *oldcred = t->t_cred;
		/*
		 * DTrace accesses t_cred in probe context.  t_cred must
		 * always be either NULL, or point to a valid, allocated cred
		 * structure.
		 */
		t->t_cred = crgetcred();
		crfree(oldcred);
	}

	/*
	 * Undo special arrangements to single-step the lwp
	 * so that a debugger will see valid register contents.
	 * Also so that the pc is valid for syncfpu().
	 * Also so that a syscall like exec() can be stepped.
	 */
	if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
		(void) prundostep();
		repost = 1;
	}

	/*
	 * Check for indirect system call in case we stop for tracing.
	 * Don't allow multiple indirection.
	 */
	code = t->t_sysnum;
	if (code == 0 && arg0 != 0) {		/* indirect syscall */
		code = arg0;
		t->t_sysnum = arg0;
	}

	/*
	 * From the proc(4) manual page:
	 * When entry to a system call is being traced, the traced process
	 * stops after having begun the call to the system but before the
	 * system call arguments have been fetched from the process.
	 * If proc changes the args we must refetch them after starting.
	 */
	if (PTOU(p)->u_systrap) {
		/* unlocked pre-check; repeated below with p_lock held */
		if (prismember(&PTOU(p)->u_entrymask, code)) {
			/*
			 * Recheck stop condition, now that lock is held.
			 */
			mutex_enter(&p->p_lock);
			if (PTOU(p)->u_systrap &&
			    prismember(&PTOU(p)->u_entrymask, code)) {
				stop(PR_SYSENTRY, code);
				/*
				 * Must refetch args since they were
				 * possibly modified by /proc.  Indicate
				 * that the valid copy is in the
				 * registers.
				 */
				lwp->lwp_argsaved = 0;
				lwp->lwp_ap = (long *)&rp->r_o0;
			}
			mutex_exit(&p->p_lock);
		}
		repost = 1;
	}

	if (lwp->lwp_sysabort) {
		/*
		 * lwp_sysabort may have been set via /proc while the process
		 * was stopped on PR_SYSENTRY.  If so, abort the system call.
		 * Override any error from the copyin() of the arguments.
		 */
		lwp->lwp_sysabort = 0;
		(void) set_errno(EINTR); /* sets post-sys processing */
		t->t_pre_sys = 1;	/* repost anyway */
		return (1);		/* don't do system call, return EINTR */
	}

	/* begin auditing for this syscall */
	if (audit_active == C2AUDIT_LOADED) {
		uint32_t auditing = au_zone_getstate(NULL);

		if (auditing & AU_AUDIT_MASK) {
			int error;
			/* a non-zero result from audit_start() ends the call */
			if (error = audit_start(T_SYSCALL, code, auditing, \
			    0, lwp)) {
				t->t_pre_sys = 1;	/* repost anyway */
				lwp->lwp_error = 0;	/* for old drivers */
				return (error);
			}
			repost = 1;
		}
	}

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active) {
		TNF_PROBE_1(syscall_start, "syscall thread", /* CSTYLED */,
			tnf_sysnum,	sysnum,		t->t_sysnum);
		t->t_post_sys = 1;	/* make sure post_syscall runs */
		repost = 1;
	}
#endif /* NPROBE */

#ifdef SYSCALLTRACE
	/* Print "pid: name[code](args) command id=thread" on the console. */
	if (syscalltrace) {
		int i;
		long *ap;
		char *cp;
		char *sysname;
		struct sysent *callp;

		if (code >= NSYSCALL)
			callp = &nosys_ent;	/* nosys has no args */
		else
			callp = LWP_GETSYSENT(lwp) + code;
		/* result ignored: tracing is best effort */
		(void) save_syscall_args();
		mutex_enter(&systrace_lock);
		printf("%d: ", p->p_pid);
		if (code >= NSYSCALL)
			printf("0x%x", code);
		else {
			sysname = mod_getsysname(code);
			printf("%s[0x%x]", sysname == NULL ? "NULL" :
			    sysname, code);
		}
		cp = "(";
		for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) {
			printf("%s%lx", cp, *ap);
			cp = ", ";
		}
		/* close the parenthesis only if at least one arg was printed */
		if (i)
			printf(")");
		printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread);
		mutex_exit(&systrace_lock);
	}
#endif /* SYSCALLTRACE */

	/*
	 * If there was a continuing reason for pre-syscall processing,
	 * set the t_pre_sys flag for the next system call.
	 */
	if (repost)
		t->t_pre_sys = 1;
	lwp->lwp_error = 0;	/* for old drivers */
	lwp->lwp_badpriv = PRIV_NONE;	/* for privilege tracing */
	return (0);
}
524
/*
 * Post-syscall processing.  Perform abnormal system call completion
 * actions such as /proc tracing, profiling, signals, preemption, etc.
 *
 * This routine is called only if t_post_sys, t_sig_check, or t_astflag is set.
 * Any condition requiring post-syscall handling must set one of these.
 * If the condition is persistent, this routine will repost t_post_sys.
 *
 * rval1 and rval2 are the system call's return values; on a successful
 * NORMALRETURN they are stored into the saved %o0 and %o1.  On failure
 * the error number from lwp_errno is stored into %o0 instead.
 */
void
post_syscall(long rval1, long rval2)
{
	kthread_t	*t = curthread;
	proc_t	*p = curproc;
	klwp_t	*lwp = ttolwp(t);
	struct regs *rp = lwptoregs(lwp);
	uint_t	error;
	int	code = t->t_sysnum;
	int	repost = 0;		/* non-zero to set t_post_sys again */
	int	proc_stop = 0;		/* non-zero if stopping for /proc */
	int	sigprof = 0;		/* non-zero if sending SIGPROF */

	t->t_post_sys = 0;

	error = lwp->lwp_errno;

	/*
	 * Code can be zero if this is a new LWP returning after a forkall(),
	 * other than the one which matches the one in the parent which called
	 * forkall().  In these LWPs, skip most of post-syscall activity.
	 */
	if (code == 0)
		goto sig_check;

	/* put out audit record for this syscall */
	if (AU_AUDITING()) {
		rval_t	rval;	/* fix audit_finish() someday */

		/* XX64 -- truncation of 64-bit return values? */
		rval.r_val1 = (int)rval1;
		rval.r_val2 = (int)rval2;
		audit_finish(T_SYSCALL, code, error, &rval);
		repost = 1;
	}

	/* Deliver and free any message left pending in t_pdmsg. */
	if (curthread->t_pdmsg != NULL) {
		char *m = curthread->t_pdmsg;

		uprintf("%s", m);
		kmem_free(m, strlen(m) + 1);
		curthread->t_pdmsg = NULL;
	}

	/*
	 * If we're going to stop for /proc tracing, set the flag and
	 * save the arguments so that the return values don't smash them.
	 */
	if (PTOU(p)->u_systrap) {
		if (prismember(&PTOU(p)->u_exitmask, code)) {
			proc_stop = 1;
			(void) save_syscall_args();
		}
		repost = 1;
	}

	/*
	 * Similarly check to see if SIGPROF might be sent.
	 */
	if (curthread->t_rprof != NULL &&
	    curthread->t_rprof->rp_anystate != 0) {
		(void) save_syscall_args();
		sigprof = 1;
	}

	if (lwp->lwp_eosys == NORMALRETURN) {
		if (error == 0) {
#ifdef SYSCALLTRACE
			if (syscalltrace) {
				mutex_enter(&systrace_lock);
				printf(
				    "%d: r_val1=0x%lx, r_val2=0x%lx, id 0x%p\n",
				    p->p_pid, rval1, rval2, curthread);
				mutex_exit(&systrace_lock);
			}
#endif /* SYSCALLTRACE */
			/*
			 * Success: clear TSTATE_IC and return both values.
			 * NOTE(review): TSTATE_IC is presumably the carry
			 * bit that user level tests for failure -- confirm.
			 */
			rp->r_tstate &= ~TSTATE_IC;
			rp->r_o0 = rval1;
			rp->r_o1 = rval2;
		} else {
			int sig;

#ifdef SYSCALLTRACE
			if (syscalltrace) {
				mutex_enter(&systrace_lock);
				printf("%d: error=%d, id 0x%p\n",
				    p->p_pid, error, curthread);
				mutex_exit(&systrace_lock);
			}
#endif /* SYSCALLTRACE */
			/* an interrupted call on a stale fd reports EBADF */
			if (error == EINTR && t->t_activefd.a_stale)
				error = EBADF;
			/*
			 * EINTR becomes ERESTART when the current signal is
			 * in u_sigrestart and its disposition is neither
			 * SIG_DFL nor SIG_IGN.
			 */
			if (error == EINTR &&
			    (sig = lwp->lwp_cursig) != 0 &&
			    sigismember(&PTOU(p)->u_sigrestart, sig) &&
			    PTOU(p)->u_signal[sig - 1] != SIG_DFL &&
			    PTOU(p)->u_signal[sig - 1] != SIG_IGN)
				error = ERESTART;
			/* Failure: return the error number, set TSTATE_IC. */
			rp->r_o0 = error;
			rp->r_tstate |= TSTATE_IC;
		}
		/*
		 * The default action is to redo the trap instruction.
		 * We increment the pc and npc past it for NORMALRETURN.
		 * JUSTRETURN has set up a new pc and npc already.
		 * If we are a cloned thread of forkall(), don't
		 * adjust here because we have already inherited
		 * the adjusted values from our clone.
		 */
		if (!(t->t_flag & T_FORKALL)) {
			rp->r_pc = rp->r_npc;
			rp->r_npc += 4;
		}
	}

	/*
	 * From the proc(4) manual page:
	 * When exit from a system call is being traced, the traced process
	 * stops on completion of the system call just prior to checking for
	 * signals and returning to user level.  At this point all return
	 * values have been stored into the traced process's saved registers.
	 */
	if (proc_stop) {
		mutex_enter(&p->p_lock);
		/* recheck the stop condition now that p_lock is held */
		if (PTOU(p)->u_systrap &&
		    prismember(&PTOU(p)->u_exitmask, code))
			stop(PR_SYSEXIT, code);
		mutex_exit(&p->p_lock);
	}

	/*
	 * If we are the parent returning from a successful
	 * vfork, wait for the child to exec or exit.
	 * This code must be here and not in the bowels of the system
	 * so that /proc can intercept exit from vfork in a timely way.
	 */
	if (t->t_flag & T_VFPARENT) {
		ASSERT(code == SYS_vfork || code == SYS_forksys);
		ASSERT(rp->r_o1 == 0 && error == 0);
		vfwait((pid_t)rval1);
		t->t_flag &= ~T_VFPARENT;
	}

	/*
	 * If profiling is active, bill the current PC in user-land
	 * and keep reposting until profiling is disabled.
	 */
	if (p->p_prof.pr_scale) {
		if (lwp->lwp_oweupc)
			profil_tick(rp->r_pc);
		repost = 1;
	}

sig_check:
	/*
	 * Reset flag for next time.
	 * We must do this after stopping on PR_SYSEXIT
	 * because /proc uses the information in lwp_eosys.
	 */
	lwp->lwp_eosys = NORMALRETURN;
	clear_stale_fd();
	t->t_flag &= ~T_FORKALL;

	/* bitwise OR: enter if either flag is non-zero */
	if (t->t_astflag | t->t_sig_check) {
		/*
		 * Turn off the AST flag before checking all the conditions that
		 * may have caused an AST.  This flag is on whenever a signal or
		 * unusual condition should be handled after the next trap or
		 * syscall.
		 */
		astoff(t);
		t->t_sig_check = 0;

		/*
		 * The following check is legal for the following reasons:
		 *	1) The thread we are checking, is ourselves, so there is
		 *	   no way the proc can go away.
		 *	2) The only time we need to be protected by the
		 *	   lock is if the binding is changed.
		 *
		 *	Note we will still take the lock and check the binding
		 *	if the condition was true without the lock held.  This
		 *	prevents lock contention among threads owned by the
		 *	same proc.
		 */

		if (curthread->t_proc_flag & TP_CHANGEBIND) {
			mutex_enter(&p->p_lock);
			if (curthread->t_proc_flag & TP_CHANGEBIND) {
				timer_lwpbind();
				curthread->t_proc_flag &= ~TP_CHANGEBIND;
			}
			mutex_exit(&p->p_lock);
		}

		/*
		 * for kaio requests on the special kaio poll queue,
		 * copyout their results to user memory.
		 */
		if (p->p_aio)
			aio_cleanup(0);

		/*
		 * If this LWP was asked to hold, call holdlwp(), which will
		 * stop.  holdlwps() sets this up and calls pokelwps() which
		 * sets the AST flag.
		 *
		 * Also check TP_EXITLWP, since this is used by fresh new LWPs
		 * through lwp_rtt().  That flag is set if the lwp_create(2)
		 * syscall failed after creating the LWP.
		 */
		if (ISHOLD(p) || (t->t_proc_flag & TP_EXITLWP))
			holdlwp();

		/*
		 * All code that sets signals and makes ISSIG_PENDING
		 * evaluate true must set t_sig_check afterwards.
		 */
		if (ISSIG_PENDING(t, lwp, p)) {
			if (issig(FORREAL))
				psig();
			t->t_sig_check = 1;	/* recheck next time */
		}

		if (sigprof) {
			/* out-of-range syscall numbers report zero args */
			int nargs = (code > 0 && code < NSYSCALL)?
			    LWP_GETSYSENT(lwp)[code].sy_narg : 0;
			realsigprof(code, nargs, error);
			t->t_sig_check = 1;	/* recheck next time */
		}

		/*
		 * If a performance counter overflow interrupt was
		 * delivered *during* the syscall, then re-enable the
		 * AST so that we take a trip through trap() to cause
		 * the SIGEMT to be delivered.
		 */
		if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW)
			aston(t);

		/*
		 * If an asynchronous hardware error is pending, turn AST flag
		 * back on.  AST will be checked again before we return to user
		 * mode and we'll come back through trap() to handle the error.
		 */
		if (lwp->lwp_pcb.pcb_flags & ASYNC_HWERR)
			aston(t);
	}

	/*
	 * Restore register window if a debugger modified it.
	 * Set up to perform a single-step if a debugger requested it.
	 */
	if (lwp->lwp_pcb.pcb_xregstat != XREGNONE)
		xregrestore(lwp, 1);

	lwp->lwp_errno = 0;		/* clear error for next time */

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active) {
		TNF_PROBE_3(syscall_end, "syscall thread", /* CSTYLED */,
		    tnf_long,	rval1,		rval1,
		    tnf_long,	rval2,		rval2,
		    tnf_long,	errno,		(long)error);
		repost = 1;
	}
#endif /* NPROBE */

	/*
	 * Set state to LWP_USER here so preempt won't give us a kernel
	 * priority if it occurs after this point.  Call CL_TRAPRET() to
	 * restore the user-level priority.
	 *
	 * It is important that no locks (other than spinlocks) be entered
	 * after this point before returning to user mode (unless lwp_state
	 * is set back to LWP_SYS).
	 *
	 * Sampled times past this point are charged to the user.
	 */
	lwp->lwp_state = LWP_USER;

	if (t->t_trapret) {
		t->t_trapret = 0;
		thread_lock(t);
		CL_TRAPRET(t);
		thread_unlock(t);
	}
	if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ)
		preempt();
	prunstop();

	/*
	 * t_post_sys will be set if pcb_step is active.
	 */
	if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
		prdostep();
		repost = 1;
	}

	t->t_sysnum = 0;	/* no longer in a system call */

	/*
	 * In case the args were copied to the lwp, reset the
	 * pointer so the next syscall will have the right lwp_ap pointer.
	 */
	lwp->lwp_ap = (long *)&rp->r_o0;
	lwp->lwp_argsaved = 0;

	/*
	 * If there was a continuing reason for post-syscall processing,
	 * set the t_post_sys flag for the next system call.
	 */
	if (repost)
		t->t_post_sys = 1;

	/*
	 * If there is a ustack registered for this lwp, and the stack rlimit
	 * has been altered, read in the ustack. If the saved stack rlimit
	 * matches the bounds of the ustack, update the ustack to reflect
	 * the new rlimit. If the new stack rlimit is RLIM_INFINITY, disable
	 * stack checking by setting the size to 0.
	 */
	if (lwp->lwp_ustack != 0 && lwp->lwp_old_stk_ctl != 0) {
		rlim64_t new_size;
		model_t model;
		caddr_t top;
		struct rlimit64 rl;

		/* snapshot the process stack parameters under p_lock */
		mutex_enter(&p->p_lock);
		new_size = p->p_stk_ctl;
		model = p->p_model;
		top = p->p_usrstack;
		(void) rctl_rlimit_get(rctlproc_legacy[RLIMIT_STACK], p, &rl);
		mutex_exit(&p->p_lock);

		if (rl.rlim_cur == RLIM64_INFINITY)
			new_size = 0;

		if (model == DATAMODEL_NATIVE) {
			stack_t stk;

			/*
			 * Only rewrite a ustack that still describes the
			 * old limit (or has size 0) and ends at the top of
			 * the user stack.
			 */
			if (copyin((stack_t *)lwp->lwp_ustack, &stk,
			    sizeof (stack_t)) == 0 &&
			    (stk.ss_size == lwp->lwp_old_stk_ctl ||
			    stk.ss_size == 0) &&
			    stk.ss_sp == top - stk.ss_size) {
				stk.ss_sp = (void *)((uintptr_t)stk.ss_sp +
				    stk.ss_size - new_size);
				stk.ss_size = new_size;

				(void) copyout(&stk,
				    (stack_t *)lwp->lwp_ustack,
				    sizeof (stack_t));
			}
		} else {
			stack32_t stk32;

			/* same checks and update for the 32-bit layout */
			if (copyin((stack32_t *)lwp->lwp_ustack, &stk32,
			    sizeof (stack32_t)) == 0 &&
			    (stk32.ss_size == lwp->lwp_old_stk_ctl ||
			    stk32.ss_size == 0) &&
			    stk32.ss_sp ==
			    (caddr32_t)(uintptr_t)(top - stk32.ss_size)) {
				stk32.ss_sp += stk32.ss_size - new_size;
				stk32.ss_size = new_size;

				(void) copyout(&stk32,
				    (stack32_t *)lwp->lwp_ustack,
				    sizeof (stack32_t));
			}
		}

		/* the rlimit change has been handled */
		lwp->lwp_old_stk_ctl = 0;
	}

	syscall_mstate(LMS_SYSTEM, LMS_USER);
}
911
912/*
913 * Call a system call which takes a pointer to the user args struct and
914 * a pointer to the return values.  This is a bit slower than the standard
915 * C arg-passing method in some cases.
916 */
917int64_t
918syscall_ap()
919{
920	uint_t	error;
921	struct sysent *callp;
922	rval_t	rval;
923	klwp_t	*lwp = ttolwp(curthread);
924	struct regs *rp = lwptoregs(lwp);
925
926	callp = LWP_GETSYSENT(lwp) + curthread->t_sysnum;
927
928	/*
929	 * If the arguments don't fit in registers %o0 - o5, make sure they
930	 * have been copied to the lwp_arg array.
931	 */
932	if (callp->sy_narg > 6 && save_syscall_args())
933		return ((int64_t)set_errno(EFAULT));
934
935	rval.r_val1 = 0;
936	rval.r_val2 = (int)rp->r_o1;
937	lwp->lwp_error = 0;	/* for old drivers */
938	error = (*(callp->sy_call))(lwp->lwp_ap, &rval);
939	if (error)
940		return ((int64_t)set_errno(error));
941	return (rval.r_vals);
942}
943
944/*
945 * Load system call module.
946 *	Returns with pointer to held read lock for module.
947 */
948static krwlock_t *
949lock_syscall(struct sysent *table, uint_t code)
950{
951	krwlock_t	*module_lock;
952	struct modctl	*modp;
953	int		id;
954	struct sysent   *callp;
955
956	module_lock = table[code].sy_lock;
957	callp = &table[code];
958
959	/*
960	 * Optimization to only call modload if we don't have a loaded
961	 * syscall.
962	 */
963	rw_enter(module_lock, RW_READER);
964	if (LOADED_SYSCALL(callp))
965		return (module_lock);
966	rw_exit(module_lock);
967
968	for (;;) {
969		if ((id = modload("sys", syscallnames[code])) == -1)
970			break;
971
972		/*
973		 * If we loaded successfully at least once, the modctl
974		 * will still be valid, so we try to grab it by filename.
975		 * If this call fails, it's because the mod_filename
976		 * was changed after the call to modload() (mod_hold_by_name()
977		 * is the likely culprit).  We can safely just take
978		 * another lap if this is the case;  the modload() will
979		 * change the mod_filename back to one by which we can
980		 * find the modctl.
981		 */
982		modp = mod_find_by_filename("sys", syscallnames[code]);
983
984		if (modp == NULL)
985			continue;
986
987		mutex_enter(&mod_lock);
988
989		if (!modp->mod_installed) {
990			mutex_exit(&mod_lock);
991			continue;
992		}
993		break;
994	}
995
996	rw_enter(module_lock, RW_READER);
997
998	if (id != -1)
999		mutex_exit(&mod_lock);
1000
1001	return (module_lock);
1002}
1003
1004/*
1005 * Loadable syscall support.
1006 *	If needed, load the module, then reserve it by holding a read
1007 * 	lock for the duration of the call.
1008 *	Later, if the syscall is not unloadable, it could patch the vector.
1009 */
1010/*ARGSUSED*/
1011int64_t
1012loadable_syscall(
1013    long a0, long a1, long a2, long a3,
1014    long a4, long a5, long a6, long a7)
1015{
1016	int64_t		rval;
1017	struct sysent	*callp;
1018	struct sysent	*se = LWP_GETSYSENT(ttolwp(curthread));
1019	krwlock_t	*module_lock;
1020	int		code;
1021
1022	code = curthread->t_sysnum;
1023	callp = se + code;
1024
1025	/*
1026	 * Try to autoload the system call if necessary.
1027	 */
1028	module_lock = lock_syscall(se, code);
1029	THREAD_KPRI_RELEASE();	/* drop priority given by rw_enter */
1030
1031	/*
1032	 * we've locked either the loaded syscall or nosys
1033	 */
1034	if (callp->sy_flags & SE_ARGC) {
1035		int64_t (*sy_call)();
1036
1037		sy_call = (int64_t (*)())callp->sy_call;
1038		rval = (*sy_call)(a0, a1, a2, a3, a4, a5);
1039	} else {
1040		rval = syscall_ap();
1041	}
1042
1043	THREAD_KPRI_REQUEST();	/* regain priority from read lock */
1044	rw_exit(module_lock);
1045	return (rval);
1046}
1047
1048/*
1049 * Handle indirect system calls.
1050 *	This interface should be deprecated.  The library can handle
1051 *	this more efficiently, but keep this implementation for old binaries.
1052 *
1053 * XX64	Needs some work.
1054 */
1055int64_t
1056indir(int code, long a0, long a1, long a2, long a3, long a4)
1057{
1058	klwp_t		*lwp = ttolwp(curthread);
1059	struct sysent	*callp;
1060
1061	if (code <= 0 || code >= NSYSCALL)
1062		return (nosys());
1063
1064	ASSERT(lwp->lwp_ap != NULL);
1065
1066	curthread->t_sysnum = code;
1067	callp = LWP_GETSYSENT(lwp) + code;
1068
1069	/*
1070	 * Handle argument setup, unless already done in pre_syscall().
1071	 */
1072	if (callp->sy_narg > 5) {
1073		if (save_syscall_args()) 	/* move args to LWP array */
1074			return ((int64_t)set_errno(EFAULT));
1075	} else if (!lwp->lwp_argsaved) {
1076		long *ap;
1077
1078		ap = lwp->lwp_ap;		/* args haven't been saved */
1079		lwp->lwp_ap = ap + 1;		/* advance arg pointer */
1080		curthread->t_post_sys = 1;	/* so lwp_ap will be reset */
1081	}
1082	return ((*callp->sy_callc)(a0, a1, a2, a3, a4, lwp->lwp_arg[5]));
1083}
1084
1085/*
1086 * set_errno - set an error return from the current system call.
1087 *	This could be a macro.
1088 *	This returns the value it is passed, so that the caller can
1089 *	use tail-recursion-elimination and do return (set_errno(ERRNO));
1090 */
1091uint_t
1092set_errno(uint_t error)
1093{
1094	ASSERT(error != 0);		/* must not be used to clear errno */
1095
1096	curthread->t_post_sys = 1;	/* have post_syscall do error return */
1097	return (ttolwp(curthread)->lwp_errno = error);
1098}
1099
1100/*
1101 * set_proc_pre_sys - Set pre-syscall processing for entire process.
1102 */
1103void
1104set_proc_pre_sys(proc_t *p)
1105{
1106	kthread_t	*t;
1107	kthread_t	*first;
1108
1109	ASSERT(MUTEX_HELD(&p->p_lock));
1110
1111	t = first = p->p_tlist;
1112	do {
1113		t->t_pre_sys = 1;
1114	} while ((t = t->t_forw) != first);
1115}
1116
1117/*
1118 * set_proc_post_sys - Set post-syscall processing for entire process.
1119 */
1120void
1121set_proc_post_sys(proc_t *p)
1122{
1123	kthread_t	*t;
1124	kthread_t	*first;
1125
1126	ASSERT(MUTEX_HELD(&p->p_lock));
1127
1128	t = first = p->p_tlist;
1129	do {
1130		t->t_post_sys = 1;
1131	} while ((t = t->t_forw) != first);
1132}
1133
1134/*
1135 * set_proc_sys - Set pre- and post-syscall processing for entire process.
1136 */
1137void
1138set_proc_sys(proc_t *p)
1139{
1140	kthread_t	*t;
1141	kthread_t	*first;
1142
1143	ASSERT(MUTEX_HELD(&p->p_lock));
1144
1145	t = first = p->p_tlist;
1146	do {
1147		t->t_pre_sys = 1;
1148		t->t_post_sys = 1;
1149	} while ((t = t->t_forw) != first);
1150}
1151
1152/*
1153 * set_all_proc_sys - set pre- and post-syscall processing flags for all
1154 * user processes.
1155 *
1156 * This is needed when auditing, tracing, or other facilities which affect
1157 * all processes are turned on.
1158 */
1159void
1160set_all_proc_sys()
1161{
1162	kthread_t	*t;
1163	kthread_t	*first;
1164
1165	mutex_enter(&pidlock);
1166	t = first = curthread;
1167	do {
1168		t->t_pre_sys = 1;
1169		t->t_post_sys = 1;
1170	} while ((t = t->t_next) != first);
1171	mutex_exit(&pidlock);
1172}
1173
1174/*
1175 * set_all_zone_usr_proc_sys - set pre- and post-syscall processing flags for
1176 * all user processes running in the zone of the current process
1177 *
1178 * This is needed when auditing is turned on.
1179 */
1180void
1181set_all_zone_usr_proc_sys(zoneid_t zoneid)
1182{
1183	proc_t	    *p;
1184	kthread_t   *t;
1185
1186	mutex_enter(&pidlock);
1187	for (p = practive; p != NULL; p = p->p_next) {
1188		/* skip kernel processes */
1189		if (p->p_exec == NULLVP || p->p_as == &kas ||
1190		    p->p_stat == SIDL || p->p_stat == SZOMB ||
1191		    (p->p_flag & (SSYS | SEXITING | SEXITLWPS)))
1192			continue;
1193		/*
1194		 * Only processes in the given zone (eventually in
1195		 * all zones) are taken into account
1196		 */
1197		if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) {
1198			mutex_enter(&p->p_lock);
1199			if ((t = p->p_tlist) == NULL) {
1200				mutex_exit(&p->p_lock);
1201				continue;
1202			}
1203			/*
1204			 * Set pre- and post-syscall processing flags
1205			 * for all threads of the process
1206			 */
1207			do {
1208				t->t_pre_sys = 1;
1209				t->t_post_sys = 1;
1210			} while (p->p_tlist != (t = t->t_forw));
1211			mutex_exit(&p->p_lock);
1212		}
1213	}
1214	mutex_exit(&pidlock);
1215}
1216
1217/*
1218 * set_proc_ast - Set asynchronous service trap (AST) flag for all
1219 * threads in process.
1220 */
1221void
1222set_proc_ast(proc_t *p)
1223{
1224	kthread_t	*t;
1225	kthread_t	*first;
1226
1227	ASSERT(MUTEX_HELD(&p->p_lock));
1228
1229	t = first = p->p_tlist;
1230	do {
1231		aston(t);
1232	} while ((t = t->t_forw) != first);
1233}
1234