1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2019 Joyent, Inc.
25 */
26
27#include <sys/param.h>
28#include <sys/vmparam.h>
29#include <sys/types.h>
30#include <sys/sysmacros.h>
31#include <sys/systm.h>
32#include <sys/cmn_err.h>
33#include <sys/signal.h>
34#include <sys/stack.h>
35#include <sys/cred.h>
36#include <sys/user.h>
37#include <sys/debug.h>
38#include <sys/errno.h>
39#include <sys/proc.h>
40#include <sys/var.h>
41#include <sys/inline.h>
42#include <sys/syscall.h>
43#include <sys/ucontext.h>
44#include <sys/cpuvar.h>
45#include <sys/siginfo.h>
46#include <sys/trap.h>
47#include <sys/machtrap.h>
48#include <sys/sysinfo.h>
49#include <sys/procfs.h>
50#include <sys/prsystm.h>
51#include <sys/fpu/fpusystm.h>
52#include <sys/modctl.h>
53#include <sys/aio_impl.h>
54#include <c2/audit.h>
55#include <sys/tnf.h>
56#include <sys/tnf_probe.h>
57#include <sys/machpcb.h>
58#include <sys/privregs.h>
59#include <sys/copyops.h>
60#include <sys/timer.h>
61#include <sys/priv.h>
62#include <sys/msacct.h>
63
/*
 * Non-zero enables the per-syscall trace printfs in pre_syscall() and
 * post_syscall().  Those printfs are only compiled in under SYSCALLTRACE.
 */
int syscalltrace = 0;
#ifdef SYSCALLTRACE
/* Held around each group of trace printfs. */
static kmutex_t	systrace_lock;		/* syscall tracing lock */
#endif /* SYSCALLTRACE */

/* Forward declaration; the definition appears later in this file. */
static krwlock_t *lock_syscall(struct sysent *, uint_t);
70
#ifdef _SYSCALL32_IMPL
/*
 * Pick the system call table matching the lwp's data model: the native
 * table for a native lwp, the 32-bit table otherwise.
 */
static struct sysent *
lwp_getsysent(klwp_t *lwp)
{
	return ((lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) ?
	    sysent : sysent32);
}
#define	LWP_GETSYSENT(lwp)	(lwp_getsysent(lwp))
#else
/* Without _SYSCALL32_IMPL the native table is always used. */
#define	LWP_GETSYSENT(lwp)	(sysent)
#endif
83
84/*
85 * Called to restore the lwp's register window just before
86 * returning to user level (only if the registers have been
87 * fetched or modified through /proc).
88 */
89/*ARGSUSED1*/
90void
91xregrestore(klwp_t *lwp, int shared)
92{
93	/*
94	 * If locals+ins were modified by /proc copy them out.
95	 * Also copy to the shared window, if necessary.
96	 */
97	if (lwp->lwp_pcb.pcb_xregstat == XREGMODIFIED) {
98		struct machpcb *mpcb = lwptompcb(lwp);
99		caddr_t sp = (caddr_t)lwptoregs(lwp)->r_sp;
100
101		size_t rwinsize;
102		caddr_t rwp;
103		int is64;
104
105		if (lwp_getdatamodel(lwp) == DATAMODEL_LP64) {
106			rwinsize = sizeof (struct rwindow);
107			rwp = sp + STACK_BIAS;
108			is64 = 1;
109		} else {
110			rwinsize = sizeof (struct rwindow32);
111			sp = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)sp;
112			rwp = sp;
113			is64 = 0;
114		}
115
116		if (is64)
117			(void) copyout_nowatch(&lwp->lwp_pcb.pcb_xregs,
118			    rwp, rwinsize);
119		else {
120			struct rwindow32 rwindow32;
121			int watched;
122
123			watched = watch_disable_addr(rwp, rwinsize, S_WRITE);
124			rwindow_nto32(&lwp->lwp_pcb.pcb_xregs, &rwindow32);
125			(void) copyout(&rwindow32, rwp, rwinsize);
126			if (watched)
127				watch_enable_addr(rwp, rwinsize, S_WRITE);
128		}
129
130		/* also copy to the user return window */
131		mpcb->mpcb_rsp[0] = sp;
132		mpcb->mpcb_rsp[1] = NULL;
133		bcopy(&lwp->lwp_pcb.pcb_xregs, &mpcb->mpcb_rwin[0],
134		    sizeof (lwp->lwp_pcb.pcb_xregs));
135	}
136	lwp->lwp_pcb.pcb_xregstat = XREGNONE;
137}
138
139
140/*
141 * Get the arguments to the current system call.
142 *	lwp->lwp_ap normally points to the out regs in the reg structure.
143 *	If the user is going to change the out registers and might want to
144 *	get the args (for /proc tracing), it must copy the args elsewhere
145 *	via save_syscall_args().
146 */
147uint_t
148get_syscall_args(klwp_t *lwp, long *argp, int *nargsp)
149{
150	kthread_t	*t = lwptot(lwp);
151	uint_t	code = t->t_sysnum;
152	long	mask;
153	long	*ap;
154	int	nargs;
155
156	if (lwptoproc(lwp)->p_model == DATAMODEL_ILP32)
157		mask = (uint32_t)0xffffffffU;
158	else
159		mask = 0xffffffffffffffff;
160
161	if (code != 0 && code < NSYSCALL) {
162
163		nargs = LWP_GETSYSENT(lwp)[code].sy_narg;
164
165		ASSERT(nargs <= MAXSYSARGS);
166
167		*nargsp = nargs;
168		ap = lwp->lwp_ap;
169		while (nargs-- > 0)
170			*argp++ = *ap++ & mask;
171	} else {
172		*nargsp = 0;
173	}
174	return (code);
175}
176
#ifdef _SYSCALL32_IMPL
/*
 * Get the arguments to the current 32-bit system call.
 *	Fetches the arguments with get_syscall_args() and narrows each
 *	one to an int in argp[].  *nargsp receives the argument count.
 *	Returns the system call number.
 */
uint_t
get_syscall32_args(klwp_t *lwp, int *argp, int *nargsp)
{
	long	wide[MAXSYSARGS];
	uint_t	sysnum;
	uint_t	idx = 0;

	sysnum = get_syscall_args(lwp, wide, nargsp);
	while (idx != *nargsp) {
		argp[idx] = (int)wide[idx];
		idx++;
	}
	return (sysnum);
}
#endif
193
194/*
195 *	Save the system call arguments in a safe place.
196 *	lwp->lwp_ap normally points to the out regs in the reg structure.
197 *	If the user is going to change the out registers, g1, or the stack,
198 *	and might want to get the args (for /proc tracing), it must copy
199 *	the args elsewhere via save_syscall_args().
200 *
201 *	This may be called from stop() even when we're not in a system call.
202 *	Since there's no easy way to tell, this must be safe (not panic).
203 *	If the copyins get data faults, return non-zero.
204 */
205int
206save_syscall_args()
207{
208	kthread_t	*t = curthread;
209	klwp_t		*lwp = ttolwp(t);
210	struct regs	*rp = lwptoregs(lwp);
211	uint_t		code = t->t_sysnum;
212	uint_t		nargs;
213	int		i;
214	caddr_t		ua;
215	model_t		datamodel;
216
217	if (lwp->lwp_argsaved || code == 0)
218		return (0);		/* args already saved or not needed */
219
220	if (code >= NSYSCALL) {
221		nargs = 0;		/* illegal syscall */
222	} else {
223		struct sysent *se = LWP_GETSYSENT(lwp);
224		struct sysent *callp = se + code;
225
226		nargs = callp->sy_narg;
227		if (LOADABLE_SYSCALL(callp) && nargs == 0) {
228			krwlock_t	*module_lock;
229
230			/*
231			 * Find out how many arguments the system
232			 * call uses.
233			 *
234			 * We have the property that loaded syscalls
235			 * never change the number of arguments they
236			 * use after they've been loaded once.  This
237			 * allows us to stop for /proc tracing without
238			 * holding the module lock.
239			 * /proc is assured that sy_narg is valid.
240			 */
241			module_lock = lock_syscall(se, code);
242			nargs = callp->sy_narg;
243			rw_exit(module_lock);
244		}
245	}
246
247	/*
248	 * Fetch the system call arguments.
249	 */
250	if (nargs == 0)
251		goto out;
252
253
254	ASSERT(nargs <= MAXSYSARGS);
255
256	if ((datamodel = lwp_getdatamodel(lwp)) == DATAMODEL_ILP32) {
257
258		if (rp->r_g1 == 0) {	/* indirect syscall */
259
260			lwp->lwp_arg[0] = (uint32_t)rp->r_o1;
261			lwp->lwp_arg[1] = (uint32_t)rp->r_o2;
262			lwp->lwp_arg[2] = (uint32_t)rp->r_o3;
263			lwp->lwp_arg[3] = (uint32_t)rp->r_o4;
264			lwp->lwp_arg[4] = (uint32_t)rp->r_o5;
265			if (nargs > 5) {
266				ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
267				    (rp->r_sp + MINFRAME32);
268				for (i = 5; i < nargs; i++) {
269					uint32_t a;
270					if (fuword32(ua, &a) != 0)
271						return (-1);
272					lwp->lwp_arg[i] = a;
273					ua += sizeof (a);
274				}
275			}
276		} else {
277			lwp->lwp_arg[0] = (uint32_t)rp->r_o0;
278			lwp->lwp_arg[1] = (uint32_t)rp->r_o1;
279			lwp->lwp_arg[2] = (uint32_t)rp->r_o2;
280			lwp->lwp_arg[3] = (uint32_t)rp->r_o3;
281			lwp->lwp_arg[4] = (uint32_t)rp->r_o4;
282			lwp->lwp_arg[5] = (uint32_t)rp->r_o5;
283			if (nargs > 6) {
284				ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
285				    (rp->r_sp + MINFRAME32);
286				for (i = 6; i < nargs; i++) {
287					uint32_t a;
288					if (fuword32(ua, &a) != 0)
289						return (-1);
290					lwp->lwp_arg[i] = a;
291					ua += sizeof (a);
292				}
293			}
294		}
295	} else {
296		ASSERT(datamodel == DATAMODEL_LP64);
297		lwp->lwp_arg[0] = rp->r_o0;
298		lwp->lwp_arg[1] = rp->r_o1;
299		lwp->lwp_arg[2] = rp->r_o2;
300		lwp->lwp_arg[3] = rp->r_o3;
301		lwp->lwp_arg[4] = rp->r_o4;
302		lwp->lwp_arg[5] = rp->r_o5;
303		if (nargs > 6) {
304			ua = (caddr_t)rp->r_sp + MINFRAME + STACK_BIAS;
305			for (i = 6; i < nargs; i++) {
306				unsigned long a;
307				if (fulword(ua, &a) != 0)
308					return (-1);
309				lwp->lwp_arg[i] = a;
310				ua += sizeof (a);
311			}
312		}
313	}
314
315out:
316	lwp->lwp_ap = lwp->lwp_arg;
317	lwp->lwp_argsaved = 1;
318	t->t_post_sys = 1;	/* so lwp_ap will be reset */
319	return (0);
320}
321
322void
323reset_syscall_args(void)
324{
325	klwp_t *lwp = ttolwp(curthread);
326
327	lwp->lwp_ap = (long *)&lwptoregs(lwp)->r_o0;
328	lwp->lwp_argsaved = 0;
329}
330
331/*
332 * nonexistent system call-- signal lwp (may want to handle it)
333 * flag error if lwp won't see signal immediately
334 * This works for old or new calling sequence.
335 */
336int64_t
337nosys(void)
338{
339	tsignal(curthread, SIGSYS);
340	return ((int64_t)set_errno(ENOSYS));
341}
342
343int
344nosys32(void)
345{
346	return (nosys());
347}
348
/*
 * Perform pre-system-call processing, including stopping for tracing,
 * auditing, microstate-accounting, etc.
 *
 * This routine is called only if the t_pre_sys flag is set.  Any condition
 * requiring pre-syscall handling must set the t_pre_sys flag.  If the
 * condition is persistent, this routine will repost t_pre_sys.
 *
 * arg0 is consulted only when t_sysnum is zero: a non-zero arg0 is then
 * taken as the system call number of an indirect system call and is
 * stored in t_sysnum.
 *
 * Returns 0 if the system call should proceed.  Returns 1, with EINTR
 * posted through set_errno(), if lwp_sysabort was set, or the non-zero
 * value returned by audit_start().  t_pre_sys is reposted on both of
 * those non-zero return paths.
 */
int
pre_syscall(int arg0)
{
	unsigned int code;
	kthread_t *t = curthread;
	proc_t *p = ttoproc(t);
	klwp_t *lwp = ttolwp(t);
	struct regs *rp = lwptoregs(lwp);
	int	repost;

	t->t_pre_sys = repost = 0;	/* clear pre-syscall processing flag */

	ASSERT(t->t_schedflag & TS_DONT_SWAP);

	/* microstate accounting: user -> system */
	syscall_mstate(LMS_USER, LMS_SYSTEM);

	/*
	 * The syscall arguments in the out registers should be pointed to
	 * by lwp_ap.  If the args need to be copied so that the outs can
	 * be changed without losing the ability to get the args for /proc,
	 * they can be saved by save_syscall_args(), and lwp_ap will be
	 * restored by post_syscall().
	 */
	ASSERT(lwp->lwp_ap == (long *)&rp->r_o0);

	/*
	 * Make sure the thread is holding the latest credentials for the
	 * process.  The credentials in the process right now apply to this
	 * thread for the entire system call.
	 */
	if (t->t_cred != p->p_cred) {
		cred_t *oldcred = t->t_cred;
		/*
		 * DTrace accesses t_cred in probe context.  t_cred must
		 * always be either NULL, or point to a valid, allocated cred
		 * structure.
		 */
		t->t_cred = crgetcred();
		crfree(oldcred);
	}

	/*
	 * Undo special arrangements to single-step the lwp
	 * so that a debugger will see valid register contents.
	 * Also so that the pc is valid for syncfpu().
	 * Also so that a syscall like exec() can be stepped.
	 */
	if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
		(void) prundostep();
		repost = 1;
	}

	/*
	 * Check for indirect system call in case we stop for tracing.
	 * Don't allow multiple indirection.
	 */
	code = t->t_sysnum;
	if (code == 0 && arg0 != 0) {		/* indirect syscall */
		code = arg0;
		t->t_sysnum = arg0;
	}

	/*
	 * From the proc(4) manual page:
	 * When entry to a system call is being traced, the traced process
	 * stops after having begun the call to the system but before the
	 * system call arguments have been fetched from the process.
	 * If proc changes the args we must refetch them after starting.
	 */
	if (PTOU(p)->u_systrap) {
		/* unlocked check first; repeated below under p_lock */
		if (prismember(&PTOU(p)->u_entrymask, code)) {
			/*
			 * Recheck stop condition, now that lock is held.
			 */
			mutex_enter(&p->p_lock);
			if (PTOU(p)->u_systrap &&
			    prismember(&PTOU(p)->u_entrymask, code)) {
				stop(PR_SYSENTRY, code);
				/*
				 * Must refetch args since they were
				 * possibly modified by /proc.  Indicate
				 * that the valid copy is in the
				 * registers.
				 */
				lwp->lwp_argsaved = 0;
				lwp->lwp_ap = (long *)&rp->r_o0;
			}
			mutex_exit(&p->p_lock);
		}
		/* u_systrap was set, so come back here on the next syscall */
		repost = 1;
	}

	if (lwp->lwp_sysabort) {
		/*
		 * lwp_sysabort may have been set via /proc while the process
		 * was stopped on PR_SYSENTRY.  If so, abort the system call.
		 * Override any error from the copyin() of the arguments.
		 */
		lwp->lwp_sysabort = 0;
		(void) set_errno(EINTR); /* sets post-sys processing */
		t->t_pre_sys = 1;	/* repost anyway */
		return (1);		/* don't do system call, return EINTR */
	}

	/* begin auditing for this syscall */
	if (audit_active == C2AUDIT_LOADED) {
		uint32_t auditing = au_zone_getstate(NULL);

		if (auditing & AU_AUDIT_MASK) {
			int error;
			/* a non-zero audit_start() result is returned as is */
			if (error = audit_start(T_SYSCALL, code, auditing, \
			    0, lwp)) {
				t->t_pre_sys = 1;	/* repost anyway */
				lwp->lwp_error = 0;	/* for old drivers */
				return (error);
			}
			repost = 1;
		}
	}

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active) {
		TNF_PROBE_1(syscall_start, "syscall thread", /* CSTYLED */,
			tnf_sysnum,	sysnum,		t->t_sysnum);
		t->t_post_sys = 1;	/* make sure post_syscall runs */
		repost = 1;
	}
#endif /* NPROBE */

#ifdef SYSCALLTRACE
	/*
	 * Debug trace: print the pid, the system call name/number, the
	 * arguments and the command name, serialized by systrace_lock.
	 */
	if (syscalltrace) {
		int i;
		long *ap;
		char *cp;
		char *sysname;
		struct sysent *callp;

		if (code >= NSYSCALL)
			callp = &nosys_ent;	/* nosys has no args */
		else
			callp = LWP_GETSYSENT(lwp) + code;
		(void) save_syscall_args();
		mutex_enter(&systrace_lock);
		printf("%d: ", p->p_pid);
		if (code >= NSYSCALL)
			printf("0x%x", code);
		else {
			sysname = mod_getsysname(code);
			printf("%s[0x%x]", sysname == NULL ? "NULL" :
			    sysname, code);
		}
		cp = "(";
		for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) {
			printf("%s%lx", cp, *ap);
			cp = ", ";
		}
		if (i)
			printf(")");
		printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread);
		mutex_exit(&systrace_lock);
	}
#endif /* SYSCALLTRACE */

	/*
	 * If there was a continuing reason for pre-syscall processing,
	 * set the t_pre_sys flag for the next system call.
	 */
	if (repost)
		t->t_pre_sys = 1;
	lwp->lwp_error = 0;	/* for old drivers */
	lwp->lwp_badpriv = PRIV_NONE;	/* for privilege tracing */
	return (0);
}
531
/*
 * Post-syscall processing.  Perform abnormal system call completion
 * actions such as /proc tracing, profiling, signals, preemption, etc.
 *
 * This routine is called only if t_post_sys, t_sig_check, or t_astflag is set.
 * Any condition requiring post-syscall handling must set one of these.
 * If the condition is persistent, this routine will repost t_post_sys.
 *
 * rval1 and rval2 are the system call's return values.  On a successful
 * NORMALRETURN they are stored in the lwp's saved r_o0 and r_o1; on an
 * error return the error number is stored in r_o0 instead.
 */
void
post_syscall(long rval1, long rval2)
{
	kthread_t	*t = curthread;
	proc_t	*p = curproc;
	klwp_t	*lwp = ttolwp(t);
	struct regs *rp = lwptoregs(lwp);
	uint_t	error;
	int	code = t->t_sysnum;
	int	repost = 0;
	int	proc_stop = 0;		/* non-zero if stopping for /proc */
	int	sigprof = 0;		/* non-zero if sending SIGPROF */

	/* clear the flag; it is reposted at the end if still needed */
	t->t_post_sys = 0;

	/* error recorded for this call (see set_errno()), 0 if none */
	error = lwp->lwp_errno;

	/*
	 * Code can be zero if this is a new LWP returning after a forkall(),
	 * other than the one which matches the one in the parent which called
	 * forkall().  In these LWPs, skip most of post-syscall activity.
	 */
	if (code == 0)
		goto sig_check;

	/* put out audit record for this syscall */
	if (AU_AUDITING()) {
		rval_t	rval;	/* fix audit_finish() someday */

		/* XX64 -- truncation of 64-bit return values? */
		rval.r_val1 = (int)rval1;
		rval.r_val2 = (int)rval2;
		audit_finish(T_SYSCALL, code, error, &rval);
		repost = 1;
	}

	/* Print any pending t_pdmsg message, then free and clear it. */
	if (curthread->t_pdmsg != NULL) {
		char *m = curthread->t_pdmsg;

		uprintf("%s", m);
		kmem_free(m, strlen(m) + 1);
		curthread->t_pdmsg = NULL;
	}

	/*
	 * If we're going to stop for /proc tracing, set the flag and
	 * save the arguments so that the return values don't smash them.
	 */
	if (PTOU(p)->u_systrap) {
		if (prismember(&PTOU(p)->u_exitmask, code)) {
			proc_stop = 1;
			(void) save_syscall_args();
		}
		repost = 1;
	}

	/*
	 * Similarly check to see if SIGPROF might be sent.
	 */
	if (curthread->t_rprof != NULL &&
	    curthread->t_rprof->rp_anystate != 0) {
		(void) save_syscall_args();
		sigprof = 1;
	}

	if (lwp->lwp_eosys == NORMALRETURN) {
		if (error == 0) {
#ifdef SYSCALLTRACE
			if (syscalltrace) {
				mutex_enter(&systrace_lock);
				printf(
				    "%d: r_val1=0x%lx, r_val2=0x%lx, id 0x%p\n",
				    p->p_pid, rval1, rval2, curthread);
				mutex_exit(&systrace_lock);
			}
#endif /* SYSCALLTRACE */
			/* success: TSTATE_IC cleared, values in r_o0/r_o1 */
			rp->r_tstate &= ~TSTATE_IC;
			rp->r_o0 = rval1;
			rp->r_o1 = rval2;
		} else {
			int sig;

#ifdef SYSCALLTRACE
			if (syscalltrace) {
				mutex_enter(&systrace_lock);
				printf("%d: error=%d, id 0x%p\n",
				    p->p_pid, error, curthread);
				mutex_exit(&systrace_lock);
			}
#endif /* SYSCALLTRACE */
			if (error == EINTR && t->t_activefd.a_stale)
				error = EBADF;
			/*
			 * EINTR becomes ERESTART only when the current
			 * signal is in u_sigrestart and its disposition is
			 * neither SIG_DFL nor SIG_IGN.
			 */
			if (error == EINTR &&
			    (sig = lwp->lwp_cursig) != 0 &&
			    sigismember(&PTOU(p)->u_sigrestart, sig) &&
			    PTOU(p)->u_signal[sig - 1] != SIG_DFL &&
			    PTOU(p)->u_signal[sig - 1] != SIG_IGN)
				error = ERESTART;
			/* failure: error number in r_o0, TSTATE_IC set */
			rp->r_o0 = error;
			rp->r_tstate |= TSTATE_IC;
		}
		/*
		 * The default action is to redo the trap instruction.
		 * We increment the pc and npc past it for NORMALRETURN.
		 * JUSTRETURN has set up a new pc and npc already.
		 * If we are a cloned thread of forkall(), don't
		 * adjust here because we have already inherited
		 * the adjusted values from our clone.
		 */
		if (!(t->t_flag & T_FORKALL)) {
			rp->r_pc = rp->r_npc;
			rp->r_npc += 4;
		}
	}

	/*
	 * From the proc(4) manual page:
	 * When exit from a system call is being traced, the traced process
	 * stops on completion of the system call just prior to checking for
	 * signals and returning to user level.  At this point all return
	 * values have been stored into the traced process's saved registers.
	 */
	if (proc_stop) {
		/* recheck the stop condition now that p_lock is held */
		mutex_enter(&p->p_lock);
		if (PTOU(p)->u_systrap &&
		    prismember(&PTOU(p)->u_exitmask, code))
			stop(PR_SYSEXIT, code);
		mutex_exit(&p->p_lock);
	}

	/*
	 * If we are the parent returning from a successful
	 * vfork, wait for the child to exec or exit.
	 * This code must be here and not in the bowels of the system
	 * so that /proc can intercept exit from vfork in a timely way.
	 */
	if (t->t_flag & T_VFPARENT) {
		ASSERT(code == SYS_vfork || code == SYS_forksys);
		ASSERT(rp->r_o1 == 0 && error == 0);
		vfwait((pid_t)rval1);
		t->t_flag &= ~T_VFPARENT;
	}

	/*
	 * If profiling is active, bill the current PC in user-land
	 * and keep reposting until profiling is disabled.
	 */
	if (p->p_prof.pr_scale) {
		if (lwp->lwp_oweupc)
			profil_tick(rp->r_pc);
		repost = 1;
	}

sig_check:
	/*
	 * Reset flag for next time.
	 * We must do this after stopping on PR_SYSEXIT
	 * because /proc uses the information in lwp_eosys.
	 */
	lwp->lwp_eosys = NORMALRETURN;
	clear_stale_fd();
	t->t_flag &= ~T_FORKALL;

	if (t->t_astflag | t->t_sig_check) {
		/*
		 * Turn off the AST flag before checking all the conditions that
		 * may have caused an AST.  This flag is on whenever a signal or
		 * unusual condition should be handled after the next trap or
		 * syscall.
		 */
		astoff(t);
		t->t_sig_check = 0;

		/*
		 * The following check is legal for the following reasons:
		 *	1) The thread we are checking, is ourselves, so there is
		 *	   no way the proc can go away.
		 *	2) The only time we need to be protected by the
		 *	   lock is if the binding is changed.
		 *
		 *	Note we will still take the lock and check the binding
		 *	if the condition was true without the lock held.  This
		 *	prevents lock contention among threads owned by the
		 *	same proc.
		 */

		if (curthread->t_proc_flag & TP_CHANGEBIND) {
			mutex_enter(&p->p_lock);
			if (curthread->t_proc_flag & TP_CHANGEBIND) {
				timer_lwpbind();
				curthread->t_proc_flag &= ~TP_CHANGEBIND;
			}
			mutex_exit(&p->p_lock);
		}

		/*
		 * for kaio requests on the special kaio poll queue,
		 * copyout their results to user memory.
		 */
		if (p->p_aio)
			aio_cleanup(0);

		/*
		 * If this LWP was asked to hold, call holdlwp(), which will
		 * stop.  holdlwps() sets this up and calls pokelwps() which
		 * sets the AST flag.
		 *
		 * Also check TP_EXITLWP, since this is used by fresh new LWPs
		 * through lwp_rtt().  That flag is set if the lwp_create(2)
		 * syscall failed after creating the LWP.
		 */
		if (ISHOLD(p) || (t->t_proc_flag & TP_EXITLWP))
			holdlwp();

		/*
		 * All code that sets signals and makes ISSIG_PENDING
		 * evaluate true must set t_sig_check afterwards.
		 */
		if (ISSIG_PENDING(t, lwp, p)) {
			if (issig(FORREAL))
				psig();
			t->t_sig_check = 1;	/* recheck next time */
		}

		if (sigprof) {
			/* out-of-range codes are reported with no arguments */
			int nargs = (code > 0 && code < NSYSCALL)?
			    LWP_GETSYSENT(lwp)[code].sy_narg : 0;
			realsigprof(code, nargs, error);
			t->t_sig_check = 1;	/* recheck next time */
		}

		/*
		 * If a performance counter overflow interrupt was
		 * delivered *during* the syscall, then re-enable the
		 * AST so that we take a trip through trap() to cause
		 * the SIGEMT to be delivered.
		 */
		if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW)
			aston(t);

		/*
		 * If an asynchronous hardware error is pending, turn AST flag
		 * back on.  AST will be checked again before we return to user
		 * mode and we'll come back through trap() to handle the error.
		 */
		if (lwp->lwp_pcb.pcb_flags & ASYNC_HWERR)
			aston(t);
	}

	/*
	 * Restore register window if a debugger modified it.
	 * Set up to perform a single-step if a debugger requested it.
	 */
	if (lwp->lwp_pcb.pcb_xregstat != XREGNONE)
		xregrestore(lwp, 1);

	lwp->lwp_errno = 0;		/* clear error for next time */

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active) {
		TNF_PROBE_3(syscall_end, "syscall thread", /* CSTYLED */,
		    tnf_long,	rval1,		rval1,
		    tnf_long,	rval2,		rval2,
		    tnf_long,	errno,		(long)error);
		repost = 1;
	}
#endif /* NPROBE */

	/*
	 * Set state to LWP_USER here so preempt won't give us a kernel
	 * priority if it occurs after this point.  Call CL_TRAPRET() to
	 * restore the user-level priority.
	 *
	 * It is important that no locks (other than spinlocks) be entered
	 * after this point before returning to user mode (unless lwp_state
	 * is set back to LWP_SYS).
	 *
	 * Sampled times past this point are charged to the user.
	 */
	lwp->lwp_state = LWP_USER;

	if (t->t_trapret) {
		t->t_trapret = 0;
		thread_lock(t);
		CL_TRAPRET(t);
		thread_unlock(t);
	}
	if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ)
		preempt();
	prunstop();

	/*
	 * t_post_sys will be set if pcb_step is active.
	 */
	if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
		prdostep();
		repost = 1;
	}

	t->t_sysnum = 0;	/* no longer in a system call */

	/*
	 * In case the args were copied to the lwp, reset the
	 * pointer so the next syscall will have the right lwp_ap pointer.
	 */
	lwp->lwp_ap = (long *)&rp->r_o0;
	lwp->lwp_argsaved = 0;

	/*
	 * If there was a continuing reason for post-syscall processing,
	 * set the t_post_sys flag for the next system call.
	 */
	if (repost)
		t->t_post_sys = 1;

	/*
	 * If there is a ustack registered for this lwp, and the stack rlimit
	 * has been altered, read in the ustack. If the saved stack rlimit
	 * matches the bounds of the ustack, update the ustack to reflect
	 * the new rlimit. If the new stack rlimit is RLIM_INFINITY, disable
	 * stack checking by setting the size to 0.
	 */
	if (lwp->lwp_ustack != 0 && lwp->lwp_old_stk_ctl != 0) {
		rlim64_t new_size;
		model_t model;
		caddr_t top;
		struct rlimit64 rl;

		/* snapshot the process stack parameters under p_lock */
		mutex_enter(&p->p_lock);
		new_size = p->p_stk_ctl;
		model = p->p_model;
		top = p->p_usrstack;
		(void) rctl_rlimit_get(rctlproc_legacy[RLIMIT_STACK], p, &rl);
		mutex_exit(&p->p_lock);

		if (rl.rlim_cur == RLIM64_INFINITY)
			new_size = 0;

		if (model == DATAMODEL_NATIVE) {
			stack_t stk;

			/*
			 * Rewrite the user's stack_t only if it still
			 * describes the old limit (or size 0) and ends at
			 * the top of the user stack.
			 */
			if (copyin((stack_t *)lwp->lwp_ustack, &stk,
			    sizeof (stack_t)) == 0 &&
			    (stk.ss_size == lwp->lwp_old_stk_ctl ||
			    stk.ss_size == 0) &&
			    stk.ss_sp == top - stk.ss_size) {
				stk.ss_sp = (void *)((uintptr_t)stk.ss_sp +
				    stk.ss_size - new_size);
				stk.ss_size = new_size;

				(void) copyout(&stk,
				    (stack_t *)lwp->lwp_ustack,
				    sizeof (stack_t));
			}
		} else {
			stack32_t stk32;

			/* same check and update on the 32-bit layout */
			if (copyin((stack32_t *)lwp->lwp_ustack, &stk32,
			    sizeof (stack32_t)) == 0 &&
			    (stk32.ss_size == lwp->lwp_old_stk_ctl ||
			    stk32.ss_size == 0) &&
			    stk32.ss_sp ==
			    (caddr32_t)(uintptr_t)(top - stk32.ss_size)) {
				stk32.ss_sp += stk32.ss_size - new_size;
				stk32.ss_size = new_size;

				(void) copyout(&stk32,
				    (stack32_t *)lwp->lwp_ustack,
				    sizeof (stack32_t));
			}
		}

		/* the pending rlimit change has now been processed */
		lwp->lwp_old_stk_ctl = 0;
	}

	/* microstate accounting: system -> user */
	syscall_mstate(LMS_SYSTEM, LMS_USER);
}
918
919/*
920 * Call a system call which takes a pointer to the user args struct and
921 * a pointer to the return values.  This is a bit slower than the standard
922 * C arg-passing method in some cases.
923 */
924int64_t
925syscall_ap()
926{
927	uint_t	error;
928	struct sysent *callp;
929	rval_t	rval;
930	klwp_t	*lwp = ttolwp(curthread);
931	struct regs *rp = lwptoregs(lwp);
932
933	callp = LWP_GETSYSENT(lwp) + curthread->t_sysnum;
934
935	/*
936	 * If the arguments don't fit in registers %o0 - o5, make sure they
937	 * have been copied to the lwp_arg array.
938	 */
939	if (callp->sy_narg > 6 && save_syscall_args())
940		return ((int64_t)set_errno(EFAULT));
941
942	rval.r_val1 = 0;
943	rval.r_val2 = (int)rp->r_o1;
944	lwp->lwp_error = 0;	/* for old drivers */
945	error = (*(callp->sy_call))(lwp->lwp_ap, &rval);
946	if (error)
947		return ((int64_t)set_errno(error));
948	return (rval.r_vals);
949}
950
951/*
952 * Load system call module.
953 *	Returns with pointer to held read lock for module.
954 */
955static krwlock_t *
956lock_syscall(struct sysent *table, uint_t code)
957{
958	krwlock_t	*module_lock;
959	struct modctl	*modp;
960	int		id;
961	struct sysent   *callp;
962
963	module_lock = table[code].sy_lock;
964	callp = &table[code];
965
966	/*
967	 * Optimization to only call modload if we don't have a loaded
968	 * syscall.
969	 */
970	rw_enter(module_lock, RW_READER);
971	if (LOADED_SYSCALL(callp))
972		return (module_lock);
973	rw_exit(module_lock);
974
975	for (;;) {
976		if ((id = modload("sys", syscallnames[code])) == -1)
977			break;
978
979		/*
980		 * If we loaded successfully at least once, the modctl
981		 * will still be valid, so we try to grab it by filename.
982		 * If this call fails, it's because the mod_filename
983		 * was changed after the call to modload() (mod_hold_by_name()
984		 * is the likely culprit).  We can safely just take
985		 * another lap if this is the case;  the modload() will
986		 * change the mod_filename back to one by which we can
987		 * find the modctl.
988		 */
989		modp = mod_find_by_filename("sys", syscallnames[code]);
990
991		if (modp == NULL)
992			continue;
993
994		mutex_enter(&mod_lock);
995
996		if (!modp->mod_installed) {
997			mutex_exit(&mod_lock);
998			continue;
999		}
1000		break;
1001	}
1002
1003	rw_enter(module_lock, RW_READER);
1004
1005	if (id != -1)
1006		mutex_exit(&mod_lock);
1007
1008	return (module_lock);
1009}
1010
1011/*
1012 * Loadable syscall support.
1013 *	If needed, load the module, then reserve it by holding a read
1014 *	lock for the duration of the call.
1015 *	Later, if the syscall is not unloadable, it could patch the vector.
1016 */
1017/*ARGSUSED*/
1018int64_t
1019loadable_syscall(
1020    long a0, long a1, long a2, long a3,
1021    long a4, long a5, long a6, long a7)
1022{
1023	int64_t		rval;
1024	struct sysent	*callp;
1025	struct sysent	*se = LWP_GETSYSENT(ttolwp(curthread));
1026	krwlock_t	*module_lock;
1027	int		code;
1028
1029	code = curthread->t_sysnum;
1030	callp = se + code;
1031
1032	/*
1033	 * Try to autoload the system call if necessary.
1034	 */
1035	module_lock = lock_syscall(se, code);
1036
1037	/*
1038	 * we've locked either the loaded syscall or nosys
1039	 */
1040	if (callp->sy_flags & SE_ARGC) {
1041		int64_t (*sy_call)();
1042
1043		sy_call = (int64_t (*)())callp->sy_call;
1044		rval = (*sy_call)(a0, a1, a2, a3, a4, a5);
1045	} else {
1046		rval = syscall_ap();
1047	}
1048
1049	rw_exit(module_lock);
1050	return (rval);
1051}
1052
1053/*
1054 * Handle indirect system calls.
1055 *	This interface should be deprecated.  The library can handle
1056 *	this more efficiently, but keep this implementation for old binaries.
1057 *
1058 * XX64	Needs some work.
1059 */
1060int64_t
1061indir(int code, long a0, long a1, long a2, long a3, long a4)
1062{
1063	klwp_t		*lwp = ttolwp(curthread);
1064	struct sysent	*callp;
1065
1066	if (code <= 0 || code >= NSYSCALL)
1067		return (nosys());
1068
1069	ASSERT(lwp->lwp_ap != NULL);
1070
1071	curthread->t_sysnum = code;
1072	callp = LWP_GETSYSENT(lwp) + code;
1073
1074	/*
1075	 * Handle argument setup, unless already done in pre_syscall().
1076	 */
1077	if (callp->sy_narg > 5) {
1078		if (save_syscall_args())	/* move args to LWP array */
1079			return ((int64_t)set_errno(EFAULT));
1080	} else if (!lwp->lwp_argsaved) {
1081		long *ap;
1082
1083		ap = lwp->lwp_ap;		/* args haven't been saved */
1084		lwp->lwp_ap = ap + 1;		/* advance arg pointer */
1085		curthread->t_post_sys = 1;	/* so lwp_ap will be reset */
1086	}
1087	return ((*callp->sy_callc)(a0, a1, a2, a3, a4, lwp->lwp_arg[5]));
1088}
1089
1090/*
1091 * set_errno - set an error return from the current system call.
1092 *	This could be a macro.
1093 *	This returns the value it is passed, so that the caller can
1094 *	use tail-recursion-elimination and do return (set_errno(ERRNO));
1095 */
1096uint_t
1097set_errno(uint_t error)
1098{
1099	ASSERT(error != 0);		/* must not be used to clear errno */
1100
1101	curthread->t_post_sys = 1;	/* have post_syscall do error return */
1102	return (ttolwp(curthread)->lwp_errno = error);
1103}
1104
1105/*
1106 * set_proc_pre_sys - Set pre-syscall processing for entire process.
1107 */
1108void
1109set_proc_pre_sys(proc_t *p)
1110{
1111	kthread_t	*t;
1112	kthread_t	*first;
1113
1114	ASSERT(MUTEX_HELD(&p->p_lock));
1115
1116	t = first = p->p_tlist;
1117	do {
1118		t->t_pre_sys = 1;
1119	} while ((t = t->t_forw) != first);
1120}
1121
1122/*
1123 * set_proc_post_sys - Set post-syscall processing for entire process.
1124 */
1125void
1126set_proc_post_sys(proc_t *p)
1127{
1128	kthread_t	*t;
1129	kthread_t	*first;
1130
1131	ASSERT(MUTEX_HELD(&p->p_lock));
1132
1133	t = first = p->p_tlist;
1134	do {
1135		t->t_post_sys = 1;
1136	} while ((t = t->t_forw) != first);
1137}
1138
1139/*
1140 * set_proc_sys - Set pre- and post-syscall processing for entire process.
1141 */
1142void
1143set_proc_sys(proc_t *p)
1144{
1145	kthread_t	*t;
1146	kthread_t	*first;
1147
1148	ASSERT(MUTEX_HELD(&p->p_lock));
1149
1150	t = first = p->p_tlist;
1151	do {
1152		t->t_pre_sys = 1;
1153		t->t_post_sys = 1;
1154	} while ((t = t->t_forw) != first);
1155}
1156
1157/*
1158 * set_all_proc_sys - set pre- and post-syscall processing flags for all
1159 * user processes.
1160 *
1161 * This is needed when auditing, tracing, or other facilities which affect
1162 * all processes are turned on.
1163 */
1164void
1165set_all_proc_sys()
1166{
1167	kthread_t	*t;
1168	kthread_t	*first;
1169
1170	mutex_enter(&pidlock);
1171	t = first = curthread;
1172	do {
1173		t->t_pre_sys = 1;
1174		t->t_post_sys = 1;
1175	} while ((t = t->t_next) != first);
1176	mutex_exit(&pidlock);
1177}
1178
1179/*
1180 * set_all_zone_usr_proc_sys - set pre- and post-syscall processing flags for
1181 * all user processes running in the zone of the current process
1182 *
1183 * This is needed when auditing is turned on.
1184 */
1185void
1186set_all_zone_usr_proc_sys(zoneid_t zoneid)
1187{
1188	proc_t	    *p;
1189	kthread_t   *t;
1190
1191	mutex_enter(&pidlock);
1192	for (p = practive; p != NULL; p = p->p_next) {
1193		/* skip kernel processes */
1194		if (p->p_exec == NULLVP || p->p_as == &kas ||
1195		    p->p_stat == SIDL || p->p_stat == SZOMB ||
1196		    (p->p_flag & (SSYS | SEXITING | SEXITLWPS)))
1197			continue;
1198		/*
1199		 * Only processes in the given zone (eventually in
1200		 * all zones) are taken into account
1201		 */
1202		if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) {
1203			mutex_enter(&p->p_lock);
1204			if ((t = p->p_tlist) == NULL) {
1205				mutex_exit(&p->p_lock);
1206				continue;
1207			}
1208			/*
1209			 * Set pre- and post-syscall processing flags
1210			 * for all threads of the process
1211			 */
1212			do {
1213				t->t_pre_sys = 1;
1214				t->t_post_sys = 1;
1215			} while (p->p_tlist != (t = t->t_forw));
1216			mutex_exit(&p->p_lock);
1217		}
1218	}
1219	mutex_exit(&pidlock);
1220}
1221
1222/*
1223 * set_proc_ast - Set asynchronous service trap (AST) flag for all
1224 * threads in process.
1225 */
1226void
1227set_proc_ast(proc_t *p)
1228{
1229	kthread_t	*t;
1230	kthread_t	*first;
1231
1232	ASSERT(MUTEX_HELD(&p->p_lock));
1233
1234	t = first = p->p_tlist;
1235	do {
1236		aston(t);
1237	} while ((t = t->t_forw) != first);
1238}
1239