1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2019 Joyent, Inc.
25 */
26
27/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28/*	  All Rights Reserved	*/
29
30#include <sys/types.h>
31#include <sys/param.h>
32#include <sys/sysmacros.h>
33#include <sys/proc.h>
34#include <sys/kmem.h>
35#include <sys/tuneable.h>
36#include <sys/var.h>
37#include <sys/cred.h>
38#include <sys/systm.h>
39#include <sys/prsystm.h>
40#include <sys/vnode.h>
41#include <sys/session.h>
42#include <sys/cpuvar.h>
43#include <sys/cmn_err.h>
44#include <sys/bitmap.h>
45#include <sys/debug.h>
46#include <c2/audit.h>
47#include <sys/project.h>
48#include <sys/task.h>
49#include <sys/zone.h>
50
/*
 * Directory entries for /proc.  Each procdir slot is either in use,
 * pointing at its process (pe_proc), or free, linked onto the
 * procentfree list via pe_next.
 */
union procent {
	proc_t *pe_proc;	/* in use: the process occupying this slot */
	union procent *pe_next;	/* free: next entry on the free list */
};
56
/*
 * Statically allocated pid structure for pid 0 (the scheduler).  It is
 * hashed into the pid table by pid_init() and must never be freed;
 * pid_rele() asserts that it is not passed &pid0.
 */
struct pid pid0 = {
	0,		/* pid_prinactive */
	1,		/* pid_pgorphaned */
	0,		/* pid_padding	*/
	0,		/* pid_prslot	*/
	0,		/* pid_id	*/
	NULL,		/* pid_pglink	*/
	NULL,		/* pid_pgtail	*/
	NULL,		/* pid_link	*/
	3		/* pid_ref	*/
};
68
static int pid_hashlen = 4;	/* desired average hash chain length */
static int pid_hashsz;		/* number of buckets in the hash table */

/* pid_hashsz is a power of two, so masking with (pid_hashsz - 1) selects a bucket */
#define	HASHPID(pid)	(pidhash[((pid)&(pid_hashsz-1))])

extern uint_t nproc;
extern struct kmem_cache *process_cache;
static void	upcount_init(void);

kmutex_t	pidlock;	/* global process lock */
kmutex_t	pr_pidlock;	/* /proc global process lock */
kcondvar_t	*pr_pid_cv;	/* for /proc, one per process slot */
struct plock	*proc_lock;	/* persistent array of p_lock's */

/*
 * See the comment above pid_getlockslot() for a detailed explanation of this
 * constant.  Note that a PLOCK_SHIFT of 3 implies 64-byte coherence
 * granularity; if the coherence granularity is ever changed, this constant
 * should be modified to reflect the change to minimize proc_lock false
 * sharing (correctness, however, is guaranteed regardless of the coherence
 * granularity).
 */
#define	PLOCK_SHIFT	3

static kmutex_t	pidlinklock;		/* protects pidhash and procentfree */
static struct pid **pidhash;		/* pid hash table (see HASHPID) */
static pid_t minpid;			/* floor for dynamic pid allocation */
static pid_t mpid = FAMOUS_PIDS;	/* one more than the last famous pid */
static union procent *procdir;		/* /proc directory entry array */
static union procent *procentfree;	/* head of the free procdir list */
99
100static struct pid *
101pid_lookup(pid_t pid)
102{
103	struct pid *pidp;
104
105	ASSERT(MUTEX_HELD(&pidlinklock));
106
107	for (pidp = HASHPID(pid); pidp; pidp = pidp->pid_link) {
108		if (pidp->pid_id == pid) {
109			ASSERT(pidp->pid_ref > 0);
110			break;
111		}
112	}
113	return (pidp);
114}
115
116void
117pid_setmin(void)
118{
119	if (jump_pid && jump_pid > mpid)
120		minpid = mpid = jump_pid;
121	else
122		minpid = mpid;
123}
124
125/*
126 * When prslots are simply used as an index to determine a process' p_lock,
127 * adjacent prslots share adjacent p_locks.  On machines where the size
128 * of a mutex is smaller than that of a cache line (which, as of this writing,
129 * is true for all machines on which Solaris runs), this can potentially
130 * induce false sharing.  The standard solution for false sharing is to pad
131 * out one's data structures (in this case, struct plock).  However,
132 * given the size and (generally) sparse use of the proc_lock array, this
133 * is suboptimal.  We therefore stride through the proc_lock array with
134 * a stride of PLOCK_SHIFT.  PLOCK_SHIFT should be defined as:
135 *
136 *   log_2 (coherence_granularity / sizeof (kmutex_t))
137 *
138 * Under this scheme, false sharing is still possible -- but only when
139 * the number of active processes is very large.  Note that the one-to-one
140 * mapping between prslots and lockslots is maintained.
141 */
142static int
143pid_getlockslot(int prslot)
144{
145	int even = (v.v_proc >> PLOCK_SHIFT) << PLOCK_SHIFT;
146	int perlap = even >> PLOCK_SHIFT;
147
148	if (prslot >= even)
149		return (prslot);
150
151	return (((prslot % perlap) << PLOCK_SHIFT) + (prslot / perlap));
152}
153
154/*
155 * This function allocates a pid structure, a free pid, and optionally a
156 * slot in the proc table for it.
157 *
158 * pid_allocate() returns the new pid on success, -1 on failure.
159 */
160pid_t
161pid_allocate(proc_t *prp, pid_t pid, int flags)
162{
163	struct pid *pidp;
164	union procent *pep;
165	pid_t newpid, startpid;
166
167	pidp = kmem_zalloc(sizeof (struct pid), KM_SLEEP);
168
169	mutex_enter(&pidlinklock);
170	pep = procentfree;
171	if ((flags & PID_ALLOC_PROC) && pep == NULL) {
172		/*
173		 * ran out of /proc directory entries
174		 */
175		goto failed;
176	}
177
178	if (pid != 0) {
179		VERIFY(minpid == 0);
180		VERIFY3P(pid, <, mpid);
181		VERIFY3P(pid_lookup(pid), ==, NULL);
182		newpid = pid;
183	} else {
184		/*
185		 * Allocate a pid
186		 */
187		ASSERT(minpid <= mpid && mpid < maxpid);
188
189		startpid = mpid;
190		for (;;) {
191			newpid = mpid;
192			if (++mpid == maxpid)
193				mpid = minpid;
194
195			if (pid_lookup(newpid) == NULL)
196				break;
197
198			if (mpid == startpid)
199				goto failed;
200		}
201	}
202
203	/*
204	 * Put pid into the pid hash table.
205	 */
206	pidp->pid_link = HASHPID(newpid);
207	HASHPID(newpid) = pidp;
208	pidp->pid_ref = 1;
209	pidp->pid_id = newpid;
210
211	if (flags & PID_ALLOC_PROC) {
212		procentfree = pep->pe_next;
213		pidp->pid_prslot = pep - procdir;
214		pep->pe_proc = prp;
215		prp->p_pidp = pidp;
216		prp->p_lockp = &proc_lock[pid_getlockslot(pidp->pid_prslot)];
217	} else {
218		pidp->pid_prslot = 0;
219	}
220
221	mutex_exit(&pidlinklock);
222
223	return (newpid);
224
225failed:
226	mutex_exit(&pidlinklock);
227	kmem_free(pidp, sizeof (struct pid));
228	return (-1);
229}
230
231/*
232 * decrement the reference count for pid
233 */
234int
235pid_rele(struct pid *pidp)
236{
237	struct pid **pidpp;
238
239	mutex_enter(&pidlinklock);
240	ASSERT(pidp != &pid0);
241
242	pidpp = &HASHPID(pidp->pid_id);
243	for (;;) {
244		ASSERT(*pidpp != NULL);
245		if (*pidpp == pidp)
246			break;
247		pidpp = &(*pidpp)->pid_link;
248	}
249
250	*pidpp = pidp->pid_link;
251	mutex_exit(&pidlinklock);
252
253	kmem_free(pidp, sizeof (*pidp));
254	return (0);
255}
256
257void
258proc_entry_free(struct pid *pidp)
259{
260	mutex_enter(&pidlinklock);
261	pidp->pid_prinactive = 1;
262	procdir[pidp->pid_prslot].pe_next = procentfree;
263	procentfree = &procdir[pidp->pid_prslot];
264	mutex_exit(&pidlinklock);
265}
266
267/*
268 * The original task needs to be passed in since the process has already been
269 * detached from the task at this point in time.
270 */
271void
272pid_exit(proc_t *prp, struct task *tk)
273{
274	struct pid *pidp;
275	zone_t	*zone = prp->p_zone;
276
277	ASSERT(MUTEX_HELD(&pidlock));
278
279	/*
280	 * Exit process group.  If it is NULL, it's because fork failed
281	 * before calling pgjoin().
282	 */
283	ASSERT(prp->p_pgidp != NULL || prp->p_stat == SIDL);
284	if (prp->p_pgidp != NULL)
285		pgexit(prp);
286
287	sess_rele(prp->p_sessp, B_TRUE);
288
289	pidp = prp->p_pidp;
290
291	proc_entry_free(pidp);
292
293	if (audit_active)
294		audit_pfree(prp);
295
296	if (practive == prp) {
297		practive = prp->p_next;
298	}
299
300	if (prp->p_next) {
301		prp->p_next->p_prev = prp->p_prev;
302	}
303	if (prp->p_prev) {
304		prp->p_prev->p_next = prp->p_next;
305	}
306
307	PID_RELE(pidp);
308
309	mutex_destroy(&prp->p_crlock);
310	kmem_cache_free(process_cache, prp);
311	nproc--;
312
313	/*
314	 * Decrement the process counts of the original task, project and zone.
315	 */
316	mutex_enter(&zone->zone_nlwps_lock);
317	tk->tk_nprocs--;
318	tk->tk_proj->kpj_nprocs--;
319	zone->zone_nprocs--;
320	mutex_exit(&zone->zone_nlwps_lock);
321}
322
323/*
324 * Find a process visible from the specified zone given its process ID.
325 */
326proc_t *
327prfind_zone(pid_t pid, zoneid_t zoneid)
328{
329	struct pid *pidp;
330	proc_t *p;
331
332	ASSERT(MUTEX_HELD(&pidlock));
333
334	mutex_enter(&pidlinklock);
335	pidp = pid_lookup(pid);
336	mutex_exit(&pidlinklock);
337	if (pidp != NULL && pidp->pid_prinactive == 0) {
338		p = procdir[pidp->pid_prslot].pe_proc;
339		if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid)
340			return (p);
341	}
342	return (NULL);
343}
344
345/*
346 * Find a process given its process ID.  This obeys zone restrictions,
347 * so if the caller is in a non-global zone it won't find processes
348 * associated with other zones.  Use prfind_zone(pid, ALL_ZONES) to
349 * bypass this restriction.
350 */
351proc_t *
352prfind(pid_t pid)
353{
354	zoneid_t zoneid;
355
356	if (INGLOBALZONE(curproc))
357		zoneid = ALL_ZONES;
358	else
359		zoneid = getzoneid();
360	return (prfind_zone(pid, zoneid));
361}
362
363proc_t *
364pgfind_zone(pid_t pgid, zoneid_t zoneid)
365{
366	struct pid *pidp;
367
368	ASSERT(MUTEX_HELD(&pidlock));
369
370	mutex_enter(&pidlinklock);
371	pidp = pid_lookup(pgid);
372	mutex_exit(&pidlinklock);
373	if (pidp != NULL) {
374		proc_t *p = pidp->pid_pglink;
375
376		if (zoneid == ALL_ZONES || pgid == 0 || p == NULL ||
377		    p->p_zone->zone_id == zoneid)
378			return (p);
379	}
380	return (NULL);
381}
382
383/*
384 * return the head of the list of processes whose process group ID is 'pgid',
385 * or NULL, if no such process group
386 */
387proc_t *
388pgfind(pid_t pgid)
389{
390	zoneid_t zoneid;
391
392	if (INGLOBALZONE(curproc))
393		zoneid = ALL_ZONES;
394	else
395		zoneid = getzoneid();
396	return (pgfind_zone(pgid, zoneid));
397}
398
399/*
400 * Sets P_PR_LOCK on a non-system process.  Process must be fully created
401 * and not exiting to succeed.
402 *
403 * Returns 0 on success.
404 * Returns 1 if P_PR_LOCK is set.
405 * Returns -1 if proc is in invalid state.
406 */
407int
408sprtrylock_proc(proc_t *p)
409{
410	ASSERT(MUTEX_HELD(&p->p_lock));
411
412	/* skip system and incomplete processes */
413	if (p->p_stat == SIDL || p->p_stat == SZOMB ||
414	    (p->p_flag & (SSYS | SEXITING | SEXITLWPS))) {
415		return (-1);
416	}
417
418	if (p->p_proc_flag & P_PR_LOCK)
419		return (1);
420
421	p->p_proc_flag |= P_PR_LOCK;
422
423	return (0);
424}
425
426/*
427 * Wait for P_PR_LOCK to become clear.  Returns with p_lock dropped,
428 * and the proc pointer no longer valid, as the proc may have exited.
429 */
430void
431sprwaitlock_proc(proc_t *p)
432{
433	kmutex_t *mp;
434
435	ASSERT(MUTEX_HELD(&p->p_lock));
436	ASSERT(p->p_proc_flag & P_PR_LOCK);
437
438	/*
439	 * p_lock is persistent, but p itself is not -- it could
440	 * vanish during cv_wait().  Load p->p_lock now so we can
441	 * drop it after cv_wait() without referencing p.
442	 */
443	mp = &p->p_lock;
444	cv_wait(&pr_pid_cv[p->p_slot], mp);
445	mutex_exit(mp);
446}
447
448/*
449 * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK.
450 * Returns the proc pointer on success, NULL on failure.  sprlock() is
451 * really just a stripped-down version of pr_p_lock() to allow practive
452 * walkers like dofusers() and dumpsys() to synchronize with /proc.
453 */
454proc_t *
455sprlock_zone(pid_t pid, zoneid_t zoneid)
456{
457	proc_t *p;
458	int ret;
459
460	for (;;) {
461		mutex_enter(&pidlock);
462		if ((p = prfind_zone(pid, zoneid)) == NULL) {
463			mutex_exit(&pidlock);
464			return (NULL);
465		}
466		mutex_enter(&p->p_lock);
467		mutex_exit(&pidlock);
468
469		if (panicstr)
470			return (p);
471
472		ret = sprtrylock_proc(p);
473		if (ret == -1) {
474			mutex_exit(&p->p_lock);
475			return (NULL);
476		} else if (ret == 0) {
477			break;
478		}
479		sprwaitlock_proc(p);
480	}
481	return (p);
482}
483
484proc_t *
485sprlock(pid_t pid)
486{
487	zoneid_t zoneid;
488
489	if (INGLOBALZONE(curproc))
490		zoneid = ALL_ZONES;
491	else
492		zoneid = getzoneid();
493	return (sprlock_zone(pid, zoneid));
494}
495
496void
497sprlock_proc(proc_t *p)
498{
499	ASSERT(MUTEX_HELD(&p->p_lock));
500
501	while (p->p_proc_flag & P_PR_LOCK) {
502		cv_wait(&pr_pid_cv[p->p_slot], &p->p_lock);
503	}
504
505	p->p_proc_flag |= P_PR_LOCK;
506}
507
/*
 * Release P_PR_LOCK and p_lock, waking a waiter (see sprlock_proc()
 * and sprwaitlock_proc()).  While panicking, just drop p_lock -- the
 * P_PR_LOCK protocol is bypassed, matching sprlock_zone().
 */
void
sprunlock(proc_t *p)
{
	if (panicstr) {
		mutex_exit(&p->p_lock);
		return;
	}

	ASSERT(p->p_proc_flag & P_PR_LOCK);
	ASSERT(MUTEX_HELD(&p->p_lock));

	cv_signal(&pr_pid_cv[p->p_slot]);
	p->p_proc_flag &= ~P_PR_LOCK;
	mutex_exit(&p->p_lock);
}
523
/*
 * Boot-time initialization of the pid machinery: size and allocate the
 * pid hash table, the /proc directory entries, the per-slot /proc
 * condition variables, and the persistent p_lock array; install
 * proc_sched (pid 0) in slot 0 and build the free list of the
 * remaining slots.
 */
void
pid_init(void)
{
	int i;

	/* Power-of-two size so HASHPID can mask with (pid_hashsz - 1). */
	pid_hashsz = 1 << highbit(v.v_proc / pid_hashlen);

	pidhash = kmem_zalloc(sizeof (struct pid *) * pid_hashsz, KM_SLEEP);
	procdir = kmem_alloc(sizeof (union procent) * v.v_proc, KM_SLEEP);
	pr_pid_cv = kmem_zalloc(sizeof (kcondvar_t) * v.v_proc, KM_SLEEP);
	proc_lock = kmem_zalloc(sizeof (struct plock) * v.v_proc, KM_SLEEP);

	/* Slot 0 is permanently occupied by the scheduler. */
	nproc = 1;
	practive = proc_sched;
	proc_sched->p_next = NULL;
	procdir[0].pe_proc = proc_sched;

	/* Chain slots 1 .. v_proc-1 onto the free list. */
	procentfree = &procdir[1];
	for (i = 1; i < v.v_proc - 1; i++)
		procdir[i].pe_next = &procdir[i+1];
	procdir[i].pe_next = NULL;

	HASHPID(0) = &pid0;

	upcount_init();
}
550
551proc_t *
552pid_entry(int slot)
553{
554	union procent *pep;
555	proc_t *prp;
556
557	ASSERT(MUTEX_HELD(&pidlock));
558	ASSERT(slot >= 0 && slot < v.v_proc);
559
560	pep = procdir[slot].pe_next;
561	if (pep >= procdir && pep < &procdir[v.v_proc])
562		return (NULL);
563	prp = procdir[slot].pe_proc;
564	if (prp != 0 && prp->p_stat == SIDL)
565		return (NULL);
566	return (prp);
567}
568
569/*
570 * Send the specified signal to all processes whose process group ID is
571 * equal to 'pgid'
572 */
573
574void
575signal(pid_t pgid, int sig)
576{
577	struct pid *pidp;
578	proc_t *prp;
579
580	mutex_enter(&pidlock);
581	mutex_enter(&pidlinklock);
582	if (pgid == 0 || (pidp = pid_lookup(pgid)) == NULL) {
583		mutex_exit(&pidlinklock);
584		mutex_exit(&pidlock);
585		return;
586	}
587	mutex_exit(&pidlinklock);
588	for (prp = pidp->pid_pglink; prp; prp = prp->p_pglink) {
589		mutex_enter(&prp->p_lock);
590		sigtoproc(prp, NULL, sig);
591		mutex_exit(&prp->p_lock);
592	}
593	mutex_exit(&pidlock);
594}
595
596/*
597 * Send the specified signal to the specified process
598 */
599
600void
601prsignal(struct pid *pidp, int sig)
602{
603	if (!(pidp->pid_prinactive))
604		psignal(procdir[pidp->pid_prslot].pe_proc, sig);
605}
606
607#include <sys/sunddi.h>
608
609/*
610 * DDI/DKI interfaces for drivers to send signals to processes
611 */
612
613/*
614 * obtain an opaque reference to a process for signaling
615 */
616void *
617proc_ref(void)
618{
619	struct pid *pidp;
620
621	mutex_enter(&pidlock);
622	pidp = curproc->p_pidp;
623	PID_HOLD(pidp);
624	mutex_exit(&pidlock);
625
626	return (pidp);
627}
628
629/*
630 * release a reference to a process
631 * - a process can exit even if a driver has a reference to it
632 * - one proc_unref for every proc_ref
633 */
634void
635proc_unref(void *pref)
636{
637	mutex_enter(&pidlock);
638	PID_RELE((struct pid *)pref);
639	mutex_exit(&pidlock);
640}
641
642/*
643 * send a signal to a process
644 *
645 * - send the process the signal
646 * - if the process went away, return a -1
647 * - if the process is still there return 0
648 */
649int
650proc_signal(void *pref, int sig)
651{
652	struct pid *pidp = pref;
653
654	prsignal(pidp, sig);
655	return (pidp->pid_prinactive ? -1 : 0);
656}
657
658
static struct upcount	**upc_hash;	/* a boot time allocated array */
static ulong_t		upc_hashmask;	/* bucket mask (table size - 1) */
#define	UPC_HASH(x, y)	((ulong_t)(x ^ y) & upc_hashmask)
662
663/*
664 * Get us off the ground.  Called once at boot.
665 */
666void
667upcount_init(void)
668{
669	ulong_t	upc_hashsize;
670
671	/*
672	 * An entry per MB of memory is our current guess
673	 */
674	/*
675	 * 2^20 is a meg, so shifting right by 20 - PAGESHIFT
676	 * converts pages to megs (without overflowing a u_int
677	 * if you have more than 4G of memory, like ptob(physmem)/1M
678	 * would).
679	 */
680	upc_hashsize = (1 << highbit(physmem >> (20 - PAGESHIFT)));
681	upc_hashmask = upc_hashsize - 1;
682	upc_hash = kmem_zalloc(upc_hashsize * sizeof (struct upcount *),
683	    KM_SLEEP);
684}
685
686/*
687 * Increment the number of processes associated with a given uid and zoneid.
688 */
689void
690upcount_inc(uid_t uid, zoneid_t zoneid)
691{
692	struct upcount	**upc, **hupc;
693	struct upcount	*new;
694
695	ASSERT(MUTEX_HELD(&pidlock));
696	new = NULL;
697	hupc = &upc_hash[UPC_HASH(uid, zoneid)];
698top:
699	upc = hupc;
700	while ((*upc) != NULL) {
701		if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
702			(*upc)->up_count++;
703			if (new) {
704				/*
705				 * did not need `new' afterall.
706				 */
707				kmem_free(new, sizeof (*new));
708			}
709			return;
710		}
711		upc = &(*upc)->up_next;
712	}
713
714	/*
715	 * There is no entry for this <uid,zoneid> pair.
716	 * Allocate one.  If we have to drop pidlock, check
717	 * again.
718	 */
719	if (new == NULL) {
720		new = (struct upcount *)kmem_alloc(sizeof (*new), KM_NOSLEEP);
721		if (new == NULL) {
722			mutex_exit(&pidlock);
723			new = (struct upcount *)kmem_alloc(sizeof (*new),
724			    KM_SLEEP);
725			mutex_enter(&pidlock);
726			goto top;
727		}
728	}
729
730
731	/*
732	 * On the assumption that a new user is going to do some
733	 * more forks, put the new upcount structure on the front.
734	 */
735	upc = hupc;
736
737	new->up_uid = uid;
738	new->up_zoneid = zoneid;
739	new->up_count = 1;
740	new->up_next = *upc;
741
742	*upc = new;
743}
744
745/*
746 * Decrement the number of processes a given uid and zoneid has.
747 */
748void
749upcount_dec(uid_t uid, zoneid_t zoneid)
750{
751	struct	upcount **upc;
752	struct	upcount *done;
753
754	ASSERT(MUTEX_HELD(&pidlock));
755
756	upc = &upc_hash[UPC_HASH(uid, zoneid)];
757	while ((*upc) != NULL) {
758		if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
759			(*upc)->up_count--;
760			if ((*upc)->up_count == 0) {
761				done = *upc;
762				*upc = (*upc)->up_next;
763				kmem_free(done, sizeof (*done));
764			}
765			return;
766		}
767		upc = &(*upc)->up_next;
768	}
769	cmn_err(CE_PANIC, "decr_upcount-off the end");
770}
771
772/*
773 * Returns the number of processes a uid has.
774 * Non-existent uid's are assumed to have no processes.
775 */
776int
777upcount_get(uid_t uid, zoneid_t zoneid)
778{
779	struct	upcount *upc;
780
781	ASSERT(MUTEX_HELD(&pidlock));
782
783	upc = upc_hash[UPC_HASH(uid, zoneid)];
784	while (upc != NULL) {
785		if (upc->up_uid == uid && upc->up_zoneid == zoneid) {
786			return (upc->up_count);
787		}
788		upc = upc->up_next;
789	}
790	return (0);
791}
792