1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1989, 1991, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
37 */
38
39#include <sys/cdefs.h>
40__FBSDID("$FreeBSD$");
41
42#include "opt_capsicum.h"
43#include "opt_ddb.h"
44#include "opt_ktrace.h"
45
46#include <sys/param.h>
47#include <sys/systm.h>
48
49#include <sys/capsicum.h>
50#include <sys/conf.h>
51#include <sys/fcntl.h>
52#include <sys/file.h>
53#include <sys/filedesc.h>
54#include <sys/filio.h>
55#include <sys/jail.h>
56#include <sys/kernel.h>
57#include <sys/limits.h>
58#include <sys/lock.h>
59#include <sys/malloc.h>
60#include <sys/mount.h>
61#include <sys/mutex.h>
62#include <sys/namei.h>
63#include <sys/selinfo.h>
64#include <sys/priv.h>
65#include <sys/proc.h>
66#include <sys/protosw.h>
67#include <sys/racct.h>
68#include <sys/resourcevar.h>
69#include <sys/sbuf.h>
70#include <sys/signalvar.h>
71#include <sys/kdb.h>
72#include <sys/smr.h>
73#include <sys/stat.h>
74#include <sys/sx.h>
75#include <sys/syscallsubr.h>
76#include <sys/sysctl.h>
77#include <sys/sysproto.h>
78#include <sys/unistd.h>
79#include <sys/user.h>
80#include <sys/vnode.h>
81#ifdef KTRACE
82#include <sys/ktrace.h>
83#endif
84
85#include <net/vnet.h>
86
87#include <security/audit/audit.h>
88
89#include <vm/uma.h>
90#include <vm/vm.h>
91
92#include <ddb/ddb.h>
93
/*
 * malloc(9) types for the allocations made by this file.  All of them are
 * file-local except M_FILECAPS, which is defined without `static`.
 */
static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
static MALLOC_DEFINE(M_PWD, "pwd", "Descriptor table vnodes");
static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
    "file desc to leader structures");
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");

/* Only declared here; the definition is not in this part of the file. */
MALLOC_DECLARE(M_FADVISE);

/*
 * UMA zones.  file_zone and filedesc0_zone are private to this file;
 * pwd_zone has external linkage.
 */
static __read_mostly uma_zone_t file_zone;
static __read_mostly uma_zone_t filedesc0_zone;
__read_mostly uma_zone_t pwd_zone;
VFS_SMR_DECLARE;
107
/*
 * Forward declarations for file-local helpers, so they can be used before
 * their definitions appear.
 */
static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
		    struct thread *td, int holdleaders);
static int	fd_first_free(struct filedesc *fdp, int low, int size);
static void	fdgrowtable(struct filedesc *fdp, int nfd);
static void	fdgrowtable_exp(struct filedesc *fdp, int nfd);
static void	fdunused(struct filedesc *fdp, int fd);
static void	fdused(struct filedesc *fdp, int fd);
static int	getmaxfd(struct thread *td);
static u_long	*filecaps_copy_prep(const struct filecaps *src);
static void	filecaps_copy_finish(const struct filecaps *src,
		    struct filecaps *dst, u_long *ioctls);
static u_long 	*filecaps_free_prep(struct filecaps *fcaps);
static void	filecaps_free_finish(u_long *ioctls);

static struct pwd *pwd_alloc(void);
123
124/*
125 * Each process has:
126 *
127 * - An array of open file descriptors (fd_ofiles)
128 * - An array of file flags (fd_ofileflags)
129 * - A bitmap recording which descriptors are in use (fd_map)
130 *
 * A process starts out with NDFILE descriptors.  The value of NDFILE has
 * been selected based on the historical limit of 20 open files, and an
 * assumption that the majority of processes, especially short-lived
 * processes like shells, will never need more.
135 *
136 * If this initial allocation is exhausted, a larger descriptor table and
137 * map are allocated dynamically, and the pointers in the process's struct
138 * filedesc are updated to point to those.  This is repeated every time
139 * the process runs out of file descriptors (provided it hasn't hit its
140 * resource limit).
141 *
142 * Since threads may hold references to individual descriptor table
143 * entries, the tables are never freed.  Instead, they are placed on a
144 * linked list and freed only when the struct filedesc is released.
145 */
/*
 * Descriptor bitmap geometry.
 *
 * NDFILE	number of descriptor slots in the initial table
 * NDSLOTSIZE	size in bytes of one bitmap word (NDSLOTTYPE)
 * NDENTRIES	number of descriptors tracked by one bitmap word
 * NDSLOT(x)	index of the bitmap word holding descriptor x
 * NDBIT(x)	mask selecting descriptor x inside its bitmap word
 * NDSLOTS(x)	number of bitmap words needed to track x descriptors
 *
 * NDENTRIES is built from sizeof, so expressions using these macros are
 * evaluated in an unsigned type.
 */
#define NDFILE		20
#define NDSLOTSIZE	sizeof(NDSLOTTYPE)
#define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
#define NDSLOT(x)	((x) / NDENTRIES)
#define NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
#define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
152
/*
 * SLIST entry used to keep track of ofiles which must be reclaimed when
 * the process exits.
 */
struct freetable {
	struct fdescenttbl *ft_table;	/* descriptor table awaiting reclaim */
	SLIST_ENTRY(freetable) ft_next;	/* linkage on the per-filedesc list */
};
161
/*
 * Initial allocation: a filedesc structure + the head of SLIST used to
 * keep track of old ofiles + enough space for NDFILE descriptors.
 */

/*
 * Fixed-size descriptor table with room for exactly NDFILE entries.
 * NOTE(review): presumably laid out to mirror struct fdescenttbl so it can
 * stand in for one -- confirm against <sys/filedesc.h>.
 */
struct fdescenttbl0 {
	int	fdt_nfiles;		/* number of entries in fdt_ofiles */
	struct	filedescent fdt_ofiles[NDFILE];
};

struct filedesc0 {
	struct filedesc fd_fd;		/* the filedesc proper */
	SLIST_HEAD(, freetable) fd_free; /* list of struct freetable entries */
	struct	fdescenttbl0 fd_dfiles;	/* embedded initial descriptor table */
	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; /* in-use bitmap for NDFILE fds */
};
178
/*
 * Descriptor management.
 */
static int __exclusive_cache_line openfiles; /* actual number of open files */
struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
/*
 * Function pointer invoked with (td, fd, fp).
 * NOTE(review): presumably installed by the mqueue code and called on
 * descriptor close -- confirm against the callers.
 */
void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
185
186/*
187 * If low >= size, just return low. Otherwise find the first zero bit in the
188 * given bitmap, starting at low and not exceeding size - 1. Return size if
189 * not found.
190 */
191static int
192fd_first_free(struct filedesc *fdp, int low, int size)
193{
194	NDSLOTTYPE *map = fdp->fd_map;
195	NDSLOTTYPE mask;
196	int off, maxoff;
197
198	if (low >= size)
199		return (low);
200
201	off = NDSLOT(low);
202	if (low % NDENTRIES) {
203		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
204		if ((mask &= ~map[off]) != 0UL)
205			return (off * NDENTRIES + ffsl(mask) - 1);
206		++off;
207	}
208	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
209		if (map[off] != ~0UL)
210			return (off * NDENTRIES + ffsl(~map[off]) - 1);
211	return (size);
212}
213
214/*
215 * Find the last used fd.
216 *
217 * Call this variant if fdp can't be modified by anyone else (e.g, during exec).
218 * Otherwise use fdlastfile.
219 */
220int
221fdlastfile_single(struct filedesc *fdp)
222{
223	NDSLOTTYPE *map = fdp->fd_map;
224	int off, minoff;
225
226	off = NDSLOT(fdp->fd_nfiles - 1);
227	for (minoff = NDSLOT(0); off >= minoff; --off)
228		if (map[off] != 0)
229			return (off * NDENTRIES + flsl(map[off]) - 1);
230	return (-1);
231}
232
/*
 * Lock-asserting wrapper around fdlastfile_single(): returns the highest
 * descriptor number in use, or -1 if there is none.
 */
int
fdlastfile(struct filedesc *fdp)
{
	int last;

	FILEDESC_LOCK_ASSERT(fdp);
	last = fdlastfile_single(fdp);
	return (last);
}
240
241static int
242fdisused(struct filedesc *fdp, int fd)
243{
244
245	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
246	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
247
248	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
249}
250
251/*
252 * Mark a file descriptor as used.
253 */
254static void
255fdused_init(struct filedesc *fdp, int fd)
256{
257
258	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
259
260	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
261}
262
263static void
264fdused(struct filedesc *fdp, int fd)
265{
266
267	FILEDESC_XLOCK_ASSERT(fdp);
268
269	fdused_init(fdp, fd);
270	if (fd == fdp->fd_freefile)
271		fdp->fd_freefile++;
272}
273
274/*
275 * Mark a file descriptor as unused.
276 */
277static void
278fdunused(struct filedesc *fdp, int fd)
279{
280
281	FILEDESC_XLOCK_ASSERT(fdp);
282
283	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
284	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
285	    ("fd=%d is still in use", fd));
286
287	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
288	if (fd < fdp->fd_freefile)
289		fdp->fd_freefile = fd;
290}
291
292/*
293 * Free a file descriptor.
294 *
295 * Avoid some work if fdp is about to be destroyed.
296 */
297static inline void
298fdefree_last(struct filedescent *fde)
299{
300
301	filecaps_free(&fde->fde_caps);
302}
303
304static inline void
305fdfree(struct filedesc *fdp, int fd)
306{
307	struct filedescent *fde;
308
309	fde = &fdp->fd_ofiles[fd];
310#ifdef CAPABILITIES
311	seqc_write_begin(&fde->fde_seqc);
312#endif
313	fde->fde_file = NULL;
314#ifdef CAPABILITIES
315	seqc_write_end(&fde->fde_seqc);
316#endif
317	fdefree_last(fde);
318	fdunused(fdp, fd);
319}
320
321/*
322 * System calls on descriptors.
323 */
324#ifndef _SYS_SYSPROTO_H_
325struct getdtablesize_args {
326	int	dummy;
327};
328#endif
329/* ARGSUSED */
330int
331sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
332{
333#ifdef	RACCT
334	uint64_t lim;
335#endif
336
337	td->td_retval[0] = getmaxfd(td);
338#ifdef	RACCT
339	PROC_LOCK(td->td_proc);
340	lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
341	PROC_UNLOCK(td->td_proc);
342	if (lim < td->td_retval[0])
343		td->td_retval[0] = lim;
344#endif
345	return (0);
346}
347
348/*
349 * Duplicate a file descriptor to a particular value.
350 *
351 * Note: keep in mind that a potential race condition exists when closing
352 * descriptors from a shared descriptor table (via rfork).
353 */
354#ifndef _SYS_SYSPROTO_H_
355struct dup2_args {
356	u_int	from;
357	u_int	to;
358};
359#endif
360/* ARGSUSED */
361int
362sys_dup2(struct thread *td, struct dup2_args *uap)
363{
364
365	return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to));
366}
367
368/*
369 * Duplicate a file descriptor.
370 */
371#ifndef _SYS_SYSPROTO_H_
372struct dup_args {
373	u_int	fd;
374};
375#endif
376/* ARGSUSED */
377int
378sys_dup(struct thread *td, struct dup_args *uap)
379{
380
381	return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0));
382}
383
/*
 * fcntl(2) system call entry point: unpack the argument structure and hand
 * off to kern_fcntl_freebsd().
 */
#ifndef _SYS_SYSPROTO_H_
struct fcntl_args {
	int	fd;
	int	cmd;
	long	arg;
};
#endif
/* ARGSUSED */
int
sys_fcntl(struct thread *td, struct fcntl_args *uap)
{
	int error;

	error = kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg);
	return (error);
}
401
402int
403kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
404{
405	struct flock fl;
406	struct __oflock ofl;
407	intptr_t arg1;
408	int error, newcmd;
409
410	error = 0;
411	newcmd = cmd;
412	switch (cmd) {
413	case F_OGETLK:
414	case F_OSETLK:
415	case F_OSETLKW:
416		/*
417		 * Convert old flock structure to new.
418		 */
419		error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
420		fl.l_start = ofl.l_start;
421		fl.l_len = ofl.l_len;
422		fl.l_pid = ofl.l_pid;
423		fl.l_type = ofl.l_type;
424		fl.l_whence = ofl.l_whence;
425		fl.l_sysid = 0;
426
427		switch (cmd) {
428		case F_OGETLK:
429			newcmd = F_GETLK;
430			break;
431		case F_OSETLK:
432			newcmd = F_SETLK;
433			break;
434		case F_OSETLKW:
435			newcmd = F_SETLKW;
436			break;
437		}
438		arg1 = (intptr_t)&fl;
439		break;
440	case F_GETLK:
441	case F_SETLK:
442	case F_SETLKW:
443	case F_SETLK_REMOTE:
444		error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
445		arg1 = (intptr_t)&fl;
446		break;
447	default:
448		arg1 = arg;
449		break;
450	}
451	if (error)
452		return (error);
453	error = kern_fcntl(td, fd, newcmd, arg1);
454	if (error)
455		return (error);
456	if (cmd == F_OGETLK) {
457		ofl.l_start = fl.l_start;
458		ofl.l_len = fl.l_len;
459		ofl.l_pid = fl.l_pid;
460		ofl.l_type = fl.l_type;
461		ofl.l_whence = fl.l_whence;
462		error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
463	} else if (cmd == F_GETLK) {
464		error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
465	}
466	return (error);
467}
468
469int
470kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
471{
472	struct filedesc *fdp;
473	struct flock *flp;
474	struct file *fp, *fp2;
475	struct filedescent *fde;
476	struct proc *p;
477	struct vnode *vp;
478	struct mount *mp;
479	int error, flg, seals, tmp;
480	uint64_t bsize;
481	off_t foffset;
482
483	error = 0;
484	flg = F_POSIX;
485	p = td->td_proc;
486	fdp = p->p_fd;
487
488	AUDIT_ARG_FD(cmd);
489	AUDIT_ARG_CMD(cmd);
490	switch (cmd) {
491	case F_DUPFD:
492		tmp = arg;
493		error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp);
494		break;
495
496	case F_DUPFD_CLOEXEC:
497		tmp = arg;
498		error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp);
499		break;
500
501	case F_DUP2FD:
502		tmp = arg;
503		error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp);
504		break;
505
506	case F_DUP2FD_CLOEXEC:
507		tmp = arg;
508		error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp);
509		break;
510
511	case F_GETFD:
512		error = EBADF;
513		FILEDESC_SLOCK(fdp);
514		fde = fdeget_locked(fdp, fd);
515		if (fde != NULL) {
516			td->td_retval[0] =
517			    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
518			error = 0;
519		}
520		FILEDESC_SUNLOCK(fdp);
521		break;
522
523	case F_SETFD:
524		error = EBADF;
525		FILEDESC_XLOCK(fdp);
526		fde = fdeget_locked(fdp, fd);
527		if (fde != NULL) {
528			fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
529			    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
530			error = 0;
531		}
532		FILEDESC_XUNLOCK(fdp);
533		break;
534
535	case F_GETFL:
536		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp);
537		if (error != 0)
538			break;
539		td->td_retval[0] = OFLAGS(fp->f_flag);
540		fdrop(fp, td);
541		break;
542
543	case F_SETFL:
544		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp);
545		if (error != 0)
546			break;
547		do {
548			tmp = flg = fp->f_flag;
549			tmp &= ~FCNTLFLAGS;
550			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
551		} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
552		tmp = fp->f_flag & FNONBLOCK;
553		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
554		if (error != 0) {
555			fdrop(fp, td);
556			break;
557		}
558		tmp = fp->f_flag & FASYNC;
559		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
560		if (error == 0) {
561			fdrop(fp, td);
562			break;
563		}
564		atomic_clear_int(&fp->f_flag, FNONBLOCK);
565		tmp = 0;
566		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
567		fdrop(fp, td);
568		break;
569
570	case F_GETOWN:
571		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp);
572		if (error != 0)
573			break;
574		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
575		if (error == 0)
576			td->td_retval[0] = tmp;
577		fdrop(fp, td);
578		break;
579
580	case F_SETOWN:
581		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp);
582		if (error != 0)
583			break;
584		tmp = arg;
585		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
586		fdrop(fp, td);
587		break;
588
589	case F_SETLK_REMOTE:
590		error = priv_check(td, PRIV_NFS_LOCKD);
591		if (error != 0)
592			return (error);
593		flg = F_REMOTE;
594		goto do_setlk;
595
596	case F_SETLKW:
597		flg |= F_WAIT;
598		/* FALLTHROUGH F_SETLK */
599
600	case F_SETLK:
601	do_setlk:
602		flp = (struct flock *)arg;
603		if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) {
604			error = EINVAL;
605			break;
606		}
607
608		error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp);
609		if (error != 0)
610			break;
611		if (fp->f_type != DTYPE_VNODE) {
612			error = EBADF;
613			fdrop(fp, td);
614			break;
615		}
616
617		if (flp->l_whence == SEEK_CUR) {
618			foffset = foffset_get(fp);
619			if (foffset < 0 ||
620			    (flp->l_start > 0 &&
621			     foffset > OFF_MAX - flp->l_start)) {
622				error = EOVERFLOW;
623				fdrop(fp, td);
624				break;
625			}
626			flp->l_start += foffset;
627		}
628
629		vp = fp->f_vnode;
630		switch (flp->l_type) {
631		case F_RDLCK:
632			if ((fp->f_flag & FREAD) == 0) {
633				error = EBADF;
634				break;
635			}
636			if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
637				PROC_LOCK(p->p_leader);
638				p->p_leader->p_flag |= P_ADVLOCK;
639				PROC_UNLOCK(p->p_leader);
640			}
641			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
642			    flp, flg);
643			break;
644		case F_WRLCK:
645			if ((fp->f_flag & FWRITE) == 0) {
646				error = EBADF;
647				break;
648			}
649			if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
650				PROC_LOCK(p->p_leader);
651				p->p_leader->p_flag |= P_ADVLOCK;
652				PROC_UNLOCK(p->p_leader);
653			}
654			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
655			    flp, flg);
656			break;
657		case F_UNLCK:
658			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
659			    flp, flg);
660			break;
661		case F_UNLCKSYS:
662			if (flg != F_REMOTE) {
663				error = EINVAL;
664				break;
665			}
666			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
667			    F_UNLCKSYS, flp, flg);
668			break;
669		default:
670			error = EINVAL;
671			break;
672		}
673		if (error != 0 || flp->l_type == F_UNLCK ||
674		    flp->l_type == F_UNLCKSYS) {
675			fdrop(fp, td);
676			break;
677		}
678
679		/*
680		 * Check for a race with close.
681		 *
682		 * The vnode is now advisory locked (or unlocked, but this case
683		 * is not really important) as the caller requested.
684		 * We had to drop the filedesc lock, so we need to recheck if
685		 * the descriptor is still valid, because if it was closed
686		 * in the meantime we need to remove advisory lock from the
687		 * vnode - close on any descriptor leading to an advisory
688		 * locked vnode, removes that lock.
689		 * We will return 0 on purpose in that case, as the result of
690		 * successful advisory lock might have been externally visible
691		 * already. This is fine - effectively we pretend to the caller
692		 * that the closing thread was a bit slower and that the
693		 * advisory lock succeeded before the close.
694		 */
695		error = fget_unlocked(fdp, fd, &cap_no_rights, &fp2);
696		if (error != 0) {
697			fdrop(fp, td);
698			break;
699		}
700		if (fp != fp2) {
701			flp->l_whence = SEEK_SET;
702			flp->l_start = 0;
703			flp->l_len = 0;
704			flp->l_type = F_UNLCK;
705			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
706			    F_UNLCK, flp, F_POSIX);
707		}
708		fdrop(fp, td);
709		fdrop(fp2, td);
710		break;
711
712	case F_GETLK:
713		error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp);
714		if (error != 0)
715			break;
716		if (fp->f_type != DTYPE_VNODE) {
717			error = EBADF;
718			fdrop(fp, td);
719			break;
720		}
721		flp = (struct flock *)arg;
722		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
723		    flp->l_type != F_UNLCK) {
724			error = EINVAL;
725			fdrop(fp, td);
726			break;
727		}
728		if (flp->l_whence == SEEK_CUR) {
729			foffset = foffset_get(fp);
730			if ((flp->l_start > 0 &&
731			    foffset > OFF_MAX - flp->l_start) ||
732			    (flp->l_start < 0 &&
733			    foffset < OFF_MIN - flp->l_start)) {
734				error = EOVERFLOW;
735				fdrop(fp, td);
736				break;
737			}
738			flp->l_start += foffset;
739		}
740		vp = fp->f_vnode;
741		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
742		    F_POSIX);
743		fdrop(fp, td);
744		break;
745
746	case F_ADD_SEALS:
747		error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
748		if (error != 0)
749			break;
750		error = fo_add_seals(fp, arg);
751		fdrop(fp, td);
752		break;
753
754	case F_GET_SEALS:
755		error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
756		if (error != 0)
757			break;
758		if (fo_get_seals(fp, &seals) == 0)
759			td->td_retval[0] = seals;
760		else
761			error = EINVAL;
762		fdrop(fp, td);
763		break;
764
765	case F_RDAHEAD:
766		arg = arg ? 128 * 1024: 0;
767		/* FALLTHROUGH */
768	case F_READAHEAD:
769		error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
770		if (error != 0)
771			break;
772		if (fp->f_type != DTYPE_VNODE) {
773			fdrop(fp, td);
774			error = EBADF;
775			break;
776		}
777		vp = fp->f_vnode;
778		if (vp->v_type != VREG) {
779			fdrop(fp, td);
780			error = ENOTTY;
781			break;
782		}
783
784		/*
785		 * Exclusive lock synchronizes against f_seqcount reads and
786		 * writes in sequential_heuristic().
787		 */
788		error = vn_lock(vp, LK_EXCLUSIVE);
789		if (error != 0) {
790			fdrop(fp, td);
791			break;
792		}
793		if (arg >= 0) {
794			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
795			arg = MIN(arg, INT_MAX - bsize + 1);
796			fp->f_seqcount[UIO_READ] = MIN(IO_SEQMAX,
797			    (arg + bsize - 1) / bsize);
798			atomic_set_int(&fp->f_flag, FRDAHEAD);
799		} else {
800			atomic_clear_int(&fp->f_flag, FRDAHEAD);
801		}
802		VOP_UNLOCK(vp);
803		fdrop(fp, td);
804		break;
805
806	case F_ISUNIONSTACK:
807		/*
808		 * Check if the vnode is part of a union stack (either the
809		 * "union" flag from mount(2) or unionfs).
810		 *
811		 * Prior to introduction of this op libc's readdir would call
812		 * fstatfs(2), in effect unnecessarily copying kilobytes of
813		 * data just to check fs name and a mount flag.
814		 *
815		 * Fixing the code to handle everything in the kernel instead
816		 * is a non-trivial endeavor and has low priority, thus this
817		 * horrible kludge facilitates the current behavior in a much
818		 * cheaper manner until someone(tm) sorts this out.
819		 */
820		error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
821		if (error != 0)
822			break;
823		if (fp->f_type != DTYPE_VNODE) {
824			fdrop(fp, td);
825			error = EBADF;
826			break;
827		}
828		vp = fp->f_vnode;
829		/*
830		 * Since we don't prevent dooming the vnode even non-null mp
831		 * found can become immediately stale. This is tolerable since
832		 * mount points are type-stable (providing safe memory access)
833		 * and any vfs op on this vnode going forward will return an
834		 * error (meaning return value in this case is meaningless).
835		 */
836		mp = atomic_load_ptr(&vp->v_mount);
837		if (__predict_false(mp == NULL)) {
838			fdrop(fp, td);
839			error = EBADF;
840			break;
841		}
842		td->td_retval[0] = 0;
843		if (mp->mnt_kern_flag & MNTK_UNIONFS ||
844		    mp->mnt_flag & MNT_UNION)
845			td->td_retval[0] = 1;
846		fdrop(fp, td);
847		break;
848
849	default:
850		error = EINVAL;
851		break;
852	}
853	return (error);
854}
855
856static int
857getmaxfd(struct thread *td)
858{
859
860	return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc));
861}
862
863/*
864 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
865 */
866int
867kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
868{
869	struct filedesc *fdp;
870	struct filedescent *oldfde, *newfde;
871	struct proc *p;
872	struct file *delfp;
873	u_long *oioctls, *nioctls;
874	int error, maxfd;
875
876	p = td->td_proc;
877	fdp = p->p_fd;
878	oioctls = NULL;
879
880	MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0);
881	MPASS(mode < FDDUP_LASTMODE);
882
883	AUDIT_ARG_FD(old);
884	/* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */
885
886	/*
887	 * Verify we have a valid descriptor to dup from and possibly to
888	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
889	 * return EINVAL when the new descriptor is out of bounds.
890	 */
891	if (old < 0)
892		return (EBADF);
893	if (new < 0)
894		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
895	maxfd = getmaxfd(td);
896	if (new >= maxfd)
897		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
898
899	error = EBADF;
900	FILEDESC_XLOCK(fdp);
901	if (fget_locked(fdp, old) == NULL)
902		goto unlock;
903	if ((mode == FDDUP_FIXED || mode == FDDUP_MUSTREPLACE) && old == new) {
904		td->td_retval[0] = new;
905		if (flags & FDDUP_FLAG_CLOEXEC)
906			fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
907		error = 0;
908		goto unlock;
909	}
910
911	oldfde = &fdp->fd_ofiles[old];
912	if (!fhold(oldfde->fde_file))
913		goto unlock;
914
915	/*
916	 * If the caller specified a file descriptor, make sure the file
917	 * table is large enough to hold it, and grab it.  Otherwise, just
918	 * allocate a new descriptor the usual way.
919	 */
920	switch (mode) {
921	case FDDUP_NORMAL:
922	case FDDUP_FCNTL:
923		if ((error = fdalloc(td, new, &new)) != 0) {
924			fdrop(oldfde->fde_file, td);
925			goto unlock;
926		}
927		break;
928	case FDDUP_MUSTREPLACE:
929		/* Target file descriptor must exist. */
930		if (fget_locked(fdp, new) == NULL) {
931			fdrop(oldfde->fde_file, td);
932			goto unlock;
933		}
934		break;
935	case FDDUP_FIXED:
936		if (new >= fdp->fd_nfiles) {
937			/*
938			 * The resource limits are here instead of e.g.
939			 * fdalloc(), because the file descriptor table may be
940			 * shared between processes, so we can't really use
941			 * racct_add()/racct_sub().  Instead of counting the
942			 * number of actually allocated descriptors, just put
943			 * the limit on the size of the file descriptor table.
944			 */
945#ifdef RACCT
946			if (RACCT_ENABLED()) {
947				error = racct_set_unlocked(p, RACCT_NOFILE, new + 1);
948				if (error != 0) {
949					error = EMFILE;
950					fdrop(oldfde->fde_file, td);
951					goto unlock;
952				}
953			}
954#endif
955			fdgrowtable_exp(fdp, new + 1);
956		}
957		if (!fdisused(fdp, new))
958			fdused(fdp, new);
959		break;
960	default:
961		KASSERT(0, ("%s unsupported mode %d", __func__, mode));
962	}
963
964	KASSERT(old != new, ("new fd is same as old"));
965
966	newfde = &fdp->fd_ofiles[new];
967	delfp = newfde->fde_file;
968
969	nioctls = filecaps_copy_prep(&oldfde->fde_caps);
970
971	/*
972	 * Duplicate the source descriptor.
973	 */
974#ifdef CAPABILITIES
975	seqc_write_begin(&newfde->fde_seqc);
976#endif
977	oioctls = filecaps_free_prep(&newfde->fde_caps);
978	memcpy(newfde, oldfde, fde_change_size);
979	filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
980	    nioctls);
981	if ((flags & FDDUP_FLAG_CLOEXEC) != 0)
982		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
983	else
984		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
985#ifdef CAPABILITIES
986	seqc_write_end(&newfde->fde_seqc);
987#endif
988	td->td_retval[0] = new;
989
990	error = 0;
991
992	if (delfp != NULL) {
993		(void) closefp(fdp, new, delfp, td, 1);
994		FILEDESC_UNLOCK_ASSERT(fdp);
995	} else {
996unlock:
997		FILEDESC_XUNLOCK(fdp);
998	}
999
1000	filecaps_free_finish(oioctls);
1001	return (error);
1002}
1003
1004/*
1005 * If sigio is on the list associated with a process or process group,
1006 * disable signalling from the device, remove sigio from the list and
1007 * free sigio.
1008 */
1009void
1010funsetown(struct sigio **sigiop)
1011{
1012	struct sigio *sigio;
1013
1014	if (*sigiop == NULL)
1015		return;
1016	SIGIO_LOCK();
1017	sigio = *sigiop;
1018	if (sigio == NULL) {
1019		SIGIO_UNLOCK();
1020		return;
1021	}
1022	*(sigio->sio_myref) = NULL;
1023	if ((sigio)->sio_pgid < 0) {
1024		struct pgrp *pg = (sigio)->sio_pgrp;
1025		PGRP_LOCK(pg);
1026		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
1027			    sigio, sio_pgsigio);
1028		PGRP_UNLOCK(pg);
1029	} else {
1030		struct proc *p = (sigio)->sio_proc;
1031		PROC_LOCK(p);
1032		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
1033			    sigio, sio_pgsigio);
1034		PROC_UNLOCK(p);
1035	}
1036	SIGIO_UNLOCK();
1037	crfree(sigio->sio_ucred);
1038	free(sigio, M_SIGIO);
1039}
1040
1041/*
1042 * Free a list of sigio structures.
1043 * We only need to lock the SIGIO_LOCK because we have made ourselves
1044 * inaccessible to callers of fsetown and therefore do not need to lock
1045 * the proc or pgrp struct for the list manipulation.
1046 */
1047void
1048funsetownlst(struct sigiolst *sigiolst)
1049{
1050	struct proc *p;
1051	struct pgrp *pg;
1052	struct sigio *sigio;
1053
1054	sigio = SLIST_FIRST(sigiolst);
1055	if (sigio == NULL)
1056		return;
1057	p = NULL;
1058	pg = NULL;
1059
1060	/*
1061	 * Every entry of the list should belong
1062	 * to a single proc or pgrp.
1063	 */
1064	if (sigio->sio_pgid < 0) {
1065		pg = sigio->sio_pgrp;
1066		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
1067	} else /* if (sigio->sio_pgid > 0) */ {
1068		p = sigio->sio_proc;
1069		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
1070	}
1071
1072	SIGIO_LOCK();
1073	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
1074		*(sigio->sio_myref) = NULL;
1075		if (pg != NULL) {
1076			KASSERT(sigio->sio_pgid < 0,
1077			    ("Proc sigio in pgrp sigio list"));
1078			KASSERT(sigio->sio_pgrp == pg,
1079			    ("Bogus pgrp in sigio list"));
1080			PGRP_LOCK(pg);
1081			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
1082			    sio_pgsigio);
1083			PGRP_UNLOCK(pg);
1084		} else /* if (p != NULL) */ {
1085			KASSERT(sigio->sio_pgid > 0,
1086			    ("Pgrp sigio in proc sigio list"));
1087			KASSERT(sigio->sio_proc == p,
1088			    ("Bogus proc in sigio list"));
1089			PROC_LOCK(p);
1090			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
1091			    sio_pgsigio);
1092			PROC_UNLOCK(p);
1093		}
1094		SIGIO_UNLOCK();
1095		crfree(sigio->sio_ucred);
1096		free(sigio, M_SIGIO);
1097		SIGIO_LOCK();
1098	}
1099	SIGIO_UNLOCK();
1100}
1101
1102/*
1103 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
1104 *
1105 * After permission checking, add a sigio structure to the sigio list for
1106 * the process or process group.
1107 */
int
fsetown(pid_t pgid, struct sigio **sigiop)
{
	struct proc *proc;
	struct pgrp *pgrp;
	struct sigio *sigio;
	int ret;

	/* A pgid of zero only removes whatever owner is currently set. */
	if (pgid == 0) {
		funsetown(sigiop);
		return (0);
	}

	ret = 0;

	/* Allocate and fill in the new sigio out of locks. */
	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
	sigio->sio_pgid = pgid;
	sigio->sio_ucred = crhold(curthread->td_ucred);
	sigio->sio_myref = sigiop;

	/*
	 * A positive pgid names a single process, a negative one names the
	 * process group -pgid.  The lookup, the session check and the list
	 * insertion below all happen with proctree_lock held shared.
	 */
	sx_slock(&proctree_lock);
	if (pgid > 0) {
		proc = pfind(pgid);
		if (proc == NULL) {
			ret = ESRCH;
			goto fail;
		}

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		PROC_UNLOCK(proc);
		if (proc->p_session != curthread->td_proc->p_session) {
			ret = EPERM;
			goto fail;
		}

		pgrp = NULL;
	} else /* if (pgid < 0) */ {
		pgrp = pgfind(-pgid);
		if (pgrp == NULL) {
			ret = ESRCH;
			goto fail;
		}
		PGRP_UNLOCK(pgrp);

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		if (pgrp->pg_session != curthread->td_proc->p_session) {
			ret = EPERM;
			goto fail;
		}

		proc = NULL;
	}
	/* Drop the previous owner (if any) before linking in the new one. */
	funsetown(sigiop);
	if (pgid > 0) {
		PROC_LOCK(proc);
		/*
		 * Since funsetownlst() is called without the proctree
		 * locked, we need to check for P_WEXIT.
		 * XXX: is ESRCH correct?
		 */
		if ((proc->p_flag & P_WEXIT) != 0) {
			PROC_UNLOCK(proc);
			ret = ESRCH;
			goto fail;
		}
		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
		sigio->sio_proc = proc;
		PROC_UNLOCK(proc);
	} else {
		PGRP_LOCK(pgrp);
		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
		sigio->sio_pgrp = pgrp;
		PGRP_UNLOCK(pgrp);
	}
	sx_sunlock(&proctree_lock);
	/* Publish the new sigio through the caller's slot under the SIGIO lock. */
	SIGIO_LOCK();
	*sigiop = sigio;
	SIGIO_UNLOCK();
	return (0);

fail:
	/* Every failure path arrives here with proctree_lock still held. */
	sx_sunlock(&proctree_lock);
	crfree(sigio->sio_ucred);
	free(sigio, M_SIGIO);
	return (ret);
}
1209
1210/*
1211 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
1212 */
1213pid_t
1214fgetown(struct sigio **sigiop)
1215{
1216	pid_t pgid;
1217
1218	SIGIO_LOCK();
1219	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
1220	SIGIO_UNLOCK();
1221	return (pgid);
1222}
1223
/*
 * Finish closing a descriptor whose slot has already been released.
 *
 * Must be entered with the filedesc lock held exclusively; the function
 * drops the filedesc lock on return.  The value returned is whatever
 * closef() reports for fp.
 *
 * When holdleaders is non-zero and the process has a p_fdtol,
 * fd_holdleaderscount is raised for the duration of closef() and any
 * thread that asked for a wakeup is woken once the count returns to zero.
 */
static int
closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
    int holdleaders)
{
	int error;

	FILEDESC_XLOCK_ASSERT(fdp);

	if (holdleaders) {
		if (td->td_proc->p_fdtol != NULL) {
			/*
			 * Ask fdfree() to sleep to ensure that all relevant
			 * process leaders can be traversed in closef().
			 */
			fdp->fd_holdleaderscount++;
		} else {
			holdleaders = 0;
		}
	}

	/*
	 * We now hold the fp reference that used to be owned by the
	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
	 * knote_fdclose to prevent a race of the fd getting opened, a knote
	 * added, and deleting a knote for the new fd.
	 */
	if (__predict_false(!TAILQ_EMPTY(&fdp->fd_kqlist)))
		knote_fdclose(td, fd);

	/*
	 * We need to notify mqueue if the object is of type mqueue.
	 */
	if (__predict_false(fp->f_type == DTYPE_MQUEUE))
		mq_fdclose(td, fd, fp);
	FILEDESC_XUNLOCK(fdp);

	error = closef(fp, td);
	if (holdleaders) {
		/* Undo the hold taken above and wake a waiter if one asked. */
		FILEDESC_XLOCK(fdp);
		fdp->fd_holdleaderscount--;
		if (fdp->fd_holdleaderscount == 0 &&
		    fdp->fd_holdleaderswakeup != 0) {
			fdp->fd_holdleaderswakeup = 0;
			wakeup(&fdp->fd_holdleaderscount);
		}
		FILEDESC_XUNLOCK(fdp);
	}
	return (error);
}
1276
/*
 * close(2) system call entry point: close a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct close_args {
	int     fd;
};
#endif
/* ARGSUSED */
int
sys_close(struct thread *td, struct close_args *uap)
{
	int fd;

	/* All of the work is delegated to kern_close(). */
	fd = uap->fd;
	return (kern_close(td, fd));
}
1292
1293int
1294kern_close(struct thread *td, int fd)
1295{
1296	struct filedesc *fdp;
1297	struct file *fp;
1298
1299	fdp = td->td_proc->p_fd;
1300
1301	AUDIT_SYSCLOSE(td, fd);
1302
1303	FILEDESC_XLOCK(fdp);
1304	if ((fp = fget_locked(fdp, fd)) == NULL) {
1305		FILEDESC_XUNLOCK(fdp);
1306		return (EBADF);
1307	}
1308	fdfree(fdp, fd);
1309
1310	/* closefp() drops the FILEDESC lock for us. */
1311	return (closefp(fdp, fd, fp, td, 1));
1312}
1313
/*
 * Close every open descriptor in the inclusive range [lowfd, highfd].
 *
 * Returns EINVAL when highfd < lowfd and 0 otherwise; the results of the
 * individual kern_close() calls are discarded.  The table is walked under
 * the shared filedesc lock, which is dropped around each kern_close() call
 * and retaken afterwards.
 */
int
kern_close_range(struct thread *td, u_int lowfd, u_int highfd)
{
	struct filedesc *fdp;
	int fd, ret, lastfile;

	ret = 0;
	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);

	/*
	 * Check this prior to clamping; closefrom(3) with only fd 0, 1, and 2
	 * open should not be a usage error.  From a close_range() perspective,
	 * close_range(3, ~0U, 0) in the same scenario should also likely not
	 * be a usage error as all fd above 3 are in-fact already closed.
	 */
	if (highfd < lowfd) {
		ret = EINVAL;
		goto out;
	}

	/*
	 * If lastfile == -1, we're dealing with either a fresh file
	 * table or one in which every fd has been closed.  Just return
	 * successful; there's nothing left to do.
	 */
	lastfile = fdlastfile(fdp);
	if (lastfile == -1)
		goto out;
	/* Clamped to [lowfd, lastfile] */
	highfd = MIN(highfd, lastfile);
	for (fd = lowfd; fd <= highfd; fd++) {
		if (fdp->fd_ofiles[fd].fde_file != NULL) {
			/* kern_close() takes the exclusive lock itself. */
			FILEDESC_SUNLOCK(fdp);
			(void)kern_close(td, fd);
			FILEDESC_SLOCK(fdp);
		}
	}
out:
	FILEDESC_SUNLOCK(fdp);
	return (ret);
}
1356
1357#ifndef _SYS_SYSPROTO_H_
1358struct close_range_args {
1359	u_int	lowfd;
1360	u_int	highfd;
1361	int	flags;
1362};
1363#endif
1364int
1365sys_close_range(struct thread *td, struct close_range_args *uap)
1366{
1367
1368	/* No flags currently defined */
1369	if (uap->flags != 0)
1370		return (EINVAL);
1371	return (kern_close_range(td, uap->lowfd, uap->highfd));
1372}
1373
#ifdef COMPAT_FREEBSD12
/*
 * Compatibility closefrom entry point: close every open descriptor
 * numbered lowfd or above by way of kern_close_range().
 */
#ifndef _SYS_SYSPROTO_H_
struct freebsd12_closefrom_args {
	int	lowfd;
};
#endif
/* ARGSUSED */
int
freebsd12_closefrom(struct thread *td, struct freebsd12_closefrom_args *uap)
{
	u_int first;

	AUDIT_ARG_FD(uap->lowfd);

	/*
	 * A negative starting descriptor behaves exactly like
	 * closefrom(0), i.e. every file is closed.
	 */
	if (uap->lowfd < 0)
		first = 0;
	else
		first = uap->lowfd;
	return (kern_close_range(td, first, ~0U));
}
#endif	/* COMPAT_FREEBSD12 */
1399
#if defined(COMPAT_43)
/*
 * Old-format fstat: fetch the status with kern_fstat(), convert it to a
 * struct ostat with cvtstat() and copy the result out to uap->sb.
 */
#ifndef _SYS_SYSPROTO_H_
struct ofstat_args {
	int	fd;
	struct	ostat *sb;
};
#endif
/* ARGSUSED */
int
ofstat(struct thread *td, struct ofstat_args *uap)
{
	struct ostat old_sb;
	struct stat sb;
	int error;

	error = kern_fstat(td, uap->fd, &sb);
	if (error != 0)
		return (error);
	cvtstat(&sb, &old_sb);
	return (copyout(&old_sb, uap->sb, sizeof(old_sb)));
}
#endif /* COMPAT_43 */
1426
#if defined(COMPAT_FREEBSD11)
/*
 * fstat for the freebsd11 stat layout.  Each step (kern_fstat(),
 * freebsd11_cvtstat(), copyout()) runs only if the previous one
 * succeeded; the first error is returned.
 */
int
freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap)
{
	struct freebsd11_stat old_sb;
	struct stat sb;
	int error;

	error = kern_fstat(td, uap->fd, &sb);
	if (error == 0)
		error = freebsd11_cvtstat(&sb, &old_sb);
	if (error == 0)
		error = copyout(&old_sb, uap->sb, sizeof(old_sb));
	return (error);
}
#endif	/* COMPAT_FREEBSD11 */
1444
1445/*
1446 * Return status information about a file descriptor.
1447 */
1448#ifndef _SYS_SYSPROTO_H_
1449struct fstat_args {
1450	int	fd;
1451	struct	stat *sb;
1452};
1453#endif
1454/* ARGSUSED */
1455int
1456sys_fstat(struct thread *td, struct fstat_args *uap)
1457{
1458	struct stat ub;
1459	int error;
1460
1461	error = kern_fstat(td, uap->fd, &ub);
1462	if (error == 0)
1463		error = copyout(&ub, uap->sb, sizeof(ub));
1464	return (error);
1465}
1466
/*
 * Fill *sbp with status information for descriptor fd.
 *
 * The file is looked up with cap_fstat_rights; a lookup failure is
 * returned as-is.  Otherwise the result of the file's fo_stat method is
 * returned, and the file reference is dropped before returning.
 */
int
kern_fstat(struct thread *td, int fd, struct stat *sbp)
{
	struct file *fp;
	int error;

	AUDIT_ARG_FD(fd);

	error = fget(td, fd, &cap_fstat_rights, &fp);
	if (__predict_false(error != 0))
		return (error);

	AUDIT_ARG_FILE(td->td_proc, fp);

	error = fo_stat(fp, sbp, td->td_ucred, td);
	fdrop(fp, td);
#ifdef __STAT_TIME_T_EXT
	/* Clear the extension fields of the timestamps unconditionally. */
	sbp->st_atim_ext = 0;
	sbp->st_mtim_ext = 0;
	sbp->st_ctim_ext = 0;
	sbp->st_btim_ext = 0;
#endif
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrstat_error(sbp, error);
#endif
	return (error);
}
1495
#if defined(COMPAT_FREEBSD11)
/*
 * nstat-format fstat: fetch the status with kern_fstat(), convert it with
 * freebsd11_cvtnstat() and copy the struct nstat out to uap->sb.
 */
#ifndef _SYS_SYSPROTO_H_
struct freebsd11_nfstat_args {
	int	fd;
	struct	nstat *sb;
};
#endif
/* ARGSUSED */
int
freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap)
{
	struct nstat new_sb;
	struct stat sb;
	int error;

	error = kern_fstat(td, uap->fd, &sb);
	if (error != 0)
		return (error);
	freebsd11_cvtnstat(&sb, &new_sb);
	return (copyout(&new_sb, uap->sb, sizeof(new_sb)));
}
#endif /* COMPAT_FREEBSD11 */
1522
1523/*
1524 * Return pathconf information about a file descriptor.
1525 */
1526#ifndef _SYS_SYSPROTO_H_
1527struct fpathconf_args {
1528	int	fd;
1529	int	name;
1530};
1531#endif
1532/* ARGSUSED */
1533int
1534sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
1535{
1536	long value;
1537	int error;
1538
1539	error = kern_fpathconf(td, uap->fd, uap->name, &value);
1540	if (error == 0)
1541		td->td_retval[0] = value;
1542	return (error);
1543}
1544
1545int
1546kern_fpathconf(struct thread *td, int fd, int name, long *valuep)
1547{
1548	struct file *fp;
1549	struct vnode *vp;
1550	int error;
1551
1552	error = fget(td, fd, &cap_fpathconf_rights, &fp);
1553	if (error != 0)
1554		return (error);
1555
1556	if (name == _PC_ASYNC_IO) {
1557		*valuep = _POSIX_ASYNCHRONOUS_IO;
1558		goto out;
1559	}
1560	vp = fp->f_vnode;
1561	if (vp != NULL) {
1562		vn_lock(vp, LK_SHARED | LK_RETRY);
1563		error = VOP_PATHCONF(vp, name, valuep);
1564		VOP_UNLOCK(vp);
1565	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
1566		if (name != _PC_PIPE_BUF) {
1567			error = EINVAL;
1568		} else {
1569			*valuep = PIPE_BUF;
1570			error = 0;
1571		}
1572	} else {
1573		error = EOPNOTSUPP;
1574	}
1575out:
1576	fdrop(fp, td);
1577	return (error);
1578}
1579
1580/*
1581 * Copy filecaps structure allocating memory for ioctls array if needed.
1582 *
1583 * The last parameter indicates whether the fdtable is locked. If it is not and
1584 * ioctls are encountered, copying fails and the caller must lock the table.
1585 *
1586 * Note that if the table was not locked, the caller has to check the relevant
1587 * sequence counter to determine whether the operation was successful.
1588 */
1589bool
1590filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked)
1591{
1592	size_t size;
1593
1594	if (src->fc_ioctls != NULL && !locked)
1595		return (false);
1596	memcpy(dst, src, sizeof(*src));
1597	if (src->fc_ioctls == NULL)
1598		return (true);
1599
1600	KASSERT(src->fc_nioctls > 0,
1601	    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
1602
1603	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
1604	dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
1605	memcpy(dst->fc_ioctls, src->fc_ioctls, size);
1606	return (true);
1607}
1608
1609static u_long *
1610filecaps_copy_prep(const struct filecaps *src)
1611{
1612	u_long *ioctls;
1613	size_t size;
1614
1615	if (__predict_true(src->fc_ioctls == NULL))
1616		return (NULL);
1617
1618	KASSERT(src->fc_nioctls > 0,
1619	    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
1620
1621	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
1622	ioctls = malloc(size, M_FILECAPS, M_WAITOK);
1623	return (ioctls);
1624}
1625
1626static void
1627filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst,
1628    u_long *ioctls)
1629{
1630	size_t size;
1631
1632	*dst = *src;
1633	if (__predict_true(src->fc_ioctls == NULL)) {
1634		MPASS(ioctls == NULL);
1635		return;
1636	}
1637
1638	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
1639	dst->fc_ioctls = ioctls;
1640	bcopy(src->fc_ioctls, dst->fc_ioctls, size);
1641}
1642
1643/*
1644 * Move filecaps structure to the new place and clear the old place.
1645 */
1646void
1647filecaps_move(struct filecaps *src, struct filecaps *dst)
1648{
1649
1650	*dst = *src;
1651	bzero(src, sizeof(*src));
1652}
1653
1654/*
1655 * Fill the given filecaps structure with full rights.
1656 */
1657static void
1658filecaps_fill(struct filecaps *fcaps)
1659{
1660
1661	CAP_ALL(&fcaps->fc_rights);
1662	fcaps->fc_ioctls = NULL;
1663	fcaps->fc_nioctls = -1;
1664	fcaps->fc_fcntls = CAP_FCNTL_ALL;
1665}
1666
1667/*
1668 * Free memory allocated within filecaps structure.
1669 */
1670void
1671filecaps_free(struct filecaps *fcaps)
1672{
1673
1674	free(fcaps->fc_ioctls, M_FILECAPS);
1675	bzero(fcaps, sizeof(*fcaps));
1676}
1677
1678static u_long *
1679filecaps_free_prep(struct filecaps *fcaps)
1680{
1681	u_long *ioctls;
1682
1683	ioctls = fcaps->fc_ioctls;
1684	bzero(fcaps, sizeof(*fcaps));
1685	return (ioctls);
1686}
1687
/*
 * Second half of a two-step filecaps free: release an ioctls array, such
 * as the one returned by filecaps_free_prep().
 */
static void
filecaps_free_finish(u_long *ioctls)
{

	free(ioctls, M_FILECAPS);
}
1694
/*
 * Validate the given filecaps structure.
 *
 * Every check is a KASSERT; `func' is the caller's name and is only used
 * in the assertion messages.
 */
static void
filecaps_validate(const struct filecaps *fcaps, const char *func)
{

	KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
	    ("%s: invalid rights", func));
	/* Only bits covered by CAP_FCNTL_ALL may be set in fc_fcntls. */
	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
	    ("%s: invalid fcntls", func));
	KASSERT(fcaps->fc_fcntls == 0 ||
	    cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
	    ("%s: fcntls without CAP_FCNTL", func));
	/*
	 * An ioctls array needs a positive count; without an array the
	 * count must be -1 or 0.
	 */
	KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
	    ("%s: invalid ioctls", func));
	KASSERT(fcaps->fc_nioctls == 0 ||
	    cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
	    ("%s: ioctls without CAP_IOCTL", func));
}
1716
1717static void
1718fdgrowtable_exp(struct filedesc *fdp, int nfd)
1719{
1720	int nfd1;
1721
1722	FILEDESC_XLOCK_ASSERT(fdp);
1723
1724	nfd1 = fdp->fd_nfiles * 2;
1725	if (nfd1 < nfd)
1726		nfd1 = nfd;
1727	fdgrowtable(fdp, nfd1);
1728}
1729
/*
 * Grow the file table to accommodate (at least) nfd descriptors.
 *
 * nfd is rounded up to a multiple of NDENTRIES; if the table is already
 * at least that large nothing happens.  Otherwise a new, zeroed table is
 * allocated, the old entries are copied in, and the new table is published
 * with a release store.  The old table is kept on the fd_free list rather
 * than freed.
 */
static void
fdgrowtable(struct filedesc *fdp, int nfd)
{
	struct filedesc0 *fdp0;
	struct freetable *ft;
	struct fdescenttbl *ntable;
	struct fdescenttbl *otable;
	int nnfiles, onfiles;
	NDSLOTTYPE *nmap, *omap;

	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));

	/* save old values */
	onfiles = fdp->fd_nfiles;
	otable = fdp->fd_files;
	omap = fdp->fd_map;

	/* compute the size of the new table */
	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
	if (nnfiles <= onfiles)
		/* the table is already large enough */
		return;

	/*
	 * Allocate a new table.  We need enough space for the number of
	 * entries, file entries themselves and the struct freetable we will use
	 * when we decommission the table and place it on the freelist.
	 * We place the struct freetable in the middle so we don't have
	 * to worry about padding.
	 */
	ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) +
	    nnfiles * sizeof(ntable->fdt_ofiles[0]) +
	    sizeof(struct freetable),
	    M_FILEDESC, M_ZERO | M_WAITOK);
	/* copy the old data */
	ntable->fdt_nfiles = nnfiles;
	memcpy(ntable->fdt_ofiles, otable->fdt_ofiles,
	    onfiles * sizeof(ntable->fdt_ofiles[0]));

	/*
	 * Allocate a new map only if the old is not large enough.  It will
	 * grow at a slower rate than the table as it can map more
	 * entries than the table can hold.
	 */
	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
		    M_ZERO | M_WAITOK);
		/* copy over the old data and update the pointer */
		memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
		fdp->fd_map = nmap;
	}

	/*
	 * Make sure that ntable is correctly initialized before we replace
	 * fd_files pointer. Otherwise fget_unlocked() may see inconsistent
	 * data.
	 */
	atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable);

	/*
	 * Do not free the old file table, as some threads may still
	 * reference entries within it.  Instead, place it on a freelist
	 * which will be processed when the struct filedesc is released.
	 *
	 * Note that if onfiles == NDFILE, we're dealing with the original
	 * static allocation contained within (struct filedesc0 *)fdp,
	 * which must not be freed.
	 */
	if (onfiles > NDFILE) {
		/* The freetable record sits right after the old entries. */
		ft = (struct freetable *)&otable->fdt_ofiles[onfiles];
		fdp0 = (struct filedesc0 *)fdp;
		ft->ft_table = otable;
		SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
	}
	/*
	 * The map does not have the same possibility of threads still
	 * holding references to it.  So always free it as long as it
	 * does not reference the original static allocation.
	 */
	if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
		free(omap, M_FILEDESC);
}
1815
/*
 * Allocate a file descriptor for the process.
 *
 * The search starts at the larger of minfd and fd_freefile.  On success
 * the descriptor is marked used, stored in *result and 0 is returned.
 * EMFILE is returned when the first free descriptor is at or beyond
 * getmaxfd(td) or, with RACCT, when the resource limit update fails.
 * The filedesc lock must be held exclusively.
 */
int
fdalloc(struct thread *td, int minfd, int *result)
{
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	int fd, maxfd, allocfd;
#ifdef RACCT
	int error;
#endif

	FILEDESC_XLOCK_ASSERT(fdp);

	if (fdp->fd_freefile > minfd)
		minfd = fdp->fd_freefile;

	maxfd = getmaxfd(td);

	/*
	 * Search the bitmap for a free descriptor starting at minfd.
	 * If none is found, grow the file table.
	 */
	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
	if (__predict_false(fd >= maxfd))
		return (EMFILE);
	if (__predict_false(fd >= fdp->fd_nfiles)) {
		/* Ask for twice the needed index, capped at maxfd. */
		allocfd = min(fd * 2, maxfd);
#ifdef RACCT
		if (RACCT_ENABLED()) {
			error = racct_set_unlocked(p, RACCT_NOFILE, allocfd);
			if (error != 0)
				return (EMFILE);
		}
#endif
		/*
		 * fd is already equal to first free descriptor >= minfd, so
		 * we only need to grow the table and we are done.
		 */
		fdgrowtable_exp(fdp, allocfd);
	}

	/*
	 * Perform some sanity checks, then mark the file descriptor as
	 * used and return it to the caller.
	 */
	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
	    ("invalid descriptor %d", fd));
	KASSERT(!fdisused(fdp, fd),
	    ("fd_first_free() returned non-free descriptor"));
	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
	    ("file descriptor isn't free"));
	fdused(fdp, fd);
	*result = fd;
	return (0);
}
1873
1874/*
1875 * Allocate n file descriptors for the process.
1876 */
1877int
1878fdallocn(struct thread *td, int minfd, int *fds, int n)
1879{
1880	struct proc *p = td->td_proc;
1881	struct filedesc *fdp = p->p_fd;
1882	int i;
1883
1884	FILEDESC_XLOCK_ASSERT(fdp);
1885
1886	for (i = 0; i < n; i++)
1887		if (fdalloc(td, 0, &fds[i]) != 0)
1888			break;
1889
1890	if (i < n) {
1891		for (i--; i >= 0; i--)
1892			fdunused(fdp, fds[i]);
1893		return (EMFILE);
1894	}
1895
1896	return (0);
1897}
1898
1899/*
1900 * Create a new open file structure and allocate a file descriptor for the
1901 * process that refers to it.  We add one reference to the file for the
1902 * descriptor table and one reference for resultfp. This is to prevent us
1903 * being preempted and the entry in the descriptor table closed after we
1904 * release the FILEDESC lock.
1905 */
1906int
1907falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags,
1908    struct filecaps *fcaps)
1909{
1910	struct file *fp;
1911	int error, fd;
1912
1913	error = falloc_noinstall(td, &fp);
1914	if (error)
1915		return (error);		/* no reference held on error */
1916
1917	error = finstall(td, fp, &fd, flags, fcaps);
1918	if (error) {
1919		fdrop(fp, td);		/* one reference (fp only) */
1920		return (error);
1921	}
1922
1923	if (resultfp != NULL)
1924		*resultfp = fp;		/* copy out result */
1925	else
1926		fdrop(fp, td);		/* release local reference */
1927
1928	if (resultfd != NULL)
1929		*resultfd = fd;
1930
1931	return (0);
1932}
1933
1934/*
1935 * Create a new open file structure without allocating a file descriptor.
1936 */
1937int
1938falloc_noinstall(struct thread *td, struct file **resultfp)
1939{
1940	struct file *fp;
1941	int maxuserfiles = maxfiles - (maxfiles / 20);
1942	int openfiles_new;
1943	static struct timeval lastfail;
1944	static int curfail;
1945
1946	KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
1947
1948	openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1;
1949	if ((openfiles_new >= maxuserfiles &&
1950	    priv_check(td, PRIV_MAXFILES) != 0) ||
1951	    openfiles_new >= maxfiles) {
1952		atomic_subtract_int(&openfiles, 1);
1953		if (ppsratecheck(&lastfail, &curfail, 1)) {
1954			printf("kern.maxfiles limit exceeded by uid %i, (%s) "
1955			    "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm);
1956		}
1957		return (ENFILE);
1958	}
1959	fp = uma_zalloc(file_zone, M_WAITOK);
1960	bzero(fp, sizeof(*fp));
1961	refcount_init(&fp->f_count, 1);
1962	fp->f_cred = crhold(td->td_ucred);
1963	fp->f_ops = &badfileops;
1964	*resultfp = fp;
1965	return (0);
1966}
1967
/*
 * Install a file in a file descriptor table.
 *
 * Stores fp in slot fd of fdp, sets UF_EXCLOSE when flags contains
 * O_CLOEXEC, and sets the slot's capabilities: moved out of fcaps when it
 * is non-NULL (which zeroes *fcaps), full rights otherwise.  The filedesc
 * lock must be held exclusively.  No file reference is taken here.
 */
void
_finstall(struct filedesc *fdp, struct file *fp, int fd, int flags,
    struct filecaps *fcaps)
{
	struct filedescent *fde;

	MPASS(fp != NULL);
	if (fcaps != NULL)
		filecaps_validate(fcaps, __func__);
	FILEDESC_XLOCK_ASSERT(fdp);

	fde = &fdp->fd_ofiles[fd];
#ifdef CAPABILITIES
	/* Bracket the entry update with the slot's sequence counter. */
	seqc_write_begin(&fde->fde_seqc);
#endif
	fde->fde_file = fp;
	fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0;
	if (fcaps != NULL)
		filecaps_move(fcaps, &fde->fde_caps);
	else
		filecaps_fill(&fde->fde_caps);
#ifdef CAPABILITIES
	seqc_write_end(&fde->fde_seqc);
#endif
}
1996
1997int
1998finstall(struct thread *td, struct file *fp, int *fd, int flags,
1999    struct filecaps *fcaps)
2000{
2001	struct filedesc *fdp = td->td_proc->p_fd;
2002	int error;
2003
2004	MPASS(fd != NULL);
2005
2006	if (!fhold(fp))
2007		return (EBADF);
2008	FILEDESC_XLOCK(fdp);
2009	error = fdalloc(td, 0, fd);
2010	if (__predict_false(error != 0)) {
2011		FILEDESC_XUNLOCK(fdp);
2012		fdrop(fp, td);
2013		return (error);
2014	}
2015	_finstall(fdp, fp, *fd, flags, fcaps);
2016	FILEDESC_XUNLOCK(fdp);
2017	return (0);
2018}
2019
/*
 * Build a new filedesc structure from another.
 * Copy the current, root, and jail root vnode references.
 *
 * With fdp == NULL a fresh pwd is allocated and no lock is involved.
 * With fdp != NULL the pwd is taken from fdp under its shared lock.
 *
 * Locking on return: fdp is left shared locked only when it is not NULL
 * and prepfiles is true; with prepfiles false the lock is released before
 * returning.
 *
 * With prepfiles true, *lastfile receives fdlastfile(fdp) and the new
 * table is grown until it can hold that index; lastfile must be NULL when
 * prepfiles is false.
 */
struct filedesc *
fdinit(struct filedesc *fdp, bool prepfiles, int *lastfile)
{
	struct filedesc0 *newfdp0;
	struct filedesc *newfdp;
	struct pwd *newpwd;

	if (prepfiles)
		MPASS(lastfile != NULL);
	else
		MPASS(lastfile == NULL);

	newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO);
	newfdp = &newfdp0->fd_fd;

	/* Create the file descriptor table. */
	FILEDESC_LOCK_INIT(newfdp);
	refcount_init(&newfdp->fd_refcnt, 1);
	refcount_init(&newfdp->fd_holdcnt, 1);
	newfdp->fd_cmask = CMASK;
	/* Start out on the map and table embedded in struct filedesc0. */
	newfdp->fd_map = newfdp0->fd_dmap;
	newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles;
	newfdp->fd_files->fdt_nfiles = NDFILE;

	if (fdp == NULL) {
		newpwd = pwd_alloc();
		smr_serialized_store(&newfdp->fd_pwd, newpwd, true);
		return (newfdp);
	}

	FILEDESC_SLOCK(fdp);
	newpwd = pwd_hold_filedesc(fdp);
	smr_serialized_store(&newfdp->fd_pwd, newpwd, true);
	if (!prepfiles) {
		FILEDESC_SUNLOCK(fdp);
		return (newfdp);
	}

	/*
	 * The source lock is dropped while the new table grows, so the last
	 * used index is re-read after each grow until the new table is large
	 * enough.
	 */
	for (;;) {
		*lastfile = fdlastfile(fdp);
		if (*lastfile < newfdp->fd_nfiles)
			break;
		FILEDESC_SUNLOCK(fdp);
		fdgrowtable(newfdp, *lastfile + 1);
		FILEDESC_SLOCK(fdp);
	}

	return (newfdp);
}
2075
2076static struct filedesc *
2077fdhold(struct proc *p)
2078{
2079	struct filedesc *fdp;
2080
2081	PROC_LOCK_ASSERT(p, MA_OWNED);
2082	fdp = p->p_fd;
2083	if (fdp != NULL)
2084		refcount_acquire(&fdp->fd_holdcnt);
2085	return (fdp);
2086}
2087
2088static void
2089fddrop(struct filedesc *fdp)
2090{
2091
2092	if (fdp->fd_holdcnt > 1) {
2093		if (refcount_release(&fdp->fd_holdcnt) == 0)
2094			return;
2095	}
2096
2097	FILEDESC_LOCK_DESTROY(fdp);
2098	uma_zfree(filedesc0_zone, fdp);
2099}
2100
/*
 * Share a filedesc structure: take one more fd_refcnt reference and
 * return the same pointer.
 */
struct filedesc *
fdshare(struct filedesc *fdp)
{

	refcount_acquire(&fdp->fd_refcnt);
	return (fdp);
}
2111
2112/*
2113 * Unshare a filedesc structure, if necessary by making a copy
2114 */
2115void
2116fdunshare(struct thread *td)
2117{
2118	struct filedesc *tmp;
2119	struct proc *p = td->td_proc;
2120
2121	if (p->p_fd->fd_refcnt == 1)
2122		return;
2123
2124	tmp = fdcopy(p->p_fd);
2125	fdescfree(td);
2126	p->p_fd = tmp;
2127}
2128
2129void
2130fdinstall_remapped(struct thread *td, struct filedesc *fdp)
2131{
2132
2133	fdescfree(td);
2134	td->td_proc->p_fd = fdp;
2135}
2136
/*
 * Copy a filedesc structure.  fdp must not be NULL.
 *
 * Every used slot whose file is marked DFLAG_PASSABLE, and on which a
 * reference can be taken, is duplicated at the same index together with a
 * private copy of its capabilities.  All other slots are left empty in
 * the copy, and fd_freefile is set to the first such index (or to the
 * index just past the last copied slot).  The umask is copied as well.
 */
struct filedesc *
fdcopy(struct filedesc *fdp)
{
	struct filedesc *newfdp;
	struct filedescent *nfde, *ofde;
	int i, lastfile;

	MPASS(fdp != NULL);

	/* fdinit() returns with fdp shared locked; released at the end. */
	newfdp = fdinit(fdp, true, &lastfile);
	/* copy all passable descriptors (i.e. not kqueue) */
	newfdp->fd_freefile = -1;
	for (i = 0; i <= lastfile; ++i) {
		ofde = &fdp->fd_ofiles[i];
		if (ofde->fde_file == NULL ||
		    (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 ||
		    !fhold(ofde->fde_file)) {
			/* Remember the first slot left empty in the copy. */
			if (newfdp->fd_freefile == -1)
				newfdp->fd_freefile = i;
			continue;
		}
		nfde = &newfdp->fd_ofiles[i];
		*nfde = *ofde;
		filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
		fdused_init(newfdp, i);
	}
	if (newfdp->fd_freefile == -1)
		newfdp->fd_freefile = i;
	newfdp->fd_cmask = fdp->fd_cmask;
	FILEDESC_SUNLOCK(fdp);
	return (newfdp);
}
2173
/*
 * Copies a filedesc structure, while remapping all file descriptors
 * stored inside using a translation table.
 *
 * Slot i of the new table receives the file found at fds[i] in fdp, for
 * i in [0, nfds).  File descriptors are copied over to the new file
 * descriptor table, regardless of whether the close-on-exec flag is set.
 *
 * On success 0 is returned and *ret holds the new table.  Errors:
 * E2BIG when nfds exceeds the number of slots up to the last used one,
 * EBADF for an out-of-range or unused source descriptor or a file that
 * cannot be referenced, EINVAL for a file without DFLAG_PASSABLE.  On
 * error the partially built table is released and *ret is not written.
 */
int
fdcopy_remapped(struct filedesc *fdp, const int *fds, size_t nfds,
    struct filedesc **ret)
{
	struct filedesc *newfdp;
	struct filedescent *nfde, *ofde;
	int error, i, lastfile;

	MPASS(fdp != NULL);

	/* fdinit() returns with fdp shared locked; both exits unlock it. */
	newfdp = fdinit(fdp, true, &lastfile);
	if (nfds > lastfile + 1) {
		/* New table cannot be larger than the old one. */
		error = E2BIG;
		goto bad;
	}
	/* Copy all passable descriptors (i.e. not kqueue). */
	newfdp->fd_freefile = nfds;
	for (i = 0; i < nfds; ++i) {
		if (fds[i] < 0 || fds[i] > lastfile) {
			/* File descriptor out of bounds. */
			error = EBADF;
			goto bad;
		}
		ofde = &fdp->fd_ofiles[fds[i]];
		if (ofde->fde_file == NULL) {
			/* Unused file descriptor. */
			error = EBADF;
			goto bad;
		}
		if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) {
			/* File descriptor cannot be passed. */
			error = EINVAL;
			goto bad;
		}
		if (!fhold(ofde->fde_file)) {
			error = EBADF;
			goto bad;
		}
		nfde = &newfdp->fd_ofiles[i];
		*nfde = *ofde;
		filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
		fdused_init(newfdp, i);
	}
	newfdp->fd_cmask = fdp->fd_cmask;
	FILEDESC_SUNLOCK(fdp);
	*ret = newfdp;
	return (0);
bad:
	FILEDESC_SUNLOCK(fdp);
	fdescfree_remapped(newfdp);
	return (error);
}
2234
/*
 * Clear POSIX style locks. This is only used when fdp loses a reference (i.e.
 * one of processes using it exits) and the table used to be shared.
 *
 * The process must have a p_fdtol.  On return p_fdtol is NULL, and the
 * filedesc_to_leader structure is unlinked and freed if this was its last
 * reference and nothing holds it.
 */
static void
fdclearlocks(struct thread *td)
{
	struct filedesc *fdp;
	struct filedesc_to_leader *fdtol;
	struct flock lf;
	struct file *fp;
	struct proc *p;
	struct vnode *vp;
	int i, lastfile;

	p = td->td_proc;
	fdp = p->p_fd;
	fdtol = p->p_fdtol;
	MPASS(fdtol != NULL);

	FILEDESC_XLOCK(fdp);
	KASSERT(fdtol->fdl_refcount > 0,
	    ("filedesc_to_refcount botch: fdl_refcount=%d",
	    fdtol->fdl_refcount));
	if (fdtol->fdl_refcount == 1 &&
	    (p->p_leader->p_flag & P_ADVLOCK) != 0) {
		/*
		 * Last reference and the leader has P_ADVLOCK set: issue a
		 * whole-file F_UNLCK on behalf of the leader for every vnode
		 * in the table.  The filedesc lock is dropped around each
		 * VOP_ADVLOCK() call while a file reference is held.
		 */
		lastfile = fdlastfile(fdp);
		for (i = 0; i <= lastfile; i++) {
			fp = fdp->fd_ofiles[i].fde_file;
			if (fp == NULL || fp->f_type != DTYPE_VNODE ||
			    !fhold(fp))
				continue;
			FILEDESC_XUNLOCK(fdp);
			lf.l_whence = SEEK_SET;
			lf.l_start = 0;
			lf.l_len = 0;
			lf.l_type = F_UNLCK;
			vp = fp->f_vnode;
			(void) VOP_ADVLOCK(vp,
			    (caddr_t)p->p_leader, F_UNLCK,
			    &lf, F_POSIX);
			FILEDESC_XLOCK(fdp);
			fdrop(fp, td);
		}
	}
retry:
	/* Each sleep below releases the filedesc lock, so re-check after it. */
	if (fdtol->fdl_refcount == 1) {
		if (fdp->fd_holdleaderscount > 0 &&
		    (p->p_leader->p_flag & P_ADVLOCK) != 0) {
			/*
			 * close() or kern_dup() has cleared a reference
			 * in a shared file descriptor table.
			 */
			fdp->fd_holdleaderswakeup = 1;
			sx_sleep(&fdp->fd_holdleaderscount,
			    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
			goto retry;
		}
		if (fdtol->fdl_holdcount > 0) {
			/*
			 * Ensure that fdtol->fdl_leader remains
			 * valid in closef().
			 */
			fdtol->fdl_wakeup = 1;
			sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
			    "fdlhold", 0);
			goto retry;
		}
	}
	fdtol->fdl_refcount--;
	if (fdtol->fdl_refcount == 0 &&
	    fdtol->fdl_holdcount == 0) {
		/* Unlink from the doubly linked list; freed after unlock. */
		fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
		fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
	} else
		fdtol = NULL;
	p->p_fdtol = NULL;
	FILEDESC_XUNLOCK(fdp);
	if (fdtol != NULL)
		free(fdtol, M_FILEDESC_TO_LEADER);
}
2316
2317/*
2318 * Release a filedesc structure.
2319 */
2320static void
2321fdescfree_fds(struct thread *td, struct filedesc *fdp, bool needclose)
2322{
2323	struct filedesc0 *fdp0;
2324	struct freetable *ft, *tft;
2325	struct filedescent *fde;
2326	struct file *fp;
2327	int i, lastfile;
2328
2329	lastfile = fdlastfile_single(fdp);
2330	for (i = 0; i <= lastfile; i++) {
2331		fde = &fdp->fd_ofiles[i];
2332		fp = fde->fde_file;
2333		if (fp != NULL) {
2334			fdefree_last(fde);
2335			if (needclose)
2336				(void) closef(fp, td);
2337			else
2338				fdrop(fp, td);
2339		}
2340	}
2341
2342	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
2343		free(fdp->fd_map, M_FILEDESC);
2344	if (fdp->fd_nfiles > NDFILE)
2345		free(fdp->fd_files, M_FILEDESC);
2346
2347	fdp0 = (struct filedesc0 *)fdp;
2348	SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft)
2349		free(ft->ft_table, M_FILEDESC);
2350
2351	fddrop(fdp);
2352}
2353
/*
 * Detach the calling process from its filedesc structure and release the
 * process's reference to it.
 *
 * p_fd is cleared under the process lock.  If other references to the
 * table remain, nothing more is done.  Otherwise the pwd is detached and
 * dropped and all files are closed via fdescfree_fds().
 */
void
fdescfree(struct thread *td)
{
	struct proc *p;
	struct filedesc *fdp;
	struct pwd *pwd;

	p = td->td_proc;
	fdp = p->p_fd;
	MPASS(fdp != NULL);

#ifdef RACCT
	if (RACCT_ENABLED())
		racct_set_unlocked(p, RACCT_NOFILE, 0);
#endif

	if (p->p_fdtol != NULL)
		fdclearlocks(td);

	PROC_LOCK(p);
	p->p_fd = NULL;
	PROC_UNLOCK(p);

	/* Not the last reference: the remaining users keep the table. */
	if (refcount_release(&fdp->fd_refcnt) == 0)
		return;

	/* Detach the pwd under the lock, drop it once unlocked. */
	FILEDESC_XLOCK(fdp);
	pwd = FILEDESC_XLOCKED_LOAD_PWD(fdp);
	pwd_set(fdp, NULL);
	FILEDESC_XUNLOCK(fdp);

	pwd_drop(pwd);

	fdescfree_fds(td, fdp, 1);
}
2389
2390void
2391fdescfree_remapped(struct filedesc *fdp)
2392{
2393
2394	pwd_drop(smr_serialized_load(&fdp->fd_pwd, true));
2395	fdescfree_fds(curthread, fdp, 0);
2396}
2397
2398/*
2399 * For setugid programs, we don't want to people to use that setugidness
2400 * to generate error messages which write to a file which otherwise would
2401 * otherwise be off-limits to the process.  We check for filesystems where
2402 * the vnode can change out from under us after execve (like [lin]procfs).
2403 *
2404 * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is
2405 * sufficient.  We also don't check for setugidness since we know we are.
2406 */
2407static bool
2408is_unsafe(struct file *fp)
2409{
2410	struct vnode *vp;
2411
2412	if (fp->f_type != DTYPE_VNODE)
2413		return (false);
2414
2415	vp = fp->f_vnode;
2416	return ((vp->v_vflag & VV_PROCDEP) != 0);
2417}
2418
2419/*
2420 * Make this setguid thing safe, if at all possible.
2421 */
2422void
2423fdsetugidsafety(struct thread *td)
2424{
2425	struct filedesc *fdp;
2426	struct file *fp;
2427	int i;
2428
2429	fdp = td->td_proc->p_fd;
2430	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2431	MPASS(fdp->fd_nfiles >= 3);
2432	for (i = 0; i <= 2; i++) {
2433		fp = fdp->fd_ofiles[i].fde_file;
2434		if (fp != NULL && is_unsafe(fp)) {
2435			FILEDESC_XLOCK(fdp);
2436			knote_fdclose(td, i);
2437			/*
2438			 * NULL-out descriptor prior to close to avoid
2439			 * a race while close blocks.
2440			 */
2441			fdfree(fdp, i);
2442			FILEDESC_XUNLOCK(fdp);
2443			(void) closef(fp, td);
2444		}
2445	}
2446}
2447
2448/*
2449 * If a specific file object occupies a specific file descriptor, close the
2450 * file descriptor entry and drop a reference on the file object.  This is a
2451 * convenience function to handle a subsequent error in a function that calls
2452 * falloc() that handles the race that another thread might have closed the
2453 * file descriptor out from under the thread creating the file object.
2454 */
2455void
2456fdclose(struct thread *td, struct file *fp, int idx)
2457{
2458	struct filedesc *fdp = td->td_proc->p_fd;
2459
2460	FILEDESC_XLOCK(fdp);
2461	if (fdp->fd_ofiles[idx].fde_file == fp) {
2462		fdfree(fdp, idx);
2463		FILEDESC_XUNLOCK(fdp);
2464		fdrop(fp, td);
2465	} else
2466		FILEDESC_XUNLOCK(fdp);
2467}
2468
2469/*
2470 * Close any files on exec?
2471 */
2472void
2473fdcloseexec(struct thread *td)
2474{
2475	struct filedesc *fdp;
2476	struct filedescent *fde;
2477	struct file *fp;
2478	int i, lastfile;
2479
2480	fdp = td->td_proc->p_fd;
2481	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2482	lastfile = fdlastfile_single(fdp);
2483	for (i = 0; i <= lastfile; i++) {
2484		fde = &fdp->fd_ofiles[i];
2485		fp = fde->fde_file;
2486		if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
2487		    (fde->fde_flags & UF_EXCLOSE))) {
2488			FILEDESC_XLOCK(fdp);
2489			fdfree(fdp, i);
2490			(void) closefp(fdp, i, fp, td, 0);
2491			FILEDESC_UNLOCK_ASSERT(fdp);
2492		}
2493	}
2494}
2495
2496/*
2497 * It is unsafe for set[ug]id processes to be started with file
2498 * descriptors 0..2 closed, as these descriptors are given implicit
2499 * significance in the Standard C library.  fdcheckstd() will create a
2500 * descriptor referencing /dev/null for each of stdin, stdout, and
2501 * stderr that is not already open.
2502 */
2503int
2504fdcheckstd(struct thread *td)
2505{
2506	struct filedesc *fdp;
2507	register_t save;
2508	int i, error, devnull;
2509
2510	fdp = td->td_proc->p_fd;
2511	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2512	MPASS(fdp->fd_nfiles >= 3);
2513	devnull = -1;
2514	for (i = 0; i <= 2; i++) {
2515		if (fdp->fd_ofiles[i].fde_file != NULL)
2516			continue;
2517
2518		save = td->td_retval[0];
2519		if (devnull != -1) {
2520			error = kern_dup(td, FDDUP_FIXED, 0, devnull, i);
2521		} else {
2522			error = kern_openat(td, AT_FDCWD, "/dev/null",
2523			    UIO_SYSSPACE, O_RDWR, 0);
2524			if (error == 0) {
2525				devnull = td->td_retval[0];
2526				KASSERT(devnull == i, ("we didn't get our fd"));
2527			}
2528		}
2529		td->td_retval[0] = save;
2530		if (error != 0)
2531			return (error);
2532	}
2533	return (0);
2534}
2535
/*
 * Internal form of close.  Decrement reference count on file structure.
 * Note: td may be NULL when closing a file that was being passed in a
 * message.
 *
 * For a vnode-backed file closed with a non-NULL td, whole-file POSIX
 * advisory unlock requests are issued first (see below).  The return value
 * is that of the final fdrop(); results of the unlock requests are ignored.
 */
int
closef(struct file *fp, struct thread *td)
{
	struct vnode *vp;
	struct flock lf;
	struct filedesc_to_leader *fdtol;
	struct filedesc *fdp;

	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor, and the thread pointer
	 * will be NULL.  Callers should be careful only to pass a
	 * NULL thread pointer when there really is no owning
	 * context that might have locks, or the locks will be
	 * leaked.
	 */
	if (fp->f_type == DTYPE_VNODE && td != NULL) {
		vp = fp->f_vnode;
		/* Unlock on behalf of this process' leader, if it ever locked. */
		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
			lf.l_whence = SEEK_SET;
			lf.l_start = 0;
			lf.l_len = 0;
			lf.l_type = F_UNLCK;
			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
			    F_UNLCK, &lf, F_POSIX);
		}
		fdtol = td->td_proc->p_fdtol;
		if (fdtol != NULL) {
			/*
			 * Handle special case where file descriptor table is
			 * shared between multiple process leaders.
			 */
			fdp = td->td_proc->p_fd;
			FILEDESC_XLOCK(fdp);
			/*
			 * Walk the circular list starting after our own
			 * entry and stopping when we come back to it.
			 */
			for (fdtol = fdtol->fdl_next;
			    fdtol != td->td_proc->p_fdtol;
			    fdtol = fdtol->fdl_next) {
				if ((fdtol->fdl_leader->p_flag &
				    P_ADVLOCK) == 0)
					continue;
				/*
				 * Bump the hold count before dropping the
				 * table lock around the unlock request.
				 */
				fdtol->fdl_holdcount++;
				FILEDESC_XUNLOCK(fdp);
				lf.l_whence = SEEK_SET;
				lf.l_start = 0;
				lf.l_len = 0;
				lf.l_type = F_UNLCK;
				vp = fp->f_vnode;
				(void) VOP_ADVLOCK(vp,
				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
				    F_POSIX);
				FILEDESC_XLOCK(fdp);
				fdtol->fdl_holdcount--;
				/* Last hold gone: wake anyone who asked for it. */
				if (fdtol->fdl_holdcount == 0 &&
				    fdtol->fdl_wakeup != 0) {
					fdtol->fdl_wakeup = 0;
					wakeup(fdtol);
				}
			}
			FILEDESC_XUNLOCK(fdp);
		}
	}
	return (fdrop(fp, td));
}
2608
2609/*
2610 * Initialize the file pointer with the specified properties.
2611 *
2612 * The ops are set with release semantics to be certain that the flags, type,
2613 * and data are visible when ops is.  This is to prevent ops methods from being
2614 * called with bad data.
2615 */
2616void
2617finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
2618{
2619	fp->f_data = data;
2620	fp->f_flag = flag;
2621	fp->f_type = type;
2622	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
2623}
2624
2625void
2626finit_vnode(struct file *fp, u_int flag, void *data, struct fileops *ops)
2627{
2628	fp->f_seqcount[UIO_READ] = 1;
2629	fp->f_seqcount[UIO_WRITE] = 1;
2630	finit(fp, (flag & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE,
2631	    data, ops);
2632}
2633
2634int
2635fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
2636    struct file **fpp, struct filecaps *havecapsp)
2637{
2638	struct filedescent *fde;
2639	int error;
2640
2641	FILEDESC_LOCK_ASSERT(fdp);
2642
2643	fde = fdeget_locked(fdp, fd);
2644	if (fde == NULL) {
2645		error = EBADF;
2646		goto out;
2647	}
2648
2649#ifdef CAPABILITIES
2650	error = cap_check(cap_rights_fde_inline(fde), needrightsp);
2651	if (error != 0)
2652		goto out;
2653#endif
2654
2655	if (havecapsp != NULL)
2656		filecaps_copy(&fde->fde_caps, havecapsp, true);
2657
2658	*fpp = fde->fde_file;
2659
2660	error = 0;
2661out:
2662	return (error);
2663}
2664
2665int
2666fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp,
2667    struct file **fpp, struct filecaps *havecapsp)
2668{
2669	struct filedesc *fdp = td->td_proc->p_fd;
2670	int error;
2671#ifndef CAPABILITIES
2672	error = fget_unlocked(fdp, fd, needrightsp, fpp);
2673	if (havecapsp != NULL && error == 0)
2674		filecaps_fill(havecapsp);
2675#else
2676	struct file *fp;
2677	seqc_t seq;
2678
2679	*fpp = NULL;
2680	for (;;) {
2681		error = fget_unlocked_seq(fdp, fd, needrightsp, &fp, &seq);
2682		if (error != 0)
2683			return (error);
2684
2685		if (havecapsp != NULL) {
2686			if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps,
2687			    havecapsp, false)) {
2688				fdrop(fp, td);
2689				goto get_locked;
2690			}
2691		}
2692
2693		if (!fd_modified(fdp, fd, seq))
2694			break;
2695		fdrop(fp, td);
2696	}
2697
2698	*fpp = fp;
2699	return (0);
2700
2701get_locked:
2702	FILEDESC_SLOCK(fdp);
2703	error = fget_cap_locked(fdp, fd, needrightsp, fpp, havecapsp);
2704	if (error == 0 && !fhold(*fpp))
2705		error = EBADF;
2706	FILEDESC_SUNLOCK(fdp);
2707#endif
2708	return (error);
2709}
2710
#ifdef CAPABILITIES
/*
 * Lockless fast path resolving descriptor 'fd' of curproc to a directory
 * vnode for a lookup.  Asserts that the caller is inside the VFS SMR
 * section; no reference is taken on the file or the vnode here.
 *
 * On success returns 0, stores the vnode in *vpp, copies the descriptor's
 * capabilities into ndp->ni_filecaps and sets *fsearch from FSEARCH.
 * Returns EBADF when 'fd' is beyond the table and EAGAIN whenever the fast
 * path cannot give an answer.  Note that *fsearch and ndp->ni_filecaps may
 * already have been written when EAGAIN is returned.
 */
int
fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch)
{
	const struct filedescent *fde;
	const struct fdescenttbl *fdt;
	struct filedesc *fdp;
	struct file *fp;
	struct vnode *vp;
	const cap_rights_t *haverights;
	cap_rights_t rights;
	seqc_t seq;

	VFS_SMR_ASSERT_ENTERED();

	/* The rights needed are the caller's plus CAP_LOOKUP. */
	rights = *ndp->ni_rightsneeded;
	cap_rights_set_one(&rights, CAP_LOOKUP);

	fdp = curproc->p_fd;
	fdt = fdp->fd_files;
	if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
		return (EBADF);
	/* Snapshot the slot's sequence counter; bail if a write is underway. */
	seq = seqc_read_any(fd_seqc(fdt, fd));
	if (__predict_false(seqc_in_modify(seq)))
		return (EAGAIN);
	fde = &fdt->fdt_ofiles[fd];
	haverights = cap_rights_fde_inline(fde);
	fp = fde->fde_file;
	if (__predict_false(fp == NULL))
		return (EAGAIN);
	if (__predict_false(cap_check_inline_transient(haverights, &rights)))
		return (EAGAIN);
	*fsearch = ((fp->f_flag & FSEARCH) != 0);
	vp = fp->f_vnode;
	if (__predict_false(vp == NULL || vp->v_type != VDIR)) {
		return (EAGAIN);
	}
	if (!filecaps_copy(&fde->fde_caps, &ndp->ni_filecaps, false)) {
		return (EAGAIN);
	}
	/*
	 * Use an acquire barrier to force re-reading of fdt so it is
	 * refreshed for verification.
	 */
	atomic_thread_fence_acq();
	fdt = fdp->fd_files;
	/* Everything read above is only valid if the counter is unchanged. */
	if (__predict_false(!seqc_consistent_nomb(fd_seqc(fdt, fd), seq)))
		return (EAGAIN);
	/*
	 * If file descriptor doesn't have all rights,
	 * all lookups relative to it must also be
	 * strictly relative.
	 *
	 * Not yet supported by fast path.
	 */
	CAP_ALL(&rights);
	if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) ||
	    ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
	    ndp->ni_filecaps.fc_nioctls != -1) {
#ifdef notyet
		ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
#else
		return (EAGAIN);
#endif
	}
	*vpp = vp;
	return (0);
}
#else
/*
 * Variant built without CAPABILITIES: same contract, but the slot is
 * validated by re-reading the file pointer instead of a sequence counter,
 * and ndp->ni_filecaps is simply filled via filecaps_fill() on success.
 */
int
fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch)
{
	const struct fdescenttbl *fdt;
	struct filedesc *fdp;
	struct file *fp;
	struct vnode *vp;

	VFS_SMR_ASSERT_ENTERED();

	fdp = curproc->p_fd;
	fdt = fdp->fd_files;
	if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
		return (EBADF);
	fp = fdt->fdt_ofiles[fd].fde_file;
	if (__predict_false(fp == NULL))
		return (EAGAIN);
	*fsearch = ((fp->f_flag & FSEARCH) != 0);
	vp = fp->f_vnode;
	if (__predict_false(vp == NULL || vp->v_type != VDIR)) {
		return (EAGAIN);
	}
	/*
	 * Use an acquire barrier to force re-reading of fdt so it is
	 * refreshed for verification.
	 */
	atomic_thread_fence_acq();
	fdt = fdp->fd_files;
	/* The slot must still hold the file we inspected. */
	if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file))
		return (EAGAIN);
	filecaps_fill(&ndp->ni_filecaps);
	*vpp = vp;
	return (0);
}
#endif
2815
/*
 * Lockless lookup of descriptor 'fd' in 'fdp'.
 *
 * On success returns 0 with a new reference on the file stored in *fpp.
 * With CAPABILITIES the descriptor's rights are checked against
 * 'needrightsp' and, if 'seqp' is not NULL, the sequence counter value the
 * lookup was validated against is stored in *seqp.  Without CAPABILITIES
 * *seqp is left untouched.
 *
 * Returns EBADF for an out-of-range or empty slot, or when the reference
 * count could not be raised although it was non-zero; with CAPABILITIES
 * it may also return the error from cap_check_inline().
 */
int
fget_unlocked_seq(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
    struct file **fpp, seqc_t *seqp)
{
#ifdef CAPABILITIES
	const struct filedescent *fde;
#endif
	const struct fdescenttbl *fdt;
	struct file *fp;
#ifdef CAPABILITIES
	seqc_t seq;
	cap_rights_t haverights;
	int error;
#endif

	fdt = fdp->fd_files;
	if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
		return (EBADF);
	/*
	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
	 * never raising a refcount above 0.  To accomplish this we have
	 * to use a cmpset loop rather than an atomic_add.  The descriptor
	 * must be re-verified once we acquire a reference to be certain
	 * that the identity is still correct and we did not lose a race
	 * due to preemption.
	 */
	for (;;) {
#ifdef CAPABILITIES
		/* Take a consistent snapshot of rights and file pointer. */
		seq = seqc_read(fd_seqc(fdt, fd));
		fde = &fdt->fdt_ofiles[fd];
		haverights = *cap_rights_fde_inline(fde);
		fp = fde->fde_file;
		if (!seqc_consistent(fd_seqc(fdt, fd), seq))
			continue;
#else
		fp = fdt->fdt_ofiles[fd].fde_file;
#endif
		if (fp == NULL)
			return (EBADF);
#ifdef CAPABILITIES
		error = cap_check_inline(&haverights, needrightsp);
		if (error != 0)
			return (error);
#endif
		if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) {
			/*
			 * The count was found either saturated or zero.
			 * This re-read is not any more racy than using the
			 * return value from fcmpset.
			 */
			if (fp->f_count != 0)
				return (EBADF);
			/*
			 * Force a reload. Other thread could reallocate the
			 * table before this fd was closed, so it is possible
			 * that there is a stale fp pointer in cached version.
			 */
			fdt = atomic_load_ptr(&fdp->fd_files);
			continue;
		}
		/*
		 * Use an acquire barrier to force re-reading of fdt so it is
		 * refreshed for verification.
		 */
		atomic_thread_fence_acq();
		fdt = fdp->fd_files;
		/* Keep the reference only if the slot is still what we read. */
#ifdef	CAPABILITIES
		if (seqc_consistent_nomb(fd_seqc(fdt, fd), seq))
#else
		if (fp == fdt->fdt_ofiles[fd].fde_file)
#endif
			break;
		/* Lost the race: give the reference back and start over. */
		fdrop(fp, curthread);
	}
	*fpp = fp;
	if (seqp != NULL) {
#ifdef CAPABILITIES
		*seqp = seq;
#endif
	}
	return (0);
}
2898
/*
 * See the comments in fget_unlocked_seq for an explanation of how this works.
 *
 * This is a simplified variant which bails out to the aforementioned routine
 * if anything goes wrong. In practice this only happens when userspace is
 * racing with itself.
 *
 * Returns 0 with a referenced file in *fpp, EBADF for an out-of-range
 * descriptor, or whatever fget_unlocked_seq() returns on the fallback path.
 */
int
fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
    struct file **fpp)
{
#ifdef CAPABILITIES
	const struct filedescent *fde;
#endif
	const struct fdescenttbl *fdt;
	struct file *fp;
#ifdef CAPABILITIES
	seqc_t seq;
	const cap_rights_t *haverights;
#endif

	fdt = fdp->fd_files;
	if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
		return (EBADF);
#ifdef CAPABILITIES
	/* Single attempt: do not wait out a concurrent modification. */
	seq = seqc_read_any(fd_seqc(fdt, fd));
	if (__predict_false(seqc_in_modify(seq)))
		goto out_fallback;
	fde = &fdt->fdt_ofiles[fd];
	haverights = cap_rights_fde_inline(fde);
	fp = fde->fde_file;
#else
	fp = fdt->fdt_ofiles[fd].fde_file;
#endif
	/* Even an empty slot is left to the slow path to report. */
	if (__predict_false(fp == NULL))
		goto out_fallback;
#ifdef CAPABILITIES
	if (__predict_false(cap_check_inline_transient(haverights, needrightsp)))
		goto out_fallback;
#endif
	if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count)))
		goto out_fallback;

	/*
	 * Use an acquire barrier to force re-reading of fdt so it is
	 * refreshed for verification.
	 */
	atomic_thread_fence_acq();
	fdt = fdp->fd_files;
#ifdef	CAPABILITIES
	if (__predict_false(!seqc_consistent_nomb(fd_seqc(fdt, fd), seq)))
#else
	if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file))
#endif
		goto out_fdrop;
	*fpp = fp;
	return (0);
out_fdrop:
	/* The slot changed after we took the reference: undo and retry. */
	fdrop(fp, curthread);
out_fallback:
	return (fget_unlocked_seq(fdp, fd, needrightsp, fpp, NULL));
}
2961
2962/*
2963 * Extract the file pointer associated with the specified descriptor for the
2964 * current user process.
2965 *
2966 * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
2967 * returned.
2968 *
2969 * File's rights will be checked against the capability rights mask.
2970 *
2971 * If an error occurred the non-zero error is returned and *fpp is set to
2972 * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
2973 * responsible for fdrop().
2974 */
2975static __inline int
2976_fget(struct thread *td, int fd, struct file **fpp, int flags,
2977    cap_rights_t *needrightsp)
2978{
2979	struct filedesc *fdp;
2980	struct file *fp;
2981	int error;
2982
2983	*fpp = NULL;
2984	fdp = td->td_proc->p_fd;
2985	error = fget_unlocked(fdp, fd, needrightsp, &fp);
2986	if (__predict_false(error != 0))
2987		return (error);
2988	if (__predict_false(fp->f_ops == &badfileops)) {
2989		fdrop(fp, td);
2990		return (EBADF);
2991	}
2992
2993	/*
2994	 * FREAD and FWRITE failure return EBADF as per POSIX.
2995	 */
2996	error = 0;
2997	switch (flags) {
2998	case FREAD:
2999	case FWRITE:
3000		if ((fp->f_flag & flags) == 0)
3001			error = EBADF;
3002		break;
3003	case FEXEC:
3004	    	if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
3005		    ((fp->f_flag & FWRITE) != 0))
3006			error = EBADF;
3007		break;
3008	case 0:
3009		break;
3010	default:
3011		KASSERT(0, ("wrong flags"));
3012	}
3013
3014	if (error != 0) {
3015		fdrop(fp, td);
3016		return (error);
3017	}
3018
3019	*fpp = fp;
3020	return (0);
3021}
3022
3023int
3024fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
3025{
3026
3027	return (_fget(td, fd, fpp, 0, rightsp));
3028}
3029
3030int
3031fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp,
3032    struct file **fpp)
3033{
3034	int error;
3035#ifndef CAPABILITIES
3036	error = _fget(td, fd, fpp, 0, rightsp);
3037	if (maxprotp != NULL)
3038		*maxprotp = VM_PROT_ALL;
3039	return (error);
3040#else
3041	cap_rights_t fdrights;
3042	struct filedesc *fdp;
3043	struct file *fp;
3044	seqc_t seq;
3045
3046	*fpp = NULL;
3047	fdp = td->td_proc->p_fd;
3048	MPASS(cap_rights_is_set(rightsp, CAP_MMAP));
3049	for (;;) {
3050		error = fget_unlocked_seq(fdp, fd, rightsp, &fp, &seq);
3051		if (__predict_false(error != 0))
3052			return (error);
3053		if (__predict_false(fp->f_ops == &badfileops)) {
3054			fdrop(fp, td);
3055			return (EBADF);
3056		}
3057		if (maxprotp != NULL)
3058			fdrights = *cap_rights(fdp, fd);
3059		if (!fd_modified(fdp, fd, seq))
3060			break;
3061		fdrop(fp, td);
3062	}
3063
3064	/*
3065	 * If requested, convert capability rights to access flags.
3066	 */
3067	if (maxprotp != NULL)
3068		*maxprotp = cap_rights_to_vmprot(&fdrights);
3069	*fpp = fp;
3070	return (0);
3071#endif
3072}
3073
3074int
3075fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
3076{
3077
3078	return (_fget(td, fd, fpp, FREAD, rightsp));
3079}
3080
3081int
3082fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
3083{
3084
3085	return (_fget(td, fd, fpp, FWRITE, rightsp));
3086}
3087
3088int
3089fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl,
3090    struct file **fpp)
3091{
3092	struct filedesc *fdp = td->td_proc->p_fd;
3093#ifndef CAPABILITIES
3094	return (fget_unlocked(fdp, fd, rightsp, fpp));
3095#else
3096	struct file *fp;
3097	int error;
3098	seqc_t seq;
3099
3100	*fpp = NULL;
3101	MPASS(cap_rights_is_set(rightsp, CAP_FCNTL));
3102	for (;;) {
3103		error = fget_unlocked_seq(fdp, fd, rightsp, &fp, &seq);
3104		if (error != 0)
3105			return (error);
3106		error = cap_fcntl_check(fdp, fd, needfcntl);
3107		if (!fd_modified(fdp, fd, seq))
3108			break;
3109		fdrop(fp, td);
3110	}
3111	if (error != 0) {
3112		fdrop(fp, td);
3113		return (error);
3114	}
3115	*fpp = fp;
3116	return (0);
3117#endif
3118}
3119
3120/*
3121 * Like fget() but loads the underlying vnode, or returns an error if the
3122 * descriptor does not represent a vnode.  Note that pipes use vnodes but
3123 * never have VM objects.  The returned vnode will be vref()'d.
3124 *
3125 * XXX: what about the unused flags ?
3126 */
3127static __inline int
3128_fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
3129    struct vnode **vpp)
3130{
3131	struct file *fp;
3132	int error;
3133
3134	*vpp = NULL;
3135	error = _fget(td, fd, &fp, flags, needrightsp);
3136	if (error != 0)
3137		return (error);
3138	if (fp->f_vnode == NULL) {
3139		error = EINVAL;
3140	} else {
3141		*vpp = fp->f_vnode;
3142		vrefact(*vpp);
3143	}
3144	fdrop(fp, td);
3145
3146	return (error);
3147}
3148
3149int
3150fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
3151{
3152
3153	return (_fgetvp(td, fd, 0, rightsp, vpp));
3154}
3155
3156int
3157fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
3158    struct filecaps *havecaps, struct vnode **vpp)
3159{
3160	struct filecaps caps;
3161	struct file *fp;
3162	int error;
3163
3164	error = fget_cap(td, fd, needrightsp, &fp, &caps);
3165	if (error != 0)
3166		return (error);
3167	if (fp->f_ops == &badfileops) {
3168		error = EBADF;
3169		goto out;
3170	}
3171	if (fp->f_vnode == NULL) {
3172		error = EINVAL;
3173		goto out;
3174	}
3175
3176	*havecaps = caps;
3177	*vpp = fp->f_vnode;
3178	vrefact(*vpp);
3179	fdrop(fp, td);
3180
3181	return (0);
3182out:
3183	filecaps_free(&caps);
3184	fdrop(fp, td);
3185	return (error);
3186}
3187
3188int
3189fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
3190{
3191
3192	return (_fgetvp(td, fd, FREAD, rightsp, vpp));
3193}
3194
3195int
3196fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
3197{
3198
3199	return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
3200}
3201
#ifdef notyet
/*
 * _fgetvp() requiring that the file is open for writing (FWRITE).
 * Compiled only when 'notyet' is defined.
 */
int
fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
    struct vnode **vpp)
{
	int error;

	error = _fgetvp(td, fd, FWRITE, rightsp, vpp);
	return (error);
}
#endif
3211
3212/*
3213 * Handle the last reference to a file being closed.
3214 *
3215 * Without the noinline attribute clang keeps inlining the func thorough this
3216 * file when fdrop is used.
3217 */
3218int __noinline
3219_fdrop(struct file *fp, struct thread *td)
3220{
3221	int error;
3222
3223	if (fp->f_count != 0)
3224		panic("fdrop: count %d", fp->f_count);
3225	error = fo_close(fp, td);
3226	atomic_subtract_int(&openfiles, 1);
3227	crfree(fp->f_cred);
3228	free(fp->f_advice, M_FADVISE);
3229	uma_zfree(file_zone, fp);
3230
3231	return (error);
3232}
3233
3234/*
3235 * Apply an advisory lock on a file descriptor.
3236 *
3237 * Just attempt to get a record lock of the requested type on the entire file
3238 * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
3239 */
3240#ifndef _SYS_SYSPROTO_H_
3241struct flock_args {
3242	int	fd;
3243	int	how;
3244};
3245#endif
3246/* ARGSUSED */
3247int
3248sys_flock(struct thread *td, struct flock_args *uap)
3249{
3250	struct file *fp;
3251	struct vnode *vp;
3252	struct flock lf;
3253	int error;
3254
3255	error = fget(td, uap->fd, &cap_flock_rights, &fp);
3256	if (error != 0)
3257		return (error);
3258	if (fp->f_type != DTYPE_VNODE) {
3259		fdrop(fp, td);
3260		return (EOPNOTSUPP);
3261	}
3262
3263	vp = fp->f_vnode;
3264	lf.l_whence = SEEK_SET;
3265	lf.l_start = 0;
3266	lf.l_len = 0;
3267	if (uap->how & LOCK_UN) {
3268		lf.l_type = F_UNLCK;
3269		atomic_clear_int(&fp->f_flag, FHASLOCK);
3270		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
3271		goto done2;
3272	}
3273	if (uap->how & LOCK_EX)
3274		lf.l_type = F_WRLCK;
3275	else if (uap->how & LOCK_SH)
3276		lf.l_type = F_RDLCK;
3277	else {
3278		error = EBADF;
3279		goto done2;
3280	}
3281	atomic_set_int(&fp->f_flag, FHASLOCK);
3282	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
3283	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
3284done2:
3285	fdrop(fp, td);
3286	return (error);
3287}
3288/*
3289 * Duplicate the specified descriptor to a free descriptor.
3290 */
3291int
3292dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
3293    int openerror, int *indxp)
3294{
3295	struct filedescent *newfde, *oldfde;
3296	struct file *fp;
3297	u_long *ioctls;
3298	int error, indx;
3299
3300	KASSERT(openerror == ENODEV || openerror == ENXIO,
3301	    ("unexpected error %d in %s", openerror, __func__));
3302
3303	/*
3304	 * If the to-be-dup'd fd number is greater than the allowed number
3305	 * of file descriptors, or the fd to be dup'd has already been
3306	 * closed, then reject.
3307	 */
3308	FILEDESC_XLOCK(fdp);
3309	if ((fp = fget_locked(fdp, dfd)) == NULL) {
3310		FILEDESC_XUNLOCK(fdp);
3311		return (EBADF);
3312	}
3313
3314	error = fdalloc(td, 0, &indx);
3315	if (error != 0) {
3316		FILEDESC_XUNLOCK(fdp);
3317		return (error);
3318	}
3319
3320	/*
3321	 * There are two cases of interest here.
3322	 *
3323	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
3324	 *
3325	 * For ENXIO steal away the file structure from (dfd) and store it in
3326	 * (indx).  (dfd) is effectively closed by this operation.
3327	 */
3328	switch (openerror) {
3329	case ENODEV:
3330		/*
3331		 * Check that the mode the file is being opened for is a
3332		 * subset of the mode of the existing descriptor.
3333		 */
3334		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
3335			fdunused(fdp, indx);
3336			FILEDESC_XUNLOCK(fdp);
3337			return (EACCES);
3338		}
3339		if (!fhold(fp)) {
3340			fdunused(fdp, indx);
3341			FILEDESC_XUNLOCK(fdp);
3342			return (EBADF);
3343		}
3344		newfde = &fdp->fd_ofiles[indx];
3345		oldfde = &fdp->fd_ofiles[dfd];
3346		ioctls = filecaps_copy_prep(&oldfde->fde_caps);
3347#ifdef CAPABILITIES
3348		seqc_write_begin(&newfde->fde_seqc);
3349#endif
3350		memcpy(newfde, oldfde, fde_change_size);
3351		filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
3352		    ioctls);
3353#ifdef CAPABILITIES
3354		seqc_write_end(&newfde->fde_seqc);
3355#endif
3356		break;
3357	case ENXIO:
3358		/*
3359		 * Steal away the file pointer from dfd and stuff it into indx.
3360		 */
3361		newfde = &fdp->fd_ofiles[indx];
3362		oldfde = &fdp->fd_ofiles[dfd];
3363#ifdef CAPABILITIES
3364		seqc_write_begin(&newfde->fde_seqc);
3365#endif
3366		memcpy(newfde, oldfde, fde_change_size);
3367		oldfde->fde_file = NULL;
3368		fdunused(fdp, dfd);
3369#ifdef CAPABILITIES
3370		seqc_write_end(&newfde->fde_seqc);
3371#endif
3372		break;
3373	}
3374	FILEDESC_XUNLOCK(fdp);
3375	*indxp = indx;
3376	return (0);
3377}
3378
/*
 * This sysctl determines if we will allow a process to chroot(2) if it
 * has a directory open:
 *	0: disallowed for all processes.
 *	1: allowed for processes that were not already chroot(2)'ed.
 *	2: allowed for all processes.
 *
 * The default is 1.  The value is exported read-write as
 * kern.chroot_allow_open_directories and is consulted by pwd_chroot().
 */

static int chroot_allow_open_directories = 1;

SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
    &chroot_allow_open_directories, 0,
    "Allow a process to chroot(2) if it has a directory open");
3392
3393/*
3394 * Helper function for raised chroot(2) security function:  Refuse if
3395 * any filedescriptors are open directories.
3396 */
3397static int
3398chroot_refuse_vdir_fds(struct filedesc *fdp)
3399{
3400	struct vnode *vp;
3401	struct file *fp;
3402	int fd, lastfile;
3403
3404	FILEDESC_LOCK_ASSERT(fdp);
3405
3406	lastfile = fdlastfile(fdp);
3407	for (fd = 0; fd <= lastfile; fd++) {
3408		fp = fget_locked(fdp, fd);
3409		if (fp == NULL)
3410			continue;
3411		if (fp->f_type == DTYPE_VNODE) {
3412			vp = fp->f_vnode;
3413			if (vp->v_type == VDIR)
3414				return (EPERM);
3415		}
3416	}
3417	return (0);
3418}
3419
3420static void
3421pwd_fill(struct pwd *oldpwd, struct pwd *newpwd)
3422{
3423
3424	if (newpwd->pwd_cdir == NULL && oldpwd->pwd_cdir != NULL) {
3425		vrefact(oldpwd->pwd_cdir);
3426		newpwd->pwd_cdir = oldpwd->pwd_cdir;
3427	}
3428
3429	if (newpwd->pwd_rdir == NULL && oldpwd->pwd_rdir != NULL) {
3430		vrefact(oldpwd->pwd_rdir);
3431		newpwd->pwd_rdir = oldpwd->pwd_rdir;
3432	}
3433
3434	if (newpwd->pwd_jdir == NULL && oldpwd->pwd_jdir != NULL) {
3435		vrefact(oldpwd->pwd_jdir);
3436		newpwd->pwd_jdir = oldpwd->pwd_jdir;
3437	}
3438}
3439
3440struct pwd *
3441pwd_hold_filedesc(struct filedesc *fdp)
3442{
3443	struct pwd *pwd;
3444
3445	FILEDESC_LOCK_ASSERT(fdp);
3446	pwd = FILEDESC_LOCKED_LOAD_PWD(fdp);
3447	if (pwd != NULL)
3448		refcount_acquire(&pwd->pwd_refcount);
3449	return (pwd);
3450}
3451
3452bool
3453pwd_hold_smr(struct pwd *pwd)
3454{
3455
3456	MPASS(pwd != NULL);
3457	if (__predict_true(refcount_acquire_if_not_zero(&pwd->pwd_refcount))) {
3458		return (true);
3459	}
3460	return (false);
3461}
3462
3463struct pwd *
3464pwd_hold(struct thread *td)
3465{
3466	struct filedesc *fdp;
3467	struct pwd *pwd;
3468
3469	fdp = td->td_proc->p_fd;
3470
3471	vfs_smr_enter();
3472	pwd = vfs_smr_entered_load(&fdp->fd_pwd);
3473	if (pwd_hold_smr(pwd)) {
3474		vfs_smr_exit();
3475		return (pwd);
3476	}
3477	vfs_smr_exit();
3478	FILEDESC_SLOCK(fdp);
3479	pwd = pwd_hold_filedesc(fdp);
3480	MPASS(pwd != NULL);
3481	FILEDESC_SUNLOCK(fdp);
3482	return (pwd);
3483}
3484
3485struct pwd *
3486pwd_get_smr(void)
3487{
3488	struct pwd *pwd;
3489
3490	pwd = vfs_smr_entered_load(&curproc->p_fd->fd_pwd);
3491	MPASS(pwd != NULL);
3492	return (pwd);
3493}
3494
3495static struct pwd *
3496pwd_alloc(void)
3497{
3498	struct pwd *pwd;
3499
3500	pwd = uma_zalloc_smr(pwd_zone, M_WAITOK);
3501	bzero(pwd, sizeof(*pwd));
3502	refcount_init(&pwd->pwd_refcount, 1);
3503	return (pwd);
3504}
3505
3506void
3507pwd_drop(struct pwd *pwd)
3508{
3509
3510	if (!refcount_release(&pwd->pwd_refcount))
3511		return;
3512
3513	if (pwd->pwd_cdir != NULL)
3514		vrele(pwd->pwd_cdir);
3515	if (pwd->pwd_rdir != NULL)
3516		vrele(pwd->pwd_rdir);
3517	if (pwd->pwd_jdir != NULL)
3518		vrele(pwd->pwd_jdir);
3519	uma_zfree_smr(pwd_zone, pwd);
3520}
3521
3522/*
3523* Common routine for kern_chroot() and jail_attach().  The caller is
3524* responsible for invoking priv_check() and mac_vnode_check_chroot() to
3525* authorize this operation.
3526*/
3527int
3528pwd_chroot(struct thread *td, struct vnode *vp)
3529{
3530	struct filedesc *fdp;
3531	struct pwd *newpwd, *oldpwd;
3532	int error;
3533
3534	fdp = td->td_proc->p_fd;
3535	newpwd = pwd_alloc();
3536	FILEDESC_XLOCK(fdp);
3537	oldpwd = FILEDESC_XLOCKED_LOAD_PWD(fdp);
3538	if (chroot_allow_open_directories == 0 ||
3539	    (chroot_allow_open_directories == 1 &&
3540	    oldpwd->pwd_rdir != rootvnode)) {
3541		error = chroot_refuse_vdir_fds(fdp);
3542		if (error != 0) {
3543			FILEDESC_XUNLOCK(fdp);
3544			pwd_drop(newpwd);
3545			return (error);
3546		}
3547	}
3548
3549	vrefact(vp);
3550	newpwd->pwd_rdir = vp;
3551	if (oldpwd->pwd_jdir == NULL) {
3552		vrefact(vp);
3553		newpwd->pwd_jdir = vp;
3554	}
3555	pwd_fill(oldpwd, newpwd);
3556	pwd_set(fdp, newpwd);
3557	FILEDESC_XUNLOCK(fdp);
3558	pwd_drop(oldpwd);
3559	return (0);
3560}
3561
3562void
3563pwd_chdir(struct thread *td, struct vnode *vp)
3564{
3565	struct filedesc *fdp;
3566	struct pwd *newpwd, *oldpwd;
3567
3568	VNPASS(vp->v_usecount > 0, vp);
3569
3570	newpwd = pwd_alloc();
3571	fdp = td->td_proc->p_fd;
3572	FILEDESC_XLOCK(fdp);
3573	oldpwd = FILEDESC_XLOCKED_LOAD_PWD(fdp);
3574	newpwd->pwd_cdir = vp;
3575	pwd_fill(oldpwd, newpwd);
3576	pwd_set(fdp, newpwd);
3577	FILEDESC_XUNLOCK(fdp);
3578	pwd_drop(oldpwd);
3579}
3580
3581void
3582pwd_ensure_dirs(void)
3583{
3584	struct filedesc *fdp;
3585	struct pwd *oldpwd, *newpwd;
3586
3587	fdp = curproc->p_fd;
3588	FILEDESC_XLOCK(fdp);
3589	oldpwd = FILEDESC_XLOCKED_LOAD_PWD(fdp);
3590	if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL) {
3591		FILEDESC_XUNLOCK(fdp);
3592		return;
3593	}
3594	FILEDESC_XUNLOCK(fdp);
3595
3596	newpwd = pwd_alloc();
3597	FILEDESC_XLOCK(fdp);
3598	oldpwd = FILEDESC_XLOCKED_LOAD_PWD(fdp);
3599	pwd_fill(oldpwd, newpwd);
3600	if (newpwd->pwd_cdir == NULL) {
3601		vrefact(rootvnode);
3602		newpwd->pwd_cdir = rootvnode;
3603	}
3604	if (newpwd->pwd_rdir == NULL) {
3605		vrefact(rootvnode);
3606		newpwd->pwd_rdir = rootvnode;
3607	}
3608	pwd_set(fdp, newpwd);
3609	FILEDESC_XUNLOCK(fdp);
3610	pwd_drop(oldpwd);
3611}
3612
3613void
3614pwd_set_rootvnode(void)
3615{
3616	struct filedesc *fdp;
3617	struct pwd *oldpwd, *newpwd;
3618
3619	fdp = curproc->p_fd;
3620
3621	newpwd = pwd_alloc();
3622	FILEDESC_XLOCK(fdp);
3623	oldpwd = FILEDESC_XLOCKED_LOAD_PWD(fdp);
3624	vrefact(rootvnode);
3625	newpwd->pwd_cdir = rootvnode;
3626	vrefact(rootvnode);
3627	newpwd->pwd_rdir = rootvnode;
3628	pwd_fill(oldpwd, newpwd);
3629	pwd_set(fdp, newpwd);
3630	FILEDESC_XUNLOCK(fdp);
3631	pwd_drop(oldpwd);
3632}
3633
/*
 * Scan all active processes and prisons to see if any of them have a current
 * or root directory of `olddp'. If so, replace them with the new mount point.
 *
 * For each process, every pwd slot (current, root and jail directory) that
 * equals olddp is switched to newdp.  The global rootvnode, prison0's
 * pr_root and the pr_root of every prison on the allprison list are
 * switched in the same way.
 */
void
mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
{
	struct filedesc *fdp;
	struct pwd *newpwd, *oldpwd;
	struct prison *pr;
	struct proc *p;
	int nrele;

	/* Return early when olddp has exactly one reference. */
	if (vrefcnt(olddp) == 1)
		return;
	/* Number of olddp references to release once all locks are dropped. */
	nrele = 0;
	/*
	 * Keep one spare pwd in hand; it is replenished after every process
	 * that consumes it and the unused spare is dropped after the loop.
	 */
	newpwd = pwd_alloc();
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		fdp = fdhold(p);
		PROC_UNLOCK(p);
		if (fdp == NULL)
			continue;
		FILEDESC_XLOCK(fdp);
		oldpwd = FILEDESC_XLOCKED_LOAD_PWD(fdp);
		/* Skip processes with no pwd or with no slot naming olddp. */
		if (oldpwd == NULL ||
		    (oldpwd->pwd_cdir != olddp &&
		    oldpwd->pwd_rdir != olddp &&
		    oldpwd->pwd_jdir != olddp)) {
			FILEDESC_XUNLOCK(fdp);
			fddrop(fdp);
			continue;
		}
		/* One reference on newdp per slot that is redirected. */
		if (oldpwd->pwd_cdir == olddp) {
			vrefact(newdp);
			newpwd->pwd_cdir = newdp;
		}
		if (oldpwd->pwd_rdir == olddp) {
			vrefact(newdp);
			newpwd->pwd_rdir = newdp;
		}
		if (oldpwd->pwd_jdir == olddp) {
			vrefact(newdp);
			newpwd->pwd_jdir = newdp;
		}
		pwd_fill(oldpwd, newpwd);
		pwd_set(fdp, newpwd);
		FILEDESC_XUNLOCK(fdp);
		pwd_drop(oldpwd);
		fddrop(fdp);
		newpwd = pwd_alloc();
	}
	sx_sunlock(&allproc_lock);
	pwd_drop(newpwd);
	if (rootvnode == olddp) {
		vrefact(newdp);
		rootvnode = newdp;
		nrele++;
	}
	mtx_lock(&prison0.pr_mtx);
	if (prison0.pr_root == olddp) {
		vrefact(newdp);
		prison0.pr_root = newdp;
		nrele++;
	}
	mtx_unlock(&prison0.pr_mtx);
	sx_slock(&allprison_lock);
	TAILQ_FOREACH(pr, &allprison, pr_list) {
		mtx_lock(&pr->pr_mtx);
		if (pr->pr_root == olddp) {
			vrefact(newdp);
			pr->pr_root = newdp;
			nrele++;
		}
		mtx_unlock(&pr->pr_mtx);
	}
	sx_sunlock(&allprison_lock);
	/* Release olddp once for every global/prison pointer replaced above. */
	while (nrele--)
		vrele(olddp);
}
3715
3716struct filedesc_to_leader *
3717filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
3718{
3719	struct filedesc_to_leader *fdtol;
3720
3721	fdtol = malloc(sizeof(struct filedesc_to_leader),
3722	    M_FILEDESC_TO_LEADER, M_WAITOK);
3723	fdtol->fdl_refcount = 1;
3724	fdtol->fdl_holdcount = 0;
3725	fdtol->fdl_wakeup = 0;
3726	fdtol->fdl_leader = leader;
3727	if (old != NULL) {
3728		FILEDESC_XLOCK(fdp);
3729		fdtol->fdl_next = old->fdl_next;
3730		fdtol->fdl_prev = old;
3731		old->fdl_next = fdtol;
3732		fdtol->fdl_next->fdl_prev = fdtol;
3733		FILEDESC_XUNLOCK(fdp);
3734	} else {
3735		fdtol->fdl_next = fdtol;
3736		fdtol->fdl_prev = fdtol;
3737	}
3738	return (fdtol);
3739}
3740
3741static int
3742sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS)
3743{
3744	NDSLOTTYPE *map;
3745	struct filedesc *fdp;
3746	int count, off, minoff;
3747
3748	if (*(int *)arg1 != 0)
3749		return (EINVAL);
3750
3751	fdp = curproc->p_fd;
3752	count = 0;
3753	FILEDESC_SLOCK(fdp);
3754	map = fdp->fd_map;
3755	off = NDSLOT(fdp->fd_nfiles - 1);
3756	for (minoff = NDSLOT(0); off >= minoff; --off)
3757		count += bitcountl(map[off]);
3758	FILEDESC_SUNLOCK(fdp);
3759
3760	return (SYSCTL_OUT(req, &count, sizeof(count)));
3761}
3762
/* Read-only node under _kern_proc served by sysctl_kern_proc_nfds(). */
static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds,
    CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds,
    "Number of open file descriptors");
3766
/*
 * Get file structures globally.
 *
 * Sysctl handler that emits one struct xfile per open descriptor of every
 * process the requester may see (p_cansee()).  When the request carries no
 * output buffer (req->oldptr == NULL) only a size estimate is reported.
 * Returns 0 or the first error from sysctl_wire_old_buffer()/SYSCTL_OUT().
 */
static int
sysctl_kern_file(SYSCTL_HANDLER_ARGS)
{
	struct xfile xf;
	struct filedesc *fdp;
	struct file *fp;
	struct proc *p;
	int error, n, lastfile;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	if (req->oldptr == NULL) {
		/* Sizing pass: sum the table sizes of all processes. */
		n = 0;
		sx_slock(&allproc_lock);
		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			if (p->p_state == PRS_NEW) {
				PROC_UNLOCK(p);
				continue;
			}
			fdp = fdhold(p);
			PROC_UNLOCK(p);
			if (fdp == NULL)
				continue;
			/* overestimates sparse tables. */
			n += fdp->fd_nfiles;
			fddrop(fdp);
		}
		sx_sunlock(&allproc_lock);
		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
	}
	error = 0;
	/* xf is reused for every record; zero it once up front. */
	bzero(&xf, sizeof(xf));
	xf.xf_size = sizeof(xf);
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		if (p->p_state == PRS_NEW) {
			PROC_UNLOCK(p);
			continue;
		}
		if (p_cansee(req->td, p) != 0) {
			PROC_UNLOCK(p);
			continue;
		}
		/* Per-process fields are captured while the proc lock is held. */
		xf.xf_pid = p->p_pid;
		xf.xf_uid = p->p_ucred->cr_uid;
		fdp = fdhold(p);
		PROC_UNLOCK(p);
		if (fdp == NULL)
			continue;
		FILEDESC_SLOCK(fdp);
		lastfile = fdlastfile(fdp);
		/* The scan stops early if fd_refcnt drops to zero. */
		for (n = 0; fdp->fd_refcnt > 0 && n <= lastfile; ++n) {
			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
				continue;
			xf.xf_fd = n;
			xf.xf_file = (uintptr_t)fp;
			xf.xf_data = (uintptr_t)fp->f_data;
			xf.xf_vnode = (uintptr_t)fp->f_vnode;
			xf.xf_type = (uintptr_t)fp->f_type;
			xf.xf_count = fp->f_count;
			xf.xf_msgcount = 0;
			xf.xf_offset = foffset_get(fp);
			xf.xf_flag = fp->f_flag;
			error = SYSCTL_OUT(req, &xf, sizeof(xf));
			if (error)
				break;
		}
		FILEDESC_SUNLOCK(fdp);
		fddrop(fdp);
		if (error)
			break;
	}
	sx_sunlock(&allproc_lock);
	return (error);
}
3848
/* Read-only opaque node under _kern served by sysctl_kern_file(). */
SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
    0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
3851
/*
 * When KINFO_FILE_SIZE is defined, fail the build if struct kinfo_file
 * does not have exactly that size.
 */
#ifdef KINFO_FILE_SIZE
CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
#endif
3855
3856static int
3857xlate_fflags(int fflags)
3858{
3859	static const struct {
3860		int	fflag;
3861		int	kf_fflag;
3862	} fflags_table[] = {
3863		{ FAPPEND, KF_FLAG_APPEND },
3864		{ FASYNC, KF_FLAG_ASYNC },
3865		{ FFSYNC, KF_FLAG_FSYNC },
3866		{ FHASLOCK, KF_FLAG_HASLOCK },
3867		{ FNONBLOCK, KF_FLAG_NONBLOCK },
3868		{ FREAD, KF_FLAG_READ },
3869		{ FWRITE, KF_FLAG_WRITE },
3870		{ O_CREAT, KF_FLAG_CREAT },
3871		{ O_DIRECT, KF_FLAG_DIRECT },
3872		{ O_EXCL, KF_FLAG_EXCL },
3873		{ O_EXEC, KF_FLAG_EXEC },
3874		{ O_EXLOCK, KF_FLAG_EXLOCK },
3875		{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
3876		{ O_SHLOCK, KF_FLAG_SHLOCK },
3877		{ O_TRUNC, KF_FLAG_TRUNC }
3878	};
3879	unsigned int i;
3880	int kflags;
3881
3882	kflags = 0;
3883	for (i = 0; i < nitems(fflags_table); i++)
3884		if (fflags & fflags_table[i].fflag)
3885			kflags |=  fflags_table[i].kf_fflag;
3886	return (kflags);
3887}
3888
3889/* Trim unused data from kf_path by truncating the structure size. */
3890void
3891pack_kinfo(struct kinfo_file *kif)
3892{
3893
3894	kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
3895	    strlen(kif->kf_path) + 1;
3896	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
3897}
3898
3899static void
3900export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp,
3901    struct kinfo_file *kif, struct filedesc *fdp, int flags)
3902{
3903	int error;
3904
3905	bzero(kif, sizeof(*kif));
3906
3907	/* Set a default type to allow for empty fill_kinfo() methods. */
3908	kif->kf_type = KF_TYPE_UNKNOWN;
3909	kif->kf_flags = xlate_fflags(fp->f_flag);
3910	if (rightsp != NULL)
3911		kif->kf_cap_rights = *rightsp;
3912	else
3913		cap_rights_init_zero(&kif->kf_cap_rights);
3914	kif->kf_fd = fd;
3915	kif->kf_ref_count = fp->f_count;
3916	kif->kf_offset = foffset_get(fp);
3917
3918	/*
3919	 * This may drop the filedesc lock, so the 'fp' cannot be
3920	 * accessed after this call.
3921	 */
3922	error = fo_fill_kinfo(fp, kif, fdp);
3923	if (error == 0)
3924		kif->kf_status |= KF_ATTR_VALID;
3925	if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
3926		pack_kinfo(kif);
3927	else
3928		kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
3929}
3930
3931static void
3932export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags,
3933    struct kinfo_file *kif, int flags)
3934{
3935	int error;
3936
3937	bzero(kif, sizeof(*kif));
3938
3939	kif->kf_type = KF_TYPE_VNODE;
3940	error = vn_fill_kinfo_vnode(vp, kif);
3941	if (error == 0)
3942		kif->kf_status |= KF_ATTR_VALID;
3943	kif->kf_flags = xlate_fflags(fflags);
3944	cap_rights_init_zero(&kif->kf_cap_rights);
3945	kif->kf_fd = fd;
3946	kif->kf_ref_count = -1;
3947	kif->kf_offset = -1;
3948	if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
3949		pack_kinfo(kif);
3950	else
3951		kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
3952	vrele(vp);
3953}
3954
/*
 * Working state shared by the export_*_to_sb() helpers while descriptor
 * records are appended to an sbuf.
 */
struct export_fd_buf {
	/* Descriptor table whose lock the helpers drop/retake; may be NULL. */
	struct filedesc		*fdp;
	/* Destination buffer for the exported records. */
	struct sbuf 		*sb;
	/* Bytes still allowed; -1 means no limit, 0 means export terminated. */
	ssize_t			remainder;
	/* Scratch record filled in before each append. */
	struct kinfo_file	kif;
	/* KERN_FILEDESC_* flags handed to the export_*_to_kinfo() helpers. */
	int			flags;
};
3962
3963static int
3964export_kinfo_to_sb(struct export_fd_buf *efbuf)
3965{
3966	struct kinfo_file *kif;
3967
3968	kif = &efbuf->kif;
3969	if (efbuf->remainder != -1) {
3970		if (efbuf->remainder < kif->kf_structsize) {
3971			/* Terminate export. */
3972			efbuf->remainder = 0;
3973			return (0);
3974		}
3975		efbuf->remainder -= kif->kf_structsize;
3976	}
3977	return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM);
3978}
3979
3980static int
3981export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp,
3982    struct export_fd_buf *efbuf)
3983{
3984	int error;
3985
3986	if (efbuf->remainder == 0)
3987		return (0);
3988	export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp,
3989	    efbuf->flags);
3990	FILEDESC_SUNLOCK(efbuf->fdp);
3991	error = export_kinfo_to_sb(efbuf);
3992	FILEDESC_SLOCK(efbuf->fdp);
3993	return (error);
3994}
3995
3996static int
3997export_vnode_to_sb(struct vnode *vp, int fd, int fflags,
3998    struct export_fd_buf *efbuf)
3999{
4000	int error;
4001
4002	if (efbuf->remainder == 0)
4003		return (0);
4004	if (efbuf->fdp != NULL)
4005		FILEDESC_SUNLOCK(efbuf->fdp);
4006	export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags);
4007	error = export_kinfo_to_sb(efbuf);
4008	if (efbuf->fdp != NULL)
4009		FILEDESC_SLOCK(efbuf->fdp);
4010	return (error);
4011}
4012
/*
 * Store a process file descriptor information to sbuf.
 *
 * Takes a locked proc as argument, and returns with the proc unlocked.
 *
 * Records are emitted in this order: ktrace vnode, text vnode, controlling
 * tty, then (when the process still has a descriptor table) the current,
 * root and jail directories followed by every open descriptor.  maxlen is
 * the byte budget (-1 for unlimited) and flags is passed through to the
 * export_*_to_kinfo() helpers.  Returns 0 or the error from exporting an
 * open descriptor.
 */
int
kern_proc_filedesc_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen,
    int flags)
{
	struct file *fp;
	struct filedesc *fdp;
	struct export_fd_buf *efbuf;
	struct vnode *cttyvp, *textvp, *tracevp;
	struct pwd *pwd;
	int error, i, lastfile;
	cap_rights_t rights;

	PROC_LOCK_ASSERT(p, MA_OWNED);

	/*
	 * Reference the per-process vnodes while the proc lock is held; they
	 * are exported after the lock has been dropped.
	 */
	/* ktrace vnode */
	tracevp = p->p_tracevp;
	if (tracevp != NULL)
		vrefact(tracevp);
	/* text vnode */
	textvp = p->p_textvp;
	if (textvp != NULL)
		vrefact(textvp);
	/* Controlling tty. */
	cttyvp = NULL;
	if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
		cttyvp = p->p_pgrp->pg_session->s_ttyvp;
		if (cttyvp != NULL)
			vrefact(cttyvp);
	}
	fdp = fdhold(p);
	PROC_UNLOCK(p);
	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
	/* No filedesc lock is held yet, so the helpers must not touch one. */
	efbuf->fdp = NULL;
	efbuf->sb = sb;
	efbuf->remainder = maxlen;
	efbuf->flags = flags;
	/* The results of these three exports are not checked. */
	if (tracevp != NULL)
		export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE,
		    efbuf);
	if (textvp != NULL)
		export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf);
	if (cttyvp != NULL)
		export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE,
		    efbuf);
	error = 0;
	if (fdp == NULL)
		goto fail;
	efbuf->fdp = fdp;
	FILEDESC_SLOCK(fdp);
	/* Hold the pwd so it stays usable while the helpers drop the lock. */
	pwd = pwd_hold_filedesc(fdp);
	if (pwd != NULL) {
		/* working directory */
		if (pwd->pwd_cdir != NULL) {
			vrefact(pwd->pwd_cdir);
			export_vnode_to_sb(pwd->pwd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
		}
		/* root directory */
		if (pwd->pwd_rdir != NULL) {
			vrefact(pwd->pwd_rdir);
			export_vnode_to_sb(pwd->pwd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf);
		}
		/* jail directory */
		if (pwd->pwd_jdir != NULL) {
			vrefact(pwd->pwd_jdir);
			export_vnode_to_sb(pwd->pwd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf);
		}
	}
	lastfile = fdlastfile(fdp);
	for (i = 0; fdp->fd_refcnt > 0 && i <= lastfile; i++) {
		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
			continue;
#ifdef CAPABILITIES
		rights = *cap_rights(fdp, i);
#else /* !CAPABILITIES */
		rights = cap_no_rights;
#endif
		/*
		 * Create sysctl entry.  It is OK to drop the filedesc
		 * lock inside of export_file_to_sb() as we will
		 * re-validate and re-evaluate its properties when the
		 * loop continues.
		 */
		error = export_file_to_sb(fp, i, &rights, efbuf);
		/* Stop on error or once the byte budget has been exhausted. */
		if (error != 0 || efbuf->remainder == 0)
			break;
	}
	FILEDESC_SUNLOCK(fdp);
	if (pwd != NULL)
		pwd_drop(pwd);
	fddrop(fdp);
fail:
	free(efbuf, M_TEMP);
	return (error);
}
4112
4113#define FILEDESC_SBUF_SIZE	(sizeof(struct kinfo_file) * 5)
4114
4115/*
4116 * Get per-process file descriptors for use by procstat(1), et al.
4117 */
4118static int
4119sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
4120{
4121	struct sbuf sb;
4122	struct proc *p;
4123	ssize_t maxlen;
4124	int error, error2, *name;
4125
4126	name = (int *)arg1;
4127
4128	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
4129	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
4130	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
4131	if (error != 0) {
4132		sbuf_delete(&sb);
4133		return (error);
4134	}
4135	maxlen = req->oldptr != NULL ? req->oldlen : -1;
4136	error = kern_proc_filedesc_out(p, &sb, maxlen,
4137	    KERN_FILEDESC_PACK_KINFO);
4138	error2 = sbuf_finish(&sb);
4139	sbuf_delete(&sb);
4140	return (error != 0 ? error : error2);
4141}
4142
4143#ifdef COMPAT_FREEBSD7
4144#ifdef KINFO_OFILE_SIZE
4145CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
4146#endif
4147
4148static void
4149kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif)
4150{
4151
4152	okif->kf_structsize = sizeof(*okif);
4153	okif->kf_type = kif->kf_type;
4154	okif->kf_fd = kif->kf_fd;
4155	okif->kf_ref_count = kif->kf_ref_count;
4156	okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE |
4157	    KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK |
4158	    KF_FLAG_DIRECT | KF_FLAG_HASLOCK);
4159	okif->kf_offset = kif->kf_offset;
4160	if (kif->kf_type == KF_TYPE_VNODE)
4161		okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type;
4162	else
4163		okif->kf_vnode_type = KF_VTYPE_VNON;
4164	strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path));
4165	if (kif->kf_type == KF_TYPE_SOCKET) {
4166		okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0;
4167		okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0;
4168		okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0;
4169		okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local;
4170		okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer;
4171	} else {
4172		okif->kf_sa_local.ss_family = AF_UNSPEC;
4173		okif->kf_sa_peer.ss_family = AF_UNSPEC;
4174	}
4175}
4176
/*
 * Emit one legacy (kinfo_ofile) record describing vnode vp under pseudo
 * descriptor number `type'.
 *
 * A reference is taken on vp here; export_vnode_to_kinfo() releases it.
 * Called with fdp share-locked; the lock is dropped while the record is
 * built and copied out, then retaken.  Returns the SYSCTL_OUT() result.
 */
static int
export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif,
    struct kinfo_ofile *okif, struct filedesc *fdp, struct sysctl_req *req)
{
	int error;

	vrefact(vp);
	FILEDESC_SUNLOCK(fdp);
	export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO);
	kinfo_to_okinfo(kif, okif);
	error = SYSCTL_OUT(req, okif, sizeof(*okif));
	FILEDESC_SLOCK(fdp);
	return (error);
}
4191
/*
 * Get per-process file descriptors for use by procstat(1), et al.
 *
 * Legacy variant that emits kinfo_ofile records.  The first name component
 * is the target pid.  Returns the pget() error, ENOENT when the process has
 * no descriptor table, and 0 otherwise.
 */
static int
sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
{
	struct kinfo_ofile *okif;
	struct kinfo_file *kif;
	struct filedesc *fdp;
	struct pwd *pwd;
	int error, i, lastfile, *name;
	struct file *fp;
	struct proc *p;

	name = (int *)arg1;
	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
	if (error != 0)
		return (error);
	fdp = fdhold(p);
	PROC_UNLOCK(p);
	if (fdp == NULL)
		return (ENOENT);
	/* Scratch records, reused for every entry that is emitted. */
	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
	okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK);
	FILEDESC_SLOCK(fdp);
	/* Hold the pwd so it stays usable while the helpers drop the lock. */
	pwd = pwd_hold_filedesc(fdp);
	if (pwd != NULL) {
		/* The results of the directory exports are not checked. */
		if (pwd->pwd_cdir != NULL)
			export_vnode_for_osysctl(pwd->pwd_cdir, KF_FD_TYPE_CWD, kif,
			    okif, fdp, req);
		if (pwd->pwd_rdir != NULL)
			export_vnode_for_osysctl(pwd->pwd_rdir, KF_FD_TYPE_ROOT, kif,
			    okif, fdp, req);
		if (pwd->pwd_jdir != NULL)
			export_vnode_for_osysctl(pwd->pwd_jdir, KF_FD_TYPE_JAIL, kif,
			    okif, fdp, req);
	}
	lastfile = fdlastfile(fdp);
	for (i = 0; fdp->fd_refcnt > 0 && i <= lastfile; i++) {
		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
			continue;
		export_file_to_kinfo(fp, i, NULL, kif, fdp,
		    KERN_FILEDESC_PACK_KINFO);
		/* Copy out with the filedesc lock dropped. */
		FILEDESC_SUNLOCK(fdp);
		kinfo_to_okinfo(kif, okif);
		error = SYSCTL_OUT(req, okif, sizeof(*okif));
		FILEDESC_SLOCK(fdp);
		if (error)
			break;
	}
	FILEDESC_SUNLOCK(fdp);
	if (pwd != NULL)
		pwd_drop(pwd);
	fddrop(fdp);
	free(kif, M_TEMP);
	free(okif, M_TEMP);
	/*
	 * NOTE(review): a SYSCTL_OUT() failure ends the loop above but is not
	 * propagated; the handler reports success regardless.  Confirm this
	 * is intentional before changing it.
	 */
	return (0);
}
4250
/* Read-only node under _kern_proc served by sysctl_kern_proc_ofiledesc(). */
static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
    "Process ofiledesc entries");
4254#endif	/* COMPAT_FREEBSD7 */
4255
4256int
4257vntype_to_kinfo(int vtype)
4258{
4259	struct {
4260		int	vtype;
4261		int	kf_vtype;
4262	} vtypes_table[] = {
4263		{ VBAD, KF_VTYPE_VBAD },
4264		{ VBLK, KF_VTYPE_VBLK },
4265		{ VCHR, KF_VTYPE_VCHR },
4266		{ VDIR, KF_VTYPE_VDIR },
4267		{ VFIFO, KF_VTYPE_VFIFO },
4268		{ VLNK, KF_VTYPE_VLNK },
4269		{ VNON, KF_VTYPE_VNON },
4270		{ VREG, KF_VTYPE_VREG },
4271		{ VSOCK, KF_VTYPE_VSOCK }
4272	};
4273	unsigned int i;
4274
4275	/*
4276	 * Perform vtype translation.
4277	 */
4278	for (i = 0; i < nitems(vtypes_table); i++)
4279		if (vtypes_table[i].vtype == vtype)
4280			return (vtypes_table[i].kf_vtype);
4281
4282	return (KF_VTYPE_UNKNOWN);
4283}
4284
/* Read-only node under _kern_proc served by sysctl_kern_proc_filedesc(). */
static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
    "Process filedesc entries");
4288
4289/*
4290 * Store a process current working directory information to sbuf.
4291 *
4292 * Takes a locked proc as argument, and returns with the proc unlocked.
4293 */
4294int
4295kern_proc_cwd_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen)
4296{
4297	struct filedesc *fdp;
4298	struct pwd *pwd;
4299	struct export_fd_buf *efbuf;
4300	struct vnode *cdir;
4301	int error;
4302
4303	PROC_LOCK_ASSERT(p, MA_OWNED);
4304
4305	fdp = fdhold(p);
4306	PROC_UNLOCK(p);
4307	if (fdp == NULL)
4308		return (EINVAL);
4309
4310	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
4311	efbuf->fdp = fdp;
4312	efbuf->sb = sb;
4313	efbuf->remainder = maxlen;
4314
4315	FILEDESC_SLOCK(fdp);
4316	pwd = FILEDESC_LOCKED_LOAD_PWD(fdp);
4317	cdir = pwd->pwd_cdir;
4318	if (cdir == NULL) {
4319		error = EINVAL;
4320	} else {
4321		vrefact(cdir);
4322		error = export_vnode_to_sb(cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
4323	}
4324	FILEDESC_SUNLOCK(fdp);
4325	fddrop(fdp);
4326	free(efbuf, M_TEMP);
4327	return (error);
4328}
4329
4330/*
4331 * Get per-process current working directory.
4332 */
4333static int
4334sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS)
4335{
4336	struct sbuf sb;
4337	struct proc *p;
4338	ssize_t maxlen;
4339	int error, error2, *name;
4340
4341	name = (int *)arg1;
4342
4343	sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req);
4344	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
4345	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
4346	if (error != 0) {
4347		sbuf_delete(&sb);
4348		return (error);
4349	}
4350	maxlen = req->oldptr != NULL ? req->oldlen : -1;
4351	error = kern_proc_cwd_out(p, &sb, maxlen);
4352	error2 = sbuf_finish(&sb);
4353	sbuf_delete(&sb);
4354	return (error != 0 ? error : error2);
4355}
4356
/* Read-only node under _kern_proc served by sysctl_kern_proc_cwd(). */
static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE,
    sysctl_kern_proc_cwd, "Process current working directory");
4359
4360#ifdef DDB
4361/*
4362 * For the purposes of debugging, generate a human-readable string for the
4363 * file type.
4364 */
4365static const char *
4366file_type_to_name(short type)
4367{
4368
4369	switch (type) {
4370	case 0:
4371		return ("zero");
4372	case DTYPE_VNODE:
4373		return ("vnode");
4374	case DTYPE_SOCKET:
4375		return ("socket");
4376	case DTYPE_PIPE:
4377		return ("pipe");
4378	case DTYPE_FIFO:
4379		return ("fifo");
4380	case DTYPE_KQUEUE:
4381		return ("kqueue");
4382	case DTYPE_CRYPTO:
4383		return ("crypto");
4384	case DTYPE_MQUEUE:
4385		return ("mqueue");
4386	case DTYPE_SHM:
4387		return ("shm");
4388	case DTYPE_SEM:
4389		return ("ksem");
4390	case DTYPE_PTS:
4391		return ("pts");
4392	case DTYPE_DEV:
4393		return ("dev");
4394	case DTYPE_PROCDESC:
4395		return ("proc");
4396	case DTYPE_LINUXEFD:
4397		return ("levent");
4398	case DTYPE_LINUXTFD:
4399		return ("ltimer");
4400	default:
4401		return ("unkn");
4402	}
4403}
4404
4405/*
4406 * For the purposes of debugging, identify a process (if any, perhaps one of
4407 * many) that references the passed file in its file descriptor array. Return
4408 * NULL if none.
4409 */
4410static struct proc *
4411file_to_first_proc(struct file *fp)
4412{
4413	struct filedesc *fdp;
4414	struct proc *p;
4415	int n;
4416
4417	FOREACH_PROC_IN_SYSTEM(p) {
4418		if (p->p_state == PRS_NEW)
4419			continue;
4420		fdp = p->p_fd;
4421		if (fdp == NULL)
4422			continue;
4423		for (n = 0; n < fdp->fd_nfiles; n++) {
4424			if (fp == fdp->fd_ofiles[n].fde_file)
4425				return (p);
4426		}
4427	}
4428	return (NULL);
4429}
4430
/*
 * Debugger helper: print a one-line summary of fp, preceded by a column
 * header line when `header' is non-zero.  The last two columns show the pid
 * and command name of the first process found referencing fp, or -1 and
 * "-" when there is none.
 */
static void
db_print_file(struct file *fp, int header)
{
/* Number of hex digits needed to print a pointer. */
#define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4))
	struct proc *p;

	if (header)
		db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n",
		    XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag",
		    "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID",
		    "FCmd");
	p = file_to_first_proc(fp);
	/* The GCFl and MCount columns are always printed as 0. */
	db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH,
	    fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data,
	    fp->f_flag, 0, fp->f_count, 0, XPTRWIDTH, fp->f_vnode,
	    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");

#undef XPTRWIDTH
}
4450
4451DB_SHOW_COMMAND(file, db_show_file)
4452{
4453	struct file *fp;
4454
4455	if (!have_addr) {
4456		db_printf("usage: show file <addr>\n");
4457		return;
4458	}
4459	fp = (struct file *)addr;
4460	db_print_file(fp, 1);
4461}
4462
4463DB_SHOW_COMMAND(files, db_show_files)
4464{
4465	struct filedesc *fdp;
4466	struct file *fp;
4467	struct proc *p;
4468	int header;
4469	int n;
4470
4471	header = 1;
4472	FOREACH_PROC_IN_SYSTEM(p) {
4473		if (p->p_state == PRS_NEW)
4474			continue;
4475		if ((fdp = p->p_fd) == NULL)
4476			continue;
4477		for (n = 0; n < fdp->fd_nfiles; ++n) {
4478			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
4479				continue;
4480			db_print_file(fp, header);
4481			header = 0;
4482		}
4483	}
4484}
4485#endif
4486
/* Read-write integer exporting the maxfilesperproc variable. */
SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
    &maxfilesperproc, 0, "Maximum files allowed open per process");

/* Read-write integer exporting the maxfiles variable. */
SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
    &maxfiles, 0, "Maximum number of files");

/* Read-only integer exporting the openfiles counter. */
SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
    &openfiles, 0, "System-wide number of open files");
4495
/*
 * Boot-time initialization, registered through the SYSINIT below: create
 * the UMA zones for struct file, struct filedesc0 and struct pwd, publish
 * the pwd zone's SMR state as vfs_smr, and initialize sigio_lock.
 * The argument is unused.
 */
/* ARGSUSED*/
static void
filelistinit(void *dummy)
{

	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
	/*
	 * XXXMJG this is a temporary hack due to boot ordering issues against
	 * the vnode zone.
	 */
	vfs_smr = uma_zone_get_smr(pwd_zone);
	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
}
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
4515
4516/*-------------------------------------------------------------------*/
4517
/*
 * Methods of the "badfileops" table defined below.  Each one ignores its
 * arguments and returns a fixed value.
 */

/* Read/write method (used for both fo_read and fo_write): always EBADF. */
static int
badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{

	return (EBADF);
}

/* Truncate method: always EINVAL. */
static int
badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

/* Ioctl method: always EBADF. */
static int
badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}

/* Poll method: always returns 0. */
static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (0);
}

/* Kqueue filter method: always EBADF. */
static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (EBADF);
}

/* Stat method: always EBADF. */
static int
badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}

/* Close method: nothing to do, always returns 0. */
static int
badfo_close(struct file *fp, struct thread *td)
{

	return (0);
}

/* Chmod method: always EBADF. */
static int
badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}

/* Chown method: always EBADF. */
static int
badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}

/* Sendfile method: always EBADF. */
static int
badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    struct thread *td)
{

	return (EBADF);
}

/* Fill-kinfo method: leaves *kif untouched and returns 0. */
static int
badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{

	return (0);
}

/* File operations table wired to the fixed-result methods above. */
struct fileops badfileops = {
	.fo_read = badfo_readwrite,
	.fo_write = badfo_readwrite,
	.fo_truncate = badfo_truncate,
	.fo_ioctl = badfo_ioctl,
	.fo_poll = badfo_poll,
	.fo_kqfilter = badfo_kqfilter,
	.fo_stat = badfo_stat,
	.fo_close = badfo_close,
	.fo_chmod = badfo_chmod,
	.fo_chown = badfo_chown,
	.fo_sendfile = badfo_sendfile,
	.fo_fill_kinfo = badfo_fill_kinfo,
};
4618
/*
 * Non-static file methods that ignore their arguments and report a fixed
 * "not supported"/"invalid" result.
 * NOTE(review): presumably these are shared by file types that lack the
 * corresponding operation; their users are not visible in this part of
 * the file.
 */

/* Read/write method: always EOPNOTSUPP. */
int
invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{

	return (EOPNOTSUPP);
}

/* Truncate method: always EINVAL. */
int
invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

/* Ioctl method: always ENOTTY. */
int
invfo_ioctl(struct file *fp, u_long com, void *data,
    struct ucred *active_cred, struct thread *td)
{

	return (ENOTTY);
}

/* Poll method: returns whatever poll_no_poll() reports for `events'. */
int
invfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (poll_no_poll(events));
}

/* Kqueue filter method: always EINVAL. */
int
invfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (EINVAL);
}

/* Chmod method: always EINVAL. */
int
invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

/* Chown method: always EINVAL. */
int
invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}

/* Sendfile method: always EINVAL. */
int
invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    struct thread *td)
{

	return (EINVAL);
}
4682
4683/*-------------------------------------------------------------------*/
4684
4685/*
4686 * File Descriptor pseudo-device driver (/dev/fd/).
4687 *
4688 * Opening minor device N dup()s the file (if any) connected to file
4689 * descriptor N belonging to the calling process.  Note that this driver
4690 * consists of only the ``open()'' routine, because all subsequent
4691 * references to this file will be direct to the other driver.
4692 *
4693 * XXX: we could give this one a cloning event handler if necessary.
4694 */
4695
/*
 * Open routine of the "FD" character device: record the device's unit
 * number in td->td_dupfd and fail with ENODEV.  mode and type are unused.
 */
/* ARGSUSED */
static int
fdopen(struct cdev *dev, int mode, int type, struct thread *td)
{

	/*
	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
	 * file descriptor being sought for duplication. The error
	 * return ensures that the vnode for this device will be released
	 * by vn_open. Open will detect this special error and take the
	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
	 * will simply report the error.
	 */
	td->td_dupfd = dev2unit(dev);
	return (ENODEV);
}
4712
/* Character device switch for the "FD" devices; only open is provided. */
static struct cdevsw fildesc_cdevsw = {
	.d_version =	D_VERSION,
	.d_open =	fdopen,
	.d_name =	"FD",
};
4718
/*
 * Create the device nodes fd/0, fd/1 and fd/2 (units 0-2, owner root:wheel,
 * mode 0666) and their aliases stdin, stdout and stderr.  Registered
 * through the SYSINIT below; the argument is unused.
 */
static void
fildesc_drvinit(void *unused)
{
	struct cdev *dev;

	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
	make_dev_alias(dev, "stdin");
	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
	make_dev_alias(dev, "stdout");
	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
	make_dev_alias(dev, "stderr");
}

SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
4736