1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
26/*	  All Rights Reserved	*/
27
28/*
29 * University Copyright- Copyright (c) 1982, 1986, 1988
30 * The Regents of the University of California
31 * All Rights Reserved
32 *
33 * University Acknowledgment- Portions of this document are derived from
34 * software developed by the University of California, Berkeley, and its
35 * contributors.
36 */
37
38/*
39 * Directory manipulation routines.
40 *
41 * When manipulating directories, the i_rwlock provides serialization
42 * since directories cannot be mmapped. The i_contents lock is redundant.
43 */
44
45#include <sys/types.h>
46#include <sys/t_lock.h>
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/signal.h>
50#include <sys/cred.h>
51#include <sys/proc.h>
52#include <sys/disp.h>
53#include <sys/user.h>
54#include <sys/vfs.h>
55#include <sys/vnode.h>
56#include <sys/stat.h>
57#include <sys/mode.h>
58#include <sys/buf.h>
59#include <sys/uio.h>
60#include <sys/dnlc.h>
61#include <sys/fs/ufs_inode.h>
62#include <sys/fs/ufs_fs.h>
63#include <sys/mount.h>
64#include <sys/fs/ufs_fsdir.h>
65#include <sys/fs/ufs_trans.h>
66#include <sys/fs/ufs_panic.h>
67#include <sys/fs/ufs_quota.h>
68#include <sys/errno.h>
69#include <sys/debug.h>
70#include <vm/seg.h>
71#include <sys/sysmacros.h>
72#include <sys/cmn_err.h>
73#include <sys/cpuvar.h>
74#include <sys/unistd.h>
75#include <sys/policy.h>
76
77/*
78 * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ
79 */
80#if !ISP2(DIRBLKSIZ)
81#error	"DIRBLKSIZ not a power of 2"
82#endif
83
/*
 * A virgin directory.
 *
 * The two initializer rows are the "." and ".." entries.  Their second
 * values (12 and DIRBLKSIZ - 12) add up to DIRBLKSIZ, and their third
 * values (1 and 2) equal the lengths of the two names.
 *
 * NOTE(review): the initializers are positional; the field order is
 * presumably (inode number, record length, name length, name) as declared
 * by struct dirtemplate -- confirm against the header.  The zero inode
 * numbers are presumably filled in by whoever copies the template.
 */
static struct dirtemplate mastertemplate = {
	0, 12, 1, ".",
	0, DIRBLKSIZ - 12, 2, ".."
};
91
/*
 * LDIRSIZ(len): number of bytes taken by a directory entry whose name is
 * len characters long.  That is sizeof (struct direct) less the
 * MAXNAMLEN + 1 bytes (presumably the d_name buffer), plus the name and its
 * terminating NUL rounded up to a multiple of 4.
 *
 * MAX_DIR_NAME_LEN(len): the reverse bound -- the longest name (excluding
 * the NUL) that fits in an entry of len bytes.
 *
 * The argument is parenthesized in both macros so that expressions such as
 * LDIRSIZ(a & b) or LDIRSIZ(c ? x : y) expand as intended.
 */
#define	LDIRSIZ(len) \
	((sizeof (struct direct) - (MAXNAMLEN + 1)) + (((len) + 1 + 3) & ~3))
#define	MAX_DIR_NAME_LEN(len) \
	(((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1)
96
/*
 * The dnlc directory cache allows a 64 bit handle for directory entries.
 * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset
 * into the handle. Note, a 32 bit offset allows a 4GB directory, which
 * is way beyond what could be cached in memory by the directory
 * caching routines. So we are quite safe with this limit.
 * The macros below pack and unpack the handle:
 *
 *	H_TO_INO(h)		low 32 bits of h, as a uint32_t inumber
 *	H_TO_OFF(h)		high 32 bits of h, as an off_t offset
 *	INO_OFF_TO_H(ino, off)	off in the high half, ino in the low half
 *
 * Each expansion is wrapped in its own parentheses so it can be used inside
 * any larger expression, and INO_OFF_TO_H() truncates ino to 32 bits so
 * that a wider (or sign-extended) value cannot spill into the offset half.
 */
#define	H_TO_INO(h) ((uint32_t)((h) & UINT_MAX))
#define	H_TO_OFF(h) ((off_t)((h) >> 32))
#define	INO_OFF_TO_H(ino, off) \
	((uint64_t)(((uint64_t)(off) << 32) | (uint32_t)(ino)))
108
/*
 * The average size of a typical on-disk directory entry is about 16 bytes,
 * which is what defines AV_DIRECT_SHIFT: log2(16).
 * This define is only used to approximate the number of entries
 * in a directory. This is needed for dnlc_dir_start() which will immediately
 * return an error if the value is not within its acceptable range of
 * number of files in a directory.
 */
#define	AV_DIRECT_SHIFT 4
/*
 * If the directory size (from i_size) is at least the ufs_min_dir_cache
 * tunable then we request dnlc directory caching.
 * This has been found to be profitable after 1024 file names, so the
 * default is 1024 average-sized entries expressed in bytes.
 */
int ufs_min_dir_cache = 1024 << AV_DIRECT_SHIFT;
124
/*
 * The time point the dnlc directory caching was disabled.
 * ufs_dirlook() stores gethrtime() here when dnlc_dir_start() returns
 * DNOMEM and resets it to 0 when it re-enables caching.
 */
static hrtime_t ufs_dc_disable_at;
/*
 * Directory caching disable duration: once more than this much hrtime has
 * passed since ufs_dc_disable_at, ufs_dirlook() switches a
 * CD_DISABLED_NOMEM directory back to CD_ENABLED.
 * NOTE(review): NANOSEC * 5 is presumably five seconds -- confirm NANOSEC.
 */
static hrtime_t ufs_dc_disable_duration = (hrtime_t)NANOSEC * 5;

/*
 * dirchk: when non-zero, ufs_dirlook() calls dirmangled() on every entry
 * rather than only on entries whose d_reclen is not a multiple of 4.
 * On by default in DEBUG kernels, off otherwise.
 */
#ifdef DEBUG
int dirchk = 1;
#else /* !DEBUG */
int dirchk = 0;
#endif /* DEBUG */
/*
 * When non-zero, a failed lookup in a directory whose i_nlink is > 0 is
 * entered into the dnlc as DNLC_NO_VNODE.
 */
int ufs_negative_cache = 1;
/*
 * NOTE(review): not referenced in this part of the file; presumably a
 * count of retries taken in ufs_dirremove() -- confirm at the use site.
 */
uint64_t ufs_dirremove_retry_cnt;
137
/*
 * Forward declarations of the file-local helpers.
 *
 * NOTE(review): these are old-style declarations with empty parameter
 * lists, so they give the compiler no argument information to check the
 * call sites against.  Full prototypes would need the definitions, which
 * are not all visible from here.
 */
static void dirbad();
static int ufs_dirrename();
static int ufs_diraddentry();
static int ufs_dirempty();
static int ufs_dirscan();
static int ufs_dirclrdotdot();
static int ufs_dirfixdotdot();
static int ufs_dirpurgedotdot();
static int dirprepareentry();
static int ufs_dirmakedirect();
static int dirbadname();
static int dirmangled();
150
151/*
152 * Check accessibility of directory against inquired mode and type.
153 * Execute access is required to search the directory.
154 * Access for write is interpreted as allowing
155 * deletion of files in the directory.
156 * Note, the reader i_contents lock will be acquired in
157 * ufs_iaccess().
158 */
159int
160ufs_diraccess(struct inode *ip, int mode, struct cred *cr)
161{
162	if (((ip->i_mode & IFMT) != IFDIR) &&
163	    ((ip->i_mode & IFMT) != IFATTRDIR))
164		return (ENOTDIR);
165
166	return (ufs_iaccess(ip, mode, cr, 1));
167}
168
169/*
170 * Look for a given name in a directory.  On successful return, *ipp
171 * will point to the VN_HELD inode.
172 * The caller is responsible for checking accessibility upfront
173 * via ufs_diraccess().
174 */
175int
176ufs_dirlook(
177	struct inode *dp,
178	char *namep,
179	struct inode **ipp,
180	struct cred *cr,
181	int skipdnlc,			/* skip the 1st level dnlc */
182	int skipcaching)		/* force directory caching off */
183{
184	uint64_t handle;
185	struct fbuf *fbp;		/* a buffer of directory entries */
186	struct direct *ep;		/* the current directory entry */
187	struct vnode *vp;
188	struct vnode *dvp;		/* directory vnode ptr */
189	struct ulockfs *ulp;
190	dcanchor_t *dcap;
191	off_t endsearch;		/* offset to end directory search */
192	off_t offset;
193	off_t start_off;		/* starting offset from middle search */
194	off_t last_offset;		/* last offset */
195	int entryoffsetinblock;		/* offset of ep in addr's buffer */
196	int numdirpasses;		/* strategy for directory search */
197	int namlen;			/* length of name */
198	int err;
199	int doingchk;
200	int i;
201	int caching;
202	int indeadlock;
203	ino_t ep_ino;			/* entry i number */
204	ino_t chkino;
205	ushort_t ep_reclen;		/* direct local d_reclen */
206
207	ASSERT(*namep != '\0'); /* All callers ensure *namep is non null */
208
209	if (dp->i_ufsvfs)
210		ulp = &dp->i_ufsvfs->vfs_ulockfs;
211
212	/*
213	 * Check the directory name lookup cache, first for individual files
214	 * then for complete directories.
215	 */
216	dvp = ITOV(dp);
217	if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) {
218		/* vp is already held from dnlc_lookup */
219		if (vp == DNLC_NO_VNODE) {
220			VN_RELE(vp);
221			return (ENOENT);
222		}
223		*ipp = VTOI(vp);
224		return (0);
225	}
226
227	dcap = &dp->i_danchor;
228
229	/*
230	 * Grab the reader lock on the directory data before checking
231	 * the dnlc to avoid a race with ufs_dirremove() & friends.
232	 *
233	 * ufs_tryirwlock uses rw_tryenter and checks for SLOCK to
234	 * avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
235	 * possible, retries the operation.
236	 */
237	ufs_tryirwlock((&dp->i_rwlock), RW_READER, retry_dircache);
238	if (indeadlock)
239		return (EAGAIN);
240
241	switch (dnlc_dir_lookup(dcap, namep, &handle)) {
242	case DFOUND:
243		ep_ino = (ino_t)H_TO_INO(handle);
244		if (dp->i_number == ep_ino) {
245			VN_HOLD(dvp);	/* want ourself, "." */
246			*ipp = dp;
247			rw_exit(&dp->i_rwlock);
248			return (0);
249		}
250		if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) {
251			uint64_t handle2;
252			/*
253			 * release the lock on the dir we are searching
254			 * to avoid a deadlock when grabbing the
255			 * i_contents lock in ufs_iget_alloced().
256			 */
257			rw_exit(&dp->i_rwlock);
258			rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
259			err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
260			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
261			/*
262			 * must recheck as we dropped dp->i_rwlock
263			 */
264			ufs_tryirwlock(&dp->i_rwlock, RW_READER, retry_parent);
265			if (indeadlock) {
266				if (!err)
267					VN_RELE(ITOV(*ipp));
268				return (EAGAIN);
269			}
270			if (!err && (dnlc_dir_lookup(dcap, namep, &handle2)
271			    == DFOUND) && (handle == handle2)) {
272				dnlc_update(dvp, namep, ITOV(*ipp));
273				rw_exit(&dp->i_rwlock);
274				return (0);
275			}
276			/* check failed, read the actual directory */
277			if (!err) {
278				VN_RELE(ITOV(*ipp));
279			}
280			goto restart;
281		}
282		/* usual case of not "." nor ".." */
283		rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
284		err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
285		rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
286		if (err) {
287			rw_exit(&dp->i_rwlock);
288			return (err);
289		}
290		dnlc_update(dvp, namep, ITOV(*ipp));
291		rw_exit(&dp->i_rwlock);
292		return (0);
293	case DNOENT:
294		if (ufs_negative_cache && (dp->i_nlink > 0)) {
295			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
296		}
297		rw_exit(&dp->i_rwlock);
298		return (ENOENT);
299	default:
300		break;
301	}
302restart:
303
304	fbp = NULL;
305	doingchk = 0;
306	chkino = 0;
307	caching = 0;
308
309	/*
310	 * Attempt to cache any directories greater than the tunable
311	 * ufs_min_cache_dir. If it fails due to memory shortage (DNOMEM),
312	 * disable caching for this directory and record the system time.
313	 * Any attempt after the disable time has expired will enable
314	 * the caching again.
315	 */
316	if (!skipcaching && (dp->i_size >= ufs_min_dir_cache)) {
317		/*
318		 * if the directory caching disable time has expired
319		 * enable the caching again.
320		 */
321		if (dp->i_cachedir == CD_DISABLED_NOMEM &&
322		    gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
323			ufs_dc_disable_at = 0;
324			dp->i_cachedir = CD_ENABLED;
325		}
326		if (dp->i_cachedir == CD_ENABLED) {
327			switch (dnlc_dir_start(dcap, dp->i_size >>
328			    AV_DIRECT_SHIFT)) {
329			case DNOMEM:
330				dp->i_cachedir = CD_DISABLED_NOMEM;
331				ufs_dc_disable_at = gethrtime();
332				break;
333			case DTOOBIG:
334				dp->i_cachedir = CD_DISABLED_TOOBIG;
335				break;
336			case DOK:
337				caching = 1;
338				break;
339			default:
340				break;
341			}
342		}
343	}
344	/*
345	 * If caching we don't stop when the file has been
346	 * found, but need to know later, so clear *ipp now
347	 */
348	*ipp = NULL;
349
350recheck:
351	if (caching) {
352		offset = 0;
353		entryoffsetinblock = 0;
354		numdirpasses = 1;
355	} else {
356		/*
357		 * Take care to look at dp->i_diroff only once, as it
358		 * may be changing due to other threads/cpus.
359		 */
360		offset = dp->i_diroff;
361		if (offset > dp->i_size) {
362			offset = 0;
363		}
364		if (offset == 0) {
365			entryoffsetinblock = 0;
366			numdirpasses = 1;
367		} else {
368			start_off = offset;
369
370			entryoffsetinblock = blkoff(dp->i_fs, offset);
371			if (entryoffsetinblock != 0) {
372				err = blkatoff(dp, offset, (char **)0, &fbp);
373				if (err)
374					goto bad;
375			}
376			numdirpasses = 2;
377		}
378	}
379	endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, u_offset_t);
380	namlen = strlen(namep);
381	last_offset = 0;
382
383searchloop:
384	while (offset < endsearch) {
385		/*
386		 * If offset is on a block boundary,
387		 * read the next directory block.
388		 * Release previous if it exists.
389		 */
390		if (blkoff(dp->i_fs, offset) == 0) {
391			if (fbp != NULL) {
392				fbrelse(fbp, S_OTHER);
393			}
394			err = blkatoff(dp, offset, (char **)0, &fbp);
395			if (err)
396				goto bad;
397			entryoffsetinblock = 0;
398		}
399
400		/*
401		 * If the offset to the next entry is invalid or if the
402		 * next entry is a zero length record or if the record
403		 * length is invalid, then skip to the next directory
404		 * block.  Complete validation checks are done if the
405		 * record length is invalid.
406		 *
407		 * Full validation checks are slow so they are disabled
408		 * by default.  Complete checks can be run by patching
409		 * "dirchk" to be true.
410		 *
411		 * We have to check the validity of entryoffsetinblock
412		 * here because it can be set to i_diroff above.
413		 */
414		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock);
415		if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 ||
416		    (dirchk || (ep->d_reclen & 0x3)) &&
417		    dirmangled(dp, ep, entryoffsetinblock, offset)) {
418			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
419			offset += i;
420			entryoffsetinblock += i;
421			if (caching) {
422				dnlc_dir_purge(dcap);
423				caching = 0;
424			}
425			continue;
426		}
427
428		ep_reclen = ep->d_reclen;
429
430		/*
431		 * Add named entries and free space into the directory cache
432		 */
433		if (caching) {
434			ushort_t extra;
435			off_t off2;
436
437			if (ep->d_ino == 0) {
438				extra = ep_reclen;
439				if (offset & (DIRBLKSIZ - 1)) {
440					dnlc_dir_purge(dcap);
441					dp->i_cachedir = CD_DISABLED;
442					caching = 0;
443				}
444			} else {
445				/*
446				 * entries hold the previous offset except the
447				 * 1st which holds the offset + 1
448				 */
449				if (offset & (DIRBLKSIZ - 1)) {
450					off2 = last_offset;
451				} else {
452					off2 = offset + 1;
453				}
454				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
455				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
456				extra = ep_reclen - DIRSIZ(ep);
457			}
458			if (caching && (extra >= LDIRSIZ(1))) {
459				caching = (dnlc_dir_add_space(dcap, extra,
460				    (uint64_t)offset) == DOK);
461			}
462		}
463
464		/*
465		 * Check for a name match.
466		 * We have the parent inode read locked with i_rwlock.
467		 */
468		if (ep->d_ino && ep->d_namlen == namlen &&
469		    *namep == *ep->d_name &&	/* fast chk 1st chr */
470		    bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) {
471
472			/*
473			 * We have to release the fbp early here to avoid
474			 * a possible deadlock situation where we have the
475			 * fbp and want the directory inode and someone doing
476			 * a ufs_direnter_* has the directory inode and wants
477			 * the fbp.  XXX - is this still needed?
478			 */
479			ep_ino = (ino_t)ep->d_ino;
480			ASSERT(fbp != NULL);
481			fbrelse(fbp, S_OTHER);
482			fbp = NULL;
483
484			/*
485			 * Atomic update (read lock held)
486			 */
487			dp->i_diroff = offset;
488
489			if (namlen == 2 && namep[0] == '.' && namep[1] == '.') {
490				struct timeval32 omtime;
491
492				if (caching) {
493					dnlc_dir_purge(dcap);
494					caching = 0;
495				}
496				if (doingchk) {
497					/*
498					 * if the inumber didn't change
499					 * continue with already found inode.
500					 */
501					if (ep_ino == chkino)
502						goto checkok;
503					else {
504						VN_RELE(ITOV(*ipp));
505						/* *ipp is nulled at restart */
506						goto restart;
507					}
508				}
509				/*
510				 * release the lock on the dir we are searching
511				 * to avoid a deadlock when grabbing the
512				 * i_contents lock in ufs_iget_alloced().
513				 */
514				omtime = dp->i_mtime;
515				rw_exit(&dp->i_rwlock);
516				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
517				    RW_READER);
518				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
519				    cr);
520				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
521				ufs_tryirwlock(&dp->i_rwlock, RW_READER,
522				    retry_disk);
523				if (indeadlock) {
524					if (!err)
525						VN_RELE(ITOV(*ipp));
526					return (EAGAIN);
527				}
528				if (err)
529					goto bad;
530				/*
531				 * Since we released the lock on the directory,
532				 * we must check that the same inode is still
533				 * the ".." entry for this directory.
534				 */
535				/*CSTYLED*/
536				if (timercmp(&omtime, &dp->i_mtime, !=)) {
537					/*
538					 * Modification time changed on the
539					 * directory, we must go check if
540					 * the inumber changed for ".."
541					 */
542					doingchk = 1;
543					chkino = ep_ino;
544					entryoffsetinblock = 0;
545					if (caching) {
546						/*
547						 * Forget directory caching
548						 * for this rare case
549						 */
550						dnlc_dir_purge(dcap);
551						caching = 0;
552					}
553					goto recheck;
554				}
555			} else if (dp->i_number == ep_ino) {
556				VN_HOLD(dvp);	/* want ourself, "." */
557				*ipp = dp;
558				if (caching) {
559					dnlc_dir_purge(dcap);
560					caching = 0;
561				}
562			} else {
563				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
564				    RW_READER);
565				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
566				    cr);
567				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
568				if (err)
569					goto bad;
570			}
571checkok:
572			ASSERT(*ipp);
573			dnlc_update(dvp, namep, ITOV(*ipp));
574			/*
575			 * If we are not caching then just return the entry
576			 * otherwise complete loading up the cache
577			 */
578			if (!caching) {
579				rw_exit(&dp->i_rwlock);
580				return (0);
581			}
582			err = blkatoff(dp, offset, (char **)0, &fbp);
583			if (err)
584				goto bad;
585		}
586		last_offset = offset;
587		offset += ep_reclen;
588		entryoffsetinblock += ep_reclen;
589	}
590	/*
591	 * If we started in the middle of the directory and failed
592	 * to find our target, we must check the beginning as well.
593	 */
594	if (numdirpasses == 2) {
595		numdirpasses--;
596		offset = 0;
597		endsearch = start_off;
598		goto searchloop;
599	}
600
601	/*
602	 * If whole directory caching is on (or was originally on) then
603	 * the entry may have been found.
604	 */
605	if (*ipp == NULL) {
606		err = ENOENT;
607		if (ufs_negative_cache && (dp->i_nlink > 0)) {
608			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
609		}
610	}
611	if (caching) {
612		dnlc_dir_complete(dcap);
613		caching = 0;
614	}
615
616bad:
617	if (err && *ipp) {
618		/*
619		 * err and *ipp can both be set if we were attempting to
620		 * cache the directory, and we found the entry, then later
621		 * while trying to complete the directory cache encountered
622		 * a error (eg reading a directory sector).
623		 */
624		VN_RELE(ITOV(*ipp));
625		*ipp = NULL;
626	}
627
628	if (fbp)
629		fbrelse(fbp, S_OTHER);
630	rw_exit(&dp->i_rwlock);
631	if (caching)
632		dnlc_dir_purge(dcap);
633	return (err);
634}
635
636/*
637 * Write a new directory entry for DE_CREATE or DE_MKDIR operations.
638 */
639int
640ufs_direnter_cm(
641	struct inode *tdp,	/* target directory to make entry in */
642	char *namep,		/* name of entry */
643	enum de_op op,		/* entry operation */
644	struct vattr *vap,	/* attributes if new inode needed */
645	struct inode **ipp,	/* return entered inode here */
646	struct cred *cr,	/* user credentials */
647	int flags)		/* no entry exists */
648{
649	struct inode *tip;	/* inode of (existing) target file */
650	char *s;
651	struct ufs_slot slot;	/* slot info to pass around */
652	int namlen;		/* length of name */
653	int err;		/* error number */
654	struct inode *nip;	/* new inode */
655	int do_rele_nip = 0;	/* release nip */
656	int noentry = flags & ~IQUIET;
657	int quiet = flags & IQUIET;	/* Suppress out of inodes message */
658	int indeadlock;
659	struct ulockfs *ulp;
660
661	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
662
663	if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) ||
664	    ((vap->va_type == VCHR) || (vap->va_type == VBLK) ||
665	    (vap->va_type == VDOOR) || (vap->va_type == VSOCK) ||
666	    (vap->va_type == VFIFO))))
667		return (EINVAL);
668
669	/* don't allow '/' characters in pathname component */
670	for (s = namep, namlen = 0; *s; s++, namlen++)
671		if (*s == '/')
672			return (EACCES);
673	ASSERT(namlen);
674
675	/*
676	 * Check accessibility of target directory.
677	 */
678	if (err = ufs_diraccess(tdp, IEXEC, cr))
679		return (err);
680
681	/*
682	 * If name is "." or ".." then if this is a create look it up
683	 * and return EEXIST.
684	 */
685	if (namep[0] == '.' &&
686	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
687		/*
688		 * ufs_dirlook will acquire the i_rwlock
689		 */
690		if (tdp->i_ufsvfs)
691			ulp = &tdp->i_ufsvfs->vfs_ulockfs;
692		rw_exit(&tdp->i_rwlock);
693		if (err = ufs_dirlook(tdp, namep, ipp, cr, 0, 0)) {
694			if (err == EAGAIN)
695				return (err);
696
697			/*
698			 * ufs_tryirwlock uses rw_tryenter and checks for
699			 * SLOCK to avoid i_rwlock, ufs_lockfs_begin deadlock.
700			 * If deadlock possible, retries the operation.
701			 */
702			ufs_tryirwlock(&tdp->i_rwlock, RW_WRITER, retry_err);
703			if (indeadlock)
704				return (EAGAIN);
705
706			return (err);
707		}
708		ufs_tryirwlock(&tdp->i_rwlock, RW_WRITER, retry);
709		if (indeadlock) {
710			VN_RELE(ITOV(*ipp));
711			return (EAGAIN);
712		}
713		return (EEXIST);
714	}
715
716	/*
717	 * If target directory has not been removed, then we can consider
718	 * allowing file to be created.
719	 */
720	if (tdp->i_nlink <= 0) {
721		return (ENOENT);
722	}
723
724	/*
725	 * Search for the entry. Return VN_HELD tip if found.
726	 */
727	tip = NULL;
728	slot.fbp = NULL;
729	slot.status = NONE;
730	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
731	rw_enter(&tdp->i_contents, RW_WRITER);
732	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry);
733	if (err)
734		goto out;
735	if (tip) {
736		ASSERT(!noentry);
737		*ipp = tip;
738		err = EEXIST;
739	} else {
740		/*
741		 * The entry does not exist. Check write permission in
742		 * directory to see if entry can be created.
743		 */
744		if (err = ufs_iaccess(tdp, IWRITE, cr, 0))
745			goto out;
746		/*
747		 * Make new inode and directory entry.
748		 */
749		tdp->i_flag |= quiet;
750		if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) {
751			if (nip != NULL)
752				do_rele_nip = 1;
753			goto out;
754		}
755		if (err = ufs_diraddentry(tdp, namep, op,
756		    namlen, &slot, nip, NULL, cr)) {
757			/*
758			 * Unmake the inode we just made.
759			 */
760			rw_enter(&nip->i_contents, RW_WRITER);
761			if (((nip->i_mode & IFMT) == IFDIR) ||
762			    ((nip->i_mode & IFMT) == IFATTRDIR)) {
763				tdp->i_nlink--;
764				ufs_setreclaim(tdp);
765				tdp->i_flag |= ICHG;
766				tdp->i_seq++;
767				TRANS_INODE(tdp->i_ufsvfs, tdp);
768				ITIMES_NOLOCK(tdp);
769			}
770			nip->i_nlink = 0;
771			ufs_setreclaim(nip);
772			TRANS_INODE(nip->i_ufsvfs, nip);
773			nip->i_flag |= ICHG;
774			nip->i_seq++;
775			ITIMES_NOLOCK(nip);
776			rw_exit(&nip->i_contents);
777			do_rele_nip = 1;
778		} else {
779			*ipp = nip;
780		}
781	}
782
783out:
784	if (slot.fbp)
785		fbrelse(slot.fbp, S_OTHER);
786
787	tdp->i_flag &= ~quiet;
788	rw_exit(&tdp->i_contents);
789
790	/*
791	 * Drop vfs_dqrwlock before calling VN_RELE() on nip to
792	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
793	 */
794	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
795
796	if (do_rele_nip) {
797		VN_RELE(ITOV(nip));
798	}
799
800	return (err);
801}
802
/*
 * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations.
 *
 * The caller holds tdp->i_rwlock as writer.  The source inode's link count
 * is bumped (except for DE_SYMLINK) and pushed to disk before the target
 * directory is touched; if anything fails after that point the bump is
 * undone at out2.
 *
 * Returns:
 *	0	success
 *	EACCES	namep contains a '/'
 *	EINVAL	DE_RENAME onto "." or ".."
 *	EEXIST	link/symlink onto "." or "..", or onto an existing name
 *	ENOENT	the source inode or the target directory has no links left
 *	EMLINK	the source inode already has MAXLINK links
 *	or any error from TRANS_SYNCIP(), ufs_sync_indir(), ufs_diraccess(),
 *	ufs_dircheckforname(), ufs_iaccess(), ufs_dirrename() or
 *	ufs_diraddentry().
 */
int
ufs_direnter_lr(
	struct inode *tdp,	/* target directory to make entry in */
	char *namep,		/* name of entry */
	enum de_op op,		/* entry operation */
	struct inode *sdp,	/* source inode parent if rename */
	struct inode *sip,	/* source inode */
	struct cred *cr)	/* user credentials */
{
	struct inode *tip;	/* inode of (existing) target file */
	char *s;
	struct ufs_slot slot;	/* slot info to pass around */
	int namlen;		/* length of name */
	int err;		/* error number */

	/* don't allow '/' characters in pathname component */
	for (s = namep, namlen = 0; *s; s++, namlen++)
		if (*s == '/')
			return (EACCES);
	ASSERT(namlen);
	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));

	/*
	 * If name is "." or ".." then if this is a create look it up
	 * and return EEXIST.  Rename or link TO "." or ".." is forbidden.
	 */
	if (namep[0] == '.' &&
	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
		if (op == DE_RENAME) {
			return (EINVAL);	/* *SIGH* should be ENOTEMPTY */
		}
		return (EEXIST);
	}
	/*
	 * For link and rename lock the source entry and check the link count
	 * to see if it has been removed while it was unlocked.  If not, we
	 * increment the link count and force the inode to disk to make sure
	 * that it is there before any directory entry that points to it.
	 *
	 * In the case of a symbolic link, we are dealing with a new inode
	 * which does not yet have any links.  We've created it with a link
	 * count of 1, and we don't want to increment it since this will be
	 * its first link.
	 *
	 * We are about to push the inode to disk. We make sure
	 * that the inode's data blocks are flushed first so the
	 * inode and its data blocks are always in sync.  This
	 * adds some robustness in the event of a power failure
	 * or panic where sync fails. If we panic before the
	 * inode is updated, then the inode still refers to the
	 * old data blocks (or none for a new file). If we panic
	 * after the inode is updated, then the inode refers to
	 * the new data blocks.
	 *
	 * We do this before grabbing the i_contents lock because
	 * ufs_syncip() will want that lock. We could do the data
	 * syncing after the removal checks, but upon return from
	 * the data sync we would have to repeat the removal
	 * checks.
	 */
	if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) {
		return (err);
	}

	rw_enter(&sip->i_contents, RW_WRITER);
	if (sip->i_nlink <= 0) {
		rw_exit(&sip->i_contents);
		return (ENOENT);
	}
	if (sip->i_nlink == MAXLINK) {
		rw_exit(&sip->i_contents);
		return (EMLINK);
	}

	/*
	 * Sync the indirect blocks associated with the file
	 * for the same reasons as described above.  Since this
	 * call wants the i_contents lock held for it we can do
	 * this here with no extra work.
	 */
	if (err = ufs_sync_indir(sip)) {
		rw_exit(&sip->i_contents);
		return (err);
	}

	/* from here on every failure has to go through out2 to undo this */
	if (op != DE_SYMLINK)
		sip->i_nlink++;
	TRANS_INODE(sip->i_ufsvfs, sip);
	sip->i_flag |= ICHG;
	sip->i_seq++;
	ufs_iupdat(sip, I_SYNC);
	rw_exit(&sip->i_contents);

	/*
	 * If target directory has not been removed, then we can consider
	 * allowing file to be created.
	 */
	if (tdp->i_nlink <= 0) {
		err = ENOENT;
		goto out2;
	}

	/*
	 * Check accessibility of target directory.
	 */
	if (err = ufs_diraccess(tdp, IEXEC, cr))
		goto out2;

	/*
	 * Search for the entry. Return VN_HELD tip if found.
	 */
	tip = NULL;
	slot.status = NONE;
	slot.fbp = NULL;
	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
	rw_enter(&tdp->i_contents, RW_WRITER);
	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0);
	if (err)
		goto out;

	if (tip) {
		switch (op) {
		case DE_RENAME:
			/* the target exists: let ufs_dirrename() replace it */
			err = ufs_dirrename(sdp, sip, tdp, namep,
			    tip, &slot, cr);
			break;

		case DE_LINK:
		case DE_SYMLINK:
			/*
			 * Can't link to an existing file.
			 */
			err = EEXIST;
			break;
		default:
			break;
		}
	} else {
		/*
		 * The entry does not exist. Check write permission in
		 * directory to see if entry can be created.
		 */
		if (err = ufs_iaccess(tdp, IWRITE, cr, 0))
			goto out;
		err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp,
		    cr);
	}

out:
	if (slot.fbp)
		fbrelse(slot.fbp, S_OTHER);

	rw_exit(&tdp->i_contents);

	/*
	 * Drop vfs_dqrwlock before calling VN_RELE() on tip to
	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
	 */
	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);

	/*
	 * If we renamed a file over the top of an existing file,
	 * or linked a file to an existing file (or tried to),
	 * then release and delete (or just release) the inode.
	 */
	if (tip)
		VN_RELE(ITOV(tip));

out2:
	/*
	 * Reached directly (skipping the unlocks above) when we fail before
	 * any of the target directory's locks were taken.
	 */
	if (err) {
		/*
		 * Undo bumped link count.
		 */
		if (op != DE_SYMLINK) {
			rw_enter(&sip->i_contents, RW_WRITER);
			sip->i_nlink--;
			ufs_setreclaim(sip);
			TRANS_INODE(sip->i_ufsvfs, sip);
			sip->i_flag |= ICHG;
			sip->i_seq++;
			ITIMES_NOLOCK(sip);
			rw_exit(&sip->i_contents);
		}
	}
	return (err);
}
992
/*
 * Check for the existence of a name in a directory (unless noentry
 * is set), or else for an empty slot in which an entry may be made.
 * If the requested name is found, then on return *ipp points at the
 * inode and *slotp describes where the entry sits in the directory.
 * If the name is not found, then *ipp will be NULL and *slotp will
 * contain information about a directory slot in which an entry may be
 * made (either an empty slot, or the first position past the end of
 * the directory).
 * The target directory inode (tdp) is supplied write locked (both
 * i_rwlock and i_contents).
 *
 * This may not be used on "." or "..", but aliases of "." are ok.
 */
1006int
1007ufs_dircheckforname(
1008	struct inode *tdp,	/* inode of directory being checked */
1009	char *namep,		/* name we're checking for */
1010	int namlen,		/* length of name, excluding null */
1011	struct ufs_slot *slotp,	/* slot structure */
1012	struct inode **ipp,	/* return inode if we find one */
1013	struct cred *cr,
1014	int noentry)		/* noentry - just look for space */
1015{
1016	uint64_t handle;
1017	struct fbuf *fbp;	/* pointer to directory block */
1018	struct direct *ep;	/* directory entry */
1019	struct direct *nep;	/* next directory entry */
1020	dcanchor_t *dcap;
1021	vnode_t *dvp;		/* directory vnode ptr */
1022	off_t dirsize;		/* size of the directory */
1023	off_t offset;		/* offset in the directory */
1024	off_t last_offset;	/* last offset */
1025	off_t enduseful;	/* pointer past last used dir slot */
1026	int entryoffsetinblk;	/* offset of ep in fbp's buffer */
1027	int i;			/* length of mangled entry */
1028	int needed;
1029	int err;
1030	int first;
1031	int caching;
1032	int stat;
1033	ino_t ep_ino;
1034	slotstat_t initstat = slotp->status;
1035
1036	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1037	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1038	ASSERT(*ipp == NULL);
1039	fbp = NULL;
1040
1041	/*
1042	 * First check if there is a complete cache of the directory.
1043	 */
1044	dvp = ITOV(tdp);
1045
1046	dcap = &tdp->i_danchor;
1047	if (noentry) {
1048		/*
1049		 * We know from the 1st level dnlc cache that the entry
1050		 * doesn't exist, so don't bother searching the directory
1051		 * cache, but just look for space (possibly in the directory
1052		 * cache).
1053		 */
1054		stat = DNOENT;
1055	} else {
1056		stat = dnlc_dir_lookup(dcap, namep, &handle);
1057	}
1058	switch (stat) {
1059	case DFOUND:
1060		ep_ino = (ino_t)H_TO_INO(handle);
1061		if (tdp->i_number == ep_ino) {
1062			*ipp = tdp;	/* we want ourself, ie "." */
1063			VN_HOLD(dvp);
1064		} else {
1065			err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr);
1066			if (err)
1067				return (err);
1068		}
1069		offset = H_TO_OFF(handle);
1070		first = 0;
1071		if (offset & 1) {
1072			/* This is the first entry in the block */
1073			first = 1;
1074			offset -= 1;
1075			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
1076		}
1077		err = blkatoff(tdp, offset, (char **)&ep, &fbp);
1078		if (err) {
1079			VN_RELE(ITOV(*ipp));
1080			*ipp = NULL;
1081			return (err);
1082		}
1083		/*
1084		 * Check the validity of the entry.
1085		 * If it's bad, then throw away the cache and
1086		 * continue without it. The dirmangled() routine
1087		 * will then be called upon it.
1088		 */
1089		if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
1090			VN_RELE(ITOV(*ipp));
1091			*ipp = NULL;
1092			dnlc_dir_purge(dcap);
1093			break;
1094		}
1095		/*
1096		 * Remember the returned offset is the offset of the
1097		 * preceding record (unless this is the 1st record
1098		 * in the DIRBLKSIZ sized block (disk sector)), then it's
1099		 * offset + 1. Note, no real offsets are on odd boundaries.
1100		 */
1101		if (first) {
1102			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
1103			slotp->offset = offset;
1104			slotp->size = 0;
1105			slotp->ep = ep;
1106		} else {
1107			/* get the next entry */
1108			nep = (struct direct *)((char *)ep + ep->d_reclen);
1109			/*
1110			 * Check the validity of this entry as well
1111			 * If it's bad, then throw away the cache and
1112			 * continue without it. The dirmangled() routine
1113			 * will then be called upon it.
1114			 */
1115			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
1116			    (nep->d_ino != ep_ino)) {
1117				VN_RELE(ITOV(*ipp));
1118				*ipp = NULL;
1119				dnlc_dir_purge(dcap);
1120				break;
1121			}
1122			slotp->offset = offset + ep->d_reclen;
1123			slotp->size = ep->d_reclen;
1124			slotp->ep = nep;
1125		}
1126		slotp->status = EXIST;
1127		slotp->fbp = fbp;
1128		slotp->endoff = 0;
1129		slotp->cached = 1;
1130		dnlc_update(dvp, namep, ITOV(*ipp));
1131		return (0);
1132	case DNOENT:
1133		/*
1134		 * The caller gets to set the initial slot status to
1135		 * indicate whether it's interested in getting a
1136		 * empty slot. For example, the status can be set
1137		 * to FOUND when an entry is being deleted.
1138		 */
1139		ASSERT(slotp->fbp == NULL);
1140		if (slotp->status == FOUND) {
1141			return (0);
1142		}
1143		switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen),
1144		    &handle)) {
1145		case DFOUND:
1146			offset = (off_t)handle;
1147			err = blkatoff(tdp, offset, (char **)&ep, &fbp);
1148			if (err) {
1149				dnlc_dir_purge(dcap);
1150				ASSERT(*ipp == NULL);
1151				return (err);
1152			}
1153			/*
1154			 * Check the validity of the entry.
1155			 * If it's bad, then throw away the cache and
1156			 * continue without it. The dirmangled() routine
1157			 * will then be called upon it.
1158			 */
1159			if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
1160				dnlc_dir_purge(dcap);
1161				break;
1162			}
1163			/*
1164			 * Remember the returned offset is the offset of the
1165			 * containing record.
1166			 */
1167			slotp->status = FOUND;
1168			slotp->ep = ep;
1169			slotp->offset = offset;
1170			slotp->fbp = fbp;
1171			slotp->size = ep->d_reclen;
1172			/*
1173			 * Set end offset to 0. Truncation is handled
1174			 * because the dnlc cache will blow away the
1175			 * cached directory when an entry is removed
1176			 * that drops the entries left to less than half
			 * the minimum number (dnlc_min_dir_cache).
1178			 */
1179			slotp->endoff = 0;
1180			slotp->cached = 1;
1181			return (0);
1182		case DNOENT:
1183			slotp->status = NONE;
1184			slotp->offset = P2ROUNDUP_TYPED(tdp->i_size,
1185			    DIRBLKSIZ, u_offset_t);
1186			slotp->size = DIRBLKSIZ;
1187			slotp->endoff = 0;
1188			slotp->cached = 1;
1189			return (0);
1190		default:
1191			break;
1192		}
1193		break;
1194	}
1195	slotp->cached = 0;
1196	caching = 0;
1197	if (!noentry && tdp->i_size >= ufs_min_dir_cache) {
1198		/*
1199		 * if the directory caching disable time has expired
1200		 * enable caching again.
1201		 */
1202		if (tdp->i_cachedir == CD_DISABLED_NOMEM &&
1203		    gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
1204			ufs_dc_disable_at = 0;
1205			tdp->i_cachedir = CD_ENABLED;
1206		}
1207		/*
1208		 * Attempt to cache any directories greater than the tunable
		 * ufs_min_dir_cache. If it fails due to memory shortage
1210		 * (DNOMEM), disable caching for this directory and record
1211		 * the system time. Any attempt after the disable time has
1212		 * expired will enable the caching again.
1213		 */
1214		if (tdp->i_cachedir == CD_ENABLED) {
1215			switch (dnlc_dir_start(dcap,
1216			    tdp->i_size >> AV_DIRECT_SHIFT)) {
1217			case DNOMEM:
1218				tdp->i_cachedir = CD_DISABLED_NOMEM;
1219				ufs_dc_disable_at = gethrtime();
1220				break;
1221			case DTOOBIG:
1222				tdp->i_cachedir = CD_DISABLED_TOOBIG;
1223				break;
1224			case DOK:
1225				caching = 1;
1226				break;
1227			default:
1228				break;
1229			}
1230		}
1231	}
1232
1233	/*
1234	 * No point in using i_diroff since we must search whole directory
1235	 */
1236	dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t);
1237	enduseful = 0;
1238	offset = last_offset = 0;
1239	entryoffsetinblk = 0;
1240	needed = (int)LDIRSIZ(namlen);
1241	while (offset < dirsize) {
1242		/*
1243		 * If offset is on a block boundary,
1244		 * read the next directory block.
1245		 * Release previous if it exists.
1246		 */
1247		if (blkoff(tdp->i_fs, offset) == 0) {
1248			if (fbp != NULL)
1249				fbrelse(fbp, S_OTHER);
1250
1251			err = blkatoff(tdp, offset, (char **)0, &fbp);
1252			if (err) {
1253				ASSERT(*ipp == NULL);
1254				if (caching) {
1255					dnlc_dir_purge(dcap);
1256				}
1257				return (err);
1258			}
1259			entryoffsetinblk = 0;
1260		}
1261		/*
1262		 * If still looking for a slot, and at a DIRBLKSIZ
1263		 * boundary, have to start looking for free space
1264		 * again.
1265		 */
1266		if (slotp->status == NONE &&
1267		    (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) {
1268			slotp->offset = -1;
1269		}
1270		/*
1271		 * If the next entry is a zero length record or if the
1272		 * record length is invalid, then skip to the next
1273		 * directory block.  Complete validation checks are
1274		 * done if the record length is invalid.
1275		 *
1276		 * Full validation checks are slow so they are disabled
1277		 * by default.  Complete checks can be run by patching
1278		 * "dirchk" to be true.
1279		 *
1280		 * We do not have to check the validity of
1281		 * entryoffsetinblk here because it starts out as zero
1282		 * and is only incremented by d_reclen values that we
1283		 * validate here.
1284		 */
1285		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
1286		if (ep->d_reclen == 0 ||
1287		    (dirchk || (ep->d_reclen & 0x3)) &&
1288		    dirmangled(tdp, ep, entryoffsetinblk, offset)) {
1289			i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1));
1290			offset += i;
1291			entryoffsetinblk += i;
1292			if (caching) {
1293				dnlc_dir_purge(dcap);
1294				caching = 0;
1295			}
1296			continue;
1297		}
1298
1299		/*
1300		 * Add named entries and free space into the directory cache
1301		 */
1302		if (caching) {
1303			ushort_t extra;
1304			off_t off2;
1305
1306			if (ep->d_ino == 0) {
1307				extra = ep->d_reclen;
1308				if (offset & (DIRBLKSIZ - 1)) {
1309					dnlc_dir_purge(dcap);
1310					caching = 0;
1311				}
1312			} else {
1313				/*
1314				 * entries hold the previous offset if
1315				 * not the 1st one
1316				 */
1317				if (offset & (DIRBLKSIZ - 1)) {
1318					off2 = last_offset;
1319				} else {
1320					off2 = offset + 1;
1321				}
1322				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
1323				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
1324				extra = ep->d_reclen - DIRSIZ(ep);
1325			}
1326			if (caching && (extra >= LDIRSIZ(1))) {
1327				caching = (dnlc_dir_add_space(dcap, extra,
1328				    (uint64_t)offset) == DOK);
1329			}
1330		}
1331
1332		/*
1333		 * If an appropriate sized slot has not yet been found,
1334		 * check to see if one is available.
1335		 */
1336		if ((slotp->status != FOUND) && (slotp->status != EXIST)) {
1337			int size = ep->d_reclen;
1338
1339			if (ep->d_ino != 0)
1340				size -= DIRSIZ(ep);
1341			if (size > 0) {
1342				if (size >= needed) {
1343					slotp->offset = offset;
1344					slotp->size = ep->d_reclen;
1345					if (noentry) {
1346						slotp->ep = ep;
1347						slotp->fbp = fbp;
1348						slotp->status = FOUND;
1349						slotp->endoff = 0;
1350						return (0);
1351					}
1352					slotp->status = FOUND;
1353				} else if (slotp->status == NONE) {
1354					if (slotp->offset == -1)
1355						slotp->offset = offset;
1356				}
1357			}
1358		}
1359		/*
1360		 * Check for a name match.
1361		 */
1362		if (ep->d_ino && ep->d_namlen == namlen &&
1363		    *namep == *ep->d_name &&	/* fast chk 1st char */
1364		    bcmp(namep, ep->d_name, namlen) == 0) {
1365
1366			tdp->i_diroff = offset;
1367
1368			if (tdp->i_number == ep->d_ino) {
1369				*ipp = tdp;	/* we want ourself, ie "." */
1370				VN_HOLD(dvp);
1371			} else {
1372				err = ufs_iget_alloced(tdp->i_vfs,
1373				    (ino_t)ep->d_ino, ipp, cr);
1374				if (err) {
1375					fbrelse(fbp, S_OTHER);
1376					if (caching)
1377						dnlc_dir_purge(dcap);
1378					return (err);
1379				}
1380			}
1381			slotp->status = EXIST;
1382			slotp->offset = offset;
1383			slotp->size = (int)(offset - last_offset);
1384			slotp->fbp = fbp;
1385			slotp->ep = ep;
1386			slotp->endoff = 0;
1387			if (caching)
1388				dnlc_dir_purge(dcap);
1389			return (0);
1390		}
1391		last_offset = offset;
1392		offset += ep->d_reclen;
1393		entryoffsetinblk += ep->d_reclen;
1394		if (ep->d_ino)
1395			enduseful = offset;
1396	}
1397	if (fbp) {
1398		fbrelse(fbp, S_OTHER);
1399	}
1400
1401	if (caching) {
1402		dnlc_dir_complete(dcap);
1403		slotp->cached = 1;
1404		if (slotp->status == FOUND) {
1405			if (initstat == FOUND) {
1406				return (0);
1407			}
1408			(void) dnlc_dir_rem_space_by_handle(dcap,
1409			    slotp->offset);
1410			slotp->endoff = 0;
1411			return (0);
1412		}
1413	}
1414
1415	if (slotp->status == NONE) {
1416		/*
1417		 * We didn't find a slot; the new directory entry should be put
1418		 * at the end of the directory.  Return an indication of where
1419		 * this is, and set "endoff" to zero; since we're going to have
1420		 * to extend the directory, we're certainly not going to
1421		 * truncate it.
1422		 */
1423		slotp->offset = dirsize;
1424		slotp->size = DIRBLKSIZ;
1425		slotp->endoff = 0;
1426	} else {
1427		/*
1428		 * We found a slot, and will return an indication of where that
1429		 * slot is, as any new directory entry will be put there.
1430		 * Since that slot will become a useful entry, if the last
1431		 * useful entry we found was before this one, update the offset
1432		 * of the last useful entry.
1433		 */
1434		if (enduseful < slotp->offset + slotp->size)
1435			enduseful = slotp->offset + slotp->size;
1436		slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t);
1437	}
1438	*ipp = NULL;
1439	return (0);
1440}
1441
1442uint64_t ufs_dirrename_retry_cnt;
1443
1444/*
1445 * Rename the entry in the directory tdp so that it points to
1446 * sip instead of tip.
1447 */
1448static int
1449ufs_dirrename(
1450	struct inode *sdp,	/* parent directory of source */
1451	struct inode *sip,	/* source inode */
1452	struct inode *tdp,	/* parent directory of target */
1453	char *namep,		/* entry we are trying to change */
1454	struct inode *tip,	/* target inode */
1455	struct ufs_slot *slotp,	/* slot for entry */
1456	struct cred *cr)	/* credentials */
1457{
1458	vnode_t *tdvp;
1459	off_t offset;
1460	int err;
1461	int doingdirectory;
1462
1463	ASSERT(sdp->i_ufsvfs != NULL);
1464	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1465	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1466	/*
1467	 * Short circuit rename of something to itself.
1468	 */
1469	if (sip->i_number == tip->i_number) {
1470		return (ESAME); /* special KLUDGE error code */
1471	}
1472
1473	/*
1474	 * We're locking 2 peer level locks, so must use tryenter
1475	 * on the 2nd to avoid deadlocks that would occur
1476	 * if we renamed a->b and b->a concurrently.
1477	 */
1478retry:
1479	rw_enter(&tip->i_contents, RW_WRITER);
1480	if (!rw_tryenter(&sip->i_contents, RW_READER)) {
1481		/*
1482		 * drop tip and wait (sleep) until we stand a chance
1483		 * of holding sip
1484		 */
1485		rw_exit(&tip->i_contents);
1486		rw_enter(&sip->i_contents, RW_READER);
1487		/*
1488		 * Reverse the lock grabs in case we have heavy
1489		 * contention on the 2nd lock.
1490		 */
1491		if (!rw_tryenter(&tip->i_contents, RW_WRITER)) {
1492			ufs_dirrename_retry_cnt++;
1493			rw_exit(&sip->i_contents);
1494			goto retry;
1495		}
1496	}
1497
1498	/*
1499	 * Check that everything is on the same filesystem.
1500	 */
1501	if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) ||
1502	    (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) {
1503		err = EXDEV;		/* XXX archaic */
1504		goto out;
1505	}
1506	/*
1507	 * Must have write permission to rewrite target entry.
1508	 * Perform additional checks for sticky directories.
1509	 */
1510	if ((err = ufs_iaccess(tdp, IWRITE, cr, 0)) != 0 ||
1511	    (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0)
1512		goto out;
1513
1514	/*
1515	 * Ensure source and target are compatible (both directories
1516	 * or both not directories).  If target is a directory it must
1517	 * be empty and have no links to it; in addition it must not
1518	 * be a mount point, and both the source and target must be
1519	 * writable.
1520	 */
1521	doingdirectory = (((sip->i_mode & IFMT) == IFDIR) ||
1522	    ((sip->i_mode & IFMT) == IFATTRDIR));
1523	if (((tip->i_mode & IFMT) == IFDIR) ||
1524	    ((tip->i_mode & IFMT) == IFATTRDIR)) {
1525		if (!doingdirectory) {
1526			err = EISDIR;
1527			goto out;
1528		}
1529		/*
1530		 * vn_vfsrlock will prevent mounts from using the directory
1531		 * until we are done.
1532		 */
1533		if (vn_vfsrlock(ITOV(tip))) {
1534			err = EBUSY;
1535			goto out;
1536		}
1537		if (vn_mountedvfs(ITOV(tip)) != NULL) {
1538			vn_vfsunlock(ITOV(tip));
1539			err = EBUSY;
1540			goto out;
1541		}
1542		if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) {
1543			vn_vfsunlock(ITOV(tip));
1544			err = EEXIST;	/* SIGH should be ENOTEMPTY */
1545			goto out;
1546		}
1547	} else if (doingdirectory) {
1548		err = ENOTDIR;
1549		goto out;
1550	}
1551
1552	/*
1553	 * Rewrite the inode pointer for target name entry
1554	 * from the target inode (ip) to the source inode (sip).
1555	 * This prevents the target entry from disappearing
1556	 * during a crash. Mark the directory inode to reflect the changes.
1557	 */
1558	tdvp = ITOV(tdp);
1559	slotp->ep->d_ino = (int32_t)sip->i_number;
1560	dnlc_update(tdvp, namep, ITOV(sip));
1561	if (slotp->size) {
1562		offset = slotp->offset - slotp->size;
1563	} else {
1564		offset = slotp->offset + 1;
1565	}
1566	if (slotp->cached) {
1567		(void) dnlc_dir_update(&tdp->i_danchor, namep,
1568		    INO_OFF_TO_H(slotp->ep->d_ino, offset));
1569	}
1570
1571	err = TRANS_DIR(tdp, slotp->offset);
1572	if (err)
1573		fbrelse(slotp->fbp, S_OTHER);
1574	else
1575		err = ufs_fbwrite(slotp->fbp, tdp);
1576
1577	slotp->fbp = NULL;
1578	if (err) {
1579		if (doingdirectory)
1580			vn_vfsunlock(ITOV(tip));
1581		goto out;
1582	}
1583
1584	TRANS_INODE(tdp->i_ufsvfs, tdp);
1585	tdp->i_flag |= IUPD|ICHG;
1586	tdp->i_seq++;
1587	ITIMES_NOLOCK(tdp);
1588
1589	/*
1590	 * Decrement the link count of the target inode.
1591	 * Fix the ".." entry in sip to point to dp.
1592	 * This is done after the new entry is on the disk.
1593	 */
1594	tip->i_nlink--;
1595	TRANS_INODE(tip->i_ufsvfs, tip);
1596	tip->i_flag |= ICHG;
1597	tip->i_seq++;
1598	ITIMES_NOLOCK(tip);
1599	if (doingdirectory) {
1600		/*
1601		 * The entry for tip no longer exists so I can unlock the
1602		 * vfslock.
1603		 */
1604		vn_vfsunlock(ITOV(tip));
1605		/*
1606		 * Decrement target link count once more if it was a directory.
1607		 */
1608		if (--tip->i_nlink != 0) {
1609			err = ufs_fault(ITOV(tip),
1610		    "ufs_dirrename: target directory link count != 0 (%s)",
1611			    tip->i_fs->fs_fsmnt);
1612			rw_exit(&tip->i_contents);
1613			return (err);
1614		}
1615		TRANS_INODE(tip->i_ufsvfs, tip);
1616		ufs_setreclaim(tip);
1617		/*
1618		 * Renaming a directory with the parent different
1619		 * requires that ".." be rewritten.  The window is
1620		 * still there for ".." to be inconsistent, but this
1621		 * is unavoidable, and a lot shorter than when it was
1622		 * done in a user process.  We decrement the link
1623		 * count in the new parent as appropriate to reflect
1624		 * the just-removed target.  If the parent is the
1625		 * same, this is appropriate since the original
1626		 * directory is going away.  If the new parent is
1627		 * different, ufs_dirfixdotdot() will bump the link count
1628		 * back.
1629		 */
1630		tdp->i_nlink--;
1631		ufs_setreclaim(tdp);
1632		TRANS_INODE(tdp->i_ufsvfs, tdp);
1633		tdp->i_flag |= ICHG;
1634		tdp->i_seq++;
1635		ITIMES_NOLOCK(tdp);
1636		if (sdp != tdp) {
1637			rw_exit(&tip->i_contents);
1638			rw_exit(&sip->i_contents);
1639			err = ufs_dirfixdotdot(sip, sdp, tdp);
1640			return (err);
1641		}
1642	} else
1643		ufs_setreclaim(tip);
1644out:
1645	rw_exit(&tip->i_contents);
1646	rw_exit(&sip->i_contents);
1647	return (err);
1648}
1649
1650/*
1651 * Fix the ".." entry of the child directory so that it points
1652 * to the new parent directory instead of the old one.  Routine
1653 * assumes that dp is a directory and that all the inodes are on
1654 * the same file system.
1655 */
1656static int
1657ufs_dirfixdotdot(
1658	struct inode *dp,	/* child directory */
1659	struct inode *opdp,	/* old parent directory */
1660	struct inode *npdp)	/* new parent directory */
1661{
1662	struct fbuf *fbp;
1663	struct dirtemplate *dirp;
1664	vnode_t *dvp;
1665	int err;
1666
1667	ASSERT(RW_WRITE_HELD(&npdp->i_rwlock));
1668	ASSERT(RW_WRITE_HELD(&npdp->i_contents));
1669
1670	/*
1671	 * We hold the child directory's i_contents lock before calling
1672	 * blkatoff so that we honor correct locking protocol which is
1673	 * i_contents lock and then page lock. (blkatoff will call
1674	 * ufs_getpage where we want the page lock)
1675	 * We hold the child directory's i_rwlock before i_contents (as
1676	 * per the locking protocol) since we are modifying the ".." entry
1677	 * of the child directory.
1678	 * We hold the i_rwlock and i_contents lock until we record
1679	 * this directory delta to the log (via ufs_trans_dir) and have
1680	 * done fbrelse.
1681	 */
1682	rw_enter(&dp->i_rwlock, RW_WRITER);
1683	rw_enter(&dp->i_contents, RW_WRITER);
1684	err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp);
1685	if (err)
1686		goto bad;
1687
1688	if (dp->i_nlink <= 0 ||
1689	    dp->i_size < sizeof (struct dirtemplate)) {
1690		err = ENOENT;
1691		goto bad;
1692	}
1693
1694	if (dirp->dotdot_namlen != 2 ||
1695	    dirp->dotdot_name[0] != '.' ||
1696	    dirp->dotdot_name[1] != '.') {	/* Sanity check. */
1697		dirbad(dp, "mangled .. entry", (off_t)0);
1698		err = ENOTDIR;
1699		goto bad;
1700	}
1701
1702	/*
1703	 * Increment the link count in the new parent inode and force it out.
1704	 */
1705	if (npdp->i_nlink == MAXLINK) {
1706		err = EMLINK;
1707		goto bad;
1708	}
1709	npdp->i_nlink++;
1710	TRANS_INODE(npdp->i_ufsvfs, npdp);
1711	npdp->i_flag |= ICHG;
1712	npdp->i_seq++;
1713	ufs_iupdat(npdp, I_SYNC);
1714
1715	/*
1716	 * Rewrite the child ".." entry and force it out.
1717	 */
1718	dvp = ITOV(dp);
1719	dirp->dotdot_ino = (uint32_t)npdp->i_number;
1720	dnlc_update(dvp, "..", ITOV(npdp));
1721	(void) dnlc_dir_update(&dp->i_danchor, "..",
1722	    INO_OFF_TO_H(dirp->dotdot_ino, 0));
1723
1724	err = TRANS_DIR(dp, 0);
1725	if (err)
1726		fbrelse(fbp, S_OTHER);
1727	else
1728		err = ufs_fbwrite(fbp, dp);
1729
1730	fbp = NULL;
1731	if (err)
1732		goto bad;
1733
1734	rw_exit(&dp->i_contents);
1735	rw_exit(&dp->i_rwlock);
1736
1737	/*
1738	 * Decrement the link count of the old parent inode and force it out.
1739	 */
1740	ASSERT(opdp);
1741	rw_enter(&opdp->i_contents, RW_WRITER);
1742	ASSERT(opdp->i_nlink > 0);
1743	opdp->i_nlink--;
1744	ufs_setreclaim(opdp);
1745	TRANS_INODE(opdp->i_ufsvfs, opdp);
1746	opdp->i_flag |= ICHG;
1747	opdp->i_seq++;
1748	ufs_iupdat(opdp, I_SYNC);
1749	rw_exit(&opdp->i_contents);
1750	return (0);
1751
1752bad:
1753	if (fbp)
1754		fbrelse(fbp, S_OTHER);
1755	rw_exit(&dp->i_contents);
1756	rw_exit(&dp->i_rwlock);
1757	return (err);
1758}
1759
1760/*
1761 * Enter the file sip in the directory tdp with name namep.
1762 */
1763static int
1764ufs_diraddentry(
1765	struct inode *tdp,
1766	char *namep,
1767	enum de_op op,
1768	int namlen,
1769	struct ufs_slot *slotp,
1770	struct inode *sip,
1771	struct inode *sdp,
1772	struct cred *cr)
1773{
1774	struct direct *ep, *nep;
1775	vnode_t *tdvp;
1776	dcanchor_t *dcap = &tdp->i_danchor;
1777	off_t offset;
1778	int err;
1779	ushort_t extra;
1780
1781	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1782	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1783	/*
1784	 * Prepare a new entry.  If the caller has not supplied an
1785	 * existing inode, make a new one.
1786	 */
1787	err = dirprepareentry(tdp, slotp, cr);
1788	if (err) {
1789		if (slotp->fbp) {
1790			fbrelse(slotp->fbp, S_OTHER);
1791			slotp->fbp = NULL;
1792		}
1793		return (err);
1794	}
1795	/*
1796	 * Check inode to be linked to see if it is in the
1797	 * same filesystem.
1798	 */
1799	if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) {
1800		err = EXDEV;
1801		goto bad;
1802	}
1803
1804	/*
1805	 * If renaming a directory then fix up the ".." entry in the
1806	 * directory to point to the new parent.
1807	 */
1808	if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) ||
1809	    ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) {
1810		err = ufs_dirfixdotdot(sip, sdp, tdp);
1811		if (err)
1812			goto bad;
1813	}
1814
1815	/*
1816	 * Fill in entry data.
1817	 */
1818	ep = slotp->ep;
1819	ep->d_namlen = (ushort_t)namlen;
1820	(void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3));
1821	ep->d_ino = (uint32_t)sip->i_number;
1822	tdvp = ITOV(tdp);
1823	dnlc_update(tdvp, namep, ITOV(sip));
1824	/*
1825	 * Note the offset supplied for any named entry is
1826	 * the offset of the previous one, unless it's the 1st.
1827	 * slotp->size is used to pass the length to
1828	 * the previous entry.
1829	 */
1830	if (slotp->size) {
1831		offset = slotp->offset - slotp->size;
1832	} else {
1833		offset = slotp->offset + 1;
1834	}
1835
1836	if (slotp->cached) {
1837		/*
1838		 * Add back any usable unused space to the dnlc directory
1839		 * cache.
1840		 */
1841		extra = ep->d_reclen - DIRSIZ(ep);
1842		if (extra >= LDIRSIZ(1)) {
1843			(void) dnlc_dir_add_space(dcap, extra,
1844			    (uint64_t)slotp->offset);
1845		}
1846
1847		(void) dnlc_dir_add_entry(dcap, namep,
1848		    INO_OFF_TO_H(ep->d_ino, offset));
1849
1850		/* adjust the previous offset of the next entry */
1851		nep = (struct direct *)((char *)ep + ep->d_reclen);
1852		if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
1853			/*
1854			 * Not a new block.
1855			 *
1856			 * Check the validity of the next entry.
1857			 * If it's bad, then throw away the cache, and
1858			 * continue as before directory caching.
1859			 */
1860			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
1861			    dnlc_dir_update(dcap, nep->d_name,
1862			    INO_OFF_TO_H(nep->d_ino, slotp->offset))
1863			    == DNOENT) {
1864				dnlc_dir_purge(dcap);
1865				slotp->cached = 0;
1866			}
1867		}
1868	}
1869
1870	/*
1871	 * Write out the directory block.
1872	 */
1873	err = TRANS_DIR(tdp, slotp->offset);
1874	if (err)
1875		fbrelse(slotp->fbp, S_OTHER);
1876	else
1877		err = ufs_fbwrite(slotp->fbp, tdp);
1878
1879	slotp->fbp = NULL;
1880	/*
1881	 * If this is a rename of a directory, then we have already
1882	 * fixed the ".." entry to refer to the new parent. If err
1883	 * is true at this point, we have failed to update the new
1884	 * parent to refer to the renamed directory.
1885	 * XXX - we need to unwind the ".." fix.
1886	 */
1887	if (err)
1888		return (err);
1889
1890	/*
1891	 * Mark the directory inode to reflect the changes.
1892	 * Truncate the directory to chop off blocks of empty entries.
1893	 */
1894
1895	TRANS_INODE(tdp->i_ufsvfs, tdp);
1896	tdp->i_flag |= IUPD|ICHG;
1897	tdp->i_seq++;
1898	tdp->i_diroff = 0;
1899	ITIMES_NOLOCK(tdp);
1900	/*
1901	 * If the directory grew then dirprepareentry() will have
1902	 * set IATTCHG in tdp->i_flag, then the directory inode must
1903	 * be flushed out. This is because if fsync() is used later
1904	 * the directory size must be correct, otherwise a crash would
1905	 * cause fsck to move the file to lost+found. Also because later
1906	 * a file may be linked in more than one directory, then there
1907	 * is no way to flush the original directory. So it must be
1908	 * flushed out on creation. See bug 4293809.
1909	 */
1910	if (tdp->i_flag & IATTCHG) {
1911		ufs_iupdat(tdp, I_SYNC);
1912	}
1913
1914	if (slotp->endoff && (slotp->endoff < tdp->i_size)) {
1915		if (!TRANS_ISTRANS(tdp->i_ufsvfs)) {
1916			(void) ufs_itrunc(tdp, (u_offset_t)slotp->endoff, 0,
1917			    cr);
1918		}
1919	}
1920
1921
1922	return (0);
1923
1924bad:
1925	if (slotp->cached) {
1926		dnlc_dir_purge(dcap);
1927		fbrelse(slotp->fbp, S_OTHER);
1928		slotp->cached = 0;
1929		slotp->fbp = NULL;
1930		return (err);
1931	}
1932
1933	/*
1934	 * Clear out entry prepared by dirprepareent.
1935	 */
1936	slotp->ep->d_ino = 0;
1937	slotp->ep->d_namlen = 0;
1938
1939	/*
1940	 * Don't touch err so we don't clobber the real error that got us here.
1941	 */
1942	if (TRANS_DIR(tdp, slotp->offset))
1943		fbrelse(slotp->fbp, S_OTHER);
1944	else
1945		(void) ufs_fbwrite(slotp->fbp, tdp);
1946	slotp->fbp = NULL;
1947	return (err);
1948}
1949
1950/*
1951 * Prepare a directory slot to receive an entry.
1952 */
1953static int
1954dirprepareentry(
1955	struct inode *dp,	/* directory we are working in */
1956	struct ufs_slot *slotp,	/* available slot info */
1957	struct cred *cr)
1958{
1959	struct direct *ep, *nep;
1960	off_t entryend;
1961	int err;
1962	slotstat_t status = slotp->status;
1963	ushort_t dsize;
1964
1965	ASSERT((status == NONE) || (status == FOUND));
1966	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
1967	ASSERT(RW_WRITE_HELD(&dp->i_contents));
1968	/*
1969	 * If we didn't find a slot, then indicate that the
1970	 * new slot belongs at the end of the directory.
1971	 * If we found a slot, then the new entry can be
1972	 * put at slotp->offset.
1973	 */
1974	entryend = slotp->offset + slotp->size;
1975	if (status == NONE) {
1976		ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0);
1977		if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
1978			err = ufs_fault(ITOV(dp),
1979			    "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d"
1980			    " > dp->i_fs->fs_fsize: %d (%s)",
1981			    DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt);
1982			return (err);
1983		}
1984		/*
1985		 * Allocate the new block.
1986		 */
1987		err = BMAPALLOC(dp, (u_offset_t)slotp->offset,
1988		    (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr);
1989		if (err) {
1990			return (err);
1991		}
1992		dp->i_size = entryend;
1993		TRANS_INODE(dp->i_ufsvfs, dp);
1994		dp->i_flag |= IUPD|ICHG|IATTCHG;
1995		dp->i_seq++;
1996		ITIMES_NOLOCK(dp);
1997	} else if (entryend > dp->i_size) {
1998		/*
1999		 * Adjust directory size, if needed. This should never
2000		 * push the size past a new multiple of DIRBLKSIZ.
2001		 * This is an artifact of the old (4.2BSD) way of initializing
2002		 * directory sizes to be less than DIRBLKSIZ.
2003		 */
2004		dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t);
2005		TRANS_INODE(dp->i_ufsvfs, dp);
2006		dp->i_flag |= IUPD|ICHG|IATTCHG;
2007		dp->i_seq++;
2008		ITIMES_NOLOCK(dp);
2009	}
2010
2011	/*
2012	 * Get the block containing the space for the new directory entry.
2013	 */
2014	if (slotp->fbp == NULL) {
2015		err = blkatoff(dp, slotp->offset, (char **)&slotp->ep,
2016		    &slotp->fbp);
2017		if (err) {
2018			return (err);
2019		}
2020	}
2021	ep = slotp->ep;
2022
2023	switch (status) {
2024	case NONE:
2025		/*
2026		 * No space in the directory. slotp->offset will be on a
2027		 * directory block boundary and we will write the new entry
2028		 * into a fresh block.
2029		 */
2030		ep->d_reclen = DIRBLKSIZ;
2031		slotp->size = 0; /* length of previous entry */
2032		break;
2033	case FOUND:
2034		/*
2035		 * An entry of the required size has been found. Use it.
2036		 */
2037		if (ep->d_ino == 0) {
2038			/* this is the 1st record in a block */
2039			slotp->size = 0; /* length of previous entry */
2040		} else {
2041			dsize = DIRSIZ(ep);
2042			nep = (struct direct *)((char *)ep + dsize);
2043			nep->d_reclen = ep->d_reclen - dsize;
2044			ep->d_reclen = dsize;
2045			slotp->ep = nep;
2046			slotp->offset += dsize;
2047			slotp->size = dsize; /* length of previous entry */
2048		}
2049		break;
2050	default:
2051		break;
2052	}
2053	return (0);
2054}
2055
2056/*
2057 * Allocate and initialize a new inode that will go into directory tdp.
2058 * This routine is called from ufs_symlink(), as well as within this file.
2059 */
2060int
2061ufs_dirmakeinode(
2062	struct inode *tdp,
2063	struct inode **ipp,
2064	struct vattr *vap,
2065	enum de_op op,
2066	struct cred *cr)
2067{
2068	struct inode *ip;
2069	enum vtype type;
2070	int imode;			/* mode and format as in inode */
2071	ino_t ipref;
2072	int err;
2073	timestruc_t now;
2074
2075	ASSERT(vap != NULL);
2076	ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR ||
2077	    op == DE_SYMLINK);
2078	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
2079	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
2080	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
2081	/*
2082	 * Allocate a new inode.
2083	 */
2084	type = vap->va_type;
2085	if (type == VDIR) {
2086		ipref = dirpref(tdp);
2087	} else {
2088		ipref = tdp->i_number;
2089	}
2090	if (op == DE_ATTRDIR)
2091		imode = vap->va_mode;
2092	else
2093		imode = MAKEIMODE(type, vap->va_mode);
2094	*ipp = NULL;
2095	err = ufs_ialloc(tdp, ipref, imode, &ip, cr);
2096	if (err)
2097		return (err);
2098
2099	/*
2100	 * We don't need to grab vfs_dqrwlock here because it is held
2101	 * in ufs_direnter_*() above us.
2102	 */
2103	ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock));
2104	rw_enter(&ip->i_contents, RW_WRITER);
2105	if (ip->i_dquot != NULL) {
2106		err = ufs_fault(ITOV(ip),
2107		    "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)",
2108		    tdp->i_fs->fs_fsmnt);
2109		rw_exit(&ip->i_contents);
2110		return (err);
2111	}
2112	*ipp = ip;
2113	ip->i_mode = (o_mode_t)imode;
2114	if (type == VBLK || type == VCHR) {
2115		dev_t d = vap->va_rdev;
2116		dev32_t dev32;
2117
2118		/*
2119		 * Don't allow a special file to be created with a
2120		 * dev_t that cannot be represented by this filesystem
2121		 * format on disk.
2122		 */
2123		if (!cmpldev(&dev32, d)) {
2124			err = EOVERFLOW;
2125			goto fail;
2126		}
2127
2128		ITOV(ip)->v_rdev = ip->i_rdev = d;
2129
2130		if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
2131			ip->i_ordev = dev32; /* can't use old format */
2132		} else {
2133			ip->i_ordev = cmpdev(d);
2134		}
2135	}
2136	ITOV(ip)->v_type = type;
2137	ufs_reset_vnode(ip->i_vnode);
2138	if (type == VDIR) {
2139		ip->i_nlink = 2; /* anticipating a call to dirmakedirect */
2140	} else {
2141		ip->i_nlink = 1;
2142	}
2143
2144	if (op == DE_ATTRDIR) {
2145		ip->i_uid = vap->va_uid;
2146		ip->i_gid = vap->va_gid;
2147	} else
2148		ip->i_uid = crgetuid(cr);
2149	/*
2150	 * To determine the group-id of the created file:
2151	 *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
2152	 *	clients are not likely to set the gid), then use it if
2153	 *	the process is privileged, belongs to the target group,
2154	 *	or the group is the same as the parent directory.
2155	 *   2) If the filesystem was not mounted with the Old-BSD-compatible
2156	 *	GRPID option, and the directory's set-gid bit is clear,
2157	 *	then use the process's gid.
2158	 *   3) Otherwise, set the group-id to the gid of the parent directory.
2159	 */
2160	if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) &&
2161	    ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) ||
2162	    secpolicy_vnode_create_gid(cr) == 0)) {
2163		/*
2164		 * XXX - is this only the case when a 4.0 NFS client, or a
2165		 * client derived from that code, makes a call over the wire?
2166		 */
2167		ip->i_gid = vap->va_gid;
2168	} else
2169		ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr);
2170
2171	/*
2172	 * For SunOS 5.0->5.4, the lines below read:
2173	 *
2174	 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
2175	 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
2176	 *
2177	 * where MAXUID was set to 60002.  See notes on this in ufs_inode.c
2178	 */
2179	ip->i_suid =
2180	    (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ? UID_LONG : ip->i_uid;
2181	ip->i_sgid =
2182	    (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ? GID_LONG : ip->i_gid;
2183
2184	/*
2185	 * If we're creating a directory, and the parent directory has the
2186	 * set-GID bit set, set it on the new directory.
2187	 * Otherwise, if the user is neither privileged nor a member of the
2188	 * file's new group, clear the file's set-GID bit.
2189	 */
2190	if ((tdp->i_mode & ISGID) && (type == VDIR))
2191		ip->i_mode |= ISGID;
2192	else {
2193		if ((ip->i_mode & ISGID) &&
2194		    secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0)
2195			ip->i_mode &= ~ISGID;
2196	}
2197
2198	if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2199	    ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2200		err = EOVERFLOW;
2201		goto fail;
2202	}
2203
2204	/*
2205	 * Extended attribute directories are not subject to quotas.
2206	 */
2207	if (op != DE_ATTRDIR)
2208		ip->i_dquot = getinoquota(ip);
2209	else
2210		ip->i_dquot = NULL;
2211
2212	if (op == DE_MKDIR || op == DE_ATTRDIR) {
2213		err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 0 : 1, cr);
2214		if (err)
2215			goto fail;
2216	}
2217
2218	/*
2219	 * generate the shadow inode and attach it to the new object
2220	 */
2221	ASSERT((tdp->i_shadow && tdp->i_ufs_acl) ||
2222	    (!tdp->i_shadow && !tdp->i_ufs_acl));
2223	if (tdp->i_shadow && tdp->i_ufs_acl &&
2224	    (((tdp->i_mode & IFMT) == IFDIR) ||
2225	    ((tdp->i_mode & IFMT) == IFATTRDIR))) {
2226		err = ufs_si_inherit(ip, tdp, ip->i_mode, cr);
2227		if (err) {
2228			if (op == DE_MKDIR) {
2229				/*
2230				 * clean up parent directory
2231				 *
2232				 * tdp->i_contents already locked from
2233				 * ufs_direnter_*()
2234				 */
2235				tdp->i_nlink--;
2236				TRANS_INODE(tdp->i_ufsvfs, tdp);
2237				tdp->i_flag |= ICHG;
2238				tdp->i_seq++;
2239				ufs_iupdat(tdp, I_SYNC);
2240			}
2241			goto fail;
2242		}
2243	}
2244
2245	/*
2246	 * If the passed in attributes contain atime and/or mtime
2247	 * settings, then use them instead of using the current
2248	 * high resolution time.
2249	 */
2250	if (vap->va_mask & (AT_MTIME|AT_ATIME)) {
2251		if (vap->va_mask & AT_ATIME) {
2252			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
2253			ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2254			ip->i_flag &= ~IACC;
2255		} else
2256			ip->i_flag |= IACC;
2257		if (vap->va_mask & AT_MTIME) {
2258			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
2259			ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2260			gethrestime(&now);
2261			if (now.tv_sec > TIME32_MAX) {
2262				/*
2263				 * In 2038, ctime sticks forever..
2264				 */
2265				ip->i_ctime.tv_sec = TIME32_MAX;
2266				ip->i_ctime.tv_usec = 0;
2267			} else {
2268				ip->i_ctime.tv_sec = now.tv_sec;
2269				ip->i_ctime.tv_usec = now.tv_nsec / 1000;
2270			}
2271			ip->i_flag &= ~(IUPD|ICHG);
2272			ip->i_flag |= IMODTIME;
2273		} else
2274			ip->i_flag |= IUPD|ICHG;
2275		ip->i_flag |= IMOD;
2276	} else
2277		ip->i_flag |= IACC|IUPD|ICHG;
2278	ip->i_seq++;
2279
2280	/*
2281	 * If this is an attribute tag it as one.
2282	 */
2283	if ((tdp->i_mode & IFMT) == IFATTRDIR) {
2284		ip->i_cflags |= IXATTR;
2285	}
2286
2287	/*
2288	 * push inode before it's name appears in a directory
2289	 */
2290	TRANS_INODE(ip->i_ufsvfs, ip);
2291	ufs_iupdat(ip, I_SYNC);
2292	rw_exit(&ip->i_contents);
2293	return (0);
2294
2295fail:
2296	/* Throw away inode we just allocated. */
2297	ip->i_nlink = 0;
2298	ufs_setreclaim(ip);
2299	TRANS_INODE(ip->i_ufsvfs, ip);
2300	ip->i_flag |= ICHG;
2301	ip->i_seq++;
2302	ITIMES_NOLOCK(ip);
2303	rw_exit(&ip->i_contents);
2304	return (err);
2305}
2306
/*
 * Write a prototype directory into the empty inode ip, whose parent is dp.
 *
 * The caller must hold ip->i_contents, dp->i_rwlock and dp->i_contents
 * as writer (asserted below).
 *
 * When attrdir is zero the parent's link count is incremented to account
 * for the new ".." entry; when attrdir is non-zero the parent's link count
 * is left untouched.
 *
 * Returns 0 on success.  Otherwise returns EMLINK if the parent is already
 * at MAXLINK (checked regardless of attrdir), or the error produced by
 * BMAPALLOC(), ufs_fault(), fbread(), TRANS_DIR() or ufs_fbwrite().
 */
static int
ufs_dirmakedirect(
	struct inode *ip,		/* new directory */
	struct inode *dp,		/* parent directory */
	int	attrdir,		/* non-zero: leave dp->i_nlink alone */
	struct cred *cr)
{
	struct dirtemplate *dirp;
	struct fbuf *fbp;
	int err;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&dp->i_contents));
	/*
	 * Allocate space for the directory we're creating.
	 */
	err = BMAPALLOC(ip, (u_offset_t)0, DIRBLKSIZ, cr);
	if (err)
		return (err);
	/*
	 * A directory block must fit inside a single fragment; anything
	 * else is reported through ufs_fault().
	 */
	if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
		err = ufs_fault(ITOV(dp),
"ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)",
		    DIRBLKSIZ, dp->i_fs->fs_fsize,
		    dp->i_fs->fs_fsmnt);
		return (err);
	}
	ip->i_size = DIRBLKSIZ;
	TRANS_INODE(ip->i_ufsvfs, ip);
	ip->i_flag |= IUPD|ICHG|IATTCHG;
	ip->i_seq++;
	ITIMES_NOLOCK(ip);
	/*
	 * Update the parent (dp) link count and write out the change.
	 * This reflects the ".." entry we'll soon write.
	 *
	 * Note that the returns above and the EMLINK return below happen
	 * before dp->i_nlink is touched, so they bypass the "fail" label,
	 * which exists only to undo the increment.
	 */
	if (dp->i_nlink == MAXLINK)
		return (EMLINK);
	if (attrdir == 0)
		dp->i_nlink++;
	TRANS_INODE(dp->i_ufsvfs, dp);
	dp->i_flag |= ICHG;
	dp->i_seq++;
	ufs_iupdat(dp, I_SYNC);
	/*
	 * Initialize directory with "."
	 * and ".." from static template.
	 *
	 * Since the parent directory is locked, we don't have to
	 * worry about anything changing when we drop the write
	 * lock on (ip).
	 *
	 */
	err = fbread(ITOV(ip), (offset_t)0, (uint_t)ip->i_fs->fs_fsize,
	    S_READ, &fbp);

	if (err) {
		goto fail;
	}
	dirp = (struct dirtemplate *)fbp->fb_addr;
	/*
	 * Now initialize the directory we're creating
	 * with the "." and ".." entries.
	 */
	*dirp = mastertemplate;			/* structure assignment */
	dirp->dot_ino = (uint32_t)ip->i_number;
	dirp->dotdot_ino = (uint32_t)dp->i_number;

	err = TRANS_DIR(ip, 0);
	if (err) {
		/* The block was never handed to ufs_fbwrite(); drop it here. */
		fbrelse(fbp, S_OTHER);
		goto fail;
	}

	/*
	 * NOTE(review): fbp is not released on the failure path below, so
	 * ufs_fbwrite() presumably disposes of it in all cases -- confirm.
	 */
	err = ufs_fbwrite(fbp, ip);
	if (err) {
		goto fail;
	}

	return (0);

fail:
	/*
	 * Undo the parent link count adjustment made above (only made when
	 * attrdir == 0) and push the parent inode out again.
	 */
	if (attrdir == 0)
		dp->i_nlink--;
	TRANS_INODE(dp->i_ufsvfs, dp);
	dp->i_flag |= ICHG;
	dp->i_seq++;
	ufs_iupdat(dp, I_SYNC);
	return (err);
}
2400
/*
 * Delete a directory entry.  If oip is nonzero the entry is checked
 * to make sure it still reflects oip.
 *
 *	dp	directory holding the entry; dp->i_rwlock must be held as
 *		writer on entry (asserted) and is held again on return.
 *	namep	name of the entry to remove; "." and ".." are refused.
 *	oip	if non-NULL, the inode the entry is expected to name;
 *		a mismatch yields ENOENT.
 *	cdir	vnode compared against the victim for DR_RMDIR; a match
 *		yields EINVAL.
 *	op	DR_RMDIR, DR_REMOVE or DR_RENAME; selects the extra checks
 *		and the link count handling below.
 *	cr	credentials used for the access and policy checks.
 *
 * Returns 0 on success.  Errors generated directly here are ENOENT,
 * EINVAL, EEXIST, EBUSY, ENOTDIR and EPERM; errors from
 * ufs_diraccess(), ufs_dircheckforname(), ufs_sticky_remove_access(),
 * TRANS_DIR() and ufs_fbwrite() are passed through.
 *
 * If the victim is a directory whose i_rwlock cannot be taken at once,
 * every lock taken here AND dp->i_rwlock are dropped, the thread delays,
 * dp->i_rwlock is re-taken and the whole operation restarts at "retry".
 */
int
ufs_dirremove(
	struct inode *dp,
	char *namep,
	struct inode *oip,
	struct vnode *cdir,
	enum dr_op op,
	struct cred *cr)
{
	struct direct *ep, *pep, *nep;
	struct inode *ip;
	vnode_t *dvp, *vp;
	struct ufs_slot slot;
	int namlen;
	int err;
	int mode;
	ushort_t extra;

	namlen = (int)strlen(namep);
	if (namlen == 0) {
		struct fs	*fs = dp->i_fs;

		cmn_err(CE_WARN, "%s: ufs_dirremove: attempted to remove"
		    " nameless file in directory (directory inode %llu)",
		    fs->fs_fsmnt, (u_longlong_t)dp->i_number);
		ASSERT(namlen != 0);

		return (ENOENT);
	}

	/*
	 * return error when removing . and ..
	 */
	if (namep[0] == '.') {
		if (namlen == 1)
			return (EINVAL);
		else if (namlen == 2 && namep[1] == '.') {
			return (EEXIST);	/* SIGH should be ENOTEMPTY */
		}
	}

	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));

retry:
	/*
	 * Check accessibility of directory.
	 */
	if (err = ufs_diraccess(dp, IEXEC|IWRITE, cr))
		return (err);

	ip = NULL;
	slot.fbp = NULL;
	slot.status = FOUND;	/* don't need to look for empty slot */
	rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
	rw_enter(&dp->i_contents, RW_WRITER);

	/*
	 * Look the name up.  On success ip is the entry's inode (or NULL
	 * if there is no such entry) and slot describes where the entry
	 * lives.  The reference on ip is dropped via VN_RELE() at the end
	 * of this function or on the retry path.
	 */
	err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0);
	if (err)
		goto out_novfs;
	if (ip == NULL) {
		err = ENOENT;
		goto out_novfs;
	}
	vp = ITOV(ip);
	if (oip && oip != ip) {
		err = ENOENT;
		goto out_novfs;
	}

	/*
	 * "mode" is sampled once here and decides, for the rest of the
	 * function, whether vn_vfsrlock() and ip->i_rwlock are held and
	 * therefore need to be released.
	 */
	mode = ip->i_mode & IFMT;
	if (mode == IFDIR || mode == IFATTRDIR) {

		/*
		 * vn_vfsrlock() prevents races between mount and rmdir.
		 */
		if (vn_vfsrlock(vp)) {
			err = EBUSY;
			goto out_novfs;
		}
		if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) {
			err = EBUSY;
			goto out;
		}
		/*
		 * If we are removing a directory, get a lock on it.
		 * Taking a writer lock prevents a parallel ufs_dirlook from
		 * incorrectly entering a negative cache vnode entry in the dnlc
		 * If the directory is empty, it will stay empty until
		 * we can remove it.
		 */
		if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) {
			/*
			 * It is possible that a thread in rename would have
			 * acquired this rwlock. To prevent a deadlock we
			 * do a rw_tryenter. If we fail to get the lock
			 * we drop all the locks we have acquired, wait
			 * for 2 ticks and reacquire the
			 * directory's (dp) i_rwlock and try again.
			 * If we don't drop dp's i_rwlock then we will panic
			 * with a "Deadlock: cycle in blocking chain"
			 * since in ufs_dircheckpath we want dp's i_rwlock.
			 * dp is guaranteed to exist since ufs_dirremove is
			 * called after a VN_HOLD(dp) has been done.
			 */
			ufs_dirremove_retry_cnt++;
			vn_vfsunlock(vp);
			if (slot.fbp)
				fbrelse(slot.fbp, S_OTHER);
			rw_exit(&dp->i_contents);
			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
			rw_exit(&dp->i_rwlock);
			VN_RELE(vp);
			delay(2);
			rw_enter(&dp->i_rwlock, RW_WRITER);
			goto retry;
		}
	}
	rw_enter(&ip->i_contents, RW_READER);

	/*
	 * Now check the restrictions that apply on sticky directories.
	 */
	if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) {
		rw_exit(&ip->i_contents);
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	if (op == DR_RMDIR) {
		/*
		 * For rmdir(2), some special checks are required.
		 * (a) Don't remove any alias of the parent (e.g. ".").
		 * (b) Don't remove the current directory.
		 * (c) Make sure the entry is (still) a directory.
		 * (d) Make sure the directory is empty.
		 */

		if (dp == ip || vp == cdir)
			err = EINVAL;
		else if (((ip->i_mode & IFMT) != IFDIR) &&
		    ((ip->i_mode & IFMT) != IFATTRDIR))
			err = ENOTDIR;
		else if ((ip->i_nlink > 2) ||
		    !ufs_dirempty(ip, dp->i_number, cr)) {
			err = EEXIST;	/* SIGH should be ENOTEMPTY */
		}

		if (err) {
			rw_exit(&ip->i_contents);
			if (mode == IFDIR || mode == IFATTRDIR)
				rw_exit(&ip->i_rwlock);
			goto out;
		}
	} else if (op == DR_REMOVE)  {
		/*
		 * unlink(2) requires a different check: allow only
		 * privileged users to unlink a directory.
		 *
		 * NOTE(review): ip->i_rwlock is released unconditionally
		 * here; this assumes v_type == VDIR implies mode was
		 * IFDIR/IFATTRDIR so the lock was taken above -- confirm.
		 */
		if (vp->v_type == VDIR &&
		    secpolicy_fs_linkdir(cr, vp->v_vfsp)) {
			err = EPERM;
			rw_exit(&ip->i_contents);
			rw_exit(&ip->i_rwlock);
			goto out;
		}
	}

	rw_exit(&ip->i_contents);

	/*
	 * Remove the cache'd entry, if any.
	 */
	dvp = ITOV(dp);
	dnlc_remove(dvp, namep);
	/* Mark the on-disk slot free; the buffer is written out below. */
	ep = slot.ep;
	ep->d_ino = 0;

	if (slot.cached) {
		dcanchor_t *dcap = &dp->i_danchor;

		/*
		 * Keep the directory cache in step with the on-disk change:
		 * drop the name, then fix up the free-space records.
		 */
		(void) dnlc_dir_rem_entry(dcap, namep, NULL);
		if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) {
			(void) dnlc_dir_rem_space_by_handle(dcap, slot.offset);
		}
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 * Note, the previous entry has already been
			 * validated in ufs_dircheckforname().
			 */
			ASSERT(slot.size);
			/* slot.size is the distance back to the previous entry */
			pep = (struct direct *)((char *)ep - slot.size);
			if ((pep->d_ino == 0) &&
			    ((uintptr_t)pep & (DIRBLKSIZ - 1))) {
				/*
				 * A free previous entry that is not at the
				 * start of a block: give up on the cache
				 * and take the uncached path's ending.
				 */
				dnlc_dir_purge(dcap);
				slot.cached = 0;
				goto nocache;
			}
			/* "extra" = free space the previous entry already had */
			if (pep->d_ino) {
				extra = pep->d_reclen - DIRSIZ(pep);
			} else {
				extra = pep->d_reclen;
			}
			if (extra >= LDIRSIZ(1)) {
				(void) dnlc_dir_rem_space_by_handle(dcap,
				    (uint64_t)(slot.offset - slot.size));
			}
			pep->d_reclen += ep->d_reclen;
			(void) dnlc_dir_add_space(dcap, extra + ep->d_reclen,
			    (uint64_t)(slot.offset - slot.size));
			/* adjust the previous pointer in the next entry */
			nep = (struct direct *)((char *)ep + ep->d_reclen);
			if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
				/*
				 * Not a new block.
				 *
				 * Check the validity of the entry.
				 * If it's bad, then throw away the cache and
				 * continue.
				 */
				if ((nep->d_reclen == 0) ||
				    (nep->d_reclen & 0x3) ||
				    (dnlc_dir_update(dcap, nep->d_name,
				    INO_OFF_TO_H(nep->d_ino,
				    slot.offset - slot.size)) == DNOENT)) {
					dnlc_dir_purge(dcap);
					slot.cached = 0;
				}
			}
		} else {
			/* First entry in its block: just record the free space. */
			(void) dnlc_dir_add_space(dcap, ep->d_reclen,
			    (uint64_t)slot.offset);
		}
	} else {
		/*
		 * If the entry isn't the first in the directory, we must
		 * reclaim the space of the now empty record by adding
		 * the record size to the size of the previous entry.
		 */
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 */
			pep = (struct direct *)((char *)ep - slot.size);
			pep->d_reclen += ep->d_reclen;
		}
	}
nocache:


	/*
	 * Log and write the modified directory block.  Either way the
	 * buffer is no longer ours afterwards, so slot.fbp is cleared to
	 * keep the common exit code from releasing it a second time.
	 */
	err = TRANS_DIR(dp, slot.offset);
	if (err)
		fbrelse(slot.fbp, S_OTHER);
	else
		err = ufs_fbwrite(slot.fbp, dp);
	slot.fbp = NULL;

	/*
	 * If we were removing a directory, it is 'gone' now, but we cannot
	 * unlock it as a thread may be waiting for the lock in ufs_create. If
	 * we did, it could then create a file in a deleted directory.
	 */

	if (err) {
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	rw_enter(&ip->i_contents, RW_WRITER);

	dp->i_flag |= IUPD|ICHG;
	dp->i_seq++;
	ip->i_flag |= ICHG;
	ip->i_seq++;

	TRANS_INODE(dp->i_ufsvfs, dp);
	TRANS_INODE(ip->i_ufsvfs, ip);
	/*
	 * Now dispose of the inode.
	 */
	if (ip->i_nlink > 0) {
		/*
		 * This is not done for IFATTRDIR's because they don't
		 * have entries in the dnlc and the link counts are
		 * not incremented when they are created.
		 */
		if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) {
			/*
			 * Decrement by 2 because we're trashing the "."
			 * entry as well as removing the entry in dp.
			 * Clear the directory entry, but there may be
			 * other hard links so don't free the inode.
			 * Decrement the dp linkcount because we're
			 * trashing the ".." entry.
			 */
			ip->i_nlink -= 2;
			dp->i_nlink--;
			ufs_setreclaim(dp);
			/*
			 * XXX need to discard negative cache entries
			 * for vp.  See comment in ufs_delete().
			 */
			dnlc_remove(vp, ".");
			dnlc_remove(vp, "..");
			/*
			 * The return value is ignored here because if
			 * the directory purge fails we don't want to
			 * stop the delete. If ufs_dirpurgedotdot fails
			 * the delete will continue with the preexisting
			 * behavior.
			 */
			(void) ufs_dirpurgedotdot(ip, dp->i_number, cr);
		} else {
			ip->i_nlink--;
		}
		ufs_setreclaim(ip);
	}
	ITIMES_NOLOCK(dp);
	ITIMES_NOLOCK(ip);

	/* Only push the inodes synchronously when TRANS_ISTRANS() is false. */
	if (!TRANS_ISTRANS(dp->i_ufsvfs))
		ufs_iupdat(dp, I_SYNC);
	if (!TRANS_ISTRANS(ip->i_ufsvfs))
		ufs_iupdat(ip, I_SYNC);

	rw_exit(&ip->i_contents);
	if (mode == IFDIR || mode == IFATTRDIR)
		rw_exit(&ip->i_rwlock);
out:
	/*
	 * Reached only after the vn_vfsrlock() attempt; undo it if the
	 * victim was a directory.  Callers jumping here have already
	 * released ip->i_contents and ip->i_rwlock themselves.
	 */
	if (mode == IFDIR || mode == IFATTRDIR) {
		vn_vfsunlock(vp);
	}
out_novfs:
	ASSERT(RW_WRITE_HELD(&dp->i_contents));

	if (slot.fbp)
		fbrelse(slot.fbp, S_OTHER);

	rw_exit(&dp->i_contents);
	rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);

	/*
	 * Release (and delete) the inode after we drop vfs_dqrwlock to
	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
	 */
	if (ip)
		VN_RELE(vp);

	return (err);
}
2757
2758/*
2759 * Return buffer with contents of block "offset"
2760 * from the beginning of directory "ip".  If "res"
2761 * is non-zero, fill it in with a pointer to the
2762 * remaining space in the directory.
2763 *
2764 */
2765
2766int
2767blkatoff(
2768	struct inode *ip,
2769	off_t offset,
2770	char **res,
2771	struct fbuf **fbpp)
2772{
2773	struct fs *fs;
2774	struct fbuf *fbp;
2775	daddr_t lbn;
2776	uint_t bsize;
2777	int err;
2778
2779	CPU_STATS_ADD_K(sys, ufsdirblk, 1);
2780	fs = ip->i_fs;
2781	lbn = (daddr_t)lblkno(fs, offset);
2782	bsize = (uint_t)blksize(fs, ip, lbn);
2783	err = fbread(ITOV(ip), (offset_t)(offset & fs->fs_bmask),
2784	    bsize, S_READ, &fbp);
2785	if (err) {
2786		*fbpp = (struct fbuf *)NULL;
2787		return (err);
2788	}
2789	if (res)
2790		*res = fbp->fb_addr + blkoff(fs, offset);
2791	*fbpp = fbp;
2792	return (0);
2793}
2794
2795/*
2796 * Do consistency checking:
2797 *	record length must be multiple of 4
2798 *	entry must fit in rest of its DIRBLKSIZ block
2799 *	record must be large enough to contain entry
2800 *	name is not longer than MAXNAMLEN
2801 *	name must be as long as advertised, and null terminated
2802 * NOTE: record length must not be zero (should be checked previously).
2803 *       This routine is only called if dirchk is true.
2804 *       It would be nice to set the FSBAD flag in the super-block when
2805 *       this routine fails so that a fsck is forced on next reboot,
2806 *       but locking is a problem.
2807 */
2808static int
2809dirmangled(
2810	struct inode *dp,
2811	struct direct *ep,
2812	int entryoffsetinblock,
2813	off_t offset)
2814{
2815	int i;
2816
2817	i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
2818	if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i ||
2819	    (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN ||
2820	    ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) {
2821		dirbad(dp, "mangled entry", offset);
2822		return (1);
2823	}
2824	return (0);
2825}
2826
2827static void
2828dirbad(struct inode *ip, char *how, off_t offset)
2829{
2830	cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s",
2831	    ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how);
2832}
2833
/*
 * Check that the name at sp is exactly l bytes long and NUL terminated.
 *
 * Returns 1 if a NUL byte is found within the first l bytes.  Otherwise
 * returns the byte that follows them, which is 0 for a well-formed name
 * and non-zero when the terminator is missing.
 */
static int
dirbadname(char *sp, int l)
{
	char *cp = sp;
	int left;

	/* The name proper must not contain an embedded NUL. */
	for (left = l; left != 0; left--) {
		if (*cp == '\0') {
			return (1);
		}
		cp++;
	}

	/* The byte just past the name must be the terminating NUL. */
	return (*cp);
}
2844
2845/*
2846 * Check if a directory is empty or not.
2847 */
2848static int
2849ufs_dirempty(
2850	struct inode *ip,
2851	ino_t parentino,
2852	struct cred *cr)
2853{
2854	return (ufs_dirscan(ip, parentino, cr, 0));
2855}
2856
2857/*
2858 * clear the .. directory entry.
2859 */
2860static int
2861ufs_dirpurgedotdot(
2862	struct inode *ip,
2863	ino_t parentino,
2864	struct cred *cr)
2865{
2866	return (ufs_dirscan(ip, parentino, cr, 1));
2867}
2868
2869/*
2870 * Scan the directoy. If clr_dotdot is true clear the ..
2871 * directory else check to see if the directory is empty.
2872 *
2873 * Using a struct dirtemplate here is not precisely
2874 * what we want, but better than using a struct direct.
2875 *
2876 * clr_dotdot is used as a flag to tell us if we need
2877 * to clear the dotdot entry
2878 *
2879 * N.B.: does not handle corrupted directories.
2880 */
2881static int
2882ufs_dirscan(
2883	struct inode *ip,
2884	ino_t parentino,
2885	struct cred *cr,
2886	int clr_dotdot)
2887{
2888	offset_t off;
2889	struct dirtemplate dbuf;
2890	struct direct *dp = (struct direct *)&dbuf;
2891	int err, count;
2892	int empty = 1;	/* Assume it's empty */
2893#define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
2894
2895	ASSERT(RW_LOCK_HELD(&ip->i_contents));
2896
2897	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
2898	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
2899		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
2900		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
2901		/*
2902		 * Since we read MINDIRSIZ, residual must
2903		 * be 0 unless we're at end of file.
2904		 */
2905		if (err || count != 0 || dp->d_reclen == 0) {
2906			empty = 0;
2907			break;
2908		}
2909		/* skip empty entries */
2910		if (dp->d_ino == 0)
2911			continue;
2912		/* accept only "." and ".." */
2913		if (dp->d_namlen > 2 || dp->d_name[0] != '.') {
2914			empty = 0;
2915			break;
2916		}
2917		/*
2918		 * At this point d_namlen must be 1 or 2.
2919		 * 1 implies ".", 2 implies ".." if second
2920		 * char is also "."
2921		 */
2922		if (dp->d_namlen == 1)
2923			continue;
2924		if (dp->d_name[1] == '.' &&
2925		    (ino_t)dp->d_ino == parentino) {
2926			/*
2927			 * If we're doing a purge we need to check for
2928			 * the . and .. entries and clear the d_ino for ..
2929			 *
2930			 * if clr_dotdot is set ufs_dirscan does not
2931			 * check for an empty directory.
2932			 */
2933			if (clr_dotdot) {
2934				/*
2935				 * Have to actually zap the ..
2936				 * entry in the directory, as
2937				 * otherwise someone might have
2938				 * dp as its cwd and try to
2939				 * open .., which now points to
2940				 * an unallocated inode.
2941				 */
2942				empty = ufs_dirclrdotdot(ip, parentino);
2943				break;
2944			} else {
2945				continue;
2946			}
2947		}
2948		empty = 0;
2949		break;
2950	}
2951	return (empty);
2952}
2953
clock_t retry_backoff_delay = 1; /* passed to delay() before retrying the i_rwlock */
uint64_t dircheck_retry_cnt;	/* incremented by ufs_dircheckpath() on each such retry */
/*
 * Check if source directory inode is in the path of the target directory.
 * Target is supplied locked.
 *
 * The source and target inode's should be different upon entry.
 *
 * The walk starts at target and follows each directory's ".." entry
 * upwards.  target->i_rwlock and sdp->i_rwlock must be held by the
 * caller (asserted); target's lock and hold are never released here.
 * Every intermediate directory is held and read-locked while it is being
 * examined and released before moving on.
 *
 * Returns:
 *	0	the walk reached UFSROOTINO, or reached sdp, without
 *		meeting source_ino
 *	EINVAL	target is source_ino, or source_ino is one of its ancestors
 *	ENOTDIR	a directory on the way up failed the sanity checks
 *	EAGAIN	an ancestor's i_rwlock could not be taken and the lock
 *		looked write-wanted; the caller is expected to back off
 *	other	error from blkatoff() or ufs_iget_alloced()
 */
int
ufs_dircheckpath(
	ino_t source_ino,
	struct inode *target,
	struct inode *sdp,
	struct cred *cr)
{
	struct fbuf *fbp;
	struct dirtemplate *dirp;
	struct inode *ip;
	struct ufsvfs *ufsvfsp;
	struct inode *tip;
	ino_t dotdotino;
	int err;

	ASSERT(target->i_ufsvfs != NULL);
	ASSERT(RW_LOCK_HELD(&target->i_rwlock));
	ASSERT(RW_LOCK_HELD(&sdp->i_rwlock));

	/*
	 * Trivial cases first.  Both jump to "out" with ip == target, so
	 * nothing is unlocked or released there and fbp is not consulted.
	 */
	ip = target;
	if (ip->i_number == source_ino) {
		err = EINVAL;
		goto out;
	}
	if (ip->i_number == UFSROOTINO) {
		err = 0;
		goto out;
	}
	/*
	 * Search back through the directory tree, using the ".." entries.
	 * Fail any attempt to move a directory into an ancestor directory.
	 */
	fbp = NULL;
	for (;;) {
		struct vfs	*vfs;

		/* Map the first block of ip; it holds "." and "..". */
		err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp);
		if (err)
			break;
		if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 ||
		    ip->i_size < sizeof (struct dirtemplate)) {
			dirbad(ip, "bad size, unlinked or not dir", (off_t)0);
			err = ENOTDIR;
			break;
		}
		if (dirp->dotdot_namlen != 2 ||
		    dirp->dotdot_name[0] != '.' ||
		    dirp->dotdot_name[1] != '.') {
			dirbad(ip, "mangled .. entry", (off_t)0);
			err = ENOTDIR;		/* Sanity check */
			break;
		}
		dotdotino = (ino_t)dirp->dotdot_ino;
		if (dotdotino == source_ino) {
			err = EINVAL;
			break;
		}
		/* Reached the root without meeting source_ino: err is 0. */
		if (dotdotino == UFSROOTINO)
			break;
		if (fbp) {
			fbrelse(fbp, S_OTHER);
			fbp = NULL;
		}
		/*
		 * Remember what is needed to fetch the parent before ip is
		 * (possibly) released.
		 */
		vfs = ip->i_vfs;
		ufsvfsp = ip->i_ufsvfs;

		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
		/*
		 * Race to get the inode.
		 */
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
		if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) {
			rw_exit(&ufsvfsp->vfs_dqrwlock);
			/* Nothing left that "out" would need to release. */
			ip = NULL;
			break;
		}
		rw_exit(&ufsvfsp->vfs_dqrwlock);
		/*
		 * If the directory of the source inode (also a directory)
		 * is the same as this next entry up the chain, then
		 * we know the source directory itself can't be in the
		 * chain. This also prevents a panic because we already
		 * have sdp->i_rwlock locked.
		 */
		if (tip == sdp) {
			/* err is 0 here (ufs_iget_alloced() succeeded). */
			VN_RELE(ITOV(tip));
			ip = NULL;
			break;
		}
		ip = tip;

		/*
		 * If someone has set the WRITE_WANTED bit in this lock and if
		 * this happens to be a sdp or tdp of another parallel rename
		 * which is executing  the same code and in similar situation
		 * we end up in a 4 way deadlock. We need to make sure that
		 * the WRITE_WANTED bit is not  set.
		 */
retry_lock:
		if (!rw_tryenter(&ip->i_rwlock, RW_READER)) {
			/*
			 * If the lock held as WRITER that's fine but if it
			 * has WRITE_WANTED bit set we might end up in a
			 * deadlock. If WRITE_WANTED is set we return
			 * with EAGAIN else we just go back and try.
			 */
			if (RW_ISWRITER(&ip->i_rwlock) &&
			    !(RW_WRITE_HELD(&ip->i_rwlock))) {
				/*
				 * ip is held but not locked at this point,
				 * so only the hold is dropped.  fbp was
				 * already released and cleared above.
				 */
				err = EAGAIN;
				if (fbp) {
					fbrelse(fbp, S_OTHER);
				}
				VN_RELE(ITOV(ip));
				return (err);
			} else {
				/*
				 * The lock is being write held. We could
				 * just do a rw_enter here but there is a
				 * window between the check and now, where
				 * the status could have changed, so to
				 * avoid looping we backoff and go back to
				 * try for the lock.
				 */
				delay(retry_backoff_delay);
				dircheck_retry_cnt++;
				goto retry_lock;
			}
		}
	}
	if (fbp) {
		fbrelse(fbp, S_OTHER);
	}
out:
	/*
	 * Drop the lock and hold on the directory the walk stopped at,
	 * unless that is target itself (owned by the caller) or ip was
	 * cleared above because nothing is held any more.
	 */
	if (ip) {
		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
	}
	return (err);
}
3106
3107int
3108ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr)
3109{
3110	offset_t off;
3111	struct dirtemplate dbuf;
3112	struct direct *dp = (struct direct *)&dbuf;
3113	int err, count;
3114	int empty = 1;	/* Assume it's empty */
3115#define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
3116
3117	ASSERT(RW_LOCK_HELD(&ip->i_contents));
3118
3119	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
3120	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
3121		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
3122		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
3123		/*
3124		 * Since we read MINDIRSIZ, residual must
3125		 * be 0 unless we're at end of file.
3126		 */
3127
3128		if (err || count != 0 || dp->d_reclen == 0) {
3129			empty = 0;
3130			break;
3131		}
3132		/* skip empty entries */
3133		if (dp->d_ino == 0)
3134			continue;
3135		/*
3136		 * At this point d_namlen must be 1 or 2.
3137		 * 1 implies ".", 2 implies ".." if second
3138		 * char is also "."
3139		 */
3140
3141		if (dp->d_namlen == 1 && dp->d_name[0] == '.' &&
3142		    (ino_t)dp->d_ino == parentino)
3143			continue;
3144
3145		if (dp->d_namlen == 2 && dp->d_name[0] == '.' &&
3146		    dp->d_name[1] == '.') {
3147			continue;
3148		}
3149		empty = 0;
3150		break;
3151	}
3152	return (empty);
3153}
3154
3155
3156/*
3157 * Allocate and initialize a new shadow inode to contain extended attributes.
3158 */
3159int
3160ufs_xattrmkdir(
3161	struct inode *tdp,
3162	struct inode **ipp,
3163	int flags,
3164	struct cred *cr)
3165{
3166	struct inode *ip;
3167	struct vattr va;
3168	int err;
3169	int retry = 1;
3170	struct ufsvfs *ufsvfsp;
3171	struct ulockfs *ulp;
3172	int issync;
3173	int trans_size;
3174	int dorwlock;		/* 0 = not yet taken, */
3175				/* 1 = taken outside the transaction, */
3176				/* 2 = taken inside the transaction */
3177
3178	/*
3179	 * Validate permission to create attribute directory
3180	 */
3181
3182	if ((err = ufs_iaccess(tdp, IWRITE, cr, 1)) != 0) {
3183		return (err);
3184	}
3185
3186	if (vn_is_readonly(ITOV(tdp)))
3187		return (EROFS);
3188
3189	/*
3190	 * No need to re-init err after again:, since it's set before
3191	 * the next use of it.
3192	 */
3193again:
3194	dorwlock = 0;
3195	va.va_type = VDIR;
3196	va.va_uid = tdp->i_uid;
3197	va.va_gid = tdp->i_gid;
3198
3199	if ((tdp->i_mode & IFMT) == IFDIR) {
3200		va.va_mode = (o_mode_t)IFATTRDIR;
3201		va.va_mode |= tdp->i_mode & 0777;
3202	} else {
3203		va.va_mode = (o_mode_t)IFATTRDIR|0700;
3204		if (tdp->i_mode & 0040)
3205			va.va_mode |= 0750;
3206		if (tdp->i_mode & 0004)
3207			va.va_mode |= 0705;
3208	}
3209	va.va_mask = AT_TYPE|AT_MODE;
3210
3211	ufsvfsp = tdp->i_ufsvfs;
3212
3213	err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
3214	if (err)
3215		return (err);
3216
3217	/*
3218	 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
3219	 * This follows the protocol for read()/write().
3220	 */
3221	if (ITOV(tdp)->v_type != VDIR) {
3222		rw_enter(&tdp->i_rwlock, RW_WRITER);
3223		dorwlock = 1;
3224	}
3225
3226	if (ulp) {
3227		trans_size = (int)TOP_MKDIR_SIZE(tdp);
3228		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, trans_size);
3229	}
3230
3231	/*
3232	 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
3233	 * This follows the protocol established by
3234	 * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
3235	 */
3236	if (dorwlock == 0) {
3237		rw_enter(&tdp->i_rwlock, RW_WRITER);
3238		dorwlock = 2;
3239	}
3240	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
3241	rw_enter(&tdp->i_contents, RW_WRITER);
3242
3243	/*
3244	 * Suppress out of inodes messages if we will retry.
3245	 */
3246	if (retry)
3247		tdp->i_flag |= IQUIET;
3248	err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr);
3249	tdp->i_flag &= ~IQUIET;
3250
3251	if (err)
3252		goto fail;
3253
3254	if (flags) {
3255
3256		/*
3257		 * Now attach it to src file.
3258		 */
3259
3260		tdp->i_oeftflag = ip->i_number;
3261	}
3262
3263	ip->i_cflags |= IXATTR;
3264	ITOV(ip)->v_flag |= V_XATTRDIR;
3265	TRANS_INODE(ufsvfsp, tdp);
3266	tdp->i_flag |= ICHG | IUPD;
3267	tdp->i_seq++;
3268	ufs_iupdat(tdp, I_SYNC);
3269	rw_exit(&tdp->i_contents);
3270	rw_exit(&ufsvfsp->vfs_dqrwlock);
3271
3272	rw_enter(&ip->i_rwlock, RW_WRITER);
3273	rw_enter(&ip->i_contents, RW_WRITER);
3274	TRANS_INODE(ufsvfsp, ip);
3275	ip->i_flag |= ICHG| IUPD;
3276	ip->i_seq++;
3277	ufs_iupdat(ip, I_SYNC);
3278	rw_exit(&ip->i_contents);
3279	rw_exit(&ip->i_rwlock);
3280	if (dorwlock == 2)
3281		rw_exit(&tdp->i_rwlock);
3282	if (ulp) {
3283		int terr = 0;
3284
3285		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
3286		ufs_lockfs_end(ulp);
3287		if (err == 0)
3288			err = terr;
3289	}
3290	if (dorwlock == 1)
3291		rw_exit(&tdp->i_rwlock);
3292	*ipp = ip;
3293	return (err);
3294
3295fail:
3296	rw_exit(&tdp->i_contents);
3297	rw_exit(&ufsvfsp->vfs_dqrwlock);
3298	if (dorwlock == 2)
3299		rw_exit(&tdp->i_rwlock);
3300	if (ulp) {
3301		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
3302		ufs_lockfs_end(ulp);
3303	}
3304	if (dorwlock == 1)
3305		rw_exit(&tdp->i_rwlock);
3306	if (ip != NULL)
3307		VN_RELE(ITOV(ip));
3308
3309	/*
3310	 * No inodes?  See if any are tied up in pending deletions.
3311	 * This has to be done outside of any of the above, because
3312	 * the draining operation can't be done from inside a transaction.
3313	 */
3314	if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3315		ufs_delete_drain_wait(ufsvfsp, 1);
3316		retry = 0;
3317		goto again;
3318	}
3319
3320	return (err);
3321}
3322
3323/*
3324 * clear the dotdot directory entry.
3325 * Used by ufs_dirscan when clr_dotdot
3326 * flag is set and we're deleting a
3327 * directory.
3328 */
3329static int
3330ufs_dirclrdotdot(struct inode *ip, ino_t parentino)
3331{
3332	struct fbuf *fbp;
3333	struct direct *dotp, *dotdotp;
3334	int err = 0;
3335
3336	ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
3337	ASSERT(RW_LOCK_HELD(&ip->i_contents));
3338	err = blkatoff(ip, 0, NULL, &fbp);
3339	if (err) {
3340		return (err);
3341	}
3342
3343	dotp = (struct direct *)fbp->fb_addr;
3344	if ((dotp->d_namlen < (MAXNAMLEN + 1)) &&
3345	    ((DIRBLKSIZ - DIRSIZ(dotp)) >= (sizeof (struct dirtemplate) / 2))) {
3346		dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen);
3347		if ((dotdotp->d_namlen < (MAXNAMLEN + 1)) &&
3348		    ((DIRBLKSIZ - DIRSIZ(dotp)) >= dotdotp->d_reclen)) {
3349
3350			dotp->d_reclen += dotdotp->d_reclen;
3351			if (parentino == dotdotp->d_ino) {
3352				dotdotp->d_ino = 0;
3353				dotdotp->d_namlen = 0;
3354				dotdotp->d_reclen = 0;
3355			}
3356
3357			err = TRANS_DIR(ip, 0);
3358			if (err) {
3359				fbrelse(fbp, S_OTHER);
3360			} else {
3361				err = ufs_fbwrite(fbp, ip);
3362			}
3363		}
3364	} else {
3365		err = -1;
3366	}
3367	return (err);
3368}
3369