xref: /illumos-gate/usr/src/uts/common/fs/ufs/ufs_dir.c (revision 7c478bd95313f5f23a4c958a745db2134aa03244)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 
41 #pragma ident	"%Z%%M%	%I%	%E% SMI"
42 
43 /*
44  * Directory manipulation routines.
45  *
46  * When manipulating directories, the i_rwlock provides serialization
47  * since directories cannot be mmapped. The i_contents lock is redundant.
48  */
49 
50 #include <sys/types.h>
51 #include <sys/t_lock.h>
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/signal.h>
55 #include <sys/cred.h>
56 #include <sys/proc.h>
57 #include <sys/disp.h>
58 #include <sys/user.h>
59 #include <sys/vfs.h>
60 #include <sys/vnode.h>
61 #include <sys/stat.h>
62 #include <sys/mode.h>
63 #include <sys/buf.h>
64 #include <sys/uio.h>
65 #include <sys/dnlc.h>
66 #include <sys/fs/ufs_inode.h>
67 #include <sys/fs/ufs_fs.h>
68 #include <sys/mount.h>
69 #include <sys/fs/ufs_fsdir.h>
70 #include <sys/fs/ufs_trans.h>
71 #include <sys/fs/ufs_panic.h>
72 #include <sys/fs/ufs_quota.h>
73 #include <sys/errno.h>
74 #include <sys/debug.h>
75 #include <vm/seg.h>
76 #include <sys/sysmacros.h>
77 #include <sys/cmn_err.h>
78 #include <sys/cpuvar.h>
79 #include <sys/unistd.h>
80 #include <sys/policy.h>
81 
82 /*
83  * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ
84  */
85 #if !ISP2(DIRBLKSIZ)
86 #error	"DIRBLKSIZ not a power of 2"
87 #endif
88 
89 /*
90  * A virgin directory.
91  */
92 static struct dirtemplate mastertemplate = {
93 	0, 12, 1, ".",
94 	0, DIRBLKSIZ - 12, 2, ".."
95 };
96 
/*
 * LDIRSIZ(len): bytes needed by a directory entry whose name is "len"
 * characters long.  That is sizeof (struct direct) less MAXNAMLEN + 1
 * bytes (presumably the d_name array -- confirm), plus the name and one
 * terminating byte, rounded up to a multiple of 4.
 *
 * MAX_DIR_NAME_LEN(len): given an entry size "len", the size left once
 * the same fixed part and one terminating byte have been subtracted.
 *
 * Both macros parenthesize their argument so that expressions such as
 * LDIRSIZ(a ? b : c) expand as the caller intends.
 */
#define	LDIRSIZ(len) \
	((sizeof (struct direct) - (MAXNAMLEN + 1)) + (((len) + 1 + 3) & ~3))
#define	MAX_DIR_NAME_LEN(len) \
	(((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1)
101 
/*
 * The dnlc directory cache allows a 64 bit handle for directory entries.
 * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset
 * into the handle. Note, a 32 bit offset allows a 4GB directory, which
 * is way beyond what could be cached in memory by the directory
 * caching routines. So we are quite safe with this limit.
 * The macros below pack and unpack the handle:
 *
 *	H_TO_INO(h)		low 32 bits of h, the inumber
 *	H_TO_OFF(h)		high 32 bits of h, the offset, as an off_t
 *	INO_OFF_TO_H(ino, off)	handle with off in the high half and ino
 *				in the low half
 *
 * Each expansion is fully parenthesized.  INO_OFF_TO_H() truncates ino
 * to 32 bits before merging it, so a wider or sign-extended inumber
 * cannot overwrite the offset half of the handle.
 */
#define	H_TO_INO(h) ((uint32_t)((h) & UINT_MAX))
#define	H_TO_OFF(h) ((off_t)((h) >> 32))
#define	INO_OFF_TO_H(ino, off) \
	((uint64_t)(((uint64_t)(off) << 32) | (uint32_t)(ino)))
113 
/*
 * The average size of a typical on disk directory entry is about 16 bytes
 * and so defines AV_DIRECT_SHIFT : log2(16)
 * This define is only used to approximate the number of entries
 * in a directory. This is needed for dnlc_dir_start() which will
 * immediately return an error if the value is not within its acceptable
 * range of number of files in a directory.
 */
#define	AV_DIRECT_SHIFT 4

/*
 * If the directory size (from i_size) is at least the ufs_min_dir_cache
 * tunable then we request dnlc directory caching.
 * This has been found to be profitable after 1024 file names, hence the
 * default of 1024 average-sized entries expressed in bytes.
 */
int ufs_min_dir_cache = 1024 << AV_DIRECT_SHIFT;
129 
/*
 * dirchk: when nonzero, ufs_dirlook() runs dirmangled() on every entry it
 * scans instead of only on entries whose record length is misaligned.
 * Enabled by default on DEBUG kernels only.
 */
#ifdef DEBUG
int dirchk = 1;
#else /* !DEBUG */
int dirchk = 0;
#endif /* DEBUG */

/*
 * ufs_negative_cache: when nonzero, a failed lookup in a directory that
 * still has links is remembered in the dnlc as DNLC_NO_VNODE.
 */
int ufs_negative_cache = 1;

/*
 * NOTE(review): presumably counts retries in ufs_dirremove(); its users
 * are outside the part of the file reviewed here.
 */
uint64_t ufs_dirremove_retry_cnt;
137 
/*
 * Forward declarations of the file-local helpers.  These are old-style
 * declarations without parameter lists, so the compiler does not check
 * the arguments at the call sites against them.
 */
static void dirbad();
static int ufs_dircheckforname();
static int ufs_dirrename();
static int ufs_diraddentry();
static int ufs_dirempty();
static int ufs_dirscan();
static int ufs_dirclrdotdot();
static int ufs_dirfixdotdot();
static int ufs_dirpurgedotdot();
static int dirprepareentry();
static int ufs_dirmakedirect();
static int dirbadname();
static int dirmangled();
151 
152 /*
153  * Look for a given name in a directory.  On successful return, *ipp
154  * will point to the VN_HELD inode.
155  */
156 int
157 ufs_dirlook(
158 	struct inode *dp,
159 	char *namep,
160 	struct inode **ipp,
161 	struct cred *cr,
162 	int skipdnlc)			/* skip the 1st level dnlc */
163 {
164 	uint64_t handle;
165 	struct fbuf *fbp;		/* a buffer of directory entries */
166 	struct direct *ep;		/* the current directory entry */
167 	struct vnode *vp;
168 	struct vnode *dvp;		/* directory vnode ptr */
169 	dcanchor_t *dcap;
170 	off_t endsearch;		/* offset to end directory search */
171 	off_t offset;
172 	off_t start_off;		/* starting offset from middle search */
173 	off_t last_offset;		/* last offset */
174 	int entryoffsetinblock;		/* offset of ep in addr's buffer */
175 	int numdirpasses;		/* strategy for directory search */
176 	int namlen;			/* length of name */
177 	int err;
178 	int doingchk;
179 	int i;
180 	int caching;
181 	ino_t ep_ino;			/* entry i number */
182 	ino_t chkino;
183 	ushort_t ep_reclen;		/* direct local d_reclen */
184 
185 	ASSERT(*namep != '\0'); /* All callers ensure *namep is non null */
186 
187 	/*
188 	 * Check accessibility of directory.
189 	 */
190 	if (((dp->i_mode & IFMT) != IFDIR) &&
191 	    ((dp->i_mode & IFMT) != IFATTRDIR))
192 		return (ENOTDIR);
193 
194 	if (err = ufs_iaccess(dp, IEXEC, cr))
195 		return (err);
196 
197 	/*
198 	 * Check the directory name lookup cache, first for individual files
199 	 * then for complete directories.
200 	 */
201 	dvp = ITOV(dp);
202 	if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) {
203 		/* vp is already held from dnlc_lookup */
204 		if (vp == DNLC_NO_VNODE) {
205 			VN_RELE(vp);
206 			return (ENOENT);
207 		}
208 		*ipp = VTOI(vp);
209 		return (0);
210 	}
211 
212 	dcap = &dp->i_danchor;
213 
214 	/*
215 	 * Grab the reader lock on the directory data before checking
216 	 * the dnlc to avoid a race with ufs_dirremove() & friends.
217 	 */
218 	rw_enter(&dp->i_rwlock, RW_READER);
219 
220 	switch (dnlc_dir_lookup(dcap, namep, &handle)) {
221 	case DFOUND:
222 		ep_ino = (ino_t)H_TO_INO(handle);
223 		if (dp->i_number == ep_ino) {
224 			VN_HOLD(dvp);	/* want ourself, "." */
225 			*ipp = dp;
226 			rw_exit(&dp->i_rwlock);
227 			return (0);
228 		}
229 		if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) {
230 			uint64_t handle2;
231 			/*
232 			 * release the lock on the dir we are searching
233 			 * to avoid a deadlock when grabbing the
234 			 * i_contents lock in ufs_iget_alloced().
235 			 */
236 			rw_exit(&dp->i_rwlock);
237 			rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
238 			err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
239 			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
240 			/*
241 			 * must recheck as we dropped dp->i_rwlock
242 			 */
243 			rw_enter(&dp->i_rwlock, RW_READER);
244 			if (!err && (dnlc_dir_lookup(dcap, namep, &handle2)
245 			    == DFOUND) && (handle == handle2)) {
246 				dnlc_update(dvp, namep, ITOV(*ipp));
247 				rw_exit(&dp->i_rwlock);
248 				return (0);
249 			}
250 			/* check failed, read the actual directory */
251 			if (!err) {
252 				VN_RELE(ITOV(*ipp));
253 			}
254 			goto restart;
255 		}
256 		/* usual case of not "." nor ".." */
257 		rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
258 		err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
259 		rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
260 		if (err) {
261 			rw_exit(&dp->i_rwlock);
262 			return (err);
263 		}
264 		dnlc_update(dvp, namep, ITOV(*ipp));
265 		rw_exit(&dp->i_rwlock);
266 		return (0);
267 	case DNOENT:
268 		if (ufs_negative_cache && (dp->i_nlink > 0)) {
269 			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
270 		}
271 		rw_exit(&dp->i_rwlock);
272 		return (ENOENT);
273 	default:
274 		break;
275 	}
276 restart:
277 
278 	fbp = NULL;
279 	doingchk = 0;
280 	chkino = 0;
281 	caching = 0;
282 
283 	/*
284 	 * Attempt to cache any directories greater than
285 	 * the tunable ufs_min_cache_dir.
286 	 */
287 	if ((dp->i_size >= ufs_min_dir_cache) && (dp->i_cachedir)) {
288 		switch (dnlc_dir_start(dcap, dp->i_size >> AV_DIRECT_SHIFT)) {
289 		case DNOMEM:
290 		case DTOOBIG:
291 			dp->i_cachedir = 0;
292 			break;
293 		case DOK:
294 			caching = 1;
295 			break;
296 		default:
297 			break;
298 		}
299 	}
300 	/*
301 	 * If caching we don't stop when the file has been
302 	 * found, but need to know later, so clear *ipp now
303 	 */
304 	*ipp = NULL;
305 
306 recheck:
307 	if (caching) {
308 		offset = 0;
309 		entryoffsetinblock = 0;
310 		numdirpasses = 1;
311 	} else {
312 		/*
313 		 * Take care to look at dp->i_diroff only once, as it
314 		 * may be changing due to other threads/cpus.
315 		 */
316 		offset = dp->i_diroff;
317 		if (offset > dp->i_size) {
318 			offset = 0;
319 		}
320 		if (offset == 0) {
321 			entryoffsetinblock = 0;
322 			numdirpasses = 1;
323 		} else {
324 			start_off = offset;
325 
326 			entryoffsetinblock = blkoff(dp->i_fs, offset);
327 			if (entryoffsetinblock != 0) {
328 				err = blkatoff(dp, offset, (char **)0, &fbp);
329 				if (err)
330 					goto bad;
331 			}
332 			numdirpasses = 2;
333 		}
334 	}
335 	endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, u_offset_t);
336 	namlen = strlen(namep);
337 	last_offset = 0;
338 
339 searchloop:
340 	while (offset < endsearch) {
341 		/*
342 		 * If offset is on a block boundary,
343 		 * read the next directory block.
344 		 * Release previous if it exists.
345 		 */
346 		if (blkoff(dp->i_fs, offset) == 0) {
347 			if (fbp != NULL) {
348 				fbrelse(fbp, S_OTHER);
349 			}
350 			err = blkatoff(dp, offset, (char **)0, &fbp);
351 			if (err)
352 				goto bad;
353 			entryoffsetinblock = 0;
354 		}
355 
356 		/*
357 		 * If the offset to the next entry is invalid or if the
358 		 * next entry is a zero length record or if the record
359 		 * length is invalid, then skip to the next directory
360 		 * block.  Complete validation checks are done if the
361 		 * record length is invalid.
362 		 *
363 		 * Full validation checks are slow so they are disabled
364 		 * by default.  Complete checks can be run by patching
365 		 * "dirchk" to be true.
366 		 *
367 		 * We have to check the validity of entryoffsetinblock
368 		 * here because it can be set to i_diroff above.
369 		 */
370 		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock);
371 		if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 ||
372 		    (dirchk || (ep->d_reclen & 0x3)) &&
373 		    dirmangled(dp, ep, entryoffsetinblock, offset)) {
374 			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
375 			offset += i;
376 			entryoffsetinblock += i;
377 			if (caching) {
378 				dnlc_dir_purge(dcap);
379 				caching = 0;
380 			}
381 			continue;
382 		}
383 
384 		ep_reclen = ep->d_reclen;
385 
386 		/*
387 		 * Add named entries and free space into the directory cache
388 		 */
389 		if (caching) {
390 			ushort_t extra;
391 			off_t off2;
392 
393 			if (ep->d_ino == 0) {
394 				extra = ep_reclen;
395 				if (offset & (DIRBLKSIZ - 1)) {
396 					dnlc_dir_purge(dcap);
397 					dp->i_cachedir = 0;
398 					caching = 0;
399 				}
400 			} else {
401 				/*
402 				 * entries hold the previous offset except the
403 				 * 1st which holds the offset + 1
404 				 */
405 				if (offset & (DIRBLKSIZ - 1)) {
406 					off2 = last_offset;
407 				} else {
408 					off2 = offset + 1;
409 				}
410 				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
411 				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
412 				extra = ep_reclen - DIRSIZ(ep);
413 			}
414 			if (caching && (extra >= LDIRSIZ(1))) {
415 				caching = (dnlc_dir_add_space(dcap, extra,
416 				    (uint64_t)offset) == DOK);
417 			}
418 		}
419 
420 		/*
421 		 * Check for a name match.
422 		 * We have the parent inode read locked with i_rwlock.
423 		 */
424 		if (ep->d_ino && ep->d_namlen == namlen &&
425 		    *namep == *ep->d_name &&	/* fast chk 1st chr */
426 		    bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) {
427 
428 			/*
429 			 * We have to release the fbp early here to avoid
430 			 * a possible deadlock situation where we have the
431 			 * fbp and want the directory inode and someone doing
432 			 * a ufs_direnter_* has the directory inode and wants
433 			 * the fbp.  XXX - is this still needed?
434 			 */
435 			ep_ino = (ino_t)ep->d_ino;
436 			ASSERT(fbp != NULL);
437 			fbrelse(fbp, S_OTHER);
438 			fbp = NULL;
439 
440 			/*
441 			 * Atomic update (read lock held)
442 			 */
443 			dp->i_diroff = offset;
444 
445 			if (namlen == 2 && namep[0] == '.' && namep[1] == '.') {
446 				struct timeval32 omtime;
447 
448 				if (caching) {
449 					dnlc_dir_purge(dcap);
450 					caching = 0;
451 				}
452 				if (doingchk) {
453 					/*
454 					 * if the inumber didn't change
455 					 * continue with already found inode.
456 					 */
457 					if (ep_ino == chkino)
458 						goto checkok;
459 					else {
460 						VN_RELE(ITOV(*ipp));
461 						/* *ipp is nulled at restart */
462 						goto restart;
463 					}
464 				}
465 				/*
466 				 * release the lock on the dir we are searching
467 				 * to avoid a deadlock when grabbing the
468 				 * i_contents lock in ufs_iget_alloced().
469 				 */
470 				omtime = dp->i_mtime;
471 				rw_exit(&dp->i_rwlock);
472 				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
473 						RW_READER);
474 				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
475 				    cr);
476 				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
477 				rw_enter(&dp->i_rwlock, RW_READER);
478 				if (err)
479 					goto bad;
480 				/*
481 				 * Since we released the lock on the directory,
482 				 * we must check that the same inode is still
483 				 * the ".." entry for this directory.
484 				 */
485 				/*CSTYLED*/
486 				if (timercmp(&omtime, &dp->i_mtime, !=)) {
487 					/*
488 					 * Modification time changed on the
489 					 * directory, we must go check if
490 					 * the inumber changed for ".."
491 					 */
492 					doingchk = 1;
493 					chkino = ep_ino;
494 					entryoffsetinblock = 0;
495 					if (caching) {
496 						/*
497 						 * Forget directory caching
498 						 * for this rare case
499 						 */
500 						dnlc_dir_purge(dcap);
501 						caching = 0;
502 					}
503 					goto recheck;
504 				}
505 			} else if (dp->i_number == ep_ino) {
506 				VN_HOLD(dvp);	/* want ourself, "." */
507 				*ipp = dp;
508 				if (caching) {
509 					dnlc_dir_purge(dcap);
510 					caching = 0;
511 				}
512 			} else {
513 				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
514 						RW_READER);
515 				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
516 				    cr);
517 				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
518 				if (err)
519 					goto bad;
520 			}
521 checkok:
522 			ASSERT(*ipp);
523 			dnlc_update(dvp, namep, ITOV(*ipp));
524 			/*
525 			 * If we are not caching then just return the entry
526 			 * otherwise complete loading up the cache
527 			 */
528 			if (!caching) {
529 				rw_exit(&dp->i_rwlock);
530 				return (0);
531 			}
532 			err = blkatoff(dp, offset, (char **)0, &fbp);
533 			if (err)
534 				goto bad;
535 		}
536 		last_offset = offset;
537 		offset += ep_reclen;
538 		entryoffsetinblock += ep_reclen;
539 	}
540 	/*
541 	 * If we started in the middle of the directory and failed
542 	 * to find our target, we must check the beginning as well.
543 	 */
544 	if (numdirpasses == 2) {
545 		numdirpasses--;
546 		offset = 0;
547 		endsearch = start_off;
548 		goto searchloop;
549 	}
550 
551 	/*
552 	 * If whole directory caching is on (or was originally on) then
553 	 * the entry may have been found.
554 	 */
555 	if (*ipp == NULL) {
556 		err = ENOENT;
557 		if (ufs_negative_cache && (dp->i_nlink > 0)) {
558 			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
559 		}
560 	}
561 	if (caching) {
562 		dnlc_dir_complete(dcap);
563 		caching = 0;
564 	}
565 
566 bad:
567 	if (err && *ipp) {
568 		/*
569 		 * err and *ipp can both be set if we were attempting to
570 		 * cache the directory, and we found the entry, then later
571 		 * while trying to complete the directory cache encountered
572 		 * a error (eg reading a directory sector).
573 		 */
574 		VN_RELE(ITOV(*ipp));
575 		*ipp = NULL;
576 	}
577 
578 	if (fbp)
579 		fbrelse(fbp, S_OTHER);
580 	rw_exit(&dp->i_rwlock);
581 	if (caching)
582 		dnlc_dir_purge(dcap);
583 	return (err);
584 }
585 
/*
 * If ufs_dircheckforname() fails to find an entry with the given name,
 * this "slot" structure holds state for ufs_direnter_*() as to where
 * there is space to put an entry with that name.
 * If ufs_dircheckforname() finds an entry with the given name, this structure
 * holds state for ufs_dirrename() and ufs_dirremove() as to where the
 * entry is. "status" indicates what ufs_dircheckforname() found:
 *	NONE		name not found, large enough free slot not found
 *	FOUND		name not found, large enough free slot found
 *	EXIST		name found
 * If ufs_dircheckforname() fails due to an error, this structure is not
 * filled in.
 *
 * After ufs_dircheckforname() succeeds the values are:
 *	status	offset		size		fbp, ep
 *	------	------		----		-------
 *	NONE	end of dir	needed		not valid
 *	FOUND	start of entry	of ent		both valid if fbp != NULL
 *	EXIST	start of entry	of prev ent	valid
 *
 * "endoff" is set to 0 if an entry with the given name is found, or if no
 * free slot could be found or made; this means that the directory should not
 * be truncated.  If the entry was found, the search terminates so
 * ufs_dircheckforname() didn't find out where the last valid entry in the
 * directory was, so it doesn't know where to cut the directory off; if no free
 * slot could be found or made, the directory has to be extended to make room
 * for the new entry, so there's nothing to cut off.
 * Otherwise, "endoff" is set to the larger of the offset of the last
 * non-empty entry in the directory and the offset at which the new entry
 * will be placed.  This is used by ufs_diraddentry(); if a new
 * entry is to be added to the directory, any complete directory blocks at the
 * end of the directory that contain no non-empty entries are lopped off the
 * end, thus shrinking the directory dynamically.
 */
typedef enum {NONE, FOUND, EXIST} slotstat_t;
struct slot {
	struct direct *ep;	/* pointer to slot */
	struct	fbuf *fbp;	/* dir buf where slot is */
	off_t	offset;		/* offset of area with free space */
	off_t	endoff;		/* last useful location found in search */
	slotstat_t status;	/* status of slot */
	int	size;		/* size of area at offset */
	int	cached;		/* cached directory */
};
630 
631 
632 /*
633  * Write a new directory entry for DE_CREATE or DE_MKDIR operations.
634  */
635 int
636 ufs_direnter_cm(
637 	struct inode *tdp,	/* target directory to make entry in */
638 	char *namep,		/* name of entry */
639 	enum de_op op,		/* entry operation */
640 	struct vattr *vap,	/* attributes if new inode needed */
641 	struct inode **ipp,	/* return entered inode here */
642 	struct cred *cr,	/* user credentials */
643 	int flags)		/* no entry exists */
644 {
645 	struct inode *tip;	/* inode of (existing) target file */
646 	char *s;
647 	struct slot slot;	/* slot info to pass around */
648 	int namlen;		/* length of name */
649 	int err;		/* error number */
650 	struct inode *nip;	/* new inode */
651 	int do_rele_nip = 0;	/* release nip */
652 	int noentry = flags & ~IQUIET;
653 	int quiet = flags & IQUIET;	/* Suppress out of inodes message */
654 
655 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
656 
657 	if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) ||
658 	    ((vap->va_type == VCHR) || (vap->va_type == VBLK) ||
659 	    (vap->va_type == VDOOR) || (vap->va_type == VSOCK) ||
660 	    (vap->va_type == VFIFO))))
661 		return (EINVAL);
662 
663 	/* don't allow '/' characters in pathname component */
664 	for (s = namep, namlen = 0; *s; s++, namlen++)
665 		if (*s == '/')
666 			return (EACCES);
667 	ASSERT(namlen);
668 
669 	/*
670 	 * If name is "." or ".." then if this is a create look it up
671 	 * and return EEXIST.
672 	 */
673 	if (namep[0] == '.' &&
674 	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
675 		/*
676 		 * ufs_dirlook will acquire the i_rwlock
677 		 */
678 		rw_exit(&tdp->i_rwlock);
679 		if (err = ufs_dirlook(tdp, namep, ipp, cr, 0)) {
680 			rw_enter(&tdp->i_rwlock, RW_WRITER);
681 			return (err);
682 		}
683 		rw_enter(&tdp->i_rwlock, RW_WRITER);
684 		return (EEXIST);
685 	}
686 
687 	/*
688 	 * If target directory has not been removed, then we can consider
689 	 * allowing file to be created.
690 	 */
691 	if (tdp->i_nlink <= 0) {
692 		return (ENOENT);
693 	}
694 
695 	/*
696 	 * Check accessibility of directory.
697 	 */
698 	if (((tdp->i_mode & IFMT) != IFDIR) &&
699 	    ((tdp->i_mode & IFMT) != IFATTRDIR)) {
700 		return (ENOTDIR);
701 	}
702 
703 	/*
704 	 * Execute access is required to search the directory.
705 	 */
706 	if (err = ufs_iaccess(tdp, IEXEC, cr)) {
707 		return (err);
708 	}
709 
710 	/*
711 	 * Search for the entry. Return VN_HELD tip if found.
712 	 */
713 	tip = NULL;
714 	slot.fbp = NULL;
715 	slot.status = NONE;
716 	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
717 	rw_enter(&tdp->i_contents, RW_WRITER);
718 	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry);
719 	if (err)
720 		goto out;
721 	if (tip) {
722 		ASSERT(!noentry);
723 		*ipp = tip;
724 		err = EEXIST;
725 	} else {
726 		/*
727 		 * The entry does not exist. Check write permission in
728 		 * directory to see if entry can be created.
729 		 */
730 		if (err = ufs_iaccess(tdp, IWRITE, cr))
731 			goto out;
732 		/*
733 		 * Make new inode and directory entry.
734 		 */
735 		tdp->i_flag |= quiet;
736 		if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) {
737 			if (nip != NULL)
738 				do_rele_nip = 1;
739 			goto out;
740 		}
741 		if (err = ufs_diraddentry(tdp, namep, op,
742 		    namlen, &slot, nip, NULL, cr)) {
743 			/*
744 			 * Unmake the inode we just made.
745 			 */
746 			rw_enter(&nip->i_contents, RW_WRITER);
747 			if (((nip->i_mode & IFMT) == IFDIR) ||
748 			    ((nip->i_mode & IFMT) == IFATTRDIR)) {
749 				tdp->i_nlink--;
750 				ufs_setreclaim(tdp);
751 				tdp->i_flag |= ICHG;
752 				tdp->i_seq++;
753 				TRANS_INODE(tdp->i_ufsvfs, tdp);
754 				ITIMES_NOLOCK(tdp);
755 			}
756 			nip->i_nlink = 0;
757 			ufs_setreclaim(nip);
758 			TRANS_INODE(nip->i_ufsvfs, nip);
759 			nip->i_flag |= ICHG;
760 			nip->i_seq++;
761 			ITIMES_NOLOCK(nip);
762 			rw_exit(&nip->i_contents);
763 			do_rele_nip = 1;
764 		} else {
765 			*ipp = nip;
766 		}
767 	}
768 
769 out:
770 	if (slot.fbp)
771 		fbrelse(slot.fbp, S_OTHER);
772 
773 	tdp->i_flag &= ~quiet;
774 	rw_exit(&tdp->i_contents);
775 
776 	/*
777 	 * Drop vfs_dqrwlock before calling VN_RELE() on nip to
778 	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
779 	 */
780 	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
781 
782 	if (do_rele_nip) {
783 		VN_RELE(ITOV(nip));
784 	}
785 
786 	return (err);
787 }
788 
789 /*
790  * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations.
791  * If tvpp is non-null, return with the pointer to the target vnode.
792  */
793 int
794 ufs_direnter_lr(
795 	struct inode *tdp,	/* target directory to make entry in */
796 	char *namep,		/* name of entry */
797 	enum de_op op,		/* entry operation */
798 	struct inode *sdp,	/* source inode parent if rename */
799 	struct inode *sip,	/* source inode */
800 	struct cred *cr,	/* user credentials */
801 	vnode_t **tvpp)		/* Return: (held) vnode of (existing) target */
802 {
803 	struct inode *tip;	/* inode of (existing) target file */
804 	char *s;
805 	struct slot slot;	/* slot info to pass around */
806 	int namlen;		/* length of name */
807 	int err;		/* error number */
808 
809 	/* don't allow '/' characters in pathname component */
810 	for (s = namep, namlen = 0; *s; s++, namlen++)
811 		if (*s == '/')
812 			return (EACCES);
813 	ASSERT(namlen);
814 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
815 
816 	/*
817 	 * If name is "." or ".." then if this is a create look it up
818 	 * and return EEXIST.  Rename or link TO "." or ".." is forbidden.
819 	 */
820 	if (namep[0] == '.' &&
821 	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
822 		if (op == DE_RENAME) {
823 			return (EINVAL);	/* *SIGH* should be ENOTEMPTY */
824 		}
825 		return (EEXIST);
826 	}
827 	/*
828 	 * For link and rename lock the source entry and check the link count
829 	 * to see if it has been removed while it was unlocked.  If not, we
830 	 * increment the link count and force the inode to disk to make sure
831 	 * that it is there before any directory entry that points to it.
832 	 *
833 	 * In the case of a symbolic link, we are dealing with a new inode
834 	 * which does not yet have any links.  We've created it with a link
835 	 * count of 1, and we don't want to increment it since this will be
836 	 * its first link.
837 	 *
838 	 * We are about to push the inode to disk. We make sure
839 	 * that the inode's data blocks are flushed first so the
840 	 * inode and it's data blocks are always in sync.  This
841 	 * adds some robustness in in the event of a power failure
842 	 * or panic where sync fails. If we panic before the
843 	 * inode is updated, then the inode still refers to the
844 	 * old data blocks (or none for a new file). If we panic
845 	 * after the inode is updated, then the inode refers to
846 	 * the new data blocks.
847 	 *
848 	 * We do this before grabbing the i_contents lock because
849 	 * ufs_syncip() will want that lock. We could do the data
850 	 * syncing after the removal checks, but upon return from
851 	 * the data sync we would have to repeat the removal
852 	 * checks.
853 	 */
854 	if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) {
855 		return (err);
856 	}
857 
858 	rw_enter(&sip->i_contents, RW_WRITER);
859 	if (sip->i_nlink <= 0) {
860 		rw_exit(&sip->i_contents);
861 		return (ENOENT);
862 	}
863 	if (sip->i_nlink == MAXLINK) {
864 		rw_exit(&sip->i_contents);
865 		return (EMLINK);
866 	}
867 
868 	/*
869 	 * Sync the indirect blocks associated with the file
870 	 * for the same reasons as described above.  Since this
871 	 * call wants the i_contents lock held for it we can do
872 	 * this here with no extra work.
873 	 */
874 	if (err = ufs_sync_indir(sip)) {
875 		rw_exit(&sip->i_contents);
876 		return (err);
877 	}
878 
879 	if (op != DE_SYMLINK)
880 		sip->i_nlink++;
881 	TRANS_INODE(sip->i_ufsvfs, sip);
882 	sip->i_flag |= ICHG;
883 	sip->i_seq++;
884 	ufs_iupdat(sip, I_SYNC);
885 	rw_exit(&sip->i_contents);
886 
887 	/*
888 	 * If target directory has not been removed, then we can consider
889 	 * allowing file to be created.
890 	 */
891 	if (tdp->i_nlink <= 0) {
892 		err = ENOENT;
893 		goto out2;
894 	}
895 	/*
896 	 * Check accessibility of directory.
897 	 */
898 	if (((tdp->i_mode & IFMT) != IFDIR) &&
899 	    (tdp->i_mode & IFMT) != IFATTRDIR) {
900 		err = ENOTDIR;
901 		goto out2;
902 	}
903 	/*
904 	 * Execute access is required to search the directory.
905 	 */
906 	if (err = ufs_iaccess(tdp, IEXEC, cr)) {
907 		goto out2;
908 	}
909 
910 	/*
911 	 * Search for the entry. Return VN_HELD tip if found.
912 	 */
913 	tip = NULL;
914 	slot.status = NONE;
915 	slot.fbp = NULL;
916 	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
917 	rw_enter(&tdp->i_contents, RW_WRITER);
918 	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0);
919 	if (err)
920 		goto out;
921 
922 	if (tip) {
923 		switch (op) {
924 		case DE_RENAME:
925 			err = ufs_dirrename(sdp, sip, tdp, namep,
926 			    tip, &slot, cr);
927 			break;
928 
929 		case DE_LINK:
930 		case DE_SYMLINK:
931 			/*
932 			 * Can't link to an existing file.
933 			 */
934 			err = EEXIST;
935 			break;
936 		default:
937 			break;
938 		}
939 	} else {
940 		/*
941 		 * The entry does not exist. Check write permission in
942 		 * directory to see if entry can be created.
943 		 */
944 		if (err = ufs_iaccess(tdp, IWRITE, cr))
945 			goto out;
946 		err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp,
947 		    cr);
948 	}
949 
950 out:
951 	if (slot.fbp)
952 		fbrelse(slot.fbp, S_OTHER);
953 
954 	rw_exit(&tdp->i_contents);
955 
956 	/*
957 	 * Drop vfs_dqrwlock before calling VN_RELE() on tip to
958 	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
959 	 */
960 	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
961 
962 	/*
963 	 * If we renamed a file over the top of an existing file,
964 	 * or linked a file to an existing file (or tried to),
965 	 * then set *tvpp to the target vnode, if tvpp is non-null
966 	 * otherwise, release and delete (or just release) the inode.
967 	 *
968 	 * N.B., by returning the target's vnode pointer to the caller,
969 	 * that caller becomes responsible for doing the VN_RELE.
970 	 */
971 	if (tip) {
972 		if ((err == 0) && (tvpp != NULL)) {
973 			*tvpp = ITOV(tip);
974 		} else {
975 			VN_RELE(ITOV(tip));
976 		}
977 	}
978 
979 out2:
980 	if (err) {
981 		/*
982 		 * Undo bumped link count.
983 		 */
984 		if (op != DE_SYMLINK) {
985 			rw_enter(&sip->i_contents, RW_WRITER);
986 			sip->i_nlink--;
987 			ufs_setreclaim(sip);
988 			TRANS_INODE(sip->i_ufsvfs, sip);
989 			sip->i_flag |= ICHG;
990 			sip->i_seq++;
991 			ITIMES_NOLOCK(sip);
992 			rw_exit(&sip->i_contents);
993 		}
994 	}
995 	return (err);
996 }
997 
998 /*
999  * Check for the existence of a name in a directory (unless noentry
1000  * is set) , or else of an empty
1001  * slot in which an entry may be made.  If the requested name is found,
1002  * then on return *ipp points at the inode and *offp contains
1003  * its offset in the directory.  If the name is not found, then *ipp
1004  * will be NULL and *slotp will contain information about a directory slot in
1005  * which an entry may be made (either an empty slot, or the first position
1006  * past the end of the directory).
1007  * The target directory inode (tdp) is supplied write locked (i_rwlock).
1008  *
1009  * This may not be used on "." or "..", but aliases of "." are ok.
1010  */
1011 static int
1012 ufs_dircheckforname(
1013 	struct inode *tdp,	/* inode of directory being checked */
1014 	char *namep,		/* name we're checking for */
1015 	int namlen,		/* length of name, excluding null */
1016 	struct slot *slotp,	/* slot structure */
1017 	struct inode **ipp,	/* return inode if we find one */
1018 	struct cred *cr,
1019 	int noentry)		/* noentry - just look for space */
1020 {
1021 	uint64_t handle;
1022 	struct fbuf *fbp;	/* pointer to directory block */
1023 	struct direct *ep;	/* directory entry */
1024 	struct direct *nep;	/* next directory entry */
1025 	dcanchor_t *dcap;
1026 	vnode_t *dvp;		/* directory vnode ptr */
1027 	off_t dirsize;		/* size of the directory */
1028 	off_t offset;		/* offset in the directory */
1029 	off_t last_offset;	/* last offset */
1030 	off_t enduseful;	/* pointer past last used dir slot */
1031 	int entryoffsetinblk;	/* offset of ep in fbp's buffer */
1032 	int i;			/* length of mangled entry */
1033 	int needed;
1034 	int err;
1035 	int first;
1036 	int caching;
1037 	int stat;
1038 	ino_t ep_ino;
1039 	slotstat_t initstat = slotp->status;
1040 
1041 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1042 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1043 	ASSERT(*ipp == NULL);
1044 	fbp = NULL;
1045 
1046 	/*
1047 	 * First check if there is a complete cache of the directory.
1048 	 */
1049 	dvp = ITOV(tdp);
1050 
1051 	dcap = &tdp->i_danchor;
1052 	if (noentry) {
1053 		/*
1054 		 * We know from the 1st level dnlc cache that the entry
1055 		 * doesn't exist, so don't bother searching the directory
1056 		 * cache, but just look for space (possibly in the directory
1057 		 * cache).
1058 		 */
1059 		stat = DNOENT;
1060 	} else {
1061 		stat = dnlc_dir_lookup(dcap, namep, &handle);
1062 	}
1063 	switch (stat) {
1064 	case DFOUND:
1065 		ep_ino = (ino_t)H_TO_INO(handle);
1066 		if (tdp->i_number == ep_ino) {
1067 			*ipp = tdp;	/* we want ourself, ie "." */
1068 			VN_HOLD(dvp);
1069 		} else {
1070 			err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr);
1071 			if (err)
1072 				return (err);
1073 		}
1074 		offset = H_TO_OFF(handle);
1075 		first = 0;
1076 		if (offset & 1) {
1077 			/* This is the first entry in the block */
1078 			first = 1;
1079 			offset -= 1;
1080 			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
1081 		}
1082 		err = blkatoff(tdp, offset, (char **)&ep, &fbp);
1083 		if (err) {
1084 			VN_RELE(ITOV(*ipp));
1085 			*ipp = NULL;
1086 			return (err);
1087 		}
1088 		/*
1089 		 * Check the validity of the entry.
1090 		 * If it's bad, then throw away the cache and
1091 		 * continue without it. The dirmangled() routine
1092 		 * will then be called upon it.
1093 		 */
1094 		if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
1095 			VN_RELE(ITOV(*ipp));
1096 			*ipp = NULL;
1097 			dnlc_dir_purge(dcap);
1098 			break;
1099 		}
1100 		/*
1101 		 * Remember the returned offset is the offset of the
1102 		 * preceding record (unless this is the 1st record
1103 		 * in the DIRBLKSIZ sized block (disk sector)), then it's
1104 		 * offset + 1. Note, no real offsets are on odd boundaries.
1105 		 */
1106 		if (first) {
1107 			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
1108 			slotp->offset = offset;
1109 			slotp->size = 0;
1110 			slotp->ep = ep;
1111 		} else {
1112 			/* get the next entry */
1113 			nep = (struct direct *)((char *)ep + ep->d_reclen);
1114 			/*
1115 			 * Check the validity of this entry as well
1116 			 * If it's bad, then throw away the cache and
1117 			 * continue without it. The dirmangled() routine
1118 			 * will then be called upon it.
1119 			 */
1120 			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
1121 			    (nep->d_ino != ep_ino)) {
1122 				VN_RELE(ITOV(*ipp));
1123 				*ipp = NULL;
1124 				dnlc_dir_purge(dcap);
1125 				break;
1126 			}
1127 			slotp->offset = offset + ep->d_reclen;
1128 			slotp->size = ep->d_reclen;
1129 			slotp->ep = nep;
1130 		}
1131 		slotp->status = EXIST;
1132 		slotp->fbp = fbp;
1133 		slotp->endoff = 0;
1134 		slotp->cached = 1;
1135 		dnlc_update(dvp, namep, ITOV(*ipp));
1136 		return (0);
1137 	case DNOENT:
1138 		/*
1139 		 * The caller gets to set the initial slot status to
1140 		 * indicate whether it's interested in getting a
1141 		 * empty slot. For example, the status can be set
1142 		 * to FOUND when an entry is being deleted.
1143 		 */
1144 		ASSERT(slotp->fbp == NULL);
1145 		if (slotp->status == FOUND) {
1146 			return (0);
1147 		}
1148 		switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen),
1149 		    &handle)) {
1150 		case DFOUND:
1151 			offset = (off_t)handle;
1152 			err = blkatoff(tdp, offset, (char **)&ep, &fbp);
1153 			if (err) {
1154 				dnlc_dir_purge(dcap);
1155 				ASSERT(*ipp == NULL);
1156 				return (err);
1157 			}
1158 			/*
1159 			 * Check the validity of the entry.
1160 			 * If it's bad, then throw away the cache and
1161 			 * continue without it. The dirmangled() routine
1162 			 * will then be called upon it.
1163 			 */
1164 			if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
1165 				dnlc_dir_purge(dcap);
1166 				break;
1167 			}
1168 			/*
1169 			 * Remember the returned offset is the offset of the
1170 			 * containing record.
1171 			 */
1172 			slotp->status = FOUND;
1173 			slotp->ep = ep;
1174 			slotp->offset = offset;
1175 			slotp->fbp = fbp;
1176 			slotp->size = ep->d_reclen;
1177 			/*
1178 			 * Set end offset to 0. Truncation is handled
1179 			 * because the dnlc cache will blow away the
1180 			 * cached directory when an entry is removed
1181 			 * that drops the entries left to less than half
1182 			 * the minumum number (dnlc_min_dir_cache).
1183 			 */
1184 			slotp->endoff = 0;
1185 			slotp->cached = 1;
1186 			return (0);
1187 		case DNOENT:
1188 			slotp->status = NONE;
1189 			slotp->offset = P2ROUNDUP_TYPED(tdp->i_size,
1190 			    DIRBLKSIZ, u_offset_t);
1191 			slotp->size = DIRBLKSIZ;
1192 			slotp->endoff = 0;
1193 			slotp->cached = 1;
1194 			return (0);
1195 		default:
1196 			break;
1197 		}
1198 		break;
1199 	}
1200 	slotp->cached = 0;
1201 	caching = NULL;
1202 	if (tdp->i_cachedir && !noentry) {
1203 		/*
1204 		 * Attempt to cache any directories greater than
1205 		 * the tunable ufs_min_cache_dir.
1206 		 */
1207 		if (tdp->i_size >= ufs_min_dir_cache) {
1208 			switch (dnlc_dir_start(dcap,
1209 			    tdp->i_size >> AV_DIRECT_SHIFT)) {
1210 			case DNOMEM:
1211 			case DTOOBIG:
1212 				tdp->i_cachedir = 0;
1213 				break;
1214 			case DOK:
1215 				caching = 1;
1216 				break;
1217 			default:
1218 				break;
1219 			}
1220 		}
1221 	}
1222 
1223 	/*
1224 	 * No point in using i_diroff since we must search whole directory
1225 	 */
1226 	dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t);
1227 	enduseful = 0;
1228 	offset = last_offset = 0;
1229 	entryoffsetinblk = 0;
1230 	needed = (int)LDIRSIZ(namlen);
1231 	while (offset < dirsize) {
1232 		/*
1233 		 * If offset is on a block boundary,
1234 		 * read the next directory block.
1235 		 * Release previous if it exists.
1236 		 */
1237 		if (blkoff(tdp->i_fs, offset) == 0) {
1238 			if (fbp != NULL)
1239 				fbrelse(fbp, S_OTHER);
1240 
1241 			err = blkatoff(tdp, offset, (char **)0, &fbp);
1242 			if (err) {
1243 				ASSERT(*ipp == NULL);
1244 				if (caching) {
1245 					dnlc_dir_purge(dcap);
1246 				}
1247 				return (err);
1248 			}
1249 			entryoffsetinblk = 0;
1250 		}
1251 		/*
1252 		 * If still looking for a slot, and at a DIRBLKSIZ
1253 		 * boundary, have to start looking for free space
1254 		 * again.
1255 		 */
1256 		if (slotp->status == NONE &&
1257 		    (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) {
1258 			slotp->offset = -1;
1259 		}
1260 		/*
1261 		 * If the next entry is a zero length record or if the
1262 		 * record length is invalid, then skip to the next
1263 		 * directory block.  Complete validation checks are
1264 		 * done if the record length is invalid.
1265 		 *
1266 		 * Full validation checks are slow so they are disabled
1267 		 * by default.  Complete checks can be run by patching
1268 		 * "dirchk" to be true.
1269 		 *
1270 		 * We do not have to check the validity of
1271 		 * entryoffsetinblk here because it starts out as zero
1272 		 * and is only incremented by d_reclen values that we
1273 		 * validate here.
1274 		 */
1275 		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
1276 		if (ep->d_reclen == 0 ||
1277 		    (dirchk || (ep->d_reclen & 0x3)) &&
1278 		    dirmangled(tdp, ep, entryoffsetinblk, offset)) {
1279 			i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1));
1280 			offset += i;
1281 			entryoffsetinblk += i;
1282 			if (caching) {
1283 				dnlc_dir_purge(dcap);
1284 				caching = 0;
1285 			}
1286 			continue;
1287 		}
1288 
1289 		/*
1290 		 * Add named entries and free space into the directory cache
1291 		 */
1292 		if (caching) {
1293 			ushort_t extra;
1294 			off_t off2;
1295 
1296 			if (ep->d_ino == 0) {
1297 				extra = ep->d_reclen;
1298 				if (offset & (DIRBLKSIZ - 1)) {
1299 					dnlc_dir_purge(dcap);
1300 					caching = 0;
1301 				}
1302 			} else {
1303 				/*
1304 				 * entries hold the previous offset if
1305 				 * not the 1st one
1306 				 */
1307 				if (offset & (DIRBLKSIZ - 1)) {
1308 					off2 = last_offset;
1309 				} else {
1310 					off2 = offset + 1;
1311 				}
1312 				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
1313 				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
1314 				extra = ep->d_reclen - DIRSIZ(ep);
1315 			}
1316 			if (caching && (extra >= LDIRSIZ(1))) {
1317 				caching = (dnlc_dir_add_space(dcap, extra,
1318 				    (uint64_t)offset) == DOK);
1319 			}
1320 		}
1321 
1322 		/*
1323 		 * If an appropriate sized slot has not yet been found,
1324 		 * check to see if one is available.
1325 		 */
1326 		if ((slotp->status != FOUND) && (slotp->status != EXIST)) {
1327 			int size = ep->d_reclen;
1328 
1329 			if (ep->d_ino != 0)
1330 				size -= DIRSIZ(ep);
1331 			if (size > 0) {
1332 				if (size >= needed) {
1333 					slotp->offset = offset;
1334 					slotp->size = ep->d_reclen;
1335 					if (noentry) {
1336 						slotp->ep = ep;
1337 						slotp->fbp = fbp;
1338 						slotp->status = FOUND;
1339 						slotp->endoff = 0;
1340 						return (0);
1341 					}
1342 					slotp->status = FOUND;
1343 				} else if (slotp->status == NONE) {
1344 					if (slotp->offset == -1)
1345 						slotp->offset = offset;
1346 				}
1347 			}
1348 		}
1349 		/*
1350 		 * Check for a name match.
1351 		 */
1352 		if (ep->d_ino && ep->d_namlen == namlen &&
1353 		    *namep == *ep->d_name &&	/* fast chk 1st char */
1354 		    bcmp(namep, ep->d_name, namlen) == 0) {
1355 
1356 			tdp->i_diroff = offset;
1357 
1358 			if (tdp->i_number == ep->d_ino) {
1359 				*ipp = tdp;	/* we want ourself, ie "." */
1360 				VN_HOLD(dvp);
1361 			} else {
1362 				err = ufs_iget_alloced(tdp->i_vfs,
1363 				    (ino_t)ep->d_ino, ipp, cr);
1364 				if (err) {
1365 					fbrelse(fbp, S_OTHER);
1366 					if (caching)
1367 						dnlc_dir_purge(dcap);
1368 					return (err);
1369 				}
1370 			}
1371 			slotp->status = EXIST;
1372 			slotp->offset = offset;
1373 			slotp->size = (int)(offset - last_offset);
1374 			slotp->fbp = fbp;
1375 			slotp->ep = ep;
1376 			slotp->endoff = 0;
1377 			if (caching)
1378 				dnlc_dir_purge(dcap);
1379 			return (0);
1380 		}
1381 		last_offset = offset;
1382 		offset += ep->d_reclen;
1383 		entryoffsetinblk += ep->d_reclen;
1384 		if (ep->d_ino)
1385 			enduseful = offset;
1386 	}
1387 	if (fbp) {
1388 		fbrelse(fbp, S_OTHER);
1389 	}
1390 
1391 	if (caching) {
1392 		dnlc_dir_complete(dcap);
1393 		slotp->cached = 1;
1394 		if (slotp->status == FOUND) {
1395 			if (initstat == FOUND) {
1396 				return (0);
1397 			}
1398 			(void) dnlc_dir_rem_space_by_handle(dcap,
1399 			    slotp->offset);
1400 			slotp->endoff = 0;
1401 			return (0);
1402 		}
1403 	}
1404 
1405 	if (slotp->status == NONE) {
1406 		/*
1407 		 * We didn't find a slot; the new directory entry should be put
1408 		 * at the end of the directory.  Return an indication of where
1409 		 * this is, and set "endoff" to zero; since we're going to have
1410 		 * to extend the directory, we're certainly not going to
1411 		 * truncate it.
1412 		 */
1413 		slotp->offset = dirsize;
1414 		slotp->size = DIRBLKSIZ;
1415 		slotp->endoff = 0;
1416 	} else {
1417 		/*
1418 		 * We found a slot, and will return an indication of where that
1419 		 * slot is, as any new directory entry will be put there.
1420 		 * Since that slot will become a useful entry, if the last
1421 		 * useful entry we found was before this one, update the offset
1422 		 * of the last useful entry.
1423 		 */
1424 		if (enduseful < slotp->offset + slotp->size)
1425 			enduseful = slotp->offset + slotp->size;
1426 		slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t);
1427 	}
1428 	*ipp = NULL;
1429 	return (0);
1430 }
1431 
1432 uint64_t ufs_dirrename_retry_cnt;
1433 
1434 /*
1435  * Rename the entry in the directory tdp so that it points to
1436  * sip instead of tip.
1437  */
1438 static int
1439 ufs_dirrename(
1440 	struct inode *sdp,	/* parent directory of source */
1441 	struct inode *sip,	/* source inode */
1442 	struct inode *tdp,	/* parent directory of target */
1443 	char *namep,		/* entry we are trying to change */
1444 	struct inode *tip,	/* target inode */
1445 	struct slot *slotp,	/* slot for entry */
1446 	struct cred *cr)	/* credentials */
1447 {
1448 	vnode_t *tdvp;
1449 	off_t offset;
1450 	int err;
1451 	int doingdirectory;
1452 
1453 	ASSERT(sdp->i_ufsvfs != NULL);
1454 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1455 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1456 	/*
1457 	 * Short circuit rename of something to itself.
1458 	 */
1459 	if (sip->i_number == tip->i_number) {
1460 		return (ESAME); /* special KLUDGE error code */
1461 	}
1462 
1463 	/*
1464 	 * We're locking 2 peer level locks, so must use tryenter
1465 	 * on the 2nd to avoid deadlocks that would occur
1466 	 * if we renamed a->b and b->a concurrently.
1467 	 */
1468 retry:
1469 	rw_enter(&tip->i_contents, RW_WRITER);
1470 	if (!rw_tryenter(&sip->i_contents, RW_READER)) {
1471 		/*
1472 		 * drop tip and wait (sleep) until we stand a chance
1473 		 * of holding sip
1474 		 */
1475 		rw_exit(&tip->i_contents);
1476 		rw_enter(&sip->i_contents, RW_READER);
1477 		/*
1478 		 * Reverse the lock grabs in case we have heavy
1479 		 * contention on the 2nd lock.
1480 		 */
1481 		if (!rw_tryenter(&tip->i_contents, RW_WRITER)) {
1482 			ufs_dirrename_retry_cnt++;
1483 			rw_exit(&sip->i_contents);
1484 			goto retry;
1485 		}
1486 	}
1487 
1488 	/*
1489 	 * Check that everything is on the same filesystem.
1490 	 */
1491 	if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) ||
1492 	    (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) {
1493 		err = EXDEV;		/* XXX archaic */
1494 		goto out;
1495 	}
1496 	/*
1497 	 * Must have write permission to rewrite target entry.
1498 	 * Perform additional checks for sticky directories.
1499 	 */
1500 	if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0 ||
1501 	    (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0)
1502 		goto out;
1503 
1504 	/*
1505 	 * Ensure source and target are compatible (both directories
1506 	 * or both not directories).  If target is a directory it must
1507 	 * be empty and have no links to it; in addition it must not
1508 	 * be a mount point, and both the source and target must be
1509 	 * writable.
1510 	 */
1511 	doingdirectory = (((sip->i_mode & IFMT) == IFDIR) ||
1512 	    ((sip->i_mode & IFMT) == IFATTRDIR));
1513 	if (((tip->i_mode & IFMT) == IFDIR) ||
1514 	    ((tip->i_mode & IFMT) == IFATTRDIR)) {
1515 		if (!doingdirectory) {
1516 			err = EISDIR;
1517 			goto out;
1518 		}
1519 		/*
1520 		 * vn_vfslock will prevent mounts from using the directory until
1521 		 * we are done.
1522 		 */
1523 		if (vn_vfslock(ITOV(tip))) {
1524 			err = EBUSY;
1525 			goto out;
1526 		}
1527 		if (vn_mountedvfs(ITOV(tip)) != NULL) {
1528 			vn_vfsunlock(ITOV(tip));
1529 			err = EBUSY;
1530 			goto out;
1531 		}
1532 		if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) {
1533 			vn_vfsunlock(ITOV(tip));
1534 			err = EEXIST;	/* SIGH should be ENOTEMPTY */
1535 			goto out;
1536 		}
1537 	} else if (doingdirectory) {
1538 		err = ENOTDIR;
1539 		goto out;
1540 	}
1541 
1542 	/*
1543 	 * Rewrite the inode pointer for target name entry
1544 	 * from the target inode (ip) to the source inode (sip).
1545 	 * This prevents the target entry from disappearing
1546 	 * during a crash. Mark the directory inode to reflect the changes.
1547 	 */
1548 	tdvp = ITOV(tdp);
1549 	slotp->ep->d_ino = (int32_t)sip->i_number;
1550 	dnlc_update(tdvp, namep, ITOV(sip));
1551 	if (slotp->size) {
1552 		offset = slotp->offset - slotp->size;
1553 	} else {
1554 		offset = slotp->offset + 1;
1555 	}
1556 	if (slotp->cached) {
1557 		(void) dnlc_dir_update(&tdp->i_danchor, namep,
1558 		    INO_OFF_TO_H(slotp->ep->d_ino, offset));
1559 	}
1560 
1561 	err = TRANS_DIR(tdp, slotp->offset);
1562 	if (err)
1563 		fbrelse(slotp->fbp, S_OTHER);
1564 	else
1565 		err = ufs_fbwrite(slotp->fbp, tdp);
1566 
1567 	slotp->fbp = NULL;
1568 	if (err) {
1569 		if (doingdirectory)
1570 			vn_vfsunlock(ITOV(tip));
1571 		goto out;
1572 	}
1573 
1574 	TRANS_INODE(tdp->i_ufsvfs, tdp);
1575 	tdp->i_flag |= IUPD|ICHG;
1576 	tdp->i_seq++;
1577 	ITIMES_NOLOCK(tdp);
1578 
1579 	/*
1580 	 * Decrement the link count of the target inode.
1581 	 * Fix the ".." entry in sip to point to dp.
1582 	 * This is done after the new entry is on the disk.
1583 	 */
1584 	tip->i_nlink--;
1585 	TRANS_INODE(tip->i_ufsvfs, tip);
1586 	tip->i_flag |= ICHG;
1587 	tip->i_seq++;
1588 	ITIMES_NOLOCK(tip);
1589 	if (doingdirectory) {
1590 		/*
1591 		 * The entry for tip no longer exists so I can unlock the
1592 		 * vfslock.
1593 		 */
1594 		vn_vfsunlock(ITOV(tip));
1595 		/*
1596 		 * Decrement target link count once more if it was a directory.
1597 		 */
1598 		if (--tip->i_nlink != 0) {
1599 			err = ufs_fault(ITOV(tip),
1600 		    "ufs_dirrename: target directory link count != 0 (%s)",
1601 			    tip->i_fs->fs_fsmnt);
1602 			rw_exit(&tip->i_contents);
1603 			return (err);
1604 		}
1605 		TRANS_INODE(tip->i_ufsvfs, tip);
1606 		ufs_setreclaim(tip);
1607 		/*
1608 		 * Renaming a directory with the parent different
1609 		 * requires that ".." be rewritten.  The window is
1610 		 * still there for ".." to be inconsistent, but this
1611 		 * is unavoidable, and a lot shorter than when it was
1612 		 * done in a user process.  We decrement the link
1613 		 * count in the new parent as appropriate to reflect
1614 		 * the just-removed target.  If the parent is the
1615 		 * same, this is appropriate since the original
1616 		 * directory is going away.  If the new parent is
1617 		 * different, ufs_dirfixdotdot() will bump the link count
1618 		 * back.
1619 		 */
1620 		tdp->i_nlink--;
1621 		ufs_setreclaim(tdp);
1622 		TRANS_INODE(tdp->i_ufsvfs, tdp);
1623 		tdp->i_flag |= ICHG;
1624 		tdp->i_seq++;
1625 		ITIMES_NOLOCK(tdp);
1626 		if (sdp != tdp) {
1627 			rw_exit(&tip->i_contents);
1628 			rw_exit(&sip->i_contents);
1629 			err = ufs_dirfixdotdot(sip, sdp, tdp);
1630 			return (err);
1631 		}
1632 	} else
1633 		ufs_setreclaim(tip);
1634 out:
1635 	rw_exit(&tip->i_contents);
1636 	rw_exit(&sip->i_contents);
1637 	return (err);
1638 }
1639 
1640 /*
1641  * Fix the ".." entry of the child directory so that it points
1642  * to the new parent directory instead of the old one.  Routine
1643  * assumes that dp is a directory and that all the inodes are on
1644  * the same file system.
1645  */
1646 static int
1647 ufs_dirfixdotdot(
1648 	struct inode *dp,	/* child directory */
1649 	struct inode *opdp,	/* old parent directory */
1650 	struct inode *npdp)	/* new parent directory */
1651 {
1652 	struct fbuf *fbp;
1653 	struct dirtemplate *dirp;
1654 	vnode_t *dvp;
1655 	int err;
1656 
1657 	ASSERT(RW_WRITE_HELD(&npdp->i_rwlock));
1658 	ASSERT(RW_WRITE_HELD(&npdp->i_contents));
1659 
1660 	/*
1661 	 * We hold the child directory's i_contents lock before calling
1662 	 * blkatoff so that we honor correct locking protocol which is
1663 	 * i_contents lock and then page lock. (blkatoff will call
1664 	 * ufs_getpage where we want the page lock)
1665 	 * We hold the child directory's i_rwlock before i_contents (as
1666 	 * per the locking protocol) since we are modifying the ".." entry
1667 	 * of the child directory.
1668 	 * We hold the i_rwlock and i_contents lock until we record
1669 	 * this directory delta to the log (via ufs_trans_dir) and have
1670 	 * done fbrelse.
1671 	 */
1672 	rw_enter(&dp->i_rwlock, RW_WRITER);
1673 	rw_enter(&dp->i_contents, RW_WRITER);
1674 	err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp);
1675 	if (err)
1676 		goto bad;
1677 
1678 	if (dp->i_nlink <= 0 ||
1679 	    dp->i_size < sizeof (struct dirtemplate)) {
1680 		err = ENOENT;
1681 		goto bad;
1682 	}
1683 
1684 	if (dirp->dotdot_namlen != 2 ||
1685 	    dirp->dotdot_name[0] != '.' ||
1686 	    dirp->dotdot_name[1] != '.') {	/* Sanity check. */
1687 		dirbad(dp, "mangled .. entry", (off_t)0);
1688 		err = ENOTDIR;
1689 		goto bad;
1690 	}
1691 
1692 	/*
1693 	 * Increment the link count in the new parent inode and force it out.
1694 	 */
1695 	if (npdp->i_nlink == MAXLINK) {
1696 		err = EMLINK;
1697 		goto bad;
1698 	}
1699 	npdp->i_nlink++;
1700 	TRANS_INODE(npdp->i_ufsvfs, npdp);
1701 	npdp->i_flag |= ICHG;
1702 	npdp->i_seq++;
1703 	ufs_iupdat(npdp, I_SYNC);
1704 
1705 	/*
1706 	 * Rewrite the child ".." entry and force it out.
1707 	 */
1708 	dvp = ITOV(dp);
1709 	dirp->dotdot_ino = (uint32_t)npdp->i_number;
1710 	dnlc_update(dvp, "..", ITOV(npdp));
1711 	(void) dnlc_dir_update(&dp->i_danchor, "..",
1712 	    INO_OFF_TO_H(dirp->dotdot_ino, 0));
1713 
1714 	err = TRANS_DIR(dp, 0);
1715 	if (err)
1716 		fbrelse(fbp, S_OTHER);
1717 	else
1718 		err = ufs_fbwrite(fbp, dp);
1719 
1720 	fbp = NULL;
1721 	if (err)
1722 		goto bad;
1723 
1724 	rw_exit(&dp->i_contents);
1725 	rw_exit(&dp->i_rwlock);
1726 
1727 	/*
1728 	 * Decrement the link count of the old parent inode and force it out.
1729 	 */
1730 	ASSERT(opdp);
1731 	rw_enter(&opdp->i_contents, RW_WRITER);
1732 	ASSERT(opdp->i_nlink > 0);
1733 	opdp->i_nlink--;
1734 	ufs_setreclaim(opdp);
1735 	TRANS_INODE(opdp->i_ufsvfs, opdp);
1736 	opdp->i_flag |= ICHG;
1737 	opdp->i_seq++;
1738 	ufs_iupdat(opdp, I_SYNC);
1739 	rw_exit(&opdp->i_contents);
1740 	return (0);
1741 
1742 bad:
1743 	if (fbp)
1744 		fbrelse(fbp, S_OTHER);
1745 	rw_exit(&dp->i_contents);
1746 	rw_exit(&dp->i_rwlock);
1747 	return (err);
1748 }
1749 
1750 /*
1751  * Enter the file sip in the directory tdp with name namep.
1752  */
1753 static int
1754 ufs_diraddentry(
1755 	struct inode *tdp,
1756 	char *namep,
1757 	enum de_op op,
1758 	int namlen,
1759 	struct slot *slotp,
1760 	struct inode *sip,
1761 	struct inode *sdp,
1762 	struct cred *cr)
1763 {
1764 	struct direct *ep, *nep;
1765 	vnode_t *tdvp;
1766 	dcanchor_t *dcap = &tdp->i_danchor;
1767 	off_t offset;
1768 	int err;
1769 	ushort_t extra;
1770 
1771 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1772 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1773 	/*
1774 	 * Prepare a new entry.  If the caller has not supplied an
1775 	 * existing inode, make a new one.
1776 	 */
1777 	err = dirprepareentry(tdp, slotp, cr);
1778 	if (err) {
1779 		if (slotp->fbp) {
1780 			fbrelse(slotp->fbp, S_OTHER);
1781 			slotp->fbp = NULL;
1782 		}
1783 		return (err);
1784 	}
1785 	/*
1786 	 * Check inode to be linked to see if it is in the
1787 	 * same filesystem.
1788 	 */
1789 	if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) {
1790 		err = EXDEV;
1791 		goto bad;
1792 	}
1793 
1794 	/*
1795 	 * If renaming a directory then fix up the ".." entry in the
1796 	 * directory to point to the new parent.
1797 	 */
1798 	if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) ||
1799 	    ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) {
1800 		err = ufs_dirfixdotdot(sip, sdp, tdp);
1801 		if (err)
1802 			goto bad;
1803 	}
1804 
1805 	/*
1806 	 * Fill in entry data.
1807 	 */
1808 	ep = slotp->ep;
1809 	ep->d_namlen = (ushort_t)namlen;
1810 	(void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3));
1811 	ep->d_ino = (uint32_t)sip->i_number;
1812 	tdvp = ITOV(tdp);
1813 	dnlc_update(tdvp, namep, ITOV(sip));
1814 	/*
1815 	 * Note the offset supplied for any named entry is
1816 	 * the offset of the previous one, unless it's the 1st.
1817 	 * slotp->size is used to pass the length to
1818 	 * the previous entry.
1819 	 */
1820 	if (slotp->size) {
1821 		offset = slotp->offset - slotp->size;
1822 	} else {
1823 		offset = slotp->offset + 1;
1824 	}
1825 
1826 	if (slotp->cached) {
1827 		/*
1828 		 * Add back any usable unused space to the dnlc directory
1829 		 * cache.
1830 		 */
1831 		extra = ep->d_reclen - DIRSIZ(ep);
1832 		if (extra >= LDIRSIZ(1)) {
1833 			(void) dnlc_dir_add_space(dcap, extra,
1834 			    (uint64_t)slotp->offset);
1835 		}
1836 
1837 		(void) dnlc_dir_add_entry(dcap, namep,
1838 		    INO_OFF_TO_H(ep->d_ino, offset));
1839 
1840 		/* adjust the previous offset of the next entry */
1841 		nep = (struct direct *)((char *)ep + ep->d_reclen);
1842 		if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
1843 			/*
1844 			 * Not a new block.
1845 			 *
1846 			 * Check the validity of the next entry.
1847 			 * If it's bad, then throw away the cache, and
1848 			 * continue as before directory caching.
1849 			 */
1850 			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
1851 			    dnlc_dir_update(dcap, nep->d_name,
1852 			    INO_OFF_TO_H(nep->d_ino, slotp->offset))
1853 			    == DNOENT) {
1854 				dnlc_dir_purge(dcap);
1855 				slotp->cached = 0;
1856 			}
1857 		}
1858 	}
1859 
1860 	/*
1861 	 * Write out the directory block.
1862 	 */
1863 	err = TRANS_DIR(tdp, slotp->offset);
1864 	if (err)
1865 		fbrelse(slotp->fbp, S_OTHER);
1866 	else
1867 		err = ufs_fbwrite(slotp->fbp, tdp);
1868 
1869 	slotp->fbp = NULL;
1870 	/*
1871 	 * If this is a rename of a directory, then we have already
1872 	 * fixed the ".." entry to refer to the new parent. If err
1873 	 * is true at this point, we have failed to update the new
1874 	 * parent to refer to the renamed directory.
1875 	 * XXX - we need to unwind the ".." fix.
1876 	 */
1877 	if (err)
1878 		return (err);
1879 
1880 	/*
1881 	 * Mark the directory inode to reflect the changes.
1882 	 * Truncate the directory to chop off blocks of empty entries.
1883 	 */
1884 
1885 	TRANS_INODE(tdp->i_ufsvfs, tdp);
1886 	tdp->i_flag |= IUPD|ICHG;
1887 	tdp->i_seq++;
1888 	tdp->i_diroff = 0;
1889 	ITIMES_NOLOCK(tdp);
1890 	/*
1891 	 * If the directory grew then dirprepareentry() will have
1892 	 * set IATTCHG in tdp->i_flag, then the directory inode must
1893 	 * be flushed out. This is because if fsync() is used later
1894 	 * the directory size must be correct, otherwise a crash would
1895 	 * cause fsck to move the file to lost+found. Also because later
1896 	 * a file may be linked in more than one directory, then there
1897 	 * is no way to flush the original directory. So it must be
1898 	 * flushed out on creation. See bug 4293809.
1899 	 */
1900 	if (tdp->i_flag & IATTCHG) {
1901 		ufs_iupdat(tdp, I_SYNC);
1902 	}
1903 
1904 	if (slotp->endoff && (slotp->endoff < tdp->i_size)) {
1905 		if (!TRANS_ISTRANS(tdp->i_ufsvfs)) {
1906 			(void) ufs_itrunc(tdp, (u_offset_t)slotp->endoff, 0,
1907 						cr);
1908 		}
1909 	}
1910 
1911 
1912 	return (0);
1913 
1914 bad:
1915 	if (slotp->cached) {
1916 		dnlc_dir_purge(dcap);
1917 		fbrelse(slotp->fbp, S_OTHER);
1918 		slotp->cached = 0;
1919 		slotp->fbp = NULL;
1920 		return (err);
1921 	}
1922 
1923 	/*
1924 	 * Clear out entry prepared by dirprepareent.
1925 	 */
1926 	slotp->ep->d_ino = 0;
1927 	slotp->ep->d_namlen = 0;
1928 
1929 	/*
1930 	 * Don't touch err so we don't clobber the real error that got us here.
1931 	 */
1932 	if (TRANS_DIR(tdp, slotp->offset))
1933 		fbrelse(slotp->fbp, S_OTHER);
1934 	else
1935 		(void) ufs_fbwrite(slotp->fbp, tdp);
1936 	slotp->fbp = NULL;
1937 	return (err);
1938 }
1939 
1940 /*
1941  * Prepare a directory slot to receive an entry.
1942  */
1943 static int
1944 dirprepareentry(
1945 	struct inode *dp,	/* directory we are working in */
1946 	struct slot *slotp,	/* available slot info */
1947 	struct cred *cr)
1948 {
1949 	struct direct *ep, *nep;
1950 	off_t entryend;
1951 	int err;
1952 	slotstat_t status = slotp->status;
1953 	ushort_t dsize;
1954 
1955 	ASSERT((status == NONE) || (status == FOUND));
1956 	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
1957 	ASSERT(RW_WRITE_HELD(&dp->i_contents));
1958 	/*
1959 	 * If we didn't find a slot, then indicate that the
1960 	 * new slot belongs at the end of the directory.
1961 	 * If we found a slot, then the new entry can be
1962 	 * put at slotp->offset.
1963 	 */
1964 	entryend = slotp->offset + slotp->size;
1965 	if (status == NONE) {
1966 		ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0);
1967 		if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
1968 			err = ufs_fault(ITOV(dp),
1969 			    "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d"
1970 			    " > dp->i_fs->fs_fsize: %d (%s)",
1971 			    DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt);
1972 			return (err);
1973 		}
1974 		/*
1975 		 * Allocate the new block.
1976 		 */
1977 		err = BMAPALLOC(dp, (u_offset_t)slotp->offset,
1978 		    (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr);
1979 		if (err) {
1980 			return (err);
1981 		}
1982 		dp->i_size = entryend;
1983 		TRANS_INODE(dp->i_ufsvfs, dp);
1984 		dp->i_flag |= IUPD|ICHG|IATTCHG;
1985 		dp->i_seq++;
1986 		ITIMES_NOLOCK(dp);
1987 	} else if (entryend > dp->i_size) {
1988 		/*
1989 		 * Adjust directory size, if needed. This should never
1990 		 * push the size past a new multiple of DIRBLKSIZ.
1991 		 * This is an artifact of the old (4.2BSD) way of initializing
1992 		 * directory sizes to be less than DIRBLKSIZ.
1993 		 */
1994 		dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t);
1995 		TRANS_INODE(dp->i_ufsvfs, dp);
1996 		dp->i_flag |= IUPD|ICHG|IATTCHG;
1997 		dp->i_seq++;
1998 		ITIMES_NOLOCK(dp);
1999 	}
2000 
2001 	/*
2002 	 * Get the block containing the space for the new directory entry.
2003 	 */
2004 	if (slotp->fbp == NULL) {
2005 		err = blkatoff(dp, slotp->offset, (char **)&slotp->ep,
2006 		    &slotp->fbp);
2007 		if (err) {
2008 			return (err);
2009 		}
2010 	}
2011 	ep = slotp->ep;
2012 
2013 	switch (status) {
2014 	case NONE:
2015 		/*
2016 		 * No space in the directory. slotp->offset will be on a
2017 		 * directory block boundary and we will write the new entry
2018 		 * into a fresh block.
2019 		 */
2020 		ep->d_reclen = DIRBLKSIZ;
2021 		slotp->size = 0; /* length of previous entry */
2022 		break;
2023 	case FOUND:
2024 		/*
2025 		 * An entry of the required size has been found. Use it.
2026 		 */
2027 		if (ep->d_ino == 0) {
2028 			/* this is the 1st record in a block */
2029 			slotp->size = 0; /* length of previous entry */
2030 		} else {
2031 			dsize = DIRSIZ(ep);
2032 			nep = (struct direct *)((char *)ep + dsize);
2033 			nep->d_reclen = ep->d_reclen - dsize;
2034 			ep->d_reclen = dsize;
2035 			slotp->ep = nep;
2036 			slotp->offset += dsize;
2037 			slotp->size = dsize; /* length of previous entry */
2038 		}
2039 		break;
2040 	default:
2041 		break;
2042 	}
2043 	return (0);
2044 }
2045 
2046 /*
2047  * Allocate and initialize a new inode that will go into directory tdp.
2048  * This routine is called from ufs_symlink(), as well as within this file.
2049  */
2050 int
2051 ufs_dirmakeinode(
2052 	struct inode *tdp,
2053 	struct inode **ipp,
2054 	struct vattr *vap,
2055 	enum de_op op,
2056 	struct cred *cr)
2057 {
2058 	struct inode *ip;
2059 	enum vtype type;
2060 	int imode;			/* mode and format as in inode */
2061 	ino_t ipref;
2062 	int err;
2063 	timestruc_t now;
2064 
2065 	ASSERT(vap != NULL);
2066 	ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR ||
2067 		op == DE_SYMLINK);
2068 	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
2069 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
2070 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
2071 	/*
2072 	 * Allocate a new inode.
2073 	 */
2074 	type = vap->va_type;
2075 	if (type == VDIR) {
2076 		ipref = dirpref(tdp);
2077 	} else {
2078 		ipref = tdp->i_number;
2079 	}
2080 	if (op == DE_ATTRDIR)
2081 		imode = vap->va_mode;
2082 	else
2083 		imode = MAKEIMODE(type, vap->va_mode);
2084 	*ipp = NULL;
2085 	err = ufs_ialloc(tdp, ipref, imode, &ip, cr);
2086 	if (err)
2087 		return (err);
2088 
2089 	/*
2090 	 * We don't need to grab vfs_dqrwlock here because it is held
2091 	 * in ufs_direnter_*() above us.
2092 	 */
2093 	ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock));
2094 	rw_enter(&ip->i_contents, RW_WRITER);
2095 	if (ip->i_dquot != NULL) {
2096 		err = ufs_fault(ITOV(ip),
2097 		    "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)",
2098 				    tdp->i_fs->fs_fsmnt);
2099 		rw_exit(&ip->i_contents);
2100 		return (err);
2101 	}
2102 	*ipp = ip;
2103 	ip->i_mode = (o_mode_t)imode;
2104 	if (type == VBLK || type == VCHR) {
2105 		dev_t d = vap->va_rdev;
2106 		dev32_t dev32;
2107 
2108 		/*
2109 		 * Don't allow a special file to be created with a
2110 		 * dev_t that cannot be represented by this filesystem
2111 		 * format on disk.
2112 		 */
2113 		if (!cmpldev(&dev32, d)) {
2114 			err = EOVERFLOW;
2115 			goto fail;
2116 		}
2117 
2118 		ITOV(ip)->v_rdev = ip->i_rdev = d;
2119 
2120 		if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
2121 			ip->i_ordev = dev32; /* can't use old format */
2122 		} else {
2123 			ip->i_ordev = cmpdev(d);
2124 		}
2125 	}
2126 	ITOV(ip)->v_type = type;
2127 	ufs_reset_vnode(ip->i_vnode);
2128 	if (type == VDIR) {
2129 		ip->i_nlink = 2; /* anticipating a call to dirmakedirect */
2130 	} else {
2131 		ip->i_nlink = 1;
2132 	}
2133 
2134 	if (op == DE_ATTRDIR) {
2135 		ip->i_uid = vap->va_uid;
2136 		ip->i_gid = vap->va_gid;
2137 	} else
2138 		ip->i_uid = crgetuid(cr);
2139 	/*
2140 	 * To determine the group-id of the created file:
2141 	 *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
2142 	 *	clients are not likely to set the gid), then use it if
2143 	 *	the process is privileged, belongs to the target group,
2144 	 *	or the group is the same as the parent directory.
2145 	 *   2) If the filesystem was not mounted with the Old-BSD-compatible
2146 	 *	GRPID option, and the directory's set-gid bit is clear,
2147 	 *	then use the process's gid.
2148 	 *   3) Otherwise, set the group-id to the gid of the parent directory.
2149 	 */
2150 	if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) &&
2151 	    ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) ||
2152 	    secpolicy_vnode_create_gid(cr) == 0)) {
2153 		/*
2154 		 * XXX - is this only the case when a 4.0 NFS client, or a
2155 		 * client derived from that code, makes a call over the wire?
2156 		 */
2157 		ip->i_gid = vap->va_gid;
2158 	} else
2159 		ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr);
2160 
2161 	/*
2162 	 * For SunOS 5.0->5.4, the lines below read:
2163 	 *
2164 	 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
2165 	 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
2166 	 *
2167 	 * where MAXUID was set to 60002.  See notes on this in ufs_inode.c
2168 	 */
2169 	ip->i_suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
2170 		UID_LONG : ip->i_uid;
2171 	ip->i_sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
2172 		GID_LONG : ip->i_gid;
2173 
2174 	/*
2175 	 * If we're creating a directory, and the parent directory has the
2176 	 * set-GID bit set, set it on the new directory.
2177 	 * Otherwise, if the user is neither privileged nor a member of the
2178 	 * file's new group, clear the file's set-GID bit.
2179 	 */
2180 	if ((tdp->i_mode & ISGID) && (type == VDIR))
2181 		ip->i_mode |= ISGID;
2182 	else {
2183 		if ((ip->i_mode & ISGID) &&
2184 		    secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0)
2185 			ip->i_mode &= ~ISGID;
2186 	}
2187 
2188 	if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2189 	    ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2190 		err = EOVERFLOW;
2191 		goto fail;
2192 	}
2193 
2194 	/*
2195 	 * Extended attribute directories are not subject to quotas.
2196 	 */
2197 	if (op != DE_ATTRDIR)
2198 		ip->i_dquot = getinoquota(ip);
2199 	else
2200 		ip->i_dquot = NULL;
2201 
2202 	if (op == DE_MKDIR || op == DE_ATTRDIR) {
2203 		err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 0 : 1, cr);
2204 		if (err)
2205 			goto fail;
2206 	}
2207 
2208 	/*
2209 	 * generate the shadow inode and attach it to the new object
2210 	 */
2211 	ASSERT((tdp->i_shadow && tdp->i_ufs_acl) ||
2212 	    (!tdp->i_shadow && !tdp->i_ufs_acl));
2213 	if (tdp->i_shadow && tdp->i_ufs_acl &&
2214 	    (((tdp->i_mode & IFMT) == IFDIR) ||
2215 	    ((tdp->i_mode & IFMT) == IFATTRDIR))) {
2216 		err = ufs_si_inherit(ip, tdp, ip->i_mode, cr);
2217 		if (err) {
2218 			if (op == DE_MKDIR) {
2219 				/*
2220 				 * clean up parent directory
2221 				 *
2222 				 * tdp->i_contents already locked from
2223 				 * ufs_direnter_*()
2224 				 */
2225 				tdp->i_nlink--;
2226 				TRANS_INODE(tdp->i_ufsvfs, tdp);
2227 				tdp->i_flag |= ICHG;
2228 				tdp->i_seq++;
2229 				ufs_iupdat(tdp, I_SYNC);
2230 			}
2231 			goto fail;
2232 		}
2233 	}
2234 
2235 	/*
2236 	 * If the passed in attributes contain atime and/or mtime
2237 	 * settings, then use them instead of using the current
2238 	 * high resolution time.
2239 	 */
2240 	if (vap->va_mask & (AT_MTIME|AT_ATIME)) {
2241 		if (vap->va_mask & AT_ATIME) {
2242 			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
2243 			ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2244 			ip->i_flag &= ~IACC;
2245 		} else
2246 			ip->i_flag |= IACC;
2247 		if (vap->va_mask & AT_MTIME) {
2248 			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
2249 			ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2250 			gethrestime(&now);
2251 			if (now.tv_sec > TIME32_MAX) {
2252 				/*
2253 				 * In 2038, ctime sticks forever..
2254 				 */
2255 				ip->i_ctime.tv_sec = TIME32_MAX;
2256 				ip->i_ctime.tv_usec = 0;
2257 			} else {
2258 				ip->i_ctime.tv_sec = now.tv_sec;
2259 				ip->i_ctime.tv_usec = now.tv_nsec / 1000;
2260 			}
2261 			ip->i_flag &= ~(IUPD|ICHG);
2262 			ip->i_flag |= IMODTIME;
2263 		} else
2264 			ip->i_flag |= IUPD|ICHG;
2265 		ip->i_flag |= IMOD;
2266 	} else
2267 		ip->i_flag |= IACC|IUPD|ICHG;
2268 	ip->i_seq++;
2269 
2270 	/*
2271 	 * If this is an attribute tag it as one.
2272 	 */
2273 	if ((tdp->i_mode & IFMT) == IFATTRDIR) {
2274 		ip->i_cflags |= IXATTR;
2275 	}
2276 
2277 	/*
2278 	 * push inode before it's name appears in a directory
2279 	 */
2280 	TRANS_INODE(ip->i_ufsvfs, ip);
2281 	ufs_iupdat(ip, I_SYNC);
2282 	rw_exit(&ip->i_contents);
2283 	return (0);
2284 
2285 fail:
2286 	/* Throw away inode we just allocated. */
2287 	ip->i_nlink = 0;
2288 	ufs_setreclaim(ip);
2289 	TRANS_INODE(ip->i_ufsvfs, ip);
2290 	ip->i_flag |= ICHG;
2291 	ip->i_seq++;
2292 	ITIMES_NOLOCK(ip);
2293 	rw_exit(&ip->i_contents);
2294 	return (err);
2295 }
2296 
2297 /*
2298  * Write a prototype directory into the empty inode ip, whose parent is dp.
2299  */
2300 static int
2301 ufs_dirmakedirect(
2302 	struct inode *ip,		/* new directory */
2303 	struct inode *dp,		/* parent directory */
2304 	int	attrdir,
2305 	struct cred *cr)
2306 {
2307 	struct dirtemplate *dirp;
2308 	struct fbuf *fbp;
2309 	int err;
2310 
2311 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
2312 	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
2313 	ASSERT(RW_WRITE_HELD(&dp->i_contents));
2314 	/*
2315 	 * Allocate space for the directory we're creating.
2316 	 */
2317 	err = BMAPALLOC(ip, (u_offset_t)0, DIRBLKSIZ, cr);
2318 	if (err)
2319 		return (err);
2320 	if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
2321 		err = ufs_fault(ITOV(dp),
2322 "ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)",
2323 					DIRBLKSIZ, dp->i_fs->fs_fsize,
2324 					dp->i_fs->fs_fsmnt);
2325 		return (err);
2326 	}
2327 	ip->i_size = DIRBLKSIZ;
2328 	TRANS_INODE(ip->i_ufsvfs, ip);
2329 	ip->i_flag |= IUPD|ICHG|IATTCHG;
2330 	ip->i_seq++;
2331 	ITIMES_NOLOCK(ip);
2332 	/*
2333 	 * Update the tdp link count and write out the change.
2334 	 * This reflects the ".." entry we'll soon write.
2335 	 */
2336 	if (dp->i_nlink == MAXLINK)
2337 		return (EMLINK);
2338 	if (attrdir == 0)
2339 		dp->i_nlink++;
2340 	TRANS_INODE(dp->i_ufsvfs, dp);
2341 	dp->i_flag |= ICHG;
2342 	dp->i_seq++;
2343 	ufs_iupdat(dp, I_SYNC);
2344 	/*
2345 	 * Initialize directory with "."
2346 	 * and ".." from static template.
2347 	 *
2348 	 * Since the parent directory is locked, we don't have to
2349 	 * worry about anything changing when we drop the write
2350 	 * lock on (ip).
2351 	 *
2352 	 */
2353 	err = fbread(ITOV(ip), (offset_t)0, (uint_t)ip->i_fs->fs_fsize,
2354 	    S_READ, &fbp);
2355 
2356 	if (err) {
2357 		goto fail;
2358 	}
2359 	dirp = (struct dirtemplate *)fbp->fb_addr;
2360 	/*
2361 	 * Now initialize the directory we're creating
2362 	 * with the "." and ".." entries.
2363 	 */
2364 	*dirp = mastertemplate;			/* structure assignment */
2365 	dirp->dot_ino = (uint32_t)ip->i_number;
2366 	dirp->dotdot_ino = (uint32_t)dp->i_number;
2367 
2368 	err = TRANS_DIR(ip, 0);
2369 	if (err) {
2370 		fbrelse(fbp, S_OTHER);
2371 		goto fail;
2372 	}
2373 
2374 	err = ufs_fbwrite(fbp, ip);
2375 	if (err) {
2376 		goto fail;
2377 	}
2378 
2379 	return (0);
2380 
2381 fail:
2382 	if (attrdir == 0)
2383 		dp->i_nlink--;
2384 	TRANS_INODE(dp->i_ufsvfs, dp);
2385 	dp->i_flag |= ICHG;
2386 	dp->i_seq++;
2387 	ufs_iupdat(dp, I_SYNC);
2388 	return (err);
2389 }
2390 
/*
 * Delete a directory entry.  If oip is nonzero the entry is checked
 * to make sure it still reflects oip.
 *
 * If vpp is non-null, return the ptr of the (held) vnode associated with
 * the removed name.  The caller is responsible for doing the VN_RELE().
 *
 * Parameters:
 *	dp	directory to remove the name from; its i_rwlock must be
 *		held as writer on entry (it is dropped and re-taken
 *		internally if the retry path is used).
 *	namep	name to remove; "." yields EINVAL and ".." yields EEXIST.
 *	oip	if non-NULL, the inode the entry is expected to refer to;
 *		a mismatch yields ENOENT.
 *	cdir	for DR_RMDIR, the removal fails with EINVAL if the entry's
 *		vnode equals cdir.
 *	op	DR_RMDIR, DR_REMOVE or DR_RENAME; selects the extra checks.
 *	cr	credentials used for the access and policy checks.
 *	vpp	optional out parameter, see above.
 *
 * Returns 0 on success or an errno value.
 */
int
ufs_dirremove(
	struct inode *dp,
	char *namep,
	struct inode *oip,
	struct vnode *cdir,
	enum dr_op op,
	struct cred *cr,
	vnode_t **vpp)	/* Return (held) vnode ptr of removed file/dir */
{
	struct direct *ep, *pep, *nep;
	struct inode *ip;
	vnode_t *dvp, *vp;
	struct slot slot;
	int namlen;
	int err;
	int mode;
	ushort_t extra;

	namlen = (int)strlen(namep);
	if (namlen == 0)
		return (ufs_fault(ITOV(dp), "ufs_dirremove: namlen == 0"));
	/*
	 * return error when removing . and ..
	 */
	if (namep[0] == '.') {
		if (namlen == 1)
			return (EINVAL);
		else if (namlen == 2 && namep[1] == '.') {
			return (EEXIST);	/* SIGH should be ENOTEMPTY */
		}
	}

	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
	/*
	 * Check accessibility of directory.
	 * We come back here, with only dp->i_rwlock held, after backing off
	 * from a failed attempt to lock the victim directory below.
	 */
retry:
	if (((dp->i_mode & IFMT) != IFDIR) &&
	    ((dp->i_mode & IFMT) != IFATTRDIR)) {
		return (ENOTDIR);
	}

	/*
	 * Execute access is required to search the directory.
	 * Access for write is interpreted as allowing
	 * deletion of files in the directory.
	 */
	if (err = ufs_iaccess(dp, IEXEC|IWRITE, cr)) {
		return (err);
	}

	ip = NULL;
	slot.fbp = NULL;
	slot.status = FOUND;	/* don't need to look for empty slot */
	rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
	rw_enter(&dp->i_contents, RW_WRITER);
	/* Look the name up; on a hit ip is set and slot describes the entry. */
	err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0);
	if (err)
		goto out_novfs;
	if (ip == NULL) {
		err = ENOENT;
		goto out_novfs;
	}
	vp = ITOV(ip);
	if (oip && oip != ip) {
		err = ENOENT;
		goto out_novfs;
	}

	/*
	 * mode is sampled once here and used on every exit path below to
	 * decide whether ip->i_rwlock and the vn_vfslock must be released.
	 */
	mode = ip->i_mode & IFMT;
	if (mode == IFDIR || mode == IFATTRDIR) {

		/*
		 * vn_vfslock() prevents races between mount and rmdir.
		 */
		if (vn_vfslock(vp)) {
			err = EBUSY;
			goto out_novfs;
		}
		if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) {
			err = EBUSY;
			goto out;
		}
		/*
		 * If we are removing a directory, get a lock on it.
		 * Taking a writer lock prevents a parallel ufs_dirlook from
		 * incorrectly entering a negative cache vnode entry in the dnlc
		 * If the directory is empty, it will stay empty until
		 * we can remove it.
		 */
		if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) {
			/*
			 * It is possible that a thread in rename would have
			 * acquired this rwlock. To prevent a deadlock we
			 * do a rw_tryenter. If we fail to get the lock
			 * we drop all the locks we have acquired, wait
			 * for 2 ticks and reacquire the
			 * directory's (dp) i_rwlock and try again.
			 * If we don't drop dp's i_rwlock then we will panic
			 * with a "Deadlock: cycle in blocking chain"
			 * since in ufs_dircheckpath we want dp's i_rwlock.
			 * dp is guaranteed to exist since ufs_dirremove is
			 * called after a VN_HOLD(dp) has been done.
			 */
			ufs_dirremove_retry_cnt++;
			vn_vfsunlock(vp);
			if (slot.fbp)
				fbrelse(slot.fbp, S_OTHER);
			rw_exit(&dp->i_contents);
			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
			rw_exit(&dp->i_rwlock);
			VN_RELE(vp);
			delay(2);
			rw_enter(&dp->i_rwlock, RW_WRITER);
			goto retry;
		}
	}
	rw_enter(&ip->i_contents, RW_READER);

	/*
	 * Now check the restrictions that apply on sticky directories.
	 */
	if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) {
		rw_exit(&ip->i_contents);
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	if (op == DR_RMDIR) {
		/*
		 * For rmdir(2), some special checks are required.
		 * (a) Don't remove any alias of the parent (e.g. ".").
		 * (b) Don't remove the current directory.
		 * (c) Make sure the entry is (still) a directory.
		 * (d) Make sure the directory is empty.
		 */

		if (dp == ip || vp == cdir)
			err = EINVAL;
		else if (((ip->i_mode & IFMT) != IFDIR) &&
		    ((ip->i_mode & IFMT) != IFATTRDIR))
			err = ENOTDIR;
		else if ((ip->i_nlink > 2) ||
		    !ufs_dirempty(ip, dp->i_number, cr)) {
			err = EEXIST;	/* SIGH should be ENOTEMPTY */
		}

		if (err) {
			rw_exit(&ip->i_contents);
			if (mode == IFDIR || mode == IFATTRDIR)
				rw_exit(&ip->i_rwlock);
			goto out;
		}
	} else if (op == DR_REMOVE)  {
		/*
		 * unlink(2) requires a different check: allow only
		 * privileged users to unlink a directory.
		 *
		 * NOTE(review): i_rwlock is released unconditionally here;
		 * this presumably relies on v_type == VDIR implying that
		 * mode was IFDIR above, so the lock was taken -- confirm.
		 */
		if (vp->v_type == VDIR &&
		    secpolicy_fs_linkdir(cr, vp->v_vfsp)) {
			err = EPERM;
			rw_exit(&ip->i_contents);
			rw_exit(&ip->i_rwlock);
			goto out;
		}
	}

	rw_exit(&ip->i_contents);

	/*
	 * Remove the cache'd entry, if any.
	 * The on-disk entry is freed by zeroing its inode number.
	 */
	dvp = ITOV(dp);
	dnlc_remove(dvp, namep);
	ep = slot.ep;
	ep->d_ino = 0;

	if (slot.cached) {
		dcanchor_t *dcap = &dp->i_danchor;

		/*
		 * Keep the directory cache in step: drop the name, and
		 * drop the free-space record for this entry if it had one.
		 */
		(void) dnlc_dir_rem_entry(dcap, namep, NULL);
		if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) {
			(void) dnlc_dir_rem_space_by_handle(dcap, slot.offset);
		}
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 * Note, the previous entry has already been
			 * validated in ufs_dircheckforname().
			 */
			ASSERT(slot.size);
			pep = (struct direct *)((char *)ep - slot.size);
			/*
			 * NOTE(review): on this path the cache is purged and
			 * we jump to nocache without merging ep's record
			 * into pep -- confirm this is intended.
			 */
			if ((pep->d_ino == 0) &&
			    ((uintptr_t)pep & (DIRBLKSIZ - 1))) {
				dnlc_dir_purge(dcap);
				slot.cached = 0;
				goto nocache;
			}
			/* extra is the free space pep held before the merge. */
			if (pep->d_ino) {
				extra = pep->d_reclen - DIRSIZ(pep);
			} else {
				extra = pep->d_reclen;
			}
			if (extra >= LDIRSIZ(1)) {
				(void) dnlc_dir_rem_space_by_handle(dcap,
				    (uint64_t)(slot.offset - slot.size));
			}
			pep->d_reclen += ep->d_reclen;
			(void) dnlc_dir_add_space(dcap, extra + ep->d_reclen,
				(uint64_t)(slot.offset - slot.size));
			/* adjust the previous pointer in the next entry */
			nep = (struct direct *)((char *)ep + ep->d_reclen);
			if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
				/*
				 * Not a new block.
				 *
				 * Check the validity of the entry.
				 * If it's bad, then throw away the cache and
				 * continue.
				 */
				if ((nep->d_reclen == 0) ||
				    (nep->d_reclen & 0x3) ||
				    (dnlc_dir_update(dcap, nep->d_name,
				    INO_OFF_TO_H(nep->d_ino,
				    slot.offset - slot.size)) == DNOENT)) {
					dnlc_dir_purge(dcap);
					slot.cached = 0;
				}
			}
		} else {
			/* First entry in its block: record it as free space. */
			(void) dnlc_dir_add_space(dcap, ep->d_reclen,
			(uint64_t)slot.offset);
		}
	} else {
		/*
		 * If the entry isn't the first in the directory, we must
		 * reclaim the space of the now empty record by adding
		 * the record size to the size of the previous entry.
		 */
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 */
			pep = (struct direct *)((char *)ep - slot.size);
			pep->d_reclen += ep->d_reclen;
		}
	}
nocache:


	/*
	 * Write the modified directory block.  slot.fbp is cleared
	 * afterwards so the exit path does not release it again.
	 */
	err = TRANS_DIR(dp, slot.offset);
	if (err)
		fbrelse(slot.fbp, S_OTHER);
	else
		err = ufs_fbwrite(slot.fbp, dp);
	slot.fbp = NULL;

	/*
	 * If we were removing a directory, it is 'gone' now, but we cannot
	 * unlock it as a thread may be waiting for the lock in ufs_create. If
	 * we did, it could then create a file in a deleted directory.
	 */

	if (err) {
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	rw_enter(&ip->i_contents, RW_WRITER);

	dp->i_flag |= IUPD|ICHG;
	dp->i_seq++;
	ip->i_flag |= ICHG;
	ip->i_seq++;

	TRANS_INODE(dp->i_ufsvfs, dp);
	TRANS_INODE(ip->i_ufsvfs, ip);
	/*
	 * Now dispose of the inode.
	 */
	if (ip->i_nlink > 0) {
		/*
		 * This is not done for IFATTRDIR's because they don't
		 * have entries in the dnlc and the link counts are
		 * not incremented when they are created.
		 */
		if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) {
			/*
			 * Decrement by 2 because we're trashing the "."
			 * entry as well as removing the entry in dp.
			 * Clear the directory entry, but there may be
			 * other hard links so don't free the inode.
			 * Decrement the dp linkcount because we're
			 * trashing the ".." entry.
			 */
			ip->i_nlink -= 2;
			dp->i_nlink--;
			ufs_setreclaim(dp);
			/*
			 * XXX need to discard negative cache entries
			 * for vp.  See comment in ufs_delete().
			 */
			dnlc_remove(vp, ".");
			dnlc_remove(vp, "..");
			/*
			 * The return value is ignored here because if
			 * the directory purge fails we don't want to
			 * stop the delete. If ufs_dirpurgedotdot fails
			 * the delete will continue with the preexisting
			 * behavior.
			 */
			(void) ufs_dirpurgedotdot(ip, dp->i_number, cr);
		} else {
			ip->i_nlink--;
		}
		ufs_setreclaim(ip);
	}
	ITIMES_NOLOCK(dp);
	ITIMES_NOLOCK(ip);

	/* Push both inodes synchronously unless a transaction is active. */
	if (!TRANS_ISTRANS(dp->i_ufsvfs))
		ufs_iupdat(dp, I_SYNC);
	if (!TRANS_ISTRANS(ip->i_ufsvfs))
		ufs_iupdat(ip, I_SYNC);

	rw_exit(&ip->i_contents);
	if (mode == IFDIR || mode == IFATTRDIR)
		rw_exit(&ip->i_rwlock);
	/*
	 * out: reached only after mode has been set; drops the vn_vfslock
	 * taken for directories.  out_novfs: common tail for every exit once
	 * dp->i_contents and vfs_dqrwlock are held.
	 */
out:
	if (mode == IFDIR || mode == IFATTRDIR) {
		vn_vfsunlock(vp);
	}
out_novfs:
	ASSERT(RW_WRITE_HELD(&dp->i_contents));

	if (slot.fbp)
		fbrelse(slot.fbp, S_OTHER);

	rw_exit(&dp->i_contents);
	rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);

	/*
	 * If no error and vpp is non-NULL, return the vnode ptr to the caller.
	 * The caller becomes responsible for the VN_RELE().  Otherwise,
	 * Release (and delete) the inode after we drop vfs_dqrwlock to
	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
	 */
	if (ip) {
		if ((err == 0) && (vpp != NULL)) {
			*vpp = ITOV(ip);
		} else {
			VN_RELE(vp);
		}
	}

	return (err);
}
2758 
2759 /*
2760  * Return buffer with contents of block "offset"
2761  * from the beginning of directory "ip".  If "res"
2762  * is non-zero, fill it in with a pointer to the
2763  * remaining space in the directory.
2764  *
2765  */
2766 
2767 int
2768 blkatoff(
2769 	struct inode *ip,
2770 	off_t offset,
2771 	char **res,
2772 	struct fbuf **fbpp)
2773 {
2774 	struct fs *fs;
2775 	struct fbuf *fbp;
2776 	daddr_t lbn;
2777 	uint_t bsize;
2778 	int err;
2779 
2780 	CPU_STATS_ADD_K(sys, ufsdirblk, 1);
2781 	fs = ip->i_fs;
2782 	lbn = (daddr_t)lblkno(fs, offset);
2783 	bsize = (uint_t)blksize(fs, ip, lbn);
2784 	err = fbread(ITOV(ip), (offset_t)(offset & fs->fs_bmask),
2785 			bsize, S_READ, &fbp);
2786 	if (err) {
2787 		*fbpp = (struct fbuf *)NULL;
2788 		return (err);
2789 	}
2790 	if (res)
2791 		*res = fbp->fb_addr + blkoff(fs, offset);
2792 	*fbpp = fbp;
2793 	return (0);
2794 }
2795 
2796 /*
2797  * Do consistency checking:
2798  *	record length must be multiple of 4
2799  *	entry must fit in rest of its DIRBLKSIZ block
2800  *	record must be large enough to contain entry
2801  *	name is not longer than MAXNAMLEN
2802  *	name must be as long as advertised, and null terminated
2803  * NOTE: record length must not be zero (should be checked previously).
2804  *       This routine is only called if dirchk is true.
2805  *       It would be nice to set the FSBAD flag in the super-block when
2806  *       this routine fails so that a fsck is forced on next reboot,
2807  *       but locking is a problem.
2808  */
2809 static int
2810 dirmangled(
2811 	struct inode *dp,
2812 	struct direct *ep,
2813 	int entryoffsetinblock,
2814 	off_t offset)
2815 {
2816 	int i;
2817 
2818 	i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
2819 	if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i ||
2820 	    (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN ||
2821 	    ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) {
2822 		dirbad(dp, "mangled entry", offset);
2823 		return (1);
2824 	}
2825 	return (0);
2826 }
2827 
2828 static void
2829 dirbad(struct inode *ip, char *how, off_t offset)
2830 {
2831 	cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s",
2832 	    ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how);
2833 }
2834 
/*
 * Check that the name at sp is exactly l characters long: none of the
 * first l characters may be a null, and the character following them
 * must be the terminating null.
 *
 * Returns 0 if the name is well formed and 1 otherwise.  (Previously a
 * missing terminator returned the raw character value, which could be
 * negative where char is signed; callers only test for non-zero.)
 */
static int
dirbadname(char *sp, int l)
{
	while (l--) {			/* check for nulls */
		if (*sp++ == '\0') {
			return (1);
		}
	}
	return (*sp != '\0');		/* check for terminating null */
}
2845 
2846 /*
2847  * Check if a directory is empty or not.
2848  */
2849 static int
2850 ufs_dirempty(
2851 	struct inode *ip,
2852 	ino_t parentino,
2853 	struct cred *cr)
2854 {
2855 	return (ufs_dirscan(ip, parentino, cr, 0));
2856 }
2857 
2858 /*
2859  * clear the .. directory entry.
2860  */
2861 static int
2862 ufs_dirpurgedotdot(
2863 	struct inode *ip,
2864 	ino_t parentino,
2865 	struct cred *cr)
2866 {
2867 	return (ufs_dirscan(ip, parentino, cr, 1));
2868 }
2869 
2870 /*
2871  * Scan the directoy. If clr_dotdot is true clear the ..
2872  * directory else check to see if the directory is empty.
2873  *
2874  * Using a struct dirtemplate here is not precisely
2875  * what we want, but better than using a struct direct.
2876  *
2877  * clr_dotdot is used as a flag to tell us if we need
2878  * to clear the dotdot entry
2879  *
2880  * N.B.: does not handle corrupted directories.
2881  */
2882 static int
2883 ufs_dirscan(
2884 	struct inode *ip,
2885 	ino_t parentino,
2886 	struct cred *cr,
2887 	int clr_dotdot)
2888 {
2889 	offset_t off;
2890 	struct dirtemplate dbuf;
2891 	struct direct *dp = (struct direct *)&dbuf;
2892 	int err, count;
2893 	int empty = 1;	/* Assume it's empty */
2894 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
2895 
2896 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
2897 
2898 	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
2899 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
2900 		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
2901 		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
2902 		/*
2903 		 * Since we read MINDIRSIZ, residual must
2904 		 * be 0 unless we're at end of file.
2905 		 */
2906 		if (err || count != 0 || dp->d_reclen == 0) {
2907 			empty = 0;
2908 			break;
2909 		}
2910 		/* skip empty entries */
2911 		if (dp->d_ino == 0)
2912 			continue;
2913 		/* accept only "." and ".." */
2914 		if (dp->d_namlen > 2 || dp->d_name[0] != '.') {
2915 			empty = 0;
2916 			break;
2917 		}
2918 		/*
2919 		 * At this point d_namlen must be 1 or 2.
2920 		 * 1 implies ".", 2 implies ".." if second
2921 		 * char is also "."
2922 		 */
2923 		if (dp->d_namlen == 1)
2924 			continue;
2925 		if (dp->d_name[1] == '.' &&
2926 		    (ino_t)dp->d_ino == parentino) {
2927 			/*
2928 			 * If we're doing a purge we need to check for
2929 			 * the . and .. entries and clear the d_ino for ..
2930 			 *
2931 			 * if clr_dotdot is set ufs_dirscan does not
2932 			 * check for an empty directory.
2933 			 */
2934 			if (clr_dotdot) {
2935 				/*
2936 				 * Have to actually zap the ..
2937 				 * entry in the directory, as
2938 				 * otherwise someone might have
2939 				 * dp as its cwd and try to
2940 				 * open .., which now points to
2941 				 * an unallocated inode.
2942 				 */
2943 				empty = ufs_dirclrdotdot(ip, parentino);
2944 				break;
2945 			} else {
2946 				continue;
2947 			}
2948 		}
2949 		empty = 0;
2950 		break;
2951 	}
2952 	return (empty);
2953 }
2954 
clock_t retry_backoff_delay = 1; /* delay before retrying the i_rwlock */
/* Number of times ufs_dircheckpath() backed off and retried an i_rwlock. */
uint64_t dircheck_retry_cnt;
/*
 * Check if source directory inode is in the path of the target directory.
 * Target is supplied locked.
 *
 * The source and target inode's should be different upon entry.
 *
 * Parameters:
 *	source_ino	inode number of the directory being moved.
 *	target		directory the source would be moved into; its
 *			i_rwlock is held on entry and is not released here.
 *	sdp		parent directory of the source; its i_rwlock is
 *			held on entry.
 *	cr		credentials passed to ufs_iget_alloced().
 *
 * Returns:
 *	0	source_ino was not found on the ".." chain from target.
 *	EINVAL	source_ino is target or one of its ancestors.
 *	ENOTDIR	an inode on the chain is not a sane directory.
 *	EAGAIN	an ancestor's i_rwlock could not be taken safely; see the
 *		comments at retry_lock.
 *	other	error from blkatoff() or ufs_iget_alloced().
 */
int
ufs_dircheckpath(
	ino_t source_ino,
	struct inode *target,
	struct inode *sdp,
	struct cred *cr)
{
	struct fbuf *fbp;
	struct dirtemplate *dirp;
	struct inode *ip;
	struct ufsvfs *ufsvfsp;
	struct inode *tip;
	ino_t dotdotino;
	int err;

	ASSERT(target->i_ufsvfs != NULL);
	ASSERT(RW_LOCK_HELD(&target->i_rwlock));
	ASSERT(RW_LOCK_HELD(&sdp->i_rwlock));

	ip = target;
	if (ip->i_number == source_ino) {
		err = EINVAL;
		goto out;
	}
	if (ip->i_number == UFSROOTINO) {
		err = 0;
		goto out;
	}
	/*
	 * Search back through the directory tree, using the ".." entries.
	 * Fail any attempt to move a directory into an ancestor directory.
	 */
	fbp = NULL;
	for (;;) {
		struct vfs	*vfs;

		/* Map the first block of ip, which holds "." and "..". */
		err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp);
		if (err)
			break;
		if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 ||
		    ip->i_size < sizeof (struct dirtemplate)) {
			dirbad(ip, "bad size, unlinked or not dir", (off_t)0);
			err = ENOTDIR;
			break;
		}
		if (dirp->dotdot_namlen != 2 ||
		    dirp->dotdot_name[0] != '.' ||
		    dirp->dotdot_name[1] != '.') {
			dirbad(ip, "mangled .. entry", (off_t)0);
			err = ENOTDIR;		/* Sanity check */
			break;
		}
		dotdotino = (ino_t)dirp->dotdot_ino;
		if (dotdotino == source_ino) {
			err = EINVAL;
			break;
		}
		/* Reached the root without meeting source_ino; err is 0. */
		if (dotdotino == UFSROOTINO)
			break;
		if (fbp) {
			fbrelse(fbp, S_OTHER);
			fbp = NULL;
		}
		/* Save what we need from ip before letting go of it. */
		vfs = ip->i_vfs;
		ufsvfsp = ip->i_ufsvfs;

		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
		/*
		 * Race to get the inode.
		 */
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
		if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) {
			rw_exit(&ufsvfsp->vfs_dqrwlock);
			ip = NULL;
			break;
		}
		rw_exit(&ufsvfsp->vfs_dqrwlock);
		/*
		 * If the directory of the source inode (also a directory)
		 * is the same as this next entry up the chain, then
		 * we know the source directory itself can't be in the
		 * chain. This also prevents a panic because we already
		 * have sdp->i_rwlock locked.
		 */
		if (tip == sdp) {
			VN_RELE(ITOV(tip));
			ip = NULL;
			break;
		}
		ip = tip;

		/*
		 * If someone has set the WRITE_WANTED bit in this lock and if
		 * this happens to be a sdp or tdp of another parallel rename
		 * which is executing the same code and in similar situation
		 * we end up in a 4 way deadlock. We need to make sure that
		 * the WRITE_WANTED bit is not set.
		 */
retry_lock:
		if (!rw_tryenter(&ip->i_rwlock, RW_READER)) {
			/*
			 * If the lock is held as WRITER that's fine but if it
			 * has WRITE_WANTED bit set we might end up in a
			 * deadlock. If WRITE_WANTED is set we return
			 * with EAGAIN else we just go back and try.
			 */
			if (RW_ISWRITER(&ip->i_rwlock) &&
					!(RW_WRITE_HELD(&ip->i_rwlock))) {
				/*
				 * Direct return: ip's i_rwlock was never
				 * acquired, so only the hold is dropped.
				 */
				err = EAGAIN;
				if (fbp) {
					fbrelse(fbp, S_OTHER);
				}
				VN_RELE(ITOV(ip));
				return (err);
			} else {
				/*
				 * The lock is being write held. We could
				 * just do a rw_enter here but there is a
				 * window between the check and now, where
				 * the status could have changed, so to
				 * avoid looping we backoff and go back to
				 * try for the lock.
				 */
				delay(retry_backoff_delay);
				dircheck_retry_cnt++;
				goto retry_lock;
			}
		}
	}
	if (fbp) {
		fbrelse(fbp, S_OTHER);
	}
out:
	/*
	 * Release the lock and hold on whatever ancestor we stopped at;
	 * target itself stays locked for the caller.
	 */
	if (ip) {
		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
	}
	return (err);
}
3107 
3108 int
3109 ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr)
3110 {
3111 	offset_t off;
3112 	struct dirtemplate dbuf;
3113 	struct direct *dp = (struct direct *)&dbuf;
3114 	int err, count;
3115 	int empty = 1;	/* Assume it's empty */
3116 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
3117 
3118 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
3119 
3120 	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
3121 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
3122 		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
3123 		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
3124 		/*
3125 		 * Since we read MINDIRSIZ, residual must
3126 		 * be 0 unless we're at end of file.
3127 		 */
3128 
3129 		if (err || count != 0 || dp->d_reclen == 0) {
3130 			empty = 0;
3131 			break;
3132 		}
3133 		/* skip empty entries */
3134 		if (dp->d_ino == 0)
3135 			continue;
3136 		/*
3137 		 * At this point d_namlen must be 1 or 2.
3138 		 * 1 implies ".", 2 implies ".." if second
3139 		 * char is also "."
3140 		 */
3141 
3142 		if (dp->d_namlen == 1 && dp->d_name[0] == '.' &&
3143 				(ino_t)dp->d_ino == parentino)
3144 			continue;
3145 
3146 		if (dp->d_namlen == 2 && dp->d_name[0] == '.' &&
3147 			dp->d_name[1] == '.') {
3148 			continue;
3149 		}
3150 		empty = 0;
3151 		break;
3152 	}
3153 	return (empty);
3154 }
3155 
3156 
3157 /*
3158  * Allocate and initialize a new shadow inode to contain extended attributes.
3159  */
3160 int
3161 ufs_xattrmkdir(
3162 	struct inode *tdp,
3163 	struct inode **ipp,
3164 	int flags,
3165 	struct cred *cr)
3166 {
3167 	struct inode *ip;
3168 	struct vattr va;
3169 	int err;
3170 	int retry = 1;
3171 	struct ufsvfs *ufsvfsp;
3172 	struct ulockfs *ulp;
3173 	int issync;
3174 	int trans_size;
3175 	int dorwlock;		/* 0 = not yet taken, */
3176 				/* 1 = taken outside the transaction, */
3177 				/* 2 = taken inside the transaction */
3178 
3179 	/*
3180 	 * Validate permission to create attribute directory
3181 	 */
3182 
3183 	if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0) {
3184 		return (err);
3185 	}
3186 
3187 	if (vn_is_readonly(ITOV(tdp)))
3188 		return (EROFS);
3189 
3190 	/*
3191 	 * No need to re-init err after again:, since it's set before
3192 	 * the next use of it.
3193 	 */
3194 again:
3195 	dorwlock = 0;
3196 	va.va_type = VDIR;
3197 	va.va_uid = tdp->i_uid;
3198 	va.va_gid = tdp->i_gid;
3199 
3200 	if ((tdp->i_mode & IFMT) == IFDIR) {
3201 		va.va_mode = (o_mode_t)IFATTRDIR;
3202 		va.va_mode |= tdp->i_mode & 0777;
3203 	} else {
3204 		va.va_mode = (o_mode_t)IFATTRDIR|0700;
3205 		if (tdp->i_mode & 0040)
3206 			va.va_mode |= 0750;
3207 		if (tdp->i_mode & 0004)
3208 			va.va_mode |= 0705;
3209 	}
3210 	va.va_mask = AT_TYPE|AT_MODE;
3211 
3212 	ufsvfsp = tdp->i_ufsvfs;
3213 
3214 	err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
3215 	if (err)
3216 		return (err);
3217 
3218 	/*
3219 	 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
3220 	 * This follows the protocol for read()/write().
3221 	 */
3222 	if (ITOV(tdp)->v_type != VDIR) {
3223 		rw_enter(&tdp->i_rwlock, RW_WRITER);
3224 		dorwlock = 1;
3225 	}
3226 
3227 	if (ulp) {
3228 		trans_size = (int)TOP_MKDIR_SIZE(tdp);
3229 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, trans_size);
3230 	}
3231 
3232 	/*
3233 	 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
3234 	 * This follows the protocol established by
3235 	 * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
3236 	 */
3237 	if (dorwlock == 0) {
3238 		rw_enter(&tdp->i_rwlock, RW_WRITER);
3239 		dorwlock = 2;
3240 	}
3241 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
3242 	rw_enter(&tdp->i_contents, RW_WRITER);
3243 
3244 	/*
3245 	 * Suppress out of inodes messages if we will retry.
3246 	 */
3247 	if (retry)
3248 		tdp->i_flag |= IQUIET;
3249 	err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr);
3250 	tdp->i_flag &= ~IQUIET;
3251 
3252 	if (err)
3253 		goto fail;
3254 
3255 	if (flags) {
3256 
3257 		/*
3258 		 * Now attach it to src file.
3259 		 */
3260 
3261 		tdp->i_oeftflag = ip->i_number;
3262 	}
3263 
3264 	ip->i_cflags |= IXATTR;
3265 	ITOV(ip)->v_flag |= V_XATTRDIR;
3266 	TRANS_INODE(ufsvfsp, tdp);
3267 	tdp->i_flag |= ICHG | IUPD;
3268 	tdp->i_seq++;
3269 	ufs_iupdat(tdp, I_SYNC);
3270 	rw_exit(&tdp->i_contents);
3271 	rw_exit(&ufsvfsp->vfs_dqrwlock);
3272 
3273 	rw_enter(&ip->i_rwlock, RW_WRITER);
3274 	rw_enter(&ip->i_contents, RW_WRITER);
3275 	TRANS_INODE(ufsvfsp, ip);
3276 	ip->i_flag |= ICHG| IUPD;
3277 	ip->i_seq++;
3278 	ufs_iupdat(ip, I_SYNC);
3279 	rw_exit(&ip->i_contents);
3280 	rw_exit(&ip->i_rwlock);
3281 	if (dorwlock == 2)
3282 		rw_exit(&tdp->i_rwlock);
3283 	if (ulp) {
3284 		int terr = 0;
3285 
3286 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
3287 		ufs_lockfs_end(ulp);
3288 		if (err == 0)
3289 			err = terr;
3290 	}
3291 	if (dorwlock == 1)
3292 		rw_exit(&tdp->i_rwlock);
3293 	*ipp = ip;
3294 	return (err);
3295 
3296 fail:
3297 	rw_exit(&tdp->i_contents);
3298 	rw_exit(&ufsvfsp->vfs_dqrwlock);
3299 	if (dorwlock == 2)
3300 		rw_exit(&tdp->i_rwlock);
3301 	if (ulp) {
3302 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
3303 		ufs_lockfs_end(ulp);
3304 	}
3305 	if (dorwlock == 1)
3306 		rw_exit(&tdp->i_rwlock);
3307 	if (ip != NULL)
3308 		VN_RELE(ITOV(ip));
3309 
3310 	/*
3311 	 * No inodes?  See if any are tied up in pending deletions.
3312 	 * This has to be done outside of any of the above, because
3313 	 * the draining operation can't be done from inside a transaction.
3314 	 */
3315 	if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3316 		ufs_delete_drain_wait(ufsvfsp, 1);
3317 		retry = 0;
3318 		goto again;
3319 	}
3320 
3321 	return (err);
3322 }
3323 
3324 /*
3325  * clear the dotdot directory entry.
3326  * Used by ufs_dirscan when clr_dotdot
3327  * flag is set and we're deleting a
3328  * directory.
3329  */
3330 static int
3331 ufs_dirclrdotdot(struct inode *ip, ino_t parentino)
3332 {
3333 	struct fbuf *fbp;
3334 	struct direct *dotp, *dotdotp;
3335 	int err = 0;
3336 
3337 	ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
3338 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
3339 	err = blkatoff(ip, 0, NULL, &fbp);
3340 	if (err) {
3341 		return (err);
3342 	}
3343 
3344 	dotp = (struct direct *)fbp->fb_addr;
3345 	if ((dotp->d_namlen < (MAXNAMLEN + 1)) &&
3346 	    ((DIRBLKSIZ - DIRSIZ(dotp)) >= (sizeof (struct dirtemplate) / 2))) {
3347 		dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen);
3348 		if ((dotdotp->d_namlen < (MAXNAMLEN + 1)) &&
3349 		    ((DIRBLKSIZ - DIRSIZ(dotp)) >= dotdotp->d_reclen)) {
3350 
3351 			dotp->d_reclen += dotdotp->d_reclen;
3352 			if (parentino == dotdotp->d_ino) {
3353 				dotdotp->d_ino = 0;
3354 				dotdotp->d_namlen = 0;
3355 				dotdotp->d_reclen = 0;
3356 			}
3357 
3358 			err = TRANS_DIR(ip, 0);
3359 			if (err) {
3360 				fbrelse(fbp, S_OTHER);
3361 			} else {
3362 				err = ufs_fbwrite(fbp, ip);
3363 			}
3364 		}
3365 	} else {
3366 		err = -1;
3367 	}
3368 	return (err);
3369 }
3370