1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/types.h>
29#include <sys/param.h>
30#include <sys/sysmacros.h>
31#include <sys/systm.h>
32#include <sys/time.h>
33#include <sys/vfs.h>
34#include <sys/vnode.h>
35#include <sys/errno.h>
36#include <sys/cmn_err.h>
37#include <sys/cred.h>
38#include <sys/stat.h>
39#include <sys/debug.h>
40#include <sys/policy.h>
41#include <sys/fs/tmpnode.h>
42#include <sys/fs/tmp.h>
43#include <sys/vtrace.h>
44
/*
 * Forward declarations for the file-local helpers defined below.
 */
static int tdircheckpath(struct tmpnode *fromtp, struct tmpnode *toparent,
	struct cred *cred);
static int tdirrename(struct tmpnode *fromparent, struct tmpnode *fromtp,
	struct tmpnode *toparent, char *nm, struct tmpnode *to,
	struct tdirent *where, struct cred *cred);
static void tdirfixdotdot(struct tmpnode *fromtp, struct tmpnode *fromparent,
	struct tmpnode *toparent);
static int tdirmaketnode(struct tmpnode *dir, struct tmount *tm,
	struct vattr *va, enum de_op op, struct tmpnode **newnode,
	struct cred *cred);
static int tdiraddentry(struct tmpnode *dir, struct tmpnode *tp, char *name,
	enum de_op op, struct tmpnode *fromtp);
53
54
55#define	T_HASH_SIZE	8192		/* must be power of 2 */
56#define	T_MUTEX_SIZE	64
57
58static struct tdirent	*t_hashtable[T_HASH_SIZE];
59static kmutex_t		 t_hashmutex[T_MUTEX_SIZE];
60
61#define	T_HASH_INDEX(a)		((a) & (T_HASH_SIZE-1))
62#define	T_MUTEX_INDEX(a)	((a) & (T_MUTEX_SIZE-1))
63
/*
 * Compute the hash of directory entry 'name' within parent 'tp' and
 * store it in 'hash' (an lvalue of type uint_t).
 *
 * The hash is seeded with the parent pointer, truncated to uint_t and
 * shifted right by 8, and then folded with each byte of the name as
 * hash = hash * 17 + byte.  'name' must be NUL-terminated.
 *
 * The body is wrapped in do { } while (0) so that the macro followed by
 * a semicolon is exactly one statement and can be used safely in an
 * unbraced if/else.
 */
#define	TMPFS_HASH(tp, name, hash)					\
	do {								\
		char Xc, *Xcp;						\
		(hash) = (uint_t)(uintptr_t)(tp) >> 8;			\
		for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++)		\
			(hash) = ((hash) << 4) + (hash) + (uint_t)Xc;	\
	} while (0)
71
72void
73tmpfs_hash_init(void)
74{
75	int	ix;
76
77	for (ix = 0; ix < T_MUTEX_SIZE; ix++)
78		mutex_init(&t_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL);
79}
80
81/*
82 * This routine is where the rubber meets the road for identities.
83 */
84static void
85tmpfs_hash_in(struct tdirent *t)
86{
87	uint_t		hash;
88	struct tdirent	**prevpp;
89	kmutex_t	*t_hmtx;
90
91	TMPFS_HASH(t->td_parent, t->td_name, hash);
92	t->td_hash = hash;
93	prevpp = &t_hashtable[T_HASH_INDEX(hash)];
94	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
95	mutex_enter(t_hmtx);
96	t->td_link = *prevpp;
97	*prevpp = t;
98	mutex_exit(t_hmtx);
99}
100
101/*
102 * Remove tdirent *t from the hash list.
103 */
104static void
105tmpfs_hash_out(struct tdirent *t)
106{
107	uint_t		hash;
108	struct tdirent	**prevpp;
109	kmutex_t	*t_hmtx;
110
111	hash = t->td_hash;
112	prevpp = &t_hashtable[T_HASH_INDEX(hash)];
113	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
114	mutex_enter(t_hmtx);
115	while (*prevpp != t)
116		prevpp = &(*prevpp)->td_link;
117	*prevpp = t->td_link;
118	mutex_exit(t_hmtx);
119}
120
121/*
122 * Currently called by tdirrename() only.
123 * rename operation needs to be done with lock held, to ensure that
124 * no other operations can access the tmpnode at the same instance.
125 */
126static void
127tmpfs_hash_change(struct tdirent *tdp, struct tmpnode *fromtp)
128{
129	uint_t		hash;
130	kmutex_t	*t_hmtx;
131
132	hash = tdp->td_hash;
133	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
134	mutex_enter(t_hmtx);
135	tdp->td_tmpnode = fromtp;
136	mutex_exit(t_hmtx);
137}
138
139static struct tdirent *
140tmpfs_hash_lookup(char *name, struct tmpnode *parent, uint_t hold,
141	struct tmpnode **found)
142{
143	struct tdirent	*l;
144	uint_t		hash;
145	kmutex_t	*t_hmtx;
146	struct tmpnode	*tnp;
147
148	TMPFS_HASH(parent, name, hash);
149	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
150	mutex_enter(t_hmtx);
151	l = t_hashtable[T_HASH_INDEX(hash)];
152	while (l) {
153		if ((l->td_hash == hash) &&
154		    (l->td_parent == parent) &&
155		    (strcmp(l->td_name, name) == 0)) {
156			/*
157			 * We need to make sure that the tmpnode that
158			 * we put a hold on is the same one that we pass back.
159			 * Hence, temporary variable tnp is necessary.
160			 */
161			tnp = l->td_tmpnode;
162			if (hold) {
163				ASSERT(tnp);
164				tmpnode_hold(tnp);
165			}
166			if (found)
167				*found = tnp;
168			mutex_exit(t_hmtx);
169			return (l);
170		} else {
171			l = l->td_link;
172		}
173	}
174	mutex_exit(t_hmtx);
175	return (NULL);
176}
177
178/*
179 * Search directory 'parent' for entry 'name'.
180 *
181 * The calling thread can't hold the write version
182 * of the rwlock for the directory being searched
183 *
184 * 0 is returned on success and *foundtp points
185 * to the found tmpnode with its vnode held.
186 */
187int
188tdirlookup(
189	struct tmpnode *parent,
190	char *name,
191	struct tmpnode **foundtp,
192	struct cred *cred)
193{
194	int error;
195
196	*foundtp = NULL;
197	if (parent->tn_type != VDIR)
198		return (ENOTDIR);
199
200	if ((error = tmp_taccess(parent, VEXEC, cred)))
201		return (error);
202
203	if (*name == '\0') {
204		tmpnode_hold(parent);
205		*foundtp = parent;
206		return (0);
207	}
208
209	/*
210	 * Search the directory for the matching name
211	 * We need the lock protecting the tn_dir list
212	 * so that it doesn't change out from underneath us.
213	 * tmpfs_hash_lookup() will pass back the tmpnode
214	 * with a hold on it.
215	 */
216
217	if (tmpfs_hash_lookup(name, parent, 1, foundtp) != NULL) {
218		ASSERT(*foundtp);
219		return (0);
220	}
221
222	return (ENOENT);
223}
224
225/*
226 * Enter a directory entry for 'name' and 'tp' into directory 'dir'
227 *
228 * Returns 0 on success.
229 */
230int
231tdirenter(
232	struct tmount	*tm,
233	struct tmpnode	*dir,		/* target directory to make entry in */
234	char		*name,		/* name of entry */
235	enum de_op	op,		/* entry operation */
236	struct tmpnode	*fromparent,	/* source directory if rename */
237	struct tmpnode	*tp,		/* source tmpnode, if link/rename */
238	struct vattr	*va,
239	struct tmpnode	**tpp,		/* return tmpnode, if create/mkdir */
240	struct cred	*cred,
241	caller_context_t *ctp)
242{
243	struct tdirent *tdp;
244	struct tmpnode *found = NULL;
245	int error = 0;
246	char *s;
247
248	/*
249	 * tn_rwlock is held to serialize direnter and dirdeletes
250	 */
251	ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
252	ASSERT(dir->tn_type == VDIR);
253
254	/*
255	 * Don't allow '/' characters in pathname component
256	 * (thus in ufs_direnter()).
257	 */
258	for (s = name; *s; s++)
259		if (*s == '/')
260			return (EACCES);
261
262	if (name[0] == '\0')
263		panic("tdirenter: NULL name");
264
265	/*
266	 * For link and rename lock the source entry and check the link count
267	 * to see if it has been removed while it was unlocked.
268	 */
269	if (op == DE_LINK || op == DE_RENAME) {
270		if (tp != dir)
271			rw_enter(&tp->tn_rwlock, RW_WRITER);
272		mutex_enter(&tp->tn_tlock);
273		if (tp->tn_nlink == 0) {
274			mutex_exit(&tp->tn_tlock);
275			if (tp != dir)
276				rw_exit(&tp->tn_rwlock);
277			return (ENOENT);
278		}
279
280		if (tp->tn_nlink == MAXLINK) {
281			mutex_exit(&tp->tn_tlock);
282			if (tp != dir)
283				rw_exit(&tp->tn_rwlock);
284			return (EMLINK);
285		}
286		tp->tn_nlink++;
287		gethrestime(&tp->tn_ctime);
288		mutex_exit(&tp->tn_tlock);
289		if (tp != dir)
290			rw_exit(&tp->tn_rwlock);
291	}
292
293	/*
294	 * This might be a "dangling detached directory".
295	 * it could have been removed, but a reference
296	 * to it kept in u_cwd.  don't bother searching
297	 * it, and with any luck the user will get tired
298	 * of dealing with us and cd to some absolute
299	 * pathway.  *sigh*, thus in ufs, too.
300	 */
301	if (dir->tn_nlink == 0) {
302		error = ENOENT;
303		goto out;
304	}
305
306	/*
307	 * If this is a rename of a directory and the parent is
308	 * different (".." must be changed), then the source
309	 * directory must not be in the directory hierarchy
310	 * above the target, as this would orphan everything
311	 * below the source directory.
312	 */
313	if (op == DE_RENAME) {
314		if (tp == dir) {
315			error = EINVAL;
316			goto out;
317		}
318		if (tp->tn_type == VDIR) {
319			if ((fromparent != dir) &&
320			    (error = tdircheckpath(tp, dir, cred))) {
321				goto out;
322			}
323		}
324	}
325
326	/*
327	 * Search for the entry.  Return "found" if it exists.
328	 */
329	tdp = tmpfs_hash_lookup(name, dir, 1, &found);
330
331	if (tdp) {
332		ASSERT(found);
333		switch (op) {
334		case DE_CREATE:
335		case DE_MKDIR:
336			if (tpp) {
337				*tpp = found;
338				error = EEXIST;
339			} else {
340				tmpnode_rele(found);
341			}
342			break;
343
344		case DE_RENAME:
345			error = tdirrename(fromparent, tp,
346			    dir, name, found, tdp, cred);
347			if (error == 0) {
348				if (found != NULL) {
349					vnevent_rename_dest(TNTOV(found),
350					    TNTOV(dir), name, ctp);
351				}
352			}
353
354			tmpnode_rele(found);
355			break;
356
357		case DE_LINK:
358			/*
359			 * Can't link to an existing file.
360			 */
361			error = EEXIST;
362			tmpnode_rele(found);
363			break;
364		}
365	} else {
366
367		/*
368		 * The entry does not exist. Check write permission in
369		 * directory to see if entry can be created.
370		 */
371		if (error = tmp_taccess(dir, VWRITE, cred))
372			goto out;
373		if (op == DE_CREATE || op == DE_MKDIR) {
374			/*
375			 * Make new tmpnode and directory entry as required.
376			 */
377			error = tdirmaketnode(dir, tm, va, op, &tp, cred);
378			if (error)
379				goto out;
380		}
381		if (error = tdiraddentry(dir, tp, name, op, fromparent)) {
382			if (op == DE_CREATE || op == DE_MKDIR) {
383				/*
384				 * Unmake the inode we just made.
385				 */
386				rw_enter(&tp->tn_rwlock, RW_WRITER);
387				if ((tp->tn_type) == VDIR) {
388					ASSERT(tdp == NULL);
389					/*
390					 * cleanup allocs made by tdirinit()
391					 */
392					tdirtrunc(tp);
393				}
394				mutex_enter(&tp->tn_tlock);
395				tp->tn_nlink = 0;
396				mutex_exit(&tp->tn_tlock);
397				gethrestime(&tp->tn_ctime);
398				rw_exit(&tp->tn_rwlock);
399				tmpnode_rele(tp);
400				tp = NULL;
401			}
402		} else if (tpp) {
403			*tpp = tp;
404		} else if (op == DE_CREATE || op == DE_MKDIR) {
405			tmpnode_rele(tp);
406		}
407	}
408
409out:
410	if (error && (op == DE_LINK || op == DE_RENAME)) {
411		/*
412		 * Undo bumped link count.
413		 */
414		DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
415		gethrestime(&tp->tn_ctime);
416	}
417	return (error);
418}
419
420/*
421 * Delete entry tp of name "nm" from dir.
422 * Free dir entry space and decrement link count on tmpnode(s).
423 *
424 * Return 0 on success.
425 */
426int
427tdirdelete(
428	struct tmpnode *dir,
429	struct tmpnode *tp,
430	char *nm,
431	enum dr_op op,
432	struct cred *cred)
433{
434	struct tdirent *tpdp;
435	int error;
436	size_t namelen;
437	struct tmpnode *tnp;
438	timestruc_t now;
439
440	ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
441	ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));
442	ASSERT(dir->tn_type == VDIR);
443
444	if (nm[0] == '\0')
445		panic("tdirdelete: NULL name for %p", (void *)tp);
446
447	/*
448	 * return error when removing . and ..
449	 */
450	if (nm[0] == '.') {
451		if (nm[1] == '\0')
452			return (EINVAL);
453		if (nm[1] == '.' && nm[2] == '\0')
454			return (EEXIST); /* thus in ufs */
455	}
456
457	if (error = tmp_taccess(dir, VEXEC|VWRITE, cred))
458		return (error);
459
460	/*
461	 * If the parent directory is "sticky", then the user must
462	 * own the parent directory or the file in it, or else must
463	 * have permission to write the file.  Otherwise it may not
464	 * be deleted (except by privileged users).
465	 * Same as ufs_dirremove.
466	 */
467	if ((error = tmp_sticky_remove_access(dir, tp, cred)) != 0)
468		return (error);
469
470	if (dir->tn_dir == NULL)
471		return (ENOENT);
472
473	tpdp = tmpfs_hash_lookup(nm, dir, 0, &tnp);
474	if (tpdp == NULL) {
475		/*
476		 * If it is gone, some other thread got here first!
477		 * Return error ENOENT.
478		 */
479		return (ENOENT);
480	}
481
482	/*
483	 * If the tmpnode in the tdirent changed, we were probably
484	 * the victim of a concurrent rename operation.  The original
485	 * is gone, so return that status (same as UFS).
486	 */
487	if (tp != tnp)
488		return (ENOENT);
489
490	tmpfs_hash_out(tpdp);
491
492	/*
493	 * Take tpdp out of the directory list.
494	 */
495	ASSERT(tpdp->td_next != tpdp);
496	ASSERT(tpdp->td_prev != tpdp);
497	if (tpdp->td_prev) {
498		tpdp->td_prev->td_next = tpdp->td_next;
499	}
500	if (tpdp->td_next) {
501		tpdp->td_next->td_prev = tpdp->td_prev;
502	}
503
504	/*
505	 * If the roving slot pointer happens to match tpdp,
506	 * point it at the previous dirent.
507	 */
508	if (dir->tn_dir->td_prev == tpdp) {
509		dir->tn_dir->td_prev = tpdp->td_prev;
510	}
511	ASSERT(tpdp->td_next != tpdp);
512	ASSERT(tpdp->td_prev != tpdp);
513
514	/*
515	 * tpdp points to the correct directory entry
516	 */
517	namelen = strlen(tpdp->td_name) + 1;
518
519	tmp_memfree(tpdp, sizeof (struct tdirent) + namelen);
520	dir->tn_size -= (sizeof (struct tdirent) + namelen);
521	dir->tn_dirents--;
522
523	gethrestime(&now);
524	dir->tn_mtime = now;
525	dir->tn_ctime = now;
526	tp->tn_ctime = now;
527
528	ASSERT(tp->tn_nlink > 0);
529	DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
530	if (op == DR_RMDIR && tp->tn_type == VDIR) {
531		tdirtrunc(tp);
532		ASSERT(tp->tn_nlink == 0);
533	}
534	return (0);
535}
536
537/*
538 * tdirinit is used internally to initialize a directory (dir)
539 * with '.' and '..' entries without checking permissions and locking
540 */
541void
542tdirinit(
543	struct tmpnode *parent,		/* parent of directory to initialize */
544	struct tmpnode *dir)		/* the new directory */
545{
546	struct tdirent *dot, *dotdot;
547	timestruc_t now;
548
549	ASSERT(RW_WRITE_HELD(&parent->tn_rwlock));
550	ASSERT(dir->tn_type == VDIR);
551
552	dot = tmp_memalloc(sizeof (struct tdirent) + 2, TMP_MUSTHAVE);
553	dotdot = tmp_memalloc(sizeof (struct tdirent) + 3, TMP_MUSTHAVE);
554
555	/*
556	 * Initialize the entries
557	 */
558	dot->td_tmpnode = dir;
559	dot->td_offset = 0;
560	dot->td_name = (char *)dot + sizeof (struct tdirent);
561	dot->td_name[0] = '.';
562	dot->td_parent = dir;
563	tmpfs_hash_in(dot);
564
565	dotdot->td_tmpnode = parent;
566	dotdot->td_offset = 1;
567	dotdot->td_name = (char *)dotdot + sizeof (struct tdirent);
568	dotdot->td_name[0] = '.';
569	dotdot->td_name[1] = '.';
570	dotdot->td_parent = dir;
571	tmpfs_hash_in(dotdot);
572
573	/*
574	 * Initialize directory entry list.
575	 */
576	dot->td_next = dotdot;
577	dot->td_prev = dotdot;	/* dot's td_prev holds roving slot pointer */
578	dotdot->td_next = NULL;
579	dotdot->td_prev = dot;
580
581	gethrestime(&now);
582	dir->tn_mtime = now;
583	dir->tn_ctime = now;
584
585	/*
586	 * Link counts are special for the hidden attribute directory.
587	 * The only explicit reference in the name space is "." and
588	 * the reference through ".." is not counted on the parent
589	 * file. The attrdir is created as a side effect to lookup,
590	 * so don't change the ctime of the parent.
591	 * Since tdirinit is called with both dir and parent being the
592	 * same for the root vnode, we need to increment this before we set
593	 * tn_nlink = 2 below.
594	 */
595	if (!(dir->tn_vnode->v_flag & V_XATTRDIR)) {
596		INCR_COUNT(&parent->tn_nlink, &parent->tn_tlock);
597		parent->tn_ctime = now;
598	}
599
600	dir->tn_dir = dot;
601	dir->tn_size = 2 * sizeof (struct tdirent) + 5;	/* dot and dotdot */
602	dir->tn_dirents = 2;
603	dir->tn_nlink = 2;
604}
605
606
607/*
608 * tdirtrunc is called to remove all directory entries under this directory.
609 */
610void
611tdirtrunc(struct tmpnode *dir)
612{
613	struct tdirent *tdp;
614	struct tmpnode *tp;
615	size_t namelen;
616	timestruc_t now;
617	int isvattrdir, isdotdot, skip_decr;
618
619	ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
620	ASSERT(dir->tn_type == VDIR);
621
622	isvattrdir = (dir->tn_vnode->v_flag & V_XATTRDIR) ? 1 : 0;
623	for (tdp = dir->tn_dir; tdp; tdp = dir->tn_dir) {
624		ASSERT(tdp->td_next != tdp);
625		ASSERT(tdp->td_prev != tdp);
626		ASSERT(tdp->td_tmpnode);
627
628		dir->tn_dir = tdp->td_next;
629		namelen = strlen(tdp->td_name) + 1;
630
631		/*
632		 * Adjust the link counts to account for this directory
633		 * entry removal. Hidden attribute directories may
634		 * not be empty as they may be truncated as a side-
635		 * effect of removing the parent. We do hold/rele
636		 * operations to free up these tmpnodes.
637		 *
638		 * Skip the link count adjustment for parents of
639		 * attribute directories as those link counts
640		 * do not include the ".." reference in the hidden
641		 * directories.
642		 */
643		tp = tdp->td_tmpnode;
644		isdotdot = (strcmp("..", tdp->td_name) == 0);
645		skip_decr = (isvattrdir && isdotdot);
646		if (!skip_decr) {
647			ASSERT(tp->tn_nlink > 0);
648			DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
649		}
650
651		tmpfs_hash_out(tdp);
652
653		tmp_memfree(tdp, sizeof (struct tdirent) + namelen);
654		dir->tn_size -= (sizeof (struct tdirent) + namelen);
655		dir->tn_dirents--;
656	}
657
658	gethrestime(&now);
659	dir->tn_mtime = now;
660	dir->tn_ctime = now;
661
662	ASSERT(dir->tn_dir == NULL);
663	ASSERT(dir->tn_size == 0);
664	ASSERT(dir->tn_dirents == 0);
665}
666
667/*
668 * Check if the source directory is in the path of the target directory.
669 * The target directory is locked by the caller.
670 *
671 * XXX - The source and target's should be different upon entry.
672 */
673static int
674tdircheckpath(
675	struct tmpnode *fromtp,
676	struct tmpnode	*toparent,
677	struct cred	*cred)
678{
679	int	error = 0;
680	struct tmpnode *dir, *dotdot;
681	struct tdirent *tdp;
682
683	ASSERT(RW_WRITE_HELD(&toparent->tn_rwlock));
684
685	tdp = tmpfs_hash_lookup("..", toparent, 1, &dotdot);
686	if (tdp == NULL)
687		return (ENOENT);
688
689	ASSERT(dotdot);
690
691	if (dotdot == toparent) {
692		/* root of fs.  search trivially satisfied. */
693		tmpnode_rele(dotdot);
694		return (0);
695	}
696	for (;;) {
697		/*
698		 * Return error for cases like "mv c c/d",
699		 * "mv c c/d/e" and so on.
700		 */
701		if (dotdot == fromtp) {
702			tmpnode_rele(dotdot);
703			error = EINVAL;
704			break;
705		}
706		dir = dotdot;
707		error = tdirlookup(dir, "..", &dotdot, cred);
708		if (error) {
709			tmpnode_rele(dir);
710			break;
711		}
712		/*
713		 * We're okay if we traverse the directory tree up to
714		 * the root directory and don't run into the
715		 * parent directory.
716		 */
717		if (dir == dotdot) {
718			tmpnode_rele(dir);
719			tmpnode_rele(dotdot);
720			break;
721		}
722		tmpnode_rele(dir);
723	}
724	return (error);
725}
726
727static int
728tdirrename(
729	struct tmpnode *fromparent,	/* parent directory of source */
730	struct tmpnode *fromtp,		/* source tmpnode */
731	struct tmpnode *toparent,	/* parent directory of target */
732	char *nm,			/* entry we are trying to change */
733	struct tmpnode *to,		/* target tmpnode */
734	struct tdirent *where,		/* target tmpnode directory entry */
735	struct cred *cred)		/* credentials */
736{
737	int error = 0;
738	int doingdirectory;
739	timestruc_t now;
740
741#if defined(lint)
742	nm = nm;
743#endif
744	ASSERT(RW_WRITE_HELD(&toparent->tn_rwlock));
745
746	/*
747	 * Short circuit rename of something to itself.
748	 */
749	if (fromtp == to)
750		return (ESAME);		/* special KLUDGE error code */
751
752	rw_enter(&fromtp->tn_rwlock, RW_READER);
753	rw_enter(&to->tn_rwlock, RW_READER);
754
755	/*
756	 * Check that everything is on the same filesystem.
757	 */
758	if (to->tn_vnode->v_vfsp != toparent->tn_vnode->v_vfsp ||
759	    to->tn_vnode->v_vfsp != fromtp->tn_vnode->v_vfsp) {
760		error = EXDEV;
761		goto out;
762	}
763
764	/*
765	 * Must have write permission to rewrite target entry.
766	 * Check for stickyness.
767	 */
768	if ((error = tmp_taccess(toparent, VWRITE, cred)) != 0 ||
769	    (error = tmp_sticky_remove_access(toparent, to, cred)) != 0)
770		goto out;
771
772	/*
773	 * Ensure source and target are compatible (both directories
774	 * or both not directories).  If target is a directory it must
775	 * be empty and have no links to it; in addition it must not
776	 * be a mount point, and both the source and target must be
777	 * writable.
778	 */
779	doingdirectory = (fromtp->tn_type == VDIR);
780	if (to->tn_type == VDIR) {
781		if (!doingdirectory) {
782			error = EISDIR;
783			goto out;
784		}
785		/*
786		 * vn_vfswlock will prevent mounts from using the directory
787		 * until we are done.
788		 */
789		if (vn_vfswlock(TNTOV(to))) {
790			error = EBUSY;
791			goto out;
792		}
793		if (vn_mountedvfs(TNTOV(to)) != NULL) {
794			vn_vfsunlock(TNTOV(to));
795			error = EBUSY;
796			goto out;
797		}
798
799		mutex_enter(&to->tn_tlock);
800		if (to->tn_dirents > 2 || to->tn_nlink > 2) {
801			mutex_exit(&to->tn_tlock);
802			vn_vfsunlock(TNTOV(to));
803			error = EEXIST; /* SIGH should be ENOTEMPTY */
804			/*
805			 * Update atime because checking tn_dirents is
806			 * logically equivalent to reading the directory
807			 */
808			gethrestime(&to->tn_atime);
809			goto out;
810		}
811		mutex_exit(&to->tn_tlock);
812	} else if (doingdirectory) {
813		error = ENOTDIR;
814		goto out;
815	}
816
817	tmpfs_hash_change(where, fromtp);
818	gethrestime(&now);
819	toparent->tn_mtime = now;
820	toparent->tn_ctime = now;
821
822	/*
823	 * Upgrade to write lock on "to" (i.e., the target tmpnode).
824	 */
825	rw_exit(&to->tn_rwlock);
826	rw_enter(&to->tn_rwlock, RW_WRITER);
827
828	/*
829	 * Decrement the link count of the target tmpnode.
830	 */
831	DECR_COUNT(&to->tn_nlink, &to->tn_tlock);
832	to->tn_ctime = now;
833
834	if (doingdirectory) {
835		/*
836		 * The entry for "to" no longer exists so release the vfslock.
837		 */
838		vn_vfsunlock(TNTOV(to));
839
840		/*
841		 * Decrement the target link count and delete all entires.
842		 */
843		tdirtrunc(to);
844		ASSERT(to->tn_nlink == 0);
845
846		/*
847		 * Renaming a directory with the parent different
848		 * requires that ".." be rewritten.  The window is
849		 * still there for ".." to be inconsistent, but this
850		 * is unavoidable, and a lot shorter than when it was
851		 * done in a user process.
852		 */
853		if (fromparent != toparent)
854			tdirfixdotdot(fromtp, fromparent, toparent);
855	}
856out:
857	rw_exit(&to->tn_rwlock);
858	rw_exit(&fromtp->tn_rwlock);
859	return (error);
860}
861
862static void
863tdirfixdotdot(
864	struct tmpnode	*fromtp,	/* child directory */
865	struct tmpnode	*fromparent,	/* old parent directory */
866	struct tmpnode	*toparent)	/* new parent directory */
867{
868	struct tdirent	*dotdot;
869
870	ASSERT(RW_LOCK_HELD(&toparent->tn_rwlock));
871
872	/*
873	 * Increment the link count in the new parent tmpnode
874	 */
875	INCR_COUNT(&toparent->tn_nlink, &toparent->tn_tlock);
876	gethrestime(&toparent->tn_ctime);
877
878	dotdot = tmpfs_hash_lookup("..", fromtp, 0, NULL);
879
880	ASSERT(dotdot->td_tmpnode == fromparent);
881	dotdot->td_tmpnode = toparent;
882
883	/*
884	 * Decrement the link count of the old parent tmpnode.
885	 * If fromparent is NULL, then this is a new directory link;
886	 * it has no parent, so we need not do anything.
887	 */
888	if (fromparent != NULL) {
889		mutex_enter(&fromparent->tn_tlock);
890		if (fromparent->tn_nlink != 0) {
891			fromparent->tn_nlink--;
892			gethrestime(&fromparent->tn_ctime);
893		}
894		mutex_exit(&fromparent->tn_tlock);
895	}
896}
897
898static int
899tdiraddentry(
900	struct tmpnode	*dir,	/* target directory to make entry in */
901	struct tmpnode	*tp,	/* new tmpnode */
902	char		*name,
903	enum de_op	op,
904	struct tmpnode	*fromtp)
905{
906	struct tdirent *tdp, *tpdp;
907	size_t		namelen, alloc_size;
908	timestruc_t	now;
909
910	/*
911	 * Make sure the parent directory wasn't removed from
912	 * underneath the caller.
913	 */
914	if (dir->tn_dir == NULL)
915		return (ENOENT);
916
917	/*
918	 * Check that everything is on the same filesystem.
919	 */
920	if (tp->tn_vnode->v_vfsp != dir->tn_vnode->v_vfsp)
921		return (EXDEV);
922
923	/*
924	 * Allocate and initialize directory entry
925	 */
926	namelen = strlen(name) + 1;
927	alloc_size = namelen + sizeof (struct tdirent);
928	tdp = tmp_memalloc(alloc_size, 0);
929	if (tdp == NULL)
930		return (ENOSPC);
931
932	if ((op == DE_RENAME) && (tp->tn_type == VDIR))
933		tdirfixdotdot(tp, fromtp, dir);
934
935	dir->tn_size += alloc_size;
936	dir->tn_dirents++;
937	tdp->td_tmpnode = tp;
938	tdp->td_parent = dir;
939
940	/*
941	 * The directory entry and its name were allocated sequentially.
942	 */
943	tdp->td_name = (char *)tdp + sizeof (struct tdirent);
944	(void) strcpy(tdp->td_name, name);
945
946	tmpfs_hash_in(tdp);
947
948	/*
949	 * Some utilities expect the size of a directory to remain
950	 * somewhat static.  For example, a routine which unlinks
951	 * files between calls to readdir(); the size of the
952	 * directory changes from underneath it and so the real
953	 * directory offset in bytes is invalid.  To circumvent
954	 * this problem, we initialize a directory entry with an
955	 * phony offset, and use this offset to determine end of
956	 * file in tmp_readdir.
957	 */
958	tpdp = dir->tn_dir->td_prev;
959	/*
960	 * Install at first empty "slot" in directory list.
961	 */
962	while (tpdp->td_next != NULL && (tpdp->td_next->td_offset -
963	    tpdp->td_offset) <= 1) {
964		ASSERT(tpdp->td_next != tpdp);
965		ASSERT(tpdp->td_prev != tpdp);
966		ASSERT(tpdp->td_next->td_offset > tpdp->td_offset);
967		tpdp = tpdp->td_next;
968	}
969	tdp->td_offset = tpdp->td_offset + 1;
970
971	/*
972	 * If we're at the end of the dirent list and the offset (which
973	 * is necessarily the largest offset in this directory) is more
974	 * than twice the number of dirents, that means the directory is
975	 * 50% holes.  At this point we reset the slot pointer back to
976	 * the beginning of the directory so we start using the holes.
977	 * The idea is that if there are N dirents, there must also be
978	 * N holes, so we can satisfy the next N creates by walking at
979	 * most 2N entries; thus the average cost of a create is constant.
980	 * Note that we use the first dirent's td_prev as the roving
981	 * slot pointer; it's ugly, but it saves a word in every dirent.
982	 */
983	if (tpdp->td_next == NULL && tpdp->td_offset > 2 * dir->tn_dirents)
984		dir->tn_dir->td_prev = dir->tn_dir->td_next;
985	else
986		dir->tn_dir->td_prev = tdp;
987
988	ASSERT(tpdp->td_next != tpdp);
989	ASSERT(tpdp->td_prev != tpdp);
990
991	tdp->td_next = tpdp->td_next;
992	if (tdp->td_next) {
993		tdp->td_next->td_prev = tdp;
994	}
995	tdp->td_prev = tpdp;
996	tpdp->td_next = tdp;
997
998	ASSERT(tdp->td_next != tdp);
999	ASSERT(tdp->td_prev != tdp);
1000	ASSERT(tpdp->td_next != tpdp);
1001	ASSERT(tpdp->td_prev != tpdp);
1002
1003	gethrestime(&now);
1004	dir->tn_mtime = now;
1005	dir->tn_ctime = now;
1006
1007	return (0);
1008}
1009
1010static int
1011tdirmaketnode(
1012	struct tmpnode *dir,
1013	struct tmount	*tm,
1014	struct vattr	*va,
1015	enum	de_op	op,
1016	struct tmpnode **newnode,
1017	struct cred	*cred)
1018{
1019	struct tmpnode *tp;
1020	enum vtype	type;
1021
1022	ASSERT(va != NULL);
1023	ASSERT(op == DE_CREATE || op == DE_MKDIR);
1024	if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) ||
1025	    ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime)))
1026		return (EOVERFLOW);
1027	type = va->va_type;
1028	tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE);
1029	tmpnode_init(tm, tp, va, cred);
1030
1031	/* setup normal file/dir's extended attribute directory */
1032	if (dir->tn_flags & ISXATTR) {
1033		/* parent dir is , mark file as xattr */
1034		tp->tn_flags |= ISXATTR;
1035	}
1036
1037
1038	if (type == VBLK || type == VCHR) {
1039		tp->tn_vnode->v_rdev = tp->tn_rdev = va->va_rdev;
1040	} else {
1041		tp->tn_vnode->v_rdev = tp->tn_rdev = NODEV;
1042	}
1043	tp->tn_vnode->v_type = type;
1044	tp->tn_uid = crgetuid(cred);
1045
1046	/*
1047	 * To determine the group-id of the created file:
1048	 *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
1049	 *	clients are not likely to set the gid), then use it if
1050	 *	the process is privileged, belongs to the target group,
1051	 *	or the group is the same as the parent directory.
1052	 *   2) If the filesystem was not mounted with the Old-BSD-compatible
1053	 *	GRPID option, and the directory's set-gid bit is clear,
1054	 *	then use the process's gid.
1055	 *   3) Otherwise, set the group-id to the gid of the parent directory.
1056	 */
1057	if ((va->va_mask & AT_GID) &&
1058	    ((va->va_gid == dir->tn_gid) || groupmember(va->va_gid, cred) ||
1059	    secpolicy_vnode_create_gid(cred) == 0)) {
1060		/*
1061		 * XXX - is this only the case when a 4.0 NFS client, or a
1062		 * client derived from that code, makes a call over the wire?
1063		 */
1064		tp->tn_gid = va->va_gid;
1065	} else {
1066		if (dir->tn_mode & VSGID)
1067			tp->tn_gid = dir->tn_gid;
1068		else
1069			tp->tn_gid = crgetgid(cred);
1070	}
1071	/*
1072	 * If we're creating a directory, and the parent directory has the
1073	 * set-GID bit set, set it on the new directory.
1074	 * Otherwise, if the user is neither privileged nor a member of the
1075	 * file's new group, clear the file's set-GID bit.
1076	 */
1077	if (dir->tn_mode & VSGID && type == VDIR)
1078		tp->tn_mode |= VSGID;
1079	else {
1080		if ((tp->tn_mode & VSGID) &&
1081		    secpolicy_vnode_setids_setgids(cred, tp->tn_gid) != 0)
1082			tp->tn_mode &= ~VSGID;
1083	}
1084
1085	if (va->va_mask & AT_ATIME)
1086		tp->tn_atime = va->va_atime;
1087	if (va->va_mask & AT_MTIME)
1088		tp->tn_mtime = va->va_mtime;
1089
1090	if (op == DE_MKDIR)
1091		tdirinit(dir, tp);
1092
1093	*newnode = tp;
1094	return (0);
1095}
1096