1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright 1998, 2000 Marshall Kirk McKusick.
5 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
6 * All rights reserved.
7 *
8 * The soft updates code is derived from the appendix of a University
9 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
10 * "Soft Updates: A Solution to the Metadata Update Problem in File
11 * Systems", CSE-TR-254-95, August 1995).
12 *
13 * Further information about soft updates can be obtained from:
14 *
15 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
16 *	1614 Oxford Street		mckusick@mckusick.com
17 *	Berkeley, CA 94709-1608		+1-510-843-9542
18 *	USA
19 *
20 * Redistribution and use in source and binary forms, with or without
21 * modification, are permitted provided that the following conditions
22 * are met:
23 *
24 * 1. Redistributions of source code must retain the above copyright
25 *    notice, this list of conditions and the following disclaimer.
26 * 2. Redistributions in binary form must reproduce the above copyright
27 *    notice, this list of conditions and the following disclaimer in the
28 *    documentation and/or other materials provided with the distribution.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
31 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
32 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
33 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
34 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
35 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
36 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
37 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
38 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
39 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 *
41 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
42 */
43
44#include <sys/cdefs.h>
45__FBSDID("$FreeBSD$");
46
47#include "opt_ffs.h"
48#include "opt_quota.h"
49#include "opt_ddb.h"
50
51#include <sys/param.h>
52#include <sys/kernel.h>
53#include <sys/systm.h>
54#include <sys/bio.h>
55#include <sys/buf.h>
56#include <sys/kdb.h>
57#include <sys/kthread.h>
58#include <sys/ktr.h>
59#include <sys/limits.h>
60#include <sys/lock.h>
61#include <sys/malloc.h>
62#include <sys/mount.h>
63#include <sys/mutex.h>
64#include <sys/namei.h>
65#include <sys/priv.h>
66#include <sys/proc.h>
67#include <sys/racct.h>
68#include <sys/rwlock.h>
69#include <sys/stat.h>
70#include <sys/sysctl.h>
71#include <sys/syslog.h>
72#include <sys/vnode.h>
73#include <sys/conf.h>
74
75#include <ufs/ufs/dir.h>
76#include <ufs/ufs/extattr.h>
77#include <ufs/ufs/quota.h>
78#include <ufs/ufs/inode.h>
79#include <ufs/ufs/ufsmount.h>
80#include <ufs/ffs/fs.h>
81#include <ufs/ffs/softdep.h>
82#include <ufs/ffs/ffs_extern.h>
83#include <ufs/ufs/ufs_extern.h>
84
85#include <vm/vm.h>
86#include <vm/vm_extern.h>
87#include <vm/vm_object.h>
88
89#include <geom/geom.h>
90#include <geom/geom_vfs.h>
91
92#include <ddb/ddb.h>
93
94#define	KTR_SUJ	0	/* Define to KTR_SPARE. */
95
96#ifndef SOFTUPDATES
97
98int
99softdep_flushfiles(oldmnt, flags, td)
100	struct mount *oldmnt;
101	int flags;
102	struct thread *td;
103{
104
105	panic("softdep_flushfiles called");
106}
107
108int
109softdep_mount(devvp, mp, fs, cred)
110	struct vnode *devvp;
111	struct mount *mp;
112	struct fs *fs;
113	struct ucred *cred;
114{
115
116	return (0);
117}
118
119void
120softdep_initialize()
121{
122
123	return;
124}
125
126void
127softdep_uninitialize()
128{
129
130	return;
131}
132
133void
134softdep_unmount(mp)
135	struct mount *mp;
136{
137
138	panic("softdep_unmount called");
139}
140
141void
142softdep_setup_sbupdate(ump, fs, bp)
143	struct ufsmount *ump;
144	struct fs *fs;
145	struct buf *bp;
146{
147
148	panic("softdep_setup_sbupdate called");
149}
150
151void
152softdep_setup_inomapdep(bp, ip, newinum, mode)
153	struct buf *bp;
154	struct inode *ip;
155	ino_t newinum;
156	int mode;
157{
158
159	panic("softdep_setup_inomapdep called");
160}
161
162void
163softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
164	struct buf *bp;
165	struct mount *mp;
166	ufs2_daddr_t newblkno;
167	int frags;
168	int oldfrags;
169{
170
171	panic("softdep_setup_blkmapdep called");
172}
173
174void
175softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
176	struct inode *ip;
177	ufs_lbn_t lbn;
178	ufs2_daddr_t newblkno;
179	ufs2_daddr_t oldblkno;
180	long newsize;
181	long oldsize;
182	struct buf *bp;
183{
184
185	panic("softdep_setup_allocdirect called");
186}
187
188void
189softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
190	struct inode *ip;
191	ufs_lbn_t lbn;
192	ufs2_daddr_t newblkno;
193	ufs2_daddr_t oldblkno;
194	long newsize;
195	long oldsize;
196	struct buf *bp;
197{
198
199	panic("softdep_setup_allocext called");
200}
201
202void
203softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
204	struct inode *ip;
205	ufs_lbn_t lbn;
206	struct buf *bp;
207	int ptrno;
208	ufs2_daddr_t newblkno;
209	ufs2_daddr_t oldblkno;
210	struct buf *nbp;
211{
212
213	panic("softdep_setup_allocindir_page called");
214}
215
216void
217softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
218	struct buf *nbp;
219	struct inode *ip;
220	struct buf *bp;
221	int ptrno;
222	ufs2_daddr_t newblkno;
223{
224
225	panic("softdep_setup_allocindir_meta called");
226}
227
228void
229softdep_journal_freeblocks(ip, cred, length, flags)
230	struct inode *ip;
231	struct ucred *cred;
232	off_t length;
233	int flags;
234{
235
236	panic("softdep_journal_freeblocks called");
237}
238
239void
240softdep_journal_fsync(ip)
241	struct inode *ip;
242{
243
244	panic("softdep_journal_fsync called");
245}
246
247void
248softdep_setup_freeblocks(ip, length, flags)
249	struct inode *ip;
250	off_t length;
251	int flags;
252{
253
254	panic("softdep_setup_freeblocks called");
255}
256
257void
258softdep_freefile(pvp, ino, mode)
259		struct vnode *pvp;
260		ino_t ino;
261		int mode;
262{
263
264	panic("softdep_freefile called");
265}
266
267int
268softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
269	struct buf *bp;
270	struct inode *dp;
271	off_t diroffset;
272	ino_t newinum;
273	struct buf *newdirbp;
274	int isnewblk;
275{
276
277	panic("softdep_setup_directory_add called");
278}
279
280void
281softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
282	struct buf *bp;
283	struct inode *dp;
284	caddr_t base;
285	caddr_t oldloc;
286	caddr_t newloc;
287	int entrysize;
288{
289
290	panic("softdep_change_directoryentry_offset called");
291}
292
293void
294softdep_setup_remove(bp, dp, ip, isrmdir)
295	struct buf *bp;
296	struct inode *dp;
297	struct inode *ip;
298	int isrmdir;
299{
300
301	panic("softdep_setup_remove called");
302}
303
304void
305softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
306	struct buf *bp;
307	struct inode *dp;
308	struct inode *ip;
309	ino_t newinum;
310	int isrmdir;
311{
312
313	panic("softdep_setup_directory_change called");
314}
315
316void
317softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
318	struct mount *mp;
319	struct buf *bp;
320	ufs2_daddr_t blkno;
321	int frags;
322	struct workhead *wkhd;
323{
324
325	panic("%s called", __FUNCTION__);
326}
327
328void
329softdep_setup_inofree(mp, bp, ino, wkhd)
330	struct mount *mp;
331	struct buf *bp;
332	ino_t ino;
333	struct workhead *wkhd;
334{
335
336	panic("%s called", __FUNCTION__);
337}
338
339void
340softdep_setup_unlink(dp, ip)
341	struct inode *dp;
342	struct inode *ip;
343{
344
345	panic("%s called", __FUNCTION__);
346}
347
348void
349softdep_setup_link(dp, ip)
350	struct inode *dp;
351	struct inode *ip;
352{
353
354	panic("%s called", __FUNCTION__);
355}
356
357void
358softdep_revert_link(dp, ip)
359	struct inode *dp;
360	struct inode *ip;
361{
362
363	panic("%s called", __FUNCTION__);
364}
365
366void
367softdep_setup_rmdir(dp, ip)
368	struct inode *dp;
369	struct inode *ip;
370{
371
372	panic("%s called", __FUNCTION__);
373}
374
375void
376softdep_revert_rmdir(dp, ip)
377	struct inode *dp;
378	struct inode *ip;
379{
380
381	panic("%s called", __FUNCTION__);
382}
383
384void
385softdep_setup_create(dp, ip)
386	struct inode *dp;
387	struct inode *ip;
388{
389
390	panic("%s called", __FUNCTION__);
391}
392
393void
394softdep_revert_create(dp, ip)
395	struct inode *dp;
396	struct inode *ip;
397{
398
399	panic("%s called", __FUNCTION__);
400}
401
402void
403softdep_setup_mkdir(dp, ip)
404	struct inode *dp;
405	struct inode *ip;
406{
407
408	panic("%s called", __FUNCTION__);
409}
410
411void
412softdep_revert_mkdir(dp, ip)
413	struct inode *dp;
414	struct inode *ip;
415{
416
417	panic("%s called", __FUNCTION__);
418}
419
420void
421softdep_setup_dotdot_link(dp, ip)
422	struct inode *dp;
423	struct inode *ip;
424{
425
426	panic("%s called", __FUNCTION__);
427}
428
429int
430softdep_prealloc(vp, waitok)
431	struct vnode *vp;
432	int waitok;
433{
434
435	panic("%s called", __FUNCTION__);
436}
437
438int
439softdep_journal_lookup(mp, vpp)
440	struct mount *mp;
441	struct vnode **vpp;
442{
443
444	return (ENOENT);
445}
446
447void
448softdep_change_linkcnt(ip)
449	struct inode *ip;
450{
451
452	panic("softdep_change_linkcnt called");
453}
454
455void
456softdep_load_inodeblock(ip)
457	struct inode *ip;
458{
459
460	panic("softdep_load_inodeblock called");
461}
462
463void
464softdep_update_inodeblock(ip, bp, waitfor)
465	struct inode *ip;
466	struct buf *bp;
467	int waitfor;
468{
469
470	panic("softdep_update_inodeblock called");
471}
472
473int
474softdep_fsync(vp)
475	struct vnode *vp;	/* the "in_core" copy of the inode */
476{
477
478	return (0);
479}
480
481void
482softdep_fsync_mountdev(vp)
483	struct vnode *vp;
484{
485
486	return;
487}
488
489int
490softdep_flushworklist(oldmnt, countp, td)
491	struct mount *oldmnt;
492	int *countp;
493	struct thread *td;
494{
495
496	*countp = 0;
497	return (0);
498}
499
500int
501softdep_sync_metadata(struct vnode *vp)
502{
503
504	panic("softdep_sync_metadata called");
505}
506
507int
508softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
509{
510
511	panic("softdep_sync_buf called");
512}
513
514int
515softdep_slowdown(vp)
516	struct vnode *vp;
517{
518
519	panic("softdep_slowdown called");
520}
521
522int
523softdep_request_cleanup(fs, vp, cred, resource)
524	struct fs *fs;
525	struct vnode *vp;
526	struct ucred *cred;
527	int resource;
528{
529
530	return (0);
531}
532
533int
534softdep_check_suspend(struct mount *mp,
535		      struct vnode *devvp,
536		      int softdep_depcnt,
537		      int softdep_accdepcnt,
538		      int secondary_writes,
539		      int secondary_accwrites)
540{
541	struct bufobj *bo;
542	int error;
543
544	(void) softdep_depcnt,
545	(void) softdep_accdepcnt;
546
547	bo = &devvp->v_bufobj;
548	ASSERT_BO_WLOCKED(bo);
549
550	MNT_ILOCK(mp);
551	while (mp->mnt_secondary_writes != 0) {
552		BO_UNLOCK(bo);
553		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
554		    (PUSER - 1) | PDROP, "secwr", 0);
555		BO_LOCK(bo);
556		MNT_ILOCK(mp);
557	}
558
559	/*
560	 * Reasons for needing more work before suspend:
561	 * - Dirty buffers on devvp.
562	 * - Secondary writes occurred after start of vnode sync loop
563	 */
564	error = 0;
565	if (bo->bo_numoutput > 0 ||
566	    bo->bo_dirty.bv_cnt > 0 ||
567	    secondary_writes != 0 ||
568	    mp->mnt_secondary_writes != 0 ||
569	    secondary_accwrites != mp->mnt_secondary_accwrites)
570		error = EAGAIN;
571	BO_UNLOCK(bo);
572	return (error);
573}
574
575void
576softdep_get_depcounts(struct mount *mp,
577		      int *softdepactivep,
578		      int *softdepactiveaccp)
579{
580	(void) mp;
581	*softdepactivep = 0;
582	*softdepactiveaccp = 0;
583}
584
585void
586softdep_buf_append(bp, wkhd)
587	struct buf *bp;
588	struct workhead *wkhd;
589{
590
591	panic("softdep_buf_appendwork called");
592}
593
594void
595softdep_inode_append(ip, cred, wkhd)
596	struct inode *ip;
597	struct ucred *cred;
598	struct workhead *wkhd;
599{
600
601	panic("softdep_inode_appendwork called");
602}
603
604void
605softdep_freework(wkhd)
606	struct workhead *wkhd;
607{
608
609	panic("softdep_freework called");
610}
611
612#else
613
614FEATURE(softupdates, "FFS soft-updates support");
615
616static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
617    "soft updates stats");
618static SYSCTL_NODE(_debug_softdep, OID_AUTO, total,
619    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
620    "total dependencies allocated");
621static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse,
622    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
623    "high use dependencies allocated");
624static SYSCTL_NODE(_debug_softdep, OID_AUTO, current,
625    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
626    "current dependencies allocated");
627static SYSCTL_NODE(_debug_softdep, OID_AUTO, write,
628    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
629    "current dependencies written");
630
631unsigned long dep_current[D_LAST + 1];
632unsigned long dep_highuse[D_LAST + 1];
633unsigned long dep_total[D_LAST + 1];
634unsigned long dep_write[D_LAST + 1];
635
636#define	SOFTDEP_TYPE(type, str, long)					\
637    static MALLOC_DEFINE(M_ ## type, #str, long);			\
638    SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
639	&dep_total[D_ ## type], 0, "");					\
640    SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, 	\
641	&dep_current[D_ ## type], 0, "");				\
642    SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD, 	\
643	&dep_highuse[D_ ## type], 0, "");				\
644    SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, 	\
645	&dep_write[D_ ## type], 0, "");
646
647SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
648SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
649SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
650    "Block or frag allocated from cyl group map");
651SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
652SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
653SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
654SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
655SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
656SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
657SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
658SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
659SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
660SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
661SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
662SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
663SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
664SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
665SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
666SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
667SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
668SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
669SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
670SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
671SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
672SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
673SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
674SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
675
676static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");
677
678static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
679static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
680static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data");
681
682#define M_SOFTDEP_FLAGS	(M_WAITOK)
683
684/*
685 * translate from workitem type to memory type
686 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
687 */
688static struct malloc_type *memtype[] = {
689	NULL,
690	M_PAGEDEP,
691	M_INODEDEP,
692	M_BMSAFEMAP,
693	M_NEWBLK,
694	M_ALLOCDIRECT,
695	M_INDIRDEP,
696	M_ALLOCINDIR,
697	M_FREEFRAG,
698	M_FREEBLKS,
699	M_FREEFILE,
700	M_DIRADD,
701	M_MKDIR,
702	M_DIRREM,
703	M_NEWDIRBLK,
704	M_FREEWORK,
705	M_FREEDEP,
706	M_JADDREF,
707	M_JREMREF,
708	M_JMVREF,
709	M_JNEWBLK,
710	M_JFREEBLK,
711	M_JFREEFRAG,
712	M_JSEG,
713	M_JSEGDEP,
714	M_SBDEP,
715	M_JTRUNC,
716	M_JFSYNC,
717	M_SENTINEL
718};
719
720#define DtoM(type) (memtype[type])
721
722/*
723 * Names of malloc types.
724 */
725#define TYPENAME(type)  \
726	((unsigned)(type) <= D_LAST && (unsigned)(type) >= D_FIRST ? \
727	memtype[type]->ks_shortdesc : "???")
728/*
729 * End system adaptation definitions.
730 */
731
732#define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
733#define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)
734
735/*
736 * Internal function prototypes.
737 */
738static	void check_clear_deps(struct mount *);
739static	void softdep_error(char *, int);
740static	int softdep_process_worklist(struct mount *, int);
741static	int softdep_waitidle(struct mount *, int);
742static	void drain_output(struct vnode *);
743static	struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
744static	int check_inodedep_free(struct inodedep *);
745static	void clear_remove(struct mount *);
746static	void clear_inodedeps(struct mount *);
747static	void unlinked_inodedep(struct mount *, struct inodedep *);
748static	void clear_unlinked_inodedep(struct inodedep *);
749static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
750static	int flush_pagedep_deps(struct vnode *, struct mount *,
751	    struct diraddhd *);
752static	int free_pagedep(struct pagedep *);
753static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
754static	int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
755static	int flush_deplist(struct allocdirectlst *, int, int *);
756static	int sync_cgs(struct mount *, int);
757static	int handle_written_filepage(struct pagedep *, struct buf *, int);
758static	int handle_written_sbdep(struct sbdep *, struct buf *);
759static	void initiate_write_sbdep(struct sbdep *);
760static	void diradd_inode_written(struct diradd *, struct inodedep *);
761static	int handle_written_indirdep(struct indirdep *, struct buf *,
762	    struct buf**, int);
763static	int handle_written_inodeblock(struct inodedep *, struct buf *, int);
764static	int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
765	    uint8_t *);
766static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int);
767static	void handle_written_jaddref(struct jaddref *);
768static	void handle_written_jremref(struct jremref *);
769static	void handle_written_jseg(struct jseg *, struct buf *);
770static	void handle_written_jnewblk(struct jnewblk *);
771static	void handle_written_jblkdep(struct jblkdep *);
772static	void handle_written_jfreefrag(struct jfreefrag *);
773static	void complete_jseg(struct jseg *);
774static	void complete_jsegs(struct jseg *);
775static	void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
776static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
777static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
778static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
779static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
780static	void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
781static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
782static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
783static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
784static	inline void inoref_write(struct inoref *, struct jseg *,
785	    struct jrefrec *);
786static	void handle_allocdirect_partdone(struct allocdirect *,
787	    struct workhead *);
788static	struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
789	    struct workhead *);
790static	void indirdep_complete(struct indirdep *);
791static	int indirblk_lookup(struct mount *, ufs2_daddr_t);
792static	void indirblk_insert(struct freework *);
793static	void indirblk_remove(struct freework *);
794static	void handle_allocindir_partdone(struct allocindir *);
795static	void initiate_write_filepage(struct pagedep *, struct buf *);
796static	void initiate_write_indirdep(struct indirdep*, struct buf *);
797static	void handle_written_mkdir(struct mkdir *, int);
798static	int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
799	    uint8_t *);
800static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
801static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
802static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
803static	void handle_workitem_freefile(struct freefile *);
804static	int handle_workitem_remove(struct dirrem *, int);
805static	struct dirrem *newdirrem(struct buf *, struct inode *,
806	    struct inode *, int, struct dirrem **);
807static	struct indirdep *indirdep_lookup(struct mount *, struct inode *,
808	    struct buf *);
809static	void cancel_indirdep(struct indirdep *, struct buf *,
810	    struct freeblks *);
811static	void free_indirdep(struct indirdep *);
812static	void free_diradd(struct diradd *, struct workhead *);
813static	void merge_diradd(struct inodedep *, struct diradd *);
814static	void complete_diradd(struct diradd *);
815static	struct diradd *diradd_lookup(struct pagedep *, int);
816static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
817	    struct jremref *);
818static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
819	    struct jremref *);
820static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
821	    struct jremref *, struct jremref *);
822static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
823	    struct jremref *);
824static	void cancel_allocindir(struct allocindir *, struct buf *bp,
825	    struct freeblks *, int);
826static	int setup_trunc_indir(struct freeblks *, struct inode *,
827	    ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
828static	void complete_trunc_indir(struct freework *);
829static	void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
830	    int);
831static	void complete_mkdir(struct mkdir *);
832static	void free_newdirblk(struct newdirblk *);
833static	void free_jremref(struct jremref *);
834static	void free_jaddref(struct jaddref *);
835static	void free_jsegdep(struct jsegdep *);
836static	void free_jsegs(struct jblocks *);
837static	void rele_jseg(struct jseg *);
838static	void free_jseg(struct jseg *, struct jblocks *);
839static	void free_jnewblk(struct jnewblk *);
840static	void free_jblkdep(struct jblkdep *);
841static	void free_jfreefrag(struct jfreefrag *);
842static	void free_freedep(struct freedep *);
843static	void journal_jremref(struct dirrem *, struct jremref *,
844	    struct inodedep *);
845static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
846static	int cancel_jaddref(struct jaddref *, struct inodedep *,
847	    struct workhead *);
848static	void cancel_jfreefrag(struct jfreefrag *);
849static	inline void setup_freedirect(struct freeblks *, struct inode *,
850	    int, int);
851static	inline void setup_freeext(struct freeblks *, struct inode *, int, int);
852static	inline void setup_freeindir(struct freeblks *, struct inode *, int,
853	    ufs_lbn_t, int);
854static	inline struct freeblks *newfreeblks(struct mount *, struct inode *);
855static	void freeblks_free(struct ufsmount *, struct freeblks *, int);
856static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
857static	ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
858static	int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
859static	void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
860	    int, int);
861static	void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
862static 	int cancel_pagedep(struct pagedep *, struct freeblks *, int);
863static	int deallocate_dependencies(struct buf *, struct freeblks *, int);
864static	void newblk_freefrag(struct newblk*);
865static	void free_newblk(struct newblk *);
866static	void cancel_allocdirect(struct allocdirectlst *,
867	    struct allocdirect *, struct freeblks *);
868static	int check_inode_unwritten(struct inodedep *);
869static	int free_inodedep(struct inodedep *);
870static	void freework_freeblock(struct freework *, u_long);
871static	void freework_enqueue(struct freework *);
872static	int handle_workitem_freeblocks(struct freeblks *, int);
873static	int handle_complete_freeblocks(struct freeblks *, int);
874static	void handle_workitem_indirblk(struct freework *);
875static	void handle_written_freework(struct freework *);
876static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
877static	struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
878	    struct workhead *);
879static	struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
880	    struct inodedep *, struct allocindir *, ufs_lbn_t);
881static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
882	    ufs2_daddr_t, ufs_lbn_t);
883static	void handle_workitem_freefrag(struct freefrag *);
884static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
885	    ufs_lbn_t, u_long);
886static	void allocdirect_merge(struct allocdirectlst *,
887	    struct allocdirect *, struct allocdirect *);
888static	struct freefrag *allocindir_merge(struct allocindir *,
889	    struct allocindir *);
890static	int bmsafemap_find(struct bmsafemap_hashhead *, int,
891	    struct bmsafemap **);
892static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
893	    int cg, struct bmsafemap *);
894static	int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int,
895	    struct newblk **);
896static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
897static	int inodedep_find(struct inodedep_hashhead *, ino_t,
898	    struct inodedep **);
899static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
900static	int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
901	    int, struct pagedep **);
902static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
903	    struct pagedep **);
904static	void pause_timer(void *);
905static	int request_cleanup(struct mount *, int);
906static	int softdep_request_cleanup_flush(struct mount *, struct ufsmount *);
907static	void schedule_cleanup(struct mount *);
908static void softdep_ast_cleanup_proc(struct thread *);
909static struct ufsmount *softdep_bp_to_mp(struct buf *bp);
910static	int process_worklist_item(struct mount *, int, int);
911static	void process_removes(struct vnode *);
912static	void process_truncates(struct vnode *);
913static	void jwork_move(struct workhead *, struct workhead *);
914static	void jwork_insert(struct workhead *, struct jsegdep *);
915static	void add_to_worklist(struct worklist *, int);
916static	void wake_worklist(struct worklist *);
917static	void wait_worklist(struct worklist *, char *);
918static	void remove_from_worklist(struct worklist *);
919static	void softdep_flush(void *);
920static	void softdep_flushjournal(struct mount *);
921static	int softdep_speedup(struct ufsmount *);
922static	void worklist_speedup(struct mount *);
923static	int journal_mount(struct mount *, struct fs *, struct ucred *);
924static	void journal_unmount(struct ufsmount *);
925static	int journal_space(struct ufsmount *, int);
926static	void journal_suspend(struct ufsmount *);
927static	int journal_unsuspend(struct ufsmount *ump);
928static	void softdep_prelink(struct vnode *, struct vnode *);
929static	void add_to_journal(struct worklist *);
930static	void remove_from_journal(struct worklist *);
931static	bool softdep_excess_items(struct ufsmount *, int);
932static	void softdep_process_journal(struct mount *, struct worklist *, int);
933static	struct jremref *newjremref(struct dirrem *, struct inode *,
934	    struct inode *ip, off_t, nlink_t);
935static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
936	    uint16_t);
937static	inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
938	    uint16_t);
939static	inline struct jsegdep *inoref_jseg(struct inoref *);
940static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
941static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
942	    ufs2_daddr_t, int);
943static	void adjust_newfreework(struct freeblks *, int);
944static	struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
945static	void move_newblock_dep(struct jaddref *, struct inodedep *);
946static	void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
947static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
948	    ufs2_daddr_t, long, ufs_lbn_t);
949static	struct freework *newfreework(struct ufsmount *, struct freeblks *,
950	    struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
951static	int jwait(struct worklist *, int);
952static	struct inodedep *inodedep_lookup_ip(struct inode *);
953static	int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
954static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
955static	void handle_jwork(struct workhead *);
956static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
957	    struct mkdir **);
958static	struct jblocks *jblocks_create(void);
959static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
960static	void jblocks_free(struct jblocks *, struct mount *, int);
961static	void jblocks_destroy(struct jblocks *);
962static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
963
964/*
965 * Exported softdep operations.
966 */
967static	void softdep_disk_io_initiation(struct buf *);
968static	void softdep_disk_write_complete(struct buf *);
969static	void softdep_deallocate_dependencies(struct buf *);
970static	int softdep_count_dependencies(struct buf *bp, int);
971
972/*
973 * Global lock over all of soft updates.
974 */
975static struct mtx lk;
976MTX_SYSINIT(softdep_lock, &lk, "global softdep", MTX_DEF);
977
978#define ACQUIRE_GBLLOCK(lk)	mtx_lock(lk)
979#define FREE_GBLLOCK(lk)	mtx_unlock(lk)
980#define GBLLOCK_OWNED(lk)	mtx_assert((lk), MA_OWNED)
981
982/*
983 * Per-filesystem soft-updates locking.
984 */
985#define LOCK_PTR(ump)		(&(ump)->um_softdep->sd_fslock)
986#define TRY_ACQUIRE_LOCK(ump)	rw_try_wlock(&(ump)->um_softdep->sd_fslock)
987#define ACQUIRE_LOCK(ump)	rw_wlock(&(ump)->um_softdep->sd_fslock)
988#define FREE_LOCK(ump)		rw_wunlock(&(ump)->um_softdep->sd_fslock)
989#define LOCK_OWNED(ump)		rw_assert(&(ump)->um_softdep->sd_fslock, \
990				    RA_WLOCKED)
991
992#define	BUF_AREC(bp)		lockallowrecurse(&(bp)->b_lock)
993#define	BUF_NOREC(bp)		lockdisablerecurse(&(bp)->b_lock)
994
995/*
996 * Worklist queue management.
997 * These routines require that the lock be held.
998 */
999#ifndef /* NOT */ INVARIANTS
1000#define WORKLIST_INSERT(head, item) do {	\
1001	(item)->wk_state |= ONWORKLIST;		\
1002	LIST_INSERT_HEAD(head, item, wk_list);	\
1003} while (0)
1004#define WORKLIST_REMOVE(item) do {		\
1005	(item)->wk_state &= ~ONWORKLIST;	\
1006	LIST_REMOVE(item, wk_list);		\
1007} while (0)
1008#define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
1009#define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE
1010
1011#else /* INVARIANTS */
1012static	void worklist_insert(struct workhead *, struct worklist *, int,
1013	const char *, int);
1014static	void worklist_remove(struct worklist *, int, const char *, int);
1015
1016#define WORKLIST_INSERT(head, item) \
1017	worklist_insert(head, item, 1, __func__, __LINE__)
1018#define WORKLIST_INSERT_UNLOCKED(head, item)\
1019	worklist_insert(head, item, 0, __func__, __LINE__)
1020#define WORKLIST_REMOVE(item)\
1021	worklist_remove(item, 1, __func__, __LINE__)
1022#define WORKLIST_REMOVE_UNLOCKED(item)\
1023	worklist_remove(item, 0, __func__, __LINE__)
1024
1025static void
1026worklist_insert(head, item, locked, func, line)
1027	struct workhead *head;
1028	struct worklist *item;
1029	int locked;
1030	const char *func;
1031	int line;
1032{
1033
1034	if (locked)
1035		LOCK_OWNED(VFSTOUFS(item->wk_mp));
1036	if (item->wk_state & ONWORKLIST)
1037		panic("worklist_insert: %p %s(0x%X) already on list, "
1038		    "added in function %s at line %d",
1039		    item, TYPENAME(item->wk_type), item->wk_state,
1040		    item->wk_func, item->wk_line);
1041	item->wk_state |= ONWORKLIST;
1042	item->wk_func = func;
1043	item->wk_line = line;
1044	LIST_INSERT_HEAD(head, item, wk_list);
1045}
1046
1047static void
1048worklist_remove(item, locked, func, line)
1049	struct worklist *item;
1050	int locked;
1051	const char *func;
1052	int line;
1053{
1054
1055	if (locked)
1056		LOCK_OWNED(VFSTOUFS(item->wk_mp));
1057	if ((item->wk_state & ONWORKLIST) == 0)
1058		panic("worklist_remove: %p %s(0x%X) not on list, "
1059		    "removed in function %s at line %d",
1060		    item, TYPENAME(item->wk_type), item->wk_state,
1061		    item->wk_func, item->wk_line);
1062	item->wk_state &= ~ONWORKLIST;
1063	item->wk_func = func;
1064	item->wk_line = line;
1065	LIST_REMOVE(item, wk_list);
1066}
1067#endif /* INVARIANTS */
1068
1069/*
1070 * Merge two jsegdeps keeping only the oldest one as newer references
1071 * can't be discarded until after older references.
1072 */
1073static inline struct jsegdep *
1074jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
1075{
1076	struct jsegdep *swp;
1077
1078	if (two == NULL)
1079		return (one);
1080
1081	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
1082		swp = one;
1083		one = two;
1084		two = swp;
1085	}
1086	WORKLIST_REMOVE(&two->jd_list);
1087	free_jsegdep(two);
1088
1089	return (one);
1090}
1091
1092/*
1093 * If two freedeps are compatible free one to reduce list size.
1094 */
1095static inline struct freedep *
1096freedep_merge(struct freedep *one, struct freedep *two)
1097{
1098	if (two == NULL)
1099		return (one);
1100
1101	if (one->fd_freework == two->fd_freework) {
1102		WORKLIST_REMOVE(&two->fd_list);
1103		free_freedep(two);
1104	}
1105	return (one);
1106}
1107
1108/*
1109 * Move journal work from one list to another.  Duplicate freedeps and
1110 * jsegdeps are coalesced to keep the lists as small as possible.
1111 */
1112static void
1113jwork_move(dst, src)
1114	struct workhead *dst;
1115	struct workhead *src;
1116{
1117	struct freedep *freedep;
1118	struct jsegdep *jsegdep;
1119	struct worklist *wkn;
1120	struct worklist *wk;
1121
1122	KASSERT(dst != src,
1123	    ("jwork_move: dst == src"));
1124	freedep = NULL;
1125	jsegdep = NULL;
1126	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1127		if (wk->wk_type == D_JSEGDEP)
1128			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1129		else if (wk->wk_type == D_FREEDEP)
1130			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1131	}
1132
1133	while ((wk = LIST_FIRST(src)) != NULL) {
1134		WORKLIST_REMOVE(wk);
1135		WORKLIST_INSERT(dst, wk);
1136		if (wk->wk_type == D_JSEGDEP) {
1137			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1138			continue;
1139		}
1140		if (wk->wk_type == D_FREEDEP)
1141			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1142	}
1143}
1144
1145static void
1146jwork_insert(dst, jsegdep)
1147	struct workhead *dst;
1148	struct jsegdep *jsegdep;
1149{
1150	struct jsegdep *jsegdepn;
1151	struct worklist *wk;
1152
1153	LIST_FOREACH(wk, dst, wk_list)
1154		if (wk->wk_type == D_JSEGDEP)
1155			break;
1156	if (wk == NULL) {
1157		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1158		return;
1159	}
1160	jsegdepn = WK_JSEGDEP(wk);
1161	if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
1162		WORKLIST_REMOVE(wk);
1163		free_jsegdep(jsegdepn);
1164		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1165	} else
1166		free_jsegdep(jsegdep);
1167}
1168
1169/*
1170 * Routines for tracking and managing workitems.
1171 */
1172static	void workitem_free(struct worklist *, int);
1173static	void workitem_alloc(struct worklist *, int, struct mount *);
1174static	void workitem_reassign(struct worklist *, int);
1175
1176#define	WORKITEM_FREE(item, type) \
1177	workitem_free((struct worklist *)(item), (type))
1178#define	WORKITEM_REASSIGN(item, type) \
1179	workitem_reassign((struct worklist *)(item), (type))
1180
1181static void
1182workitem_free(item, type)
1183	struct worklist *item;
1184	int type;
1185{
1186	struct ufsmount *ump;
1187
1188#ifdef INVARIANTS
1189	if (item->wk_state & ONWORKLIST)
1190		panic("workitem_free: %s(0x%X) still on list, "
1191		    "added in function %s at line %d",
1192		    TYPENAME(item->wk_type), item->wk_state,
1193		    item->wk_func, item->wk_line);
1194	if (item->wk_type != type && type != D_NEWBLK)
1195		panic("workitem_free: type mismatch %s != %s",
1196		    TYPENAME(item->wk_type), TYPENAME(type));
1197#endif
1198	if (item->wk_state & IOWAITING)
1199		wakeup(item);
1200	ump = VFSTOUFS(item->wk_mp);
1201	LOCK_OWNED(ump);
1202	KASSERT(ump->softdep_deps > 0,
1203	    ("workitem_free: %s: softdep_deps going negative",
1204	    ump->um_fs->fs_fsmnt));
1205	if (--ump->softdep_deps == 0 && ump->softdep_req)
1206		wakeup(&ump->softdep_deps);
1207	KASSERT(dep_current[item->wk_type] > 0,
1208	    ("workitem_free: %s: dep_current[%s] going negative",
1209	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1210	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1211	    ("workitem_free: %s: softdep_curdeps[%s] going negative",
1212	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1213	atomic_subtract_long(&dep_current[item->wk_type], 1);
1214	ump->softdep_curdeps[item->wk_type] -= 1;
1215#ifdef INVARIANTS
1216	LIST_REMOVE(item, wk_all);
1217#endif
1218	free(item, DtoM(type));
1219}
1220
1221static void
1222workitem_alloc(item, type, mp)
1223	struct worklist *item;
1224	int type;
1225	struct mount *mp;
1226{
1227	struct ufsmount *ump;
1228
1229	item->wk_type = type;
1230	item->wk_mp = mp;
1231	item->wk_state = 0;
1232
1233	ump = VFSTOUFS(mp);
1234	ACQUIRE_GBLLOCK(&lk);
1235	dep_current[type]++;
1236	if (dep_current[type] > dep_highuse[type])
1237		dep_highuse[type] = dep_current[type];
1238	dep_total[type]++;
1239	FREE_GBLLOCK(&lk);
1240	ACQUIRE_LOCK(ump);
1241	ump->softdep_curdeps[type] += 1;
1242	ump->softdep_deps++;
1243	ump->softdep_accdeps++;
1244#ifdef INVARIANTS
1245	LIST_INSERT_HEAD(&ump->softdep_alldeps[type], item, wk_all);
1246#endif
1247	FREE_LOCK(ump);
1248}
1249
1250static void
1251workitem_reassign(item, newtype)
1252	struct worklist *item;
1253	int newtype;
1254{
1255	struct ufsmount *ump;
1256
1257	ump = VFSTOUFS(item->wk_mp);
1258	LOCK_OWNED(ump);
1259	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1260	    ("workitem_reassign: %s: softdep_curdeps[%s] going negative",
1261	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1262	ump->softdep_curdeps[item->wk_type] -= 1;
1263	ump->softdep_curdeps[newtype] += 1;
1264	KASSERT(dep_current[item->wk_type] > 0,
1265	    ("workitem_reassign: %s: dep_current[%s] going negative",
1266	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1267	ACQUIRE_GBLLOCK(&lk);
1268	dep_current[newtype]++;
1269	dep_current[item->wk_type]--;
1270	if (dep_current[newtype] > dep_highuse[newtype])
1271		dep_highuse[newtype] = dep_current[newtype];
1272	dep_total[newtype]++;
1273	FREE_GBLLOCK(&lk);
1274	item->wk_type = newtype;
1275}
1276
1277/*
1278 * Workitem queue management
1279 */
1280static int max_softdeps;	/* maximum number of structs before slowdown */
1281static int tickdelay = 2;	/* number of ticks to pause during slowdown */
1282static int proc_waiting;	/* tracks whether we have a timeout posted */
1283static int *stat_countp;	/* statistic to count in proc_waiting timeout */
1284static struct callout softdep_callout;
1285static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
1286static int req_clear_remove;	/* syncer process flush some freeblks */
1287static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
1288
1289/*
1290 * runtime statistics
1291 */
1292static int stat_flush_threads;	/* number of softdep flushing threads */
1293static int stat_worklist_push;	/* number of worklist cleanups */
1294static int stat_blk_limit_push;	/* number of times block limit neared */
1295static int stat_ino_limit_push;	/* number of times inode limit neared */
1296static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
1297static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
1298static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
1299static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
1300static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
1301static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1302static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
1303static int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
1304static int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
1305static int stat_journal_min;	/* Times hit journal min threshold */
1306static int stat_journal_low;	/* Times hit journal low threshold */
1307static int stat_journal_wait;	/* Times blocked in jwait(). */
1308static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
1309static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
1310static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
1311static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
1312static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
1313static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
1314static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
1315static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
1316static int stat_cleanup_failures; /* Number of cleanup requests that failed */
1317static int stat_emptyjblocks; /* Number of potentially empty journal blocks */
1318
1319SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1320    &max_softdeps, 0, "");
1321SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1322    &tickdelay, 0, "");
1323SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD,
1324    &stat_flush_threads, 0, "");
1325SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push,
1326    CTLFLAG_RW | CTLFLAG_STATS, &stat_worklist_push, 0,"");
1327SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push,
1328    CTLFLAG_RW | CTLFLAG_STATS, &stat_blk_limit_push, 0,"");
1329SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push,
1330    CTLFLAG_RW | CTLFLAG_STATS, &stat_ino_limit_push, 0,"");
1331SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit,
1332    CTLFLAG_RW | CTLFLAG_STATS, &stat_blk_limit_hit, 0, "");
1333SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit,
1334    CTLFLAG_RW | CTLFLAG_STATS, &stat_ino_limit_hit, 0, "");
1335SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit,
1336    CTLFLAG_RW | CTLFLAG_STATS, &stat_sync_limit_hit, 0, "");
1337SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs,
1338    CTLFLAG_RW | CTLFLAG_STATS, &stat_indir_blk_ptrs, 0, "");
1339SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap,
1340    CTLFLAG_RW | CTLFLAG_STATS, &stat_inode_bitmap, 0, "");
1341SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs,
1342    CTLFLAG_RW | CTLFLAG_STATS, &stat_direct_blk_ptrs, 0, "");
1343SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry,
1344    CTLFLAG_RW | CTLFLAG_STATS, &stat_dir_entry, 0, "");
1345SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback,
1346    CTLFLAG_RW | CTLFLAG_STATS, &stat_jaddref, 0, "");
1347SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback,
1348    CTLFLAG_RW | CTLFLAG_STATS, &stat_jnewblk, 0, "");
1349SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low,
1350    CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_low, 0, "");
1351SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min,
1352    CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_min, 0, "");
1353SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait,
1354    CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_wait, 0, "");
1355SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage,
1356    CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_filepage, 0, "");
1357SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks,
1358    CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_freeblks, 0, "");
1359SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode,
1360    CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_inode, 0, "");
1361SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk,
1362    CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_newblk, 0, "");
1363SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests,
1364    CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_blkrequests, 0, "");
1365SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests,
1366    CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_inorequests, 0, "");
1367SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay,
1368    CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_high_delay, 0, "");
1369SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries,
1370    CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_retries, 0, "");
1371SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures,
1372    CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_failures, 0, "");
1373
1374SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
1375    &softdep_flushcache, 0, "");
1376SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD,
1377    &stat_emptyjblocks, 0, "");
1378
1379SYSCTL_DECL(_vfs_ffs);
1380
1381/* Whether to recompute the summary at mount time */
1382static int compute_summary_at_mount = 0;
1383SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1384	   &compute_summary_at_mount, 0, "Recompute summary at mount");
1385static int print_threads = 0;
1386SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
1387    &print_threads, 0, "Notify flusher thread start/stop");
1388
1389/* List of all filesystems mounted with soft updates */
1390static TAILQ_HEAD(, mount_softdeps) softdepmounts;
1391
1392/*
1393 * This function cleans the worklist for a filesystem.
1394 * Each filesystem running with soft dependencies gets its own
1395 * thread to run in this function. The thread is started up in
1396 * softdep_mount and shutdown in softdep_unmount. They show up
1397 * as part of the kernel "bufdaemon" process whose process
1398 * entry is available in bufdaemonproc.
1399 */
1400static int searchfailed;
1401extern struct proc *bufdaemonproc;
1402static void
1403softdep_flush(addr)
1404	void *addr;
1405{
1406	struct mount *mp;
1407	struct thread *td;
1408	struct ufsmount *ump;
1409
1410	td = curthread;
1411	td->td_pflags |= TDP_NORUNNINGBUF;
1412	mp = (struct mount *)addr;
1413	ump = VFSTOUFS(mp);
1414	atomic_add_int(&stat_flush_threads, 1);
1415	ACQUIRE_LOCK(ump);
1416	ump->softdep_flags &= ~FLUSH_STARTING;
1417	wakeup(&ump->softdep_flushtd);
1418	FREE_LOCK(ump);
1419	if (print_threads) {
1420		if (stat_flush_threads == 1)
1421			printf("Running %s at pid %d\n", bufdaemonproc->p_comm,
1422			    bufdaemonproc->p_pid);
1423		printf("Start thread %s\n", td->td_name);
1424	}
1425	for (;;) {
1426		while (softdep_process_worklist(mp, 0) > 0 ||
1427		    (MOUNTEDSUJ(mp) &&
1428		    VFSTOUFS(mp)->softdep_jblocks->jb_suspended))
1429			kthread_suspend_check();
1430		ACQUIRE_LOCK(ump);
1431		if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1432			msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM,
1433			    "sdflush", hz / 2);
1434		ump->softdep_flags &= ~FLUSH_CLEANUP;
1435		/*
1436		 * Check to see if we are done and need to exit.
1437		 */
1438		if ((ump->softdep_flags & FLUSH_EXIT) == 0) {
1439			FREE_LOCK(ump);
1440			continue;
1441		}
1442		ump->softdep_flags &= ~FLUSH_EXIT;
1443		FREE_LOCK(ump);
1444		wakeup(&ump->softdep_flags);
1445		if (print_threads)
1446			printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups);
1447		atomic_subtract_int(&stat_flush_threads, 1);
1448		kthread_exit();
1449		panic("kthread_exit failed\n");
1450	}
1451}
1452
1453static void
1454worklist_speedup(mp)
1455	struct mount *mp;
1456{
1457	struct ufsmount *ump;
1458
1459	ump = VFSTOUFS(mp);
1460	LOCK_OWNED(ump);
1461	if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1462		ump->softdep_flags |= FLUSH_CLEANUP;
1463	wakeup(&ump->softdep_flushtd);
1464}
1465
1466static void
1467softdep_send_speedup(struct ufsmount *ump, size_t shortage, u_int flags)
1468{
1469	struct buf *bp;
1470
1471	if ((ump->um_flags & UM_CANSPEEDUP) == 0)
1472		return;
1473
1474	bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO);
1475	bp->b_iocmd = BIO_SPEEDUP;
1476	bp->b_ioflags = flags;
1477	bp->b_bcount = shortage;
1478	g_vfs_strategy(ump->um_bo, bp);
1479	bufwait(bp);
1480	free(bp, M_TRIM);
1481}
1482
1483static int
1484softdep_speedup(ump)
1485	struct ufsmount *ump;
1486{
1487	struct ufsmount *altump;
1488	struct mount_softdeps *sdp;
1489
1490	LOCK_OWNED(ump);
1491	worklist_speedup(ump->um_mountp);
1492	bd_speedup();
1493	/*
1494	 * If we have global shortages, then we need other
1495	 * filesystems to help with the cleanup. Here we wakeup a
1496	 * flusher thread for a filesystem that is over its fair
1497	 * share of resources.
1498	 */
1499	if (req_clear_inodedeps || req_clear_remove) {
1500		ACQUIRE_GBLLOCK(&lk);
1501		TAILQ_FOREACH(sdp, &softdepmounts, sd_next) {
1502			if ((altump = sdp->sd_ump) == ump)
1503				continue;
1504			if (((req_clear_inodedeps &&
1505			    altump->softdep_curdeps[D_INODEDEP] >
1506			    max_softdeps / stat_flush_threads) ||
1507			    (req_clear_remove &&
1508			    altump->softdep_curdeps[D_DIRREM] >
1509			    (max_softdeps / 2) / stat_flush_threads)) &&
1510			    TRY_ACQUIRE_LOCK(altump))
1511				break;
1512		}
1513		if (sdp == NULL) {
1514			searchfailed++;
1515			FREE_GBLLOCK(&lk);
1516		} else {
1517			/*
1518			 * Move to the end of the list so we pick a
1519			 * different one on out next try.
1520			 */
1521			TAILQ_REMOVE(&softdepmounts, sdp, sd_next);
1522			TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
1523			FREE_GBLLOCK(&lk);
1524			if ((altump->softdep_flags &
1525			    (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1526				altump->softdep_flags |= FLUSH_CLEANUP;
1527			altump->um_softdep->sd_cleanups++;
1528			wakeup(&altump->softdep_flushtd);
1529			FREE_LOCK(altump);
1530		}
1531	}
1532	return (speedup_syncer());
1533}
1534
1535/*
1536 * Add an item to the end of the work queue.
1537 * This routine requires that the lock be held.
1538 * This is the only routine that adds items to the list.
1539 * The following routine is the only one that removes items
1540 * and does so in order from first to last.
1541 */
1542
1543#define	WK_HEAD		0x0001	/* Add to HEAD. */
1544#define	WK_NODELAY	0x0002	/* Process immediately. */
1545
1546static void
1547add_to_worklist(wk, flags)
1548	struct worklist *wk;
1549	int flags;
1550{
1551	struct ufsmount *ump;
1552
1553	ump = VFSTOUFS(wk->wk_mp);
1554	LOCK_OWNED(ump);
1555	if (wk->wk_state & ONWORKLIST)
1556		panic("add_to_worklist: %s(0x%X) already on list",
1557		    TYPENAME(wk->wk_type), wk->wk_state);
1558	wk->wk_state |= ONWORKLIST;
1559	if (ump->softdep_on_worklist == 0) {
1560		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1561		ump->softdep_worklist_tail = wk;
1562	} else if (flags & WK_HEAD) {
1563		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1564	} else {
1565		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1566		ump->softdep_worklist_tail = wk;
1567	}
1568	ump->softdep_on_worklist += 1;
1569	if (flags & WK_NODELAY)
1570		worklist_speedup(wk->wk_mp);
1571}
1572
1573/*
1574 * Remove the item to be processed. If we are removing the last
1575 * item on the list, we need to recalculate the tail pointer.
1576 */
1577static void
1578remove_from_worklist(wk)
1579	struct worklist *wk;
1580{
1581	struct ufsmount *ump;
1582
1583	ump = VFSTOUFS(wk->wk_mp);
1584	if (ump->softdep_worklist_tail == wk)
1585		ump->softdep_worklist_tail =
1586		    (struct worklist *)wk->wk_list.le_prev;
1587	WORKLIST_REMOVE(wk);
1588	ump->softdep_on_worklist -= 1;
1589}
1590
1591static void
1592wake_worklist(wk)
1593	struct worklist *wk;
1594{
1595	if (wk->wk_state & IOWAITING) {
1596		wk->wk_state &= ~IOWAITING;
1597		wakeup(wk);
1598	}
1599}
1600
1601static void
1602wait_worklist(wk, wmesg)
1603	struct worklist *wk;
1604	char *wmesg;
1605{
1606	struct ufsmount *ump;
1607
1608	ump = VFSTOUFS(wk->wk_mp);
1609	wk->wk_state |= IOWAITING;
1610	msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0);
1611}
1612
1613/*
1614 * Process that runs once per second to handle items in the background queue.
1615 *
1616 * Note that we ensure that everything is done in the order in which they
1617 * appear in the queue. The code below depends on this property to ensure
1618 * that blocks of a file are freed before the inode itself is freed. This
1619 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1620 * until all the old ones have been purged from the dependency lists.
1621 */
1622static int
1623softdep_process_worklist(mp, full)
1624	struct mount *mp;
1625	int full;
1626{
1627	int cnt, matchcnt;
1628	struct ufsmount *ump;
1629	long starttime;
1630
1631	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1632	if (MOUNTEDSOFTDEP(mp) == 0)
1633		return (0);
1634	matchcnt = 0;
1635	ump = VFSTOUFS(mp);
1636	ACQUIRE_LOCK(ump);
1637	starttime = time_second;
1638	softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
1639	check_clear_deps(mp);
1640	while (ump->softdep_on_worklist > 0) {
1641		if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
1642			break;
1643		else
1644			matchcnt += cnt;
1645		check_clear_deps(mp);
1646		/*
1647		 * We do not generally want to stop for buffer space, but if
1648		 * we are really being a buffer hog, we will stop and wait.
1649		 */
1650		if (should_yield()) {
1651			FREE_LOCK(ump);
1652			kern_yield(PRI_USER);
1653			bwillwrite();
1654			ACQUIRE_LOCK(ump);
1655		}
1656		/*
1657		 * Never allow processing to run for more than one
1658		 * second. This gives the syncer thread the opportunity
1659		 * to pause if appropriate.
1660		 */
1661		if (!full && starttime != time_second)
1662			break;
1663	}
1664	if (full == 0)
1665		journal_unsuspend(ump);
1666	FREE_LOCK(ump);
1667	return (matchcnt);
1668}
1669
1670/*
1671 * Process all removes associated with a vnode if we are running out of
1672 * journal space.  Any other process which attempts to flush these will
1673 * be unable as we have the vnodes locked.
1674 */
1675static void
1676process_removes(vp)
1677	struct vnode *vp;
1678{
1679	struct inodedep *inodedep;
1680	struct dirrem *dirrem;
1681	struct ufsmount *ump;
1682	struct mount *mp;
1683	ino_t inum;
1684
1685	mp = vp->v_mount;
1686	ump = VFSTOUFS(mp);
1687	LOCK_OWNED(ump);
1688	inum = VTOI(vp)->i_number;
1689	for (;;) {
1690top:
1691		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1692			return;
1693		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
1694			/*
1695			 * If another thread is trying to lock this vnode
1696			 * it will fail but we must wait for it to do so
1697			 * before we can proceed.
1698			 */
1699			if (dirrem->dm_state & INPROGRESS) {
1700				wait_worklist(&dirrem->dm_list, "pwrwait");
1701				goto top;
1702			}
1703			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1704			    (COMPLETE | ONWORKLIST))
1705				break;
1706		}
1707		if (dirrem == NULL)
1708			return;
1709		remove_from_worklist(&dirrem->dm_list);
1710		FREE_LOCK(ump);
1711		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1712			panic("process_removes: suspended filesystem");
1713		handle_workitem_remove(dirrem, 0);
1714		vn_finished_secondary_write(mp);
1715		ACQUIRE_LOCK(ump);
1716	}
1717}
1718
1719/*
1720 * Process all truncations associated with a vnode if we are running out
1721 * of journal space.  This is called when the vnode lock is already held
1722 * and no other process can clear the truncation.  This function returns
1723 * a value greater than zero if it did any work.
1724 */
1725static void
1726process_truncates(vp)
1727	struct vnode *vp;
1728{
1729	struct inodedep *inodedep;
1730	struct freeblks *freeblks;
1731	struct ufsmount *ump;
1732	struct mount *mp;
1733	ino_t inum;
1734	int cgwait;
1735
1736	mp = vp->v_mount;
1737	ump = VFSTOUFS(mp);
1738	LOCK_OWNED(ump);
1739	inum = VTOI(vp)->i_number;
1740	for (;;) {
1741		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1742			return;
1743		cgwait = 0;
1744		TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
1745			/* Journal entries not yet written.  */
1746			if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
1747				jwait(&LIST_FIRST(
1748				    &freeblks->fb_jblkdephd)->jb_list,
1749				    MNT_WAIT);
1750				break;
1751			}
1752			/* Another thread is executing this item. */
1753			if (freeblks->fb_state & INPROGRESS) {
1754				wait_worklist(&freeblks->fb_list, "ptrwait");
1755				break;
1756			}
1757			/* Freeblks is waiting on a inode write. */
1758			if ((freeblks->fb_state & COMPLETE) == 0) {
1759				FREE_LOCK(ump);
1760				ffs_update(vp, 1);
1761				ACQUIRE_LOCK(ump);
1762				break;
1763			}
1764			if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
1765			    (ALLCOMPLETE | ONWORKLIST)) {
1766				remove_from_worklist(&freeblks->fb_list);
1767				freeblks->fb_state |= INPROGRESS;
1768				FREE_LOCK(ump);
1769				if (vn_start_secondary_write(NULL, &mp,
1770				    V_NOWAIT))
1771					panic("process_truncates: "
1772					    "suspended filesystem");
1773				handle_workitem_freeblocks(freeblks, 0);
1774				vn_finished_secondary_write(mp);
1775				ACQUIRE_LOCK(ump);
1776				break;
1777			}
1778			if (freeblks->fb_cgwait)
1779				cgwait++;
1780		}
1781		if (cgwait) {
1782			FREE_LOCK(ump);
1783			sync_cgs(mp, MNT_WAIT);
1784			ffs_sync_snap(mp, MNT_WAIT);
1785			ACQUIRE_LOCK(ump);
1786			continue;
1787		}
1788		if (freeblks == NULL)
1789			break;
1790	}
1791	return;
1792}
1793
1794/*
1795 * Process one item on the worklist.
1796 */
1797static int
1798process_worklist_item(mp, target, flags)
1799	struct mount *mp;
1800	int target;
1801	int flags;
1802{
1803	struct worklist sentinel;
1804	struct worklist *wk;
1805	struct ufsmount *ump;
1806	int matchcnt;
1807	int error;
1808
1809	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1810	/*
1811	 * If we are being called because of a process doing a
1812	 * copy-on-write, then it is not safe to write as we may
1813	 * recurse into the copy-on-write routine.
1814	 */
1815	if (curthread->td_pflags & TDP_COWINPROGRESS)
1816		return (-1);
1817	PHOLD(curproc);	/* Don't let the stack go away. */
1818	ump = VFSTOUFS(mp);
1819	LOCK_OWNED(ump);
1820	matchcnt = 0;
1821	sentinel.wk_mp = NULL;
1822	sentinel.wk_type = D_SENTINEL;
1823	LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
1824	for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
1825	    wk = LIST_NEXT(&sentinel, wk_list)) {
1826		if (wk->wk_type == D_SENTINEL) {
1827			LIST_REMOVE(&sentinel, wk_list);
1828			LIST_INSERT_AFTER(wk, &sentinel, wk_list);
1829			continue;
1830		}
1831		if (wk->wk_state & INPROGRESS)
1832			panic("process_worklist_item: %p already in progress.",
1833			    wk);
1834		wk->wk_state |= INPROGRESS;
1835		remove_from_worklist(wk);
1836		FREE_LOCK(ump);
1837		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1838			panic("process_worklist_item: suspended filesystem");
1839		switch (wk->wk_type) {
1840		case D_DIRREM:
1841			/* removal of a directory entry */
1842			error = handle_workitem_remove(WK_DIRREM(wk), flags);
1843			break;
1844
1845		case D_FREEBLKS:
1846			/* releasing blocks and/or fragments from a file */
1847			error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
1848			    flags);
1849			break;
1850
1851		case D_FREEFRAG:
1852			/* releasing a fragment when replaced as a file grows */
1853			handle_workitem_freefrag(WK_FREEFRAG(wk));
1854			error = 0;
1855			break;
1856
1857		case D_FREEFILE:
1858			/* releasing an inode when its link count drops to 0 */
1859			handle_workitem_freefile(WK_FREEFILE(wk));
1860			error = 0;
1861			break;
1862
1863		default:
1864			panic("%s_process_worklist: Unknown type %s",
1865			    "softdep", TYPENAME(wk->wk_type));
1866			/* NOTREACHED */
1867		}
1868		vn_finished_secondary_write(mp);
1869		ACQUIRE_LOCK(ump);
1870		if (error == 0) {
1871			if (++matchcnt == target)
1872				break;
1873			continue;
1874		}
1875		/*
1876		 * We have to retry the worklist item later.  Wake up any
1877		 * waiters who may be able to complete it immediately and
1878		 * add the item back to the head so we don't try to execute
1879		 * it again.
1880		 */
1881		wk->wk_state &= ~INPROGRESS;
1882		wake_worklist(wk);
1883		add_to_worklist(wk, WK_HEAD);
1884	}
1885	/* Sentinal could've become the tail from remove_from_worklist. */
1886	if (ump->softdep_worklist_tail == &sentinel)
1887		ump->softdep_worklist_tail =
1888		    (struct worklist *)sentinel.wk_list.le_prev;
1889	LIST_REMOVE(&sentinel, wk_list);
1890	PRELE(curproc);
1891	return (matchcnt);
1892}
1893
1894/*
1895 * Move dependencies from one buffer to another.
1896 */
1897int
1898softdep_move_dependencies(oldbp, newbp)
1899	struct buf *oldbp;
1900	struct buf *newbp;
1901{
1902	struct worklist *wk, *wktail;
1903	struct ufsmount *ump;
1904	int dirty;
1905
1906	if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL)
1907		return (0);
1908	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
1909	    ("softdep_move_dependencies called on non-softdep filesystem"));
1910	dirty = 0;
1911	wktail = NULL;
1912	ump = VFSTOUFS(wk->wk_mp);
1913	ACQUIRE_LOCK(ump);
1914	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1915		LIST_REMOVE(wk, wk_list);
1916		if (wk->wk_type == D_BMSAFEMAP &&
1917		    bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
1918			dirty = 1;
1919		if (wktail == NULL)
1920			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1921		else
1922			LIST_INSERT_AFTER(wktail, wk, wk_list);
1923		wktail = wk;
1924	}
1925	FREE_LOCK(ump);
1926
1927	return (dirty);
1928}
1929
1930/*
1931 * Purge the work list of all items associated with a particular mount point.
1932 */
1933int
1934softdep_flushworklist(oldmnt, countp, td)
1935	struct mount *oldmnt;
1936	int *countp;
1937	struct thread *td;
1938{
1939	struct vnode *devvp;
1940	struct ufsmount *ump;
1941	int count, error;
1942
1943	/*
1944	 * Alternately flush the block device associated with the mount
1945	 * point and process any dependencies that the flushing
1946	 * creates. We continue until no more worklist dependencies
1947	 * are found.
1948	 */
1949	*countp = 0;
1950	error = 0;
1951	ump = VFSTOUFS(oldmnt);
1952	devvp = ump->um_devvp;
1953	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1954		*countp += count;
1955		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1956		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1957		VOP_UNLOCK(devvp);
1958		if (error != 0)
1959			break;
1960	}
1961	return (error);
1962}
1963
1964#define	SU_WAITIDLE_RETRIES	20
1965static int
1966softdep_waitidle(struct mount *mp, int flags __unused)
1967{
1968	struct ufsmount *ump;
1969	struct vnode *devvp;
1970	struct thread *td;
1971	int error, i;
1972
1973	ump = VFSTOUFS(mp);
1974	devvp = ump->um_devvp;
1975	td = curthread;
1976	error = 0;
1977	ACQUIRE_LOCK(ump);
1978	for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) {
1979		ump->softdep_req = 1;
1980		KASSERT((flags & FORCECLOSE) == 0 ||
1981		    ump->softdep_on_worklist == 0,
1982		    ("softdep_waitidle: work added after flush"));
1983		msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP,
1984		    "softdeps", 10 * hz);
1985		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1986		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1987		VOP_UNLOCK(devvp);
1988		ACQUIRE_LOCK(ump);
1989		if (error != 0)
1990			break;
1991	}
1992	ump->softdep_req = 0;
1993	if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) {
1994		error = EBUSY;
1995		printf("softdep_waitidle: Failed to flush worklist for %p\n",
1996		    mp);
1997	}
1998	FREE_LOCK(ump);
1999	return (error);
2000}
2001
2002/*
2003 * Flush all vnodes and worklist items associated with a specified mount point.
2004 */
2005int
2006softdep_flushfiles(oldmnt, flags, td)
2007	struct mount *oldmnt;
2008	int flags;
2009	struct thread *td;
2010{
2011#ifdef QUOTA
2012	struct ufsmount *ump;
2013	int i;
2014#endif
2015	int error, early, depcount, loopcnt, retry_flush_count, retry;
2016	int morework;
2017
2018	KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0,
2019	    ("softdep_flushfiles called on non-softdep filesystem"));
2020	loopcnt = 10;
2021	retry_flush_count = 3;
2022retry_flush:
2023	error = 0;
2024
2025	/*
2026	 * Alternately flush the vnodes associated with the mount
2027	 * point and process any dependencies that the flushing
2028	 * creates. In theory, this loop can happen at most twice,
2029	 * but we give it a few extra just to be sure.
2030	 */
2031	for (; loopcnt > 0; loopcnt--) {
2032		/*
2033		 * Do another flush in case any vnodes were brought in
2034		 * as part of the cleanup operations.
2035		 */
2036		early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
2037		    MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
2038		if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
2039			break;
2040		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
2041		    depcount == 0)
2042			break;
2043	}
2044	/*
2045	 * If we are unmounting then it is an error to fail. If we
2046	 * are simply trying to downgrade to read-only, then filesystem
2047	 * activity can keep us busy forever, so we just fail with EBUSY.
2048	 */
2049	if (loopcnt == 0) {
2050		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
2051			panic("softdep_flushfiles: looping");
2052		error = EBUSY;
2053	}
2054	if (!error)
2055		error = softdep_waitidle(oldmnt, flags);
2056	if (!error) {
2057		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
2058			retry = 0;
2059			MNT_ILOCK(oldmnt);
2060			morework = oldmnt->mnt_nvnodelistsize > 0;
2061#ifdef QUOTA
2062			ump = VFSTOUFS(oldmnt);
2063			UFS_LOCK(ump);
2064			for (i = 0; i < MAXQUOTAS; i++) {
2065				if (ump->um_quotas[i] != NULLVP)
2066					morework = 1;
2067			}
2068			UFS_UNLOCK(ump);
2069#endif
2070			if (morework) {
2071				if (--retry_flush_count > 0) {
2072					retry = 1;
2073					loopcnt = 3;
2074				} else
2075					error = EBUSY;
2076			}
2077			MNT_IUNLOCK(oldmnt);
2078			if (retry)
2079				goto retry_flush;
2080		}
2081	}
2082	return (error);
2083}
2084
2085/*
2086 * Structure hashing.
2087 *
2088 * There are four types of structures that can be looked up:
2089 *	1) pagedep structures identified by mount point, inode number,
2090 *	   and logical block.
2091 *	2) inodedep structures identified by mount point and inode number.
2092 *	3) newblk structures identified by mount point and
2093 *	   physical block number.
2094 *	4) bmsafemap structures identified by mount point and
2095 *	   cylinder group number.
2096 *
2097 * The "pagedep" and "inodedep" dependency structures are hashed
2098 * separately from the file blocks and inodes to which they correspond.
2099 * This separation helps when the in-memory copy of an inode or
2100 * file block must be replaced. It also obviates the need to access
2101 * an inode or file page when simply updating (or de-allocating)
2102 * dependency structures. Lookup of newblk structures is needed to
2103 * find newly allocated blocks when trying to associate them with
2104 * their allocdirect or allocindir structure.
2105 *
2106 * The lookup routines optionally create and hash a new instance when
2107 * an existing entry is not found. The bmsafemap lookup routine always
2108 * allocates a new structure if an existing one is not found.
2109 */
2110#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
2111
2112/*
2113 * Structures and routines associated with pagedep caching.
2114 */
2115#define	PAGEDEP_HASH(ump, inum, lbn) \
2116	(&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size])
2117
2118static int
2119pagedep_find(pagedephd, ino, lbn, pagedeppp)
2120	struct pagedep_hashhead *pagedephd;
2121	ino_t ino;
2122	ufs_lbn_t lbn;
2123	struct pagedep **pagedeppp;
2124{
2125	struct pagedep *pagedep;
2126
2127	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
2128		if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) {
2129			*pagedeppp = pagedep;
2130			return (1);
2131		}
2132	}
2133	*pagedeppp = NULL;
2134	return (0);
2135}
2136/*
2137 * Look up a pagedep. Return 1 if found, 0 otherwise.
2138 * If not found, allocate if DEPALLOC flag is passed.
2139 * Found or allocated entry is returned in pagedeppp.
2140 */
2141static int
2142pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
2143	struct mount *mp;
2144	struct buf *bp;
2145	ino_t ino;
2146	ufs_lbn_t lbn;
2147	int flags;
2148	struct pagedep **pagedeppp;
2149{
2150	struct pagedep *pagedep;
2151	struct pagedep_hashhead *pagedephd;
2152	struct worklist *wk;
2153	struct ufsmount *ump;
2154	int ret;
2155	int i;
2156
2157	ump = VFSTOUFS(mp);
2158	LOCK_OWNED(ump);
2159	if (bp) {
2160		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2161			if (wk->wk_type == D_PAGEDEP) {
2162				*pagedeppp = WK_PAGEDEP(wk);
2163				return (1);
2164			}
2165		}
2166	}
2167	pagedephd = PAGEDEP_HASH(ump, ino, lbn);
2168	ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2169	if (ret) {
2170		if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
2171			WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
2172		return (1);
2173	}
2174	if ((flags & DEPALLOC) == 0)
2175		return (0);
2176	FREE_LOCK(ump);
2177	pagedep = malloc(sizeof(struct pagedep),
2178	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
2179	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
2180	ACQUIRE_LOCK(ump);
2181	ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2182	if (*pagedeppp) {
2183		/*
2184		 * This should never happen since we only create pagedeps
2185		 * with the vnode lock held.  Could be an assert.
2186		 */
2187		WORKITEM_FREE(pagedep, D_PAGEDEP);
2188		return (ret);
2189	}
2190	pagedep->pd_ino = ino;
2191	pagedep->pd_lbn = lbn;
2192	LIST_INIT(&pagedep->pd_dirremhd);
2193	LIST_INIT(&pagedep->pd_pendinghd);
2194	for (i = 0; i < DAHASHSZ; i++)
2195		LIST_INIT(&pagedep->pd_diraddhd[i]);
2196	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
2197	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2198	*pagedeppp = pagedep;
2199	return (0);
2200}
2201
2202/*
2203 * Structures and routines associated with inodedep caching.
2204 */
2205#define	INODEDEP_HASH(ump, inum) \
2206      (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size])
2207
2208static int
2209inodedep_find(inodedephd, inum, inodedeppp)
2210	struct inodedep_hashhead *inodedephd;
2211	ino_t inum;
2212	struct inodedep **inodedeppp;
2213{
2214	struct inodedep *inodedep;
2215
2216	LIST_FOREACH(inodedep, inodedephd, id_hash)
2217		if (inum == inodedep->id_ino)
2218			break;
2219	if (inodedep) {
2220		*inodedeppp = inodedep;
2221		return (1);
2222	}
2223	*inodedeppp = NULL;
2224
2225	return (0);
2226}
2227/*
2228 * Look up an inodedep. Return 1 if found, 0 if not found.
2229 * If not found, allocate if DEPALLOC flag is passed.
2230 * Found or allocated entry is returned in inodedeppp.
2231 */
2232static int
2233inodedep_lookup(mp, inum, flags, inodedeppp)
2234	struct mount *mp;
2235	ino_t inum;
2236	int flags;
2237	struct inodedep **inodedeppp;
2238{
2239	struct inodedep *inodedep;
2240	struct inodedep_hashhead *inodedephd;
2241	struct ufsmount *ump;
2242	struct fs *fs;
2243
2244	ump = VFSTOUFS(mp);
2245	LOCK_OWNED(ump);
2246	fs = ump->um_fs;
2247	inodedephd = INODEDEP_HASH(ump, inum);
2248
2249	if (inodedep_find(inodedephd, inum, inodedeppp))
2250		return (1);
2251	if ((flags & DEPALLOC) == 0)
2252		return (0);
2253	/*
2254	 * If the system is over its limit and our filesystem is
2255	 * responsible for more than our share of that usage and
2256	 * we are not in a rush, request some inodedep cleanup.
2257	 */
2258	if (softdep_excess_items(ump, D_INODEDEP))
2259		schedule_cleanup(mp);
2260	else
2261		FREE_LOCK(ump);
2262	inodedep = malloc(sizeof(struct inodedep),
2263		M_INODEDEP, M_SOFTDEP_FLAGS);
2264	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
2265	ACQUIRE_LOCK(ump);
2266	if (inodedep_find(inodedephd, inum, inodedeppp)) {
2267		WORKITEM_FREE(inodedep, D_INODEDEP);
2268		return (1);
2269	}
2270	inodedep->id_fs = fs;
2271	inodedep->id_ino = inum;
2272	inodedep->id_state = ALLCOMPLETE;
2273	inodedep->id_nlinkdelta = 0;
2274	inodedep->id_nlinkwrote = -1;
2275	inodedep->id_savedino1 = NULL;
2276	inodedep->id_savedsize = -1;
2277	inodedep->id_savedextsize = -1;
2278	inodedep->id_savednlink = -1;
2279	inodedep->id_bmsafemap = NULL;
2280	inodedep->id_mkdiradd = NULL;
2281	LIST_INIT(&inodedep->id_dirremhd);
2282	LIST_INIT(&inodedep->id_pendinghd);
2283	LIST_INIT(&inodedep->id_inowait);
2284	LIST_INIT(&inodedep->id_bufwait);
2285	TAILQ_INIT(&inodedep->id_inoreflst);
2286	TAILQ_INIT(&inodedep->id_inoupdt);
2287	TAILQ_INIT(&inodedep->id_newinoupdt);
2288	TAILQ_INIT(&inodedep->id_extupdt);
2289	TAILQ_INIT(&inodedep->id_newextupdt);
2290	TAILQ_INIT(&inodedep->id_freeblklst);
2291	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
2292	*inodedeppp = inodedep;
2293	return (0);
2294}
2295
2296/*
2297 * Structures and routines associated with newblk caching.
2298 */
2299#define	NEWBLK_HASH(ump, inum) \
2300	(&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size])
2301
2302static int
2303newblk_find(newblkhd, newblkno, flags, newblkpp)
2304	struct newblk_hashhead *newblkhd;
2305	ufs2_daddr_t newblkno;
2306	int flags;
2307	struct newblk **newblkpp;
2308{
2309	struct newblk *newblk;
2310
2311	LIST_FOREACH(newblk, newblkhd, nb_hash) {
2312		if (newblkno != newblk->nb_newblkno)
2313			continue;
2314		/*
2315		 * If we're creating a new dependency don't match those that
2316		 * have already been converted to allocdirects.  This is for
2317		 * a frag extend.
2318		 */
2319		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
2320			continue;
2321		break;
2322	}
2323	if (newblk) {
2324		*newblkpp = newblk;
2325		return (1);
2326	}
2327	*newblkpp = NULL;
2328	return (0);
2329}
2330
2331/*
2332 * Look up a newblk. Return 1 if found, 0 if not found.
2333 * If not found, allocate if DEPALLOC flag is passed.
2334 * Found or allocated entry is returned in newblkpp.
2335 */
2336static int
2337newblk_lookup(mp, newblkno, flags, newblkpp)
2338	struct mount *mp;
2339	ufs2_daddr_t newblkno;
2340	int flags;
2341	struct newblk **newblkpp;
2342{
2343	struct newblk *newblk;
2344	struct newblk_hashhead *newblkhd;
2345	struct ufsmount *ump;
2346
2347	ump = VFSTOUFS(mp);
2348	LOCK_OWNED(ump);
2349	newblkhd = NEWBLK_HASH(ump, newblkno);
2350	if (newblk_find(newblkhd, newblkno, flags, newblkpp))
2351		return (1);
2352	if ((flags & DEPALLOC) == 0)
2353		return (0);
2354	if (softdep_excess_items(ump, D_NEWBLK) ||
2355	    softdep_excess_items(ump, D_ALLOCDIRECT) ||
2356	    softdep_excess_items(ump, D_ALLOCINDIR))
2357		schedule_cleanup(mp);
2358	else
2359		FREE_LOCK(ump);
2360	newblk = malloc(sizeof(union allblk), M_NEWBLK,
2361	    M_SOFTDEP_FLAGS | M_ZERO);
2362	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
2363	ACQUIRE_LOCK(ump);
2364	if (newblk_find(newblkhd, newblkno, flags, newblkpp)) {
2365		WORKITEM_FREE(newblk, D_NEWBLK);
2366		return (1);
2367	}
2368	newblk->nb_freefrag = NULL;
2369	LIST_INIT(&newblk->nb_indirdeps);
2370	LIST_INIT(&newblk->nb_newdirblk);
2371	LIST_INIT(&newblk->nb_jwork);
2372	newblk->nb_state = ATTACHED;
2373	newblk->nb_newblkno = newblkno;
2374	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2375	*newblkpp = newblk;
2376	return (0);
2377}
2378
2379/*
2380 * Structures and routines associated with freed indirect block caching.
2381 */
2382#define	INDIR_HASH(ump, blkno) \
2383	(&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size])
2384
2385/*
2386 * Lookup an indirect block in the indir hash table.  The freework is
2387 * removed and potentially freed.  The caller must do a blocking journal
2388 * write before writing to the blkno.
2389 */
2390static int
2391indirblk_lookup(mp, blkno)
2392	struct mount *mp;
2393	ufs2_daddr_t blkno;
2394{
2395	struct freework *freework;
2396	struct indir_hashhead *wkhd;
2397	struct ufsmount *ump;
2398
2399	ump = VFSTOUFS(mp);
2400	wkhd = INDIR_HASH(ump, blkno);
2401	TAILQ_FOREACH(freework, wkhd, fw_next) {
2402		if (freework->fw_blkno != blkno)
2403			continue;
2404		indirblk_remove(freework);
2405		return (1);
2406	}
2407	return (0);
2408}
2409
2410/*
2411 * Insert an indirect block represented by freework into the indirblk
2412 * hash table so that it may prevent the block from being re-used prior
2413 * to the journal being written.
2414 */
2415static void
2416indirblk_insert(freework)
2417	struct freework *freework;
2418{
2419	struct jblocks *jblocks;
2420	struct jseg *jseg;
2421	struct ufsmount *ump;
2422
2423	ump = VFSTOUFS(freework->fw_list.wk_mp);
2424	jblocks = ump->softdep_jblocks;
2425	jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
2426	if (jseg == NULL)
2427		return;
2428
2429	LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
2430	TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework,
2431	    fw_next);
2432	freework->fw_state &= ~DEPCOMPLETE;
2433}
2434
2435static void
2436indirblk_remove(freework)
2437	struct freework *freework;
2438{
2439	struct ufsmount *ump;
2440
2441	ump = VFSTOUFS(freework->fw_list.wk_mp);
2442	LIST_REMOVE(freework, fw_segs);
2443	TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next);
2444	freework->fw_state |= DEPCOMPLETE;
2445	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
2446		WORKITEM_FREE(freework, D_FREEWORK);
2447}
2448
2449/*
2450 * Executed during filesystem system initialization before
2451 * mounting any filesystems.
2452 */
2453void
2454softdep_initialize()
2455{
2456
2457	TAILQ_INIT(&softdepmounts);
2458#ifdef __LP64__
2459	max_softdeps = desiredvnodes * 4;
2460#else
2461	max_softdeps = desiredvnodes * 2;
2462#endif
2463
2464	/* initialise bioops hack */
2465	bioops.io_start = softdep_disk_io_initiation;
2466	bioops.io_complete = softdep_disk_write_complete;
2467	bioops.io_deallocate = softdep_deallocate_dependencies;
2468	bioops.io_countdeps = softdep_count_dependencies;
2469	softdep_ast_cleanup = softdep_ast_cleanup_proc;
2470
2471	/* Initialize the callout with an mtx. */
2472	callout_init_mtx(&softdep_callout, &lk, 0);
2473}
2474
2475/*
2476 * Executed after all filesystems have been unmounted during
2477 * filesystem module unload.
2478 */
2479void
2480softdep_uninitialize()
2481{
2482
2483	/* clear bioops hack */
2484	bioops.io_start = NULL;
2485	bioops.io_complete = NULL;
2486	bioops.io_deallocate = NULL;
2487	bioops.io_countdeps = NULL;
2488	softdep_ast_cleanup = NULL;
2489
2490	callout_drain(&softdep_callout);
2491}
2492
2493/*
2494 * Called at mount time to notify the dependency code that a
2495 * filesystem wishes to use it.
2496 */
2497int
2498softdep_mount(devvp, mp, fs, cred)
2499	struct vnode *devvp;
2500	struct mount *mp;
2501	struct fs *fs;
2502	struct ucred *cred;
2503{
2504	struct csum_total cstotal;
2505	struct mount_softdeps *sdp;
2506	struct ufsmount *ump;
2507	struct cg *cgp;
2508	struct buf *bp;
2509	u_int cyl, i;
2510	int error;
2511
2512	sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA,
2513	    M_WAITOK | M_ZERO);
2514	MNT_ILOCK(mp);
2515	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2516	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2517		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2518			MNTK_SOFTDEP | MNTK_NOASYNC;
2519	}
2520	ump = VFSTOUFS(mp);
2521	ump->um_softdep = sdp;
2522	MNT_IUNLOCK(mp);
2523	rw_init(LOCK_PTR(ump), "per-fs softdep");
2524	sdp->sd_ump = ump;
2525	LIST_INIT(&ump->softdep_workitem_pending);
2526	LIST_INIT(&ump->softdep_journal_pending);
2527	TAILQ_INIT(&ump->softdep_unlinked);
2528	LIST_INIT(&ump->softdep_dirtycg);
2529	ump->softdep_worklist_tail = NULL;
2530	ump->softdep_on_worklist = 0;
2531	ump->softdep_deps = 0;
2532	LIST_INIT(&ump->softdep_mkdirlisthd);
2533	ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
2534	    &ump->pagedep_hash_size);
2535	ump->pagedep_nextclean = 0;
2536	ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP,
2537	    &ump->inodedep_hash_size);
2538	ump->inodedep_nextclean = 0;
2539	ump->newblk_hashtbl = hashinit(max_softdeps / 2,  M_NEWBLK,
2540	    &ump->newblk_hash_size);
2541	ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP,
2542	    &ump->bmsafemap_hash_size);
2543	i = 1 << (ffs(desiredvnodes / 10) - 1);
2544	ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead),
2545	    M_FREEWORK, M_WAITOK);
2546	ump->indir_hash_size = i - 1;
2547	for (i = 0; i <= ump->indir_hash_size; i++)
2548		TAILQ_INIT(&ump->indir_hashtbl[i]);
2549#ifdef INVARIANTS
2550	for (i = 0; i <= D_LAST; i++)
2551		LIST_INIT(&ump->softdep_alldeps[i]);
2552#endif
2553	ACQUIRE_GBLLOCK(&lk);
2554	TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
2555	FREE_GBLLOCK(&lk);
2556	if ((fs->fs_flags & FS_SUJ) &&
2557	    (error = journal_mount(mp, fs, cred)) != 0) {
2558		printf("Failed to start journal: %d\n", error);
2559		softdep_unmount(mp);
2560		return (error);
2561	}
2562	/*
2563	 * Start our flushing thread in the bufdaemon process.
2564	 */
2565	ACQUIRE_LOCK(ump);
2566	ump->softdep_flags |= FLUSH_STARTING;
2567	FREE_LOCK(ump);
2568	kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc,
2569	    &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker",
2570	    mp->mnt_stat.f_mntonname);
2571	ACQUIRE_LOCK(ump);
2572	while ((ump->softdep_flags & FLUSH_STARTING) != 0) {
2573		msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart",
2574		    hz / 2);
2575	}
2576	FREE_LOCK(ump);
2577	/*
2578	 * When doing soft updates, the counters in the
2579	 * superblock may have gotten out of sync. Recomputation
2580	 * can take a long time and can be deferred for background
2581	 * fsck.  However, the old behavior of scanning the cylinder
2582	 * groups and recalculating them at mount time is available
2583	 * by setting vfs.ffs.compute_summary_at_mount to one.
2584	 */
2585	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2586		return (0);
2587	bzero(&cstotal, sizeof cstotal);
2588	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2589		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2590		    fs->fs_cgsize, cred, &bp)) != 0) {
2591			brelse(bp);
2592			softdep_unmount(mp);
2593			return (error);
2594		}
2595		cgp = (struct cg *)bp->b_data;
2596		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2597		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2598		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2599		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2600		fs->fs_cs(fs, cyl) = cgp->cg_cs;
2601		brelse(bp);
2602	}
2603#ifdef INVARIANTS
2604	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2605		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2606#endif
2607	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2608	return (0);
2609}
2610
2611void
2612softdep_unmount(mp)
2613	struct mount *mp;
2614{
2615	struct ufsmount *ump;
2616#ifdef INVARIANTS
2617	int i;
2618#endif
2619
2620	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
2621	    ("softdep_unmount called on non-softdep filesystem"));
2622	ump = VFSTOUFS(mp);
2623	MNT_ILOCK(mp);
2624	mp->mnt_flag &= ~MNT_SOFTDEP;
2625	if (MOUNTEDSUJ(mp) == 0) {
2626		MNT_IUNLOCK(mp);
2627	} else {
2628		mp->mnt_flag &= ~MNT_SUJ;
2629		MNT_IUNLOCK(mp);
2630		journal_unmount(ump);
2631	}
2632	/*
2633	 * Shut down our flushing thread. Check for NULL is if
2634	 * softdep_mount errors out before the thread has been created.
2635	 */
2636	if (ump->softdep_flushtd != NULL) {
2637		ACQUIRE_LOCK(ump);
2638		ump->softdep_flags |= FLUSH_EXIT;
2639		wakeup(&ump->softdep_flushtd);
2640		msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP,
2641		    "sdwait", 0);
2642		KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0,
2643		    ("Thread shutdown failed"));
2644	}
2645	/*
2646	 * Free up our resources.
2647	 */
2648	ACQUIRE_GBLLOCK(&lk);
2649	TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next);
2650	FREE_GBLLOCK(&lk);
2651	rw_destroy(LOCK_PTR(ump));
2652	hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size);
2653	hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size);
2654	hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size);
2655	hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP,
2656	    ump->bmsafemap_hash_size);
2657	free(ump->indir_hashtbl, M_FREEWORK);
2658#ifdef INVARIANTS
2659	for (i = 0; i <= D_LAST; i++) {
2660		KASSERT(ump->softdep_curdeps[i] == 0,
2661		    ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt,
2662		    TYPENAME(i), ump->softdep_curdeps[i]));
2663		KASSERT(LIST_EMPTY(&ump->softdep_alldeps[i]),
2664		    ("Unmount %s: Dep type %s not empty (%p)", ump->um_fs->fs_fsmnt,
2665		    TYPENAME(i), LIST_FIRST(&ump->softdep_alldeps[i])));
2666	}
2667#endif
2668	free(ump->um_softdep, M_MOUNTDATA);
2669}
2670
2671static struct jblocks *
2672jblocks_create(void)
2673{
2674	struct jblocks *jblocks;
2675
2676	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2677	TAILQ_INIT(&jblocks->jb_segs);
2678	jblocks->jb_avail = 10;
2679	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2680	    M_JBLOCKS, M_WAITOK | M_ZERO);
2681
2682	return (jblocks);
2683}
2684
2685static ufs2_daddr_t
2686jblocks_alloc(jblocks, bytes, actual)
2687	struct jblocks *jblocks;
2688	int bytes;
2689	int *actual;
2690{
2691	ufs2_daddr_t daddr;
2692	struct jextent *jext;
2693	int freecnt;
2694	int blocks;
2695
2696	blocks = bytes / DEV_BSIZE;
2697	jext = &jblocks->jb_extent[jblocks->jb_head];
2698	freecnt = jext->je_blocks - jblocks->jb_off;
2699	if (freecnt == 0) {
2700		jblocks->jb_off = 0;
2701		if (++jblocks->jb_head > jblocks->jb_used)
2702			jblocks->jb_head = 0;
2703		jext = &jblocks->jb_extent[jblocks->jb_head];
2704		freecnt = jext->je_blocks;
2705	}
2706	if (freecnt > blocks)
2707		freecnt = blocks;
2708	*actual = freecnt * DEV_BSIZE;
2709	daddr = jext->je_daddr + jblocks->jb_off;
2710	jblocks->jb_off += freecnt;
2711	jblocks->jb_free -= freecnt;
2712
2713	return (daddr);
2714}
2715
2716static void
2717jblocks_free(jblocks, mp, bytes)
2718	struct jblocks *jblocks;
2719	struct mount *mp;
2720	int bytes;
2721{
2722
2723	LOCK_OWNED(VFSTOUFS(mp));
2724	jblocks->jb_free += bytes / DEV_BSIZE;
2725	if (jblocks->jb_suspended)
2726		worklist_speedup(mp);
2727	wakeup(jblocks);
2728}
2729
2730static void
2731jblocks_destroy(jblocks)
2732	struct jblocks *jblocks;
2733{
2734
2735	if (jblocks->jb_extent)
2736		free(jblocks->jb_extent, M_JBLOCKS);
2737	free(jblocks, M_JBLOCKS);
2738}
2739
2740static void
2741jblocks_add(jblocks, daddr, blocks)
2742	struct jblocks *jblocks;
2743	ufs2_daddr_t daddr;
2744	int blocks;
2745{
2746	struct jextent *jext;
2747
2748	jblocks->jb_blocks += blocks;
2749	jblocks->jb_free += blocks;
2750	jext = &jblocks->jb_extent[jblocks->jb_used];
2751	/* Adding the first block. */
2752	if (jext->je_daddr == 0) {
2753		jext->je_daddr = daddr;
2754		jext->je_blocks = blocks;
2755		return;
2756	}
2757	/* Extending the last extent. */
2758	if (jext->je_daddr + jext->je_blocks == daddr) {
2759		jext->je_blocks += blocks;
2760		return;
2761	}
2762	/* Adding a new extent. */
2763	if (++jblocks->jb_used == jblocks->jb_avail) {
2764		jblocks->jb_avail *= 2;
2765		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2766		    M_JBLOCKS, M_WAITOK | M_ZERO);
2767		memcpy(jext, jblocks->jb_extent,
2768		    sizeof(struct jextent) * jblocks->jb_used);
2769		free(jblocks->jb_extent, M_JBLOCKS);
2770		jblocks->jb_extent = jext;
2771	}
2772	jext = &jblocks->jb_extent[jblocks->jb_used];
2773	jext->je_daddr = daddr;
2774	jext->je_blocks = blocks;
2775	return;
2776}
2777
2778int
2779softdep_journal_lookup(mp, vpp)
2780	struct mount *mp;
2781	struct vnode **vpp;
2782{
2783	struct componentname cnp;
2784	struct vnode *dvp;
2785	ino_t sujournal;
2786	int error;
2787
2788	error = VFS_VGET(mp, UFS_ROOTINO, LK_EXCLUSIVE, &dvp);
2789	if (error)
2790		return (error);
2791	bzero(&cnp, sizeof(cnp));
2792	cnp.cn_nameiop = LOOKUP;
2793	cnp.cn_flags = ISLASTCN;
2794	cnp.cn_thread = curthread;
2795	cnp.cn_cred = curthread->td_ucred;
2796	cnp.cn_pnbuf = SUJ_FILE;
2797	cnp.cn_nameptr = SUJ_FILE;
2798	cnp.cn_namelen = strlen(SUJ_FILE);
2799	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2800	vput(dvp);
2801	if (error != 0)
2802		return (error);
2803	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2804	return (error);
2805}
2806
2807/*
2808 * Open and verify the journal file.
2809 */
2810static int
2811journal_mount(mp, fs, cred)
2812	struct mount *mp;
2813	struct fs *fs;
2814	struct ucred *cred;
2815{
2816	struct jblocks *jblocks;
2817	struct ufsmount *ump;
2818	struct vnode *vp;
2819	struct inode *ip;
2820	ufs2_daddr_t blkno;
2821	int bcount;
2822	int error;
2823	int i;
2824
2825	ump = VFSTOUFS(mp);
2826	ump->softdep_journal_tail = NULL;
2827	ump->softdep_on_journal = 0;
2828	ump->softdep_accdeps = 0;
2829	ump->softdep_req = 0;
2830	ump->softdep_jblocks = NULL;
2831	error = softdep_journal_lookup(mp, &vp);
2832	if (error != 0) {
2833		printf("Failed to find journal.  Use tunefs to create one\n");
2834		return (error);
2835	}
2836	ip = VTOI(vp);
2837	if (ip->i_size < SUJ_MIN) {
2838		error = ENOSPC;
2839		goto out;
2840	}
2841	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
2842	jblocks = jblocks_create();
2843	for (i = 0; i < bcount; i++) {
2844		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2845		if (error)
2846			break;
2847		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2848	}
2849	if (error) {
2850		jblocks_destroy(jblocks);
2851		goto out;
2852	}
2853	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
2854	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2855	ump->softdep_jblocks = jblocks;
2856out:
2857	if (error == 0) {
2858		MNT_ILOCK(mp);
2859		mp->mnt_flag |= MNT_SUJ;
2860		mp->mnt_flag &= ~MNT_SOFTDEP;
2861		MNT_IUNLOCK(mp);
2862		/*
2863		 * Only validate the journal contents if the
2864		 * filesystem is clean, otherwise we write the logs
2865		 * but they'll never be used.  If the filesystem was
2866		 * still dirty when we mounted it the journal is
2867		 * invalid and a new journal can only be valid if it
2868		 * starts from a clean mount.
2869		 */
2870		if (fs->fs_clean) {
2871			DIP_SET(ip, i_modrev, fs->fs_mtime);
2872			ip->i_flags |= IN_MODIFIED;
2873			ffs_update(vp, 1);
2874		}
2875	}
2876	vput(vp);
2877	return (error);
2878}
2879
2880static void
2881journal_unmount(ump)
2882	struct ufsmount *ump;
2883{
2884
2885	if (ump->softdep_jblocks)
2886		jblocks_destroy(ump->softdep_jblocks);
2887	ump->softdep_jblocks = NULL;
2888}
2889
2890/*
2891 * Called when a journal record is ready to be written.  Space is allocated
2892 * and the journal entry is created when the journal is flushed to stable
2893 * store.
2894 */
2895static void
2896add_to_journal(wk)
2897	struct worklist *wk;
2898{
2899	struct ufsmount *ump;
2900
2901	ump = VFSTOUFS(wk->wk_mp);
2902	LOCK_OWNED(ump);
2903	if (wk->wk_state & ONWORKLIST)
2904		panic("add_to_journal: %s(0x%X) already on list",
2905		    TYPENAME(wk->wk_type), wk->wk_state);
2906	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2907	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2908		ump->softdep_jblocks->jb_age = ticks;
2909		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2910	} else
2911		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2912	ump->softdep_journal_tail = wk;
2913	ump->softdep_on_journal += 1;
2914}
2915
2916/*
2917 * Remove an arbitrary item for the journal worklist maintain the tail
2918 * pointer.  This happens when a new operation obviates the need to
2919 * journal an old operation.
2920 */
2921static void
2922remove_from_journal(wk)
2923	struct worklist *wk;
2924{
2925	struct ufsmount *ump;
2926
2927	ump = VFSTOUFS(wk->wk_mp);
2928	LOCK_OWNED(ump);
2929#ifdef INVARIANTS
2930	{
2931		struct worklist *wkn;
2932
2933		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2934			if (wkn == wk)
2935				break;
2936		if (wkn == NULL)
2937			panic("remove_from_journal: %p is not in journal", wk);
2938	}
2939#endif
2940	/*
2941	 * We emulate a TAILQ to save space in most structures which do not
2942	 * require TAILQ semantics.  Here we must update the tail position
2943	 * when removing the tail which is not the final entry. This works
2944	 * only if the worklist linkage are at the beginning of the structure.
2945	 */
2946	if (ump->softdep_journal_tail == wk)
2947		ump->softdep_journal_tail =
2948		    (struct worklist *)wk->wk_list.le_prev;
2949	WORKLIST_REMOVE(wk);
2950	ump->softdep_on_journal -= 1;
2951}
2952
2953/*
2954 * Check for journal space as well as dependency limits so the prelink
2955 * code can throttle both journaled and non-journaled filesystems.
2956 * Threshold is 0 for low and 1 for min.
2957 */
2958static int
2959journal_space(ump, thresh)
2960	struct ufsmount *ump;
2961	int thresh;
2962{
2963	struct jblocks *jblocks;
2964	int limit, avail;
2965
2966	jblocks = ump->softdep_jblocks;
2967	if (jblocks == NULL)
2968		return (1);
2969	/*
2970	 * We use a tighter restriction here to prevent request_cleanup()
2971	 * running in threads from running into locks we currently hold.
2972	 * We have to be over the limit and our filesystem has to be
2973	 * responsible for more than our share of that usage.
2974	 */
2975	limit = (max_softdeps / 10) * 9;
2976	if (dep_current[D_INODEDEP] > limit &&
2977	    ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads)
2978		return (0);
2979	if (thresh)
2980		thresh = jblocks->jb_min;
2981	else
2982		thresh = jblocks->jb_low;
2983	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2984	avail = jblocks->jb_free - avail;
2985
2986	return (avail > thresh);
2987}
2988
2989static void
2990journal_suspend(ump)
2991	struct ufsmount *ump;
2992{
2993	struct jblocks *jblocks;
2994	struct mount *mp;
2995	bool set;
2996
2997	mp = UFSTOVFS(ump);
2998	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0)
2999		return;
3000
3001	jblocks = ump->softdep_jblocks;
3002	vfs_op_enter(mp);
3003	set = false;
3004	MNT_ILOCK(mp);
3005	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
3006		stat_journal_min++;
3007		mp->mnt_kern_flag |= MNTK_SUSPEND;
3008		mp->mnt_susp_owner = ump->softdep_flushtd;
3009		set = true;
3010	}
3011	jblocks->jb_suspended = 1;
3012	MNT_IUNLOCK(mp);
3013	if (!set)
3014		vfs_op_exit(mp);
3015}
3016
3017static int
3018journal_unsuspend(struct ufsmount *ump)
3019{
3020	struct jblocks *jblocks;
3021	struct mount *mp;
3022
3023	mp = UFSTOVFS(ump);
3024	jblocks = ump->softdep_jblocks;
3025
3026	if (jblocks != NULL && jblocks->jb_suspended &&
3027	    journal_space(ump, jblocks->jb_min)) {
3028		jblocks->jb_suspended = 0;
3029		FREE_LOCK(ump);
3030		mp->mnt_susp_owner = curthread;
3031		vfs_write_resume(mp, 0);
3032		ACQUIRE_LOCK(ump);
3033		return (1);
3034	}
3035	return (0);
3036}
3037
3038/*
3039 * Called before any allocation function to be certain that there is
3040 * sufficient space in the journal prior to creating any new records.
3041 * Since in the case of block allocation we may have multiple locked
3042 * buffers at the time of the actual allocation we can not block
3043 * when the journal records are created.  Doing so would create a deadlock
3044 * if any of these buffers needed to be flushed to reclaim space.  Instead
3045 * we require a sufficiently large amount of available space such that
3046 * each thread in the system could have passed this allocation check and
3047 * still have sufficient free space.  With 20% of a minimum journal size
3048 * of 1MB we have 6553 records available.
3049 */
3050int
3051softdep_prealloc(vp, waitok)
3052	struct vnode *vp;
3053	int waitok;
3054{
3055	struct ufsmount *ump;
3056
3057	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
3058	    ("softdep_prealloc called on non-softdep filesystem"));
3059	/*
3060	 * Nothing to do if we are not running journaled soft updates.
3061	 * If we currently hold the snapshot lock, we must avoid
3062	 * handling other resources that could cause deadlock.  Do not
3063	 * touch quotas vnode since it is typically recursed with
3064	 * other vnode locks held.
3065	 */
3066	if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)) ||
3067	    (vp->v_vflag & VV_SYSTEM) != 0)
3068		return (0);
3069	ump = VFSTOUFS(vp->v_mount);
3070	ACQUIRE_LOCK(ump);
3071	if (journal_space(ump, 0)) {
3072		FREE_LOCK(ump);
3073		return (0);
3074	}
3075	stat_journal_low++;
3076	FREE_LOCK(ump);
3077	if (waitok == MNT_NOWAIT)
3078		return (ENOSPC);
3079	/*
3080	 * Attempt to sync this vnode once to flush any journal
3081	 * work attached to it.
3082	 */
3083	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
3084		ffs_syncvnode(vp, waitok, 0);
3085	ACQUIRE_LOCK(ump);
3086	process_removes(vp);
3087	process_truncates(vp);
3088	if (journal_space(ump, 0) == 0) {
3089		softdep_speedup(ump);
3090		if (journal_space(ump, 1) == 0)
3091			journal_suspend(ump);
3092	}
3093	FREE_LOCK(ump);
3094
3095	return (0);
3096}
3097
3098/*
3099 * Before adjusting a link count on a vnode verify that we have sufficient
3100 * journal space.  If not, process operations that depend on the currently
3101 * locked pair of vnodes to try to flush space as the syncer, buf daemon,
3102 * and softdep flush threads can not acquire these locks to reclaim space.
3103 */
3104static void
3105softdep_prelink(dvp, vp)
3106	struct vnode *dvp;
3107	struct vnode *vp;
3108{
3109	struct ufsmount *ump;
3110
3111	ump = VFSTOUFS(dvp->v_mount);
3112	LOCK_OWNED(ump);
3113	/*
3114	 * Nothing to do if we have sufficient journal space.
3115	 * If we currently hold the snapshot lock, we must avoid
3116	 * handling other resources that could cause deadlock.
3117	 */
3118	if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
3119		return;
3120	stat_journal_low++;
3121	FREE_LOCK(ump);
3122	if (vp)
3123		ffs_syncvnode(vp, MNT_NOWAIT, 0);
3124	ffs_syncvnode(dvp, MNT_WAIT, 0);
3125	ACQUIRE_LOCK(ump);
3126	/* Process vp before dvp as it may create .. removes. */
3127	if (vp) {
3128		process_removes(vp);
3129		process_truncates(vp);
3130	}
3131	process_removes(dvp);
3132	process_truncates(dvp);
3133	softdep_speedup(ump);
3134	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
3135	if (journal_space(ump, 0) == 0) {
3136		softdep_speedup(ump);
3137		if (journal_space(ump, 1) == 0)
3138			journal_suspend(ump);
3139	}
3140}
3141
3142static void
3143jseg_write(ump, jseg, data)
3144	struct ufsmount *ump;
3145	struct jseg *jseg;
3146	uint8_t *data;
3147{
3148	struct jsegrec *rec;
3149
3150	rec = (struct jsegrec *)data;
3151	rec->jsr_seq = jseg->js_seq;
3152	rec->jsr_oldest = jseg->js_oldseq;
3153	rec->jsr_cnt = jseg->js_cnt;
3154	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
3155	rec->jsr_crc = 0;
3156	rec->jsr_time = ump->um_fs->fs_mtime;
3157}
3158
3159static inline void
3160inoref_write(inoref, jseg, rec)
3161	struct inoref *inoref;
3162	struct jseg *jseg;
3163	struct jrefrec *rec;
3164{
3165
3166	inoref->if_jsegdep->jd_seg = jseg;
3167	rec->jr_ino = inoref->if_ino;
3168	rec->jr_parent = inoref->if_parent;
3169	rec->jr_nlink = inoref->if_nlink;
3170	rec->jr_mode = inoref->if_mode;
3171	rec->jr_diroff = inoref->if_diroff;
3172}
3173
3174static void
3175jaddref_write(jaddref, jseg, data)
3176	struct jaddref *jaddref;
3177	struct jseg *jseg;
3178	uint8_t *data;
3179{
3180	struct jrefrec *rec;
3181
3182	rec = (struct jrefrec *)data;
3183	rec->jr_op = JOP_ADDREF;
3184	inoref_write(&jaddref->ja_ref, jseg, rec);
3185}
3186
3187static void
3188jremref_write(jremref, jseg, data)
3189	struct jremref *jremref;
3190	struct jseg *jseg;
3191	uint8_t *data;
3192{
3193	struct jrefrec *rec;
3194
3195	rec = (struct jrefrec *)data;
3196	rec->jr_op = JOP_REMREF;
3197	inoref_write(&jremref->jr_ref, jseg, rec);
3198}
3199
3200static void
3201jmvref_write(jmvref, jseg, data)
3202	struct jmvref *jmvref;
3203	struct jseg *jseg;
3204	uint8_t *data;
3205{
3206	struct jmvrec *rec;
3207
3208	rec = (struct jmvrec *)data;
3209	rec->jm_op = JOP_MVREF;
3210	rec->jm_ino = jmvref->jm_ino;
3211	rec->jm_parent = jmvref->jm_parent;
3212	rec->jm_oldoff = jmvref->jm_oldoff;
3213	rec->jm_newoff = jmvref->jm_newoff;
3214}
3215
3216static void
3217jnewblk_write(jnewblk, jseg, data)
3218	struct jnewblk *jnewblk;
3219	struct jseg *jseg;
3220	uint8_t *data;
3221{
3222	struct jblkrec *rec;
3223
3224	jnewblk->jn_jsegdep->jd_seg = jseg;
3225	rec = (struct jblkrec *)data;
3226	rec->jb_op = JOP_NEWBLK;
3227	rec->jb_ino = jnewblk->jn_ino;
3228	rec->jb_blkno = jnewblk->jn_blkno;
3229	rec->jb_lbn = jnewblk->jn_lbn;
3230	rec->jb_frags = jnewblk->jn_frags;
3231	rec->jb_oldfrags = jnewblk->jn_oldfrags;
3232}
3233
3234static void
3235jfreeblk_write(jfreeblk, jseg, data)
3236	struct jfreeblk *jfreeblk;
3237	struct jseg *jseg;
3238	uint8_t *data;
3239{
3240	struct jblkrec *rec;
3241
3242	jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
3243	rec = (struct jblkrec *)data;
3244	rec->jb_op = JOP_FREEBLK;
3245	rec->jb_ino = jfreeblk->jf_ino;
3246	rec->jb_blkno = jfreeblk->jf_blkno;
3247	rec->jb_lbn = jfreeblk->jf_lbn;
3248	rec->jb_frags = jfreeblk->jf_frags;
3249	rec->jb_oldfrags = 0;
3250}
3251
3252static void
3253jfreefrag_write(jfreefrag, jseg, data)
3254	struct jfreefrag *jfreefrag;
3255	struct jseg *jseg;
3256	uint8_t *data;
3257{
3258	struct jblkrec *rec;
3259
3260	jfreefrag->fr_jsegdep->jd_seg = jseg;
3261	rec = (struct jblkrec *)data;
3262	rec->jb_op = JOP_FREEBLK;
3263	rec->jb_ino = jfreefrag->fr_ino;
3264	rec->jb_blkno = jfreefrag->fr_blkno;
3265	rec->jb_lbn = jfreefrag->fr_lbn;
3266	rec->jb_frags = jfreefrag->fr_frags;
3267	rec->jb_oldfrags = 0;
3268}
3269
3270static void
3271jtrunc_write(jtrunc, jseg, data)
3272	struct jtrunc *jtrunc;
3273	struct jseg *jseg;
3274	uint8_t *data;
3275{
3276	struct jtrncrec *rec;
3277
3278	jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
3279	rec = (struct jtrncrec *)data;
3280	rec->jt_op = JOP_TRUNC;
3281	rec->jt_ino = jtrunc->jt_ino;
3282	rec->jt_size = jtrunc->jt_size;
3283	rec->jt_extsize = jtrunc->jt_extsize;
3284}
3285
3286static void
3287jfsync_write(jfsync, jseg, data)
3288	struct jfsync *jfsync;
3289	struct jseg *jseg;
3290	uint8_t *data;
3291{
3292	struct jtrncrec *rec;
3293
3294	rec = (struct jtrncrec *)data;
3295	rec->jt_op = JOP_SYNC;
3296	rec->jt_ino = jfsync->jfs_ino;
3297	rec->jt_size = jfsync->jfs_size;
3298	rec->jt_extsize = jfsync->jfs_extsize;
3299}
3300
3301static void
3302softdep_flushjournal(mp)
3303	struct mount *mp;
3304{
3305	struct jblocks *jblocks;
3306	struct ufsmount *ump;
3307
3308	if (MOUNTEDSUJ(mp) == 0)
3309		return;
3310	ump = VFSTOUFS(mp);
3311	jblocks = ump->softdep_jblocks;
3312	ACQUIRE_LOCK(ump);
3313	while (ump->softdep_on_journal) {
3314		jblocks->jb_needseg = 1;
3315		softdep_process_journal(mp, NULL, MNT_WAIT);
3316	}
3317	FREE_LOCK(ump);
3318}
3319
3320static void softdep_synchronize_completed(struct bio *);
3321static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
3322
3323static void
3324softdep_synchronize_completed(bp)
3325        struct bio *bp;
3326{
3327	struct jseg *oldest;
3328	struct jseg *jseg;
3329	struct ufsmount *ump;
3330
3331	/*
3332	 * caller1 marks the last segment written before we issued the
3333	 * synchronize cache.
3334	 */
3335	jseg = bp->bio_caller1;
3336	if (jseg == NULL) {
3337		g_destroy_bio(bp);
3338		return;
3339	}
3340	ump = VFSTOUFS(jseg->js_list.wk_mp);
3341	ACQUIRE_LOCK(ump);
3342	oldest = NULL;
3343	/*
3344	 * Mark all the journal entries waiting on the synchronize cache
3345	 * as completed so they may continue on.
3346	 */
3347	while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
3348		jseg->js_state |= COMPLETE;
3349		oldest = jseg;
3350		jseg = TAILQ_PREV(jseg, jseglst, js_next);
3351	}
3352	/*
3353	 * Restart deferred journal entry processing from the oldest
3354	 * completed jseg.
3355	 */
3356	if (oldest)
3357		complete_jsegs(oldest);
3358
3359	FREE_LOCK(ump);
3360	g_destroy_bio(bp);
3361}
3362
3363/*
3364 * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
3365 * barriers.  The journal must be written prior to any blocks that depend
3366 * on it and the journal can not be released until the blocks have be
3367 * written.  This code handles both barriers simultaneously.
3368 */
3369static void
3370softdep_synchronize(bp, ump, caller1)
3371	struct bio *bp;
3372	struct ufsmount *ump;
3373	void *caller1;
3374{
3375
3376	bp->bio_cmd = BIO_FLUSH;
3377	bp->bio_flags |= BIO_ORDERED;
3378	bp->bio_data = NULL;
3379	bp->bio_offset = ump->um_cp->provider->mediasize;
3380	bp->bio_length = 0;
3381	bp->bio_done = softdep_synchronize_completed;
3382	bp->bio_caller1 = caller1;
3383	g_io_request(bp, ump->um_cp);
3384}
3385
3386/*
3387 * Flush some journal records to disk.
3388 */
3389static void
3390softdep_process_journal(mp, needwk, flags)
3391	struct mount *mp;
3392	struct worklist *needwk;
3393	int flags;
3394{
3395	struct jblocks *jblocks;
3396	struct ufsmount *ump;
3397	struct worklist *wk;
3398	struct jseg *jseg;
3399	struct buf *bp;
3400	struct bio *bio;
3401	uint8_t *data;
3402	struct fs *fs;
3403	int shouldflush;
3404	int segwritten;
3405	int jrecmin;	/* Minimum records per block. */
3406	int jrecmax;	/* Maximum records per block. */
3407	int size;
3408	int cnt;
3409	int off;
3410	int devbsize;
3411
3412	if (MOUNTEDSUJ(mp) == 0)
3413		return;
3414	shouldflush = softdep_flushcache;
3415	bio = NULL;
3416	jseg = NULL;
3417	ump = VFSTOUFS(mp);
3418	LOCK_OWNED(ump);
3419	fs = ump->um_fs;
3420	jblocks = ump->softdep_jblocks;
3421	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
3422	/*
3423	 * We write anywhere between a disk block and fs block.  The upper
3424	 * bound is picked to prevent buffer cache fragmentation and limit
3425	 * processing time per I/O.
3426	 */
3427	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
3428	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
3429	segwritten = 0;
3430	for (;;) {
3431		cnt = ump->softdep_on_journal;
3432		/*
3433		 * Criteria for writing a segment:
3434		 * 1) We have a full block.
3435		 * 2) We're called from jwait() and haven't found the
3436		 *    journal item yet.
3437		 * 3) Always write if needseg is set.
3438		 * 4) If we are called from process_worklist and have
3439		 *    not yet written anything we write a partial block
3440		 *    to enforce a 1 second maximum latency on journal
3441		 *    entries.
3442		 */
3443		if (cnt < (jrecmax - 1) && needwk == NULL &&
3444		    jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
3445			break;
3446		cnt++;
3447		/*
3448		 * Verify some free journal space.  softdep_prealloc() should
3449		 * guarantee that we don't run out so this is indicative of
3450		 * a problem with the flow control.  Try to recover
3451		 * gracefully in any event.
3452		 */
3453		while (jblocks->jb_free == 0) {
3454			if (flags != MNT_WAIT)
3455				break;
3456			printf("softdep: Out of journal space!\n");
3457			softdep_speedup(ump);
3458			msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
3459		}
3460		FREE_LOCK(ump);
3461		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
3462		workitem_alloc(&jseg->js_list, D_JSEG, mp);
3463		LIST_INIT(&jseg->js_entries);
3464		LIST_INIT(&jseg->js_indirs);
3465		jseg->js_state = ATTACHED;
3466		if (shouldflush == 0)
3467			jseg->js_state |= COMPLETE;
3468		else if (bio == NULL)
3469			bio = g_alloc_bio();
3470		jseg->js_jblocks = jblocks;
3471		bp = geteblk(fs->fs_bsize, 0);
3472		ACQUIRE_LOCK(ump);
3473		/*
3474		 * If there was a race while we were allocating the block
3475		 * and jseg the entry we care about was likely written.
3476		 * We bail out in both the WAIT and NOWAIT case and assume
3477		 * the caller will loop if the entry it cares about is
3478		 * not written.
3479		 */
3480		cnt = ump->softdep_on_journal;
3481		if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
3482			bp->b_flags |= B_INVAL | B_NOCACHE;
3483			WORKITEM_FREE(jseg, D_JSEG);
3484			FREE_LOCK(ump);
3485			brelse(bp);
3486			ACQUIRE_LOCK(ump);
3487			break;
3488		}
3489		/*
3490		 * Calculate the disk block size required for the available
3491		 * records rounded to the min size.
3492		 */
3493		if (cnt == 0)
3494			size = devbsize;
3495		else if (cnt < jrecmax)
3496			size = howmany(cnt, jrecmin) * devbsize;
3497		else
3498			size = fs->fs_bsize;
3499		/*
3500		 * Allocate a disk block for this journal data and account
3501		 * for truncation of the requested size if enough contiguous
3502		 * space was not available.
3503		 */
3504		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
3505		bp->b_lblkno = bp->b_blkno;
3506		bp->b_offset = bp->b_blkno * DEV_BSIZE;
3507		bp->b_bcount = size;
3508		bp->b_flags &= ~B_INVAL;
3509		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
3510		/*
3511		 * Initialize our jseg with cnt records.  Assign the next
3512		 * sequence number to it and link it in-order.
3513		 */
3514		cnt = MIN(cnt, (size / devbsize) * jrecmin);
3515		jseg->js_buf = bp;
3516		jseg->js_cnt = cnt;
3517		jseg->js_refs = cnt + 1;	/* Self ref. */
3518		jseg->js_size = size;
3519		jseg->js_seq = jblocks->jb_nextseq++;
3520		if (jblocks->jb_oldestseg == NULL)
3521			jblocks->jb_oldestseg = jseg;
3522		jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
3523		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
3524		if (jblocks->jb_writeseg == NULL)
3525			jblocks->jb_writeseg = jseg;
3526		/*
3527		 * Start filling in records from the pending list.
3528		 */
3529		data = bp->b_data;
3530		off = 0;
3531
3532		/*
3533		 * Always put a header on the first block.
3534		 * XXX As with below, there might not be a chance to get
3535		 * into the loop.  Ensure that something valid is written.
3536		 */
3537		jseg_write(ump, jseg, data);
3538		off += JREC_SIZE;
3539		data = bp->b_data + off;
3540
3541		/*
3542		 * XXX Something is wrong here.  There's no work to do,
3543		 * but we need to perform and I/O and allow it to complete
3544		 * anyways.
3545		 */
3546		if (LIST_EMPTY(&ump->softdep_journal_pending))
3547			stat_emptyjblocks++;
3548
3549		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
3550		    != NULL) {
3551			if (cnt == 0)
3552				break;
3553			/* Place a segment header on every device block. */
3554			if ((off % devbsize) == 0) {
3555				jseg_write(ump, jseg, data);
3556				off += JREC_SIZE;
3557				data = bp->b_data + off;
3558			}
3559			if (wk == needwk)
3560				needwk = NULL;
3561			remove_from_journal(wk);
3562			wk->wk_state |= INPROGRESS;
3563			WORKLIST_INSERT(&jseg->js_entries, wk);
3564			switch (wk->wk_type) {
3565			case D_JADDREF:
3566				jaddref_write(WK_JADDREF(wk), jseg, data);
3567				break;
3568			case D_JREMREF:
3569				jremref_write(WK_JREMREF(wk), jseg, data);
3570				break;
3571			case D_JMVREF:
3572				jmvref_write(WK_JMVREF(wk), jseg, data);
3573				break;
3574			case D_JNEWBLK:
3575				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
3576				break;
3577			case D_JFREEBLK:
3578				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
3579				break;
3580			case D_JFREEFRAG:
3581				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
3582				break;
3583			case D_JTRUNC:
3584				jtrunc_write(WK_JTRUNC(wk), jseg, data);
3585				break;
3586			case D_JFSYNC:
3587				jfsync_write(WK_JFSYNC(wk), jseg, data);
3588				break;
3589			default:
3590				panic("process_journal: Unknown type %s",
3591				    TYPENAME(wk->wk_type));
3592				/* NOTREACHED */
3593			}
3594			off += JREC_SIZE;
3595			data = bp->b_data + off;
3596			cnt--;
3597		}
3598
3599		/* Clear any remaining space so we don't leak kernel data */
3600		if (size > off)
3601			bzero(data, size - off);
3602
3603		/*
3604		 * Write this one buffer and continue.
3605		 */
3606		segwritten = 1;
3607		jblocks->jb_needseg = 0;
3608		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
3609		FREE_LOCK(ump);
3610		bp->b_xflags |= BX_CVTENXIO;
3611		pbgetvp(ump->um_devvp, bp);
3612		/*
3613		 * We only do the blocking wait once we find the journal
3614		 * entry we're looking for.
3615		 */
3616		if (needwk == NULL && flags == MNT_WAIT)
3617			bwrite(bp);
3618		else
3619			bawrite(bp);
3620		ACQUIRE_LOCK(ump);
3621	}
3622	/*
3623	 * If we wrote a segment issue a synchronize cache so the journal
3624	 * is reflected on disk before the data is written.  Since reclaiming
3625	 * journal space also requires writing a journal record this
3626	 * process also enforces a barrier before reclamation.
3627	 */
3628	if (segwritten && shouldflush) {
3629		softdep_synchronize(bio, ump,
3630		    TAILQ_LAST(&jblocks->jb_segs, jseglst));
3631	} else if (bio)
3632		g_destroy_bio(bio);
3633	/*
3634	 * If we've suspended the filesystem because we ran out of journal
3635	 * space either try to sync it here to make some progress or
3636	 * unsuspend it if we already have.
3637	 */
3638	if (flags == 0 && jblocks->jb_suspended) {
3639		if (journal_unsuspend(ump))
3640			return;
3641		FREE_LOCK(ump);
3642		VFS_SYNC(mp, MNT_NOWAIT);
3643		ffs_sbupdate(ump, MNT_WAIT, 0);
3644		ACQUIRE_LOCK(ump);
3645	}
3646}
3647
3648/*
3649 * Complete a jseg, allowing all dependencies awaiting journal writes
3650 * to proceed.  Each journal dependency also attaches a jsegdep to dependent
3651 * structures so that the journal segment can be freed to reclaim space.
3652 */
3653static void
3654complete_jseg(jseg)
3655	struct jseg *jseg;
3656{
3657	struct worklist *wk;
3658	struct jmvref *jmvref;
3659#ifdef INVARIANTS
3660	int i = 0;
3661#endif
3662
3663	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
3664		WORKLIST_REMOVE(wk);
3665		wk->wk_state &= ~INPROGRESS;
3666		wk->wk_state |= COMPLETE;
3667		KASSERT(i++ < jseg->js_cnt,
3668		    ("handle_written_jseg: overflow %d >= %d",
3669		    i - 1, jseg->js_cnt));
3670		switch (wk->wk_type) {
3671		case D_JADDREF:
3672			handle_written_jaddref(WK_JADDREF(wk));
3673			break;
3674		case D_JREMREF:
3675			handle_written_jremref(WK_JREMREF(wk));
3676			break;
3677		case D_JMVREF:
3678			rele_jseg(jseg);	/* No jsegdep. */
3679			jmvref = WK_JMVREF(wk);
3680			LIST_REMOVE(jmvref, jm_deps);
3681			if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
3682				free_pagedep(jmvref->jm_pagedep);
3683			WORKITEM_FREE(jmvref, D_JMVREF);
3684			break;
3685		case D_JNEWBLK:
3686			handle_written_jnewblk(WK_JNEWBLK(wk));
3687			break;
3688		case D_JFREEBLK:
3689			handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
3690			break;
3691		case D_JTRUNC:
3692			handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
3693			break;
3694		case D_JFSYNC:
3695			rele_jseg(jseg);	/* No jsegdep. */
3696			WORKITEM_FREE(wk, D_JFSYNC);
3697			break;
3698		case D_JFREEFRAG:
3699			handle_written_jfreefrag(WK_JFREEFRAG(wk));
3700			break;
3701		default:
3702			panic("handle_written_jseg: Unknown type %s",
3703			    TYPENAME(wk->wk_type));
3704			/* NOTREACHED */
3705		}
3706	}
3707	/* Release the self reference so the structure may be freed. */
3708	rele_jseg(jseg);
3709}
3710
3711/*
3712 * Determine which jsegs are ready for completion processing.  Waits for
3713 * synchronize cache to complete as well as forcing in-order completion
3714 * of journal entries.
3715 */
3716static void
3717complete_jsegs(jseg)
3718	struct jseg *jseg;
3719{
3720	struct jblocks *jblocks;
3721	struct jseg *jsegn;
3722
3723	jblocks = jseg->js_jblocks;
3724	/*
3725	 * Don't allow out of order completions.  If this isn't the first
3726	 * block wait for it to write before we're done.
3727	 */
3728	if (jseg != jblocks->jb_writeseg)
3729		return;
3730	/* Iterate through available jsegs processing their entries. */
3731	while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
3732		jblocks->jb_oldestwrseq = jseg->js_oldseq;
3733		jsegn = TAILQ_NEXT(jseg, js_next);
3734		complete_jseg(jseg);
3735		jseg = jsegn;
3736	}
3737	jblocks->jb_writeseg = jseg;
3738	/*
3739	 * Attempt to free jsegs now that oldestwrseq may have advanced.
3740	 */
3741	free_jsegs(jblocks);
3742}
3743
3744/*
3745 * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Attempt to handle
3746 * the final completions.
3747 */
3748static void
3749handle_written_jseg(jseg, bp)
3750	struct jseg *jseg;
3751	struct buf *bp;
3752{
3753
3754	if (jseg->js_refs == 0)
3755		panic("handle_written_jseg: No self-reference on %p", jseg);
3756	jseg->js_state |= DEPCOMPLETE;
3757	/*
3758	 * We'll never need this buffer again, set flags so it will be
3759	 * discarded.
3760	 */
3761	bp->b_flags |= B_INVAL | B_NOCACHE;
3762	pbrelvp(bp);
3763	complete_jsegs(jseg);
3764}
3765
3766static inline struct jsegdep *
3767inoref_jseg(inoref)
3768	struct inoref *inoref;
3769{
3770	struct jsegdep *jsegdep;
3771
3772	jsegdep = inoref->if_jsegdep;
3773	inoref->if_jsegdep = NULL;
3774
3775	return (jsegdep);
3776}
3777
3778/*
3779 * Called once a jremref has made it to stable store.  The jremref is marked
3780 * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
3781 * for the jremref to complete will be awoken by free_jremref.
3782 */
3783static void
3784handle_written_jremref(jremref)
3785	struct jremref *jremref;
3786{
3787	struct inodedep *inodedep;
3788	struct jsegdep *jsegdep;
3789	struct dirrem *dirrem;
3790
3791	/* Grab the jsegdep. */
3792	jsegdep = inoref_jseg(&jremref->jr_ref);
3793	/*
3794	 * Remove us from the inoref list.
3795	 */
3796	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
3797	    0, &inodedep) == 0)
3798		panic("handle_written_jremref: Lost inodedep");
3799	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3800	/*
3801	 * Complete the dirrem.
3802	 */
3803	dirrem = jremref->jr_dirrem;
3804	jremref->jr_dirrem = NULL;
3805	LIST_REMOVE(jremref, jr_deps);
3806	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3807	jwork_insert(&dirrem->dm_jwork, jsegdep);
3808	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
3809	    (dirrem->dm_state & COMPLETE) != 0)
3810		add_to_worklist(&dirrem->dm_list, 0);
3811	free_jremref(jremref);
3812}
3813
3814/*
3815 * Called once a jaddref has made it to stable store.  The dependency is
3816 * marked complete and any dependent structures are added to the inode
3817 * bufwait list to be completed as soon as it is written.  If a bitmap write
3818 * depends on this entry we move the inode into the inodedephd of the
3819 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
3820 */
3821static void
3822handle_written_jaddref(jaddref)
3823	struct jaddref *jaddref;
3824{
3825	struct jsegdep *jsegdep;
3826	struct inodedep *inodedep;
3827	struct diradd *diradd;
3828	struct mkdir *mkdir;
3829
3830	/* Grab the jsegdep. */
3831	jsegdep = inoref_jseg(&jaddref->ja_ref);
3832	mkdir = NULL;
3833	diradd = NULL;
3834	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3835	    0, &inodedep) == 0)
3836		panic("handle_written_jaddref: Lost inodedep.");
3837	if (jaddref->ja_diradd == NULL)
3838		panic("handle_written_jaddref: No dependency");
3839	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3840		diradd = jaddref->ja_diradd;
3841		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3842	} else if (jaddref->ja_state & MKDIR_PARENT) {
3843		mkdir = jaddref->ja_mkdir;
3844		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3845	} else if (jaddref->ja_state & MKDIR_BODY)
3846		mkdir = jaddref->ja_mkdir;
3847	else
3848		panic("handle_written_jaddref: Unknown dependency %p",
3849		    jaddref->ja_diradd);
3850	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
3851	/*
3852	 * Remove us from the inode list.
3853	 */
3854	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3855	/*
3856	 * The mkdir may be waiting on the jaddref to clear before freeing.
3857	 */
3858	if (mkdir) {
3859		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3860		    ("handle_written_jaddref: Incorrect type for mkdir %s",
3861		    TYPENAME(mkdir->md_list.wk_type)));
3862		mkdir->md_jaddref = NULL;
3863		diradd = mkdir->md_diradd;
3864		mkdir->md_state |= DEPCOMPLETE;
3865		complete_mkdir(mkdir);
3866	}
3867	jwork_insert(&diradd->da_jwork, jsegdep);
3868	if (jaddref->ja_state & NEWBLOCK) {
3869		inodedep->id_state |= ONDEPLIST;
3870		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3871		    inodedep, id_deps);
3872	}
3873	free_jaddref(jaddref);
3874}
3875
3876/*
3877 * Called once a jnewblk journal is written.  The allocdirect or allocindir
3878 * is placed in the bmsafemap to await notification of a written bitmap.  If
3879 * the operation was canceled we add the segdep to the appropriate
3880 * dependency to free the journal space once the canceling operation
3881 * completes.
3882 */
3883static void
3884handle_written_jnewblk(jnewblk)
3885	struct jnewblk *jnewblk;
3886{
3887	struct bmsafemap *bmsafemap;
3888	struct freefrag *freefrag;
3889	struct freework *freework;
3890	struct jsegdep *jsegdep;
3891	struct newblk *newblk;
3892
3893	/* Grab the jsegdep. */
3894	jsegdep = jnewblk->jn_jsegdep;
3895	jnewblk->jn_jsegdep = NULL;
3896	if (jnewblk->jn_dep == NULL)
3897		panic("handle_written_jnewblk: No dependency for the segdep.");
3898	switch (jnewblk->jn_dep->wk_type) {
3899	case D_NEWBLK:
3900	case D_ALLOCDIRECT:
3901	case D_ALLOCINDIR:
3902		/*
3903		 * Add the written block to the bmsafemap so it can
3904		 * be notified when the bitmap is on disk.
3905		 */
3906		newblk = WK_NEWBLK(jnewblk->jn_dep);
3907		newblk->nb_jnewblk = NULL;
3908		if ((newblk->nb_state & GOINGAWAY) == 0) {
3909			bmsafemap = newblk->nb_bmsafemap;
3910			newblk->nb_state |= ONDEPLIST;
3911			LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
3912			    nb_deps);
3913		}
3914		jwork_insert(&newblk->nb_jwork, jsegdep);
3915		break;
3916	case D_FREEFRAG:
3917		/*
3918		 * A newblock being removed by a freefrag when replaced by
3919		 * frag extension.
3920		 */
3921		freefrag = WK_FREEFRAG(jnewblk->jn_dep);
3922		freefrag->ff_jdep = NULL;
3923		jwork_insert(&freefrag->ff_jwork, jsegdep);
3924		break;
3925	case D_FREEWORK:
3926		/*
3927		 * A direct block was removed by truncate.
3928		 */
3929		freework = WK_FREEWORK(jnewblk->jn_dep);
3930		freework->fw_jnewblk = NULL;
3931		jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
3932		break;
3933	default:
3934		panic("handle_written_jnewblk: Unknown type %d.",
3935		    jnewblk->jn_dep->wk_type);
3936	}
3937	jnewblk->jn_dep = NULL;
3938	free_jnewblk(jnewblk);
3939}
3940
3941/*
3942 * Cancel a jfreefrag that won't be needed, probably due to colliding with
3943 * an in-flight allocation that has not yet been committed.  Divorce us
3944 * from the freefrag and mark it DEPCOMPLETE so that it may be added
3945 * to the worklist.
3946 */
3947static void
3948cancel_jfreefrag(jfreefrag)
3949	struct jfreefrag *jfreefrag;
3950{
3951	struct freefrag *freefrag;
3952
3953	if (jfreefrag->fr_jsegdep) {
3954		free_jsegdep(jfreefrag->fr_jsegdep);
3955		jfreefrag->fr_jsegdep = NULL;
3956	}
3957	freefrag = jfreefrag->fr_freefrag;
3958	jfreefrag->fr_freefrag = NULL;
3959	free_jfreefrag(jfreefrag);
3960	freefrag->ff_state |= DEPCOMPLETE;
3961	CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
3962}
3963
3964/*
3965 * Free a jfreefrag when the parent freefrag is rendered obsolete.
3966 */
3967static void
3968free_jfreefrag(jfreefrag)
3969	struct jfreefrag *jfreefrag;
3970{
3971
3972	if (jfreefrag->fr_state & INPROGRESS)
3973		WORKLIST_REMOVE(&jfreefrag->fr_list);
3974	else if (jfreefrag->fr_state & ONWORKLIST)
3975		remove_from_journal(&jfreefrag->fr_list);
3976	if (jfreefrag->fr_freefrag != NULL)
3977		panic("free_jfreefrag:  Still attached to a freefrag.");
3978	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3979}
3980
3981/*
3982 * Called when the journal write for a jfreefrag completes.  The parent
3983 * freefrag is added to the worklist if this completes its dependencies.
3984 */
3985static void
3986handle_written_jfreefrag(jfreefrag)
3987	struct jfreefrag *jfreefrag;
3988{
3989	struct jsegdep *jsegdep;
3990	struct freefrag *freefrag;
3991
3992	/* Grab the jsegdep. */
3993	jsegdep = jfreefrag->fr_jsegdep;
3994	jfreefrag->fr_jsegdep = NULL;
3995	freefrag = jfreefrag->fr_freefrag;
3996	if (freefrag == NULL)
3997		panic("handle_written_jfreefrag: No freefrag.");
3998	freefrag->ff_state |= DEPCOMPLETE;
3999	freefrag->ff_jdep = NULL;
4000	jwork_insert(&freefrag->ff_jwork, jsegdep);
4001	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
4002		add_to_worklist(&freefrag->ff_list, 0);
4003	jfreefrag->fr_freefrag = NULL;
4004	free_jfreefrag(jfreefrag);
4005}
4006
4007/*
4008 * Called when the journal write for a jfreeblk completes.  The jfreeblk
4009 * is removed from the freeblks list of pending journal writes and the
4010 * jsegdep is moved to the freeblks jwork to be completed when all blocks
4011 * have been reclaimed.
4012 */
4013static void
4014handle_written_jblkdep(jblkdep)
4015	struct jblkdep *jblkdep;
4016{
4017	struct freeblks *freeblks;
4018	struct jsegdep *jsegdep;
4019
4020	/* Grab the jsegdep. */
4021	jsegdep = jblkdep->jb_jsegdep;
4022	jblkdep->jb_jsegdep = NULL;
4023	freeblks = jblkdep->jb_freeblks;
4024	LIST_REMOVE(jblkdep, jb_deps);
4025	jwork_insert(&freeblks->fb_jwork, jsegdep);
4026	/*
4027	 * If the freeblks is all journaled, we can add it to the worklist.
4028	 */
4029	if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
4030	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
4031		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
4032
4033	free_jblkdep(jblkdep);
4034}
4035
4036static struct jsegdep *
4037newjsegdep(struct worklist *wk)
4038{
4039	struct jsegdep *jsegdep;
4040
4041	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
4042	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
4043	jsegdep->jd_seg = NULL;
4044
4045	return (jsegdep);
4046}
4047
4048static struct jmvref *
4049newjmvref(dp, ino, oldoff, newoff)
4050	struct inode *dp;
4051	ino_t ino;
4052	off_t oldoff;
4053	off_t newoff;
4054{
4055	struct jmvref *jmvref;
4056
4057	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
4058	workitem_alloc(&jmvref->jm_list, D_JMVREF, ITOVFS(dp));
4059	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
4060	jmvref->jm_parent = dp->i_number;
4061	jmvref->jm_ino = ino;
4062	jmvref->jm_oldoff = oldoff;
4063	jmvref->jm_newoff = newoff;
4064
4065	return (jmvref);
4066}
4067
4068/*
4069 * Allocate a new jremref that tracks the removal of ip from dp with the
4070 * directory entry offset of diroff.  Mark the entry as ATTACHED and
4071 * DEPCOMPLETE as we have all the information required for the journal write
4072 * and the directory has already been removed from the buffer.  The caller
4073 * is responsible for linking the jremref into the pagedep and adding it
4074 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
4075 * a DOTDOT addition so handle_workitem_remove() can properly assign
4076 * the jsegdep when we're done.
4077 */
4078static struct jremref *
4079newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
4080    off_t diroff, nlink_t nlink)
4081{
4082	struct jremref *jremref;
4083
4084	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
4085	workitem_alloc(&jremref->jr_list, D_JREMREF, ITOVFS(dp));
4086	jremref->jr_state = ATTACHED;
4087	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
4088	   nlink, ip->i_mode);
4089	jremref->jr_dirrem = dirrem;
4090
4091	return (jremref);
4092}
4093
4094static inline void
4095newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
4096    nlink_t nlink, uint16_t mode)
4097{
4098
4099	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
4100	inoref->if_diroff = diroff;
4101	inoref->if_ino = ino;
4102	inoref->if_parent = parent;
4103	inoref->if_nlink = nlink;
4104	inoref->if_mode = mode;
4105}
4106
4107/*
4108 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
4109 * directory offset may not be known until later.  The caller is responsible
4110 * adding the entry to the journal when this information is available.  nlink
4111 * should be the link count prior to the addition and mode is only required
4112 * to have the correct FMT.
4113 */
4114static struct jaddref *
4115newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
4116    uint16_t mode)
4117{
4118	struct jaddref *jaddref;
4119
4120	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
4121	workitem_alloc(&jaddref->ja_list, D_JADDREF, ITOVFS(dp));
4122	jaddref->ja_state = ATTACHED;
4123	jaddref->ja_mkdir = NULL;
4124	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
4125
4126	return (jaddref);
4127}
4128
4129/*
4130 * Create a new free dependency for a freework.  The caller is responsible
4131 * for adjusting the reference count when it has the lock held.  The freedep
4132 * will track an outstanding bitmap write that will ultimately clear the
4133 * freework to continue.
4134 */
4135static struct freedep *
4136newfreedep(struct freework *freework)
4137{
4138	struct freedep *freedep;
4139
4140	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
4141	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
4142	freedep->fd_freework = freework;
4143
4144	return (freedep);
4145}
4146
4147/*
4148 * Free a freedep structure once the buffer it is linked to is written.  If
4149 * this is the last reference to the freework schedule it for completion.
4150 */
4151static void
4152free_freedep(freedep)
4153	struct freedep *freedep;
4154{
4155	struct freework *freework;
4156
4157	freework = freedep->fd_freework;
4158	freework->fw_freeblks->fb_cgwait--;
4159	if (--freework->fw_ref == 0)
4160		freework_enqueue(freework);
4161	WORKITEM_FREE(freedep, D_FREEDEP);
4162}
4163
4164/*
4165 * Allocate a new freework structure that may be a level in an indirect
4166 * when parent is not NULL or a top level block when it is.  The top level
4167 * freework structures are allocated without the per-filesystem lock held
4168 * and before the freeblks is visible outside of softdep_setup_freeblocks().
4169 */
4170static struct freework *
4171newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
4172	struct ufsmount *ump;
4173	struct freeblks *freeblks;
4174	struct freework *parent;
4175	ufs_lbn_t lbn;
4176	ufs2_daddr_t nb;
4177	int frags;
4178	int off;
4179	int journal;
4180{
4181	struct freework *freework;
4182
4183	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
4184	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
4185	freework->fw_state = ATTACHED;
4186	freework->fw_jnewblk = NULL;
4187	freework->fw_freeblks = freeblks;
4188	freework->fw_parent = parent;
4189	freework->fw_lbn = lbn;
4190	freework->fw_blkno = nb;
4191	freework->fw_frags = frags;
4192	freework->fw_indir = NULL;
4193	freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 ||
4194	    lbn >= -UFS_NXADDR) ? 0 : NINDIR(ump->um_fs) + 1;
4195	freework->fw_start = freework->fw_off = off;
4196	if (journal)
4197		newjfreeblk(freeblks, lbn, nb, frags);
4198	if (parent == NULL) {
4199		ACQUIRE_LOCK(ump);
4200		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
4201		freeblks->fb_ref++;
4202		FREE_LOCK(ump);
4203	}
4204
4205	return (freework);
4206}
4207
4208/*
4209 * Eliminate a jfreeblk for a block that does not need journaling.
4210 */
4211static void
4212cancel_jfreeblk(freeblks, blkno)
4213	struct freeblks *freeblks;
4214	ufs2_daddr_t blkno;
4215{
4216	struct jfreeblk *jfreeblk;
4217	struct jblkdep *jblkdep;
4218
4219	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
4220		if (jblkdep->jb_list.wk_type != D_JFREEBLK)
4221			continue;
4222		jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
4223		if (jfreeblk->jf_blkno == blkno)
4224			break;
4225	}
4226	if (jblkdep == NULL)
4227		return;
4228	CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
4229	free_jsegdep(jblkdep->jb_jsegdep);
4230	LIST_REMOVE(jblkdep, jb_deps);
4231	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
4232}
4233
4234/*
4235 * Allocate a new jfreeblk to journal top level block pointer when truncating
4236 * a file.  The caller must add this to the worklist when the per-filesystem
4237 * lock is held.
4238 */
4239static struct jfreeblk *
4240newjfreeblk(freeblks, lbn, blkno, frags)
4241	struct freeblks *freeblks;
4242	ufs_lbn_t lbn;
4243	ufs2_daddr_t blkno;
4244	int frags;
4245{
4246	struct jfreeblk *jfreeblk;
4247
4248	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
4249	workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
4250	    freeblks->fb_list.wk_mp);
4251	jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
4252	jfreeblk->jf_dep.jb_freeblks = freeblks;
4253	jfreeblk->jf_ino = freeblks->fb_inum;
4254	jfreeblk->jf_lbn = lbn;
4255	jfreeblk->jf_blkno = blkno;
4256	jfreeblk->jf_frags = frags;
4257	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
4258
4259	return (jfreeblk);
4260}
4261
4262/*
4263 * The journal is only prepared to handle full-size block numbers, so we
4264 * have to adjust the record to reflect the change to a full-size block.
4265 * For example, suppose we have a block made up of fragments 8-15 and
4266 * want to free its last two fragments. We are given a request that says:
4267 *     FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0
4268 * where frags are the number of fragments to free and oldfrags are the
4269 * number of fragments to keep. To block align it, we have to change it to
4270 * have a valid full-size blkno, so it becomes:
4271 *     FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6
4272 */
4273static void
4274adjust_newfreework(freeblks, frag_offset)
4275	struct freeblks *freeblks;
4276	int frag_offset;
4277{
4278	struct jfreeblk *jfreeblk;
4279
4280	KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL &&
4281	    LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK),
4282	    ("adjust_newfreework: Missing freeblks dependency"));
4283
4284	jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd));
4285	jfreeblk->jf_blkno -= frag_offset;
4286	jfreeblk->jf_frags += frag_offset;
4287}
4288
4289/*
4290 * Allocate a new jtrunc to track a partial truncation.
4291 */
4292static struct jtrunc *
4293newjtrunc(freeblks, size, extsize)
4294	struct freeblks *freeblks;
4295	off_t size;
4296	int extsize;
4297{
4298	struct jtrunc *jtrunc;
4299
4300	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
4301	workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
4302	    freeblks->fb_list.wk_mp);
4303	jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
4304	jtrunc->jt_dep.jb_freeblks = freeblks;
4305	jtrunc->jt_ino = freeblks->fb_inum;
4306	jtrunc->jt_size = size;
4307	jtrunc->jt_extsize = extsize;
4308	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
4309
4310	return (jtrunc);
4311}
4312
4313/*
4314 * If we're canceling a new bitmap we have to search for another ref
4315 * to move into the bmsafemap dep.  This might be better expressed
4316 * with another structure.
4317 */
4318static void
4319move_newblock_dep(jaddref, inodedep)
4320	struct jaddref *jaddref;
4321	struct inodedep *inodedep;
4322{
4323	struct inoref *inoref;
4324	struct jaddref *jaddrefn;
4325
4326	jaddrefn = NULL;
4327	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4328	    inoref = TAILQ_NEXT(inoref, if_deps)) {
4329		if ((jaddref->ja_state & NEWBLOCK) &&
4330		    inoref->if_list.wk_type == D_JADDREF) {
4331			jaddrefn = (struct jaddref *)inoref;
4332			break;
4333		}
4334	}
4335	if (jaddrefn == NULL)
4336		return;
4337	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
4338	jaddrefn->ja_state |= jaddref->ja_state &
4339	    (ATTACHED | UNDONE | NEWBLOCK);
4340	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
4341	jaddref->ja_state |= ATTACHED;
4342	LIST_REMOVE(jaddref, ja_bmdeps);
4343	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
4344	    ja_bmdeps);
4345}
4346
4347/*
4348 * Cancel a jaddref either before it has been written or while it is being
4349 * written.  This happens when a link is removed before the add reaches
4350 * the disk.  The jaddref dependency is kept linked into the bmsafemap
4351 * and inode to prevent the link count or bitmap from reaching the disk
4352 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
4353 * required.
4354 *
4355 * Returns 1 if the canceled addref requires journaling of the remove and
4356 * 0 otherwise.
4357 */
4358static int
4359cancel_jaddref(jaddref, inodedep, wkhd)
4360	struct jaddref *jaddref;
4361	struct inodedep *inodedep;
4362	struct workhead *wkhd;
4363{
4364	struct inoref *inoref;
4365	struct jsegdep *jsegdep;
4366	int needsj;
4367
4368	KASSERT((jaddref->ja_state & COMPLETE) == 0,
4369	    ("cancel_jaddref: Canceling complete jaddref"));
4370	if (jaddref->ja_state & (INPROGRESS | COMPLETE))
4371		needsj = 1;
4372	else
4373		needsj = 0;
4374	if (inodedep == NULL)
4375		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4376		    0, &inodedep) == 0)
4377			panic("cancel_jaddref: Lost inodedep");
4378	/*
4379	 * We must adjust the nlink of any reference operation that follows
4380	 * us so that it is consistent with the in-memory reference.  This
4381	 * ensures that inode nlink rollbacks always have the correct link.
4382	 */
4383	if (needsj == 0) {
4384		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4385		    inoref = TAILQ_NEXT(inoref, if_deps)) {
4386			if (inoref->if_state & GOINGAWAY)
4387				break;
4388			inoref->if_nlink--;
4389		}
4390	}
4391	jsegdep = inoref_jseg(&jaddref->ja_ref);
4392	if (jaddref->ja_state & NEWBLOCK)
4393		move_newblock_dep(jaddref, inodedep);
4394	wake_worklist(&jaddref->ja_list);
4395	jaddref->ja_mkdir = NULL;
4396	if (jaddref->ja_state & INPROGRESS) {
4397		jaddref->ja_state &= ~INPROGRESS;
4398		WORKLIST_REMOVE(&jaddref->ja_list);
4399		jwork_insert(wkhd, jsegdep);
4400	} else {
4401		free_jsegdep(jsegdep);
4402		if (jaddref->ja_state & DEPCOMPLETE)
4403			remove_from_journal(&jaddref->ja_list);
4404	}
4405	jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4406	/*
4407	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4408	 * can arrange for them to be freed with the bitmap.  Otherwise we
4409	 * no longer need this addref attached to the inoreflst and it
4410	 * will incorrectly adjust nlink if we leave it.
4411	 */
4412	if ((jaddref->ja_state & NEWBLOCK) == 0) {
4413		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4414		    if_deps);
4415		jaddref->ja_state |= COMPLETE;
4416		free_jaddref(jaddref);
4417		return (needsj);
4418	}
4419	/*
4420	 * Leave the head of the list for jsegdeps for fast merging.
4421	 */
4422	if (LIST_FIRST(wkhd) != NULL) {
4423		jaddref->ja_state |= ONWORKLIST;
4424		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4425	} else
4426		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4427
4428	return (needsj);
4429}
4430
4431/*
4432 * Attempt to free a jaddref structure when some work completes.  This
4433 * should only succeed once the entry is written and all dependencies have
4434 * been notified.
4435 */
4436static void
4437free_jaddref(jaddref)
4438	struct jaddref *jaddref;
4439{
4440
4441	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4442		return;
4443	if (jaddref->ja_ref.if_jsegdep)
4444		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4445		    jaddref, jaddref->ja_state);
4446	if (jaddref->ja_state & NEWBLOCK)
4447		LIST_REMOVE(jaddref, ja_bmdeps);
4448	if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4449		panic("free_jaddref: Bad state %p(0x%X)",
4450		    jaddref, jaddref->ja_state);
4451	if (jaddref->ja_mkdir != NULL)
4452		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4453	WORKITEM_FREE(jaddref, D_JADDREF);
4454}
4455
4456/*
4457 * Free a jremref structure once it has been written or discarded.
4458 */
4459static void
4460free_jremref(jremref)
4461	struct jremref *jremref;
4462{
4463
4464	if (jremref->jr_ref.if_jsegdep)
4465		free_jsegdep(jremref->jr_ref.if_jsegdep);
4466	if (jremref->jr_state & INPROGRESS)
4467		panic("free_jremref: IO still pending");
4468	WORKITEM_FREE(jremref, D_JREMREF);
4469}
4470
4471/*
4472 * Free a jnewblk structure.
4473 */
4474static void
4475free_jnewblk(jnewblk)
4476	struct jnewblk *jnewblk;
4477{
4478
4479	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4480		return;
4481	LIST_REMOVE(jnewblk, jn_deps);
4482	if (jnewblk->jn_dep != NULL)
4483		panic("free_jnewblk: Dependency still attached.");
4484	WORKITEM_FREE(jnewblk, D_JNEWBLK);
4485}
4486
4487/*
4488 * Cancel a jnewblk which has been been made redundant by frag extension.
4489 */
4490static void
4491cancel_jnewblk(jnewblk, wkhd)
4492	struct jnewblk *jnewblk;
4493	struct workhead *wkhd;
4494{
4495	struct jsegdep *jsegdep;
4496
4497	CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
4498	jsegdep = jnewblk->jn_jsegdep;
4499	if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
4500		panic("cancel_jnewblk: Invalid state");
4501	jnewblk->jn_jsegdep  = NULL;
4502	jnewblk->jn_dep = NULL;
4503	jnewblk->jn_state |= GOINGAWAY;
4504	if (jnewblk->jn_state & INPROGRESS) {
4505		jnewblk->jn_state &= ~INPROGRESS;
4506		WORKLIST_REMOVE(&jnewblk->jn_list);
4507		jwork_insert(wkhd, jsegdep);
4508	} else {
4509		free_jsegdep(jsegdep);
4510		remove_from_journal(&jnewblk->jn_list);
4511	}
4512	wake_worklist(&jnewblk->jn_list);
4513	WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
4514}
4515
4516static void
4517free_jblkdep(jblkdep)
4518	struct jblkdep *jblkdep;
4519{
4520
4521	if (jblkdep->jb_list.wk_type == D_JFREEBLK)
4522		WORKITEM_FREE(jblkdep, D_JFREEBLK);
4523	else if (jblkdep->jb_list.wk_type == D_JTRUNC)
4524		WORKITEM_FREE(jblkdep, D_JTRUNC);
4525	else
4526		panic("free_jblkdep: Unexpected type %s",
4527		    TYPENAME(jblkdep->jb_list.wk_type));
4528}
4529
4530/*
4531 * Free a single jseg once it is no longer referenced in memory or on
4532 * disk.  Reclaim journal blocks and dependencies waiting for the segment
4533 * to disappear.
4534 */
4535static void
4536free_jseg(jseg, jblocks)
4537	struct jseg *jseg;
4538	struct jblocks *jblocks;
4539{
4540	struct freework *freework;
4541
4542	/*
4543	 * Free freework structures that were lingering to indicate freed
4544	 * indirect blocks that forced journal write ordering on reallocate.
4545	 */
4546	while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
4547		indirblk_remove(freework);
4548	if (jblocks->jb_oldestseg == jseg)
4549		jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
4550	TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
4551	jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
4552	KASSERT(LIST_EMPTY(&jseg->js_entries),
4553	    ("free_jseg: Freed jseg has valid entries."));
4554	WORKITEM_FREE(jseg, D_JSEG);
4555}
4556
4557/*
4558 * Free all jsegs that meet the criteria for being reclaimed and update
4559 * oldestseg.
4560 */
4561static void
4562free_jsegs(jblocks)
4563	struct jblocks *jblocks;
4564{
4565	struct jseg *jseg;
4566
4567	/*
4568	 * Free only those jsegs which have none allocated before them to
4569	 * preserve the journal space ordering.
4570	 */
4571	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
4572		/*
4573		 * Only reclaim space when nothing depends on this journal
4574		 * set and another set has written that it is no longer
4575		 * valid.
4576		 */
4577		if (jseg->js_refs != 0) {
4578			jblocks->jb_oldestseg = jseg;
4579			return;
4580		}
4581		if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
4582			break;
4583		if (jseg->js_seq > jblocks->jb_oldestwrseq)
4584			break;
4585		/*
4586		 * We can free jsegs that didn't write entries when
4587		 * oldestwrseq == js_seq.
4588		 */
4589		if (jseg->js_seq == jblocks->jb_oldestwrseq &&
4590		    jseg->js_cnt != 0)
4591			break;
4592		free_jseg(jseg, jblocks);
4593	}
4594	/*
4595	 * If we exited the loop above we still must discover the
4596	 * oldest valid segment.
4597	 */
4598	if (jseg)
4599		for (jseg = jblocks->jb_oldestseg; jseg != NULL;
4600		     jseg = TAILQ_NEXT(jseg, js_next))
4601			if (jseg->js_refs != 0)
4602				break;
4603	jblocks->jb_oldestseg = jseg;
4604	/*
4605	 * The journal has no valid records but some jsegs may still be
4606	 * waiting on oldestwrseq to advance.  We force a small record
4607	 * out to permit these lingering records to be reclaimed.
4608	 */
4609	if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
4610		jblocks->jb_needseg = 1;
4611}
4612
4613/*
4614 * Release one reference to a jseg and free it if the count reaches 0.  This
4615 * should eventually reclaim journal space as well.
4616 */
4617static void
4618rele_jseg(jseg)
4619	struct jseg *jseg;
4620{
4621
4622	KASSERT(jseg->js_refs > 0,
4623	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
4624	if (--jseg->js_refs != 0)
4625		return;
4626	free_jsegs(jseg->js_jblocks);
4627}
4628
4629/*
4630 * Release a jsegdep and decrement the jseg count.
4631 */
4632static void
4633free_jsegdep(jsegdep)
4634	struct jsegdep *jsegdep;
4635{
4636
4637	if (jsegdep->jd_seg)
4638		rele_jseg(jsegdep->jd_seg);
4639	WORKITEM_FREE(jsegdep, D_JSEGDEP);
4640}
4641
4642/*
4643 * Wait for a journal item to make it to disk.  Initiate journal processing
4644 * if required.
4645 */
4646static int
4647jwait(wk, waitfor)
4648	struct worklist *wk;
4649	int waitfor;
4650{
4651
4652	LOCK_OWNED(VFSTOUFS(wk->wk_mp));
4653	/*
4654	 * Blocking journal waits cause slow synchronous behavior.  Record
4655	 * stats on the frequency of these blocking operations.
4656	 */
4657	if (waitfor == MNT_WAIT) {
4658		stat_journal_wait++;
4659		switch (wk->wk_type) {
4660		case D_JREMREF:
4661		case D_JMVREF:
4662			stat_jwait_filepage++;
4663			break;
4664		case D_JTRUNC:
4665		case D_JFREEBLK:
4666			stat_jwait_freeblks++;
4667			break;
4668		case D_JNEWBLK:
4669			stat_jwait_newblk++;
4670			break;
4671		case D_JADDREF:
4672			stat_jwait_inode++;
4673			break;
4674		default:
4675			break;
4676		}
4677	}
4678	/*
4679	 * If IO has not started we process the journal.  We can't mark the
4680	 * worklist item as IOWAITING because we drop the lock while
4681	 * processing the journal and the worklist entry may be freed after
4682	 * this point.  The caller may call back in and re-issue the request.
4683	 */
4684	if ((wk->wk_state & INPROGRESS) == 0) {
4685		softdep_process_journal(wk->wk_mp, wk, waitfor);
4686		if (waitfor != MNT_WAIT)
4687			return (EBUSY);
4688		return (0);
4689	}
4690	if (waitfor != MNT_WAIT)
4691		return (EBUSY);
4692	wait_worklist(wk, "jwait");
4693	return (0);
4694}
4695
4696/*
4697 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
4698 * appropriate.  This is a convenience function to reduce duplicate code
4699 * for the setup and revert functions below.
4700 */
4701static struct inodedep *
4702inodedep_lookup_ip(ip)
4703	struct inode *ip;
4704{
4705	struct inodedep *inodedep;
4706
4707	KASSERT(ip->i_nlink >= ip->i_effnlink,
4708	    ("inodedep_lookup_ip: bad delta"));
4709	(void) inodedep_lookup(ITOVFS(ip), ip->i_number, DEPALLOC,
4710	    &inodedep);
4711	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
4712	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
4713
4714	return (inodedep);
4715}
4716
4717/*
4718 * Called prior to creating a new inode and linking it to a directory.  The
4719 * jaddref structure must already be allocated by softdep_setup_inomapdep
4720 * and it is discovered here so we can initialize the mode and update
4721 * nlinkdelta.
4722 */
4723void
4724softdep_setup_create(dp, ip)
4725	struct inode *dp;
4726	struct inode *ip;
4727{
4728	struct inodedep *inodedep;
4729	struct jaddref *jaddref;
4730	struct vnode *dvp;
4731
4732	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4733	    ("softdep_setup_create called on non-softdep filesystem"));
4734	KASSERT(ip->i_nlink == 1,
4735	    ("softdep_setup_create: Invalid link count."));
4736	dvp = ITOV(dp);
4737	ACQUIRE_LOCK(ITOUMP(dp));
4738	inodedep = inodedep_lookup_ip(ip);
4739	if (DOINGSUJ(dvp)) {
4740		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4741		    inoreflst);
4742		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
4743		    ("softdep_setup_create: No addref structure present."));
4744	}
4745	softdep_prelink(dvp, NULL);
4746	FREE_LOCK(ITOUMP(dp));
4747}
4748
4749/*
4750 * Create a jaddref structure to track the addition of a DOTDOT link when
4751 * we are reparenting an inode as part of a rename.  This jaddref will be
4752 * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
4753 * non-journaling softdep.
4754 */
4755void
4756softdep_setup_dotdot_link(dp, ip)
4757	struct inode *dp;
4758	struct inode *ip;
4759{
4760	struct inodedep *inodedep;
4761	struct jaddref *jaddref;
4762	struct vnode *dvp;
4763
4764	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4765	    ("softdep_setup_dotdot_link called on non-softdep filesystem"));
4766	dvp = ITOV(dp);
4767	jaddref = NULL;
4768	/*
4769	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
4770	 * is used as a normal link would be.
4771	 */
4772	if (DOINGSUJ(dvp))
4773		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4774		    dp->i_effnlink - 1, dp->i_mode);
4775	ACQUIRE_LOCK(ITOUMP(dp));
4776	inodedep = inodedep_lookup_ip(dp);
4777	if (jaddref)
4778		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4779		    if_deps);
4780	softdep_prelink(dvp, ITOV(ip));
4781	FREE_LOCK(ITOUMP(dp));
4782}
4783
4784/*
4785 * Create a jaddref structure to track a new link to an inode.  The directory
4786 * offset is not known until softdep_setup_directory_add or
4787 * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
4788 * softdep.
4789 */
4790void
4791softdep_setup_link(dp, ip)
4792	struct inode *dp;
4793	struct inode *ip;
4794{
4795	struct inodedep *inodedep;
4796	struct jaddref *jaddref;
4797	struct vnode *dvp;
4798
4799	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4800	    ("softdep_setup_link called on non-softdep filesystem"));
4801	dvp = ITOV(dp);
4802	jaddref = NULL;
4803	if (DOINGSUJ(dvp))
4804		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
4805		    ip->i_mode);
4806	ACQUIRE_LOCK(ITOUMP(dp));
4807	inodedep = inodedep_lookup_ip(ip);
4808	if (jaddref)
4809		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4810		    if_deps);
4811	softdep_prelink(dvp, ITOV(ip));
4812	FREE_LOCK(ITOUMP(dp));
4813}
4814
4815/*
4816 * Called to create the jaddref structures to track . and .. references as
4817 * well as lookup and further initialize the incomplete jaddref created
4818 * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
4819 * nlinkdelta for non-journaling softdep.
4820 */
4821void
4822softdep_setup_mkdir(dp, ip)
4823	struct inode *dp;
4824	struct inode *ip;
4825{
4826	struct inodedep *inodedep;
4827	struct jaddref *dotdotaddref;
4828	struct jaddref *dotaddref;
4829	struct jaddref *jaddref;
4830	struct vnode *dvp;
4831
4832	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4833	    ("softdep_setup_mkdir called on non-softdep filesystem"));
4834	dvp = ITOV(dp);
4835	dotaddref = dotdotaddref = NULL;
4836	if (DOINGSUJ(dvp)) {
4837		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
4838		    ip->i_mode);
4839		dotaddref->ja_state |= MKDIR_BODY;
4840		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4841		    dp->i_effnlink - 1, dp->i_mode);
4842		dotdotaddref->ja_state |= MKDIR_PARENT;
4843	}
4844	ACQUIRE_LOCK(ITOUMP(dp));
4845	inodedep = inodedep_lookup_ip(ip);
4846	if (DOINGSUJ(dvp)) {
4847		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4848		    inoreflst);
4849		KASSERT(jaddref != NULL,
4850		    ("softdep_setup_mkdir: No addref structure present."));
4851		KASSERT(jaddref->ja_parent == dp->i_number,
4852		    ("softdep_setup_mkdir: bad parent %ju",
4853		    (uintmax_t)jaddref->ja_parent));
4854		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
4855		    if_deps);
4856	}
4857	inodedep = inodedep_lookup_ip(dp);
4858	if (DOINGSUJ(dvp))
4859		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
4860		    &dotdotaddref->ja_ref, if_deps);
4861	softdep_prelink(ITOV(dp), NULL);
4862	FREE_LOCK(ITOUMP(dp));
4863}
4864
4865/*
4866 * Called to track nlinkdelta of the inode and parent directories prior to
4867 * unlinking a directory.
4868 */
4869void
4870softdep_setup_rmdir(dp, ip)
4871	struct inode *dp;
4872	struct inode *ip;
4873{
4874	struct vnode *dvp;
4875
4876	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4877	    ("softdep_setup_rmdir called on non-softdep filesystem"));
4878	dvp = ITOV(dp);
4879	ACQUIRE_LOCK(ITOUMP(dp));
4880	(void) inodedep_lookup_ip(ip);
4881	(void) inodedep_lookup_ip(dp);
4882	softdep_prelink(dvp, ITOV(ip));
4883	FREE_LOCK(ITOUMP(dp));
4884}
4885
4886/*
4887 * Called to track nlinkdelta of the inode and parent directories prior to
4888 * unlink.
4889 */
4890void
4891softdep_setup_unlink(dp, ip)
4892	struct inode *dp;
4893	struct inode *ip;
4894{
4895	struct vnode *dvp;
4896
4897	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4898	    ("softdep_setup_unlink called on non-softdep filesystem"));
4899	dvp = ITOV(dp);
4900	ACQUIRE_LOCK(ITOUMP(dp));
4901	(void) inodedep_lookup_ip(ip);
4902	(void) inodedep_lookup_ip(dp);
4903	softdep_prelink(dvp, ITOV(ip));
4904	FREE_LOCK(ITOUMP(dp));
4905}
4906
4907/*
4908 * Called to release the journal structures created by a failed non-directory
4909 * creation.  Adjusts nlinkdelta for non-journaling softdep.
4910 */
4911void
4912softdep_revert_create(dp, ip)
4913	struct inode *dp;
4914	struct inode *ip;
4915{
4916	struct inodedep *inodedep;
4917	struct jaddref *jaddref;
4918	struct vnode *dvp;
4919
4920	KASSERT(MOUNTEDSOFTDEP(ITOVFS((dp))) != 0,
4921	    ("softdep_revert_create called on non-softdep filesystem"));
4922	dvp = ITOV(dp);
4923	ACQUIRE_LOCK(ITOUMP(dp));
4924	inodedep = inodedep_lookup_ip(ip);
4925	if (DOINGSUJ(dvp)) {
4926		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4927		    inoreflst);
4928		KASSERT(jaddref->ja_parent == dp->i_number,
4929		    ("softdep_revert_create: addref parent mismatch"));
4930		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4931	}
4932	FREE_LOCK(ITOUMP(dp));
4933}
4934
4935/*
4936 * Called to release the journal structures created by a failed link
4937 * addition.  Adjusts nlinkdelta for non-journaling softdep.
4938 */
4939void
4940softdep_revert_link(dp, ip)
4941	struct inode *dp;
4942	struct inode *ip;
4943{
4944	struct inodedep *inodedep;
4945	struct jaddref *jaddref;
4946	struct vnode *dvp;
4947
4948	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4949	    ("softdep_revert_link called on non-softdep filesystem"));
4950	dvp = ITOV(dp);
4951	ACQUIRE_LOCK(ITOUMP(dp));
4952	inodedep = inodedep_lookup_ip(ip);
4953	if (DOINGSUJ(dvp)) {
4954		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4955		    inoreflst);
4956		KASSERT(jaddref->ja_parent == dp->i_number,
4957		    ("softdep_revert_link: addref parent mismatch"));
4958		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4959	}
4960	FREE_LOCK(ITOUMP(dp));
4961}
4962
4963/*
4964 * Called to release the journal structures created by a failed mkdir
4965 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
4966 */
4967void
4968softdep_revert_mkdir(dp, ip)
4969	struct inode *dp;
4970	struct inode *ip;
4971{
4972	struct inodedep *inodedep;
4973	struct jaddref *jaddref;
4974	struct jaddref *dotaddref;
4975	struct vnode *dvp;
4976
4977	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4978	    ("softdep_revert_mkdir called on non-softdep filesystem"));
4979	dvp = ITOV(dp);
4980
4981	ACQUIRE_LOCK(ITOUMP(dp));
4982	inodedep = inodedep_lookup_ip(dp);
4983	if (DOINGSUJ(dvp)) {
4984		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4985		    inoreflst);
4986		KASSERT(jaddref->ja_parent == ip->i_number,
4987		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4988		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4989	}
4990	inodedep = inodedep_lookup_ip(ip);
4991	if (DOINGSUJ(dvp)) {
4992		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4993		    inoreflst);
4994		KASSERT(jaddref->ja_parent == dp->i_number,
4995		    ("softdep_revert_mkdir: addref parent mismatch"));
4996		dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
4997		    inoreflst, if_deps);
4998		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4999		KASSERT(dotaddref->ja_parent == ip->i_number,
5000		    ("softdep_revert_mkdir: dot addref parent mismatch"));
5001		cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
5002	}
5003	FREE_LOCK(ITOUMP(dp));
5004}
5005
5006/*
5007 * Called to correct nlinkdelta after a failed rmdir.
5008 */
5009void
5010softdep_revert_rmdir(dp, ip)
5011	struct inode *dp;
5012	struct inode *ip;
5013{
5014
5015	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
5016	    ("softdep_revert_rmdir called on non-softdep filesystem"));
5017	ACQUIRE_LOCK(ITOUMP(dp));
5018	(void) inodedep_lookup_ip(ip);
5019	(void) inodedep_lookup_ip(dp);
5020	FREE_LOCK(ITOUMP(dp));
5021}
5022
5023/*
5024 * Protecting the freemaps (or bitmaps).
5025 *
5026 * To eliminate the need to execute fsck before mounting a filesystem
5027 * after a power failure, one must (conservatively) guarantee that the
5028 * on-disk copy of the bitmaps never indicate that a live inode or block is
5029 * free.  So, when a block or inode is allocated, the bitmap should be
5030 * updated (on disk) before any new pointers.  When a block or inode is
5031 * freed, the bitmap should not be updated until all pointers have been
5032 * reset.  The latter dependency is handled by the delayed de-allocation
5033 * approach described below for block and inode de-allocation.  The former
5034 * dependency is handled by calling the following procedure when a block or
5035 * inode is allocated. When an inode is allocated an "inodedep" is created
5036 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
5037 * Each "inodedep" is also inserted into the hash indexing structure so
5038 * that any additional link additions can be made dependent on the inode
5039 * allocation.
5040 *
5041 * The ufs filesystem maintains a number of free block counts (e.g., per
5042 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
5043 * in addition to the bitmaps.  These counts are used to improve efficiency
5044 * during allocation and therefore must be consistent with the bitmaps.
5045 * There is no convenient way to guarantee post-crash consistency of these
5046 * counts with simple update ordering, for two main reasons: (1) The counts
5047 * and bitmaps for a single cylinder group block are not in the same disk
5048 * sector.  If a disk write is interrupted (e.g., by power failure), one may
5049 * be written and the other not.  (2) Some of the counts are located in the
5050 * superblock rather than the cylinder group block. So, we focus our soft
5051 * updates implementation on protecting the bitmaps. When mounting a
5052 * filesystem, we recompute the auxiliary counts from the bitmaps.
5053 */
5054
5055/*
5056 * Called just after updating the cylinder group block to allocate an inode.
5057 */
5058void
5059softdep_setup_inomapdep(bp, ip, newinum, mode)
5060	struct buf *bp;		/* buffer for cylgroup block with inode map */
5061	struct inode *ip;	/* inode related to allocation */
5062	ino_t newinum;		/* new inode number being allocated */
5063	int mode;
5064{
5065	struct inodedep *inodedep;
5066	struct bmsafemap *bmsafemap;
5067	struct jaddref *jaddref;
5068	struct mount *mp;
5069	struct fs *fs;
5070
5071	mp = ITOVFS(ip);
5072	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5073	    ("softdep_setup_inomapdep called on non-softdep filesystem"));
5074	fs = VFSTOUFS(mp)->um_fs;
5075	jaddref = NULL;
5076
5077	/*
5078	 * Allocate the journal reference add structure so that the bitmap
5079	 * can be dependent on it.
5080	 */
5081	if (MOUNTEDSUJ(mp)) {
5082		jaddref = newjaddref(ip, newinum, 0, 0, mode);
5083		jaddref->ja_state |= NEWBLOCK;
5084	}
5085
5086	/*
5087	 * Create a dependency for the newly allocated inode.
5088	 * Panic if it already exists as something is seriously wrong.
5089	 * Otherwise add it to the dependency list for the buffer holding
5090	 * the cylinder group map from which it was allocated.
5091	 *
5092	 * We have to preallocate a bmsafemap entry in case it is needed
5093	 * in bmsafemap_lookup since once we allocate the inodedep, we
5094	 * have to finish initializing it before we can FREE_LOCK().
5095	 * By preallocating, we avoid FREE_LOCK() while doing a malloc
5096	 * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
5097	 * creating the inodedep as it can be freed during the time
5098	 * that we FREE_LOCK() while allocating the inodedep. We must
5099	 * call workitem_alloc() before entering the locked section as
5100	 * it also acquires the lock and we must avoid trying doing so
5101	 * recursively.
5102	 */
5103	bmsafemap = malloc(sizeof(struct bmsafemap),
5104	    M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5105	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5106	ACQUIRE_LOCK(ITOUMP(ip));
5107	if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep)))
5108		panic("softdep_setup_inomapdep: dependency %p for new"
5109		    "inode already exists", inodedep);
5110	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
5111	if (jaddref) {
5112		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
5113		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
5114		    if_deps);
5115	} else {
5116		inodedep->id_state |= ONDEPLIST;
5117		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
5118	}
5119	inodedep->id_bmsafemap = bmsafemap;
5120	inodedep->id_state &= ~DEPCOMPLETE;
5121	FREE_LOCK(ITOUMP(ip));
5122}
5123
5124/*
5125 * Called just after updating the cylinder group block to
5126 * allocate block or fragment.
5127 */
5128void
5129softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
5130	struct buf *bp;		/* buffer for cylgroup block with block map */
5131	struct mount *mp;	/* filesystem doing allocation */
5132	ufs2_daddr_t newblkno;	/* number of newly allocated block */
5133	int frags;		/* Number of fragments. */
5134	int oldfrags;		/* Previous number of fragments for extend. */
5135{
5136	struct newblk *newblk;
5137	struct bmsafemap *bmsafemap;
5138	struct jnewblk *jnewblk;
5139	struct ufsmount *ump;
5140	struct fs *fs;
5141
5142	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5143	    ("softdep_setup_blkmapdep called on non-softdep filesystem"));
5144	ump = VFSTOUFS(mp);
5145	fs = ump->um_fs;
5146	jnewblk = NULL;
5147	/*
5148	 * Create a dependency for the newly allocated block.
5149	 * Add it to the dependency list for the buffer holding
5150	 * the cylinder group map from which it was allocated.
5151	 */
5152	if (MOUNTEDSUJ(mp)) {
5153		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
5154		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
5155		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
5156		jnewblk->jn_state = ATTACHED;
5157		jnewblk->jn_blkno = newblkno;
5158		jnewblk->jn_frags = frags;
5159		jnewblk->jn_oldfrags = oldfrags;
5160#ifdef INVARIANTS
5161		{
5162			struct cg *cgp;
5163			uint8_t *blksfree;
5164			long bno;
5165			int i;
5166
5167			cgp = (struct cg *)bp->b_data;
5168			blksfree = cg_blksfree(cgp);
5169			bno = dtogd(fs, jnewblk->jn_blkno);
5170			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
5171			    i++) {
5172				if (isset(blksfree, bno + i))
5173					panic("softdep_setup_blkmapdep: "
5174					    "free fragment %d from %d-%d "
5175					    "state 0x%X dep %p", i,
5176					    jnewblk->jn_oldfrags,
5177					    jnewblk->jn_frags,
5178					    jnewblk->jn_state,
5179					    jnewblk->jn_dep);
5180			}
5181		}
5182#endif
5183	}
5184
5185	CTR3(KTR_SUJ,
5186	    "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
5187	    newblkno, frags, oldfrags);
5188	ACQUIRE_LOCK(ump);
5189	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
5190		panic("softdep_setup_blkmapdep: found block");
5191	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
5192	    dtog(fs, newblkno), NULL);
5193	if (jnewblk) {
5194		jnewblk->jn_dep = (struct worklist *)newblk;
5195		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
5196	} else {
5197		newblk->nb_state |= ONDEPLIST;
5198		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
5199	}
5200	newblk->nb_bmsafemap = bmsafemap;
5201	newblk->nb_jnewblk = jnewblk;
5202	FREE_LOCK(ump);
5203}
5204
5205#define	BMSAFEMAP_HASH(ump, cg) \
5206      (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size])
5207
5208static int
5209bmsafemap_find(bmsafemaphd, cg, bmsafemapp)
5210	struct bmsafemap_hashhead *bmsafemaphd;
5211	int cg;
5212	struct bmsafemap **bmsafemapp;
5213{
5214	struct bmsafemap *bmsafemap;
5215
5216	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
5217		if (bmsafemap->sm_cg == cg)
5218			break;
5219	if (bmsafemap) {
5220		*bmsafemapp = bmsafemap;
5221		return (1);
5222	}
5223	*bmsafemapp = NULL;
5224
5225	return (0);
5226}
5227
5228/*
5229 * Find the bmsafemap associated with a cylinder group buffer.
5230 * If none exists, create one. The buffer must be locked when
5231 * this routine is called and this routine must be called with
5232 * the softdep lock held. To avoid giving up the lock while
5233 * allocating a new bmsafemap, a preallocated bmsafemap may be
5234 * provided. If it is provided but not needed, it is freed.
5235 */
5236static struct bmsafemap *
5237bmsafemap_lookup(mp, bp, cg, newbmsafemap)
5238	struct mount *mp;
5239	struct buf *bp;
5240	int cg;
5241	struct bmsafemap *newbmsafemap;
5242{
5243	struct bmsafemap_hashhead *bmsafemaphd;
5244	struct bmsafemap *bmsafemap, *collision;
5245	struct worklist *wk;
5246	struct ufsmount *ump;
5247
5248	ump = VFSTOUFS(mp);
5249	LOCK_OWNED(ump);
5250	KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer"));
5251	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5252		if (wk->wk_type == D_BMSAFEMAP) {
5253			if (newbmsafemap)
5254				WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5255			return (WK_BMSAFEMAP(wk));
5256		}
5257	}
5258	bmsafemaphd = BMSAFEMAP_HASH(ump, cg);
5259	if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) {
5260		if (newbmsafemap)
5261			WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5262		return (bmsafemap);
5263	}
5264	if (newbmsafemap) {
5265		bmsafemap = newbmsafemap;
5266	} else {
5267		FREE_LOCK(ump);
5268		bmsafemap = malloc(sizeof(struct bmsafemap),
5269			M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5270		workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5271		ACQUIRE_LOCK(ump);
5272	}
5273	bmsafemap->sm_buf = bp;
5274	LIST_INIT(&bmsafemap->sm_inodedephd);
5275	LIST_INIT(&bmsafemap->sm_inodedepwr);
5276	LIST_INIT(&bmsafemap->sm_newblkhd);
5277	LIST_INIT(&bmsafemap->sm_newblkwr);
5278	LIST_INIT(&bmsafemap->sm_jaddrefhd);
5279	LIST_INIT(&bmsafemap->sm_jnewblkhd);
5280	LIST_INIT(&bmsafemap->sm_freehd);
5281	LIST_INIT(&bmsafemap->sm_freewr);
5282	if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) {
5283		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
5284		return (collision);
5285	}
5286	bmsafemap->sm_cg = cg;
5287	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
5288	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
5289	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
5290	return (bmsafemap);
5291}
5292
5293/*
5294 * Direct block allocation dependencies.
5295 *
5296 * When a new block is allocated, the corresponding disk locations must be
5297 * initialized (with zeros or new data) before the on-disk inode points to
5298 * them.  Also, the freemap from which the block was allocated must be
5299 * updated (on disk) before the inode's pointer. These two dependencies are
5300 * independent of each other and are needed for all file blocks and indirect
5301 * blocks that are pointed to directly by the inode.  Just before the
5302 * "in-core" version of the inode is updated with a newly allocated block
5303 * number, a procedure (below) is called to setup allocation dependency
5304 * structures.  These structures are removed when the corresponding
5305 * dependencies are satisfied or when the block allocation becomes obsolete
5306 * (i.e., the file is deleted, the block is de-allocated, or the block is a
5307 * fragment that gets upgraded).  All of these cases are handled in
5308 * procedures described later.
5309 *
5310 * When a file extension causes a fragment to be upgraded, either to a larger
5311 * fragment or to a full block, the on-disk location may change (if the
5312 * previous fragment could not simply be extended). In this case, the old
5313 * fragment must be de-allocated, but not until after the inode's pointer has
5314 * been updated. In most cases, this is handled by later procedures, which
5315 * will construct a "freefrag" structure to be added to the workitem queue
5316 * when the inode update is complete (or obsolete).  The main exception to
5317 * this is when an allocation occurs while a pending allocation dependency
5318 * (for the same block pointer) remains.  This case is handled in the main
5319 * allocation dependency setup procedure by immediately freeing the
5320 * unreferenced fragments.
5321 */
5322void
5323softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5324	struct inode *ip;	/* inode to which block is being added */
5325	ufs_lbn_t off;		/* block pointer within inode */
5326	ufs2_daddr_t newblkno;	/* disk block number being added */
5327	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
5328	long newsize;		/* size of new block */
5329	long oldsize;		/* size of new block */
5330	struct buf *bp;		/* bp for allocated block */
5331{
5332	struct allocdirect *adp, *oldadp;
5333	struct allocdirectlst *adphead;
5334	struct freefrag *freefrag;
5335	struct inodedep *inodedep;
5336	struct pagedep *pagedep;
5337	struct jnewblk *jnewblk;
5338	struct newblk *newblk;
5339	struct mount *mp;
5340	ufs_lbn_t lbn;
5341
5342	lbn = bp->b_lblkno;
5343	mp = ITOVFS(ip);
5344	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5345	    ("softdep_setup_allocdirect called on non-softdep filesystem"));
5346	if (oldblkno && oldblkno != newblkno)
5347		/*
5348		 * The usual case is that a smaller fragment that
5349		 * was just allocated has been replaced with a bigger
5350		 * fragment or a full-size block. If it is marked as
5351		 * B_DELWRI, the current contents have not been written
5352		 * to disk. It is possible that the block was written
5353		 * earlier, but very uncommon. If the block has never
5354		 * been written, there is no need to send a BIO_DELETE
5355		 * for it when it is freed. The gain from avoiding the
5356		 * TRIMs for the common case of unwritten blocks far
5357		 * exceeds the cost of the write amplification for the
5358		 * uncommon case of failing to send a TRIM for a block
5359		 * that had been written.
5360		 */
5361		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
5362		    (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
5363	else
5364		freefrag = NULL;
5365
5366	CTR6(KTR_SUJ,
5367	    "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
5368	    "off %jd newsize %ld oldsize %d",
5369	    ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
5370	ACQUIRE_LOCK(ITOUMP(ip));
5371	if (off >= UFS_NDADDR) {
5372		if (lbn > 0)
5373			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
5374			    lbn, off);
5375		/* allocating an indirect block */
5376		if (oldblkno != 0)
5377			panic("softdep_setup_allocdirect: non-zero indir");
5378	} else {
5379		if (off != lbn)
5380			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
5381			    lbn, off);
5382		/*
5383		 * Allocating a direct block.
5384		 *
5385		 * If we are allocating a directory block, then we must
5386		 * allocate an associated pagedep to track additions and
5387		 * deletions.
5388		 */
5389		if ((ip->i_mode & IFMT) == IFDIR)
5390			pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
5391			    &pagedep);
5392	}
5393	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5394		panic("softdep_setup_allocdirect: lost block");
5395	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5396	    ("softdep_setup_allocdirect: newblk already initialized"));
5397	/*
5398	 * Convert the newblk to an allocdirect.
5399	 */
5400	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5401	adp = (struct allocdirect *)newblk;
5402	newblk->nb_freefrag = freefrag;
5403	adp->ad_offset = off;
5404	adp->ad_oldblkno = oldblkno;
5405	adp->ad_newsize = newsize;
5406	adp->ad_oldsize = oldsize;
5407
5408	/*
5409	 * Finish initializing the journal.
5410	 */
5411	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5412		jnewblk->jn_ino = ip->i_number;
5413		jnewblk->jn_lbn = lbn;
5414		add_to_journal(&jnewblk->jn_list);
5415	}
5416	if (freefrag && freefrag->ff_jdep != NULL &&
5417	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5418		add_to_journal(freefrag->ff_jdep);
5419	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5420	adp->ad_inodedep = inodedep;
5421
5422	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5423	/*
5424	 * The list of allocdirects must be kept in sorted and ascending
5425	 * order so that the rollback routines can quickly determine the
5426	 * first uncommitted block (the size of the file stored on disk
5427	 * ends at the end of the lowest committed fragment, or if there
5428	 * are no fragments, at the end of the highest committed block).
5429	 * Since files generally grow, the typical case is that the new
5430	 * block is to be added at the end of the list. We speed this
5431	 * special case by checking against the last allocdirect in the
5432	 * list before laboriously traversing the list looking for the
5433	 * insertion point.
5434	 */
5435	adphead = &inodedep->id_newinoupdt;
5436	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5437	if (oldadp == NULL || oldadp->ad_offset <= off) {
5438		/* insert at end of list */
5439		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5440		if (oldadp != NULL && oldadp->ad_offset == off)
5441			allocdirect_merge(adphead, adp, oldadp);
5442		FREE_LOCK(ITOUMP(ip));
5443		return;
5444	}
5445	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5446		if (oldadp->ad_offset >= off)
5447			break;
5448	}
5449	if (oldadp == NULL)
5450		panic("softdep_setup_allocdirect: lost entry");
5451	/* insert in middle of list */
5452	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5453	if (oldadp->ad_offset == off)
5454		allocdirect_merge(adphead, adp, oldadp);
5455
5456	FREE_LOCK(ITOUMP(ip));
5457}
5458
5459/*
5460 * Merge a newer and older journal record to be stored either in a
5461 * newblock or freefrag.  This handles aggregating journal records for
5462 * fragment allocation into a second record as well as replacing a
5463 * journal free with an aborted journal allocation.  A segment for the
5464 * oldest record will be placed on wkhd if it has been written.  If not
5465 * the segment for the newer record will suffice.
5466 */
5467static struct worklist *
5468jnewblk_merge(new, old, wkhd)
5469	struct worklist *new;
5470	struct worklist *old;
5471	struct workhead *wkhd;
5472{
5473	struct jnewblk *njnewblk;
5474	struct jnewblk *jnewblk;
5475
5476	/* Handle NULLs to simplify callers. */
5477	if (new == NULL)
5478		return (old);
5479	if (old == NULL)
5480		return (new);
5481	/* Replace a jfreefrag with a jnewblk. */
5482	if (new->wk_type == D_JFREEFRAG) {
5483		if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno)
5484			panic("jnewblk_merge: blkno mismatch: %p, %p",
5485			    old, new);
5486		cancel_jfreefrag(WK_JFREEFRAG(new));
5487		return (old);
5488	}
5489	if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK)
5490		panic("jnewblk_merge: Bad type: old %d new %d\n",
5491		    old->wk_type, new->wk_type);
5492	/*
5493	 * Handle merging of two jnewblk records that describe
5494	 * different sets of fragments in the same block.
5495	 */
5496	jnewblk = WK_JNEWBLK(old);
5497	njnewblk = WK_JNEWBLK(new);
5498	if (jnewblk->jn_blkno != njnewblk->jn_blkno)
5499		panic("jnewblk_merge: Merging disparate blocks.");
5500	/*
5501	 * The record may be rolled back in the cg.
5502	 */
5503	if (jnewblk->jn_state & UNDONE) {
5504		jnewblk->jn_state &= ~UNDONE;
5505		njnewblk->jn_state |= UNDONE;
5506		njnewblk->jn_state &= ~ATTACHED;
5507	}
5508	/*
5509	 * We modify the newer addref and free the older so that if neither
5510	 * has been written the most up-to-date copy will be on disk.  If
5511	 * both have been written but rolled back we only temporarily need
5512	 * one of them to fix the bits when the cg write completes.
5513	 */
5514	jnewblk->jn_state |= ATTACHED | COMPLETE;
5515	njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
5516	cancel_jnewblk(jnewblk, wkhd);
5517	WORKLIST_REMOVE(&jnewblk->jn_list);
5518	free_jnewblk(jnewblk);
5519	return (new);
5520}
5521
5522/*
5523 * Replace an old allocdirect dependency with a newer one.
5524 */
5525static void
5526allocdirect_merge(adphead, newadp, oldadp)
5527	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
5528	struct allocdirect *newadp;	/* allocdirect being added */
5529	struct allocdirect *oldadp;	/* existing allocdirect being checked */
5530{
5531	struct worklist *wk;
5532	struct freefrag *freefrag;
5533
5534	freefrag = NULL;
5535	LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp));
5536	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
5537	    newadp->ad_oldsize != oldadp->ad_newsize ||
5538	    newadp->ad_offset >= UFS_NDADDR)
5539		panic("%s %jd != new %jd || old size %ld != new %ld",
5540		    "allocdirect_merge: old blkno",
5541		    (intmax_t)newadp->ad_oldblkno,
5542		    (intmax_t)oldadp->ad_newblkno,
5543		    newadp->ad_oldsize, oldadp->ad_newsize);
5544	newadp->ad_oldblkno = oldadp->ad_oldblkno;
5545	newadp->ad_oldsize = oldadp->ad_oldsize;
5546	/*
5547	 * If the old dependency had a fragment to free or had never
5548	 * previously had a block allocated, then the new dependency
5549	 * can immediately post its freefrag and adopt the old freefrag.
5550	 * This action is done by swapping the freefrag dependencies.
5551	 * The new dependency gains the old one's freefrag, and the
5552	 * old one gets the new one and then immediately puts it on
5553	 * the worklist when it is freed by free_newblk. It is
5554	 * not possible to do this swap when the old dependency had a
5555	 * non-zero size but no previous fragment to free. This condition
5556	 * arises when the new block is an extension of the old block.
5557	 * Here, the first part of the fragment allocated to the new
5558	 * dependency is part of the block currently claimed on disk by
5559	 * the old dependency, so cannot legitimately be freed until the
5560	 * conditions for the new dependency are fulfilled.
5561	 */
5562	freefrag = newadp->ad_freefrag;
5563	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
5564		newadp->ad_freefrag = oldadp->ad_freefrag;
5565		oldadp->ad_freefrag = freefrag;
5566	}
5567	/*
5568	 * If we are tracking a new directory-block allocation,
5569	 * move it from the old allocdirect to the new allocdirect.
5570	 */
5571	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
5572		WORKLIST_REMOVE(wk);
5573		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
5574			panic("allocdirect_merge: extra newdirblk");
5575		WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
5576	}
5577	TAILQ_REMOVE(adphead, oldadp, ad_next);
5578	/*
5579	 * We need to move any journal dependencies over to the freefrag
5580	 * that releases this block if it exists.  Otherwise we are
5581	 * extending an existing block and we'll wait until that is
5582	 * complete to release the journal space and extend the
5583	 * new journal to cover this old space as well.
5584	 */
5585	if (freefrag == NULL) {
5586		if (oldadp->ad_newblkno != newadp->ad_newblkno)
5587			panic("allocdirect_merge: %jd != %jd",
5588			    oldadp->ad_newblkno, newadp->ad_newblkno);
5589		newadp->ad_block.nb_jnewblk = (struct jnewblk *)
5590		    jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
5591		    &oldadp->ad_block.nb_jnewblk->jn_list,
5592		    &newadp->ad_block.nb_jwork);
5593		oldadp->ad_block.nb_jnewblk = NULL;
5594		cancel_newblk(&oldadp->ad_block, NULL,
5595		    &newadp->ad_block.nb_jwork);
5596	} else {
5597		wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
5598		    &freefrag->ff_list, &freefrag->ff_jwork);
5599		freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
5600		    &freefrag->ff_jwork);
5601	}
5602	free_newblk(&oldadp->ad_block);
5603}
5604
5605/*
5606 * Allocate a jfreefrag structure to journal a single block free.
5607 */
5608static struct jfreefrag *
5609newjfreefrag(freefrag, ip, blkno, size, lbn)
5610	struct freefrag *freefrag;
5611	struct inode *ip;
5612	ufs2_daddr_t blkno;
5613	long size;
5614	ufs_lbn_t lbn;
5615{
5616	struct jfreefrag *jfreefrag;
5617	struct fs *fs;
5618
5619	fs = ITOFS(ip);
5620	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
5621	    M_SOFTDEP_FLAGS);
5622	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, ITOVFS(ip));
5623	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
5624	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
5625	jfreefrag->fr_ino = ip->i_number;
5626	jfreefrag->fr_lbn = lbn;
5627	jfreefrag->fr_blkno = blkno;
5628	jfreefrag->fr_frags = numfrags(fs, size);
5629	jfreefrag->fr_freefrag = freefrag;
5630
5631	return (jfreefrag);
5632}
5633
5634/*
5635 * Allocate a new freefrag structure.
5636 */
5637static struct freefrag *
5638newfreefrag(ip, blkno, size, lbn, key)
5639	struct inode *ip;
5640	ufs2_daddr_t blkno;
5641	long size;
5642	ufs_lbn_t lbn;
5643	u_long key;
5644{
5645	struct freefrag *freefrag;
5646	struct ufsmount *ump;
5647	struct fs *fs;
5648
5649	CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
5650	    ip->i_number, blkno, size, lbn);
5651	ump = ITOUMP(ip);
5652	fs = ump->um_fs;
5653	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
5654		panic("newfreefrag: frag size");
5655	freefrag = malloc(sizeof(struct freefrag),
5656	    M_FREEFRAG, M_SOFTDEP_FLAGS);
5657	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ump));
5658	freefrag->ff_state = ATTACHED;
5659	LIST_INIT(&freefrag->ff_jwork);
5660	freefrag->ff_inum = ip->i_number;
5661	freefrag->ff_vtype = ITOV(ip)->v_type;
5662	freefrag->ff_blkno = blkno;
5663	freefrag->ff_fragsize = size;
5664	freefrag->ff_key = key;
5665
5666	if (MOUNTEDSUJ(UFSTOVFS(ump))) {
5667		freefrag->ff_jdep = (struct worklist *)
5668		    newjfreefrag(freefrag, ip, blkno, size, lbn);
5669	} else {
5670		freefrag->ff_state |= DEPCOMPLETE;
5671		freefrag->ff_jdep = NULL;
5672	}
5673
5674	return (freefrag);
5675}
5676
5677/*
5678 * This workitem de-allocates fragments that were replaced during
5679 * file block allocation.
5680 */
5681static void
5682handle_workitem_freefrag(freefrag)
5683	struct freefrag *freefrag;
5684{
5685	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
5686	struct workhead wkhd;
5687
5688	CTR3(KTR_SUJ,
5689	    "handle_workitem_freefrag: ino %d blkno %jd size %ld",
5690	    freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
5691	/*
5692	 * It would be illegal to add new completion items to the
5693	 * freefrag after it was schedule to be done so it must be
5694	 * safe to modify the list head here.
5695	 */
5696	LIST_INIT(&wkhd);
5697	ACQUIRE_LOCK(ump);
5698	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
5699	/*
5700	 * If the journal has not been written we must cancel it here.
5701	 */
5702	if (freefrag->ff_jdep) {
5703		if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
5704			panic("handle_workitem_freefrag: Unexpected type %d\n",
5705			    freefrag->ff_jdep->wk_type);
5706		cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
5707	}
5708	FREE_LOCK(ump);
5709	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
5710	   freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype,
5711	   &wkhd, freefrag->ff_key);
5712	ACQUIRE_LOCK(ump);
5713	WORKITEM_FREE(freefrag, D_FREEFRAG);
5714	FREE_LOCK(ump);
5715}
5716
5717/*
5718 * Set up a dependency structure for an external attributes data block.
5719 * This routine follows much of the structure of softdep_setup_allocdirect.
5720 * See the description of softdep_setup_allocdirect above for details.
5721 */
5722void
5723softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5724	struct inode *ip;
5725	ufs_lbn_t off;
5726	ufs2_daddr_t newblkno;
5727	ufs2_daddr_t oldblkno;
5728	long newsize;
5729	long oldsize;
5730	struct buf *bp;
5731{
5732	struct allocdirect *adp, *oldadp;
5733	struct allocdirectlst *adphead;
5734	struct freefrag *freefrag;
5735	struct inodedep *inodedep;
5736	struct jnewblk *jnewblk;
5737	struct newblk *newblk;
5738	struct mount *mp;
5739	struct ufsmount *ump;
5740	ufs_lbn_t lbn;
5741
5742	mp = ITOVFS(ip);
5743	ump = VFSTOUFS(mp);
5744	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5745	    ("softdep_setup_allocext called on non-softdep filesystem"));
5746	KASSERT(off < UFS_NXADDR,
5747	    ("softdep_setup_allocext: lbn %lld > UFS_NXADDR", (long long)off));
5748
5749	lbn = bp->b_lblkno;
5750	if (oldblkno && oldblkno != newblkno)
5751		/*
5752		 * The usual case is that a smaller fragment that
5753		 * was just allocated has been replaced with a bigger
5754		 * fragment or a full-size block. If it is marked as
5755		 * B_DELWRI, the current contents have not been written
5756		 * to disk. It is possible that the block was written
5757		 * earlier, but very uncommon. If the block has never
5758		 * been written, there is no need to send a BIO_DELETE
5759		 * for it when it is freed. The gain from avoiding the
5760		 * TRIMs for the common case of unwritten blocks far
5761		 * exceeds the cost of the write amplification for the
5762		 * uncommon case of failing to send a TRIM for a block
5763		 * that had been written.
5764		 */
5765		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
5766		    (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
5767	else
5768		freefrag = NULL;
5769
5770	ACQUIRE_LOCK(ump);
5771	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5772		panic("softdep_setup_allocext: lost block");
5773	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5774	    ("softdep_setup_allocext: newblk already initialized"));
5775	/*
5776	 * Convert the newblk to an allocdirect.
5777	 */
5778	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5779	adp = (struct allocdirect *)newblk;
5780	newblk->nb_freefrag = freefrag;
5781	adp->ad_offset = off;
5782	adp->ad_oldblkno = oldblkno;
5783	adp->ad_newsize = newsize;
5784	adp->ad_oldsize = oldsize;
5785	adp->ad_state |=  EXTDATA;
5786
5787	/*
5788	 * Finish initializing the journal.
5789	 */
5790	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5791		jnewblk->jn_ino = ip->i_number;
5792		jnewblk->jn_lbn = lbn;
5793		add_to_journal(&jnewblk->jn_list);
5794	}
5795	if (freefrag && freefrag->ff_jdep != NULL &&
5796	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5797		add_to_journal(freefrag->ff_jdep);
5798	inodedep_lookup(mp, ip->i_number,