/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/t_lock.h>
#include <sys/uio.h>
#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/vfs.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/cmn_err.h>
#include <sys/file.h>
#include <sys/debug.h>


extern kmutex_t ufsvfs_mutex;
extern struct ufsvfs *ufs_instances;
/*
 * hlock any file systems with errored logs
 */
int
ufs_trans_hlock()
{
	struct ufsvfs	*ufsvfsp;
	struct lockfs	lockfs;
	int		error;
	int		retry	= 0;

	/*
	 * find fs's that panicked or have errored logging devices
	 */
	mutex_enter(&ufsvfs_mutex);
	for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next) {
		/*
		 * not mounted; continue
		 */
		if ((ufsvfsp->vfs_vfs == NULL) ||
		    (ufsvfsp->vfs_validfs == UT_UNMOUNTED))
			continue;
		/*
		 * disallow unmounts (hlock occurs below)
		 */
		if (TRANS_ISERROR(ufsvfsp))
			ufsvfsp->vfs_validfs = UT_HLOCKING;
	}
	mutex_exit(&ufsvfs_mutex);

	/*
	 * hlock the fs's that panicked or have errored logging devices
	 */
again:
	mutex_enter(&ufsvfs_mutex);
	for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next)
		if (ufsvfsp->vfs_validfs == UT_HLOCKING)
			break;
	mutex_exit(&ufsvfs_mutex);
	if (ufsvfsp == NULL)
		return (retry);
	/*
	 * hlock the file system
	 */
	(void) ufs_fiolfss(ufsvfsp->vfs_root, &lockfs);
	if (!LOCKFS_IS_ELOCK(&lockfs)) {
		lockfs.lf_lock = LOCKFS_HLOCK;
		lockfs.lf_flags = 0;
		lockfs.lf_comlen = 0;
		lockfs.lf_comment = NULL;
		error = ufs_fiolfs(ufsvfsp->vfs_root, &lockfs, 0);
		/*
		 * retry after a while; another app is currently doing lockfs
		 */
		if (error == EBUSY || error == EINVAL)
			retry = 1;
	} else {
		if (ufsfx_get_failure_qlen() > 0) {
			if (mutex_tryenter(&ufs_fix.uq_mutex)) {
				ufs_fix.uq_lowat = ufs_fix.uq_ne;
				cv_broadcast(&ufs_fix.uq_cv);
				mutex_exit(&ufs_fix.uq_mutex);
			}
		}
		retry = 1;
	}

	/*
	 * allow unmounts
	 */
	ufsvfsp->vfs_validfs = UT_MOUNTED;
	goto again;
}

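/*
 * A log error was detected; prod the hlock queue to its low-water
 * mark and wake its thread so that ufs_trans_hlock() runs and
 * hard-locks the affected file systems.
 */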
/*ARGSUSED*/
void
ufs_trans_onerror()
{
	mutex_enter(&ufs_hlock.uq_mutex);
	ufs_hlock.uq_ne = ufs_hlock.uq_lowat;
	cv_broadcast(&ufs_hlock.uq_cv);
	mutex_exit(&ufs_hlock.uq_mutex);
}

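/*
 * Update the superblock, wrapping sbupdate() in an async transaction
 * unless this thread is already inside one (T_DONTBLOCK set).
 */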
void
ufs_trans_sbupdate(struct ufsvfs *ufsvfsp, struct vfs *vfsp, top_t topid)
{
	if (curthread->t_flag & T_DONTBLOCK) {
		sbupdate(vfsp);
		return;
	} else {

		if (panicstr && TRANS_ISTRANS(ufsvfsp))
			return;

		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
		sbupdate(vfsp);
		TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
		curthread->t_flag &= ~T_DONTBLOCK;
	}
}

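/*
 * Flush the in-core inode to its on-disk dinode, wrapping ufs_iupdat()
 * in an async transaction unless this thread is already inside one.
 */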
void
ufs_trans_iupdat(struct inode *ip, int waitfor)
{
	struct ufsvfs	*ufsvfsp;

	if (curthread->t_flag & T_DONTBLOCK) {
		rw_enter(&ip->i_contents, RW_READER);
		ufs_iupdat(ip, waitfor);
		rw_exit(&ip->i_contents);
		return;
	} else {
		ufsvfsp = ip->i_ufsvfs;

		if (panicstr && TRANS_ISTRANS(ufsvfsp))
			return;

		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
		rw_enter(&ip->i_contents, RW_READER);
		ufs_iupdat(ip, waitfor);
		rw_exit(&ip->i_contents);
		TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
		curthread->t_flag &= ~T_DONTBLOCK;
	}
}

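/*
 * Write the superblock under vfs_lock, wrapping ufs_sbwrite() in an
 * async transaction unless this thread is already inside one.
 */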
void
ufs_trans_sbwrite(struct ufsvfs *ufsvfsp, top_t topid)
{
	if (curthread->t_flag & T_DONTBLOCK) {
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_sbwrite(ufsvfsp);
		mutex_exit(&ufsvfsp->vfs_lock);
		return;
	} else {

		if (panicstr && TRANS_ISTRANS(ufsvfsp))
			return;

		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_sbwrite(ufsvfsp);
		mutex_exit(&ufsvfsp->vfs_lock);
		TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
		curthread->t_flag &= ~T_DONTBLOCK;
	}
}

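/*
 * Log-roll callback: write the cylinder-group summary information
 * (fs_csp) into the log so its deltas can be rolled.
 */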
/*ARGSUSED*/
int
ufs_trans_push_si(ufsvfs_t *ufsvfsp, delta_t dtyp, int ignore)
{
	struct fs	*fs;

	fs = ufsvfsp->vfs_fs;
	mutex_enter(&ufsvfsp->vfs_lock);
	TRANS_LOG(ufsvfsp, (char *)fs->fs_u.fs_csp,
	    ldbtob(fsbtodb(fs, fs->fs_csaddr)), fs->fs_cssize,
	    (caddr_t)fs->fs_u.fs_csp, fs->fs_cssize);
	mutex_exit(&ufsvfsp->vfs_lock);
	return (0);
}

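/*
 * Log-roll callback: push the delayed-write buffer for this block to
 * disk, if one is cached; ENOENT tells the caller there was nothing
 * to push.
 */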
/*ARGSUSED*/
int
ufs_trans_push_buf(ufsvfs_t *ufsvfsp, delta_t dtyp, daddr_t bno)
{
	struct buf	*bp;

	bp = (struct buf *)UFS_GETBLK(ufsvfsp, ufsvfsp->vfs_dev, bno, 1);
	if (bp == NULL)
		return (ENOENT);

	if (bp->b_flags & B_DELWRI) {
		/*
		 * Do not use brwrite() here since the buffer is already
		 * marked for retry or not by the code that called
		 * TRANS_BUF().
		 */
		UFS_BWRITE(ufsvfsp, bp);
		return (0);
	}
	/*
	 * If we did not find the real buf for this block above then
	 * clear the dev so the buf won't be found by mistake
	 * for this block later.  We had to allocate at least a 1 byte
	 * buffer to keep brelse happy.
	 */
	if (bp->b_bufsize == 1) {
		bp->b_dev = (o_dev_t)NODEV;
		bp->b_edev = NODEV;
		bp->b_flags = 0;
	}
	brelse(bp);
	return (ENOENT);
}

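/*
 * Log-roll callback: if the inode has pending changes, flush it to
 * disk; ENOENT tells the caller there was nothing to push.
 */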
/*ARGSUSED*/
int
ufs_trans_push_inode(ufsvfs_t *ufsvfsp, delta_t dtyp, ino_t ino)
{
	int		error;
	struct inode	*ip;

	/*
	 * Grab the quota lock (if the file system has not been forcibly
	 * unmounted).
	 */
	if (ufsvfsp)
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);

	error = ufs_iget(ufsvfsp->vfs_vfs, ino, &ip, kcred);

	if (ufsvfsp)
		rw_exit(&ufsvfsp->vfs_dqrwlock);
	if (error)
		return (ENOENT);

	if (ip->i_flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) {
		rw_enter(&ip->i_contents, RW_READER);
		ufs_iupdat(ip, 1);
		rw_exit(&ip->i_contents);
		VN_RELE(ITOV(ip));
		return (0);
	}
	VN_RELE(ITOV(ip));
	return (ENOENT);
}

#ifdef DEBUG
/*
 *	These routines maintain the metadata map (matamap)
 */

/*
 * update the metadata map at mount
 */
static int
ufs_trans_mata_mount_scan(struct inode *ip, void *arg)
{
	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * load the metadata map
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ufs_trans_mata_iget(ip);
	rw_exit(&ip->i_contents);
	return (0);
}

void
ufs_trans_mata_mount(struct ufsvfs *ufsvfsp)
{
	struct fs	*fs	= ufsvfsp->vfs_fs;
	ino_t		ino;
	int		i;

	/*
	 * put static metadata into matamap
	 *	superblock
	 *	cylinder groups
	 *	inode groups
	 *	existing inodes
	 */
	TRANS_MATAADD(ufsvfsp, ldbtob(SBLOCK), fs->fs_sbsize);

	for (ino = i = 0; i < fs->fs_ncg; ++i, ino += fs->fs_ipg) {
		TRANS_MATAADD(ufsvfsp,
		    ldbtob(fsbtodb(fs, cgtod(fs, i))), fs->fs_cgsize);
		TRANS_MATAADD(ufsvfsp,
		    ldbtob(fsbtodb(fs, itod(fs, ino))),
		    fs->fs_ipg * sizeof (struct dinode));
	}
	(void) ufs_scan_inodes(0, ufs_trans_mata_mount_scan, ufsvfsp, ufsvfsp);
}

/*
 * clear the metadata map at umount
 */
void
ufs_trans_mata_umount(struct ufsvfs *ufsvfsp)
{
	top_mataclr(ufsvfsp);
}

/*
 * summary info (may be extended during growfs test)
 */
void
ufs_trans_mata_si(struct ufsvfs *ufsvfsp, struct fs *fs)
{
	TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, fs->fs_csaddr)),
	    fs->fs_cssize);
}

/*
 * scan an allocation block (either inode or true block)
 */
static void
ufs_trans_mata_direct(
	struct inode *ip,
	daddr_t *fragsp,
	daddr32_t *blkp,
	unsigned int nblk)
{
	int		i;
	daddr_t		frag;
	ulong_t		nb;
	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
	struct fs	*fs		= ufsvfsp->vfs_fs;

	for (i = 0; i < nblk && *fragsp; ++i, ++blkp)
		if ((frag = *blkp) != 0) {
			if (*fragsp > fs->fs_frag) {
				nb = fs->fs_bsize;
				*fragsp -= fs->fs_frag;
			} else {
				nb = *fragsp * fs->fs_fsize;
				*fragsp = 0;
			}
			TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
		}
}

/*
 * scan an indirect allocation block (either inode or true block)
 */
static void
ufs_trans_mata_indir(
	struct inode *ip,
	daddr_t *fragsp,
	daddr_t frag,
	int level)
{
	struct ufsvfs *ufsvfsp	= ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	int ne = fs->fs_bsize / (int)sizeof (daddr32_t);
	int i;
	struct buf *bp;
	daddr32_t *blkp;
	o_mode_t ifmt = ip->i_mode & IFMT;

	bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, frag), fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return;
	}
	blkp = bp->b_un.b_daddr;

	if (level || (ifmt == IFDIR) || (ifmt == IFSHAD) ||
	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod))
		ufs_trans_mata_direct(ip, fragsp, blkp, ne);

	if (level)
		for (i = 0; i < ne && *fragsp; ++i, ++blkp)
			ufs_trans_mata_indir(ip, fragsp, *blkp, level - 1);
	brelse(bp);
}

/*
 * put appropriate metadata into matamap for this inode
 */
void
ufs_trans_mata_iget(struct inode *ip)
{
	int		i;
	daddr_t		frags	= dbtofsb(ip->i_fs, ip->i_blocks);
	o_mode_t	ifmt	= ip->i_mode & IFMT;

	if (frags && ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
		ufs_trans_mata_direct(ip, &frags, &ip->i_db[0], NDADDR);

	if (frags)
		ufs_trans_mata_direct(ip, &frags, &ip->i_ib[0], NIADDR);

	for (i = 0; i < NIADDR && frags; ++i)
		if (ip->i_ib[i])
			ufs_trans_mata_indir(ip, &frags, ip->i_ib[i], i);
}

/*
 * freeing possible metadata (block of user data)
 */
void
ufs_trans_mata_free(struct ufsvfs *ufsvfsp, offset_t mof, off_t nb)
{
	top_matadel(ufsvfsp, mof, nb);
}

/*
 * allocating metadata
 */
void
ufs_trans_mata_alloc(
	struct ufsvfs *ufsvfsp,
	struct inode *ip,
	daddr_t frag,
	ulong_t nb,
	int indir)
{
	struct fs	*fs	= ufsvfsp->vfs_fs;
	o_mode_t	ifmt	= ip->i_mode & IFMT;

	if (indir || ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
		TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
}

#endif /* DEBUG */

/*
 * ufs_trans_dir is used to declare a directory delta
 */
int
ufs_trans_dir(struct inode *ip, off_t offset)
{
	daddr_t	bn;
	int	contig = 0, error;

	ASSERT(ip);
	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	error = bmap_read(ip, (u_offset_t)offset, &bn, &contig);
	if (error || (bn == UFS_HOLE)) {
		cmn_err(CE_WARN, "ufs_trans_dir - could not get block"
		    " number error = %d bn = %d\n", error, (int)bn);
		if (error == 0)	/* treat UFS_HOLE as an I/O error */
			error = EIO;
		return (error);
	}
	TRANS_DELTA(ip->i_ufsvfs, ldbtob(bn), DIRBLKSIZ, DT_DIR, 0, 0);
	return (error);
}

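/*
 * Log-roll callback for quota records: log the dquot's on-disk dqblk,
 * then clear the DQ_TRANS marker and drop the reference taken by
 * ufs_trans_quota().
 */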
/*ARGSUSED*/
int
ufs_trans_push_quota(ufsvfs_t *ufsvfsp, delta_t dtyp, struct dquot *dqp)
{
	/*
	 * Lock the quota subsystem (ufsvfsp can be NULL
	 * if DQ_ERROR is set).
	 */
	if (ufsvfsp)
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	mutex_enter(&dqp->dq_lock);

	/*
	 * If this transaction has been cancelled by closedq_scan_inode(),
	 * then bail out now.  We don't call dqput() in this case because
	 * it has already been done.
	 */
	if ((dqp->dq_flags & DQ_TRANS) == 0) {
		mutex_exit(&dqp->dq_lock);
		if (ufsvfsp)
			rw_exit(&ufsvfsp->vfs_dqrwlock);
		return (0);
	}

	if (dqp->dq_flags & DQ_ERROR) {
		/*
		 * Paranoia to make sure that there is at least one
		 * reference to the dquot struct.  We are done with
		 * the dquot (due to an error) so clear logging
		 * specific markers.
		 */
		ASSERT(dqp->dq_cnt >= 1);
		dqp->dq_flags &= ~DQ_TRANS;
		dqput(dqp);
		mutex_exit(&dqp->dq_lock);
		if (ufsvfsp)
			rw_exit(&ufsvfsp->vfs_dqrwlock);
		return (1);
	}

	if (dqp->dq_flags & (DQ_MOD | DQ_BLKS | DQ_FILES)) {
		ASSERT((dqp->dq_mof != UFS_HOLE) && (dqp->dq_mof != 0));
		TRANS_LOG(ufsvfsp, (caddr_t)&dqp->dq_dqb,
		    dqp->dq_mof, (int)sizeof (struct dqblk), NULL, 0);
		/*
		 * Paranoia to make sure that there is at least one
		 * reference to the dquot struct.  Clear the
		 * modification flag because the operation is now in
		 * the log.  Also clear the logging specific markers
		 * that were set in ufs_trans_quota().
		 */
		ASSERT(dqp->dq_cnt >= 1);
		dqp->dq_flags &= ~(DQ_MOD | DQ_TRANS);
		dqput(dqp);
	}

	/*
	 * At this point, the logging specific flag should be clear,
	 * but add paranoia just in case something has gone wrong.
	 */
	ASSERT((dqp->dq_flags & DQ_TRANS) == 0);
	mutex_exit(&dqp->dq_lock);
	if (ufsvfsp)
		rw_exit(&ufsvfsp->vfs_dqrwlock);
	return (0);
}

/*
 * ufs_trans_quota marks a modified dquot as part of a logging file
 * system operation and declares the delta for its on-disk record.
 */
/*ARGSUSED*/
void
ufs_trans_quota(struct dquot *dqp)
{

	struct inode	*qip = dqp->dq_ufsvfsp->vfs_qinod;

	ASSERT(qip);
	ASSERT(MUTEX_HELD(&dqp->dq_lock));
	ASSERT(dqp->dq_flags & DQ_MOD);
	ASSERT(dqp->dq_mof != 0);
	ASSERT(dqp->dq_mof != UFS_HOLE);

	/*
	 * Mark this dquot to indicate that we are starting a logging
	 * file system operation for this dquot.  Also increment the
	 * reference count so that the dquot does not get reused while
	 * it is on the mapentry_t list.  DQ_TRANS is cleared and the
	 * reference count is decremented by ufs_trans_push_quota().
	 *
	 * If the file system is force-unmounted while there is a
	 * pending quota transaction, then closedq_scan_inode() will
	 * clear the DQ_TRANS flag and decrement the reference count.
	 *
	 * Since deltamap_add() drops multiple transactions to the
	 * same dq_mof and ufs_trans_push_quota() won't get called,
	 * we use DQ_TRANS to prevent repeat transactions from
	 * incrementing the reference count (or calling TRANS_DELTA()).
	 */
	if ((dqp->dq_flags & DQ_TRANS) == 0) {
		dqp->dq_flags |= DQ_TRANS;
		dqp->dq_cnt++;
		TRANS_DELTA(qip->i_ufsvfs, dqp->dq_mof, sizeof (struct dqblk),
		    DT_QR, ufs_trans_push_quota, (ulong_t)dqp);
	}
}

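/*
 * Release a dquot reference inside its own async transaction, since
 * dqrele() may write the modified on-disk quota record.
 */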
void
ufs_trans_dqrele(struct dquot *dqp)
{
	struct ufsvfs	*ufsvfsp = dqp->dq_ufsvfsp;

	curthread->t_flag |= T_DONTBLOCK;
	TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	dqrele(dqp);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
	curthread->t_flag &= ~T_DONTBLOCK;
}

int ufs_trans_max_resv = TOP_MAX_RESV;	/* will be adjusted for testing */
long ufs_trans_avgbfree = 0;		/* will be adjusted for testing */
#define	TRANS_MAX_WRITE	(1024 * 1024)
size_t ufs_trans_max_resid = TRANS_MAX_WRITE;

/*
 * Calculate the log reservation for the given write or truncate
 */
static ulong_t
ufs_log_amt(struct inode *ip, offset_t offset, ssize_t resid, int trunc)
{
	long		ncg, last2blk;
	long		niblk		= 0;
	u_offset_t	writeend, offblk;
	int		resv;
	daddr_t		nblk, maxfblk;
	long		avgbfree;
	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
	struct fs	*fs		= ufsvfsp->vfs_fs;
	long		fni		= NINDIR(fs);
	int		bsize		= fs->fs_bsize;

	/*
	 * Assume that the request will fit in 1 or 2 cg's;
	 * resv is the amount of log space to reserve (in bytes).
	 */
	resv = SIZECG(ip) * 2 + INODESIZE + 1024;

	/*
	 * get max position of write in fs blocks
	 */
	writeend = offset + resid;
	maxfblk = lblkno(fs, writeend);
	offblk = lblkno(fs, offset);
	/*
	 * request size in fs blocks
	 */
	nblk = lblkno(fs, blkroundup(fs, resid));
	/*
	 * Adjust for sparse files
	 */
	if (trunc)
		nblk = MIN(nblk, ip->i_blocks);

	/*
	 * Adjust avgbfree (for testing)
	 */
	avgbfree = (ufs_trans_avgbfree) ? 1 : ufsvfsp->vfs_avgbfree + 1;

	/*
	 * Calculate maximum number of blocks of triple indirect
	 * pointers to write.
	 */
	last2blk = NDADDR + fni + fni * fni;
	if (maxfblk > last2blk) {
		long nl2ptr;
		long n3blk;

		if (offblk > last2blk)
			n3blk = maxfblk - offblk;
		else
			n3blk = maxfblk - last2blk;
		niblk += roundup(n3blk * sizeof (daddr_t), bsize) / bsize + 1;
		nl2ptr = roundup(niblk, fni) / fni + 1;
		niblk += roundup(nl2ptr * sizeof (daddr_t), bsize) / bsize + 2;
		maxfblk -= n3blk;
	}
	/*
	 * Calculate maximum number of blocks of double indirect
	 * pointers to write.
	 */
	if (maxfblk > NDADDR + fni) {
		long n2blk;

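		/*
		 * Note: in the else branch below, maxfblk - NDADDR + fni
		 * parses as (maxfblk - NDADDR) + fni, which overstates
		 * the double-indirect block count; for a reservation
		 * estimate, erring on the high side is harmless.
		 */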
		if (offblk > NDADDR + fni)
			n2blk = maxfblk - offblk;
		else
			n2blk = maxfblk - NDADDR + fni;
		niblk += roundup(n2blk * sizeof (daddr_t), bsize) / bsize + 2;
		maxfblk -= n2blk;
	}
	/*
	 * Add in indirect pointer block write
	 */
	if (maxfblk > NDADDR) {
		niblk += 1;
	}
	/*
	 * Calculate deltas for indirect pointer writes
	 */
	resv += niblk * (fs->fs_bsize + sizeof (struct delta));
	/*
	 * maximum number of cg's needed for request
	 */
	ncg = nblk / avgbfree;
	if (ncg > fs->fs_ncg)
		ncg = fs->fs_ncg;

	/*
	 * maximum amount of log space needed for request
	 */
	if (ncg > 2)
		resv += (ncg - 2) * SIZECG(ip);

	return (resv);
}

/*
 * Calculate the amount of log space that needs to be reserved for this
 * trunc request.  If the amount of log space is too large, then
 * calculate the size that the request needs to be split into.
 */
void
ufs_trans_trunc_resv(
	struct inode *ip,
	u_offset_t length,
	int *resvp,
	u_offset_t *residp)
{
	ulong_t		resv;
	u_offset_t	size, offset, resid;
	int		nchunks, flag;

	/*
	 *    *resvp is the amount of log space to reserve (in bytes).
	 *    when nonzero, *residp is the number of bytes to truncate.
	 */
	*residp = 0;

	if (length < ip->i_size) {
		size = ip->i_size - length;
	} else {
		resv = SIZECG(ip) * 2 + INODESIZE + 1024;
		/*
		 * truncating up doesn't really use much space;
		 * the default above should be sufficient.
		 */
		goto done;
	}

	offset = length;
	resid = size;
	nchunks = 1;
	flag = 0;

	/*
	 * If this request takes too much log space, it will be split into
	 * "nchunks".  If this split is not enough, linearly increment
	 * nchunks in the next iteration.
	 */
	for (; (resv = ufs_log_amt(ip, offset, resid, 1)) > ufs_trans_max_resv;
	    offset = length + (nchunks - 1) * resid) {
		if (!flag) {
			nchunks = roundup(resv, ufs_trans_max_resv) /
			    ufs_trans_max_resv;
			flag = 1;
		} else {
			nchunks++;
		}
		resid = size / nchunks;
	}

	if (nchunks > 1) {
		*residp = resid;
	}
done:
	*resvp = resv;
}

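/*
 * Truncate an inode under the logging protocol, splitting the
 * truncation into multiple transactions when a single one would need
 * too large a log reservation.
 */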
int
ufs_trans_itrunc(struct inode *ip, u_offset_t length, int flags, cred_t *cr)
{
	int		err, issync, resv;
	u_offset_t	resid;
	int		do_block	= 0;
	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
	struct fs	*fs		= ufsvfsp->vfs_fs;

	/*
	 * Not logging; just do the trunc
	 */
	if (!TRANS_ISTRANS(ufsvfsp)) {
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
		rw_enter(&ip->i_contents, RW_WRITER);
		err = ufs_itrunc(ip, length, flags, cr);
		rw_exit(&ip->i_contents);
		rw_exit(&ufsvfsp->vfs_dqrwlock);
		return (err);
	}

	/*
	 * within the lockfs protocol but *not* part of a transaction
	 */
	do_block = curthread->t_flag & T_DONTBLOCK;
	curthread->t_flag |= T_DONTBLOCK;

	/*
	 * Trunc the file (in pieces, if necessary)
	 */
again:
	ufs_trans_trunc_resv(ip, length, &resv, &resid);
	TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ITRUNC, resv);
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&ip->i_contents, RW_WRITER);
	if (resid) {
		/*
		 * resid is only set if we have to truncate in chunks
		 */
		ASSERT(length + resid < ip->i_size);

		/*
		 * Partially trunc file down to desired size (length).
		 * Only retain I_FREE on the last partial trunc.
		 * Round up size to a block boundary, to ensure the truncate
		 * doesn't have to allocate blocks. This is done both for
		 * performance and to fix a bug where if the block can't be
		 * allocated then the inode delete fails, but the inode
		 * is still freed with attached blocks and non-zero size
		 * (bug 4348738).
		 */
		err = ufs_itrunc(ip, blkroundup(fs, (ip->i_size - resid)),
		    flags & ~I_FREE, cr);
		ASSERT(ip->i_size != length);
	} else
		err = ufs_itrunc(ip, length, flags, cr);
	if (!do_block)
		curthread->t_flag &= ~T_DONTBLOCK;
	rw_exit(&ip->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ITRUNC, resv);

	if ((err == 0) && resid) {
		ufsvfsp->vfs_avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
		goto again;
	}
	return (err);
}


/*
 * Calculate the amount of log space that needs to be reserved for this
 * write request.  If the amount of log space is too large, then
 * calculate the size that the request needs to be split into.
 * First try fixed chunks of size ufs_trans_max_resid.  If that
 * is too big, iterate down to the largest size that will fit.
 * Pagein the pages in the first chunk here, so that the pagein is
 * avoided later when the transaction is open.
 */
void
ufs_trans_write_resv(
	struct inode *ip,
	struct uio *uio,
	int *resvp,
	int *residp)
{
	ulong_t		resv;
	offset_t	offset;
	ssize_t		resid;
	int		nchunks;

	*residp = 0;
	offset = uio->uio_offset;
	resid = MIN(uio->uio_resid, ufs_trans_max_resid);
	resv = ufs_log_amt(ip, offset, resid, 0);
	if (resv <= ufs_trans_max_resv) {
		uio_prefaultpages(resid, uio);
		if (resid != uio->uio_resid)
			*residp = resid;
		*resvp = resv;
		return;
	}

	resid = uio->uio_resid;
	nchunks = 1;
	for (; (resv = ufs_log_amt(ip, offset, resid, 0)) > ufs_trans_max_resv;
	    offset = uio->uio_offset + (nchunks - 1) * resid) {
		nchunks++;
		resid = uio->uio_resid / nchunks;
	}
	uio_prefaultpages(resid, uio);
	/*
	 * If this request takes too much log space, it will be split
	 */
	if (nchunks > 1)
		*residp = resid;
	*resvp = resv;
}

/*
 * Issue write request.
 *
 * Split a large request into smaller chunks.
 */
int
ufs_trans_write(
	struct inode *ip,
	struct uio *uio,
	int ioflag,
	cred_t *cr,
	int resv,
	long resid)
{
	long		realresid;
	int		err;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;

	/*
	 * Since the write is too big and would "HOG THE LOG", it is
	 * broken up and done in pieces.  Note: the caller will issue
	 * the EOT after the request has been completed.
	 */
	realresid = uio->uio_resid;

again:
	/*
	 * Perform partial request (uiomove will update uio for us)
	 *	Request is split up into "resid" size chunks until
	 *	"realresid" bytes have been transferred.
	 */
	uio->uio_resid = MIN(resid, realresid);
	realresid -= uio->uio_resid;
	err = wrip(ip, uio, ioflag, cr);

	/*
	 * Error or request is done; caller issues final EOT
	 */
	if (err || uio->uio_resid || (realresid == 0)) {
		uio->uio_resid += realresid;
		return (err);
	}

	/*
	 * Generate EOT for this part of the request
	 */
	rw_exit(&ip->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	if (ioflag & (FSYNC|FDSYNC)) {
		TRANS_END_SYNC(ufsvfsp, err, TOP_WRITE_SYNC, resv);
	} else {
		TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
	}

	/*
	 * Make sure the input buffer is resident before starting
	 * the next transaction.
	 */
	uio_prefaultpages(MIN(resid, realresid), uio);

	/*
	 * Generate BOT for next part of the request
	 */
	if (ioflag & (FSYNC|FDSYNC)) {
		int error;
		TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv, error);
		ASSERT(!error);
	} else {
		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
	}
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&ip->i_contents, RW_WRITER);
	/*
	 * Error during EOT (probably device error while writing commit rec)
	 */
	if (err)
		return (err);
	goto again;
}