1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/sysmacros.h>
29 #include <sys/conf.h>
30 #include <sys/fssnap_if.h>
31 #include <sys/fs/ufs_inode.h>
32 #include <sys/fs/ufs_lockfs.h>
33 #include <sys/fs/ufs_log.h>
34 #include <sys/fs/ufs_trans.h>
35 #include <sys/cmn_err.h>
36 #include <vm/pvn.h>
37 #include <vm/seg_map.h>
38 #include <sys/fdbuffer.h>
39 
/*
 * DEBUGF - conditional debug logging.
 *
 * The argument is a complete, parenthesized cmn_err() argument list, e.g.
 *	DEBUGF((CE_CONT, "?value %d\n", value));
 * In DEBUG builds the message is emitted only while evn_ufs_debug is
 * non-zero; in non-DEBUG builds the macro expands to an empty expression.
 *
 * Both forms expand to exactly one statement once the caller's trailing
 * semicolon is added, so the macro is safe in an unbraced if/else.
 */
#ifdef DEBUG
int evn_ufs_debug = 0;
#define	DEBUGF(args)	do { if (evn_ufs_debug) cmn_err args; } while (0)
#else
#define	DEBUGF(args)	((void)0)
#endif
46 
47 /*
48  * ufs_rdwr_data - supports reading or writing data when
49  * no changes are permitted in file size or space allocation.
50  *
51  * Inputs:
52  * fdb - The mandatory fdbuffer supports
53  *	the read or write operation.
54  * flags - defaults (zero value) to synchronous write
55  *	B_READ - indicates read operation
56  *	B_ASYNC - indicates perform operation asynchronously
57  */
58 /*ARGSUSED*/
59 int
ufs_rdwr_data(vnode_t * vnodep,u_offset_t offset,size_t len,fdbuffer_t * fdbp,int flags,cred_t * credp)60 ufs_rdwr_data(
61 	vnode_t		*vnodep,
62 	u_offset_t	offset,
63 	size_t		len,
64 	fdbuffer_t	*fdbp,
65 	int		flags,
66 	cred_t		*credp)
67 {
68 	struct inode	*ip = VTOI(vnodep);
69 	struct fs	*fs;
70 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
71 	struct buf	*bp;
72 	krw_t		rwtype = RW_READER;
73 	u_offset_t	offset1 = offset;	/* Initial offset */
74 	size_t		iolen;
75 	int		curlen = 0;
76 	int		pplen;
77 	daddr_t		bn;
78 	int		contig = 0;
79 	int		error = 0;
80 	int		nbytes;			/* Number bytes this IO */
81 	int		offsetn;		/* Start point this IO */
82 	int		iswrite = flags & B_WRITE;
83 	int		io_started = 0;		/* No IO started */
84 	struct ulockfs	*ulp;
85 	uint_t		protp = PROT_ALL;
86 
87 	error = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, segkmap, !iswrite,
88 	    &protp);
89 	if (error) {
90 		if (flags & B_ASYNC) {
91 			fdb_ioerrdone(fdbp, error);
92 		}
93 		return (error);
94 	}
95 	fs = ufsvfsp->vfs_fs;
96 	iolen = len;
97 
98 	DEBUGF((CE_CONT, "?ufs_rdwr: %s vp: %p pages:%p  off %llx len %lx"
99 	    " isize: %llx fdb: %p\n",
100 	    flags & B_READ ? "READ" : "WRITE", (void *)vnodep,
101 	    (void *)vnodep->v_pages, offset1, iolen, ip->i_size, (void *)fdbp));
102 
103 	rw_enter(&ip->i_ufsvfs->vfs_dqrwlock, RW_READER);
104 	rw_enter(&ip->i_contents, rwtype);
105 
106 	ASSERT(offset1 < ip->i_size);
107 
108 	if ((offset1 + iolen) > ip->i_size) {
109 		iolen = ip->i_size - offset1;
110 	}
111 	while (!error && curlen < iolen) {
112 
113 		contig = 0;
114 
115 		if ((error = bmap_read(ip, offset1, &bn, &contig)) != 0) {
116 			break;
117 		}
118 		ASSERT(!(bn == UFS_HOLE && iswrite));
119 		if (bn == UFS_HOLE) {
120 			/*
121 			 * If the above assertion is true,
122 			 * then the following if statement can never be true.
123 			 */
124 			if (iswrite && (rwtype == RW_READER)) {
125 				rwtype = RW_WRITER;
126 				if (!rw_tryupgrade(&ip->i_contents)) {
127 					rw_exit(&ip->i_contents);
128 					rw_enter(&ip->i_contents, rwtype);
129 					continue;
130 				}
131 			}
132 			offsetn = blkoff(fs, offset1);
133 			pplen = P2ROUNDUP(len, PAGESIZE);
134 			nbytes = MIN((pplen - curlen),
135 			    (fs->fs_bsize - offsetn));
136 			ASSERT(nbytes > 0);
137 
138 			/*
139 			 * We may be reading or writing.
140 			 */
141 			DEBUGF((CE_CONT, "?ufs_rdwr_data: hole %llx - %lx\n",
142 			    offset1, (iolen - curlen)));
143 
144 			if (iswrite) {
145 				printf("**WARNING: ignoring hole in write\n");
146 				error = ENOSPC;
147 			} else {
148 				fdb_add_hole(fdbp, offset1 - offset, nbytes);
149 			}
150 			offset1 += nbytes;
151 			curlen += nbytes;
152 			continue;
153 
154 		}
155 		ASSERT(contig > 0);
156 		pplen = P2ROUNDUP(len, PAGESIZE);
157 
158 		contig = MIN(contig, len - curlen);
159 		contig = P2ROUNDUP(contig, DEV_BSIZE);
160 
161 		bp = fdb_iosetup(fdbp, offset1 - offset, contig, vnodep, flags);
162 
163 		bp->b_edev = ip->i_dev;
164 		bp->b_dev = cmpdev(ip->i_dev);
165 		bp->b_blkno = bn;
166 		bp->b_file = ip->i_vnode;
167 		bp->b_offset = (offset_t)offset1;
168 
169 		if (ufsvfsp->vfs_snapshot) {
170 			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
171 		} else {
172 			(void) bdev_strategy(bp);
173 		}
174 		io_started = 1;
175 
176 		offset1 += contig;
177 		curlen += contig;
178 		if (iswrite)
179 			lwp_stat_update(LWP_STAT_OUBLK, 1);
180 		else
181 			lwp_stat_update(LWP_STAT_INBLK, 1);
182 
183 		if ((flags & B_ASYNC) == 0) {
184 			error = biowait(bp);
185 			fdb_iodone(bp);
186 		}
187 
188 		DEBUGF((CE_CONT, "?loop ufs_rdwr_data.. off %llx len %lx\n",
189 		    offset1, (iolen - curlen)));
190 	}
191 
192 	DEBUGF((CE_CONT, "?ufs_rdwr_data: off %llx len %lx pages: %p ------\n",
193 	    offset1, (iolen - curlen), (void *)vnodep->v_pages));
194 
195 	rw_exit(&ip->i_contents);
196 	rw_exit(&ip->i_ufsvfs->vfs_dqrwlock);
197 
198 	if (flags & B_ASYNC) {
199 		/*
200 		 * Show that no more asynchronous IO will be added
201 		 */
202 		fdb_ioerrdone(fdbp, error);
203 	}
204 	if (ulp) {
205 		ufs_lockfs_end(ulp);
206 	}
207 	if (io_started && flags & B_ASYNC) {
208 		return (0);
209 	} else {
210 		return (error);
211 	}
212 }
213 
214 /*
215  * ufs_alloc_data - supports allocating space and reads or writes
216  * that involve changes to file length or space allocation.
217  *
218  * This function is more expensive, because of the UFS log transaction,
219  * so ufs_rdwr_data() should be used when space or file length changes
220  * will not occur.
221  *
222  * Inputs:
223  * fdb - A null pointer instructs this function to only allocate
224  *	space for the specified offset and length.
225  *	An actual fdbuffer instructs this function to perform
226  *	the read or write operation.
227  * flags - defaults (zero value) to synchronous write
228  *	B_READ - indicates read operation
229  *	B_ASYNC - indicates perform operation asynchronously
230  */
231 int
ufs_alloc_data(vnode_t * vnodep,u_offset_t offset,size_t * len,fdbuffer_t * fdbp,int flags,cred_t * credp)232 ufs_alloc_data(
233 	vnode_t		*vnodep,
234 	u_offset_t	offset,
235 	size_t		*len,
236 	fdbuffer_t	*fdbp,
237 	int		flags,
238 	cred_t		*credp)
239 {
240 	struct inode	*ip = VTOI(vnodep);
241 	size_t		done_len, io_len;
242 	int		contig;
243 	u_offset_t	uoff, io_off;
244 	int		error = 0;		/* No error occurred */
245 	int		offsetn;		/* Start point this IO */
246 	int		nbytes;			/* Number bytes in this IO */
247 	daddr_t		bn;
248 	struct fs	*fs;
249 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
250 	int		i_size_changed = 0;
251 	u_offset_t	old_i_size;
252 	struct ulockfs	*ulp;
253 	int		trans_size;
254 	int		issync;			/* UFS Log transaction */
255 						/* synchronous when non-zero */
256 
257 	int		io_started = 0;		/* No IO started */
258 	uint_t		protp = PROT_ALL;
259 
260 	ASSERT((flags & B_WRITE) == 0);
261 
262 	/*
263 	 * Obey the lockfs protocol
264 	 */
265 	error = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, segkmap, 0, &protp);
266 	if (error) {
267 		if ((fdbp != NULL) && (flags & B_ASYNC)) {
268 			fdb_ioerrdone(fdbp, error);
269 		}
270 		return (error);
271 	}
272 	if (ulp) {
273 		/*
274 		 * Try to begin a UFS log transaction
275 		 */
276 		trans_size = TOP_GETPAGE_SIZE(ip);
277 		TRANS_TRY_BEGIN_CSYNC(ufsvfsp, issync, TOP_GETPAGE,
278 		    trans_size, error);
279 		if (error == EWOULDBLOCK) {
280 			ufs_lockfs_end(ulp);
281 			if ((fdbp != NULL) && (flags & B_ASYNC)) {
282 				fdb_ioerrdone(fdbp, EDEADLK);
283 			}
284 			return (EDEADLK);
285 		}
286 	}
287 
288 	uoff = offset;
289 	io_off = offset;
290 	io_len = *len;
291 	done_len = 0;
292 
293 	DEBUGF((CE_CONT, "?ufs_alloc: off %llx len %lx size %llx fdb: %p\n",
294 	    uoff, (io_len - done_len), ip->i_size, (void *)fdbp));
295 
296 	rw_enter(&ip->i_ufsvfs->vfs_dqrwlock, RW_READER);
297 	rw_enter(&ip->i_contents, RW_WRITER);
298 
299 	ASSERT((ip->i_mode & IFMT) == IFREG);
300 
301 	fs = ip->i_fs;
302 
303 	while (error == 0 && done_len < io_len) {
304 		uoff = (u_offset_t)(io_off + done_len);
305 		offsetn = (int)blkoff(fs, uoff);
306 		nbytes = (int)MIN(fs->fs_bsize - offsetn, io_len - done_len);
307 
308 		DEBUGF((CE_CONT, "?ufs_alloc_data: offset: %llx len %x\n",
309 		    uoff, nbytes));
310 
311 		if (uoff + nbytes > ip->i_size) {
312 			/*
313 			 * We are extending the length of the file.
314 			 * bmap is used so that we are sure that
315 			 * if we need to allocate new blocks, that it
316 			 * is done here before we up the file size.
317 			 */
318 			DEBUGF((CE_CONT, "?ufs_alloc_data: grow %llx -> %llx\n",
319 			    ip->i_size, uoff + nbytes));
320 
321 			error = bmap_write(ip, uoff, (offsetn + nbytes),
322 			    BI_ALLOC_ONLY, NULL, credp);
323 			if (ip->i_flag & (ICHG|IUPD))
324 				ip->i_seq++;
325 			if (error) {
326 				DEBUGF((CE_CONT, "?ufs_alloc_data: grow "
327 				    "failed err: %d\n", error));
328 				break;
329 			}
330 			if (fdbp != NULL) {
331 				if (uoff >= ip->i_size) {
332 					/*
333 					 * Desired offset is past end of bytes
334 					 * in file, so we have a hole.
335 					 */
336 					fdb_add_hole(fdbp, uoff - offset,
337 					    nbytes);
338 				} else {
339 					int contig;
340 					buf_t *bp;
341 
342 					error = bmap_read(ip, uoff, &bn,
343 					    &contig);
344 					if (error) {
345 						break;
346 					}
347 
348 					contig = ip->i_size - uoff;
349 					contig = P2ROUNDUP(contig, DEV_BSIZE);
350 
351 					bp = fdb_iosetup(fdbp, uoff - offset,
352 					    contig, vnodep, flags);
353 
354 					bp->b_edev = ip->i_dev;
355 					bp->b_dev = cmpdev(ip->i_dev);
356 					bp->b_blkno = bn;
357 					bp->b_file = ip->i_vnode;
358 					bp->b_offset = (offset_t)uoff;
359 
360 					if (ufsvfsp->vfs_snapshot) {
361 						fssnap_strategy(
362 						    &ufsvfsp->vfs_snapshot, bp);
363 					} else {
364 						(void) bdev_strategy(bp);
365 					}
366 					io_started = 1;
367 
368 					lwp_stat_update(LWP_STAT_OUBLK, 1);
369 
370 					if ((flags & B_ASYNC) == 0) {
371 						error = biowait(bp);
372 						fdb_iodone(bp);
373 						if (error) {
374 							break;
375 						}
376 					}
377 					if (contig > (ip->i_size - uoff)) {
378 						contig -= ip->i_size - uoff;
379 
380 						fdb_add_hole(fdbp,
381 						    ip->i_size - offset,
382 						    contig);
383 					}
384 				}
385 			}
386 
387 			i_size_changed = 1;
388 			old_i_size = ip->i_size;
389 			UFS_SET_ISIZE(uoff + nbytes, ip);
390 			TRANS_INODE(ip->i_ufsvfs, ip);
391 			/*
392 			 * file has grown larger than 2GB. Set flag
393 			 * in superblock to indicate this, if it
394 			 * is not already set.
395 			 */
396 			if ((ip->i_size > MAXOFF32_T) &&
397 			    !(fs->fs_flags & FSLARGEFILES)) {
398 				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
399 				mutex_enter(&ufsvfsp->vfs_lock);
400 				fs->fs_flags |= FSLARGEFILES;
401 				ufs_sbwrite(ufsvfsp);
402 				mutex_exit(&ufsvfsp->vfs_lock);
403 			}
404 		} else {
405 			/*
406 			 * The file length is not being extended.
407 			 */
408 			error = bmap_read(ip, uoff, &bn, &contig);
409 			if (error) {
410 				DEBUGF((CE_CONT, "?ufs_alloc_data: "
411 				    "bmap_read err: %d\n", error));
412 				break;
413 			}
414 
415 			if (bn != UFS_HOLE) {
416 				/*
417 				 * Did not map a hole in the file
418 				 */
419 				int	contig = P2ROUNDUP(nbytes, DEV_BSIZE);
420 				buf_t	*bp;
421 
422 				if (fdbp != NULL) {
423 					bp = fdb_iosetup(fdbp, uoff - offset,
424 					    contig, vnodep, flags);
425 
426 					bp->b_edev = ip->i_dev;
427 					bp->b_dev = cmpdev(ip->i_dev);
428 					bp->b_blkno = bn;
429 					bp->b_file = ip->i_vnode;
430 					bp->b_offset = (offset_t)uoff;
431 
432 					if (ufsvfsp->vfs_snapshot) {
433 						fssnap_strategy(
434 						    &ufsvfsp->vfs_snapshot, bp);
435 					} else {
436 						(void) bdev_strategy(bp);
437 					}
438 					io_started = 1;
439 
440 					lwp_stat_update(LWP_STAT_OUBLK, 1);
441 
442 					if ((flags & B_ASYNC) == 0) {
443 						error = biowait(bp);
444 						fdb_iodone(bp);
445 						if (error) {
446 							break;
447 						}
448 					}
449 				}
450 			} else {
451 				/*
452 				 * We read a hole in the file.
453 				 * We have to allocate blocks for the hole.
454 				 */
455 				error = bmap_write(ip, uoff, (offsetn + nbytes),
456 				    BI_ALLOC_ONLY, NULL, credp);
457 				if (ip->i_flag & (ICHG|IUPD))
458 					ip->i_seq++;
459 				if (error) {
460 					DEBUGF((CE_CONT, "?ufs_alloc_data: fill"
461 					    " hole failed error: %d\n", error));
462 					break;
463 				}
464 				if (fdbp != NULL) {
465 					fdb_add_hole(fdbp, uoff - offset,
466 					    nbytes);
467 				}
468 			}
469 		}
470 		done_len += nbytes;
471 	}
472 
473 	if (error) {
474 		if (i_size_changed) {
475 			/*
476 			 * Allocation of the blocks for the file failed.
477 			 * So truncate the file size back to its original size.
478 			 */
479 			(void) ufs_itrunc(ip, old_i_size, 0, credp);
480 		}
481 	}
482 
483 	DEBUGF((CE_CONT, "?ufs_alloc: uoff %llx len %lx\n",
484 	    uoff, (io_len - done_len)));
485 
486 	if ((offset + *len) < (NDADDR * fs->fs_bsize)) {
487 		*len = (size_t)(roundup(offset + *len, fs->fs_fsize) - offset);
488 	} else {
489 		*len = (size_t)(roundup(offset + *len, fs->fs_bsize) - offset);
490 	}
491 
492 	/*
493 	 * Flush cached pages.
494 	 *
495 	 * XXX - There should be no pages involved, since the I/O was performed
496 	 * through the device strategy routine and the page cache was bypassed.
497 	 * However, testing has demonstrated that this VOP_PUTPAGE is
498 	 * necessary. Without this, data might not always be read back as it
499 	 * was written.
500 	 *
501 	 */
502 	(void) VOP_PUTPAGE(vnodep, 0, 0, B_INVAL, credp, NULL);
503 
504 	rw_exit(&ip->i_contents);
505 	rw_exit(&ip->i_ufsvfs->vfs_dqrwlock);
506 
507 	if ((fdbp != NULL) && (flags & B_ASYNC)) {
508 		/*
509 		 * Show that no more asynchronous IO will be added
510 		 */
511 		fdb_ioerrdone(fdbp, error);
512 	}
513 	if (ulp) {
514 		/*
515 		 * End the UFS Log transaction
516 		 */
517 		TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_GETPAGE,
518 		    trans_size);
519 		ufs_lockfs_end(ulp);
520 	}
521 	if (io_started && (flags & B_ASYNC)) {
522 		return (0);
523 	} else {
524 		return (error);
525 	}
526 }
527