1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/systm.h>
27 #include <sys/types.h>
28 #include <sys/vnode.h>
29 #include <sys/errno.h>
30 #include <sys/sysmacros.h>
31 #include <sys/debug.h>
32 #include <sys/kmem.h>
33 #include <sys/conf.h>
34 #include <sys/proc.h>
35 #include <sys/cmn_err.h>
36 #include <sys/fssnap_if.h>
37 #include <sys/fs/ufs_inode.h>
38 #include <sys/fs/ufs_filio.h>
39 #include <sys/fs/ufs_log.h>
40 #include <sys/fs/ufs_bio.h>
41 #include <sys/inttypes.h>
42 #include <sys/callb.h>
43 
44 /*
45  * Kernel threads for logging
46  * Currently only one for rolling the log (one per log).
47  */
48 
/*
 * Default, lower-bound and upper-bound roll buffer counts; these are the
 * initial values of the lufs_*_roll_bufs tunables below.
 */
49 #define	LUFS_DEFAULT_NUM_ROLL_BUFS 16
50 #define	LUFS_DEFAULT_MIN_ROLL_BUFS 4
51 #define	LUFS_DEFAULT_MAX_ROLL_BUFS 64
52 
53 /*
54  * Macros
55  */
/* True when the logmap's mtm_nme count exceeds the logmap_maxnme tunable. */
56 #define	logmap_need_roll(logmap) ((logmap)->mtm_nme > logmap_maxnme)
/* True when the log unit's head and tail log offsets are equal. */
57 #define	ldl_empty(ul) ((ul)->un_head_lof == (ul)->un_tail_lof)
58 
59 /*
60  * Tunables
61  */
/*
 * Requested number of roll buffers; log_roll_buffers() clamps it to the
 * [lufs_min_roll_bufs, lufs_max_roll_bufs] range.
 */
62 uint32_t lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
63 uint32_t lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
64 uint32_t lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
/* Threshold used by logmap_need_roll(). */
65 long logmap_maxnme = 1536;
/*
 * Timeout, in clock ticks, passed to cv_reltimedwait() by trans_roll_wait().
 * If still 0 when trans_roll() starts it is set to 5 * hz.
 */
66 int trans_roll_tics = 0;
/*
 * Counter: times log_roll_read() found a new delta for a MAPBLOCK that was
 * already set up in the current batch of roll buffers.
 */
67 uint64_t trans_roll_new_delta = 0;
/*
 * Counter: times log_roll_read() had to wait for the logmap rwlock as a
 * writer because top_read_roll() failed on the first buffer.
 */
68 uint64_t lrr_wait = 0;
69 /*
70  * Key for thread specific data for the roll thread to
71  * bypass snapshot throttling
72  */
73 uint_t bypass_snapshot_throttle_key;
74 
75 /*
76  * externs
77  */
78 extern kmutex_t		ml_scan;
79 extern kcondvar_t	ml_scan_cv;
80 extern int		maxphys;
81 
82 static void
trans_roll_wait(mt_map_t * logmap,callb_cpr_t * cprinfop)83 trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
84 {
85 	mutex_enter(&logmap->mtm_mutex);
86 	logmap->mtm_ref = 0;
87 	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
88 		cv_broadcast(&logmap->mtm_from_roll_cv);
89 	}
90 	logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
91 	CALLB_CPR_SAFE_BEGIN(cprinfop);
92 	(void) cv_reltimedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
93 	    trans_roll_tics, TR_CLOCK_TICK);
94 	CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
95 	logmap->mtm_flags |= MTM_ROLLING;
96 	mutex_exit(&logmap->mtm_mutex);
97 }
98 
99 /*
100  * returns the number of 8K buffers to use for rolling the log
101  */
102 static uint32_t
log_roll_buffers()103 log_roll_buffers()
104 {
105 	/*
106 	 * sanity validate the tunable lufs_num_roll_bufs
107 	 */
108 	if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
109 		return (lufs_min_roll_bufs);
110 	}
111 	if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
112 		return (lufs_max_roll_bufs);
113 	}
114 	return (lufs_num_roll_bufs);
115 }
116 
117 /*
118  * Find something to roll, then if we don't have cached roll buffers
119  * covering all the deltas in that MAPBLOCK then read the master
120  * and overlay the deltas.
121  * returns;
122  * 	0 if sucessful
123  *	1 on finding nothing to roll
124  *	2 on error
125  */
126 int
log_roll_read(ml_unit_t * ul,rollbuf_t * rbs,int nmblk,caddr_t roll_bufs,int * retnbuf)127 log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
128     int *retnbuf)
129 {
130 	offset_t	mof;
131 	buf_t		*bp;
132 	rollbuf_t	*rbp;
133 	mt_map_t	*logmap = ul->un_logmap;
134 	daddr_t		mblkno;
135 	int		i;
136 	int		error;
137 	int		nbuf;
138 
139 	/*
140 	 * Make sure there is really something to roll
141 	 */
142 	mof = 0;
143 	if (!logmap_next_roll(logmap, &mof)) {
144 		return (1);
145 	}
146 
147 	/*
148 	 * build some master blocks + deltas to roll forward
149 	 */
150 	rw_enter(&logmap->mtm_rwlock, RW_READER);
151 	nbuf = 0;
152 	do {
153 		mof = mof & (offset_t)MAPBLOCKMASK;
154 		mblkno = lbtodb(mof);
155 
156 		/*
157 		 * Check for the case of a new delta to a set up buffer
158 		 */
159 		for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
160 			if (P2ALIGN(rbp->rb_bh.b_blkno,
161 			    MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
162 				trans_roll_new_delta++;
163 				/* Flush out the current set of buffers */
164 				goto flush_bufs;
165 			}
166 		}
167 
168 		/*
169 		 * Work out what to roll next. If it isn't cached then read
170 		 * it asynchronously from the master.
171 		 */
172 		bp = &rbp->rb_bh;
173 		bp->b_blkno = mblkno;
174 		bp->b_flags = B_READ;
175 		bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
176 		bp->b_bufsize = MAPBLOCKSIZE;
177 		if (top_read_roll(rbp, ul)) {
178 			/* logmap deltas were in use */
179 			if (nbuf == 0) {
180 				/*
181 				 * On first buffer wait for the logmap user
182 				 * to finish by grabbing the logmap lock
183 				 * exclusively rather than spinning
184 				 */
185 				rw_exit(&logmap->mtm_rwlock);
186 				lrr_wait++;
187 				rw_enter(&logmap->mtm_rwlock, RW_WRITER);
188 				rw_exit(&logmap->mtm_rwlock);
189 				return (1);
190 			}
191 			/* we have at least one buffer - flush it */
192 			goto flush_bufs;
193 		}
194 		if ((bp->b_flags & B_INVAL) == 0) {
195 			nbuf++;
196 		}
197 		mof += MAPBLOCKSIZE;
198 	} while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));
199 
200 	/*
201 	 * If there was nothing to roll cycle back
202 	 */
203 	if (nbuf == 0) {
204 		rw_exit(&logmap->mtm_rwlock);
205 		return (1);
206 	}
207 
208 flush_bufs:
209 	/*
210 	 * For each buffer, if it isn't cached then wait for the read to
211 	 * finish and overlay the deltas.
212 	 */
213 	for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
214 		if (!rbp->rb_crb) {
215 			bp = &rbp->rb_bh;
216 			if (trans_not_wait(bp)) {
217 				ldl_seterror(ul,
218 				    "Error reading master during ufs log roll");
219 				error = 1;
220 			}
221 			/*
222 			 * sync read the data from the log
223 			 */
224 			if (ldl_read(ul, bp->b_un.b_addr,
225 			    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
226 			    MAPBLOCKSIZE, rbp->rb_age)) {
227 				error = 1;
228 			}
229 		}
230 
231 		/*
232 		 * reset the age bit in the age list
233 		 */
234 		logmap_list_put_roll(logmap, rbp->rb_age);
235 
236 		if (ul->un_flags & LDL_ERROR) {
237 			error = 1;
238 		}
239 	}
240 	rw_exit(&logmap->mtm_rwlock);
241 	if (error)
242 		return (2);
243 	*retnbuf = nbuf;
244 	return (0);
245 }
246 
247 /*
248  * Write out a cached roll buffer
249  */
250 void
log_roll_write_crb(ufsvfs_t * ufsvfsp,rollbuf_t * rbp)251 log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
252 {
253 	crb_t *crb = rbp->rb_crb;
254 	buf_t *bp = &rbp->rb_bh;
255 
256 	bp->b_blkno = lbtodb(crb->c_mof);
257 	bp->b_un.b_addr = crb->c_buf;
258 	bp->b_bcount = crb->c_nb;
259 	bp->b_bufsize = crb->c_nb;
260 	ASSERT((crb->c_nb & DEV_BMASK) == 0);
261 	bp->b_flags = B_WRITE;
262 	logstats.ls_rwrites.value.ui64++;
263 
264 	/* if snapshots are enabled, call it */
265 	if (ufsvfsp->vfs_snapshot) {
266 		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
267 	} else {
268 		(void) bdev_strategy(bp);
269 	}
270 }
271 
272 /*
273  * Write out a set of non cached roll buffers
274  */
275 void
log_roll_write_bufs(ufsvfs_t * ufsvfsp,rollbuf_t * rbp)276 log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
277 {
278 	buf_t		*bp = &rbp->rb_bh;
279 	buf_t		*bp2;
280 	rbsecmap_t	secmap = rbp->rb_secmap;
281 	int		j, k;
282 
283 	ASSERT(secmap);
284 	ASSERT((bp->b_flags & B_INVAL) == 0);
285 
286 	do { /* for each contiguous block of sectors */
287 		/* find start of next sector to write */
288 		for (j = 0; j < 16; ++j) {
289 			if (secmap & UINT16_C(1))
290 				break;
291 			secmap >>= 1;
292 		}
293 		bp->b_un.b_addr += (j << DEV_BSHIFT);
294 		bp->b_blkno += j;
295 
296 		/* calculate number of sectors */
297 		secmap >>= 1;
298 		j++;
299 		for (k = 1; j < 16; ++j) {
300 			if ((secmap & UINT16_C(1)) == 0)
301 				break;
302 			secmap >>= 1;
303 			k++;
304 		}
305 		bp->b_bcount = k << DEV_BSHIFT;
306 		bp->b_flags = B_WRITE;
307 		logstats.ls_rwrites.value.ui64++;
308 
309 		/* if snapshots are enabled, call it */
310 		if (ufsvfsp->vfs_snapshot)
311 			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
312 		else
313 			(void) bdev_strategy(bp);
314 		if (secmap) {
315 			/*
316 			 * Allocate another buf_t to handle
317 			 * the next write in this MAPBLOCK
318 			 * Chain them via b_list.
319 			 */
320 			bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
321 			bp->b_list = bp2;
322 			bioinit(bp2);
323 			bp2->b_iodone = trans_not_done;
324 			bp2->b_bufsize = MAPBLOCKSIZE;
325 			bp2->b_edev = bp->b_edev;
326 			bp2->b_un.b_addr =
327 			    bp->b_un.b_addr + bp->b_bcount;
328 			bp2->b_blkno = bp->b_blkno + k;
329 			bp = bp2;
330 		}
331 	} while (secmap);
332 }
333 
334 /*
335  * Asynchronously roll the deltas, using the sector map
336  * in each rollbuf_t.
337  */
338 int
log_roll_write(ml_unit_t * ul,rollbuf_t * rbs,int nbuf)339 log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
340 {
341 
342 	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
343 	rollbuf_t	*rbp;
344 	buf_t		*bp, *bp2;
345 	rollbuf_t	*head, *prev, *rbp2;
346 
347 	/*
348 	 * Order the buffers by blkno
349 	 */
350 	ASSERT(nbuf > 0);
351 #ifdef lint
352 	prev = rbs;
353 #endif
354 	for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
355 		for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
356 			if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
357 				if (rbp2 == head) {
358 					rbp->rb_next = head;
359 					head = rbp;
360 				} else {
361 					prev->rb_next = rbp;
362 					rbp->rb_next = rbp2;
363 				}
364 				break;
365 			}
366 		}
367 		if (rbp2 == NULL) {
368 			prev->rb_next = rbp;
369 			rbp->rb_next = NULL;
370 		}
371 	}
372 
373 	/*
374 	 * issue the in-order writes
375 	 */
376 	for (rbp = head; rbp; rbp = rbp2) {
377 		if (rbp->rb_crb) {
378 			log_roll_write_crb(ufsvfsp, rbp);
379 		} else {
380 			log_roll_write_bufs(ufsvfsp, rbp);
381 		}
382 		/* null out the rb_next link for next set of rolling */
383 		rbp2 = rbp->rb_next;
384 		rbp->rb_next = NULL;
385 	}
386 
387 	/*
388 	 * wait for all the writes to finish
389 	 */
390 	for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
391 		bp = &rbp->rb_bh;
392 		if (trans_not_wait(bp)) {
393 			ldl_seterror(ul,
394 			    "Error writing master during ufs log roll");
395 		}
396 
397 		/*
398 		 * Now wait for all the "cloned" buffer writes (if any)
399 		 * and free those headers
400 		 */
401 		bp2 = bp->b_list;
402 		bp->b_list = NULL;
403 		while (bp2) {
404 			if (trans_not_wait(bp2)) {
405 				ldl_seterror(ul,
406 				    "Error writing master during ufs log roll");
407 			}
408 			bp = bp2;
409 			bp2 = bp2->b_list;
410 			kmem_free(bp, sizeof (buf_t));
411 		}
412 	}
413 
414 	if (ul->un_flags & LDL_ERROR)
415 		return (1);
416 	return (0);
417 }
418 
419 void
trans_roll(ml_unit_t * ul)420 trans_roll(ml_unit_t *ul)
421 {
422 	callb_cpr_t	cprinfo;
423 	mt_map_t	*logmap = ul->un_logmap;
424 	rollbuf_t	*rbs;
425 	rollbuf_t	*rbp;
426 	buf_t		*bp;
427 	caddr_t		roll_bufs;
428 	uint32_t	nmblk;
429 	int		i;
430 	int		doingforceroll;
431 	int		nbuf;
432 
433 	CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
434 	    "trans_roll");
435 
436 	/*
437 	 * We do not want the roll thread's writes to be
438 	 * throttled by the snapshot.
439 	 * If they are throttled then we can have a deadlock
440 	 * between the roll thread and the snapshot taskq thread:
441 	 * roll thread wants the throttling semaphore and
442 	 * the snapshot taskq thread cannot release the semaphore
443 	 * because it is writing to the log and the log is full.
444 	 */
445 
446 	(void) tsd_set(bypass_snapshot_throttle_key, (void*)1);
447 
448 	/*
449 	 * setup some roll parameters
450 	 */
451 	if (trans_roll_tics == 0)
452 		trans_roll_tics = 5 * hz;
453 	nmblk = log_roll_buffers();
454 
455 	/*
456 	 * allocate the buffers and buffer headers
457 	 */
458 	roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
459 	rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);
460 
461 	/*
462 	 * initialize the buffer headers
463 	 */
464 	for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
465 		rbp->rb_next = NULL;
466 		bp = &rbp->rb_bh;
467 		bioinit(bp);
468 		bp->b_edev = ul->un_dev;
469 		bp->b_iodone = trans_not_done;
470 		bp->b_bufsize = MAPBLOCKSIZE;
471 	}
472 
473 	doingforceroll = 0;
474 
475 again:
476 	/*
477 	 * LOOP FOREVER
478 	 */
479 
480 	/*
481 	 * exit on demand
482 	 */
483 	mutex_enter(&logmap->mtm_mutex);
484 	if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags & MTM_ROLL_EXIT)) {
485 		kmem_free(rbs, nmblk * sizeof (rollbuf_t));
486 		kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
487 		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
488 		    MTM_ROLL_EXIT | MTM_ROLLING);
489 		cv_broadcast(&logmap->mtm_from_roll_cv);
490 		CALLB_CPR_EXIT(&cprinfo);
491 		thread_exit();
492 		/* NOTREACHED */
493 	}
494 
495 	/*
496 	 * MT_SCAN debug mode
497 	 *	don't roll except in FORCEROLL situations
498 	 */
499 	if (logmap->mtm_debug & MT_SCAN)
500 		if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
501 			mutex_exit(&logmap->mtm_mutex);
502 			trans_roll_wait(logmap, &cprinfo);
503 			goto again;
504 		}
505 	ASSERT(logmap->mtm_trimlof == 0);
506 
507 	/*
508 	 * If we've finished a force roll cycle then wakeup any
509 	 * waiters.
510 	 */
511 	if (doingforceroll) {
512 		doingforceroll = 0;
513 		logmap->mtm_flags &= ~MTM_FORCE_ROLL;
514 		mutex_exit(&logmap->mtm_mutex);
515 		cv_broadcast(&logmap->mtm_from_roll_cv);
516 	} else {
517 		mutex_exit(&logmap->mtm_mutex);
518 	}
519 
520 	/*
521 	 * If someone wants us to roll something; then do it
522 	 */
523 	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
524 		doingforceroll = 1;
525 		goto rollsomething;
526 	}
527 
528 	/*
529 	 * Log is busy, check if logmap is getting full.
530 	 */
531 	if (logmap_need_roll(logmap)) {
532 		goto rollsomething;
533 	}
534 
535 	/*
536 	 * Check if the log is idle and is not empty
537 	 */
538 	if (!logmap->mtm_ref && !ldl_empty(ul)) {
539 		goto rollsomething;
540 	}
541 
542 	/*
543 	 * Log is busy, check if its getting full
544 	 */
545 	if (ldl_need_roll(ul)) {
546 		goto rollsomething;
547 	}
548 
549 	/*
550 	 * nothing to do; wait a bit and then start over
551 	 */
552 	trans_roll_wait(logmap, &cprinfo);
553 	goto again;
554 
555 	/*
556 	 * ROLL SOMETHING
557 	 */
558 
559 rollsomething:
560 	/*
561 	 * Use the cached roll buffers, or read the master
562 	 * and overlay the deltas
563 	 */
564 	switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
565 	case 1: trans_roll_wait(logmap, &cprinfo);
566 		/* FALLTHROUGH */
567 	case 2: goto again;
568 	/* default case is success */
569 	}
570 
571 	/*
572 	 * Asynchronously write out the deltas
573 	 */
574 	if (log_roll_write(ul, rbs, nbuf))
575 		goto again;
576 
577 	/*
578 	 * free up the deltas in the logmap
579 	 */
580 	for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
581 		bp = &rbp->rb_bh;
582 		logmap_remove_roll(logmap,
583 		    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
584 	}
585 
586 	/*
587 	 * free up log space; if possible
588 	 */
589 	logmap_sethead(logmap, ul);
590 
591 	/*
592 	 * LOOP
593 	 */
594 	goto again;
595 }
596