2ea8dc4beschrock * CDDL HEADER START
3ea8dc4beschrock *
4ea8dc4beschrock * The contents of this file are subject to the terms of the
5ea8dc4beschrock * Common Development and Distribution License (the "License").
6ea8dc4beschrock * You may not use this file except in compliance with the License.
7ea8dc4beschrock *
8ea8dc4beschrock * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9ea8dc4beschrock * or http://www.opensolaris.org/os/licensing.
10ea8dc4beschrock * See the License for the specific language governing permissions
11ea8dc4beschrock * and limitations under the License.
12ea8dc4beschrock *
13ea8dc4beschrock * When distributing Covered Code, include this CDDL HEADER in each
14ea8dc4beschrock * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15ea8dc4beschrock * If applicable, add the following below this CDDL HEADER, with the
16ea8dc4beschrock * fields enclosed by brackets "[]" replaced with your own identifying
17ea8dc4beschrock * information: Portions Copyright [yyyy] [name of copyright owner]
18ea8dc4beschrock *
19ea8dc4beschrock * CDDL HEADER END
20ea8dc4beschrock */
226809eb4Eric Schrock * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23ea8dc4beschrock * Use is subject to license terms.
24ea8dc4beschrock */
26cd0837cGeorge Wilson/*
27cd0837cGeorge Wilson * Copyright (c) 2012 by Delphix. All rights reserved.
28cd0837cGeorge Wilson */
29cd0837cGeorge Wilson
30ea8dc4beschrock#include <sys/spa.h>
31ea8dc4beschrock#include <sys/spa_impl.h>
32ea8dc4beschrock#include <sys/vdev.h>
33ea8dc4beschrock#include <sys/vdev_impl.h>
34ea8dc4beschrock#include <sys/zio.h>
3522fe2c8Jonathan Adams#include <sys/zio_checksum.h>
37ea8dc4beschrock#include <sys/fm/fs/zfs.h>
38ea8dc4beschrock#include <sys/fm/protocol.h>
39ea8dc4beschrock#include <sys/fm/util.h>
40ea8dc4beschrock#include <sys/sysevent.h>
43ea8dc4beschrock * This general routine is responsible for generating all the different ZFS
44ea8dc4beschrock * ereports.  The payload is dependent on the class, and which arguments are
45ea8dc4beschrock * supplied to the function:
46ea8dc4beschrock *
475711d39loli *	EREPORT			POOL	VDEV	IO
485711d39loli *	block			X	X	X
495711d39loli *	data			X		X
505711d39loli *	device			X	X
515711d39loli *	pool			X
52ea8dc4beschrock *
53ea8dc4beschrock * If we are in a loading state, all errors are chained together by the same
5432b8793ek * SPA-wide ENA (Error Numeric Association).
55ea8dc4beschrock *
56ea8dc4beschrock * For isolated I/O requests, we get the ENA from the zio_t. The propagation
57ea8dc4beschrock * gets very complicated due to RAID-Z, gang blocks, and vdev caching.  We want
58ea8dc4beschrock * to chain together all ereports associated with a logical piece of data.  For
59ea8dc4beschrock * read I/Os, there are basically three 'types' of I/O, which form a roughly
60ea8dc4beschrock * layered diagram:
61ea8dc4beschrock *
62ea8dc4beschrock *      +---------------+
635711d39loli *	| Aggregate I/O |	No associated logical data or device
645711d39loli *	+---------------+
65ea8dc4beschrock *              |
66ea8dc4beschrock *              V
675711d39loli *	+---------------+	Reads associated with a piece of logical data.
685711d39loli *	|   Read I/O    |	This includes reads on behalf of RAID-Z,
695711d39loli *	+---------------+       mirrors, gang blocks, retries, etc.
70ea8dc4beschrock *              |
71ea8dc4beschrock *              V
725711d39loli *	+---------------+	Reads associated with a particular device, but
735711d39loli *	| Physical I/O  |	no logical data.  Issued as part of vdev caching
745711d39loli *	+---------------+	and I/O aggregation.
75ea8dc4beschrock *
76ea8dc4beschrock * Note that 'physical I/O' here is not the same terminology as used in the rest
77ea8dc4beschrock * of ZIO.  Typically, 'physical I/O' simply means that there is no attached
78ea8dc4beschrock * blockpointer.  But I/O with no associated block pointer can still be related
79ea8dc4beschrock * to a logical piece of data (i.e. RAID-Z requests).
80ea8dc4beschrock *
81ea8dc4beschrock * Purely physical I/Os always have unique ENAs.  They are not related to a
82ea8dc4beschrock * particular piece of logical data, and therefore cannot be chained together.
83ea8dc4beschrock * We still generate an ereport, but the DE doesn't correlate it with any
84ea8dc4beschrock * logical piece of data.  When such an I/O fails, the delegated I/O requests
85ea8dc4beschrock * will issue a retry, which will trigger the 'real' ereport with the correct
86ea8dc4beschrock * ENA.
87ea8dc4beschrock *
88ea8dc4beschrock * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
89ea8dc4beschrock * When a new logical I/O is issued, we set this to point to itself.  Child I/Os
90ea8dc4beschrock * then inherit this pointer, so that when it is first set subsequent failures
91e14bb32Jeff Bonwick * will use the same ENA.  For vdev cache fill and queue aggregation I/O,
92e14bb32Jeff Bonwick * this pointer is set to NULL, and no ereport will be generated (since it
93e14bb32Jeff Bonwick * doesn't actually correspond to any particular device or piece of data,
94e14bb32Jeff Bonwick * and the caller will always retry without caching or queueing anyway).
9522fe2c8Jonathan Adams *
9622fe2c8Jonathan Adams * For checksum errors, we want to include more information about the actual
9722fe2c8Jonathan Adams * error which occurs.  Accordingly, we build an ereport when the error is
9822fe2c8Jonathan Adams * noticed, but instead of sending it in immediately, we hang it off of the
9922fe2c8Jonathan Adams * io_cksum_report field of the logical IO.  When the logical IO completes
10022fe2c8Jonathan Adams * (successfully or not), zfs_ereport_finish_checksum() is called with the
10122fe2c8Jonathan Adams * good and bad versions of the buffer (if available), and we annotate the
10222fe2c8Jonathan Adams * ereport with information about the differences.
103ea8dc4beschrock */
10422fe2c8Jonathan Adams#ifdef _KERNEL
10522fe2c8Jonathan Adamsstatic void
10622fe2c8Jonathan Adamszfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
107eb63303Tom Caputi    const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
108eb63303Tom Caputi    zio_t *zio, uint64_t stateoroffset, uint64_t size)
110ea8dc4beschrock	nvlist_t *ereport, *detector;
11122fe2c8Jonathan Adams
112ea8dc4beschrock	uint64_t ena;
113ea8dc4beschrock	char class[64];
115ea8dc4beschrock	/*
116468c413Tim Haley	 * If we are doing a spa_tryimport() or in recovery mode,
117468c413Tim Haley	 * ignore errors.
118ea8dc4beschrock	 */
119b16da2eGeorge Wilson	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
120b16da2eGeorge Wilson	    spa_load_state(spa) == SPA_LOAD_RECOVER)
121ea8dc4beschrock		return;
123ea8dc4beschrock	/*
124ea8dc4beschrock	 * If we are in the middle of opening a pool, and the previous attempt
125ea8dc4beschrock	 * failed, don't bother logging any new ereports - we're just going to
126ea8dc4beschrock	 * get the same diagnosis anyway.
127ea8dc4beschrock	 */
128b16da2eGeorge Wilson	if (spa_load_state(spa) != SPA_LOAD_NONE &&
129ea8dc4beschrock	    spa->spa_last_open_failed)
130ea8dc4beschrock		return;
132bf82a41eschrock	if (zio != NULL) {
133bf82a41eschrock		/*
134bf82a41eschrock		 * If this is not a read or write zio, ignore the error.  This
135bf82a41eschrock		 * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
136bf82a41eschrock		 */
137bf82a41eschrock		if (zio->io_type != ZIO_TYPE_READ &&
138bf82a41eschrock		    zio->io_type != ZIO_TYPE_WRITE)
139bf82a41eschrock			return;
141bf82a41eschrock		/*
142bf82a41eschrock		 * Ignore any errors from speculative I/Os, as failure is an
143bf82a41eschrock		 * expected result.
144bf82a41eschrock		 */
145bf82a41eschrock		if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
146bf82a41eschrock			return;
1488956713Eric Schrock		/*
1498956713Eric Schrock		 * If this I/O is not a retry I/O, don't post an ereport.
1508956713Eric Schrock		 * Otherwise, we risk making bad diagnoses based on B_FAILFAST
1518956713Eric Schrock		 * I/Os.
1528956713Eric Schrock		 */
1538956713Eric Schrock		if (zio->io_error == EIO &&
1548956713Eric Schrock		    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
1558956713Eric Schrock			return;
1568956713Eric Schrock
1576809eb4Eric Schrock		if (vd != NULL) {
1586809eb4Eric Schrock			/*
1596809eb4Eric Schrock			 * If the vdev has already been marked as failing due
1606809eb4Eric Schrock			 * to a failed probe, then ignore any subsequent I/O
1616809eb4Eric Schrock			 * errors, as the DE will automatically fault the vdev
1626809eb4Eric Schrock			 * on the first such failure.  This also catches cases
1636809eb4Eric Schrock			 * where vdev_remove_wanted is set and the device has
1646809eb4Eric Schrock			 * not yet been asynchronously placed into the REMOVED
1656809eb4Eric Schrock			 * state.
1666809eb4Eric Schrock			 */
1671d71320Eric Schrock			if (zio->io_vd == vd && !vdev_accessible(vd, zio))
1686809eb4Eric Schrock				return;
1696809eb4Eric Schrock
1706809eb4Eric Schrock			/*
1716809eb4Eric Schrock			 * Ignore checksum errors for reads from DTL regions of
1726809eb4Eric Schrock			 * leaf vdevs.
1736809eb4Eric Schrock			 */
1746809eb4Eric Schrock			if (zio->io_type == ZIO_TYPE_READ &&
1756809eb4Eric Schrock			    zio->io_error == ECKSUM &&
1766809eb4Eric Schrock			    vd->vdev_ops->vdev_op_leaf &&
1776809eb4Eric Schrock			    vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
1786809eb4Eric Schrock				return;
1796809eb4Eric Schrock		}
180bf82a41eschrock	}
1821d71320Eric Schrock	/*
1831d71320Eric Schrock	 * For probe failure, we want to avoid posting ereports if we've
1841d71320Eric Schrock	 * already removed the device in the meantime.
1851d71320Eric Schrock	 */
1861d71320Eric Schrock	if (vd != NULL &&
1871d71320Eric Schrock	    strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
1881d71320Eric Schrock	    (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
1891d71320Eric Schrock		return;
1901d71320Eric Schrock
191ea8dc4beschrock	if ((ereport = fm_nvlist_create(NULL)) == NULL)
192ea8dc4beschrock		return;
194ea8dc4beschrock	if ((detector = fm_nvlist_create(NULL)) == NULL) {
195ea8dc4beschrock		fm_nvlist_destroy(ereport, FM_NVA_FREE);
196ea8dc4beschrock		return;
197ea8dc4beschrock	}
199ea8dc4beschrock	/*
200ea8dc4beschrock	 * Serialize ereport generation
201ea8dc4beschrock	 */
202ea8dc4beschrock	mutex_enter(&spa->spa_errlist_lock);
204ea8dc4beschrock	/*
205ea8dc4beschrock	 * Determine the ENA to use for this event.  If we are in a loading
206ea8dc4beschrock	 * state, use a SPA-wide ENA.  Otherwise, if we are in an I/O state, use
207ea8dc4beschrock	 * a root zio-wide ENA.  Otherwise, simply use a unique ENA.
208ea8dc4beschrock	 */
209b16da2eGeorge Wilson	if (spa_load_state(spa) != SPA_LOAD_NONE) {
210ea8dc4beschrock		if (spa->spa_ena == 0)
211ea8dc4beschrock			spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
212ea8dc4beschrock		ena = spa->spa_ena;
213ea8dc4beschrock	} else if (zio != NULL && zio->io_logical != NULL) {
214ea8dc4beschrock		if (zio->io_logical->io_ena == 0)
215ea8dc4beschrock			zio->io_logical->io_ena =
216ea8dc4beschrock			    fm_ena_generate(0, FM_ENA_FMT1);
217ea8dc4beschrock		ena = zio->io_logical->io_ena;
218ea8dc4beschrock	} else {
219ea8dc4beschrock		ena = fm_ena_generate(0, FM_ENA_FMT1);
220ea8dc4beschrock	}
222ea8dc4beschrock	/*
223ea8dc4beschrock	 * Construct the full class, detector, and other standard FMA fields.
224ea8dc4beschrock	 */
225ea8dc4beschrock	(void) snprintf(class, sizeof (class), "%s.%s",
226ea8dc4beschrock	    ZFS_ERROR_CLASS, subclass);
228ea8dc4beschrock	fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
229ea8dc4beschrock	    vd != NULL ? vd->vdev_guid : 0);
231ea8dc4beschrock	fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
233ea8dc4beschrock	/*
234ea8dc4beschrock	 * Construct the per-ereport payload, depending on which parameters are
235ea8dc4beschrock	 * passed in.
236ea8dc4beschrock	 */
238ea8dc4beschrock	/*
239ea8dc4beschrock	 * Generic payload members common to all ereports.
240ea8dc4beschrock	 */
241ea8dc4beschrock	fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL,
242e14bb32Jeff Bonwick	    DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
243ea8dc4beschrock	    DATA_TYPE_UINT64, spa_guid(spa),
245b16da2eGeorge Wilson	    spa_load_state(spa), NULL);
24732b8793ek	if (spa != NULL) {
24832b8793ek		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
24932b8793ek		    DATA_TYPE_STRING,
25032b8793ek		    spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
25132b8793ek		    FM_EREPORT_FAILMODE_WAIT :
25232b8793ek		    spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
25432b8793ek		    NULL);
25532b8793ek	}
257ea8dc4beschrock	if (vd != NULL) {
258ea8dc4beschrock		vdev_t *pvd = vd->vdev_parent;
260ea8dc4beschrock		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
261ea8dc4beschrock		    DATA_TYPE_UINT64, vd->vdev_guid,
262ea8dc4beschrock		    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
263ea8dc4beschrock		    DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
2646809eb4Eric Schrock		if (vd->vdev_path != NULL)