/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */
29cd0837cGeorge Wilson
30ea8dc4beschrock#include <sys/spa.h>
31ea8dc4beschrock#include <sys/spa_impl.h>
32ea8dc4beschrock#include <sys/vdev.h>
33ea8dc4beschrock#include <sys/vdev_impl.h>
34ea8dc4beschrock#include <sys/zio.h>
3522fe2c8Jonathan Adams#include <sys/zio_checksum.h>
37ea8dc4beschrock#include <sys/fm/fs/zfs.h>
38ea8dc4beschrock#include <sys/fm/protocol.h>
39ea8dc4beschrock#include <sys/fm/util.h>
40ea8dc4beschrock#include <sys/sysevent.h>
/*
 * This general routine is responsible for generating all the different ZFS
 * ereports.  The payload is dependent on the class, and which arguments are
 * supplied to the function:
 *
 *	EREPORT			POOL	VDEV	IO
 *	block			X	X	X
 *	data			X		X
 *	device			X	X
 *	pool			X
 *
 * If we are in a loading state, all errors are chained together by the same
 * SPA-wide ENA (Error Numeric Association).
 *
 * For isolated I/O requests, we get the ENA from the zio_t. The propagation
 * gets very complicated due to RAID-Z, gang blocks, and vdev caching.  We want
 * to chain together all ereports associated with a logical piece of data.  For
 * read I/Os, there are basically three 'types' of I/O, which form a roughly
 * layered diagram:
 *
 *	+---------------+
 *	| Aggregate I/O |	No associated logical data or device
 *	+---------------+
 *		|
 *		V
 *	+---------------+	Reads associated with a piece of logical data.
 *	|   Read I/O    |	This includes reads on behalf of RAID-Z,
 *	+---------------+	mirrors, gang blocks, retries, etc.
 *		|
 *		V
 *	+---------------+	Reads associated with a particular device, but
 *	| Physical I/O  |	no logical data.  Issued as part of vdev caching
 *	+---------------+	and I/O aggregation.
 *
 * Note that 'physical I/O' here is not the same terminology as used in the rest
 * of ZIO.  Typically, 'physical I/O' simply means that there is no attached
 * blockpointer.  But I/O with no associated block pointer can still be related
 * to a logical piece of data (i.e. RAID-Z requests).
 *
 * Purely physical I/Os always have unique ENAs.  They are not related to a
 * particular piece of logical data, and therefore cannot be chained together.
 * We still generate an ereport, but the DE doesn't correlate it with any
 * logical piece of data.  When such an I/O fails, the delegated I/O requests
 * will issue a retry, which will trigger the 'real' ereport with the correct
 * ENA.
 *
 * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
 * When a new logical I/O is issued, we set this to point to itself.  Child I/Os
 * then inherit this pointer, so that when it is first set subsequent failures
 * will use the same ENA.  For vdev cache fill and queue aggregation I/O,
 * this pointer is set to NULL, and no ereport will be generated (since it
 * doesn't actually correspond to any particular device or piece of data,
 * and the caller will always retry without caching or queueing anyway).
 *
 * For checksum errors, we want to include more information about the actual
 * error which occurs.  Accordingly, we build an ereport when the error is
 * noticed, but instead of sending it in immediately, we hang it off of the
 * io_cksum_report field of the logical IO.  When the logical IO completes
 * (successfully or not), zfs_ereport_finish_checksum() is called with the
 * good and bad versions of the buffer (if available), and we annotate the
 * ereport with information about the differences.
 */
10422fe2c8Jonathan Adams#ifdef _KERNEL
105dd50e0cTony Hutter
/*
 * Return B_TRUE if the event actually posted, B_FALSE if not.
 */
109dd50e0cTony Hutterstatic boolean_t
11022fe2c8Jonathan Adamszfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
111eb63303Tom Caputi    const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
112eb63303Tom Caputi    zio_t *zio, uint64_t stateoroffset, uint64_t size)
114ea8dc4beschrock	nvlist_t *ereport, *detector;
11522fe2c8Jonathan Adams
116ea8dc4beschrock	uint64_t ena;
117ea8dc4beschrock	char class[64];
119dd50e0cTony Hutter	if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
120dd50e0cTony Hutter		return (B_FALSE);
1211d71320Eric Schrock
122ea8dc4beschrock	if ((ereport = fm_nvlist_create(NULL)) == NULL)
123dd50e0cTony Hutter		return (B_FALSE);
125ea8dc4beschrock	if ((detector = fm_nvlist_create(NULL)) == NULL) {
126ea8dc4beschrock		fm_nvlist_destroy(ereport, FM_NVA_FREE);
127dd50e0cTony Hutter		return (B_FALSE);
128ea8dc4beschrock	}
130ea8dc4beschrock	/*
131ea8dc4beschrock	 * Serialize ereport generation
132ea8dc4beschrock	 */
133ea8dc4beschrock	mutex_enter(&spa->spa_errlist_lock);
135ea8dc4beschrock	/*
136ea8dc4beschrock	 * Determine the ENA to use for this event.  If we are in a loading
137ea8dc4beschrock	 * state, use a SPA-wide ENA.  Otherwise, if we are in an I/O state, use
138ea8dc4beschrock	 * a root zio-wide ENA.  Otherwise, simply use a unique ENA.
139ea8dc4beschrock	 */
140b16da2eGeorge Wilson	if (spa_load_state(spa) != SPA_LOAD_NONE) {
141ea8dc4beschrock		if (spa->spa_ena == 0)
142ea8dc4beschrock			spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
143ea8dc4beschrock		ena = spa->spa_ena;
144ea8dc4beschrock	} else if (zio != NULL && zio->io_logical != NULL) {
145ea8dc4beschrock		if (zio->io_logical->io_ena == 0)
146ea8dc4beschrock			zio->io_logical->io_ena =
147ea8dc4beschrock			    fm_ena_generate(0, FM_ENA_FMT1);
148ea8dc4beschrock		ena = zio->io_logical->io_ena;
149ea8dc4beschrock	} else {
150ea8dc4beschrock		ena = fm_ena_generate(0, FM_ENA_FMT1);
151ea8dc4beschrock	}
153ea8dc4beschrock	/*
154ea8dc4beschrock	 * Construct the full class, detector, and other standard FMA fields.
155ea8dc4beschrock	 */
156ea8dc4beschrock	(void) snprintf(class, sizeof (class), "%s.%s",
157ea8dc4beschrock	    ZFS_ERROR_CLASS, subclass);
159ea8dc4beschrock	fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
160ea8dc4beschrock	    vd != NULL ? vd->vdev_guid : 0);
162ea8dc4beschrock	fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
164ea8dc4beschrock	/*
165ea8dc4beschrock	 * Construct the per-ereport payload, depending on which parameters are
166ea8dc4beschrock	 * passed in.
167ea8dc4beschrock	 */
169ea8dc4beschrock	/*
170ea8dc4beschrock	 * Generic payload members common to all ereports.
171ea8dc4beschrock	 */
172ea8dc4beschrock	fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL,
173e14bb32Jeff Bonwick	    DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
174ea8dc4beschrock	    DATA_TYPE_UINT64, spa_guid(spa),
176b16da2eGeorge Wilson	    spa_load_state(spa), NULL);
17832b8793ek	if (spa != NULL) {
17932b8793ek		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
18032b8793ek		    DATA_TYPE_STRING,
18132b8793ek		    spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
18232b8793ek		    FM_EREPORT_FAILMODE_WAIT :
18332b8793ek		    spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
18532b8793ek		    NULL);
18632b8793ek	}
188ea8dc4beschrock	if (vd != NULL) {
189ea8dc4beschrock		vdev_t *pvd = vd->vdev_parent;
191ea8dc4beschrock		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
192ea8dc4beschrock		    DATA_TYPE_UINT64, vd->vdev_guid,
193ea8dc4beschrock		    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
194ea8dc4beschrock		    DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
1956809eb4Eric Schrock		if (vd->vdev_path != NULL)
196ea8dc4beschrock			fm_payload_set(ereport,
197ea8dc4beschrock			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
198ea8dc4beschrock			    DATA_TYPE_STRING, vd->vdev_path, NULL);
1996809eb4Eric Schrock		if (vd->vdev_devid != NULL)
200ea8dc4beschrock			fm_payload_set(ereport,
201ea8dc4beschrock			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
202ea8dc4beschrock			    DATA_TYPE_STRING, vd->vdev_devid, NULL);
2036809eb4Eric Schrock		if (vd->vdev_fru != NULL)
2046809eb4Eric Schrock			fm_payload_set(ereport,
2056809eb4Eric Schrock			    FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
2066809eb4Eric Schrock			    DATA_TYPE_STRING, vd->vdev_fru, NULL);
2075711d39loli		if (vd->vdev_ashift)
2085711d39loli			fm_payload_set(ereport,
2105711d39loli			    DATA_TYPE_UINT64, vd->vdev_ashift, NULL);
212ea8dc4beschrock		if (pvd != NULL) {
213ea8dc4beschrock			fm_payload_set(ereport,
214ea8dc4beschrock			    FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
215ea8dc4beschrock			    DATA_TYPE_UINT64, pvd->vdev_guid,
216ea8dc4beschrock			    FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
217ea8dc4beschrock			    DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
218ea8dc4beschrock			    NULL);
219ea8dc4beschrock			if (pvd->vdev_path)
220ea8dc4beschrock				fm_payload_set(ereport,
221ea8dc4beschrock				    FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
222c25056dgw				    DATA_TYPE_STRING, pvd->vdev_path, NULL);
223ea8dc4beschrock			if (pvd->vdev_devid)
224ea8dc4beschrock				fm_payload_set(ereport,
225ea8dc4beschrock				    FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
226ea8dc4beschrock				    DATA_TYPE_STRING, pvd->vdev_devid, NULL);
227ea8dc4beschrock		}
228ea8dc4beschrock	}