xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_disk.c (revision 8b26092d)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5bef6b7d2Swebaker  * Common Development and Distribution License (the "License").
6bef6b7d2Swebaker  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
21fa9e4066Sahrens /*
22f13665b7Sbo zhou - Sun Microsystems - Beijing China  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
236fe4f300SPavel Zakharov  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
24295438baSHans Rosenfeld  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
251b500975SMike Gerdts  * Copyright 2020 Joyent, Inc.
2630c304d9SJoshua M. Clulow  * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org>
2729621f01SHans Rosenfeld  * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
28fa9e4066Sahrens  */
29fa9e4066Sahrens 
30fa9e4066Sahrens #include <sys/zfs_context.h>
31dcba9f3fSGeorge Wilson #include <sys/spa_impl.h>
32e7cbe64fSgw #include <sys/refcount.h>
33fa9e4066Sahrens #include <sys/vdev_impl.h>
34084fd14fSBrian Behlendorf #include <sys/vdev_trim.h>
35770499e1SDan Kimmel #include <sys/abd.h>
36fa9e4066Sahrens #include <sys/fs/zfs.h>
37fa9e4066Sahrens #include <sys/zio.h>
38afefbcddSeschrock #include <sys/sunldi.h>
394263d13fSGeorge Wilson #include <sys/efi_partition.h>
4051ece835Seschrock #include <sys/fm/fs/zfs.h>
41ac04831dSMike Gerdts #include <sys/ddi.h>
42fa9e4066Sahrens 
/*
 * Tunable to disable TRIM in case we're using a problematic SSD.
 * Defaults to 0.  The consumer of this value is not in this part of the
 * file; presumably any non-zero value disables TRIM -- confirm there.
 */
uint_t zfs_no_trim = 0;
47fb05b94aSJerry Jelinek 
/*
 * Tunable parameter for debugging or performance analysis. Setting this
 * will cause pool corruption on power loss if a volatile out-of-order
 * write cache is enabled.
 *
 * Defaults to B_FALSE.
 */
boolean_t zfs_nocacheflush = B_FALSE;
54f8fdf681SPrakash Surya 
/*
 * Virtual device vector for disks.
 */

/*
 * LDI identifier defined outside this file.  vdev_disk_open() passes it as
 * the final argument to its ldi_open_by_name(), ldi_open_by_devid() and
 * ldi_open_by_dev() calls.
 */
extern ldi_ident_t zfs_li;

/* Forward declaration of a file-local function. */
static void vdev_disk_close(vdev_t *);
6239cddb10SJoshua M. Clulow 
/*
 * Private per-vdev state for a disk vdev.  It is allocated (zeroed) by
 * vdev_disk_alloc(), stored in vd->vdev_tsd, and released by
 * vdev_disk_free().
 */
typedef struct vdev_disk {
	ddi_devid_t	vd_devid;	/* devid decoded from vd->vdev_devid */
	char		*vd_minor;	/* minor name decoded with vd_devid */
	ldi_handle_t	vd_lh;		/* handle from ldi_open_by_*() */
	list_t		vd_ldi_cbs;	/* list of vdev_disk_ldi_cb_t */
	boolean_t	vd_ldi_offline;	/* set by vdev_disk_off_notify() */
} vdev_disk_t;
70ac04831dSMike Gerdts 
/*
 * Pairs a buf_t with a zio_t pointer.  No use of this type is visible in
 * this part of the file; presumably vdb_io is the zio on whose behalf
 * vdb_buf was issued -- confirm against the I/O path.
 */
typedef struct vdev_disk_buf {
	buf_t	vdb_buf;
	zio_t	*vdb_io;
} vdev_disk_buf_t;
75ac04831dSMike Gerdts 
/*
 * One entry on vdev_disk_t.vd_ldi_cbs.  vdev_disk_free() walks that list,
 * passes each lcb_id to ldi_ev_remove_callbacks() and frees the entry.
 */
typedef struct vdev_disk_ldi_cb {
	list_node_t		lcb_next;	/* linkage on vd_ldi_cbs */
	ldi_callback_id_t	lcb_id;		/* id used to remove callbacks */
} vdev_disk_ldi_cb_t;
8039cddb10SJoshua M. Clulow 
/*
 * Bypass the devid when opening a disk vdev.
 * There have been issues where the devids of several devices were shuffled,
 * causing pool open failures. Note that this flag is intended to be used
 * for pool recovery only.
 *
 * Note that if a pool is imported with the devids bypassed, all its vdevs will
 * cease storing devid information permanently. In practice, the devid is rarely
 * useful as vdev paths do not tend to change unless the hardware is
 * reconfigured. That said, if the paths do change and a pool fails to open
 * automatically at boot, a simple zpool import should re-scan the paths and fix
 * the issue.
 *
 * Defaults to B_FALSE.  When set, vdev_disk_open() frees vd->vdev_devid and
 * sets it to NULL before attempting to open the device.
 */
boolean_t vdev_disk_bypass_devid = B_FALSE;
956fe4f300SPavel Zakharov 
9639cddb10SJoshua M. Clulow static void
vdev_disk_alloc(vdev_t * vd)9739cddb10SJoshua M. Clulow vdev_disk_alloc(vdev_t *vd)
9839cddb10SJoshua M. Clulow {
9939cddb10SJoshua M. Clulow 	vdev_disk_t *dvd;
10039cddb10SJoshua M. Clulow 
10139cddb10SJoshua M. Clulow 	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
10239cddb10SJoshua M. Clulow 	/*
10339cddb10SJoshua M. Clulow 	 * Create the LDI event callback list.
10439cddb10SJoshua M. Clulow 	 */
10539cddb10SJoshua M. Clulow 	list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
10639cddb10SJoshua M. Clulow 	    offsetof(vdev_disk_ldi_cb_t, lcb_next));
10739cddb10SJoshua M. Clulow }
10839cddb10SJoshua M. Clulow 
10939cddb10SJoshua M. Clulow static void
vdev_disk_free(vdev_t * vd)11039cddb10SJoshua M. Clulow vdev_disk_free(vdev_t *vd)
11139cddb10SJoshua M. Clulow {
11239cddb10SJoshua M. Clulow 	vdev_disk_t *dvd = vd->vdev_tsd;
11339cddb10SJoshua M. Clulow 	vdev_disk_ldi_cb_t *lcb;
11439cddb10SJoshua M. Clulow 
11539cddb10SJoshua M. Clulow 	if (dvd == NULL)
11639cddb10SJoshua M. Clulow 		return;
11739cddb10SJoshua M. Clulow 
11839cddb10SJoshua M. Clulow 	/*
11939cddb10SJoshua M. Clulow 	 * We have already closed the LDI handle. Clean up the LDI event
12039cddb10SJoshua M. Clulow 	 * callbacks and free vd->vdev_tsd.
12139cddb10SJoshua M. Clulow 	 */
12239cddb10SJoshua M. Clulow 	while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
12339cddb10SJoshua M. Clulow 		list_remove(&dvd->vd_ldi_cbs, lcb);
12439cddb10SJoshua M. Clulow 		(void) ldi_ev_remove_callbacks(lcb->lcb_id);
12539cddb10SJoshua M. Clulow 		kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
12639cddb10SJoshua M. Clulow 	}
12739cddb10SJoshua M. Clulow 	list_destroy(&dvd->vd_ldi_cbs);
12839cddb10SJoshua M. Clulow 	kmem_free(dvd, sizeof (vdev_disk_t));
12939cddb10SJoshua M. Clulow 	vd->vdev_tsd = NULL;
13039cddb10SJoshua M. Clulow }
13139cddb10SJoshua M. Clulow 
13239cddb10SJoshua M. Clulow static int
vdev_disk_off_notify(ldi_handle_t lh __unused,ldi_ev_cookie_t ecookie,void * arg,void * ev_data __unused)1331b500975SMike Gerdts vdev_disk_off_notify(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie,
1341b500975SMike Gerdts     void *arg, void *ev_data __unused)
13539cddb10SJoshua M. Clulow {
13639cddb10SJoshua M. Clulow 	vdev_t *vd = (vdev_t *)arg;
13739cddb10SJoshua M. Clulow 	vdev_disk_t *dvd = vd->vdev_tsd;
13839cddb10SJoshua M. Clulow 
13939cddb10SJoshua M. Clulow 	/*
14039cddb10SJoshua M. Clulow 	 * Ignore events other than offline.
14139cddb10SJoshua M. Clulow 	 */
14239cddb10SJoshua M. Clulow 	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
14339cddb10SJoshua M. Clulow 		return (LDI_EV_SUCCESS);
14439cddb10SJoshua M. Clulow 
14539cddb10SJoshua M. Clulow 	/*
1461b500975SMike Gerdts 	 * Tell any new threads that stumble upon this vdev that they should not
1471b500975SMike Gerdts 	 * try to do I/O.
14839cddb10SJoshua M. Clulow 	 */
14939cddb10SJoshua M. Clulow 	dvd->vd_ldi_offline = B_TRUE;
15039cddb10SJoshua M. Clulow 
15139cddb10SJoshua M. Clulow 	/*
1521b500975SMike Gerdts 	 * Request that the spa_async_thread mark the device as REMOVED and
1531b500975SMike Gerdts 	 * notify FMA of the removal.  This should also trigger a vdev_close()
1541b500975SMike Gerdts 	 * in the async thread.
15539cddb10SJoshua M. Clulow 	 */
15639cddb10SJoshua M. Clulow 	zfs_post_remove(vd->vdev_spa, vd);
15739cddb10SJoshua M. Clulow 	vd->vdev_remove_wanted = B_TRUE;
15839cddb10SJoshua M. Clulow 	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
15939cddb10SJoshua M. Clulow 
16039cddb10SJoshua M. Clulow 	return (LDI_EV_SUCCESS);
16139cddb10SJoshua M. Clulow }
16239cddb10SJoshua M. Clulow 
16339cddb10SJoshua M. Clulow static void
vdev_disk_off_finalize(ldi_handle_t lh __unused,ldi_ev_cookie_t ecookie,int ldi_result,void * arg,void * ev_data __unused)1641b500975SMike Gerdts vdev_disk_off_finalize(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie,
1651b500975SMike Gerdts     int ldi_result, void *arg, void *ev_data __unused)
16639cddb10SJoshua M. Clulow {
16739cddb10SJoshua M. Clulow 	vdev_t *vd = (vdev_t *)arg;
16839cddb10SJoshua M. Clulow 
16939cddb10SJoshua M. Clulow 	/*
17039cddb10SJoshua M. Clulow 	 * Ignore events other than offline.
17139cddb10SJoshua M. Clulow 	 */
17239cddb10SJoshua M. Clulow 	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
17339cddb10SJoshua M. Clulow 		return;
17439cddb10SJoshua M. Clulow 
17539cddb10SJoshua M. Clulow 	/*
17639cddb10SJoshua M. Clulow 	 * Request that the vdev be reopened if the offline state change was
17739cddb10SJoshua M. Clulow 	 * unsuccessful.
17839cddb10SJoshua M. Clulow 	 */
17939cddb10SJoshua M. Clulow 	if (ldi_result != LDI_EV_SUCCESS) {
18039cddb10SJoshua M. Clulow 		vd->vdev_probe_wanted = B_TRUE;
18139cddb10SJoshua M. Clulow 		spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE);
18239cddb10SJoshua M. Clulow 	}
18339cddb10SJoshua M. Clulow }
18439cddb10SJoshua M. Clulow 
/*
 * Callback vector for LDI offline events: vdev_disk_off_notify() is the
 * notify handler and vdev_disk_off_finalize() the finalize handler.
 * Registration of this vector is not visible in this part of the file.
 */
static ldi_ev_callback_t vdev_disk_off_callb = {
	.cb_vers = LDI_EV_CB_VERS,
	.cb_notify = vdev_disk_off_notify,
	.cb_finalize = vdev_disk_off_finalize
};
19039cddb10SJoshua M. Clulow 
19139cddb10SJoshua M. Clulow static void
vdev_disk_dgrd_finalize(ldi_handle_t lh __unused,ldi_ev_cookie_t ecookie,int ldi_result,void * arg,void * ev_data __unused)1921b500975SMike Gerdts vdev_disk_dgrd_finalize(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie,
1931b500975SMike Gerdts     int ldi_result, void *arg, void *ev_data __unused)
19439cddb10SJoshua M. Clulow {
19539cddb10SJoshua M. Clulow 	vdev_t *vd = (vdev_t *)arg;
19639cddb10SJoshua M. Clulow 
19739cddb10SJoshua M. Clulow 	/*
19839cddb10SJoshua M. Clulow 	 * Ignore events other than degrade.
19939cddb10SJoshua M. Clulow 	 */
20039cddb10SJoshua M. Clulow 	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0)
20139cddb10SJoshua M. Clulow 		return;
20239cddb10SJoshua M. Clulow 
20339cddb10SJoshua M. Clulow 	/*
20439cddb10SJoshua M. Clulow 	 * Degrade events always succeed. Mark the vdev as degraded.
20539cddb10SJoshua M. Clulow 	 * This status is purely informative for the user.
20639cddb10SJoshua M. Clulow 	 */
20739cddb10SJoshua M. Clulow 	(void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0);
20839cddb10SJoshua M. Clulow }
20939cddb10SJoshua M. Clulow 
/*
 * Callback vector for LDI degrade events.  There is no notify handler;
 * only vdev_disk_dgrd_finalize() is invoked.  Registration of this vector
 * is not visible in this part of the file.
 */
static ldi_ev_callback_t vdev_disk_dgrd_callb = {
	.cb_vers = LDI_EV_CB_VERS,
	.cb_notify = NULL,
	.cb_finalize = vdev_disk_dgrd_finalize
};
21539cddb10SJoshua M. Clulow 
216dcba9f3fSGeorge Wilson static void
vdev_disk_hold(vdev_t * vd)217dcba9f3fSGeorge Wilson vdev_disk_hold(vdev_t *vd)
218dcba9f3fSGeorge Wilson {
219dcba9f3fSGeorge Wilson 	ddi_devid_t devid;
220dcba9f3fSGeorge Wilson 	char *minor;
221dcba9f3fSGeorge Wilson 
222dcba9f3fSGeorge Wilson 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
223dcba9f3fSGeorge Wilson 
224dcba9f3fSGeorge Wilson 	/*
225dcba9f3fSGeorge Wilson 	 * We must have a pathname, and it must be absolute.
226dcba9f3fSGeorge Wilson 	 */
227dcba9f3fSGeorge Wilson 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
228dcba9f3fSGeorge Wilson 		return;
229dcba9f3fSGeorge Wilson 
230dcba9f3fSGeorge Wilson 	/*
231dcba9f3fSGeorge Wilson 	 * Only prefetch path and devid info if the device has
232dcba9f3fSGeorge Wilson 	 * never been opened.
233dcba9f3fSGeorge Wilson 	 */
234dcba9f3fSGeorge Wilson 	if (vd->vdev_tsd != NULL)
235dcba9f3fSGeorge Wilson 		return;
236dcba9f3fSGeorge Wilson 
237dcba9f3fSGeorge Wilson 	if (vd->vdev_wholedisk == -1ULL) {
238dcba9f3fSGeorge Wilson 		size_t len = strlen(vd->vdev_path) + 3;
239dcba9f3fSGeorge Wilson 		char *buf = kmem_alloc(len, KM_SLEEP);
240dcba9f3fSGeorge Wilson 
241dcba9f3fSGeorge Wilson 		(void) snprintf(buf, len, "%ss0", vd->vdev_path);
242dcba9f3fSGeorge Wilson 
243dcba9f3fSGeorge Wilson 		(void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
244dcba9f3fSGeorge Wilson 		kmem_free(buf, len);
245dcba9f3fSGeorge Wilson 	}
246dcba9f3fSGeorge Wilson 
247dcba9f3fSGeorge Wilson 	if (vd->vdev_name_vp == NULL)
248dcba9f3fSGeorge Wilson 		(void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);
249dcba9f3fSGeorge Wilson 
250dcba9f3fSGeorge Wilson 	if (vd->vdev_devid != NULL &&
251dcba9f3fSGeorge Wilson 	    ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
252dcba9f3fSGeorge Wilson 		(void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
253dcba9f3fSGeorge Wilson 		ddi_devid_str_free(minor);
254dcba9f3fSGeorge Wilson 		ddi_devid_free(devid);
255dcba9f3fSGeorge Wilson 	}
256dcba9f3fSGeorge Wilson }
257dcba9f3fSGeorge Wilson 
258dcba9f3fSGeorge Wilson static void
vdev_disk_rele(vdev_t * vd)259dcba9f3fSGeorge Wilson vdev_disk_rele(vdev_t *vd)
260dcba9f3fSGeorge Wilson {
261dcba9f3fSGeorge Wilson 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
262dcba9f3fSGeorge Wilson 
263dcba9f3fSGeorge Wilson 	if (vd->vdev_name_vp) {
264dcba9f3fSGeorge Wilson 		VN_RELE_ASYNC(vd->vdev_name_vp,
265dcba9f3fSGeorge Wilson 		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
266dcba9f3fSGeorge Wilson 		vd->vdev_name_vp = NULL;
267dcba9f3fSGeorge Wilson 	}
268dcba9f3fSGeorge Wilson 	if (vd->vdev_devid_vp) {
269dcba9f3fSGeorge Wilson 		VN_RELE_ASYNC(vd->vdev_devid_vp,
270dcba9f3fSGeorge Wilson 		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
271dcba9f3fSGeorge Wilson 		vd->vdev_devid_vp = NULL;
272dcba9f3fSGeorge Wilson 	}
273dcba9f3fSGeorge Wilson }
274dcba9f3fSGeorge Wilson 
/*
 * VDEV_DEBUG() lets DEBUG kernels be loud when DKIOCGMEDIAINFOEXT fails, or
 * when even the fallback to DKIOCGMEDIAINFO fails.  On DEBUG builds it
 * forwards its arguments to cmn_err() at CE_NOTE; otherwise it expands to
 * nothing and its arguments are never evaluated.
 */
#ifndef DEBUG
#define	VDEV_DEBUG(...)	/* Nothing... */
#else
#define	VDEV_DEBUG(...)	cmn_err(CE_NOTE, __VA_ARGS__)
#endif
284a5b57771SDan McDonald 
285fa9e4066Sahrens static int
vdev_disk_open(vdev_t * vd,uint64_t * psize,uint64_t * max_psize,uint64_t * ashift)2864263d13fSGeorge Wilson vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
2874263d13fSGeorge Wilson     uint64_t *ashift)
288fa9e4066Sahrens {
2898ad4d6ddSJeff Bonwick 	spa_t *spa = vd->vdev_spa;
29039cddb10SJoshua M. Clulow 	vdev_disk_t *dvd = vd->vdev_tsd;
29139cddb10SJoshua M. Clulow 	ldi_ev_cookie_t ecookie;
29239cddb10SJoshua M. Clulow 	vdev_disk_ldi_cb_t *lcb;
293a5b57771SDan McDonald 	union {
294a5b57771SDan McDonald 		struct dk_minfo_ext ude;
295a5b57771SDan McDonald 		struct dk_minfo ud;
296a5b57771SDan McDonald 	} dks;
297a5b57771SDan McDonald 	struct dk_minfo_ext *dkmext = &dks.ude;
298a5b57771SDan McDonald 	struct dk_minfo *dkm = &dks.ud;
299084fd14fSBrian Behlendorf 	int error, can_free;
300e14bb325SJeff Bonwick 	dev_t dev;
301e14bb325SJeff Bonwick 	int otyp;
302fb02ae02SGeorge Wilson 	boolean_t validate_devid = B_FALSE;
303a5b57771SDan McDonald 	uint64_t capacity = 0, blksz = 0, pbsize;
304*8b26092dSJoshua M. Clulow 	const char *rdpath = vdev_disk_preroot_force_path();
305fa9e4066Sahrens 
306fa9e4066Sahrens 	/*
307fa9e4066Sahrens 	 * We must have a pathname, and it must be absolute.
308fa9e4066Sahrens 	 */
309fa9e4066Sahrens 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
310fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
311be6fd75aSMatthew Ahrens 		return (SET_ERROR(EINVAL));
312fa9e4066Sahrens 	}
313fa9e4066Sahrens 
314095bcd66SGeorge Wilson 	/*
315095bcd66SGeorge Wilson 	 * Reopen the device if it's not currently open. Otherwise,
316095bcd66SGeorge Wilson 	 * just update the physical size of the device.
317095bcd66SGeorge Wilson 	 */
31839cddb10SJoshua M. Clulow 	if (dvd != NULL) {
3191b500975SMike Gerdts 		ASSERT(vd->vdev_reopening);
3201b500975SMike Gerdts 		goto skip_open;
321095bcd66SGeorge Wilson 	}
322095bcd66SGeorge Wilson 
32339cddb10SJoshua M. Clulow 	/*
32439cddb10SJoshua M. Clulow 	 * Create vd->vdev_tsd.
32539cddb10SJoshua M. Clulow 	 */
32639cddb10SJoshua M. Clulow 	vdev_disk_alloc(vd);
32739cddb10SJoshua M. Clulow 	dvd = vd->vdev_tsd;
328fa9e4066Sahrens 
3296fe4f300SPavel Zakharov 	/*
3306fe4f300SPavel Zakharov 	 * Allow bypassing the devid.
3316fe4f300SPavel Zakharov 	 */
332*8b26092dSJoshua M. Clulow 	if (vd->vdev_devid != NULL &&
333*8b26092dSJoshua M. Clulow 	    (vdev_disk_bypass_devid || rdpath != NULL)) {
3346fe4f300SPavel Zakharov 		vdev_dbgmsg(vd, "vdev_disk_open, devid %s bypassed",
3356fe4f300SPavel Zakharov 		    vd->vdev_devid);
3366fe4f300SPavel Zakharov 		spa_strfree(vd->vdev_devid);
3376fe4f300SPavel Zakharov 		vd->vdev_devid = NULL;
3386fe4f300SPavel Zakharov 	}
3396fe4f300SPavel Zakharov 
340fa9e4066Sahrens 	/*
341fa9e4066Sahrens 	 * When opening a disk device, we want to preserve the user's original
342fa9e4066Sahrens 	 * intent.  We always want to open the device by the path the user gave
3431724dc7bSJoshua M. Clulow 	 * us, even if it is one of multiple paths to the same device.  But we
344fa9e4066Sahrens 	 * also want to be able to survive disks being removed/recabled.
345fa9e4066Sahrens 	 * Therefore the sequence of opening devices is:
346fa9e4066Sahrens 	 *
347afefbcddSeschrock 	 * 1. Try opening the device by path.  For legacy pools without the
348afefbcddSeschrock 	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
349fa9e4066Sahrens 	 *
350fa9e4066Sahrens 	 * 2. If the devid of the device matches the stored value, return
351fa9e4066Sahrens 	 *    success.
352fa9e4066Sahrens 	 *
353fa9e4066Sahrens 	 * 3. Otherwise, the device may have moved.  Try opening the device
354fa9e4066Sahrens 	 *    by the devid instead.
355fa9e4066Sahrens 	 */
356fa9e4066Sahrens 	if (vd->vdev_devid != NULL) {
357fa9e4066Sahrens 		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
358fa9e4066Sahrens 		    &dvd->vd_minor) != 0) {
35929621f01SHans Rosenfeld 			vdev_dbgmsg(vd,
36029621f01SHans Rosenfeld 			    "vdev_disk_open, invalid devid %s bypassed",
36129621f01SHans Rosenfeld 			    vd->vdev_devid);
36229621f01SHans Rosenfeld 			spa_strfree(vd->vdev_devid);
36329621f01SHans Rosenfeld 			vd->vdev_devid = NULL;
364fa9e4066Sahrens 		}
365fa9e4066Sahrens 	}
366fa9e4066Sahrens 
367fa9e4066Sahrens 	error = EINVAL;		/* presume failure */
368fa9e4066Sahrens 
369*8b26092dSJoshua M. Clulow 	if (rdpath != NULL) {
370*8b26092dSJoshua M. Clulow 		/*
371*8b26092dSJoshua M. Clulow 		 * We have been asked to open only a specific root device, and
372*8b26092dSJoshua M. Clulow 		 * to fail otherwise.
373*8b26092dSJoshua M. Clulow 		 */
374*8b26092dSJoshua M. Clulow 		error = ldi_open_by_name((char *)rdpath, spa_mode(spa), kcred,
375*8b26092dSJoshua M. Clulow 		    &dvd->vd_lh, zfs_li);
376*8b26092dSJoshua M. Clulow 		validate_devid = B_TRUE;
377*8b26092dSJoshua M. Clulow 		goto rootdisk_only;
378*8b26092dSJoshua M. Clulow 	}
379*8b26092dSJoshua M. Clulow 
380095bcd66SGeorge Wilson 	if (vd->vdev_path != NULL) {
381afefbcddSeschrock 		if (vd->vdev_wholedisk == -1ULL) {
382afefbcddSeschrock 			size_t len = strlen(vd->vdev_path) + 3;
383afefbcddSeschrock 			char *buf = kmem_alloc(len, KM_SLEEP);
384afefbcddSeschrock 
385afefbcddSeschrock 			(void) snprintf(buf, len, "%ss0", vd->vdev_path);
386afefbcddSeschrock 
38739cddb10SJoshua M. Clulow 			error = ldi_open_by_name(buf, spa_mode(spa), kcred,
38839cddb10SJoshua M. Clulow 			    &dvd->vd_lh, zfs_li);
38939cddb10SJoshua M. Clulow 			if (error == 0) {
390afefbcddSeschrock 				spa_strfree(vd->vdev_path);
391afefbcddSeschrock 				vd->vdev_path = buf;
392afefbcddSeschrock 				vd->vdev_wholedisk = 1ULL;
393afefbcddSeschrock 			} else {
394afefbcddSeschrock 				kmem_free(buf, len);
395afefbcddSeschrock 			}
396afefbcddSeschrock 		}
397fa9e4066Sahrens 
39839cddb10SJoshua M. Clulow 		/*
39939cddb10SJoshua M. Clulow 		 * If we have not yet opened the device, try to open it by the
40039cddb10SJoshua M. Clulow 		 * specified path.
40139cddb10SJoshua M. Clulow 		 */
40239cddb10SJoshua M. Clulow 		if (error != 0) {
40339cddb10SJoshua M. Clulow 			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
40439cddb10SJoshua M. Clulow 			    kcred, &dvd->vd_lh, zfs_li);
40539cddb10SJoshua M. Clulow 		}
406fa9e4066Sahrens 
407fa9e4066Sahrens 		/*
408fa9e4066Sahrens 		 * Compare the devid to the stored value.
409fa9e4066Sahrens 		 */
4106af23589SJoshua M. Clulow 		if (error == 0 && vd->vdev_devid != NULL) {
4116af23589SJoshua M. Clulow 			ddi_devid_t devid = NULL;
4126af23589SJoshua M. Clulow 
4136af23589SJoshua M. Clulow 			if (ldi_get_devid(dvd->vd_lh, &devid) != 0) {
4146af23589SJoshua M. Clulow 				/*
4156af23589SJoshua M. Clulow 				 * We expected a devid on this device but it no
4166af23589SJoshua M. Clulow 				 * longer appears to have one.  The validation
4176af23589SJoshua M. Clulow 				 * step may need to remove it from the
4186af23589SJoshua M. Clulow 				 * configuration.
4196af23589SJoshua M. Clulow 				 */
4206af23589SJoshua M. Clulow 				validate_devid = B_TRUE;
4216af23589SJoshua M. Clulow 
4226af23589SJoshua M. Clulow 			} else if (ddi_devid_compare(devid, dvd->vd_devid) !=
4236af23589SJoshua M. Clulow 			    0) {
4246fe4f300SPavel Zakharov 				/*
4256fe4f300SPavel Zakharov 				 * A mismatch here is unexpected, log it.
4266fe4f300SPavel Zakharov 				 */
4276fe4f300SPavel Zakharov 				char *devid_str = ddi_devid_str_encode(devid,
4286fe4f300SPavel Zakharov 				    dvd->vd_minor);
4296fe4f300SPavel Zakharov 				vdev_dbgmsg(vd, "vdev_disk_open: devid "
4306fe4f300SPavel Zakharov 				    "mismatch: %s != %s", vd->vdev_devid,
4316fe4f300SPavel Zakharov 				    devid_str);
4326fe4f300SPavel Zakharov 				cmn_err(CE_NOTE, "vdev_disk_open %s: devid "
4336fe4f300SPavel Zakharov 				    "mismatch: %s != %s", vd->vdev_path,
4346fe4f300SPavel Zakharov 				    vd->vdev_devid, devid_str);
4356fe4f300SPavel Zakharov 				ddi_devid_str_free(devid_str);
4366fe4f300SPavel Zakharov 
437be6fd75aSMatthew Ahrens 				error = SET_ERROR(EINVAL);
4388ad4d6ddSJeff Bonwick 				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
4398ad4d6ddSJeff Bonwick 				    kcred);
440fa9e4066Sahrens 				dvd->vd_lh = NULL;
441fa9e4066Sahrens 			}
4426af23589SJoshua M. Clulow 
4436af23589SJoshua M. Clulow 			if (devid != NULL) {
4446af23589SJoshua M. Clulow 				ddi_devid_free(devid);
4456af23589SJoshua M. Clulow 			}
446fa9e4066Sahrens 		}
447afefbcddSeschrock 
448afefbcddSeschrock 		/*
449afefbcddSeschrock 		 * If we succeeded in opening the device, but 'vdev_wholedisk'
450afefbcddSeschrock 		 * is not yet set, then this must be a slice.
451afefbcddSeschrock 		 */
452afefbcddSeschrock 		if (error == 0 && vd->vdev_wholedisk == -1ULL)
453afefbcddSeschrock 			vd->vdev_wholedisk = 0;
454fa9e4066Sahrens 	}
455fa9e4066Sahrens 
456fa9e4066Sahrens 	/*
457fa9e4066Sahrens 	 * If we were unable to open by path, or the devid check fails, open by
458fa9e4066Sahrens 	 * devid instead.
459fa9e4066Sahrens 	 */
460fb02ae02SGeorge Wilson 	if (error != 0 && vd->vdev_devid != NULL) {
461fa9e4066Sahrens 		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
4628ad4d6ddSJeff Bonwick 		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
4636fe4f300SPavel Zakharov 		if (error != 0) {
4646fe4f300SPavel Zakharov 			vdev_dbgmsg(vd, "Failed to open by devid (%s)",
4656fe4f300SPavel Zakharov 			    vd->vdev_devid);
4666fe4f300SPavel Zakharov 		}
467fb02ae02SGeorge Wilson 	}
468fa9e4066Sahrens 
4693d7072f8Seschrock 	/*
4703d7072f8Seschrock 	 * If all else fails, then try opening by physical path (if available)
4713d7072f8Seschrock 	 * or the logical path (if we failed due to the devid check).  While not
4723d7072f8Seschrock 	 * as reliable as the devid, this will give us something, and the higher
4733d7072f8Seschrock 	 * level vdev validation will prevent us from opening the wrong device.
4743d7072f8Seschrock 	 */
4756af23589SJoshua M. Clulow 	if (error != 0) {
4766af23589SJoshua M. Clulow 		validate_devid = B_TRUE;
477fb02ae02SGeorge Wilson 
4783d7072f8Seschrock 		if (vd->vdev_physpath != NULL &&
4796af23589SJoshua M. Clulow 		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) {
4808ad4d6ddSJeff Bonwick 			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
4813d7072f8Seschrock 			    kcred, &dvd->vd_lh, zfs_li);
4826af23589SJoshua M. Clulow 		}
4833d7072f8Seschrock 
4843d7072f8Seschrock 		/*
4853d7072f8Seschrock 		 * Note that we don't support the legacy auto-wholedisk support
4863d7072f8Seschrock 		 * as above.  This hasn't been used in a very long time and we
4873d7072f8Seschrock 		 * don't need to propagate its oddities to this edge condition.
4883d7072f8Seschrock 		 */
4896af23589SJoshua M. Clulow 		if (error != 0 && vd->vdev_path != NULL) {
4908ad4d6ddSJeff Bonwick 			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
4918ad4d6ddSJeff Bonwick 			    kcred, &dvd->vd_lh, zfs_li);
4926af23589SJoshua M. Clulow 		}
4933d7072f8Seschrock 	}
4943d7072f8Seschrock 
49530c304d9SJoshua M. Clulow 	/*
49630c304d9SJoshua M. Clulow 	 * If this is early in boot, a sweep of available block devices may
49730c304d9SJoshua M. Clulow 	 * locate an alternative path that we can try.
49830c304d9SJoshua M. Clulow 	 */
49930c304d9SJoshua M. Clulow 	if (error != 0) {
50030c304d9SJoshua M. Clulow 		const char *altdevpath = vdev_disk_preroot_lookup(
50130c304d9SJoshua M. Clulow 		    spa_guid(spa), vd->vdev_guid);
50230c304d9SJoshua M. Clulow 
50330c304d9SJoshua M. Clulow 		if (altdevpath != NULL) {
50430c304d9SJoshua M. Clulow 			vdev_dbgmsg(vd, "Trying alternate preroot path (%s)",
50530c304d9SJoshua M. Clulow 			    altdevpath);
50630c304d9SJoshua M. Clulow 
50730c304d9SJoshua M. Clulow 			validate_devid = B_TRUE;
50830c304d9SJoshua M. Clulow 
50930c304d9SJoshua M. Clulow 			if ((error = ldi_open_by_name((char *)altdevpath,
51030c304d9SJoshua M. Clulow 			    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li)) != 0) {
51130c304d9SJoshua M. Clulow 				vdev_dbgmsg(vd, "Failed to open by preroot "
51230c304d9SJoshua M. Clulow 				    "path (%s)", altdevpath);
51330c304d9SJoshua M. Clulow 			}
51430c304d9SJoshua M. Clulow 		}
51530c304d9SJoshua M. Clulow 	}
51630c304d9SJoshua M. Clulow 
517*8b26092dSJoshua M. Clulow rootdisk_only:
5186af23589SJoshua M. Clulow 	if (error != 0) {
519fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
5203ee8c80cSPavel Zakharov 		vdev_dbgmsg(vd, "vdev_disk_open: failed to open [error=%d]",
5213ee8c80cSPavel Zakharov 		    error);
522fa9e4066Sahrens 		return (error);
523e14bb325SJeff Bonwick 	}
524fa9e4066Sahrens 
525fb02ae02SGeorge Wilson 	/*
526fb02ae02SGeorge Wilson 	 * Now that the device has been successfully opened, update the devid
527fb02ae02SGeorge Wilson 	 * if necessary.
528fb02ae02SGeorge Wilson 	 */
5296af23589SJoshua M. Clulow 	if (validate_devid) {
5306af23589SJoshua M. Clulow 		ddi_devid_t devid = NULL;
5316af23589SJoshua M. Clulow 		char *minorname = NULL;
5326af23589SJoshua M. Clulow 		char *vd_devid = NULL;
5336af23589SJoshua M. Clulow 		boolean_t remove = B_FALSE, update = B_FALSE;
5346af23589SJoshua M. Clulow 
5356af23589SJoshua M. Clulow 		/*
5366af23589SJoshua M. Clulow 		 * Get the current devid and minor name for the device we
5376af23589SJoshua M. Clulow 		 * opened.
5386af23589SJoshua M. Clulow 		 */
5396af23589SJoshua M. Clulow 		if (ldi_get_devid(dvd->vd_lh, &devid) != 0 ||
5406af23589SJoshua M. Clulow 		    ldi_get_minor_name(dvd->vd_lh, &minorname) != 0) {
5416af23589SJoshua M. Clulow 			/*
5426af23589SJoshua M. Clulow 			 * If we are unable to get the devid or the minor name
5436af23589SJoshua M. Clulow 			 * for the device, we need to remove them from the
5446af23589SJoshua M. Clulow 			 * configuration to prevent potential inconsistencies.
5456af23589SJoshua M. Clulow 			 */
5466af23589SJoshua M. Clulow 			if (dvd->vd_minor != NULL || dvd->vd_devid != NULL ||
5476af23589SJoshua M. Clulow 			    vd->vdev_devid != NULL) {
5486af23589SJoshua M. Clulow 				/*
5496af23589SJoshua M. Clulow 				 * We only need to remove the devid if one
5506af23589SJoshua M. Clulow 				 * exists.
5516af23589SJoshua M. Clulow 				 */
5526af23589SJoshua M. Clulow 				remove = B_TRUE;
5536af23589SJoshua M. Clulow 			}
554fb02ae02SGeorge Wilson 
5556af23589SJoshua M. Clulow 		} else if (dvd->vd_devid == NULL || dvd->vd_minor == NULL) {
5566af23589SJoshua M. Clulow 			/*
5576af23589SJoshua M. Clulow 			 * There was previously no devid at all so we need to
5586af23589SJoshua M. Clulow 			 * add one.
5596af23589SJoshua M. Clulow 			 */
5606af23589SJoshua M. Clulow 			update = B_TRUE;
5616af23589SJoshua M. Clulow 
5626af23589SJoshua M. Clulow 		} else if (ddi_devid_compare(devid, dvd->vd_devid) != 0 ||
5636af23589SJoshua M. Clulow 		    strcmp(minorname, dvd->vd_minor) != 0) {
5646af23589SJoshua M. Clulow 			/*
5656af23589SJoshua M. Clulow 			 * The devid or minor name on file does not match the
5666af23589SJoshua M. Clulow 			 * one from the opened device.
5676af23589SJoshua M. Clulow 			 */
5686af23589SJoshua M. Clulow 			update = B_TRUE;
5696af23589SJoshua M. Clulow 		}
5706af23589SJoshua M. Clulow 
5716af23589SJoshua M. Clulow 		if (update) {
5726af23589SJoshua M. Clulow 			/*
5736af23589SJoshua M. Clulow 			 * Render the new devid and minor name as a string for
5746af23589SJoshua M. Clulow 			 * logging and to store in the vdev configuration.
5756af23589SJoshua M. Clulow 			 */
5766af23589SJoshua M. Clulow 			vd_devid = ddi_devid_str_encode(devid, minorname);
5776af23589SJoshua M. Clulow 		}
5786af23589SJoshua M. Clulow 
5796af23589SJoshua M. Clulow 		if (update || remove) {
5803ee8c80cSPavel Zakharov 			vdev_dbgmsg(vd, "vdev_disk_open: update devid from "
5816af23589SJoshua M. Clulow 			    "'%s' to '%s'",
5826af23589SJoshua M. Clulow 			    vd->vdev_devid != NULL ? vd->vdev_devid : "<none>",
5836af23589SJoshua M. Clulow 			    vd_devid != NULL ? vd_devid : "<none>");
5846fe4f300SPavel Zakharov 			cmn_err(CE_NOTE, "vdev_disk_open %s: update devid "
5856af23589SJoshua M. Clulow 			    "from '%s' to '%s'",
5866af23589SJoshua M. Clulow 			    vd->vdev_path != NULL ? vd->vdev_path : "?",
5876af23589SJoshua M. Clulow 			    vd->vdev_devid != NULL ? vd->vdev_devid : "<none>",
5886af23589SJoshua M. Clulow 			    vd_devid != NULL ? vd_devid : "<none>");
5896af23589SJoshua M. Clulow 
5906af23589SJoshua M. Clulow 			/*
5916af23589SJoshua M. Clulow 			 * Remove and free any existing values.
5926af23589SJoshua M. Clulow 			 */
5936af23589SJoshua M. Clulow 			if (dvd->vd_minor != NULL) {
5946af23589SJoshua M. Clulow 				ddi_devid_str_free(dvd->vd_minor);
5956af23589SJoshua M. Clulow 				dvd->vd_minor = NULL;
5966af23589SJoshua M. Clulow 			}
5976af23589SJoshua M. Clulow 			if (dvd->vd_devid != NULL) {
5986af23589SJoshua M. Clulow 				ddi_devid_free(dvd->vd_devid);
5996af23589SJoshua M. Clulow 				dvd->vd_devid = NULL;
6006af23589SJoshua M. Clulow 			}
6016af23589SJoshua M. Clulow 			if (vd->vdev_devid != NULL) {
6026af23589SJoshua M. Clulow 				spa_strfree(vd->vdev_devid);
6036af23589SJoshua M. Clulow 				vd->vdev_devid = NULL;
6046af23589SJoshua M. Clulow 			}
6056af23589SJoshua M. Clulow 		}
6066af23589SJoshua M. Clulow 
6076af23589SJoshua M. Clulow 		if (update) {
6086af23589SJoshua M. Clulow 			/*
6096af23589SJoshua M. Clulow 			 * Install the new values.
6106af23589SJoshua M. Clulow 			 */
6116af23589SJoshua M. Clulow 			vd->vdev_devid = vd_devid;
6126af23589SJoshua M. Clulow 			dvd->vd_minor = minorname;
6136af23589SJoshua M. Clulow 			dvd->vd_devid = devid;
6146af23589SJoshua M. Clulow 
6156af23589SJoshua M. Clulow 		} else {
6166af23589SJoshua M. Clulow 			if (devid != NULL) {
6176af23589SJoshua M. Clulow 				ddi_devid_free(devid);
6186af23589SJoshua M. Clulow 			}
6196af23589SJoshua M. Clulow 			if (minorname != NULL) {
6206af23589SJoshua M. Clulow 				kmem_free(minorname, strlen(minorname) + 1);
6216af23589SJoshua M. Clulow 			}
622fb02ae02SGeorge Wilson 		}
623fb02ae02SGeorge Wilson 	}
624fb02ae02SGeorge Wilson 
6253d7072f8Seschrock 	/*
6263d7072f8Seschrock 	 * Once a device is opened, verify that the physical device path (if
6273d7072f8Seschrock 	 * available) is up to date.
6283d7072f8Seschrock 	 */
6293d7072f8Seschrock 	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
6303d7072f8Seschrock 	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
6310a4e9518Sgw 		char *physpath, *minorname;
6320a4e9518Sgw 
6333d7072f8Seschrock 		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
6343d7072f8Seschrock 		minorname = NULL;
6353d7072f8Seschrock 		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
6363d7072f8Seschrock 		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
6373d7072f8Seschrock 		    (vd->vdev_physpath == NULL ||
6383d7072f8Seschrock 		    strcmp(vd->vdev_physpath, physpath) != 0)) {
6393d7072f8Seschrock 			if (vd->vdev_physpath)
6403d7072f8Seschrock 				spa_strfree(vd->vdev_physpath);
6413d7072f8Seschrock 			(void) strlcat(physpath, ":", MAXPATHLEN);
6423d7072f8Seschrock 			(void) strlcat(physpath, minorname, MAXPATHLEN);
6433d7072f8Seschrock 			vd->vdev_physpath = spa_strdup(physpath);
6443d7072f8Seschrock 		}
6453d7072f8Seschrock 		if (minorname)
6463d7072f8Seschrock 			kmem_free(minorname, strlen(minorname) + 1);
6473d7072f8Seschrock 		kmem_free(physpath, MAXPATHLEN);
6483d7072f8Seschrock 	}
6493d7072f8Seschrock 
65039cddb10SJoshua M. Clulow 	/*
65139cddb10SJoshua M. Clulow 	 * Register callbacks for the LDI offline event.
65239cddb10SJoshua M. Clulow 	 */
65339cddb10SJoshua M. Clulow 	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) ==
65439cddb10SJoshua M. Clulow 	    LDI_EV_SUCCESS) {
65539cddb10SJoshua M. Clulow 		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
65639cddb10SJoshua M. Clulow 		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
65739cddb10SJoshua M. Clulow 		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
65839cddb10SJoshua M. Clulow 		    &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id);
65939cddb10SJoshua M. Clulow 	}
66039cddb10SJoshua M. Clulow 
66139cddb10SJoshua M. Clulow 	/*
66239cddb10SJoshua M. Clulow 	 * Register callbacks for the LDI degrade event.
66339cddb10SJoshua M. Clulow 	 */
66439cddb10SJoshua M. Clulow 	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) ==
66539cddb10SJoshua M. Clulow 	    LDI_EV_SUCCESS) {
66639cddb10SJoshua M. Clulow 		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
66739cddb10SJoshua M. Clulow 		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
66839cddb10SJoshua M. Clulow 		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
66939cddb10SJoshua M. Clulow 		    &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id);
67039cddb10SJoshua M. Clulow 	}
671084fd14fSBrian Behlendorf 
672095bcd66SGeorge Wilson skip_open:
673fa9e4066Sahrens 	/*
674fa9e4066Sahrens 	 * Determine the actual size of the device.
675fa9e4066Sahrens 	 */
676fa9e4066Sahrens 	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
677fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
6783ee8c80cSPavel Zakharov 		vdev_dbgmsg(vd, "vdev_disk_open: failed to get size");
679be6fd75aSMatthew Ahrens 		return (SET_ERROR(EINVAL));
680fa9e4066Sahrens 	}
681fa9e4066Sahrens 
682a5b57771SDan McDonald 	*max_psize = *psize;
683a5b57771SDan McDonald 
684ecc2d604Sbonwick 	/*
685ecc2d604Sbonwick 	 * Determine the device's minimum transfer size.
686ecc2d604Sbonwick 	 * If the ioctl isn't supported, assume DEV_BSIZE.
687ecc2d604Sbonwick 	 */
688a5b57771SDan McDonald 	if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT,
689a5b57771SDan McDonald 	    (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) {
690a5b57771SDan McDonald 		capacity = dkmext->dki_capacity - 1;
691a5b57771SDan McDonald 		blksz = dkmext->dki_lbsize;
692a5b57771SDan McDonald 		pbsize = dkmext->dki_pbsize;
693a5b57771SDan McDonald 	} else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO,
694a5b57771SDan McDonald 	    (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) {
695a5b57771SDan McDonald 		VDEV_DEBUG(
696a5b57771SDan McDonald 		    "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n",
697a5b57771SDan McDonald 		    vd->vdev_path);
698a5b57771SDan McDonald 		capacity = dkm->dki_capacity - 1;
699a5b57771SDan McDonald 		blksz = dkm->dki_lbsize;
700a5b57771SDan McDonald 		pbsize = blksz;
701a5b57771SDan McDonald 	} else {
702a5b57771SDan McDonald 		VDEV_DEBUG("vdev_disk_open(\"%s\"): "
703a5b57771SDan McDonald 		    "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n",
704a5b57771SDan McDonald 		    vd->vdev_path, error);
705a5b57771SDan McDonald 		pbsize = DEV_BSIZE;
706a5b57771SDan McDonald 	}
707bef6b7d2Swebaker 
708bf16b11eSMatthew Ahrens 	*ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1;
709bef6b7d2Swebaker 
7104263d13fSGeorge Wilson 	if (vd->vdev_wholedisk == 1) {
7114263d13fSGeorge Wilson 		int wce = 1;
7124263d13fSGeorge Wilson 
713a5b57771SDan McDonald 		if (error == 0) {
714a5b57771SDan McDonald 			/*
715a5b57771SDan McDonald 			 * If we have the capability to expand, we'd have
716a5b57771SDan McDonald 			 * found out via success from DKIOCGMEDIAINFO{,EXT}.
717a5b57771SDan McDonald 			 * Adjust max_psize upward accordingly since we know
718a5b57771SDan McDonald 			 * we own the whole disk now.
719a5b57771SDan McDonald 			 */
720c39a2aaeSGeorge Wilson 			*max_psize = capacity * blksz;
721a5b57771SDan McDonald 		}
722a5b57771SDan McDonald 
7234263d13fSGeorge Wilson 		/*
724a5b57771SDan McDonald 		 * Since we own the whole disk, try to enable disk write
725a5b57771SDan McDonald 		 * caching.  We ignore errors because it's OK if we can't do it.
7264263d13fSGeorge Wilson 		 */
7274263d13fSGeorge Wilson 		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
7284263d13fSGeorge Wilson 		    FKIOCTL, kcred, NULL);
7294263d13fSGeorge Wilson 	}
7304263d13fSGeorge Wilson 
731b468a217Seschrock 	/*
732b468a217Seschrock 	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
733b468a217Seschrock 	 * try again.
734b468a217Seschrock 	 */
735b468a217Seschrock 	vd->vdev_nowritecache = B_FALSE;
736b468a217Seschrock 
737084fd14fSBrian Behlendorf 	if (ldi_ioctl(dvd->vd_lh, DKIOC_CANFREE, (intptr_t)&can_free, FKIOCTL,
738084fd14fSBrian Behlendorf 	    kcred, NULL) == 0 && can_free == 1) {
739084fd14fSBrian Behlendorf 		vd->vdev_has_trim = B_TRUE;
740084fd14fSBrian Behlendorf 	} else {
741084fd14fSBrian Behlendorf 		vd->vdev_has_trim = B_FALSE;
742084fd14fSBrian Behlendorf 	}
743084fd14fSBrian Behlendorf 
744fb05b94aSJerry Jelinek 	if (zfs_no_trim == 1)
745fb05b94aSJerry Jelinek 		vd->vdev_has_trim = B_FALSE;
746fb05b94aSJerry Jelinek 
747084fd14fSBrian Behlendorf 	/* Currently only supported for ZoL. */
748084fd14fSBrian Behlendorf 	vd->vdev_has_securetrim = B_FALSE;
749084fd14fSBrian Behlendorf 
75012a8814cSTom Caputi 	/* Inform the ZIO pipeline that we are non-rotational */
75112a8814cSTom Caputi 	vd->vdev_nonrot = B_FALSE;
75212a8814cSTom Caputi 	if (ldi_prop_exists(dvd->vd_lh, DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
75312a8814cSTom Caputi 	    "device-solid-state")) {
75412a8814cSTom Caputi 		if (ldi_prop_get_int(dvd->vd_lh,
75512a8814cSTom Caputi 		    LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
75612a8814cSTom Caputi 		    "device-solid-state", B_FALSE) != 0)
75712a8814cSTom Caputi 			vd->vdev_nonrot = B_TRUE;
75812a8814cSTom Caputi 	}
75912a8814cSTom Caputi 
760fa9e4066Sahrens 	return (0);
761fa9e4066Sahrens }
762fa9e4066Sahrens 
763fa9e4066Sahrens static void
vdev_disk_close(vdev_t * vd)764fa9e4066Sahrens vdev_disk_close(vdev_t *vd)
765fa9e4066Sahrens {
766fa9e4066Sahrens 	vdev_disk_t *dvd = vd->vdev_tsd;
767fa9e4066Sahrens 
768095bcd66SGeorge Wilson 	if (vd->vdev_reopening || dvd == NULL)
769fa9e4066Sahrens 		return;
770fa9e4066Sahrens 
77139cddb10SJoshua M. Clulow 	if (dvd->vd_minor != NULL) {
772fa9e4066Sahrens 		ddi_devid_str_free(dvd->vd_minor);
77339cddb10SJoshua M. Clulow 		dvd->vd_minor = NULL;
77439cddb10SJoshua M. Clulow 	}
775fa9e4066Sahrens 
77639cddb10SJoshua M. Clulow 	if (dvd->vd_devid != NULL) {
777fa9e4066Sahrens 		ddi_devid_free(dvd->vd_devid);
77839cddb10SJoshua M. Clulow 		dvd->vd_devid = NULL;
77939cddb10SJoshua M. Clulow 	}
780fa9e4066Sahrens 
78139cddb10SJoshua M. Clulow 	if (dvd->vd_lh != NULL) {
7828ad4d6ddSJeff Bonwick 		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
78339cddb10SJoshua M. Clulow 		dvd->vd_lh = NULL;
78439cddb10SJoshua M. Clulow 	}
785fa9e4066Sahrens 
78698d1cbfeSGeorge Wilson 	vd->vdev_delayed_close = B_FALSE;
78739cddb10SJoshua M. Clulow 	vdev_disk_free(vd);
788fa9e4066Sahrens }
789fa9e4066Sahrens 
790ac04831dSMike Gerdts static int
vdev_disk_ldi_physio(ldi_handle_t vd_lh,caddr_t data,size_t size,uint64_t offset,int flags)791810e43b2SBill Pijewski vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
792810e43b2SBill Pijewski     size_t size, uint64_t offset, int flags)
793e7cbe64fSgw {
794e7cbe64fSgw 	buf_t *bp;
795e7cbe64fSgw 	int error = 0;
796e7cbe64fSgw 
797e7cbe64fSgw 	if (vd_lh == NULL)
798be6fd75aSMatthew Ahrens 		return (SET_ERROR(EINVAL));
799e7cbe64fSgw 
800e7cbe64fSgw 	ASSERT(flags & B_READ || flags & B_WRITE);
801e7cbe64fSgw 
802e7cbe64fSgw 	bp = getrbuf(KM_SLEEP);
803e7cbe64fSgw 	bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
804e7cbe64fSgw 	bp->b_bcount = size;
805e7cbe64fSgw 	bp->b_un.b_addr = (void *)data;
806e7cbe64fSgw 	bp->b_lblkno = lbtodb(offset);
807e7cbe64fSgw 	bp->b_bufsize = size;
808e7cbe64fSgw 
809e7cbe64fSgw 	error = ldi_strategy(vd_lh, bp);
810e7cbe64fSgw 	ASSERT(error == 0);
811e7cbe64fSgw 	if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
812be6fd75aSMatthew Ahrens 		error = SET_ERROR(EIO);
813e7cbe64fSgw 	freerbuf(bp);
814e7cbe64fSgw 
815e7cbe64fSgw 	return (error);
816e7cbe64fSgw }
817e7cbe64fSgw 
818ac04831dSMike Gerdts static int
vdev_disk_dumpio(vdev_t * vd,caddr_t data,size_t size,uint64_t offset,uint64_t origoffset __unused,boolean_t doread,boolean_t isdump)819ac04831dSMike Gerdts vdev_disk_dumpio(vdev_t *vd, caddr_t data, size_t size,
8201b500975SMike Gerdts     uint64_t offset, uint64_t origoffset __unused, boolean_t doread,
8211b500975SMike Gerdts     boolean_t isdump)
822ac04831dSMike Gerdts {
823ac04831dSMike Gerdts 	vdev_disk_t *dvd = vd->vdev_tsd;
824ac04831dSMike Gerdts 	int flags = doread ? B_READ : B_WRITE;
825ac04831dSMike Gerdts 
826ac04831dSMike Gerdts 	/*
827ac04831dSMike Gerdts 	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
828ac04831dSMike Gerdts 	 * Nothing to be done here but return failure.
829ac04831dSMike Gerdts 	 */
830ac04831dSMike Gerdts 	if (dvd == NULL || dvd->vd_ldi_offline) {
8311b500975SMike Gerdts 		return (SET_ERROR(ENXIO));
832ac04831dSMike Gerdts 	}
833ac04831dSMike Gerdts 
834ac04831dSMike Gerdts 	ASSERT(vd->vdev_ops == &vdev_disk_ops);
835ac04831dSMike Gerdts 
836ac04831dSMike Gerdts 	offset += VDEV_LABEL_START_SIZE;
837ac04831dSMike Gerdts 
838ac04831dSMike Gerdts 	/*
839ac04831dSMike Gerdts 	 * If in the context of an active crash dump, use the ldi_dump(9F)
840ac04831dSMike Gerdts 	 * call instead of ldi_strategy(9F) as usual.
841ac04831dSMike Gerdts 	 */
842ac04831dSMike Gerdts 	if (isdump) {
843ac04831dSMike Gerdts 		ASSERT3P(dvd, !=, NULL);
844ac04831dSMike Gerdts 		return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
845ac04831dSMike Gerdts 		    lbtodb(size)));
846ac04831dSMike Gerdts 	}
847ac04831dSMike Gerdts 
848ac04831dSMike Gerdts 	return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
849ac04831dSMike Gerdts }
850ac04831dSMike Gerdts 
851c62757b2SToomas Soome static int
vdev_disk_io_intr(buf_t * bp)852fa9e4066Sahrens vdev_disk_io_intr(buf_t *bp)
853fa9e4066Sahrens {
85431d7e8faSGeorge Wilson 	vdev_buf_t *vb = (vdev_buf_t *)bp;
85531d7e8faSGeorge Wilson 	zio_t *zio = vb->vb_io;
856fa9e4066Sahrens 
85751ece835Seschrock 	/*
85851ece835Seschrock 	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
85951ece835Seschrock 	 * Rather than teach the rest of the stack about other error
86051ece835Seschrock 	 * possibilities (EFAULT, etc), we normalize the error value here.
86151ece835Seschrock 	 */
86251ece835Seschrock 	zio->io_error = (geterror(bp) != 0 ? EIO : 0);
86351ece835Seschrock 
86451ece835Seschrock 	if (zio->io_error == 0 && bp->b_resid != 0)
865be6fd75aSMatthew Ahrens 		zio->io_error = SET_ERROR(EIO);
866fa9e4066Sahrens 
867770499e1SDan Kimmel 	if (zio->io_type == ZIO_TYPE_READ) {
868770499e1SDan Kimmel 		abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size);
869770499e1SDan Kimmel 	} else {
870770499e1SDan Kimmel 		abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
871770499e1SDan Kimmel 	}
872770499e1SDan Kimmel 
87331d7e8faSGeorge Wilson 	kmem_free(vb, sizeof (vdev_buf_t));
874fa9e4066Sahrens 
87597e81309SPrakash Surya 	zio_delay_interrupt(zio);
876c62757b2SToomas Soome 	return (0);
877fa9e4066Sahrens }
878fa9e4066Sahrens 
879f4a72450SJeff Bonwick static void
vdev_disk_ioctl_free(zio_t * zio)880f4a72450SJeff Bonwick vdev_disk_ioctl_free(zio_t *zio)
881f4a72450SJeff Bonwick {
882f4a72450SJeff Bonwick 	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
883f4a72450SJeff Bonwick }
884f4a72450SJeff Bonwick 
88522fe2c88SJonathan Adams static const zio_vsd_ops_t vdev_disk_vsd_ops = {
88622fe2c88SJonathan Adams 	vdev_disk_ioctl_free,
88722fe2c88SJonathan Adams 	zio_vsd_default_cksum_report
88822fe2c88SJonathan Adams };
88922fe2c88SJonathan Adams 
890fa9e4066Sahrens static void
vdev_disk_ioctl_done(void * zio_arg,int error)891fa9e4066Sahrens vdev_disk_ioctl_done(void *zio_arg, int error)
892fa9e4066Sahrens {
893fa9e4066Sahrens 	zio_t *zio = zio_arg;
894fa9e4066Sahrens 
895fa9e4066Sahrens 	zio->io_error = error;
896fa9e4066Sahrens 
897e05725b1Sbonwick 	zio_interrupt(zio);
898fa9e4066Sahrens }
899fa9e4066Sahrens 
900738f37bcSGeorge Wilson static void
vdev_disk_io_start(zio_t * zio)901fa9e4066Sahrens vdev_disk_io_start(zio_t *zio)
902fa9e4066Sahrens {
903fa9e4066Sahrens 	vdev_t *vd = zio->io_vd;
904fa9e4066Sahrens 	vdev_disk_t *dvd = vd->vdev_tsd;
905084fd14fSBrian Behlendorf 	unsigned long trim_flags = 0;
90631d7e8faSGeorge Wilson 	vdev_buf_t *vb;
907e14bb325SJeff Bonwick 	struct dk_callback *dkc;
908fa9e4066Sahrens 	buf_t *bp;
909e14bb325SJeff Bonwick 	int error;
910fa9e4066Sahrens 
91139cddb10SJoshua M. Clulow 	/*
91239cddb10SJoshua M. Clulow 	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
91339cddb10SJoshua M. Clulow 	 * Nothing to be done here but return failure.
91439cddb10SJoshua M. Clulow 	 */
9151b500975SMike Gerdts 	if (dvd == NULL || dvd->vd_ldi_offline) {
91639cddb10SJoshua M. Clulow 		zio->io_error = ENXIO;
917738f37bcSGeorge Wilson 		zio_interrupt(zio);
918738f37bcSGeorge Wilson 		return;
91939cddb10SJoshua M. Clulow 	}
92039cddb10SJoshua M. Clulow 
921084fd14fSBrian Behlendorf 	switch (zio->io_type) {
922084fd14fSBrian Behlendorf 	case ZIO_TYPE_IOCTL:
923fa9e4066Sahrens 		/* XXPOLICY */
9240a4e9518Sgw 		if (!vdev_readable(vd)) {
925be6fd75aSMatthew Ahrens 			zio->io_error = SET_ERROR(ENXIO);
926738f37bcSGeorge Wilson 			zio_interrupt(zio);
927738f37bcSGeorge Wilson 			return;
928fa9e4066Sahrens 		}
929fa9e4066Sahrens 
930fa9e4066Sahrens 		switch (zio->io_cmd) {
931fa9e4066Sahrens 
932fa9e4066Sahrens 		case DKIOCFLUSHWRITECACHE:
933fa9e4066Sahrens 
934a2eea2e1Sahrens 			if (zfs_nocacheflush)
935a2eea2e1Sahrens 				break;
936a2eea2e1Sahrens 
937b468a217Seschrock 			if (vd->vdev_nowritecache) {
938be6fd75aSMatthew Ahrens 				zio->io_error = SET_ERROR(ENOTSUP);
939b468a217Seschrock 				break;
940b468a217Seschrock 			}
941b468a217Seschrock 
942e14bb325SJeff Bonwick 			zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
94322fe2c88SJonathan Adams 			zio->io_vsd_ops = &vdev_disk_vsd_ops;
944e14bb325SJeff Bonwick 
945e14bb325SJeff Bonwick 			dkc->dkc_callback = vdev_disk_ioctl_done;
946e14bb325SJeff Bonwick 			dkc->dkc_flag = FLUSH_VOLATILE;
947e14bb325SJeff Bonwick 			dkc->dkc_cookie = zio;
948fa9e4066Sahrens 
949fa9e4066Sahrens 			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
950e14bb325SJeff Bonwick 			    (uintptr_t)dkc, FKIOCTL, kcred, NULL);
951fa9e4066Sahrens 
952fa9e4066Sahrens 			if (error == 0) {
953fa9e4066Sahrens 				/*
954fa9e4066Sahrens 				 * The ioctl will be done asychronously,
955fa9e4066Sahrens 				 * and will call vdev_disk_ioctl_done()
956fa9e4066Sahrens 				 * upon completion.
957fa9e4066Sahrens 				 */
958738f37bcSGeorge Wilson 				return;
959e05725b1Sbonwick 			}
960e05725b1Sbonwick 
961fa9e4066Sahrens 			zio->io_error = error;
962b468a217Seschrock 
963fa9e4066Sahrens 			break;
964fa9e4066Sahrens 
965fa9e4066Sahrens 		default:
966be6fd75aSMatthew Ahrens 			zio->io_error = SET_ERROR(ENOTSUP);
967fa9e4066Sahrens 		}
968fa9e4066Sahrens 
969738f37bcSGeorge Wilson 		zio_execute(zio);
970738f37bcSGeorge Wilson 		return;
971084fd14fSBrian Behlendorf 
972084fd14fSBrian Behlendorf 	case ZIO_TYPE_TRIM:
973fb05b94aSJerry Jelinek 		if (zfs_no_trim == 1 || !vd->vdev_has_trim) {
974084fd14fSBrian Behlendorf 			zio->io_error = SET_ERROR(ENOTSUP);
975084fd14fSBrian Behlendorf 			zio_execute(zio);
976084fd14fSBrian Behlendorf 			return;
977084fd14fSBrian Behlendorf 		}
978084fd14fSBrian Behlendorf 		/* Currently only supported on ZoL. */
979084fd14fSBrian Behlendorf 		ASSERT0(zio->io_trim_flags & ZIO_TRIM_SECURE);
980084fd14fSBrian Behlendorf 
981084fd14fSBrian Behlendorf 		/* dkioc_free_list_t is already declared to hold one entry */
982084fd14fSBrian Behlendorf 		dkioc_free_list_t dfl;
983084fd14fSBrian Behlendorf 		dfl.dfl_flags = 0;
984084fd14fSBrian Behlendorf 		dfl.dfl_num_exts = 1;
985d0562c10SJerry Jelinek 		dfl.dfl_offset = 0;
986084fd14fSBrian Behlendorf 		dfl.dfl_exts[0].dfle_start = zio->io_offset;
987084fd14fSBrian Behlendorf 		dfl.dfl_exts[0].dfle_length = zio->io_size;
988084fd14fSBrian Behlendorf 
989084fd14fSBrian Behlendorf 		zio->io_error = ldi_ioctl(dvd->vd_lh, DKIOCFREE,
990084fd14fSBrian Behlendorf 		    (uintptr_t)&dfl, FKIOCTL, kcred, NULL);
991084fd14fSBrian Behlendorf 
992084fd14fSBrian Behlendorf 		if (zio->io_error == ENOTSUP || zio->io_error == ENOTTY) {
993084fd14fSBrian Behlendorf 			/*
994084fd14fSBrian Behlendorf 			 * The device must have changed and now TRIM is
995084fd14fSBrian Behlendorf 			 * no longer supported.
996084fd14fSBrian Behlendorf 			 */
997084fd14fSBrian Behlendorf 			vd->vdev_has_trim = B_FALSE;
998084fd14fSBrian Behlendorf 		}
999084fd14fSBrian Behlendorf 
1000084fd14fSBrian Behlendorf 		zio_interrupt(zio);
1001084fd14fSBrian Behlendorf 		return;
1002fa9e4066Sahrens 	}
1003fa9e4066Sahrens 
1004f693d300SSteven Hartland 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
100597e81309SPrakash Surya 	zio->io_target_timestamp = zio_handle_io_delay(zio);
1006f693d300SSteven Hartland 
100731d7e8faSGeorge Wilson 	vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
1008fa9e4066Sahrens 
100931d7e8faSGeorge Wilson 	vb->vb_io = zio;
101031d7e8faSGeorge Wilson 	bp = &vb->vb_buf;
1011fa9e4066Sahrens 
1012fa9e4066Sahrens 	bioinit(bp);
1013e14bb325SJeff Bonwick 	bp->b_flags = B_BUSY | B_NOCACHE |
10148956713aSEric Schrock 	    (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
10158956713aSEric Schrock 	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
10168956713aSEric Schrock 		bp->b_flags |= B_FAILFAST;
1017fa9e4066Sahrens 	bp->b_bcount = zio->io_size;
1018770499e1SDan Kimmel 
1019770499e1SDan Kimmel 	if (zio->io_type == ZIO_TYPE_READ) {
1020770499e1SDan Kimmel 		bp->b_un.b_addr =
1021770499e1SDan Kimmel 		    abd_borrow_buf(zio->io_abd, zio->io_size);
1022770499e1SDan Kimmel 	} else {
1023770499e1SDan Kimmel 		bp->b_un.b_addr =
1024770499e1SDan Kimmel 		    abd_borrow_buf_copy(zio->io_abd, zio->io_size);
1025770499e1SDan Kimmel 	}
1026770499e1SDan Kimmel 
1027fa9e4066Sahrens 	bp->b_lblkno = lbtodb(zio->io_offset);
1028fa9e4066Sahrens 	bp->b_bufsize = zio->io_size;
1029c62757b2SToomas Soome 	bp->b_iodone = vdev_disk_io_intr;
1030fa9e4066Sahrens 
1031fa88c70fSJerry Jelinek 	/*
1032fa88c70fSJerry Jelinek 	 * In general we would expect ldi_strategy() to return non-zero only
1033fa88c70fSJerry Jelinek 	 * because of programming errors, but we've also seen this fail shortly
1034fa88c70fSJerry Jelinek 	 * after a disk dies.
1035fa88c70fSJerry Jelinek 	 */
1036fa88c70fSJerry Jelinek 	if (ldi_strategy(dvd->vd_lh, bp) != 0) {
1037fa88c70fSJerry Jelinek 		zio->io_error = ENXIO;
1038fa88c70fSJerry Jelinek 		zio_interrupt(zio);
1039fa88c70fSJerry Jelinek 	}
1040fa9e4066Sahrens }
1041fa9e4066Sahrens 
1042e14bb325SJeff Bonwick static void
vdev_disk_io_done(zio_t * zio)1043fa9e4066Sahrens vdev_disk_io_done(zio_t *zio)
1044fa9e4066Sahrens {
1045e14bb325SJeff Bonwick 	vdev_t *vd = zio->io_vd;
1046ea8dc4b6Seschrock 
10473d7072f8Seschrock 	/*
10483d7072f8Seschrock 	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
10493d7072f8Seschrock 	 * the device has been removed.  If this is the case, then we trigger an
10500a4e9518Sgw 	 * asynchronous removal of the device. Otherwise, probe the device and
10511f7ad2e1Sgw 	 * make sure it's still accessible.
10523d7072f8Seschrock 	 */
10531d713200SEric Schrock 	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
10540a4e9518Sgw 		vdev_disk_t *dvd = vd->vdev_tsd;
1055e14bb325SJeff Bonwick 		int state = DKIO_NONE;
10560a4e9518Sgw 
1057e14bb325SJeff Bonwick 		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
1058e14bb325SJeff Bonwick 		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
10591d713200SEric Schrock 			/*
10601d713200SEric Schrock 			 * We post the resource as soon as possible, instead of
10611d713200SEric Schrock 			 * when the async removal actually happens, because the
10621d713200SEric Schrock 			 * DE is using this information to discard previous I/O
10631d713200SEric Schrock 			 * errors.
10641d713200SEric Schrock 			 */
10651d713200SEric Schrock 			zfs_post_remove(zio->io_spa, vd);
10663d7072f8Seschrock 			vd->vdev_remove_wanted = B_TRUE;
10673d7072f8Seschrock 			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
106898d1cbfeSGeorge Wilson 		} else if (!vd->vdev_delayed_close) {
106998d1cbfeSGeorge Wilson 			vd->vdev_delayed_close = B_TRUE;
10703d7072f8Seschrock 		}
10713d7072f8Seschrock 	}
1072fa9e4066Sahrens }
1073fa9e4066Sahrens 
1074fa9e4066Sahrens vdev_ops_t vdev_disk_ops = {
1075a3874b8bSToomas Soome 	.vdev_op_open = vdev_disk_open,
1076a3874b8bSToomas Soome 	.vdev_op_close = vdev_disk_close,
1077a3874b8bSToomas Soome 	.vdev_op_asize = vdev_default_asize,
1078a3874b8bSToomas Soome 	.vdev_op_io_start = vdev_disk_io_start,
1079a3874b8bSToomas Soome 	.vdev_op_io_done = vdev_disk_io_done,
1080a3874b8bSToomas Soome 	.vdev_op_state_change = NULL,
1081a3874b8bSToomas Soome 	.vdev_op_need_resilver = NULL,
1082a3874b8bSToomas Soome 	.vdev_op_hold = vdev_disk_hold,
1083a3874b8bSToomas Soome 	.vdev_op_rele = vdev_disk_rele,
1084a3874b8bSToomas Soome 	.vdev_op_remap = NULL,
1085a3874b8bSToomas Soome 	.vdev_op_xlate = vdev_default_xlate,
1086ac04831dSMike Gerdts 	.vdev_op_dumpio = vdev_disk_dumpio,
1087a3874b8bSToomas Soome 	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
1088a3874b8bSToomas Soome 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
1089fa9e4066Sahrens };
1090e7cbe64fSgw 
1091e7cbe64fSgw /*
1092051aabe6Staylor  * Given the root disk device devid or pathname, read the label from
1093051aabe6Staylor  * the device, and construct a configuration nvlist.
1094e7cbe64fSgw  */
1095f940fbb1SLin Ling int
vdev_disk_read_rootlabel(const char * devpath,const char * devid,nvlist_t ** config)109630c304d9SJoshua M. Clulow vdev_disk_read_rootlabel(const char *devpath, const char *devid,
109730c304d9SJoshua M. Clulow     nvlist_t **config)
1098e7cbe64fSgw {
1099e7cbe64fSgw 	ldi_handle_t vd_lh;
1100e7cbe64fSgw 	vdev_label_t *label;
1101e7cbe64fSgw 	uint64_t s, size;
1102e7cbe64fSgw 	int l;
1103051aabe6Staylor 	ddi_devid_t tmpdevid;
1104f4565e39SLin Ling 	int error = -1;
1105051aabe6Staylor 	char *minor_name;
1106e7cbe64fSgw 
1107e7cbe64fSgw 	/*
1108e7cbe64fSgw 	 * Read the device label and build the nvlist.
1109e7cbe64fSgw 	 */
111030c304d9SJoshua M. Clulow 	if (devid != NULL && ddi_devid_str_decode((char *)devid, &tmpdevid,
1111051aabe6Staylor 	    &minor_name) == 0) {
1112051aabe6Staylor 		error = ldi_open_by_devid(tmpdevid, minor_name,
11138ad4d6ddSJeff Bonwick 		    FREAD, kcred, &vd_lh, zfs_li);
1114051aabe6Staylor 		ddi_devid_free(tmpdevid);
1115051aabe6Staylor 		ddi_devid_str_free(minor_name);
1116051aabe6Staylor 	}
1117051aabe6Staylor 
111830c304d9SJoshua M. Clulow 	if (error != 0 && (error = ldi_open_by_name((char *)devpath, FREAD,
111930c304d9SJoshua M. Clulow 	    kcred, &vd_lh, zfs_li)) != 0) {
1120f940fbb1SLin Ling 		return (error);
112130c304d9SJoshua M. Clulow 	}
1122e7cbe64fSgw 
1123bf82a41bSeschrock 	if (ldi_get_size(vd_lh, &s)) {
1124bf82a41bSeschrock 		(void) ldi_close(vd_lh, FREAD, kcred);
1125be6fd75aSMatthew Ahrens 		return (SET_ERROR(EIO));
1126bf82a41bSeschrock 	}
1127e7cbe64fSgw 
1128e7cbe64fSgw 	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
1129e7cbe64fSgw 	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
1130e7cbe64fSgw 
113117f1e64aSEric Taylor 	*config = NULL;
1132e7cbe64fSgw 	for (l = 0; l < VDEV_LABELS; l++) {
1133e7cbe64fSgw 		uint64_t offset, state, txg = 0;
1134e7cbe64fSgw 
1135e7cbe64fSgw 		/* read vdev label */
1136e7cbe64fSgw 		offset = vdev_label_offset(size, l, 0);
1137810e43b2SBill Pijewski 		if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
11382264ca7fSLin Ling 		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
1139e7cbe64fSgw 			continue;
1140e7cbe64fSgw 
1141e7cbe64fSgw 		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
1142f940fbb1SLin Ling 		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
1143f940fbb1SLin Ling 			*config = NULL;
1144e7cbe64fSgw 			continue;
1145e7cbe64fSgw 		}
1146e7cbe64fSgw 
1147f940fbb1SLin Ling 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
1148e7cbe64fSgw 		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
1149f940fbb1SLin Ling 			nvlist_free(*config);
1150f940fbb1SLin Ling 			*config = NULL;
1151e7cbe64fSgw 			continue;
1152e7cbe64fSgw 		}
1153e7cbe64fSgw 
1154f940fbb1SLin Ling 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
1155e7cbe64fSgw 		    &txg) != 0 || txg == 0) {
1156f940fbb1SLin Ling 			nvlist_free(*config);
1157f940fbb1SLin Ling 			*config = NULL;
1158e7cbe64fSgw 			continue;
1159e7cbe64fSgw 		}
1160e7cbe64fSgw 
1161e7cbe64fSgw 		break;
1162e7cbe64fSgw 	}
1163e7cbe64fSgw 
1164e7cbe64fSgw 	kmem_free(label, sizeof (vdev_label_t));
1165bf82a41bSeschrock 	(void) ldi_close(vd_lh, FREAD, kcred);
116617f1e64aSEric Taylor 	if (*config == NULL)
1167be6fd75aSMatthew Ahrens 		error = SET_ERROR(EIDRM);
1168bf82a41bSeschrock 
1169f940fbb1SLin Ling 	return (error);
1170e7cbe64fSgw }
117130c304d9SJoshua M. Clulow 
117230c304d9SJoshua M. Clulow struct veb {
117330c304d9SJoshua M. Clulow 	list_t veb_ents;
117430c304d9SJoshua M. Clulow 	boolean_t veb_scanned;
1175*8b26092dSJoshua M. Clulow 	char *veb_force_path;
117630c304d9SJoshua M. Clulow };
117730c304d9SJoshua M. Clulow 
117830c304d9SJoshua M. Clulow struct veb_ent {
117930c304d9SJoshua M. Clulow 	uint64_t vebe_pool_guid;
118030c304d9SJoshua M. Clulow 	uint64_t vebe_vdev_guid;
118130c304d9SJoshua M. Clulow 
118230c304d9SJoshua M. Clulow 	char *vebe_devpath;
118330c304d9SJoshua M. Clulow 
118430c304d9SJoshua M. Clulow 	list_node_t vebe_link;
118530c304d9SJoshua M. Clulow };
118630c304d9SJoshua M. Clulow 
118730c304d9SJoshua M. Clulow static kmutex_t veb_lock;
118830c304d9SJoshua M. Clulow static struct veb *veb;
118930c304d9SJoshua M. Clulow 
119030c304d9SJoshua M. Clulow static int
vdev_disk_preroot_scan_walk(const char * devpath,void * arg)119130c304d9SJoshua M. Clulow vdev_disk_preroot_scan_walk(const char *devpath, void *arg)
119230c304d9SJoshua M. Clulow {
119330c304d9SJoshua M. Clulow 	int r;
119430c304d9SJoshua M. Clulow 	nvlist_t *cfg = NULL;
119530c304d9SJoshua M. Clulow 	uint64_t pguid = 0, vguid = 0;
119630c304d9SJoshua M. Clulow 
119730c304d9SJoshua M. Clulow 	/*
119830c304d9SJoshua M. Clulow 	 * Attempt to read the label from this block device.
119930c304d9SJoshua M. Clulow 	 */
120030c304d9SJoshua M. Clulow 	if ((r = vdev_disk_read_rootlabel(devpath, NULL, &cfg)) != 0) {
120130c304d9SJoshua M. Clulow 		/*
120230c304d9SJoshua M. Clulow 		 * Many of the available block devices will represent slices or
120330c304d9SJoshua M. Clulow 		 * partitions of disks, or may represent disks that are not at
120430c304d9SJoshua M. Clulow 		 * all initialised with ZFS.  As this is a best effort
120530c304d9SJoshua M. Clulow 		 * mechanism to locate an alternate path to a particular vdev,
120630c304d9SJoshua M. Clulow 		 * we will ignore any failures and keep scanning.
120730c304d9SJoshua M. Clulow 		 */
120830c304d9SJoshua M. Clulow 		return (PREROOT_WALK_BLOCK_DEVICES_NEXT);
120930c304d9SJoshua M. Clulow 	}
121030c304d9SJoshua M. Clulow 
121130c304d9SJoshua M. Clulow 	/*
121230c304d9SJoshua M. Clulow 	 * Determine the pool and vdev GUID read from the label for this
121330c304d9SJoshua M. Clulow 	 * device.  Both values must be present and have a non-zero value.
121430c304d9SJoshua M. Clulow 	 */
121530c304d9SJoshua M. Clulow 	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pguid) != 0 ||
121630c304d9SJoshua M. Clulow 	    nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_GUID, &vguid) != 0 ||
121730c304d9SJoshua M. Clulow 	    pguid == 0 || vguid == 0) {
121830c304d9SJoshua M. Clulow 		/*
121930c304d9SJoshua M. Clulow 		 * This label was not complete.
122030c304d9SJoshua M. Clulow 		 */
122130c304d9SJoshua M. Clulow 		goto out;
122230c304d9SJoshua M. Clulow 	}
122330c304d9SJoshua M. Clulow 
122430c304d9SJoshua M. Clulow 	/*
122530c304d9SJoshua M. Clulow 	 * Keep track of all of the GUID-to-devpath mappings we find so that
122630c304d9SJoshua M. Clulow 	 * vdev_disk_preroot_lookup() can search them.
122730c304d9SJoshua M. Clulow 	 */
122830c304d9SJoshua M. Clulow 	struct veb_ent *vebe = kmem_zalloc(sizeof (*vebe), KM_SLEEP);
122930c304d9SJoshua M. Clulow 	vebe->vebe_pool_guid = pguid;
123030c304d9SJoshua M. Clulow 	vebe->vebe_vdev_guid = vguid;
123130c304d9SJoshua M. Clulow 	vebe->vebe_devpath = spa_strdup(devpath);
123230c304d9SJoshua M. Clulow 
123330c304d9SJoshua M. Clulow 	list_insert_tail(&veb->veb_ents, vebe);
123430c304d9SJoshua M. Clulow 
123530c304d9SJoshua M. Clulow out:
123630c304d9SJoshua M. Clulow 	nvlist_free(cfg);
123730c304d9SJoshua M. Clulow 	return (PREROOT_WALK_BLOCK_DEVICES_NEXT);
123830c304d9SJoshua M. Clulow }
123930c304d9SJoshua M. Clulow 
124030c304d9SJoshua M. Clulow const char *
vdev_disk_preroot_lookup(uint64_t pool_guid,uint64_t vdev_guid)124130c304d9SJoshua M. Clulow vdev_disk_preroot_lookup(uint64_t pool_guid, uint64_t vdev_guid)
124230c304d9SJoshua M. Clulow {
124330c304d9SJoshua M. Clulow 	if (pool_guid == 0 || vdev_guid == 0) {
124430c304d9SJoshua M. Clulow 		/*
124530c304d9SJoshua M. Clulow 		 * If we aren't provided both a pool and a vdev GUID, we cannot
124630c304d9SJoshua M. Clulow 		 * perform a lookup.
124730c304d9SJoshua M. Clulow 		 */
124830c304d9SJoshua M. Clulow 		return (NULL);
124930c304d9SJoshua M. Clulow 	}
125030c304d9SJoshua M. Clulow 
125130c304d9SJoshua M. Clulow 	mutex_enter(&veb_lock);
125230c304d9SJoshua M. Clulow 	if (veb == NULL) {
125330c304d9SJoshua M. Clulow 		/*
125430c304d9SJoshua M. Clulow 		 * If vdev_disk_preroot_fini() has been called already, there
125530c304d9SJoshua M. Clulow 		 * is nothing we can do.
125630c304d9SJoshua M. Clulow 		 */
125730c304d9SJoshua M. Clulow 		mutex_exit(&veb_lock);
125830c304d9SJoshua M. Clulow 		return (NULL);
125930c304d9SJoshua M. Clulow 	}
126030c304d9SJoshua M. Clulow 
126130c304d9SJoshua M. Clulow 	/*
126230c304d9SJoshua M. Clulow 	 * We want to perform at most one scan of all block devices per boot.
126330c304d9SJoshua M. Clulow 	 */
126430c304d9SJoshua M. Clulow 	if (!veb->veb_scanned) {
126530c304d9SJoshua M. Clulow 		cmn_err(CE_NOTE, "Performing full ZFS device scan!");
126630c304d9SJoshua M. Clulow 
126730c304d9SJoshua M. Clulow 		preroot_walk_block_devices(vdev_disk_preroot_scan_walk, NULL);
126830c304d9SJoshua M. Clulow 
126930c304d9SJoshua M. Clulow 		veb->veb_scanned = B_TRUE;
127030c304d9SJoshua M. Clulow 	}
127130c304d9SJoshua M. Clulow 
127230c304d9SJoshua M. Clulow 	const char *path = NULL;
127330c304d9SJoshua M. Clulow 	for (struct veb_ent *vebe = list_head(&veb->veb_ents); vebe != NULL;
127430c304d9SJoshua M. Clulow 	    vebe = list_next(&veb->veb_ents, vebe)) {
127530c304d9SJoshua M. Clulow 		if (vebe->vebe_pool_guid == pool_guid &&
127630c304d9SJoshua M. Clulow 		    vebe->vebe_vdev_guid == vdev_guid) {
127730c304d9SJoshua M. Clulow 			path = vebe->vebe_devpath;
127830c304d9SJoshua M. Clulow 			break;
127930c304d9SJoshua M. Clulow 		}
128030c304d9SJoshua M. Clulow 	}
128130c304d9SJoshua M. Clulow 
128230c304d9SJoshua M. Clulow 	mutex_exit(&veb_lock);
128330c304d9SJoshua M. Clulow 
128430c304d9SJoshua M. Clulow 	return (path);
128530c304d9SJoshua M. Clulow }
128630c304d9SJoshua M. Clulow 
1287*8b26092dSJoshua M. Clulow const char *
vdev_disk_preroot_force_path(void)1288*8b26092dSJoshua M. Clulow vdev_disk_preroot_force_path(void)
1289*8b26092dSJoshua M. Clulow {
1290*8b26092dSJoshua M. Clulow 	const char *force_path = NULL;
1291*8b26092dSJoshua M. Clulow 
1292*8b26092dSJoshua M. Clulow 	mutex_enter(&veb_lock);
1293*8b26092dSJoshua M. Clulow 	if (veb != NULL) {
1294*8b26092dSJoshua M. Clulow 		force_path = veb->veb_force_path;
1295*8b26092dSJoshua M. Clulow 	}
1296*8b26092dSJoshua M. Clulow 	mutex_exit(&veb_lock);
1297*8b26092dSJoshua M. Clulow 
1298*8b26092dSJoshua M. Clulow 	return (force_path);
1299*8b26092dSJoshua M. Clulow }
1300*8b26092dSJoshua M. Clulow 
130130c304d9SJoshua M. Clulow void
vdev_disk_preroot_init(const char * force_path)1302*8b26092dSJoshua M. Clulow vdev_disk_preroot_init(const char *force_path)
130330c304d9SJoshua M. Clulow {
130430c304d9SJoshua M. Clulow 	mutex_init(&veb_lock, NULL, MUTEX_DEFAULT, NULL);
130530c304d9SJoshua M. Clulow 
130630c304d9SJoshua M. Clulow 	VERIFY3P(veb, ==, NULL);
130730c304d9SJoshua M. Clulow 	veb = kmem_zalloc(sizeof (*veb), KM_SLEEP);
130830c304d9SJoshua M. Clulow 	list_create(&veb->veb_ents, sizeof (struct veb_ent),
130930c304d9SJoshua M. Clulow 	    offsetof(struct veb_ent, vebe_link));
131030c304d9SJoshua M. Clulow 	veb->veb_scanned = B_FALSE;
1311*8b26092dSJoshua M. Clulow 	if (force_path != NULL) {
1312*8b26092dSJoshua M. Clulow 		veb->veb_force_path = spa_strdup(force_path);
1313*8b26092dSJoshua M. Clulow 	}
131430c304d9SJoshua M. Clulow }
131530c304d9SJoshua M. Clulow 
131630c304d9SJoshua M. Clulow void
vdev_disk_preroot_fini(void)131730c304d9SJoshua M. Clulow vdev_disk_preroot_fini(void)
131830c304d9SJoshua M. Clulow {
131930c304d9SJoshua M. Clulow 	mutex_enter(&veb_lock);
132030c304d9SJoshua M. Clulow 
132130c304d9SJoshua M. Clulow 	if (veb != NULL) {
132230c304d9SJoshua M. Clulow 		while (!list_is_empty(&veb->veb_ents)) {
132330c304d9SJoshua M. Clulow 			struct veb_ent *vebe = list_remove_head(&veb->veb_ents);
132430c304d9SJoshua M. Clulow 
132530c304d9SJoshua M. Clulow 			spa_strfree(vebe->vebe_devpath);
132630c304d9SJoshua M. Clulow 
132730c304d9SJoshua M. Clulow 			kmem_free(vebe, sizeof (*vebe));
132830c304d9SJoshua M. Clulow 		}
132930c304d9SJoshua M. Clulow 
1330*8b26092dSJoshua M. Clulow 		if (veb->veb_force_path != NULL) {
1331*8b26092dSJoshua M. Clulow 			spa_strfree(veb->veb_force_path);
1332*8b26092dSJoshua M. Clulow 		}
1333*8b26092dSJoshua M. Clulow 
133430c304d9SJoshua M. Clulow 		kmem_free(veb, sizeof (*veb));
133530c304d9SJoshua M. Clulow 		veb = NULL;
133630c304d9SJoshua M. Clulow 	}
133730c304d9SJoshua M. Clulow 
133830c304d9SJoshua M. Clulow 	mutex_exit(&veb_lock);
133930c304d9SJoshua M. Clulow }
1340