xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_disk.c (revision 30c304d9)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5bef6b7d2Swebaker  * Common Development and Distribution License (the "License").
6bef6b7d2Swebaker  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
21fa9e4066Sahrens /*
22f13665b7Sbo zhou - Sun Microsystems - Beijing China  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
236fe4f300SPavel Zakharov  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
24295438baSHans Rosenfeld  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
251b500975SMike Gerdts  * Copyright 2020 Joyent, Inc.
26*30c304d9SJoshua M. Clulow  * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org>
27fa9e4066Sahrens  */
28fa9e4066Sahrens 
29fa9e4066Sahrens #include <sys/zfs_context.h>
30dcba9f3fSGeorge Wilson #include <sys/spa_impl.h>
31e7cbe64fSgw #include <sys/refcount.h>
32fa9e4066Sahrens #include <sys/vdev_impl.h>
33084fd14fSBrian Behlendorf #include <sys/vdev_trim.h>
34770499e1SDan Kimmel #include <sys/abd.h>
35fa9e4066Sahrens #include <sys/fs/zfs.h>
36fa9e4066Sahrens #include <sys/zio.h>
37afefbcddSeschrock #include <sys/sunldi.h>
384263d13fSGeorge Wilson #include <sys/efi_partition.h>
3951ece835Seschrock #include <sys/fm/fs/zfs.h>
40ac04831dSMike Gerdts #include <sys/ddi.h>
41fa9e4066Sahrens 
42fb05b94aSJerry Jelinek /*
43fc5c75cfSJerry Jelinek  * Tunable to disable TRIM in case we're using a problematic SSD.
44fb05b94aSJerry Jelinek  */
45fc5c75cfSJerry Jelinek uint_t zfs_no_trim = 0;
46fb05b94aSJerry Jelinek 
47f8fdf681SPrakash Surya /*
48f8fdf681SPrakash Surya  * Tunable parameter for debugging or performance analysis. Setting this
49f8fdf681SPrakash Surya  * will cause pool corruption on power loss if a volatile out-of-order
50f8fdf681SPrakash Surya  * write cache is enabled.
51f8fdf681SPrakash Surya  */
52f8fdf681SPrakash Surya boolean_t zfs_nocacheflush = B_FALSE;
53f8fdf681SPrakash Surya 
54fa9e4066Sahrens /*
55fa9e4066Sahrens  * Virtual device vector for disks.
56fa9e4066Sahrens  */
57fa9e4066Sahrens 
58fa9e4066Sahrens extern ldi_ident_t zfs_li;
59fa9e4066Sahrens 
6039cddb10SJoshua M. Clulow static void vdev_disk_close(vdev_t *);
6139cddb10SJoshua M. Clulow 
62ac04831dSMike Gerdts typedef struct vdev_disk {
63ac04831dSMike Gerdts 	ddi_devid_t	vd_devid;
64ac04831dSMike Gerdts 	char		*vd_minor;
65ac04831dSMike Gerdts 	ldi_handle_t	vd_lh;
66ac04831dSMike Gerdts 	list_t		vd_ldi_cbs;
67ac04831dSMike Gerdts 	boolean_t	vd_ldi_offline;
68ac04831dSMike Gerdts } vdev_disk_t;
69ac04831dSMike Gerdts 
70ac04831dSMike Gerdts typedef struct vdev_disk_buf {
71ac04831dSMike Gerdts 	buf_t	vdb_buf;
72ac04831dSMike Gerdts 	zio_t	*vdb_io;
73ac04831dSMike Gerdts } vdev_disk_buf_t;
74ac04831dSMike Gerdts 
7539cddb10SJoshua M. Clulow typedef struct vdev_disk_ldi_cb {
7639cddb10SJoshua M. Clulow 	list_node_t		lcb_next;
7739cddb10SJoshua M. Clulow 	ldi_callback_id_t	lcb_id;
7839cddb10SJoshua M. Clulow } vdev_disk_ldi_cb_t;
7939cddb10SJoshua M. Clulow 
806fe4f300SPavel Zakharov /*
816fe4f300SPavel Zakharov  * Bypass the devid when opening a disk vdev.
826fe4f300SPavel Zakharov  * There have been issues where the devids of several devices were shuffled,
836fe4f300SPavel Zakharov  * causing pool open failures. Note, that this flag is intended to be used
846fe4f300SPavel Zakharov  * for pool recovery only.
856fe4f300SPavel Zakharov  *
866fe4f300SPavel Zakharov  * Note that if a pool is imported with the devids bypassed, all its vdevs will
876fe4f300SPavel Zakharov  * cease storing devid information permanently. In practice, the devid is rarely
886fe4f300SPavel Zakharov  * useful as vdev paths do not tend to change unless the hardware is
896fe4f300SPavel Zakharov  * reconfigured. That said, if the paths do change and a pool fails to open
906fe4f300SPavel Zakharov  * automatically at boot, a simple zpool import should re-scan the paths and fix
916fe4f300SPavel Zakharov  * the issue.
926fe4f300SPavel Zakharov  */
936fe4f300SPavel Zakharov boolean_t vdev_disk_bypass_devid = B_FALSE;
946fe4f300SPavel Zakharov 
9539cddb10SJoshua M. Clulow static void
9639cddb10SJoshua M. Clulow vdev_disk_alloc(vdev_t *vd)
9739cddb10SJoshua M. Clulow {
9839cddb10SJoshua M. Clulow 	vdev_disk_t *dvd;
9939cddb10SJoshua M. Clulow 
10039cddb10SJoshua M. Clulow 	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
10139cddb10SJoshua M. Clulow 	/*
10239cddb10SJoshua M. Clulow 	 * Create the LDI event callback list.
10339cddb10SJoshua M. Clulow 	 */
10439cddb10SJoshua M. Clulow 	list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
10539cddb10SJoshua M. Clulow 	    offsetof(vdev_disk_ldi_cb_t, lcb_next));
10639cddb10SJoshua M. Clulow }
10739cddb10SJoshua M. Clulow 
10839cddb10SJoshua M. Clulow static void
10939cddb10SJoshua M. Clulow vdev_disk_free(vdev_t *vd)
11039cddb10SJoshua M. Clulow {
11139cddb10SJoshua M. Clulow 	vdev_disk_t *dvd = vd->vdev_tsd;
11239cddb10SJoshua M. Clulow 	vdev_disk_ldi_cb_t *lcb;
11339cddb10SJoshua M. Clulow 
11439cddb10SJoshua M. Clulow 	if (dvd == NULL)
11539cddb10SJoshua M. Clulow 		return;
11639cddb10SJoshua M. Clulow 
11739cddb10SJoshua M. Clulow 	/*
11839cddb10SJoshua M. Clulow 	 * We have already closed the LDI handle. Clean up the LDI event
11939cddb10SJoshua M. Clulow 	 * callbacks and free vd->vdev_tsd.
12039cddb10SJoshua M. Clulow 	 */
12139cddb10SJoshua M. Clulow 	while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
12239cddb10SJoshua M. Clulow 		list_remove(&dvd->vd_ldi_cbs, lcb);
12339cddb10SJoshua M. Clulow 		(void) ldi_ev_remove_callbacks(lcb->lcb_id);
12439cddb10SJoshua M. Clulow 		kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
12539cddb10SJoshua M. Clulow 	}
12639cddb10SJoshua M. Clulow 	list_destroy(&dvd->vd_ldi_cbs);
12739cddb10SJoshua M. Clulow 	kmem_free(dvd, sizeof (vdev_disk_t));
12839cddb10SJoshua M. Clulow 	vd->vdev_tsd = NULL;
12939cddb10SJoshua M. Clulow }
13039cddb10SJoshua M. Clulow 
13139cddb10SJoshua M. Clulow static int
1321b500975SMike Gerdts vdev_disk_off_notify(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie,
1331b500975SMike Gerdts     void *arg, void *ev_data __unused)
13439cddb10SJoshua M. Clulow {
13539cddb10SJoshua M. Clulow 	vdev_t *vd = (vdev_t *)arg;
13639cddb10SJoshua M. Clulow 	vdev_disk_t *dvd = vd->vdev_tsd;
13739cddb10SJoshua M. Clulow 
13839cddb10SJoshua M. Clulow 	/*
13939cddb10SJoshua M. Clulow 	 * Ignore events other than offline.
14039cddb10SJoshua M. Clulow 	 */
14139cddb10SJoshua M. Clulow 	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
14239cddb10SJoshua M. Clulow 		return (LDI_EV_SUCCESS);
14339cddb10SJoshua M. Clulow 
14439cddb10SJoshua M. Clulow 	/*
1451b500975SMike Gerdts 	 * Tell any new threads that stumble upon this vdev that they should not
1461b500975SMike Gerdts 	 * try to do I/O.
14739cddb10SJoshua M. Clulow 	 */
14839cddb10SJoshua M. Clulow 	dvd->vd_ldi_offline = B_TRUE;
14939cddb10SJoshua M. Clulow 
15039cddb10SJoshua M. Clulow 	/*
1511b500975SMike Gerdts 	 * Request that the spa_async_thread mark the device as REMOVED and
1521b500975SMike Gerdts 	 * notify FMA of the removal.  This should also trigger a vdev_close()
1531b500975SMike Gerdts 	 * in the async thread.
15439cddb10SJoshua M. Clulow 	 */
15539cddb10SJoshua M. Clulow 	zfs_post_remove(vd->vdev_spa, vd);
15639cddb10SJoshua M. Clulow 	vd->vdev_remove_wanted = B_TRUE;
15739cddb10SJoshua M. Clulow 	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
15839cddb10SJoshua M. Clulow 
15939cddb10SJoshua M. Clulow 	return (LDI_EV_SUCCESS);
16039cddb10SJoshua M. Clulow }
16139cddb10SJoshua M. Clulow 
16239cddb10SJoshua M. Clulow static void
1631b500975SMike Gerdts vdev_disk_off_finalize(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie,
1641b500975SMike Gerdts     int ldi_result, void *arg, void *ev_data __unused)
16539cddb10SJoshua M. Clulow {
16639cddb10SJoshua M. Clulow 	vdev_t *vd = (vdev_t *)arg;
16739cddb10SJoshua M. Clulow 
16839cddb10SJoshua M. Clulow 	/*
16939cddb10SJoshua M. Clulow 	 * Ignore events other than offline.
17039cddb10SJoshua M. Clulow 	 */
17139cddb10SJoshua M. Clulow 	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
17239cddb10SJoshua M. Clulow 		return;
17339cddb10SJoshua M. Clulow 
17439cddb10SJoshua M. Clulow 	/*
17539cddb10SJoshua M. Clulow 	 * Request that the vdev be reopened if the offline state change was
17639cddb10SJoshua M. Clulow 	 * unsuccessful.
17739cddb10SJoshua M. Clulow 	 */
17839cddb10SJoshua M. Clulow 	if (ldi_result != LDI_EV_SUCCESS) {
17939cddb10SJoshua M. Clulow 		vd->vdev_probe_wanted = B_TRUE;
18039cddb10SJoshua M. Clulow 		spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE);
18139cddb10SJoshua M. Clulow 	}
18239cddb10SJoshua M. Clulow }
18339cddb10SJoshua M. Clulow 
18439cddb10SJoshua M. Clulow static ldi_ev_callback_t vdev_disk_off_callb = {
18539cddb10SJoshua M. Clulow 	.cb_vers = LDI_EV_CB_VERS,
18639cddb10SJoshua M. Clulow 	.cb_notify = vdev_disk_off_notify,
18739cddb10SJoshua M. Clulow 	.cb_finalize = vdev_disk_off_finalize
18839cddb10SJoshua M. Clulow };
18939cddb10SJoshua M. Clulow 
19039cddb10SJoshua M. Clulow static void
1911b500975SMike Gerdts vdev_disk_dgrd_finalize(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie,
1921b500975SMike Gerdts     int ldi_result, void *arg, void *ev_data __unused)
19339cddb10SJoshua M. Clulow {
19439cddb10SJoshua M. Clulow 	vdev_t *vd = (vdev_t *)arg;
19539cddb10SJoshua M. Clulow 
19639cddb10SJoshua M. Clulow 	/*
19739cddb10SJoshua M. Clulow 	 * Ignore events other than degrade.
19839cddb10SJoshua M. Clulow 	 */
19939cddb10SJoshua M. Clulow 	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0)
20039cddb10SJoshua M. Clulow 		return;
20139cddb10SJoshua M. Clulow 
20239cddb10SJoshua M. Clulow 	/*
20339cddb10SJoshua M. Clulow 	 * Degrade events always succeed. Mark the vdev as degraded.
20439cddb10SJoshua M. Clulow 	 * This status is purely informative for the user.
20539cddb10SJoshua M. Clulow 	 */
20639cddb10SJoshua M. Clulow 	(void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0);
20739cddb10SJoshua M. Clulow }
20839cddb10SJoshua M. Clulow 
20939cddb10SJoshua M. Clulow static ldi_ev_callback_t vdev_disk_dgrd_callb = {
21039cddb10SJoshua M. Clulow 	.cb_vers = LDI_EV_CB_VERS,
21139cddb10SJoshua M. Clulow 	.cb_notify = NULL,
21239cddb10SJoshua M. Clulow 	.cb_finalize = vdev_disk_dgrd_finalize
21339cddb10SJoshua M. Clulow };
21439cddb10SJoshua M. Clulow 
215dcba9f3fSGeorge Wilson static void
216dcba9f3fSGeorge Wilson vdev_disk_hold(vdev_t *vd)
217dcba9f3fSGeorge Wilson {
218dcba9f3fSGeorge Wilson 	ddi_devid_t devid;
219dcba9f3fSGeorge Wilson 	char *minor;
220dcba9f3fSGeorge Wilson 
221dcba9f3fSGeorge Wilson 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
222dcba9f3fSGeorge Wilson 
223dcba9f3fSGeorge Wilson 	/*
224dcba9f3fSGeorge Wilson 	 * We must have a pathname, and it must be absolute.
225dcba9f3fSGeorge Wilson 	 */
226dcba9f3fSGeorge Wilson 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
227dcba9f3fSGeorge Wilson 		return;
228dcba9f3fSGeorge Wilson 
229dcba9f3fSGeorge Wilson 	/*
230dcba9f3fSGeorge Wilson 	 * Only prefetch path and devid info if the device has
231dcba9f3fSGeorge Wilson 	 * never been opened.
232dcba9f3fSGeorge Wilson 	 */
233dcba9f3fSGeorge Wilson 	if (vd->vdev_tsd != NULL)
234dcba9f3fSGeorge Wilson 		return;
235dcba9f3fSGeorge Wilson 
236dcba9f3fSGeorge Wilson 	if (vd->vdev_wholedisk == -1ULL) {
237dcba9f3fSGeorge Wilson 		size_t len = strlen(vd->vdev_path) + 3;
238dcba9f3fSGeorge Wilson 		char *buf = kmem_alloc(len, KM_SLEEP);
239dcba9f3fSGeorge Wilson 
240dcba9f3fSGeorge Wilson 		(void) snprintf(buf, len, "%ss0", vd->vdev_path);
241dcba9f3fSGeorge Wilson 
242dcba9f3fSGeorge Wilson 		(void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
243dcba9f3fSGeorge Wilson 		kmem_free(buf, len);
244dcba9f3fSGeorge Wilson 	}
245dcba9f3fSGeorge Wilson 
246dcba9f3fSGeorge Wilson 	if (vd->vdev_name_vp == NULL)
247dcba9f3fSGeorge Wilson 		(void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);
248dcba9f3fSGeorge Wilson 
249dcba9f3fSGeorge Wilson 	if (vd->vdev_devid != NULL &&
250dcba9f3fSGeorge Wilson 	    ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
251dcba9f3fSGeorge Wilson 		(void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
252dcba9f3fSGeorge Wilson 		ddi_devid_str_free(minor);
253dcba9f3fSGeorge Wilson 		ddi_devid_free(devid);
254dcba9f3fSGeorge Wilson 	}
255dcba9f3fSGeorge Wilson }
256dcba9f3fSGeorge Wilson 
257dcba9f3fSGeorge Wilson static void
258dcba9f3fSGeorge Wilson vdev_disk_rele(vdev_t *vd)
259dcba9f3fSGeorge Wilson {
260dcba9f3fSGeorge Wilson 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
261dcba9f3fSGeorge Wilson 
262dcba9f3fSGeorge Wilson 	if (vd->vdev_name_vp) {
263dcba9f3fSGeorge Wilson 		VN_RELE_ASYNC(vd->vdev_name_vp,
264dcba9f3fSGeorge Wilson 		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
265dcba9f3fSGeorge Wilson 		vd->vdev_name_vp = NULL;
266dcba9f3fSGeorge Wilson 	}
267dcba9f3fSGeorge Wilson 	if (vd->vdev_devid_vp) {
268dcba9f3fSGeorge Wilson 		VN_RELE_ASYNC(vd->vdev_devid_vp,
269dcba9f3fSGeorge Wilson 		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
270dcba9f3fSGeorge Wilson 		vd->vdev_devid_vp = NULL;
271dcba9f3fSGeorge Wilson 	}
272dcba9f3fSGeorge Wilson }
273dcba9f3fSGeorge Wilson 
/*
 * VDEV_DEBUG() is a cmn_err(CE_NOTE, ...) on DEBUG kernels and expands to
 * nothing otherwise.  It lets DEBUG kernels be loud when DKIOCGMEDIAINFOEXT
 * fails, or when even the fallback to DKIOCGMEDIAINFO fails.
 */
#if defined(DEBUG)
#define	VDEV_DEBUG(...)	cmn_err(CE_NOTE, __VA_ARGS__)
#else
#define	VDEV_DEBUG(...)	/* Nothing... */
#endif
283a5b57771SDan McDonald 
284fa9e4066Sahrens static int
2854263d13fSGeorge Wilson vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
2864263d13fSGeorge Wilson     uint64_t *ashift)
287fa9e4066Sahrens {
2888ad4d6ddSJeff Bonwick 	spa_t *spa = vd->vdev_spa;
28939cddb10SJoshua M. Clulow 	vdev_disk_t *dvd = vd->vdev_tsd;
29039cddb10SJoshua M. Clulow 	ldi_ev_cookie_t ecookie;
29139cddb10SJoshua M. Clulow 	vdev_disk_ldi_cb_t *lcb;
292a5b57771SDan McDonald 	union {
293a5b57771SDan McDonald 		struct dk_minfo_ext ude;
294a5b57771SDan McDonald 		struct dk_minfo ud;
295a5b57771SDan McDonald 	} dks;
296a5b57771SDan McDonald 	struct dk_minfo_ext *dkmext = &dks.ude;
297a5b57771SDan McDonald 	struct dk_minfo *dkm = &dks.ud;
298084fd14fSBrian Behlendorf 	int error, can_free;
299e14bb325SJeff Bonwick 	dev_t dev;
300e14bb325SJeff Bonwick 	int otyp;
301fb02ae02SGeorge Wilson 	boolean_t validate_devid = B_FALSE;
302a5b57771SDan McDonald 	uint64_t capacity = 0, blksz = 0, pbsize;
303fa9e4066Sahrens 
304fa9e4066Sahrens 	/*
305fa9e4066Sahrens 	 * We must have a pathname, and it must be absolute.
306fa9e4066Sahrens 	 */
307fa9e4066Sahrens 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
308fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
309be6fd75aSMatthew Ahrens 		return (SET_ERROR(EINVAL));
310fa9e4066Sahrens 	}
311fa9e4066Sahrens 
312095bcd66SGeorge Wilson 	/*
313095bcd66SGeorge Wilson 	 * Reopen the device if it's not currently open. Otherwise,
314095bcd66SGeorge Wilson 	 * just update the physical size of the device.
315095bcd66SGeorge Wilson 	 */
31639cddb10SJoshua M. Clulow 	if (dvd != NULL) {
3171b500975SMike Gerdts 		ASSERT(vd->vdev_reopening);
3181b500975SMike Gerdts 		goto skip_open;
319095bcd66SGeorge Wilson 	}
320095bcd66SGeorge Wilson 
32139cddb10SJoshua M. Clulow 	/*
32239cddb10SJoshua M. Clulow 	 * Create vd->vdev_tsd.
32339cddb10SJoshua M. Clulow 	 */
32439cddb10SJoshua M. Clulow 	vdev_disk_alloc(vd);
32539cddb10SJoshua M. Clulow 	dvd = vd->vdev_tsd;
326fa9e4066Sahrens 
3276fe4f300SPavel Zakharov 	/*
3286fe4f300SPavel Zakharov 	 * Allow bypassing the devid.
3296fe4f300SPavel Zakharov 	 */
3306fe4f300SPavel Zakharov 	if (vd->vdev_devid != NULL && vdev_disk_bypass_devid) {
3316fe4f300SPavel Zakharov 		vdev_dbgmsg(vd, "vdev_disk_open, devid %s bypassed",
3326fe4f300SPavel Zakharov 		    vd->vdev_devid);
3336fe4f300SPavel Zakharov 		spa_strfree(vd->vdev_devid);
3346fe4f300SPavel Zakharov 		vd->vdev_devid = NULL;
3356fe4f300SPavel Zakharov 	}
3366fe4f300SPavel Zakharov 
337fa9e4066Sahrens 	/*
338fa9e4066Sahrens 	 * When opening a disk device, we want to preserve the user's original
339fa9e4066Sahrens 	 * intent.  We always want to open the device by the path the user gave
3401724dc7bSJoshua M. Clulow 	 * us, even if it is one of multiple paths to the same device.  But we
341fa9e4066Sahrens 	 * also want to be able to survive disks being removed/recabled.
342fa9e4066Sahrens 	 * Therefore the sequence of opening devices is:
343fa9e4066Sahrens 	 *
344afefbcddSeschrock 	 * 1. Try opening the device by path.  For legacy pools without the
345afefbcddSeschrock 	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
346fa9e4066Sahrens 	 *
347fa9e4066Sahrens 	 * 2. If the devid of the device matches the stored value, return
348fa9e4066Sahrens 	 *    success.
349fa9e4066Sahrens 	 *
350fa9e4066Sahrens 	 * 3. Otherwise, the device may have moved.  Try opening the device
351fa9e4066Sahrens 	 *    by the devid instead.
352fa9e4066Sahrens 	 */
353fa9e4066Sahrens 	if (vd->vdev_devid != NULL) {
354fa9e4066Sahrens 		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
355fa9e4066Sahrens 		    &dvd->vd_minor) != 0) {
356fa9e4066Sahrens 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
3573ee8c80cSPavel Zakharov 			vdev_dbgmsg(vd, "vdev_disk_open: invalid "
3583ee8c80cSPavel Zakharov 			    "vdev_devid '%s'", vd->vdev_devid);
359be6fd75aSMatthew Ahrens 			return (SET_ERROR(EINVAL));
360fa9e4066Sahrens 		}
361fa9e4066Sahrens 	}
362fa9e4066Sahrens 
363fa9e4066Sahrens 	error = EINVAL;		/* presume failure */
364fa9e4066Sahrens 
365095bcd66SGeorge Wilson 	if (vd->vdev_path != NULL) {
366afefbcddSeschrock 		if (vd->vdev_wholedisk == -1ULL) {
367afefbcddSeschrock 			size_t len = strlen(vd->vdev_path) + 3;
368afefbcddSeschrock 			char *buf = kmem_alloc(len, KM_SLEEP);
369afefbcddSeschrock 
370afefbcddSeschrock 			(void) snprintf(buf, len, "%ss0", vd->vdev_path);
371afefbcddSeschrock 
37239cddb10SJoshua M. Clulow 			error = ldi_open_by_name(buf, spa_mode(spa), kcred,
37339cddb10SJoshua M. Clulow 			    &dvd->vd_lh, zfs_li);
37439cddb10SJoshua M. Clulow 			if (error == 0) {
375afefbcddSeschrock 				spa_strfree(vd->vdev_path);
376afefbcddSeschrock 				vd->vdev_path = buf;
377afefbcddSeschrock 				vd->vdev_wholedisk = 1ULL;
378afefbcddSeschrock 			} else {
379afefbcddSeschrock 				kmem_free(buf, len);
380afefbcddSeschrock 			}
381afefbcddSeschrock 		}
382fa9e4066Sahrens 
38339cddb10SJoshua M. Clulow 		/*
38439cddb10SJoshua M. Clulow 		 * If we have not yet opened the device, try to open it by the
38539cddb10SJoshua M. Clulow 		 * specified path.
38639cddb10SJoshua M. Clulow 		 */
38739cddb10SJoshua M. Clulow 		if (error != 0) {
38839cddb10SJoshua M. Clulow 			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
38939cddb10SJoshua M. Clulow 			    kcred, &dvd->vd_lh, zfs_li);
39039cddb10SJoshua M. Clulow 		}
391fa9e4066Sahrens 
392fa9e4066Sahrens 		/*
393fa9e4066Sahrens 		 * Compare the devid to the stored value.
394fa9e4066Sahrens 		 */
3956af23589SJoshua M. Clulow 		if (error == 0 && vd->vdev_devid != NULL) {
3966af23589SJoshua M. Clulow 			ddi_devid_t devid = NULL;
3976af23589SJoshua M. Clulow 
3986af23589SJoshua M. Clulow 			if (ldi_get_devid(dvd->vd_lh, &devid) != 0) {
3996af23589SJoshua M. Clulow 				/*
4006af23589SJoshua M. Clulow 				 * We expected a devid on this device but it no
4016af23589SJoshua M. Clulow 				 * longer appears to have one.  The validation
4026af23589SJoshua M. Clulow 				 * step may need to remove it from the
4036af23589SJoshua M. Clulow 				 * configuration.
4046af23589SJoshua M. Clulow 				 */
4056af23589SJoshua M. Clulow 				validate_devid = B_TRUE;
4066af23589SJoshua M. Clulow 
4076af23589SJoshua M. Clulow 			} else if (ddi_devid_compare(devid, dvd->vd_devid) !=
4086af23589SJoshua M. Clulow 			    0) {
4096fe4f300SPavel Zakharov 				/*
4106fe4f300SPavel Zakharov 				 * A mismatch here is unexpected, log it.
4116fe4f300SPavel Zakharov 				 */
4126fe4f300SPavel Zakharov 				char *devid_str = ddi_devid_str_encode(devid,
4136fe4f300SPavel Zakharov 				    dvd->vd_minor);
4146fe4f300SPavel Zakharov 				vdev_dbgmsg(vd, "vdev_disk_open: devid "
4156fe4f300SPavel Zakharov 				    "mismatch: %s != %s", vd->vdev_devid,
4166fe4f300SPavel Zakharov 				    devid_str);
4176fe4f300SPavel Zakharov 				cmn_err(CE_NOTE, "vdev_disk_open %s: devid "
4186fe4f300SPavel Zakharov 				    "mismatch: %s != %s", vd->vdev_path,
4196fe4f300SPavel Zakharov 				    vd->vdev_devid, devid_str);
4206fe4f300SPavel Zakharov 				ddi_devid_str_free(devid_str);
4216fe4f300SPavel Zakharov 
422be6fd75aSMatthew Ahrens 				error = SET_ERROR(EINVAL);
4238ad4d6ddSJeff Bonwick 				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
4248ad4d6ddSJeff Bonwick 				    kcred);
425fa9e4066Sahrens 				dvd->vd_lh = NULL;
426fa9e4066Sahrens 			}
4276af23589SJoshua M. Clulow 
4286af23589SJoshua M. Clulow 			if (devid != NULL) {
4296af23589SJoshua M. Clulow 				ddi_devid_free(devid);
4306af23589SJoshua M. Clulow 			}
431fa9e4066Sahrens 		}
432afefbcddSeschrock 
433afefbcddSeschrock 		/*
434afefbcddSeschrock 		 * If we succeeded in opening the device, but 'vdev_wholedisk'
435afefbcddSeschrock 		 * is not yet set, then this must be a slice.
436afefbcddSeschrock 		 */
437afefbcddSeschrock 		if (error == 0 && vd->vdev_wholedisk == -1ULL)
438afefbcddSeschrock 			vd->vdev_wholedisk = 0;
439fa9e4066Sahrens 	}
440fa9e4066Sahrens 
441fa9e4066Sahrens 	/*
442fa9e4066Sahrens 	 * If we were unable to open by path, or the devid check fails, open by
443fa9e4066Sahrens 	 * devid instead.
444fa9e4066Sahrens 	 */
445fb02ae02SGeorge Wilson 	if (error != 0 && vd->vdev_devid != NULL) {
446fa9e4066Sahrens 		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
4478ad4d6ddSJeff Bonwick 		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
4486fe4f300SPavel Zakharov 		if (error != 0) {
4496fe4f300SPavel Zakharov 			vdev_dbgmsg(vd, "Failed to open by devid (%s)",
4506fe4f300SPavel Zakharov 			    vd->vdev_devid);
4516fe4f300SPavel Zakharov 		}
452fb02ae02SGeorge Wilson 	}
453fa9e4066Sahrens 
4543d7072f8Seschrock 	/*
4553d7072f8Seschrock 	 * If all else fails, then try opening by physical path (if available)
4563d7072f8Seschrock 	 * or the logical path (if we failed due to the devid check).  While not
4573d7072f8Seschrock 	 * as reliable as the devid, this will give us something, and the higher
4583d7072f8Seschrock 	 * level vdev validation will prevent us from opening the wrong device.
4593d7072f8Seschrock 	 */
4606af23589SJoshua M. Clulow 	if (error != 0) {
4616af23589SJoshua M. Clulow 		validate_devid = B_TRUE;
462fb02ae02SGeorge Wilson 
4633d7072f8Seschrock 		if (vd->vdev_physpath != NULL &&
4646af23589SJoshua M. Clulow 		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) {
4658ad4d6ddSJeff Bonwick 			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
4663d7072f8Seschrock 			    kcred, &dvd->vd_lh, zfs_li);
4676af23589SJoshua M. Clulow 		}
4683d7072f8Seschrock 
4693d7072f8Seschrock 		/*
4703d7072f8Seschrock 		 * Note that we don't support the legacy auto-wholedisk support
4713d7072f8Seschrock 		 * as above.  This hasn't been used in a very long time and we
4723d7072f8Seschrock 		 * don't need to propagate its oddities to this edge condition.
4733d7072f8Seschrock 		 */
4746af23589SJoshua M. Clulow 		if (error != 0 && vd->vdev_path != NULL) {
4758ad4d6ddSJeff Bonwick 			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
4768ad4d6ddSJeff Bonwick 			    kcred, &dvd->vd_lh, zfs_li);
4776af23589SJoshua M. Clulow 		}
4783d7072f8Seschrock 	}
4793d7072f8Seschrock 
480*30c304d9SJoshua M. Clulow 	/*
481*30c304d9SJoshua M. Clulow 	 * If this is early in boot, a sweep of available block devices may
482*30c304d9SJoshua M. Clulow 	 * locate an alternative path that we can try.
483*30c304d9SJoshua M. Clulow 	 */
484*30c304d9SJoshua M. Clulow 	if (error != 0) {
485*30c304d9SJoshua M. Clulow 		const char *altdevpath = vdev_disk_preroot_lookup(
486*30c304d9SJoshua M. Clulow 		    spa_guid(spa), vd->vdev_guid);
487*30c304d9SJoshua M. Clulow 
488*30c304d9SJoshua M. Clulow 		if (altdevpath != NULL) {
489*30c304d9SJoshua M. Clulow 			vdev_dbgmsg(vd, "Trying alternate preroot path (%s)",
490*30c304d9SJoshua M. Clulow 			    altdevpath);
491*30c304d9SJoshua M. Clulow 
492*30c304d9SJoshua M. Clulow 			validate_devid = B_TRUE;
493*30c304d9SJoshua M. Clulow 
494*30c304d9SJoshua M. Clulow 			if ((error = ldi_open_by_name((char *)altdevpath,
495*30c304d9SJoshua M. Clulow 			    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li)) != 0) {
496*30c304d9SJoshua M. Clulow 				vdev_dbgmsg(vd, "Failed to open by preroot "
497*30c304d9SJoshua M. Clulow 				    "path (%s)", altdevpath);
498*30c304d9SJoshua M. Clulow 			}
499*30c304d9SJoshua M. Clulow 		}
500*30c304d9SJoshua M. Clulow 	}
501*30c304d9SJoshua M. Clulow 
5026af23589SJoshua M. Clulow 	if (error != 0) {
503fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
5043ee8c80cSPavel Zakharov 		vdev_dbgmsg(vd, "vdev_disk_open: failed to open [error=%d]",
5053ee8c80cSPavel Zakharov 		    error);
506fa9e4066Sahrens 		return (error);
507e14bb325SJeff Bonwick 	}
508fa9e4066Sahrens 
509fb02ae02SGeorge Wilson 	/*
510fb02ae02SGeorge Wilson 	 * Now that the device has been successfully opened, update the devid
511fb02ae02SGeorge Wilson 	 * if necessary.
512fb02ae02SGeorge Wilson 	 */
5136af23589SJoshua M. Clulow 	if (validate_devid) {
5146af23589SJoshua M. Clulow 		ddi_devid_t devid = NULL;
5156af23589SJoshua M. Clulow 		char *minorname = NULL;
5166af23589SJoshua M. Clulow 		char *vd_devid = NULL;
5176af23589SJoshua M. Clulow 		boolean_t remove = B_FALSE, update = B_FALSE;
5186af23589SJoshua M. Clulow 
5196af23589SJoshua M. Clulow 		/*
5206af23589SJoshua M. Clulow 		 * Get the current devid and minor name for the device we
5216af23589SJoshua M. Clulow 		 * opened.
5226af23589SJoshua M. Clulow 		 */
5236af23589SJoshua M. Clulow 		if (ldi_get_devid(dvd->vd_lh, &devid) != 0 ||
5246af23589SJoshua M. Clulow 		    ldi_get_minor_name(dvd->vd_lh, &minorname) != 0) {
5256af23589SJoshua M. Clulow 			/*
5266af23589SJoshua M. Clulow 			 * If we are unable to get the devid or the minor name
5276af23589SJoshua M. Clulow 			 * for the device, we need to remove them from the
5286af23589SJoshua M. Clulow 			 * configuration to prevent potential inconsistencies.
5296af23589SJoshua M. Clulow 			 */
5306af23589SJoshua M. Clulow 			if (dvd->vd_minor != NULL || dvd->vd_devid != NULL ||
5316af23589SJoshua M. Clulow 			    vd->vdev_devid != NULL) {
5326af23589SJoshua M. Clulow 				/*
5336af23589SJoshua M. Clulow 				 * We only need to remove the devid if one
5346af23589SJoshua M. Clulow 				 * exists.
5356af23589SJoshua M. Clulow 				 */
5366af23589SJoshua M. Clulow 				remove = B_TRUE;
5376af23589SJoshua M. Clulow 			}
538fb02ae02SGeorge Wilson 
5396af23589SJoshua M. Clulow 		} else if (dvd->vd_devid == NULL || dvd->vd_minor == NULL) {
5406af23589SJoshua M. Clulow 			/*
5416af23589SJoshua M. Clulow 			 * There was previously no devid at all so we need to
5426af23589SJoshua M. Clulow 			 * add one.
5436af23589SJoshua M. Clulow 			 */
5446af23589SJoshua M. Clulow 			update = B_TRUE;
5456af23589SJoshua M. Clulow 
5466af23589SJoshua M. Clulow 		} else if (ddi_devid_compare(devid, dvd->vd_devid) != 0 ||
5476af23589SJoshua M. Clulow 		    strcmp(minorname, dvd->vd_minor) != 0) {
5486af23589SJoshua M. Clulow 			/*
5496af23589SJoshua M. Clulow 			 * The devid or minor name on file does not match the
5506af23589SJoshua M. Clulow 			 * one from the opened device.
5516af23589SJoshua M. Clulow 			 */
5526af23589SJoshua M. Clulow 			update = B_TRUE;
5536af23589SJoshua M. Clulow 		}
5546af23589SJoshua M. Clulow 
5556af23589SJoshua M. Clulow 		if (update) {
5566af23589SJoshua M. Clulow 			/*
5576af23589SJoshua M. Clulow 			 * Render the new devid and minor name as a string for
5586af23589SJoshua M. Clulow 			 * logging and to store in the vdev configuration.
5596af23589SJoshua M. Clulow 			 */
5606af23589SJoshua M. Clulow 			vd_devid = ddi_devid_str_encode(devid, minorname);
5616af23589SJoshua M. Clulow 		}
5626af23589SJoshua M. Clulow 
5636af23589SJoshua M. Clulow 		if (update || remove) {
5643ee8c80cSPavel Zakharov 			vdev_dbgmsg(vd, "vdev_disk_open: update devid from "
5656af23589SJoshua M. Clulow 			    "'%s' to '%s'",
5666af23589SJoshua M. Clulow 			    vd->vdev_devid != NULL ? vd->vdev_devid : "<none>",
5676af23589SJoshua M. Clulow 			    vd_devid != NULL ? vd_devid : "<none>");
5686fe4f300SPavel Zakharov 			cmn_err(CE_NOTE, "vdev_disk_open %s: update devid "
5696af23589SJoshua M. Clulow 			    "from '%s' to '%s'",
5706af23589SJoshua M. Clulow 			    vd->vdev_path != NULL ? vd->vdev_path : "?",
5716af23589SJoshua M. Clulow 			    vd->vdev_devid != NULL ? vd->vdev_devid : "<none>",
5726af23589SJoshua M. Clulow 			    vd_devid != NULL ? vd_devid : "<none>");
5736af23589SJoshua M. Clulow 
5746af23589SJoshua M. Clulow 			/*
5756af23589SJoshua M. Clulow 			 * Remove and free any existing values.
5766af23589SJoshua M. Clulow 			 */
5776af23589SJoshua M. Clulow 			if (dvd->vd_minor != NULL) {
5786af23589SJoshua M. Clulow 				ddi_devid_str_free(dvd->vd_minor);
5796af23589SJoshua M. Clulow 				dvd->vd_minor = NULL;
5806af23589SJoshua M. Clulow 			}
5816af23589SJoshua M. Clulow 			if (dvd->vd_devid != NULL) {
5826af23589SJoshua M. Clulow 				ddi_devid_free(dvd->vd_devid);
5836af23589SJoshua M. Clulow 				dvd->vd_devid = NULL;
5846af23589SJoshua M. Clulow 			}
5856af23589SJoshua M. Clulow 			if (vd->vdev_devid != NULL) {
5866af23589SJoshua M. Clulow 				spa_strfree(vd->vdev_devid);
5876af23589SJoshua M. Clulow 				vd->vdev_devid = NULL;
5886af23589SJoshua M. Clulow 			}
5896af23589SJoshua M. Clulow 		}
5906af23589SJoshua M. Clulow 
5916af23589SJoshua M. Clulow 		if (update) {
5926af23589SJoshua M. Clulow 			/*
5936af23589SJoshua M. Clulow 			 * Install the new values.
5946af23589SJoshua M. Clulow 			 */
5956af23589SJoshua M. Clulow 			vd->vdev_devid = vd_devid;
5966af23589SJoshua M. Clulow 			dvd->vd_minor = minorname;
5976af23589SJoshua M. Clulow 			dvd->vd_devid = devid;
5986af23589SJoshua M. Clulow 
5996af23589SJoshua M. Clulow 		} else {
6006af23589SJoshua M. Clulow 			if (devid != NULL) {
6016af23589SJoshua M. Clulow 				ddi_devid_free(devid);
6026af23589SJoshua M. Clulow 			}
6036af23589SJoshua M. Clulow 			if (minorname != NULL) {
6046af23589SJoshua M. Clulow 				kmem_free(minorname, strlen(minorname) + 1);
6056af23589SJoshua M. Clulow 			}
606fb02ae02SGeorge Wilson 		}
607fb02ae02SGeorge Wilson 	}
608fb02ae02SGeorge Wilson 
6093d7072f8Seschrock 	/*
6103d7072f8Seschrock 	 * Once a device is opened, verify that the physical device path (if
6113d7072f8Seschrock 	 * available) is up to date.
6123d7072f8Seschrock 	 */
6133d7072f8Seschrock 	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
6143d7072f8Seschrock 	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
6150a4e9518Sgw 		char *physpath, *minorname;
6160a4e9518Sgw 
6173d7072f8Seschrock 		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
6183d7072f8Seschrock 		minorname = NULL;
6193d7072f8Seschrock 		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
6203d7072f8Seschrock 		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
6213d7072f8Seschrock 		    (vd->vdev_physpath == NULL ||
6223d7072f8Seschrock 		    strcmp(vd->vdev_physpath, physpath) != 0)) {
6233d7072f8Seschrock 			if (vd->vdev_physpath)
6243d7072f8Seschrock 				spa_strfree(vd->vdev_physpath);
6253d7072f8Seschrock 			(void) strlcat(physpath, ":", MAXPATHLEN);
6263d7072f8Seschrock 			(void) strlcat(physpath, minorname, MAXPATHLEN);
6273d7072f8Seschrock 			vd->vdev_physpath = spa_strdup(physpath);
6283d7072f8Seschrock 		}
6293d7072f8Seschrock 		if (minorname)
6303d7072f8Seschrock 			kmem_free(minorname, strlen(minorname) + 1);
6313d7072f8Seschrock 		kmem_free(physpath, MAXPATHLEN);
6323d7072f8Seschrock 	}
6333d7072f8Seschrock 
63439cddb10SJoshua M. Clulow 	/*
63539cddb10SJoshua M. Clulow 	 * Register callbacks for the LDI offline event.
63639cddb10SJoshua M. Clulow 	 */
63739cddb10SJoshua M. Clulow 	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) ==
63839cddb10SJoshua M. Clulow 	    LDI_EV_SUCCESS) {
63939cddb10SJoshua M. Clulow 		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
64039cddb10SJoshua M. Clulow 		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
64139cddb10SJoshua M. Clulow 		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
64239cddb10SJoshua M. Clulow 		    &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id);
64339cddb10SJoshua M. Clulow 	}
64439cddb10SJoshua M. Clulow 
64539cddb10SJoshua M. Clulow 	/*
64639cddb10SJoshua M. Clulow 	 * Register callbacks for the LDI degrade event.
64739cddb10SJoshua M. Clulow 	 */
64839cddb10SJoshua M. Clulow 	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) ==
64939cddb10SJoshua M. Clulow 	    LDI_EV_SUCCESS) {
65039cddb10SJoshua M. Clulow 		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
65139cddb10SJoshua M. Clulow 		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
65239cddb10SJoshua M. Clulow 		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
65339cddb10SJoshua M. Clulow 		    &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id);
65439cddb10SJoshua M. Clulow 	}
655084fd14fSBrian Behlendorf 
656095bcd66SGeorge Wilson skip_open:
657fa9e4066Sahrens 	/*
658fa9e4066Sahrens 	 * Determine the actual size of the device.
659fa9e4066Sahrens 	 */
660fa9e4066Sahrens 	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
661fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
6623ee8c80cSPavel Zakharov 		vdev_dbgmsg(vd, "vdev_disk_open: failed to get size");
663be6fd75aSMatthew Ahrens 		return (SET_ERROR(EINVAL));
664fa9e4066Sahrens 	}
665fa9e4066Sahrens 
666a5b57771SDan McDonald 	*max_psize = *psize;
667a5b57771SDan McDonald 
668ecc2d604Sbonwick 	/*
669ecc2d604Sbonwick 	 * Determine the device's minimum transfer size.
670ecc2d604Sbonwick 	 * If the ioctl isn't supported, assume DEV_BSIZE.
671ecc2d604Sbonwick 	 */
672a5b57771SDan McDonald 	if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT,
673a5b57771SDan McDonald 	    (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) {
674a5b57771SDan McDonald 		capacity = dkmext->dki_capacity - 1;
675a5b57771SDan McDonald 		blksz = dkmext->dki_lbsize;
676a5b57771SDan McDonald 		pbsize = dkmext->dki_pbsize;
677a5b57771SDan McDonald 	} else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO,
678a5b57771SDan McDonald 	    (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) {
679a5b57771SDan McDonald 		VDEV_DEBUG(
680a5b57771SDan McDonald 		    "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n",
681a5b57771SDan McDonald 		    vd->vdev_path);
682a5b57771SDan McDonald 		capacity = dkm->dki_capacity - 1;
683a5b57771SDan McDonald 		blksz = dkm->dki_lbsize;
684a5b57771SDan McDonald 		pbsize = blksz;
685a5b57771SDan McDonald 	} else {
686a5b57771SDan McDonald 		VDEV_DEBUG("vdev_disk_open(\"%s\"): "
687a5b57771SDan McDonald 		    "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n",
688a5b57771SDan McDonald 		    vd->vdev_path, error);
689a5b57771SDan McDonald 		pbsize = DEV_BSIZE;
690a5b57771SDan McDonald 	}
691bef6b7d2Swebaker 
692bf16b11eSMatthew Ahrens 	*ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1;
693bef6b7d2Swebaker 
6944263d13fSGeorge Wilson 	if (vd->vdev_wholedisk == 1) {
6954263d13fSGeorge Wilson 		int wce = 1;
6964263d13fSGeorge Wilson 
697a5b57771SDan McDonald 		if (error == 0) {
698a5b57771SDan McDonald 			/*
699a5b57771SDan McDonald 			 * If we have the capability to expand, we'd have
700a5b57771SDan McDonald 			 * found out via success from DKIOCGMEDIAINFO{,EXT}.
701a5b57771SDan McDonald 			 * Adjust max_psize upward accordingly since we know
702a5b57771SDan McDonald 			 * we own the whole disk now.
703a5b57771SDan McDonald 			 */
704c39a2aaeSGeorge Wilson 			*max_psize = capacity * blksz;
705a5b57771SDan McDonald 		}
706a5b57771SDan McDonald 
7074263d13fSGeorge Wilson 		/*
708a5b57771SDan McDonald 		 * Since we own the whole disk, try to enable disk write
709a5b57771SDan McDonald 		 * caching.  We ignore errors because it's OK if we can't do it.
7104263d13fSGeorge Wilson 		 */
7114263d13fSGeorge Wilson 		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
7124263d13fSGeorge Wilson 		    FKIOCTL, kcred, NULL);
7134263d13fSGeorge Wilson 	}
7144263d13fSGeorge Wilson 
715b468a217Seschrock 	/*
716b468a217Seschrock 	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
717b468a217Seschrock 	 * try again.
718b468a217Seschrock 	 */
719b468a217Seschrock 	vd->vdev_nowritecache = B_FALSE;
720b468a217Seschrock 
721084fd14fSBrian Behlendorf 	if (ldi_ioctl(dvd->vd_lh, DKIOC_CANFREE, (intptr_t)&can_free, FKIOCTL,
722084fd14fSBrian Behlendorf 	    kcred, NULL) == 0 && can_free == 1) {
723084fd14fSBrian Behlendorf 		vd->vdev_has_trim = B_TRUE;
724084fd14fSBrian Behlendorf 	} else {
725084fd14fSBrian Behlendorf 		vd->vdev_has_trim = B_FALSE;
726084fd14fSBrian Behlendorf 	}
727084fd14fSBrian Behlendorf 
728fb05b94aSJerry Jelinek 	if (zfs_no_trim == 1)
729fb05b94aSJerry Jelinek 		vd->vdev_has_trim = B_FALSE;
730fb05b94aSJerry Jelinek 
731084fd14fSBrian Behlendorf 	/* Currently only supported for ZoL. */
732084fd14fSBrian Behlendorf 	vd->vdev_has_securetrim = B_FALSE;
733084fd14fSBrian Behlendorf 
73412a8814cSTom Caputi 	/* Inform the ZIO pipeline that we are non-rotational */
73512a8814cSTom Caputi 	vd->vdev_nonrot = B_FALSE;
73612a8814cSTom Caputi 	if (ldi_prop_exists(dvd->vd_lh, DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
73712a8814cSTom Caputi 	    "device-solid-state")) {
73812a8814cSTom Caputi 		if (ldi_prop_get_int(dvd->vd_lh,
73912a8814cSTom Caputi 		    LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
74012a8814cSTom Caputi 		    "device-solid-state", B_FALSE) != 0)
74112a8814cSTom Caputi 			vd->vdev_nonrot = B_TRUE;
74212a8814cSTom Caputi 	}
74312a8814cSTom Caputi 
744fa9e4066Sahrens 	return (0);
745fa9e4066Sahrens }
746fa9e4066Sahrens 
747fa9e4066Sahrens static void
748fa9e4066Sahrens vdev_disk_close(vdev_t *vd)
749fa9e4066Sahrens {
750fa9e4066Sahrens 	vdev_disk_t *dvd = vd->vdev_tsd;
751fa9e4066Sahrens 
752095bcd66SGeorge Wilson 	if (vd->vdev_reopening || dvd == NULL)
753fa9e4066Sahrens 		return;
754fa9e4066Sahrens 
75539cddb10SJoshua M. Clulow 	if (dvd->vd_minor != NULL) {
756fa9e4066Sahrens 		ddi_devid_str_free(dvd->vd_minor);
75739cddb10SJoshua M. Clulow 		dvd->vd_minor = NULL;
75839cddb10SJoshua M. Clulow 	}
759fa9e4066Sahrens 
76039cddb10SJoshua M. Clulow 	if (dvd->vd_devid != NULL) {
761fa9e4066Sahrens 		ddi_devid_free(dvd->vd_devid);
76239cddb10SJoshua M. Clulow 		dvd->vd_devid = NULL;
76339cddb10SJoshua M. Clulow 	}
764fa9e4066Sahrens 
76539cddb10SJoshua M. Clulow 	if (dvd->vd_lh != NULL) {
7668ad4d6ddSJeff Bonwick 		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
76739cddb10SJoshua M. Clulow 		dvd->vd_lh = NULL;
76839cddb10SJoshua M. Clulow 	}
769fa9e4066Sahrens 
77098d1cbfeSGeorge Wilson 	vd->vdev_delayed_close = B_FALSE;
77139cddb10SJoshua M. Clulow 	vdev_disk_free(vd);
772fa9e4066Sahrens }
773fa9e4066Sahrens 
774ac04831dSMike Gerdts static int
775810e43b2SBill Pijewski vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
776810e43b2SBill Pijewski     size_t size, uint64_t offset, int flags)
777e7cbe64fSgw {
778e7cbe64fSgw 	buf_t *bp;
779e7cbe64fSgw 	int error = 0;
780e7cbe64fSgw 
781e7cbe64fSgw 	if (vd_lh == NULL)
782be6fd75aSMatthew Ahrens 		return (SET_ERROR(EINVAL));
783e7cbe64fSgw 
784e7cbe64fSgw 	ASSERT(flags & B_READ || flags & B_WRITE);
785e7cbe64fSgw 
786e7cbe64fSgw 	bp = getrbuf(KM_SLEEP);
787e7cbe64fSgw 	bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
788e7cbe64fSgw 	bp->b_bcount = size;
789e7cbe64fSgw 	bp->b_un.b_addr = (void *)data;
790e7cbe64fSgw 	bp->b_lblkno = lbtodb(offset);
791e7cbe64fSgw 	bp->b_bufsize = size;
792e7cbe64fSgw 
793e7cbe64fSgw 	error = ldi_strategy(vd_lh, bp);
794e7cbe64fSgw 	ASSERT(error == 0);
795e7cbe64fSgw 	if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
796be6fd75aSMatthew Ahrens 		error = SET_ERROR(EIO);
797e7cbe64fSgw 	freerbuf(bp);
798e7cbe64fSgw 
799e7cbe64fSgw 	return (error);
800e7cbe64fSgw }
801e7cbe64fSgw 
802ac04831dSMike Gerdts static int
803ac04831dSMike Gerdts vdev_disk_dumpio(vdev_t *vd, caddr_t data, size_t size,
8041b500975SMike Gerdts     uint64_t offset, uint64_t origoffset __unused, boolean_t doread,
8051b500975SMike Gerdts     boolean_t isdump)
806ac04831dSMike Gerdts {
807ac04831dSMike Gerdts 	vdev_disk_t *dvd = vd->vdev_tsd;
808ac04831dSMike Gerdts 	int flags = doread ? B_READ : B_WRITE;
809ac04831dSMike Gerdts 
810ac04831dSMike Gerdts 	/*
811ac04831dSMike Gerdts 	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
812ac04831dSMike Gerdts 	 * Nothing to be done here but return failure.
813ac04831dSMike Gerdts 	 */
814ac04831dSMike Gerdts 	if (dvd == NULL || dvd->vd_ldi_offline) {
8151b500975SMike Gerdts 		return (SET_ERROR(ENXIO));
816ac04831dSMike Gerdts 	}
817ac04831dSMike Gerdts 
818ac04831dSMike Gerdts 	ASSERT(vd->vdev_ops == &vdev_disk_ops);
819ac04831dSMike Gerdts 
820ac04831dSMike Gerdts 	offset += VDEV_LABEL_START_SIZE;
821ac04831dSMike Gerdts 
822ac04831dSMike Gerdts 	/*
823ac04831dSMike Gerdts 	 * If in the context of an active crash dump, use the ldi_dump(9F)
824ac04831dSMike Gerdts 	 * call instead of ldi_strategy(9F) as usual.
825ac04831dSMike Gerdts 	 */
826ac04831dSMike Gerdts 	if (isdump) {
827ac04831dSMike Gerdts 		ASSERT3P(dvd, !=, NULL);
828ac04831dSMike Gerdts 		return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
829ac04831dSMike Gerdts 		    lbtodb(size)));
830ac04831dSMike Gerdts 	}
831ac04831dSMike Gerdts 
832ac04831dSMike Gerdts 	return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
833ac04831dSMike Gerdts }
834ac04831dSMike Gerdts 
835c62757b2SToomas Soome static int
836fa9e4066Sahrens vdev_disk_io_intr(buf_t *bp)
837fa9e4066Sahrens {
83831d7e8faSGeorge Wilson 	vdev_buf_t *vb = (vdev_buf_t *)bp;
83931d7e8faSGeorge Wilson 	zio_t *zio = vb->vb_io;
840fa9e4066Sahrens 
84151ece835Seschrock 	/*
84251ece835Seschrock 	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
84351ece835Seschrock 	 * Rather than teach the rest of the stack about other error
84451ece835Seschrock 	 * possibilities (EFAULT, etc), we normalize the error value here.
84551ece835Seschrock 	 */
84651ece835Seschrock 	zio->io_error = (geterror(bp) != 0 ? EIO : 0);
84751ece835Seschrock 
84851ece835Seschrock 	if (zio->io_error == 0 && bp->b_resid != 0)
849be6fd75aSMatthew Ahrens 		zio->io_error = SET_ERROR(EIO);
850fa9e4066Sahrens 
851770499e1SDan Kimmel 	if (zio->io_type == ZIO_TYPE_READ) {
852770499e1SDan Kimmel 		abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size);
853770499e1SDan Kimmel 	} else {
854770499e1SDan Kimmel 		abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
855770499e1SDan Kimmel 	}
856770499e1SDan Kimmel 
85731d7e8faSGeorge Wilson 	kmem_free(vb, sizeof (vdev_buf_t));
858fa9e4066Sahrens 
85997e81309SPrakash Surya 	zio_delay_interrupt(zio);
860c62757b2SToomas Soome 	return (0);
861fa9e4066Sahrens }
862fa9e4066Sahrens 
863f4a72450SJeff Bonwick static void
864f4a72450SJeff Bonwick vdev_disk_ioctl_free(zio_t *zio)
865f4a72450SJeff Bonwick {
866f4a72450SJeff Bonwick 	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
867f4a72450SJeff Bonwick }
868f4a72450SJeff Bonwick 
86922fe2c88SJonathan Adams static const zio_vsd_ops_t vdev_disk_vsd_ops = {
87022fe2c88SJonathan Adams 	vdev_disk_ioctl_free,
87122fe2c88SJonathan Adams 	zio_vsd_default_cksum_report
87222fe2c88SJonathan Adams };
87322fe2c88SJonathan Adams 
874fa9e4066Sahrens static void
875fa9e4066Sahrens vdev_disk_ioctl_done(void *zio_arg, int error)
876fa9e4066Sahrens {
877fa9e4066Sahrens 	zio_t *zio = zio_arg;
878fa9e4066Sahrens 
879fa9e4066Sahrens 	zio->io_error = error;
880fa9e4066Sahrens 
881e05725b1Sbonwick 	zio_interrupt(zio);
882fa9e4066Sahrens }
883fa9e4066Sahrens 
884738f37bcSGeorge Wilson static void
885fa9e4066Sahrens vdev_disk_io_start(zio_t *zio)
886fa9e4066Sahrens {
887fa9e4066Sahrens 	vdev_t *vd = zio->io_vd;
888fa9e4066Sahrens 	vdev_disk_t *dvd = vd->vdev_tsd;
889084fd14fSBrian Behlendorf 	unsigned long trim_flags = 0;
89031d7e8faSGeorge Wilson 	vdev_buf_t *vb;
891e14bb325SJeff Bonwick 	struct dk_callback *dkc;
892fa9e4066Sahrens 	buf_t *bp;
893e14bb325SJeff Bonwick 	int error;
894fa9e4066Sahrens 
89539cddb10SJoshua M. Clulow 	/*
89639cddb10SJoshua M. Clulow 	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
89739cddb10SJoshua M. Clulow 	 * Nothing to be done here but return failure.
89839cddb10SJoshua M. Clulow 	 */
8991b500975SMike Gerdts 	if (dvd == NULL || dvd->vd_ldi_offline) {
90039cddb10SJoshua M. Clulow 		zio->io_error = ENXIO;
901738f37bcSGeorge Wilson 		zio_interrupt(zio);
902738f37bcSGeorge Wilson 		return;
90339cddb10SJoshua M. Clulow 	}
90439cddb10SJoshua M. Clulow 
905084fd14fSBrian Behlendorf 	switch (zio->io_type) {
906084fd14fSBrian Behlendorf 	case ZIO_TYPE_IOCTL:
907fa9e4066Sahrens 		/* XXPOLICY */
9080a4e9518Sgw 		if (!vdev_readable(vd)) {
909be6fd75aSMatthew Ahrens 			zio->io_error = SET_ERROR(ENXIO);
910738f37bcSGeorge Wilson 			zio_interrupt(zio);
911738f37bcSGeorge Wilson 			return;
912fa9e4066Sahrens 		}
913fa9e4066Sahrens 
914fa9e4066Sahrens 		switch (zio->io_cmd) {
915fa9e4066Sahrens 
916fa9e4066Sahrens 		case DKIOCFLUSHWRITECACHE:
917fa9e4066Sahrens 
918a2eea2e1Sahrens 			if (zfs_nocacheflush)
919a2eea2e1Sahrens 				break;
920a2eea2e1Sahrens 
921b468a217Seschrock 			if (vd->vdev_nowritecache) {
922be6fd75aSMatthew Ahrens 				zio->io_error = SET_ERROR(ENOTSUP);
923b468a217Seschrock 				break;
924b468a217Seschrock 			}
925b468a217Seschrock 
926e14bb325SJeff Bonwick 			zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
92722fe2c88SJonathan Adams 			zio->io_vsd_ops = &vdev_disk_vsd_ops;
928e14bb325SJeff Bonwick 
929e14bb325SJeff Bonwick 			dkc->dkc_callback = vdev_disk_ioctl_done;
930e14bb325SJeff Bonwick 			dkc->dkc_flag = FLUSH_VOLATILE;
931e14bb325SJeff Bonwick 			dkc->dkc_cookie = zio;
932fa9e4066Sahrens 
933fa9e4066Sahrens 			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
934e14bb325SJeff Bonwick 			    (uintptr_t)dkc, FKIOCTL, kcred, NULL);
935fa9e4066Sahrens 
936fa9e4066Sahrens 			if (error == 0) {
937fa9e4066Sahrens 				/*
938fa9e4066Sahrens 				 * The ioctl will be done asychronously,
939fa9e4066Sahrens 				 * and will call vdev_disk_ioctl_done()
940fa9e4066Sahrens 				 * upon completion.
941fa9e4066Sahrens 				 */
942738f37bcSGeorge Wilson 				return;
943e05725b1Sbonwick 			}
944e05725b1Sbonwick 
945fa9e4066Sahrens 			zio->io_error = error;
946b468a217Seschrock 
947fa9e4066Sahrens 			break;
948fa9e4066Sahrens 
949fa9e4066Sahrens 		default:
950be6fd75aSMatthew Ahrens 			zio->io_error = SET_ERROR(ENOTSUP);
951fa9e4066Sahrens 		}
952fa9e4066Sahrens 
953738f37bcSGeorge Wilson 		zio_execute(zio);
954738f37bcSGeorge Wilson 		return;
955084fd14fSBrian Behlendorf 
956084fd14fSBrian Behlendorf 	case ZIO_TYPE_TRIM:
957fb05b94aSJerry Jelinek 		if (zfs_no_trim == 1 || !vd->vdev_has_trim) {
958084fd14fSBrian Behlendorf 			zio->io_error = SET_ERROR(ENOTSUP);
959084fd14fSBrian Behlendorf 			zio_execute(zio);
960084fd14fSBrian Behlendorf 			return;
961084fd14fSBrian Behlendorf 		}
962084fd14fSBrian Behlendorf 		/* Currently only supported on ZoL. */
963084fd14fSBrian Behlendorf 		ASSERT0(zio->io_trim_flags & ZIO_TRIM_SECURE);
964084fd14fSBrian Behlendorf 
965084fd14fSBrian Behlendorf 		/* dkioc_free_list_t is already declared to hold one entry */
966084fd14fSBrian Behlendorf 		dkioc_free_list_t dfl;
967084fd14fSBrian Behlendorf 		dfl.dfl_flags = 0;
968084fd14fSBrian Behlendorf 		dfl.dfl_num_exts = 1;
969d0562c10SJerry Jelinek 		dfl.dfl_offset = 0;
970084fd14fSBrian Behlendorf 		dfl.dfl_exts[0].dfle_start = zio->io_offset;
971084fd14fSBrian Behlendorf 		dfl.dfl_exts[0].dfle_length = zio->io_size;
972084fd14fSBrian Behlendorf 
973084fd14fSBrian Behlendorf 		zio->io_error = ldi_ioctl(dvd->vd_lh, DKIOCFREE,
974084fd14fSBrian Behlendorf 		    (uintptr_t)&dfl, FKIOCTL, kcred, NULL);
975084fd14fSBrian Behlendorf 
976084fd14fSBrian Behlendorf 		if (zio->io_error == ENOTSUP || zio->io_error == ENOTTY) {
977084fd14fSBrian Behlendorf 			/*
978084fd14fSBrian Behlendorf 			 * The device must have changed and now TRIM is
979084fd14fSBrian Behlendorf 			 * no longer supported.
980084fd14fSBrian Behlendorf 			 */
981084fd14fSBrian Behlendorf 			vd->vdev_has_trim = B_FALSE;
982084fd14fSBrian Behlendorf 		}
983084fd14fSBrian Behlendorf 
984084fd14fSBrian Behlendorf 		zio_interrupt(zio);
985084fd14fSBrian Behlendorf 		return;
986fa9e4066Sahrens 	}
987fa9e4066Sahrens 
988f693d300SSteven Hartland 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
98997e81309SPrakash Surya 	zio->io_target_timestamp = zio_handle_io_delay(zio);
990f693d300SSteven Hartland 
99131d7e8faSGeorge Wilson 	vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
992fa9e4066Sahrens 
99331d7e8faSGeorge Wilson 	vb->vb_io = zio;
99431d7e8faSGeorge Wilson 	bp = &vb->vb_buf;
995fa9e4066Sahrens 
996fa9e4066Sahrens 	bioinit(bp);
997e14bb325SJeff Bonwick 	bp->b_flags = B_BUSY | B_NOCACHE |
9988956713aSEric Schrock 	    (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
9998956713aSEric Schrock 	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
10008956713aSEric Schrock 		bp->b_flags |= B_FAILFAST;
1001fa9e4066Sahrens 	bp->b_bcount = zio->io_size;
1002770499e1SDan Kimmel 
1003770499e1SDan Kimmel 	if (zio->io_type == ZIO_TYPE_READ) {
1004770499e1SDan Kimmel 		bp->b_un.b_addr =
1005770499e1SDan Kimmel 		    abd_borrow_buf(zio->io_abd, zio->io_size);
1006770499e1SDan Kimmel 	} else {
1007770499e1SDan Kimmel 		bp->b_un.b_addr =
1008770499e1SDan Kimmel 		    abd_borrow_buf_copy(zio->io_abd, zio->io_size);
1009770499e1SDan Kimmel 	}
1010770499e1SDan Kimmel 
1011fa9e4066Sahrens 	bp->b_lblkno = lbtodb(zio->io_offset);
1012fa9e4066Sahrens 	bp->b_bufsize = zio->io_size;
1013c62757b2SToomas Soome 	bp->b_iodone = vdev_disk_io_intr;
1014fa9e4066Sahrens 
1015fa88c70fSJerry Jelinek 	/*
1016fa88c70fSJerry Jelinek 	 * In general we would expect ldi_strategy() to return non-zero only
1017fa88c70fSJerry Jelinek 	 * because of programming errors, but we've also seen this fail shortly
1018fa88c70fSJerry Jelinek 	 * after a disk dies.
1019fa88c70fSJerry Jelinek 	 */
1020fa88c70fSJerry Jelinek 	if (ldi_strategy(dvd->vd_lh, bp) != 0) {
1021fa88c70fSJerry Jelinek 		zio->io_error = ENXIO;
1022fa88c70fSJerry Jelinek 		zio_interrupt(zio);
1023fa88c70fSJerry Jelinek 	}
1024fa9e4066Sahrens }
1025fa9e4066Sahrens 
1026e14bb325SJeff Bonwick static void
1027fa9e4066Sahrens vdev_disk_io_done(zio_t *zio)
1028fa9e4066Sahrens {
1029e14bb325SJeff Bonwick 	vdev_t *vd = zio->io_vd;
1030ea8dc4b6Seschrock 
10313d7072f8Seschrock 	/*
10323d7072f8Seschrock 	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
10333d7072f8Seschrock 	 * the device has been removed.  If this is the case, then we trigger an
10340a4e9518Sgw 	 * asynchronous removal of the device. Otherwise, probe the device and
10351f7ad2e1Sgw 	 * make sure it's still accessible.
10363d7072f8Seschrock 	 */
10371d713200SEric Schrock 	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
10380a4e9518Sgw 		vdev_disk_t *dvd = vd->vdev_tsd;
1039e14bb325SJeff Bonwick 		int state = DKIO_NONE;
10400a4e9518Sgw 
1041e14bb325SJeff Bonwick 		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
1042e14bb325SJeff Bonwick 		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
10431d713200SEric Schrock 			/*
10441d713200SEric Schrock 			 * We post the resource as soon as possible, instead of
10451d713200SEric Schrock 			 * when the async removal actually happens, because the
10461d713200SEric Schrock 			 * DE is using this information to discard previous I/O
10471d713200SEric Schrock 			 * errors.
10481d713200SEric Schrock 			 */
10491d713200SEric Schrock 			zfs_post_remove(zio->io_spa, vd);
10503d7072f8Seschrock 			vd->vdev_remove_wanted = B_TRUE;
10513d7072f8Seschrock 			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
105298d1cbfeSGeorge Wilson 		} else if (!vd->vdev_delayed_close) {
105398d1cbfeSGeorge Wilson 			vd->vdev_delayed_close = B_TRUE;
10543d7072f8Seschrock 		}
10553d7072f8Seschrock 	}
1056fa9e4066Sahrens }
1057fa9e4066Sahrens 
1058fa9e4066Sahrens vdev_ops_t vdev_disk_ops = {
1059a3874b8bSToomas Soome 	.vdev_op_open = vdev_disk_open,
1060a3874b8bSToomas Soome 	.vdev_op_close = vdev_disk_close,
1061a3874b8bSToomas Soome 	.vdev_op_asize = vdev_default_asize,
1062a3874b8bSToomas Soome 	.vdev_op_io_start = vdev_disk_io_start,
1063a3874b8bSToomas Soome 	.vdev_op_io_done = vdev_disk_io_done,
1064a3874b8bSToomas Soome 	.vdev_op_state_change = NULL,
1065a3874b8bSToomas Soome 	.vdev_op_need_resilver = NULL,
1066a3874b8bSToomas Soome 	.vdev_op_hold = vdev_disk_hold,
1067a3874b8bSToomas Soome 	.vdev_op_rele = vdev_disk_rele,
1068a3874b8bSToomas Soome 	.vdev_op_remap = NULL,
1069a3874b8bSToomas Soome 	.vdev_op_xlate = vdev_default_xlate,
1070ac04831dSMike Gerdts 	.vdev_op_dumpio = vdev_disk_dumpio,
1071a3874b8bSToomas Soome 	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
1072a3874b8bSToomas Soome 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
1073fa9e4066Sahrens };
1074e7cbe64fSgw 
1075e7cbe64fSgw /*
1076051aabe6Staylor  * Given the root disk device devid or pathname, read the label from
1077051aabe6Staylor  * the device, and construct a configuration nvlist.
1078e7cbe64fSgw  */
1079f940fbb1SLin Ling int
1080*30c304d9SJoshua M. Clulow vdev_disk_read_rootlabel(const char *devpath, const char *devid,
1081*30c304d9SJoshua M. Clulow     nvlist_t **config)
1082e7cbe64fSgw {
1083e7cbe64fSgw 	ldi_handle_t vd_lh;
1084e7cbe64fSgw 	vdev_label_t *label;
1085e7cbe64fSgw 	uint64_t s, size;
1086e7cbe64fSgw 	int l;
1087051aabe6Staylor 	ddi_devid_t tmpdevid;
1088f4565e39SLin Ling 	int error = -1;
1089051aabe6Staylor 	char *minor_name;
1090e7cbe64fSgw 
1091e7cbe64fSgw 	/*
1092e7cbe64fSgw 	 * Read the device label and build the nvlist.
1093e7cbe64fSgw 	 */
1094*30c304d9SJoshua M. Clulow 	if (devid != NULL && ddi_devid_str_decode((char *)devid, &tmpdevid,
1095051aabe6Staylor 	    &minor_name) == 0) {
1096051aabe6Staylor 		error = ldi_open_by_devid(tmpdevid, minor_name,
10978ad4d6ddSJeff Bonwick 		    FREAD, kcred, &vd_lh, zfs_li);
1098051aabe6Staylor 		ddi_devid_free(tmpdevid);
1099051aabe6Staylor 		ddi_devid_str_free(minor_name);
1100051aabe6Staylor 	}
1101051aabe6Staylor 
1102*30c304d9SJoshua M. Clulow 	if (error != 0 && (error = ldi_open_by_name((char *)devpath, FREAD,
1103*30c304d9SJoshua M. Clulow 	    kcred, &vd_lh, zfs_li)) != 0) {
1104f940fbb1SLin Ling 		return (error);
1105*30c304d9SJoshua M. Clulow 	}
1106e7cbe64fSgw 
1107bf82a41bSeschrock 	if (ldi_get_size(vd_lh, &s)) {
1108bf82a41bSeschrock 		(void) ldi_close(vd_lh, FREAD, kcred);
1109be6fd75aSMatthew Ahrens 		return (SET_ERROR(EIO));
1110bf82a41bSeschrock 	}
1111e7cbe64fSgw 
1112e7cbe64fSgw 	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
1113e7cbe64fSgw 	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
1114e7cbe64fSgw 
111517f1e64aSEric Taylor 	*config = NULL;
1116e7cbe64fSgw 	for (l = 0; l < VDEV_LABELS; l++) {
1117e7cbe64fSgw 		uint64_t offset, state, txg = 0;
1118e7cbe64fSgw 
1119e7cbe64fSgw 		/* read vdev label */
1120e7cbe64fSgw 		offset = vdev_label_offset(size, l, 0);
1121810e43b2SBill Pijewski 		if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
11222264ca7fSLin Ling 		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
1123e7cbe64fSgw 			continue;
1124e7cbe64fSgw 
1125e7cbe64fSgw 		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
1126f940fbb1SLin Ling 		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
1127f940fbb1SLin Ling 			*config = NULL;
1128e7cbe64fSgw 			continue;
1129e7cbe64fSgw 		}
1130e7cbe64fSgw 
1131f940fbb1SLin Ling 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
1132e7cbe64fSgw 		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
1133f940fbb1SLin Ling 			nvlist_free(*config);
1134f940fbb1SLin Ling 			*config = NULL;
1135e7cbe64fSgw 			continue;
1136e7cbe64fSgw 		}
1137e7cbe64fSgw 
1138f940fbb1SLin Ling 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
1139e7cbe64fSgw 		    &txg) != 0 || txg == 0) {
1140f940fbb1SLin Ling 			nvlist_free(*config);
1141f940fbb1SLin Ling 			*config = NULL;
1142e7cbe64fSgw 			continue;
1143e7cbe64fSgw 		}
1144e7cbe64fSgw 
1145e7cbe64fSgw 		break;
1146e7cbe64fSgw 	}
1147e7cbe64fSgw 
1148e7cbe64fSgw 	kmem_free(label, sizeof (vdev_label_t));
1149bf82a41bSeschrock 	(void) ldi_close(vd_lh, FREAD, kcred);
115017f1e64aSEric Taylor 	if (*config == NULL)
1151be6fd75aSMatthew Ahrens 		error = SET_ERROR(EIDRM);
1152bf82a41bSeschrock 
1153f940fbb1SLin Ling 	return (error);
1154e7cbe64fSgw }
1155*30c304d9SJoshua M. Clulow 
1156*30c304d9SJoshua M. Clulow struct veb {
1157*30c304d9SJoshua M. Clulow 	list_t veb_ents;
1158*30c304d9SJoshua M. Clulow 	boolean_t veb_scanned;
1159*30c304d9SJoshua M. Clulow };
1160*30c304d9SJoshua M. Clulow 
1161*30c304d9SJoshua M. Clulow struct veb_ent {
1162*30c304d9SJoshua M. Clulow 	uint64_t vebe_pool_guid;
1163*30c304d9SJoshua M. Clulow 	uint64_t vebe_vdev_guid;
1164*30c304d9SJoshua M. Clulow 
1165*30c304d9SJoshua M. Clulow 	char *vebe_devpath;
1166*30c304d9SJoshua M. Clulow 
1167*30c304d9SJoshua M. Clulow 	list_node_t vebe_link;
1168*30c304d9SJoshua M. Clulow };
1169*30c304d9SJoshua M. Clulow 
1170*30c304d9SJoshua M. Clulow static kmutex_t veb_lock;
1171*30c304d9SJoshua M. Clulow static struct veb *veb;
1172*30c304d9SJoshua M. Clulow 
1173*30c304d9SJoshua M. Clulow static int
1174*30c304d9SJoshua M. Clulow vdev_disk_preroot_scan_walk(const char *devpath, void *arg)
1175*30c304d9SJoshua M. Clulow {
1176*30c304d9SJoshua M. Clulow 	int r;
1177*30c304d9SJoshua M. Clulow 	nvlist_t *cfg = NULL;
1178*30c304d9SJoshua M. Clulow 	uint64_t pguid = 0, vguid = 0;
1179*30c304d9SJoshua M. Clulow 
1180*30c304d9SJoshua M. Clulow 	/*
1181*30c304d9SJoshua M. Clulow 	 * Attempt to read the label from this block device.
1182*30c304d9SJoshua M. Clulow 	 */
1183*30c304d9SJoshua M. Clulow 	if ((r = vdev_disk_read_rootlabel(devpath, NULL, &cfg)) != 0) {
1184*30c304d9SJoshua M. Clulow 		/*
1185*30c304d9SJoshua M. Clulow 		 * Many of the available block devices will represent slices or
1186*30c304d9SJoshua M. Clulow 		 * partitions of disks, or may represent disks that are not at
1187*30c304d9SJoshua M. Clulow 		 * all initialised with ZFS.  As this is a best effort
1188*30c304d9SJoshua M. Clulow 		 * mechanism to locate an alternate path to a particular vdev,
1189*30c304d9SJoshua M. Clulow 		 * we will ignore any failures and keep scanning.
1190*30c304d9SJoshua M. Clulow 		 */
1191*30c304d9SJoshua M. Clulow 		return (PREROOT_WALK_BLOCK_DEVICES_NEXT);
1192*30c304d9SJoshua M. Clulow 	}
1193*30c304d9SJoshua M. Clulow 
1194*30c304d9SJoshua M. Clulow 	/*
1195*30c304d9SJoshua M. Clulow 	 * Determine the pool and vdev GUID read from the label for this
1196*30c304d9SJoshua M. Clulow 	 * device.  Both values must be present and have a non-zero value.
1197*30c304d9SJoshua M. Clulow 	 */
1198*30c304d9SJoshua M. Clulow 	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pguid) != 0 ||
1199*30c304d9SJoshua M. Clulow 	    nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_GUID, &vguid) != 0 ||
1200*30c304d9SJoshua M. Clulow 	    pguid == 0 || vguid == 0) {
1201*30c304d9SJoshua M. Clulow 		/*
1202*30c304d9SJoshua M. Clulow 		 * This label was not complete.
1203*30c304d9SJoshua M. Clulow 		 */
1204*30c304d9SJoshua M. Clulow 		goto out;
1205*30c304d9SJoshua M. Clulow 	}
1206*30c304d9SJoshua M. Clulow 
1207*30c304d9SJoshua M. Clulow 	/*
1208*30c304d9SJoshua M. Clulow 	 * Keep track of all of the GUID-to-devpath mappings we find so that
1209*30c304d9SJoshua M. Clulow 	 * vdev_disk_preroot_lookup() can search them.
1210*30c304d9SJoshua M. Clulow 	 */
1211*30c304d9SJoshua M. Clulow 	struct veb_ent *vebe = kmem_zalloc(sizeof (*vebe), KM_SLEEP);
1212*30c304d9SJoshua M. Clulow 	vebe->vebe_pool_guid = pguid;
1213*30c304d9SJoshua M. Clulow 	vebe->vebe_vdev_guid = vguid;
1214*30c304d9SJoshua M. Clulow 	vebe->vebe_devpath = spa_strdup(devpath);
1215*30c304d9SJoshua M. Clulow 
1216*30c304d9SJoshua M. Clulow 	list_insert_tail(&veb->veb_ents, vebe);
1217*30c304d9SJoshua M. Clulow 
1218*30c304d9SJoshua M. Clulow out:
1219*30c304d9SJoshua M. Clulow 	nvlist_free(cfg);
1220*30c304d9SJoshua M. Clulow 	return (PREROOT_WALK_BLOCK_DEVICES_NEXT);
1221*30c304d9SJoshua M. Clulow }
1222*30c304d9SJoshua M. Clulow 
1223*30c304d9SJoshua M. Clulow const char *
1224*30c304d9SJoshua M. Clulow vdev_disk_preroot_lookup(uint64_t pool_guid, uint64_t vdev_guid)
1225*30c304d9SJoshua M. Clulow {
1226*30c304d9SJoshua M. Clulow 	if (pool_guid == 0 || vdev_guid == 0) {
1227*30c304d9SJoshua M. Clulow 		/*
1228*30c304d9SJoshua M. Clulow 		 * If we aren't provided both a pool and a vdev GUID, we cannot
1229*30c304d9SJoshua M. Clulow 		 * perform a lookup.
1230*30c304d9SJoshua M. Clulow 		 */
1231*30c304d9SJoshua M. Clulow 		return (NULL);
1232*30c304d9SJoshua M. Clulow 	}
1233*30c304d9SJoshua M. Clulow 
1234*30c304d9SJoshua M. Clulow 	mutex_enter(&veb_lock);
1235*30c304d9SJoshua M. Clulow 	if (veb == NULL) {
1236*30c304d9SJoshua M. Clulow 		/*
1237*30c304d9SJoshua M. Clulow 		 * If vdev_disk_preroot_fini() has been called already, there
1238*30c304d9SJoshua M. Clulow 		 * is nothing we can do.
1239*30c304d9SJoshua M. Clulow 		 */
1240*30c304d9SJoshua M. Clulow 		mutex_exit(&veb_lock);
1241*30c304d9SJoshua M. Clulow 		return (NULL);
1242*30c304d9SJoshua M. Clulow 	}
1243*30c304d9SJoshua M. Clulow 
1244*30c304d9SJoshua M. Clulow 	/*
1245*30c304d9SJoshua M. Clulow 	 * We want to perform at most one scan of all block devices per boot.
1246*30c304d9SJoshua M. Clulow 	 */
1247*30c304d9SJoshua M. Clulow 	if (!veb->veb_scanned) {
1248*30c304d9SJoshua M. Clulow 		cmn_err(CE_NOTE, "Performing full ZFS device scan!");
1249*30c304d9SJoshua M. Clulow 
1250*30c304d9SJoshua M. Clulow 		preroot_walk_block_devices(vdev_disk_preroot_scan_walk, NULL);
1251*30c304d9SJoshua M. Clulow 
1252*30c304d9SJoshua M. Clulow 		veb->veb_scanned = B_TRUE;
1253*30c304d9SJoshua M. Clulow 	}
1254*30c304d9SJoshua M. Clulow 
1255*30c304d9SJoshua M. Clulow 	const char *path = NULL;
1256*30c304d9SJoshua M. Clulow 	for (struct veb_ent *vebe = list_head(&veb->veb_ents); vebe != NULL;
1257*30c304d9SJoshua M. Clulow 	    vebe = list_next(&veb->veb_ents, vebe)) {
1258*30c304d9SJoshua M. Clulow 		if (vebe->vebe_pool_guid == pool_guid &&
1259*30c304d9SJoshua M. Clulow 		    vebe->vebe_vdev_guid == vdev_guid) {
1260*30c304d9SJoshua M. Clulow 			path = vebe->vebe_devpath;
1261*30c304d9SJoshua M. Clulow 			break;
1262*30c304d9SJoshua M. Clulow 		}
1263*30c304d9SJoshua M. Clulow 	}
1264*30c304d9SJoshua M. Clulow 
1265*30c304d9SJoshua M. Clulow 	mutex_exit(&veb_lock);
1266*30c304d9SJoshua M. Clulow 
1267*30c304d9SJoshua M. Clulow 	return (path);
1268*30c304d9SJoshua M. Clulow }
1269*30c304d9SJoshua M. Clulow 
1270*30c304d9SJoshua M. Clulow void
1271*30c304d9SJoshua M. Clulow vdev_disk_preroot_init(void)
1272*30c304d9SJoshua M. Clulow {
1273*30c304d9SJoshua M. Clulow 	mutex_init(&veb_lock, NULL, MUTEX_DEFAULT, NULL);
1274*30c304d9SJoshua M. Clulow 
1275*30c304d9SJoshua M. Clulow 	VERIFY3P(veb, ==, NULL);
1276*30c304d9SJoshua M. Clulow 	veb = kmem_zalloc(sizeof (*veb), KM_SLEEP);
1277*30c304d9SJoshua M. Clulow 	list_create(&veb->veb_ents, sizeof (struct veb_ent),
1278*30c304d9SJoshua M. Clulow 	    offsetof(struct veb_ent, vebe_link));
1279*30c304d9SJoshua M. Clulow 	veb->veb_scanned = B_FALSE;
1280*30c304d9SJoshua M. Clulow }
1281*30c304d9SJoshua M. Clulow 
1282*30c304d9SJoshua M. Clulow void
1283*30c304d9SJoshua M. Clulow vdev_disk_preroot_fini(void)
1284*30c304d9SJoshua M. Clulow {
1285*30c304d9SJoshua M. Clulow 	mutex_enter(&veb_lock);
1286*30c304d9SJoshua M. Clulow 
1287*30c304d9SJoshua M. Clulow 	if (veb != NULL) {
1288*30c304d9SJoshua M. Clulow 		while (!list_is_empty(&veb->veb_ents)) {
1289*30c304d9SJoshua M. Clulow 			struct veb_ent *vebe = list_remove_head(&veb->veb_ents);
1290*30c304d9SJoshua M. Clulow 
1291*30c304d9SJoshua M. Clulow 			spa_strfree(vebe->vebe_devpath);
1292*30c304d9SJoshua M. Clulow 
1293*30c304d9SJoshua M. Clulow 			kmem_free(vebe, sizeof (*vebe));
1294*30c304d9SJoshua M. Clulow 		}
1295*30c304d9SJoshua M. Clulow 
1296*30c304d9SJoshua M. Clulow 		kmem_free(veb, sizeof (*veb));
1297*30c304d9SJoshua M. Clulow 		veb = NULL;
1298*30c304d9SJoshua M. Clulow 	}
1299*30c304d9SJoshua M. Clulow 
1300*30c304d9SJoshua M. Clulow 	mutex_exit(&veb_lock);
1301*30c304d9SJoshua M. Clulow }
1302