xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_disk.c (revision 1724dc7b)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5bef6b7d2Swebaker  * Common Development and Distribution License (the "License").
6bef6b7d2Swebaker  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
21fa9e4066Sahrens /*
22f13665b7Sbo zhou - Sun Microsystems - Beijing China  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
234263d13fSGeorge Wilson  * Copyright (c) 2012 by Delphix. All rights reserved.
24fa9e4066Sahrens  */
25fa9e4066Sahrens 
26fa9e4066Sahrens #include <sys/zfs_context.h>
27dcba9f3fSGeorge Wilson #include <sys/spa_impl.h>
28e7cbe64fSgw #include <sys/refcount.h>
29fa9e4066Sahrens #include <sys/vdev_disk.h>
30fa9e4066Sahrens #include <sys/vdev_impl.h>
31fa9e4066Sahrens #include <sys/fs/zfs.h>
32fa9e4066Sahrens #include <sys/zio.h>
33afefbcddSeschrock #include <sys/sunldi.h>
344263d13fSGeorge Wilson #include <sys/efi_partition.h>
3551ece835Seschrock #include <sys/fm/fs/zfs.h>
36fa9e4066Sahrens 
37fa9e4066Sahrens /*
38fa9e4066Sahrens  * Virtual device vector for disks.
39fa9e4066Sahrens  */
40fa9e4066Sahrens 
41fa9e4066Sahrens extern ldi_ident_t zfs_li;
42fa9e4066Sahrens 
43fa9e4066Sahrens typedef struct vdev_disk_buf {
44fa9e4066Sahrens 	buf_t	vdb_buf;
45fa9e4066Sahrens 	zio_t	*vdb_io;
46fa9e4066Sahrens } vdev_disk_buf_t;
47fa9e4066Sahrens 
48dcba9f3fSGeorge Wilson static void
49dcba9f3fSGeorge Wilson vdev_disk_hold(vdev_t *vd)
50dcba9f3fSGeorge Wilson {
51dcba9f3fSGeorge Wilson 	ddi_devid_t devid;
52dcba9f3fSGeorge Wilson 	char *minor;
53dcba9f3fSGeorge Wilson 
54dcba9f3fSGeorge Wilson 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
55dcba9f3fSGeorge Wilson 
56dcba9f3fSGeorge Wilson 	/*
57dcba9f3fSGeorge Wilson 	 * We must have a pathname, and it must be absolute.
58dcba9f3fSGeorge Wilson 	 */
59dcba9f3fSGeorge Wilson 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
60dcba9f3fSGeorge Wilson 		return;
61dcba9f3fSGeorge Wilson 
62dcba9f3fSGeorge Wilson 	/*
63dcba9f3fSGeorge Wilson 	 * Only prefetch path and devid info if the device has
64dcba9f3fSGeorge Wilson 	 * never been opened.
65dcba9f3fSGeorge Wilson 	 */
66dcba9f3fSGeorge Wilson 	if (vd->vdev_tsd != NULL)
67dcba9f3fSGeorge Wilson 		return;
68dcba9f3fSGeorge Wilson 
69dcba9f3fSGeorge Wilson 	if (vd->vdev_wholedisk == -1ULL) {
70dcba9f3fSGeorge Wilson 		size_t len = strlen(vd->vdev_path) + 3;
71dcba9f3fSGeorge Wilson 		char *buf = kmem_alloc(len, KM_SLEEP);
72dcba9f3fSGeorge Wilson 
73dcba9f3fSGeorge Wilson 		(void) snprintf(buf, len, "%ss0", vd->vdev_path);
74dcba9f3fSGeorge Wilson 
75dcba9f3fSGeorge Wilson 		(void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
76dcba9f3fSGeorge Wilson 		kmem_free(buf, len);
77dcba9f3fSGeorge Wilson 	}
78dcba9f3fSGeorge Wilson 
79dcba9f3fSGeorge Wilson 	if (vd->vdev_name_vp == NULL)
80dcba9f3fSGeorge Wilson 		(void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);
81dcba9f3fSGeorge Wilson 
82dcba9f3fSGeorge Wilson 	if (vd->vdev_devid != NULL &&
83dcba9f3fSGeorge Wilson 	    ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
84dcba9f3fSGeorge Wilson 		(void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
85dcba9f3fSGeorge Wilson 		ddi_devid_str_free(minor);
86dcba9f3fSGeorge Wilson 		ddi_devid_free(devid);
87dcba9f3fSGeorge Wilson 	}
88dcba9f3fSGeorge Wilson }
89dcba9f3fSGeorge Wilson 
90dcba9f3fSGeorge Wilson static void
91dcba9f3fSGeorge Wilson vdev_disk_rele(vdev_t *vd)
92dcba9f3fSGeorge Wilson {
93dcba9f3fSGeorge Wilson 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
94dcba9f3fSGeorge Wilson 
95dcba9f3fSGeorge Wilson 	if (vd->vdev_name_vp) {
96dcba9f3fSGeorge Wilson 		VN_RELE_ASYNC(vd->vdev_name_vp,
97dcba9f3fSGeorge Wilson 		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
98dcba9f3fSGeorge Wilson 		vd->vdev_name_vp = NULL;
99dcba9f3fSGeorge Wilson 	}
100dcba9f3fSGeorge Wilson 	if (vd->vdev_devid_vp) {
101dcba9f3fSGeorge Wilson 		VN_RELE_ASYNC(vd->vdev_devid_vp,
102dcba9f3fSGeorge Wilson 		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
103dcba9f3fSGeorge Wilson 		vd->vdev_devid_vp = NULL;
104dcba9f3fSGeorge Wilson 	}
105dcba9f3fSGeorge Wilson }
106dcba9f3fSGeorge Wilson 
1074263d13fSGeorge Wilson static uint64_t
1084263d13fSGeorge Wilson vdev_disk_get_space(vdev_t *vd, uint64_t capacity, uint_t blksz)
1094263d13fSGeorge Wilson {
1104263d13fSGeorge Wilson 	ASSERT(vd->vdev_wholedisk);
1114263d13fSGeorge Wilson 
1124263d13fSGeorge Wilson 	vdev_disk_t *dvd = vd->vdev_tsd;
1134263d13fSGeorge Wilson 	dk_efi_t dk_ioc;
1144263d13fSGeorge Wilson 	efi_gpt_t *efi;
1154263d13fSGeorge Wilson 	uint64_t avail_space = 0;
1164263d13fSGeorge Wilson 	int efisize = EFI_LABEL_SIZE * 2;
1174263d13fSGeorge Wilson 
1184263d13fSGeorge Wilson 	dk_ioc.dki_data = kmem_alloc(efisize, KM_SLEEP);
1194263d13fSGeorge Wilson 	dk_ioc.dki_lba = 1;
1204263d13fSGeorge Wilson 	dk_ioc.dki_length = efisize;
1214263d13fSGeorge Wilson 	dk_ioc.dki_data_64 = (uint64_t)(uintptr_t)dk_ioc.dki_data;
1224263d13fSGeorge Wilson 	efi = dk_ioc.dki_data;
1234263d13fSGeorge Wilson 
1244263d13fSGeorge Wilson 	if (ldi_ioctl(dvd->vd_lh, DKIOCGETEFI, (intptr_t)&dk_ioc,
1254263d13fSGeorge Wilson 	    FKIOCTL, kcred, NULL) == 0) {
1264263d13fSGeorge Wilson 		uint64_t efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA);
1274263d13fSGeorge Wilson 
1284263d13fSGeorge Wilson 		zfs_dbgmsg("vdev %s, capacity %llu, altern lba %llu",
1294263d13fSGeorge Wilson 		    vd->vdev_path, capacity, efi_altern_lba);
1304263d13fSGeorge Wilson 		if (capacity > efi_altern_lba)
1314263d13fSGeorge Wilson 			avail_space = (capacity - efi_altern_lba) * blksz;
1324263d13fSGeorge Wilson 	}
1334263d13fSGeorge Wilson 	kmem_free(dk_ioc.dki_data, efisize);
1344263d13fSGeorge Wilson 	return (avail_space);
1354263d13fSGeorge Wilson }
1364263d13fSGeorge Wilson 
137fa9e4066Sahrens static int
1384263d13fSGeorge Wilson vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
1394263d13fSGeorge Wilson     uint64_t *ashift)
140fa9e4066Sahrens {
1418ad4d6ddSJeff Bonwick 	spa_t *spa = vd->vdev_spa;
142fa9e4066Sahrens 	vdev_disk_t *dvd;
143f13665b7Sbo zhou - Sun Microsystems - Beijing China 	struct dk_minfo_ext dkmext;
1440a4e9518Sgw 	int error;
145e14bb325SJeff Bonwick 	dev_t dev;
146e14bb325SJeff Bonwick 	int otyp;
147fa9e4066Sahrens 
148fa9e4066Sahrens 	/*
149fa9e4066Sahrens 	 * We must have a pathname, and it must be absolute.
150fa9e4066Sahrens 	 */
151fa9e4066Sahrens 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
152fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
153fa9e4066Sahrens 		return (EINVAL);
154fa9e4066Sahrens 	}
155fa9e4066Sahrens 
156095bcd66SGeorge Wilson 	/*
157095bcd66SGeorge Wilson 	 * Reopen the device if it's not currently open. Otherwise,
158095bcd66SGeorge Wilson 	 * just update the physical size of the device.
159095bcd66SGeorge Wilson 	 */
160095bcd66SGeorge Wilson 	if (vd->vdev_tsd != NULL) {
161095bcd66SGeorge Wilson 		ASSERT(vd->vdev_reopening);
162095bcd66SGeorge Wilson 		dvd = vd->vdev_tsd;
163095bcd66SGeorge Wilson 		goto skip_open;
164095bcd66SGeorge Wilson 	}
165095bcd66SGeorge Wilson 
166fa9e4066Sahrens 	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
167fa9e4066Sahrens 
168fa9e4066Sahrens 	/*
169fa9e4066Sahrens 	 * When opening a disk device, we want to preserve the user's original
170fa9e4066Sahrens 	 * intent.  We always want to open the device by the path the user gave
171*1724dc7bSJoshua M. Clulow 	 * us, even if it is one of multiple paths to the same device.  But we
172fa9e4066Sahrens 	 * also want to be able to survive disks being removed/recabled.
173fa9e4066Sahrens 	 * Therefore the sequence of opening devices is:
174fa9e4066Sahrens 	 *
175afefbcddSeschrock 	 * 1. Try opening the device by path.  For legacy pools without the
176afefbcddSeschrock 	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
177fa9e4066Sahrens 	 *
178fa9e4066Sahrens 	 * 2. If the devid of the device matches the stored value, return
179fa9e4066Sahrens 	 *    success.
180fa9e4066Sahrens 	 *
181fa9e4066Sahrens 	 * 3. Otherwise, the device may have moved.  Try opening the device
182fa9e4066Sahrens 	 *    by the devid instead.
183fa9e4066Sahrens 	 */
184fa9e4066Sahrens 	if (vd->vdev_devid != NULL) {
185fa9e4066Sahrens 		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
186fa9e4066Sahrens 		    &dvd->vd_minor) != 0) {
187fa9e4066Sahrens 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
188fa9e4066Sahrens 			return (EINVAL);
189fa9e4066Sahrens 		}
190fa9e4066Sahrens 	}
191fa9e4066Sahrens 
192fa9e4066Sahrens 	error = EINVAL;		/* presume failure */
193fa9e4066Sahrens 
194095bcd66SGeorge Wilson 	if (vd->vdev_path != NULL) {
195fa9e4066Sahrens 		ddi_devid_t devid;
196fa9e4066Sahrens 
197afefbcddSeschrock 		if (vd->vdev_wholedisk == -1ULL) {
198afefbcddSeschrock 			size_t len = strlen(vd->vdev_path) + 3;
199afefbcddSeschrock 			char *buf = kmem_alloc(len, KM_SLEEP);
200afefbcddSeschrock 			ldi_handle_t lh;
201afefbcddSeschrock 
202afefbcddSeschrock 			(void) snprintf(buf, len, "%ss0", vd->vdev_path);
203afefbcddSeschrock 
2048ad4d6ddSJeff Bonwick 			if (ldi_open_by_name(buf, spa_mode(spa), kcred,
205afefbcddSeschrock 			    &lh, zfs_li) == 0) {
206afefbcddSeschrock 				spa_strfree(vd->vdev_path);
207afefbcddSeschrock 				vd->vdev_path = buf;
208afefbcddSeschrock 				vd->vdev_wholedisk = 1ULL;
2098ad4d6ddSJeff Bonwick 				(void) ldi_close(lh, spa_mode(spa), kcred);
210afefbcddSeschrock 			} else {
211afefbcddSeschrock 				kmem_free(buf, len);
212afefbcddSeschrock 			}
213afefbcddSeschrock 		}
214fa9e4066Sahrens 
2158ad4d6ddSJeff Bonwick 		error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred,
216afefbcddSeschrock 		    &dvd->vd_lh, zfs_li);
217fa9e4066Sahrens 
218fa9e4066Sahrens 		/*
219fa9e4066Sahrens 		 * Compare the devid to the stored value.
220fa9e4066Sahrens 		 */
221fa9e4066Sahrens 		if (error == 0 && vd->vdev_devid != NULL &&
222fa9e4066Sahrens 		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
223fa9e4066Sahrens 			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
224fa9e4066Sahrens 				error = EINVAL;
2258ad4d6ddSJeff Bonwick 				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
2268ad4d6ddSJeff Bonwick 				    kcred);
227fa9e4066Sahrens 				dvd->vd_lh = NULL;
228fa9e4066Sahrens 			}
229fa9e4066Sahrens 			ddi_devid_free(devid);
230fa9e4066Sahrens 		}
231afefbcddSeschrock 
232afefbcddSeschrock 		/*
233afefbcddSeschrock 		 * If we succeeded in opening the device, but 'vdev_wholedisk'
234afefbcddSeschrock 		 * is not yet set, then this must be a slice.
235afefbcddSeschrock 		 */
236afefbcddSeschrock 		if (error == 0 && vd->vdev_wholedisk == -1ULL)
237afefbcddSeschrock 			vd->vdev_wholedisk = 0;
238fa9e4066Sahrens 	}
239fa9e4066Sahrens 
240fa9e4066Sahrens 	/*
241fa9e4066Sahrens 	 * If we were unable to open by path, or the devid check fails, open by
242fa9e4066Sahrens 	 * devid instead.
243fa9e4066Sahrens 	 */
244fa9e4066Sahrens 	if (error != 0 && vd->vdev_devid != NULL)
245fa9e4066Sahrens 		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
2468ad4d6ddSJeff Bonwick 		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
247fa9e4066Sahrens 
2483d7072f8Seschrock 	/*
2493d7072f8Seschrock 	 * If all else fails, then try opening by physical path (if available)
2503d7072f8Seschrock 	 * or the logical path (if we failed due to the devid check).  While not
2513d7072f8Seschrock 	 * as reliable as the devid, this will give us something, and the higher
2523d7072f8Seschrock 	 * level vdev validation will prevent us from opening the wrong device.
2533d7072f8Seschrock 	 */
2543d7072f8Seschrock 	if (error) {
2553d7072f8Seschrock 		if (vd->vdev_physpath != NULL &&
256deb8317bSMark J Musante 		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV)
2578ad4d6ddSJeff Bonwick 			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
2583d7072f8Seschrock 			    kcred, &dvd->vd_lh, zfs_li);
2593d7072f8Seschrock 
2603d7072f8Seschrock 		/*
2613d7072f8Seschrock 		 * Note that we don't support the legacy auto-wholedisk support
2623d7072f8Seschrock 		 * as above.  This hasn't been used in a very long time and we
2633d7072f8Seschrock 		 * don't need to propagate its oddities to this edge condition.
2643d7072f8Seschrock 		 */
265095bcd66SGeorge Wilson 		if (error && vd->vdev_path != NULL)
2668ad4d6ddSJeff Bonwick 			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
2678ad4d6ddSJeff Bonwick 			    kcred, &dvd->vd_lh, zfs_li);
2683d7072f8Seschrock 	}
2693d7072f8Seschrock 
270e14bb325SJeff Bonwick 	if (error) {
271fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
272fa9e4066Sahrens 		return (error);
273e14bb325SJeff Bonwick 	}
274fa9e4066Sahrens 
2753d7072f8Seschrock 	/*
2763d7072f8Seschrock 	 * Once a device is opened, verify that the physical device path (if
2773d7072f8Seschrock 	 * available) is up to date.
2783d7072f8Seschrock 	 */
2793d7072f8Seschrock 	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
2803d7072f8Seschrock 	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
2810a4e9518Sgw 		char *physpath, *minorname;
2820a4e9518Sgw 
2833d7072f8Seschrock 		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2843d7072f8Seschrock 		minorname = NULL;
2853d7072f8Seschrock 		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
2863d7072f8Seschrock 		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
2873d7072f8Seschrock 		    (vd->vdev_physpath == NULL ||
2883d7072f8Seschrock 		    strcmp(vd->vdev_physpath, physpath) != 0)) {
2893d7072f8Seschrock 			if (vd->vdev_physpath)
2903d7072f8Seschrock 				spa_strfree(vd->vdev_physpath);
2913d7072f8Seschrock 			(void) strlcat(physpath, ":", MAXPATHLEN);
2923d7072f8Seschrock 			(void) strlcat(physpath, minorname, MAXPATHLEN);
2933d7072f8Seschrock 			vd->vdev_physpath = spa_strdup(physpath);
2943d7072f8Seschrock 		}
2953d7072f8Seschrock 		if (minorname)
2963d7072f8Seschrock 			kmem_free(minorname, strlen(minorname) + 1);
2973d7072f8Seschrock 		kmem_free(physpath, MAXPATHLEN);
2983d7072f8Seschrock 	}
2993d7072f8Seschrock 
300095bcd66SGeorge Wilson skip_open:
301fa9e4066Sahrens 	/*
302fa9e4066Sahrens 	 * Determine the actual size of the device.
303fa9e4066Sahrens 	 */
304fa9e4066Sahrens 	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
305fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
306fa9e4066Sahrens 		return (EINVAL);
307fa9e4066Sahrens 	}
308fa9e4066Sahrens 
309ecc2d604Sbonwick 	/*
310ecc2d604Sbonwick 	 * Determine the device's minimum transfer size.
311ecc2d604Sbonwick 	 * If the ioctl isn't supported, assume DEV_BSIZE.
312ecc2d604Sbonwick 	 */
313f13665b7Sbo zhou - Sun Microsystems - Beijing China 	if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, (intptr_t)&dkmext,
314ecc2d604Sbonwick 	    FKIOCTL, kcred, NULL) != 0)
315f13665b7Sbo zhou - Sun Microsystems - Beijing China 		dkmext.dki_pbsize = DEV_BSIZE;
316bef6b7d2Swebaker 
317f13665b7Sbo zhou - Sun Microsystems - Beijing China 	*ashift = highbit(MAX(dkmext.dki_pbsize, SPA_MINBLOCKSIZE)) - 1;
318bef6b7d2Swebaker 
3194263d13fSGeorge Wilson 	if (vd->vdev_wholedisk == 1) {
3204263d13fSGeorge Wilson 		uint64_t capacity = dkmext.dki_capacity - 1;
3214263d13fSGeorge Wilson 		uint64_t blksz = dkmext.dki_lbsize;
3224263d13fSGeorge Wilson 		int wce = 1;
3234263d13fSGeorge Wilson 
3244263d13fSGeorge Wilson 		/*
3254263d13fSGeorge Wilson 		 * If we own the whole disk, try to enable disk write caching.
3264263d13fSGeorge Wilson 		 * We ignore errors because it's OK if we can't do it.
3274263d13fSGeorge Wilson 		 */
3284263d13fSGeorge Wilson 		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
3294263d13fSGeorge Wilson 		    FKIOCTL, kcred, NULL);
3304263d13fSGeorge Wilson 
3314263d13fSGeorge Wilson 		*max_psize = *psize + vdev_disk_get_space(vd, capacity, blksz);
3324263d13fSGeorge Wilson 		zfs_dbgmsg("capacity change: vdev %s, psize %llu, "
3334263d13fSGeorge Wilson 		    "max_psize %llu", vd->vdev_path, *psize, *max_psize);
3344263d13fSGeorge Wilson 	} else {
3354263d13fSGeorge Wilson 		*max_psize = *psize;
3364263d13fSGeorge Wilson 	}
3374263d13fSGeorge Wilson 
338b468a217Seschrock 	/*
339b468a217Seschrock 	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
340b468a217Seschrock 	 * try again.
341b468a217Seschrock 	 */
342b468a217Seschrock 	vd->vdev_nowritecache = B_FALSE;
343b468a217Seschrock 
344fa9e4066Sahrens 	return (0);
345fa9e4066Sahrens }
346fa9e4066Sahrens 
347fa9e4066Sahrens static void
348fa9e4066Sahrens vdev_disk_close(vdev_t *vd)
349fa9e4066Sahrens {
350fa9e4066Sahrens 	vdev_disk_t *dvd = vd->vdev_tsd;
351fa9e4066Sahrens 
352095bcd66SGeorge Wilson 	if (vd->vdev_reopening || dvd == NULL)
353fa9e4066Sahrens 		return;
354fa9e4066Sahrens 
355fa9e4066Sahrens 	if (dvd->vd_minor != NULL)
356fa9e4066Sahrens 		ddi_devid_str_free(dvd->vd_minor);
357fa9e4066Sahrens 
358fa9e4066Sahrens 	if (dvd->vd_devid != NULL)
359fa9e4066Sahrens 		ddi_devid_free(dvd->vd_devid);
360fa9e4066Sahrens 
361fa9e4066Sahrens 	if (dvd->vd_lh != NULL)
3628ad4d6ddSJeff Bonwick 		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
363fa9e4066Sahrens 
36498d1cbfeSGeorge Wilson 	vd->vdev_delayed_close = B_FALSE;
365fa9e4066Sahrens 	kmem_free(dvd, sizeof (vdev_disk_t));
366fa9e4066Sahrens 	vd->vdev_tsd = NULL;
367fa9e4066Sahrens }
368fa9e4066Sahrens 
369e7cbe64fSgw int
370e7cbe64fSgw vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size,
371e7cbe64fSgw     uint64_t offset, int flags)
372e7cbe64fSgw {
373e7cbe64fSgw 	buf_t *bp;
374e7cbe64fSgw 	int error = 0;
375e7cbe64fSgw 
376e7cbe64fSgw 	if (vd_lh == NULL)
377e7cbe64fSgw 		return (EINVAL);
378e7cbe64fSgw 
379e7cbe64fSgw 	ASSERT(flags & B_READ || flags & B_WRITE);
380e7cbe64fSgw 
381e7cbe64fSgw 	bp = getrbuf(KM_SLEEP);
382e7cbe64fSgw 	bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
383e7cbe64fSgw 	bp->b_bcount = size;
384e7cbe64fSgw 	bp->b_un.b_addr = (void *)data;
385e7cbe64fSgw 	bp->b_lblkno = lbtodb(offset);
386e7cbe64fSgw 	bp->b_bufsize = size;
387e7cbe64fSgw 
388e7cbe64fSgw 	error = ldi_strategy(vd_lh, bp);
389e7cbe64fSgw 	ASSERT(error == 0);
390e7cbe64fSgw 	if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
391e7cbe64fSgw 		error = EIO;
392e7cbe64fSgw 	freerbuf(bp);
393e7cbe64fSgw 
394e7cbe64fSgw 	return (error);
395e7cbe64fSgw }
396e7cbe64fSgw 
397fa9e4066Sahrens static void
398fa9e4066Sahrens vdev_disk_io_intr(buf_t *bp)
399fa9e4066Sahrens {
400fa9e4066Sahrens 	vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp;
401fa9e4066Sahrens 	zio_t *zio = vdb->vdb_io;
402fa9e4066Sahrens 
40351ece835Seschrock 	/*
40451ece835Seschrock 	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
40551ece835Seschrock 	 * Rather than teach the rest of the stack about other error
40651ece835Seschrock 	 * possibilities (EFAULT, etc), we normalize the error value here.
40751ece835Seschrock 	 */
40851ece835Seschrock 	zio->io_error = (geterror(bp) != 0 ? EIO : 0);
40951ece835Seschrock 
41051ece835Seschrock 	if (zio->io_error == 0 && bp->b_resid != 0)
411fa9e4066Sahrens 		zio->io_error = EIO;
412fa9e4066Sahrens 
413fa9e4066Sahrens 	kmem_free(vdb, sizeof (vdev_disk_buf_t));
414fa9e4066Sahrens 
415e05725b1Sbonwick 	zio_interrupt(zio);
416fa9e4066Sahrens }
417fa9e4066Sahrens 
418f4a72450SJeff Bonwick static void
419f4a72450SJeff Bonwick vdev_disk_ioctl_free(zio_t *zio)
420f4a72450SJeff Bonwick {
421f4a72450SJeff Bonwick 	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
422f4a72450SJeff Bonwick }
423f4a72450SJeff Bonwick 
42422fe2c88SJonathan Adams static const zio_vsd_ops_t vdev_disk_vsd_ops = {
42522fe2c88SJonathan Adams 	vdev_disk_ioctl_free,
42622fe2c88SJonathan Adams 	zio_vsd_default_cksum_report
42722fe2c88SJonathan Adams };
42822fe2c88SJonathan Adams 
429fa9e4066Sahrens static void
430fa9e4066Sahrens vdev_disk_ioctl_done(void *zio_arg, int error)
431fa9e4066Sahrens {
432fa9e4066Sahrens 	zio_t *zio = zio_arg;
433fa9e4066Sahrens 
434fa9e4066Sahrens 	zio->io_error = error;
435fa9e4066Sahrens 
436e05725b1Sbonwick 	zio_interrupt(zio);
437fa9e4066Sahrens }
438fa9e4066Sahrens 
439e05725b1Sbonwick static int
440fa9e4066Sahrens vdev_disk_io_start(zio_t *zio)
441fa9e4066Sahrens {
442fa9e4066Sahrens 	vdev_t *vd = zio->io_vd;
443fa9e4066Sahrens 	vdev_disk_t *dvd = vd->vdev_tsd;
444fa9e4066Sahrens 	vdev_disk_buf_t *vdb;
445e14bb325SJeff Bonwick 	struct dk_callback *dkc;
446fa9e4066Sahrens 	buf_t *bp;
447e14bb325SJeff Bonwick 	int error;
448fa9e4066Sahrens 
449fa9e4066Sahrens 	if (zio->io_type == ZIO_TYPE_IOCTL) {
450fa9e4066Sahrens 		/* XXPOLICY */
4510a4e9518Sgw 		if (!vdev_readable(vd)) {
452fa9e4066Sahrens 			zio->io_error = ENXIO;
453e05725b1Sbonwick 			return (ZIO_PIPELINE_CONTINUE);
454fa9e4066Sahrens 		}
455fa9e4066Sahrens 
456fa9e4066Sahrens 		switch (zio->io_cmd) {
457fa9e4066Sahrens 
458fa9e4066Sahrens 		case DKIOCFLUSHWRITECACHE:
459fa9e4066Sahrens 
460a2eea2e1Sahrens 			if (zfs_nocacheflush)
461a2eea2e1Sahrens 				break;
462a2eea2e1Sahrens 
463b468a217Seschrock 			if (vd->vdev_nowritecache) {
464b468a217Seschrock 				zio->io_error = ENOTSUP;
465b468a217Seschrock 				break;
466b468a217Seschrock 			}
467b468a217Seschrock 
468e14bb325SJeff Bonwick 			zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
46922fe2c88SJonathan Adams 			zio->io_vsd_ops = &vdev_disk_vsd_ops;
470e14bb325SJeff Bonwick 
471e14bb325SJeff Bonwick 			dkc->dkc_callback = vdev_disk_ioctl_done;
472e14bb325SJeff Bonwick 			dkc->dkc_flag = FLUSH_VOLATILE;
473e14bb325SJeff Bonwick 			dkc->dkc_cookie = zio;
474fa9e4066Sahrens 
475fa9e4066Sahrens 			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
476e14bb325SJeff Bonwick 			    (uintptr_t)dkc, FKIOCTL, kcred, NULL);
477fa9e4066Sahrens 
478fa9e4066Sahrens 			if (error == 0) {
479fa9e4066Sahrens 				/*
480fa9e4066Sahrens 				 * The ioctl will be done asychronously,
481fa9e4066Sahrens 				 * and will call vdev_disk_ioctl_done()
482fa9e4066Sahrens 				 * upon completion.
483fa9e4066Sahrens 				 */
484e05725b1Sbonwick 				return (ZIO_PIPELINE_STOP);
485e05725b1Sbonwick 			}
486e05725b1Sbonwick 
487e05725b1Sbonwick 			if (error == ENOTSUP || error == ENOTTY) {
488b468a217Seschrock 				/*
489d5782879Smishra 				 * If we get ENOTSUP or ENOTTY, we know that
490d5782879Smishra 				 * no future attempts will ever succeed.
491d5782879Smishra 				 * In this case we set a persistent bit so
492d5782879Smishra 				 * that we don't bother with the ioctl in the
493d5782879Smishra 				 * future.
494b468a217Seschrock 				 */
495b468a217Seschrock 				vd->vdev_nowritecache = B_TRUE;
496fa9e4066Sahrens 			}
497fa9e4066Sahrens 			zio->io_error = error;
498b468a217Seschrock 
499fa9e4066Sahrens 			break;
500fa9e4066Sahrens 
501fa9e4066Sahrens 		default:
502fa9e4066Sahrens 			zio->io_error = ENOTSUP;
503fa9e4066Sahrens 		}
504fa9e4066Sahrens 
505e05725b1Sbonwick 		return (ZIO_PIPELINE_CONTINUE);
506fa9e4066Sahrens 	}
507fa9e4066Sahrens 
508fa9e4066Sahrens 	vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP);
509fa9e4066Sahrens 
510fa9e4066Sahrens 	vdb->vdb_io = zio;
511fa9e4066Sahrens 	bp = &vdb->vdb_buf;
512fa9e4066Sahrens 
513fa9e4066Sahrens 	bioinit(bp);
514e14bb325SJeff Bonwick 	bp->b_flags = B_BUSY | B_NOCACHE |
5158956713aSEric Schrock 	    (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
5168956713aSEric Schrock 	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
5178956713aSEric Schrock 		bp->b_flags |= B_FAILFAST;
518fa9e4066Sahrens 	bp->b_bcount = zio->io_size;
519fa9e4066Sahrens 	bp->b_un.b_addr = zio->io_data;
520fa9e4066Sahrens 	bp->b_lblkno = lbtodb(zio->io_offset);
521fa9e4066Sahrens 	bp->b_bufsize = zio->io_size;
522fa9e4066Sahrens 	bp->b_iodone = (int (*)())vdev_disk_io_intr;
523fa9e4066Sahrens 
524fa9e4066Sahrens 	/* ldi_strategy() will return non-zero only on programming errors */
525e14bb325SJeff Bonwick 	VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
526e05725b1Sbonwick 
527e05725b1Sbonwick 	return (ZIO_PIPELINE_STOP);
528fa9e4066Sahrens }
529fa9e4066Sahrens 
530e14bb325SJeff Bonwick static void
531fa9e4066Sahrens vdev_disk_io_done(zio_t *zio)
532fa9e4066Sahrens {
533e14bb325SJeff Bonwick 	vdev_t *vd = zio->io_vd;
534ea8dc4b6Seschrock 
5353d7072f8Seschrock 	/*
5363d7072f8Seschrock 	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
5373d7072f8Seschrock 	 * the device has been removed.  If this is the case, then we trigger an
5380a4e9518Sgw 	 * asynchronous removal of the device. Otherwise, probe the device and
5391f7ad2e1Sgw 	 * make sure it's still accessible.
5403d7072f8Seschrock 	 */
5411d713200SEric Schrock 	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
5420a4e9518Sgw 		vdev_disk_t *dvd = vd->vdev_tsd;
543e14bb325SJeff Bonwick 		int state = DKIO_NONE;
5440a4e9518Sgw 
545e14bb325SJeff Bonwick 		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
546e14bb325SJeff Bonwick 		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
5471d713200SEric Schrock 			/*
5481d713200SEric Schrock 			 * We post the resource as soon as possible, instead of
5491d713200SEric Schrock 			 * when the async removal actually happens, because the
5501d713200SEric Schrock 			 * DE is using this information to discard previous I/O
5511d713200SEric Schrock 			 * errors.
5521d713200SEric Schrock 			 */
5531d713200SEric Schrock 			zfs_post_remove(zio->io_spa, vd);
5543d7072f8Seschrock 			vd->vdev_remove_wanted = B_TRUE;
5553d7072f8Seschrock 			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
55698d1cbfeSGeorge Wilson 		} else if (!vd->vdev_delayed_close) {
55798d1cbfeSGeorge Wilson 			vd->vdev_delayed_close = B_TRUE;
5583d7072f8Seschrock 		}
5593d7072f8Seschrock 	}
560fa9e4066Sahrens }
561fa9e4066Sahrens 
562fa9e4066Sahrens vdev_ops_t vdev_disk_ops = {
563fa9e4066Sahrens 	vdev_disk_open,
564fa9e4066Sahrens 	vdev_disk_close,
565fa9e4066Sahrens 	vdev_default_asize,
566fa9e4066Sahrens 	vdev_disk_io_start,
567fa9e4066Sahrens 	vdev_disk_io_done,
568fa9e4066Sahrens 	NULL,
569dcba9f3fSGeorge Wilson 	vdev_disk_hold,
570dcba9f3fSGeorge Wilson 	vdev_disk_rele,
571fa9e4066Sahrens 	VDEV_TYPE_DISK,		/* name of this vdev type */
572fa9e4066Sahrens 	B_TRUE			/* leaf vdev */
573fa9e4066Sahrens };
574e7cbe64fSgw 
575e7cbe64fSgw /*
576051aabe6Staylor  * Given the root disk device devid or pathname, read the label from
577051aabe6Staylor  * the device, and construct a configuration nvlist.
578e7cbe64fSgw  */
579f940fbb1SLin Ling int
580f940fbb1SLin Ling vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
581e7cbe64fSgw {
582e7cbe64fSgw 	ldi_handle_t vd_lh;
583e7cbe64fSgw 	vdev_label_t *label;
584e7cbe64fSgw 	uint64_t s, size;
585e7cbe64fSgw 	int l;
586051aabe6Staylor 	ddi_devid_t tmpdevid;
587f4565e39SLin Ling 	int error = -1;
588051aabe6Staylor 	char *minor_name;
589e7cbe64fSgw 
590e7cbe64fSgw 	/*
591e7cbe64fSgw 	 * Read the device label and build the nvlist.
592e7cbe64fSgw 	 */
593f4565e39SLin Ling 	if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
594051aabe6Staylor 	    &minor_name) == 0) {
595051aabe6Staylor 		error = ldi_open_by_devid(tmpdevid, minor_name,
5968ad4d6ddSJeff Bonwick 		    FREAD, kcred, &vd_lh, zfs_li);
597051aabe6Staylor 		ddi_devid_free(tmpdevid);
598051aabe6Staylor 		ddi_devid_str_free(minor_name);
599051aabe6Staylor 	}
600051aabe6Staylor 
601f4565e39SLin Ling 	if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
602f4565e39SLin Ling 	    zfs_li)))
603f940fbb1SLin Ling 		return (error);
604e7cbe64fSgw 
605bf82a41bSeschrock 	if (ldi_get_size(vd_lh, &s)) {
606bf82a41bSeschrock 		(void) ldi_close(vd_lh, FREAD, kcred);
607f940fbb1SLin Ling 		return (EIO);
608bf82a41bSeschrock 	}
609e7cbe64fSgw 
610e7cbe64fSgw 	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
611e7cbe64fSgw 	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
612e7cbe64fSgw 
61317f1e64aSEric Taylor 	*config = NULL;
614e7cbe64fSgw 	for (l = 0; l < VDEV_LABELS; l++) {
615e7cbe64fSgw 		uint64_t offset, state, txg = 0;
616e7cbe64fSgw 
617e7cbe64fSgw 		/* read vdev label */
618e7cbe64fSgw 		offset = vdev_label_offset(size, l, 0);
619e7cbe64fSgw 		if (vdev_disk_physio(vd_lh, (caddr_t)label,
6202264ca7fSLin Ling 		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
621e7cbe64fSgw 			continue;
622e7cbe64fSgw 
623e7cbe64fSgw 		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
624f940fbb1SLin Ling 		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
625f940fbb1SLin Ling 			*config = NULL;
626e7cbe64fSgw 			continue;
627e7cbe64fSgw 		}
628e7cbe64fSgw 
629f940fbb1SLin Ling 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
630e7cbe64fSgw 		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
631f940fbb1SLin Ling 			nvlist_free(*config);
632f940fbb1SLin Ling 			*config = NULL;
633e7cbe64fSgw 			continue;
634e7cbe64fSgw 		}
635e7cbe64fSgw 
636f940fbb1SLin Ling 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
637e7cbe64fSgw 		    &txg) != 0 || txg == 0) {
638f940fbb1SLin Ling 			nvlist_free(*config);
639f940fbb1SLin Ling 			*config = NULL;
640e7cbe64fSgw 			continue;
641e7cbe64fSgw 		}
642e7cbe64fSgw 
643e7cbe64fSgw 		break;
644e7cbe64fSgw 	}
645e7cbe64fSgw 
646e7cbe64fSgw 	kmem_free(label, sizeof (vdev_label_t));
647bf82a41bSeschrock 	(void) ldi_close(vd_lh, FREAD, kcred);
64817f1e64aSEric Taylor 	if (*config == NULL)
64917f1e64aSEric Taylor 		error = EIDRM;
650bf82a41bSeschrock 
651f940fbb1SLin Ling 	return (error);
652e7cbe64fSgw }
653