xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_disk.c (revision be6fd75a)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5bef6b7d2Swebaker  * Common Development and Distribution License (the "License").
6bef6b7d2Swebaker  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
21fa9e4066Sahrens /*
22f13665b7Sbo zhou - Sun Microsystems - Beijing China  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23*be6fd75aSMatthew Ahrens  * Copyright (c) 2013 by Delphix. All rights reserved.
24fa9e4066Sahrens  */
25fa9e4066Sahrens 
26fa9e4066Sahrens #include <sys/zfs_context.h>
27dcba9f3fSGeorge Wilson #include <sys/spa_impl.h>
28e7cbe64fSgw #include <sys/refcount.h>
29fa9e4066Sahrens #include <sys/vdev_disk.h>
30fa9e4066Sahrens #include <sys/vdev_impl.h>
31fa9e4066Sahrens #include <sys/fs/zfs.h>
32fa9e4066Sahrens #include <sys/zio.h>
33afefbcddSeschrock #include <sys/sunldi.h>
344263d13fSGeorge Wilson #include <sys/efi_partition.h>
3551ece835Seschrock #include <sys/fm/fs/zfs.h>
36fa9e4066Sahrens 
37fa9e4066Sahrens /*
38fa9e4066Sahrens  * Virtual device vector for disks.
39fa9e4066Sahrens  */
40fa9e4066Sahrens 
41fa9e4066Sahrens extern ldi_ident_t zfs_li;
42fa9e4066Sahrens 
43dcba9f3fSGeorge Wilson static void
44dcba9f3fSGeorge Wilson vdev_disk_hold(vdev_t *vd)
45dcba9f3fSGeorge Wilson {
46dcba9f3fSGeorge Wilson 	ddi_devid_t devid;
47dcba9f3fSGeorge Wilson 	char *minor;
48dcba9f3fSGeorge Wilson 
49dcba9f3fSGeorge Wilson 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
50dcba9f3fSGeorge Wilson 
51dcba9f3fSGeorge Wilson 	/*
52dcba9f3fSGeorge Wilson 	 * We must have a pathname, and it must be absolute.
53dcba9f3fSGeorge Wilson 	 */
54dcba9f3fSGeorge Wilson 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
55dcba9f3fSGeorge Wilson 		return;
56dcba9f3fSGeorge Wilson 
57dcba9f3fSGeorge Wilson 	/*
58dcba9f3fSGeorge Wilson 	 * Only prefetch path and devid info if the device has
59dcba9f3fSGeorge Wilson 	 * never been opened.
60dcba9f3fSGeorge Wilson 	 */
61dcba9f3fSGeorge Wilson 	if (vd->vdev_tsd != NULL)
62dcba9f3fSGeorge Wilson 		return;
63dcba9f3fSGeorge Wilson 
64dcba9f3fSGeorge Wilson 	if (vd->vdev_wholedisk == -1ULL) {
65dcba9f3fSGeorge Wilson 		size_t len = strlen(vd->vdev_path) + 3;
66dcba9f3fSGeorge Wilson 		char *buf = kmem_alloc(len, KM_SLEEP);
67dcba9f3fSGeorge Wilson 
68dcba9f3fSGeorge Wilson 		(void) snprintf(buf, len, "%ss0", vd->vdev_path);
69dcba9f3fSGeorge Wilson 
70dcba9f3fSGeorge Wilson 		(void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
71dcba9f3fSGeorge Wilson 		kmem_free(buf, len);
72dcba9f3fSGeorge Wilson 	}
73dcba9f3fSGeorge Wilson 
74dcba9f3fSGeorge Wilson 	if (vd->vdev_name_vp == NULL)
75dcba9f3fSGeorge Wilson 		(void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);
76dcba9f3fSGeorge Wilson 
77dcba9f3fSGeorge Wilson 	if (vd->vdev_devid != NULL &&
78dcba9f3fSGeorge Wilson 	    ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
79dcba9f3fSGeorge Wilson 		(void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
80dcba9f3fSGeorge Wilson 		ddi_devid_str_free(minor);
81dcba9f3fSGeorge Wilson 		ddi_devid_free(devid);
82dcba9f3fSGeorge Wilson 	}
83dcba9f3fSGeorge Wilson }
84dcba9f3fSGeorge Wilson 
85dcba9f3fSGeorge Wilson static void
86dcba9f3fSGeorge Wilson vdev_disk_rele(vdev_t *vd)
87dcba9f3fSGeorge Wilson {
88dcba9f3fSGeorge Wilson 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
89dcba9f3fSGeorge Wilson 
90dcba9f3fSGeorge Wilson 	if (vd->vdev_name_vp) {
91dcba9f3fSGeorge Wilson 		VN_RELE_ASYNC(vd->vdev_name_vp,
92dcba9f3fSGeorge Wilson 		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
93dcba9f3fSGeorge Wilson 		vd->vdev_name_vp = NULL;
94dcba9f3fSGeorge Wilson 	}
95dcba9f3fSGeorge Wilson 	if (vd->vdev_devid_vp) {
96dcba9f3fSGeorge Wilson 		VN_RELE_ASYNC(vd->vdev_devid_vp,
97dcba9f3fSGeorge Wilson 		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
98dcba9f3fSGeorge Wilson 		vd->vdev_devid_vp = NULL;
99dcba9f3fSGeorge Wilson 	}
100dcba9f3fSGeorge Wilson }
101dcba9f3fSGeorge Wilson 
1024263d13fSGeorge Wilson static uint64_t
1034263d13fSGeorge Wilson vdev_disk_get_space(vdev_t *vd, uint64_t capacity, uint_t blksz)
1044263d13fSGeorge Wilson {
1054263d13fSGeorge Wilson 	ASSERT(vd->vdev_wholedisk);
1064263d13fSGeorge Wilson 
1074263d13fSGeorge Wilson 	vdev_disk_t *dvd = vd->vdev_tsd;
1084263d13fSGeorge Wilson 	dk_efi_t dk_ioc;
1094263d13fSGeorge Wilson 	efi_gpt_t *efi;
1104263d13fSGeorge Wilson 	uint64_t avail_space = 0;
1114263d13fSGeorge Wilson 	int efisize = EFI_LABEL_SIZE * 2;
1124263d13fSGeorge Wilson 
1134263d13fSGeorge Wilson 	dk_ioc.dki_data = kmem_alloc(efisize, KM_SLEEP);
1144263d13fSGeorge Wilson 	dk_ioc.dki_lba = 1;
1154263d13fSGeorge Wilson 	dk_ioc.dki_length = efisize;
1164263d13fSGeorge Wilson 	dk_ioc.dki_data_64 = (uint64_t)(uintptr_t)dk_ioc.dki_data;
1174263d13fSGeorge Wilson 	efi = dk_ioc.dki_data;
1184263d13fSGeorge Wilson 
1194263d13fSGeorge Wilson 	if (ldi_ioctl(dvd->vd_lh, DKIOCGETEFI, (intptr_t)&dk_ioc,
1204263d13fSGeorge Wilson 	    FKIOCTL, kcred, NULL) == 0) {
1214263d13fSGeorge Wilson 		uint64_t efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA);
1224263d13fSGeorge Wilson 
1234263d13fSGeorge Wilson 		zfs_dbgmsg("vdev %s, capacity %llu, altern lba %llu",
1244263d13fSGeorge Wilson 		    vd->vdev_path, capacity, efi_altern_lba);
1254263d13fSGeorge Wilson 		if (capacity > efi_altern_lba)
1264263d13fSGeorge Wilson 			avail_space = (capacity - efi_altern_lba) * blksz;
1274263d13fSGeorge Wilson 	}
1284263d13fSGeorge Wilson 	kmem_free(dk_ioc.dki_data, efisize);
1294263d13fSGeorge Wilson 	return (avail_space);
1304263d13fSGeorge Wilson }
1314263d13fSGeorge Wilson 
132fa9e4066Sahrens static int
1334263d13fSGeorge Wilson vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
1344263d13fSGeorge Wilson     uint64_t *ashift)
135fa9e4066Sahrens {
1368ad4d6ddSJeff Bonwick 	spa_t *spa = vd->vdev_spa;
137fa9e4066Sahrens 	vdev_disk_t *dvd;
138f13665b7Sbo zhou - Sun Microsystems - Beijing China 	struct dk_minfo_ext dkmext;
1390a4e9518Sgw 	int error;
140e14bb325SJeff Bonwick 	dev_t dev;
141e14bb325SJeff Bonwick 	int otyp;
142fa9e4066Sahrens 
143fa9e4066Sahrens 	/*
144fa9e4066Sahrens 	 * We must have a pathname, and it must be absolute.
145fa9e4066Sahrens 	 */
146fa9e4066Sahrens 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
147fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
148*be6fd75aSMatthew Ahrens 		return (SET_ERROR(EINVAL));
149fa9e4066Sahrens 	}
150fa9e4066Sahrens 
151095bcd66SGeorge Wilson 	/*
152095bcd66SGeorge Wilson 	 * Reopen the device if it's not currently open. Otherwise,
153095bcd66SGeorge Wilson 	 * just update the physical size of the device.
154095bcd66SGeorge Wilson 	 */
155095bcd66SGeorge Wilson 	if (vd->vdev_tsd != NULL) {
156095bcd66SGeorge Wilson 		ASSERT(vd->vdev_reopening);
157095bcd66SGeorge Wilson 		dvd = vd->vdev_tsd;
158095bcd66SGeorge Wilson 		goto skip_open;
159095bcd66SGeorge Wilson 	}
160095bcd66SGeorge Wilson 
161fa9e4066Sahrens 	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
162fa9e4066Sahrens 
163fa9e4066Sahrens 	/*
164fa9e4066Sahrens 	 * When opening a disk device, we want to preserve the user's original
165fa9e4066Sahrens 	 * intent.  We always want to open the device by the path the user gave
1661724dc7bSJoshua M. Clulow 	 * us, even if it is one of multiple paths to the same device.  But we
167fa9e4066Sahrens 	 * also want to be able to survive disks being removed/recabled.
168fa9e4066Sahrens 	 * Therefore the sequence of opening devices is:
169fa9e4066Sahrens 	 *
170afefbcddSeschrock 	 * 1. Try opening the device by path.  For legacy pools without the
171afefbcddSeschrock 	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
172fa9e4066Sahrens 	 *
173fa9e4066Sahrens 	 * 2. If the devid of the device matches the stored value, return
174fa9e4066Sahrens 	 *    success.
175fa9e4066Sahrens 	 *
176fa9e4066Sahrens 	 * 3. Otherwise, the device may have moved.  Try opening the device
177fa9e4066Sahrens 	 *    by the devid instead.
178fa9e4066Sahrens 	 */
179fa9e4066Sahrens 	if (vd->vdev_devid != NULL) {
180fa9e4066Sahrens 		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
181fa9e4066Sahrens 		    &dvd->vd_minor) != 0) {
182fa9e4066Sahrens 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
183*be6fd75aSMatthew Ahrens 			return (SET_ERROR(EINVAL));
184fa9e4066Sahrens 		}
185fa9e4066Sahrens 	}
186fa9e4066Sahrens 
187fa9e4066Sahrens 	error = EINVAL;		/* presume failure */
188fa9e4066Sahrens 
189095bcd66SGeorge Wilson 	if (vd->vdev_path != NULL) {
190fa9e4066Sahrens 		ddi_devid_t devid;
191fa9e4066Sahrens 
192afefbcddSeschrock 		if (vd->vdev_wholedisk == -1ULL) {
193afefbcddSeschrock 			size_t len = strlen(vd->vdev_path) + 3;
194afefbcddSeschrock 			char *buf = kmem_alloc(len, KM_SLEEP);
195afefbcddSeschrock 			ldi_handle_t lh;
196afefbcddSeschrock 
197afefbcddSeschrock 			(void) snprintf(buf, len, "%ss0", vd->vdev_path);
198afefbcddSeschrock 
1998ad4d6ddSJeff Bonwick 			if (ldi_open_by_name(buf, spa_mode(spa), kcred,
200afefbcddSeschrock 			    &lh, zfs_li) == 0) {
201afefbcddSeschrock 				spa_strfree(vd->vdev_path);
202afefbcddSeschrock 				vd->vdev_path = buf;
203afefbcddSeschrock 				vd->vdev_wholedisk = 1ULL;
2048ad4d6ddSJeff Bonwick 				(void) ldi_close(lh, spa_mode(spa), kcred);
205afefbcddSeschrock 			} else {
206afefbcddSeschrock 				kmem_free(buf, len);
207afefbcddSeschrock 			}
208afefbcddSeschrock 		}
209fa9e4066Sahrens 
2108ad4d6ddSJeff Bonwick 		error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred,
211afefbcddSeschrock 		    &dvd->vd_lh, zfs_li);
212fa9e4066Sahrens 
213fa9e4066Sahrens 		/*
214fa9e4066Sahrens 		 * Compare the devid to the stored value.
215fa9e4066Sahrens 		 */
216fa9e4066Sahrens 		if (error == 0 && vd->vdev_devid != NULL &&
217fa9e4066Sahrens 		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
218fa9e4066Sahrens 			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
219*be6fd75aSMatthew Ahrens 				error = SET_ERROR(EINVAL);
2208ad4d6ddSJeff Bonwick 				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
2218ad4d6ddSJeff Bonwick 				    kcred);
222fa9e4066Sahrens 				dvd->vd_lh = NULL;
223fa9e4066Sahrens 			}
224fa9e4066Sahrens 			ddi_devid_free(devid);
225fa9e4066Sahrens 		}
226afefbcddSeschrock 
227afefbcddSeschrock 		/*
228afefbcddSeschrock 		 * If we succeeded in opening the device, but 'vdev_wholedisk'
229afefbcddSeschrock 		 * is not yet set, then this must be a slice.
230afefbcddSeschrock 		 */
231afefbcddSeschrock 		if (error == 0 && vd->vdev_wholedisk == -1ULL)
232afefbcddSeschrock 			vd->vdev_wholedisk = 0;
233fa9e4066Sahrens 	}
234fa9e4066Sahrens 
235fa9e4066Sahrens 	/*
236fa9e4066Sahrens 	 * If we were unable to open by path, or the devid check fails, open by
237fa9e4066Sahrens 	 * devid instead.
238fa9e4066Sahrens 	 */
239fa9e4066Sahrens 	if (error != 0 && vd->vdev_devid != NULL)
240fa9e4066Sahrens 		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
2418ad4d6ddSJeff Bonwick 		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
242fa9e4066Sahrens 
2433d7072f8Seschrock 	/*
2443d7072f8Seschrock 	 * If all else fails, then try opening by physical path (if available)
2453d7072f8Seschrock 	 * or the logical path (if we failed due to the devid check).  While not
2463d7072f8Seschrock 	 * as reliable as the devid, this will give us something, and the higher
2473d7072f8Seschrock 	 * level vdev validation will prevent us from opening the wrong device.
2483d7072f8Seschrock 	 */
2493d7072f8Seschrock 	if (error) {
2503d7072f8Seschrock 		if (vd->vdev_physpath != NULL &&
251deb8317bSMark J Musante 		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV)
2528ad4d6ddSJeff Bonwick 			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
2533d7072f8Seschrock 			    kcred, &dvd->vd_lh, zfs_li);
2543d7072f8Seschrock 
2553d7072f8Seschrock 		/*
2563d7072f8Seschrock 		 * Note that we don't support the legacy auto-wholedisk support
2573d7072f8Seschrock 		 * as above.  This hasn't been used in a very long time and we
2583d7072f8Seschrock 		 * don't need to propagate its oddities to this edge condition.
2593d7072f8Seschrock 		 */
260095bcd66SGeorge Wilson 		if (error && vd->vdev_path != NULL)
2618ad4d6ddSJeff Bonwick 			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
2628ad4d6ddSJeff Bonwick 			    kcred, &dvd->vd_lh, zfs_li);
2633d7072f8Seschrock 	}
2643d7072f8Seschrock 
265e14bb325SJeff Bonwick 	if (error) {
266fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
267fa9e4066Sahrens 		return (error);
268e14bb325SJeff Bonwick 	}
269fa9e4066Sahrens 
2703d7072f8Seschrock 	/*
2713d7072f8Seschrock 	 * Once a device is opened, verify that the physical device path (if
2723d7072f8Seschrock 	 * available) is up to date.
2733d7072f8Seschrock 	 */
2743d7072f8Seschrock 	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
2753d7072f8Seschrock 	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
2760a4e9518Sgw 		char *physpath, *minorname;
2770a4e9518Sgw 
2783d7072f8Seschrock 		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2793d7072f8Seschrock 		minorname = NULL;
2803d7072f8Seschrock 		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
2813d7072f8Seschrock 		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
2823d7072f8Seschrock 		    (vd->vdev_physpath == NULL ||
2833d7072f8Seschrock 		    strcmp(vd->vdev_physpath, physpath) != 0)) {
2843d7072f8Seschrock 			if (vd->vdev_physpath)
2853d7072f8Seschrock 				spa_strfree(vd->vdev_physpath);
2863d7072f8Seschrock 			(void) strlcat(physpath, ":", MAXPATHLEN);
2873d7072f8Seschrock 			(void) strlcat(physpath, minorname, MAXPATHLEN);
2883d7072f8Seschrock 			vd->vdev_physpath = spa_strdup(physpath);
2893d7072f8Seschrock 		}
2903d7072f8Seschrock 		if (minorname)
2913d7072f8Seschrock 			kmem_free(minorname, strlen(minorname) + 1);
2923d7072f8Seschrock 		kmem_free(physpath, MAXPATHLEN);
2933d7072f8Seschrock 	}
2943d7072f8Seschrock 
295095bcd66SGeorge Wilson skip_open:
296fa9e4066Sahrens 	/*
297fa9e4066Sahrens 	 * Determine the actual size of the device.
298fa9e4066Sahrens 	 */
299fa9e4066Sahrens 	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
300fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
301*be6fd75aSMatthew Ahrens 		return (SET_ERROR(EINVAL));
302fa9e4066Sahrens 	}
303fa9e4066Sahrens 
304ecc2d604Sbonwick 	/*
305ecc2d604Sbonwick 	 * Determine the device's minimum transfer size.
306ecc2d604Sbonwick 	 * If the ioctl isn't supported, assume DEV_BSIZE.
307ecc2d604Sbonwick 	 */
308f13665b7Sbo zhou - Sun Microsystems - Beijing China 	if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, (intptr_t)&dkmext,
309ecc2d604Sbonwick 	    FKIOCTL, kcred, NULL) != 0)
310f13665b7Sbo zhou - Sun Microsystems - Beijing China 		dkmext.dki_pbsize = DEV_BSIZE;
311bef6b7d2Swebaker 
312f13665b7Sbo zhou - Sun Microsystems - Beijing China 	*ashift = highbit(MAX(dkmext.dki_pbsize, SPA_MINBLOCKSIZE)) - 1;
313bef6b7d2Swebaker 
3144263d13fSGeorge Wilson 	if (vd->vdev_wholedisk == 1) {
3154263d13fSGeorge Wilson 		uint64_t capacity = dkmext.dki_capacity - 1;
3164263d13fSGeorge Wilson 		uint64_t blksz = dkmext.dki_lbsize;
3174263d13fSGeorge Wilson 		int wce = 1;
3184263d13fSGeorge Wilson 
3194263d13fSGeorge Wilson 		/*
3204263d13fSGeorge Wilson 		 * If we own the whole disk, try to enable disk write caching.
3214263d13fSGeorge Wilson 		 * We ignore errors because it's OK if we can't do it.
3224263d13fSGeorge Wilson 		 */
3234263d13fSGeorge Wilson 		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
3244263d13fSGeorge Wilson 		    FKIOCTL, kcred, NULL);
3254263d13fSGeorge Wilson 
3264263d13fSGeorge Wilson 		*max_psize = *psize + vdev_disk_get_space(vd, capacity, blksz);
3274263d13fSGeorge Wilson 		zfs_dbgmsg("capacity change: vdev %s, psize %llu, "
3284263d13fSGeorge Wilson 		    "max_psize %llu", vd->vdev_path, *psize, *max_psize);
3294263d13fSGeorge Wilson 	} else {
3304263d13fSGeorge Wilson 		*max_psize = *psize;
3314263d13fSGeorge Wilson 	}
3324263d13fSGeorge Wilson 
333b468a217Seschrock 	/*
334b468a217Seschrock 	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
335b468a217Seschrock 	 * try again.
336b468a217Seschrock 	 */
337b468a217Seschrock 	vd->vdev_nowritecache = B_FALSE;
338b468a217Seschrock 
339fa9e4066Sahrens 	return (0);
340fa9e4066Sahrens }
341fa9e4066Sahrens 
342fa9e4066Sahrens static void
343fa9e4066Sahrens vdev_disk_close(vdev_t *vd)
344fa9e4066Sahrens {
345fa9e4066Sahrens 	vdev_disk_t *dvd = vd->vdev_tsd;
346fa9e4066Sahrens 
347095bcd66SGeorge Wilson 	if (vd->vdev_reopening || dvd == NULL)
348fa9e4066Sahrens 		return;
349fa9e4066Sahrens 
350fa9e4066Sahrens 	if (dvd->vd_minor != NULL)
351fa9e4066Sahrens 		ddi_devid_str_free(dvd->vd_minor);
352fa9e4066Sahrens 
353fa9e4066Sahrens 	if (dvd->vd_devid != NULL)
354fa9e4066Sahrens 		ddi_devid_free(dvd->vd_devid);
355fa9e4066Sahrens 
356fa9e4066Sahrens 	if (dvd->vd_lh != NULL)
3578ad4d6ddSJeff Bonwick 		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
358fa9e4066Sahrens 
35998d1cbfeSGeorge Wilson 	vd->vdev_delayed_close = B_FALSE;
360fa9e4066Sahrens 	kmem_free(dvd, sizeof (vdev_disk_t));
361fa9e4066Sahrens 	vd->vdev_tsd = NULL;
362fa9e4066Sahrens }
363fa9e4066Sahrens 
364e7cbe64fSgw int
365e7cbe64fSgw vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size,
366e7cbe64fSgw     uint64_t offset, int flags)
367e7cbe64fSgw {
368e7cbe64fSgw 	buf_t *bp;
369e7cbe64fSgw 	int error = 0;
370e7cbe64fSgw 
371e7cbe64fSgw 	if (vd_lh == NULL)
372*be6fd75aSMatthew Ahrens 		return (SET_ERROR(EINVAL));
373e7cbe64fSgw 
374e7cbe64fSgw 	ASSERT(flags & B_READ || flags & B_WRITE);
375e7cbe64fSgw 
376e7cbe64fSgw 	bp = getrbuf(KM_SLEEP);
377e7cbe64fSgw 	bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
378e7cbe64fSgw 	bp->b_bcount = size;
379e7cbe64fSgw 	bp->b_un.b_addr = (void *)data;
380e7cbe64fSgw 	bp->b_lblkno = lbtodb(offset);
381e7cbe64fSgw 	bp->b_bufsize = size;
382e7cbe64fSgw 
383e7cbe64fSgw 	error = ldi_strategy(vd_lh, bp);
384e7cbe64fSgw 	ASSERT(error == 0);
385e7cbe64fSgw 	if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
386*be6fd75aSMatthew Ahrens 		error = SET_ERROR(EIO);
387e7cbe64fSgw 	freerbuf(bp);
388e7cbe64fSgw 
389e7cbe64fSgw 	return (error);
390e7cbe64fSgw }
391e7cbe64fSgw 
392fa9e4066Sahrens static void
393fa9e4066Sahrens vdev_disk_io_intr(buf_t *bp)
394fa9e4066Sahrens {
39531d7e8faSGeorge Wilson 	vdev_buf_t *vb = (vdev_buf_t *)bp;
39631d7e8faSGeorge Wilson 	zio_t *zio = vb->vb_io;
397fa9e4066Sahrens 
39851ece835Seschrock 	/*
39951ece835Seschrock 	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
40051ece835Seschrock 	 * Rather than teach the rest of the stack about other error
40151ece835Seschrock 	 * possibilities (EFAULT, etc), we normalize the error value here.
40251ece835Seschrock 	 */
40351ece835Seschrock 	zio->io_error = (geterror(bp) != 0 ? EIO : 0);
40451ece835Seschrock 
40551ece835Seschrock 	if (zio->io_error == 0 && bp->b_resid != 0)
406*be6fd75aSMatthew Ahrens 		zio->io_error = SET_ERROR(EIO);
407fa9e4066Sahrens 
40831d7e8faSGeorge Wilson 	kmem_free(vb, sizeof (vdev_buf_t));
409fa9e4066Sahrens 
410e05725b1Sbonwick 	zio_interrupt(zio);
411fa9e4066Sahrens }
412fa9e4066Sahrens 
413f4a72450SJeff Bonwick static void
414f4a72450SJeff Bonwick vdev_disk_ioctl_free(zio_t *zio)
415f4a72450SJeff Bonwick {
416f4a72450SJeff Bonwick 	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
417f4a72450SJeff Bonwick }
418f4a72450SJeff Bonwick 
41922fe2c88SJonathan Adams static const zio_vsd_ops_t vdev_disk_vsd_ops = {
42022fe2c88SJonathan Adams 	vdev_disk_ioctl_free,
42122fe2c88SJonathan Adams 	zio_vsd_default_cksum_report
42222fe2c88SJonathan Adams };
42322fe2c88SJonathan Adams 
424fa9e4066Sahrens static void
425fa9e4066Sahrens vdev_disk_ioctl_done(void *zio_arg, int error)
426fa9e4066Sahrens {
427fa9e4066Sahrens 	zio_t *zio = zio_arg;
428fa9e4066Sahrens 
429fa9e4066Sahrens 	zio->io_error = error;
430fa9e4066Sahrens 
431e05725b1Sbonwick 	zio_interrupt(zio);
432fa9e4066Sahrens }
433fa9e4066Sahrens 
434e05725b1Sbonwick static int
435fa9e4066Sahrens vdev_disk_io_start(zio_t *zio)
436fa9e4066Sahrens {
437fa9e4066Sahrens 	vdev_t *vd = zio->io_vd;
438fa9e4066Sahrens 	vdev_disk_t *dvd = vd->vdev_tsd;
43931d7e8faSGeorge Wilson 	vdev_buf_t *vb;
440e14bb325SJeff Bonwick 	struct dk_callback *dkc;
441fa9e4066Sahrens 	buf_t *bp;
442e14bb325SJeff Bonwick 	int error;
443fa9e4066Sahrens 
444fa9e4066Sahrens 	if (zio->io_type == ZIO_TYPE_IOCTL) {
445fa9e4066Sahrens 		/* XXPOLICY */
4460a4e9518Sgw 		if (!vdev_readable(vd)) {
447*be6fd75aSMatthew Ahrens 			zio->io_error = SET_ERROR(ENXIO);
448e05725b1Sbonwick 			return (ZIO_PIPELINE_CONTINUE);
449fa9e4066Sahrens 		}
450fa9e4066Sahrens 
451fa9e4066Sahrens 		switch (zio->io_cmd) {
452fa9e4066Sahrens 
453fa9e4066Sahrens 		case DKIOCFLUSHWRITECACHE:
454fa9e4066Sahrens 
455a2eea2e1Sahrens 			if (zfs_nocacheflush)
456a2eea2e1Sahrens 				break;
457a2eea2e1Sahrens 
458b468a217Seschrock 			if (vd->vdev_nowritecache) {
459*be6fd75aSMatthew Ahrens 				zio->io_error = SET_ERROR(ENOTSUP);
460b468a217Seschrock 				break;
461b468a217Seschrock 			}
462b468a217Seschrock 
463e14bb325SJeff Bonwick 			zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
46422fe2c88SJonathan Adams 			zio->io_vsd_ops = &vdev_disk_vsd_ops;
465e14bb325SJeff Bonwick 
466e14bb325SJeff Bonwick 			dkc->dkc_callback = vdev_disk_ioctl_done;
467e14bb325SJeff Bonwick 			dkc->dkc_flag = FLUSH_VOLATILE;
468e14bb325SJeff Bonwick 			dkc->dkc_cookie = zio;
469fa9e4066Sahrens 
470fa9e4066Sahrens 			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
471e14bb325SJeff Bonwick 			    (uintptr_t)dkc, FKIOCTL, kcred, NULL);
472fa9e4066Sahrens 
473fa9e4066Sahrens 			if (error == 0) {
474fa9e4066Sahrens 				/*
475fa9e4066Sahrens 				 * The ioctl will be done asychronously,
476fa9e4066Sahrens 				 * and will call vdev_disk_ioctl_done()
477fa9e4066Sahrens 				 * upon completion.
478fa9e4066Sahrens 				 */
479e05725b1Sbonwick 				return (ZIO_PIPELINE_STOP);
480e05725b1Sbonwick 			}
481e05725b1Sbonwick 
482e05725b1Sbonwick 			if (error == ENOTSUP || error == ENOTTY) {
483b468a217Seschrock 				/*
484d5782879Smishra 				 * If we get ENOTSUP or ENOTTY, we know that
485d5782879Smishra 				 * no future attempts will ever succeed.
486d5782879Smishra 				 * In this case we set a persistent bit so
487d5782879Smishra 				 * that we don't bother with the ioctl in the
488d5782879Smishra 				 * future.
489b468a217Seschrock 				 */
490b468a217Seschrock 				vd->vdev_nowritecache = B_TRUE;
491fa9e4066Sahrens 			}
492fa9e4066Sahrens 			zio->io_error = error;
493b468a217Seschrock 
494fa9e4066Sahrens 			break;
495fa9e4066Sahrens 
496fa9e4066Sahrens 		default:
497*be6fd75aSMatthew Ahrens 			zio->io_error = SET_ERROR(ENOTSUP);
498fa9e4066Sahrens 		}
499fa9e4066Sahrens 
500e05725b1Sbonwick 		return (ZIO_PIPELINE_CONTINUE);
501fa9e4066Sahrens 	}
502fa9e4066Sahrens 
50331d7e8faSGeorge Wilson 	vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
504fa9e4066Sahrens 
50531d7e8faSGeorge Wilson 	vb->vb_io = zio;
50631d7e8faSGeorge Wilson 	bp = &vb->vb_buf;
507fa9e4066Sahrens 
508fa9e4066Sahrens 	bioinit(bp);
509e14bb325SJeff Bonwick 	bp->b_flags = B_BUSY | B_NOCACHE |
5108956713aSEric Schrock 	    (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
5118956713aSEric Schrock 	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
5128956713aSEric Schrock 		bp->b_flags |= B_FAILFAST;
513fa9e4066Sahrens 	bp->b_bcount = zio->io_size;
514fa9e4066Sahrens 	bp->b_un.b_addr = zio->io_data;
515fa9e4066Sahrens 	bp->b_lblkno = lbtodb(zio->io_offset);
516fa9e4066Sahrens 	bp->b_bufsize = zio->io_size;
517fa9e4066Sahrens 	bp->b_iodone = (int (*)())vdev_disk_io_intr;
518fa9e4066Sahrens 
519fa9e4066Sahrens 	/* ldi_strategy() will return non-zero only on programming errors */
520e14bb325SJeff Bonwick 	VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
521e05725b1Sbonwick 
522e05725b1Sbonwick 	return (ZIO_PIPELINE_STOP);
523fa9e4066Sahrens }
524fa9e4066Sahrens 
525e14bb325SJeff Bonwick static void
526fa9e4066Sahrens vdev_disk_io_done(zio_t *zio)
527fa9e4066Sahrens {
528e14bb325SJeff Bonwick 	vdev_t *vd = zio->io_vd;
529ea8dc4b6Seschrock 
5303d7072f8Seschrock 	/*
5313d7072f8Seschrock 	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
5323d7072f8Seschrock 	 * the device has been removed.  If this is the case, then we trigger an
5330a4e9518Sgw 	 * asynchronous removal of the device. Otherwise, probe the device and
5341f7ad2e1Sgw 	 * make sure it's still accessible.
5353d7072f8Seschrock 	 */
5361d713200SEric Schrock 	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
5370a4e9518Sgw 		vdev_disk_t *dvd = vd->vdev_tsd;
538e14bb325SJeff Bonwick 		int state = DKIO_NONE;
5390a4e9518Sgw 
540e14bb325SJeff Bonwick 		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
541e14bb325SJeff Bonwick 		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
5421d713200SEric Schrock 			/*
5431d713200SEric Schrock 			 * We post the resource as soon as possible, instead of
5441d713200SEric Schrock 			 * when the async removal actually happens, because the
5451d713200SEric Schrock 			 * DE is using this information to discard previous I/O
5461d713200SEric Schrock 			 * errors.
5471d713200SEric Schrock 			 */
5481d713200SEric Schrock 			zfs_post_remove(zio->io_spa, vd);
5493d7072f8Seschrock 			vd->vdev_remove_wanted = B_TRUE;
5503d7072f8Seschrock 			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
55198d1cbfeSGeorge Wilson 		} else if (!vd->vdev_delayed_close) {
55298d1cbfeSGeorge Wilson 			vd->vdev_delayed_close = B_TRUE;
5533d7072f8Seschrock 		}
5543d7072f8Seschrock 	}
555fa9e4066Sahrens }
556fa9e4066Sahrens 
557fa9e4066Sahrens vdev_ops_t vdev_disk_ops = {
558fa9e4066Sahrens 	vdev_disk_open,
559fa9e4066Sahrens 	vdev_disk_close,
560fa9e4066Sahrens 	vdev_default_asize,
561fa9e4066Sahrens 	vdev_disk_io_start,
562fa9e4066Sahrens 	vdev_disk_io_done,
563fa9e4066Sahrens 	NULL,
564dcba9f3fSGeorge Wilson 	vdev_disk_hold,
565dcba9f3fSGeorge Wilson 	vdev_disk_rele,
566fa9e4066Sahrens 	VDEV_TYPE_DISK,		/* name of this vdev type */
567fa9e4066Sahrens 	B_TRUE			/* leaf vdev */
568fa9e4066Sahrens };
569e7cbe64fSgw 
570e7cbe64fSgw /*
571051aabe6Staylor  * Given the root disk device devid or pathname, read the label from
572051aabe6Staylor  * the device, and construct a configuration nvlist.
573e7cbe64fSgw  */
574f940fbb1SLin Ling int
575f940fbb1SLin Ling vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
576e7cbe64fSgw {
577e7cbe64fSgw 	ldi_handle_t vd_lh;
578e7cbe64fSgw 	vdev_label_t *label;
579e7cbe64fSgw 	uint64_t s, size;
580e7cbe64fSgw 	int l;
581051aabe6Staylor 	ddi_devid_t tmpdevid;
582f4565e39SLin Ling 	int error = -1;
583051aabe6Staylor 	char *minor_name;
584e7cbe64fSgw 
585e7cbe64fSgw 	/*
586e7cbe64fSgw 	 * Read the device label and build the nvlist.
587e7cbe64fSgw 	 */
588f4565e39SLin Ling 	if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
589051aabe6Staylor 	    &minor_name) == 0) {
590051aabe6Staylor 		error = ldi_open_by_devid(tmpdevid, minor_name,
5918ad4d6ddSJeff Bonwick 		    FREAD, kcred, &vd_lh, zfs_li);
592051aabe6Staylor 		ddi_devid_free(tmpdevid);
593051aabe6Staylor 		ddi_devid_str_free(minor_name);
594051aabe6Staylor 	}
595051aabe6Staylor 
596f4565e39SLin Ling 	if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
597f4565e39SLin Ling 	    zfs_li)))
598f940fbb1SLin Ling 		return (error);
599e7cbe64fSgw 
600bf82a41bSeschrock 	if (ldi_get_size(vd_lh, &s)) {
601bf82a41bSeschrock 		(void) ldi_close(vd_lh, FREAD, kcred);
602*be6fd75aSMatthew Ahrens 		return (SET_ERROR(EIO));
603bf82a41bSeschrock 	}
604e7cbe64fSgw 
605e7cbe64fSgw 	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
606e7cbe64fSgw 	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
607e7cbe64fSgw 
60817f1e64aSEric Taylor 	*config = NULL;
609e7cbe64fSgw 	for (l = 0; l < VDEV_LABELS; l++) {
610e7cbe64fSgw 		uint64_t offset, state, txg = 0;
611e7cbe64fSgw 
612e7cbe64fSgw 		/* read vdev label */
613e7cbe64fSgw 		offset = vdev_label_offset(size, l, 0);
614e7cbe64fSgw 		if (vdev_disk_physio(vd_lh, (caddr_t)label,
6152264ca7fSLin Ling 		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
616e7cbe64fSgw 			continue;
617e7cbe64fSgw 
618e7cbe64fSgw 		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
619f940fbb1SLin Ling 		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
620f940fbb1SLin Ling 			*config = NULL;
621e7cbe64fSgw 			continue;
622e7cbe64fSgw 		}
623e7cbe64fSgw 
624f940fbb1SLin Ling 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
625e7cbe64fSgw 		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
626f940fbb1SLin Ling 			nvlist_free(*config);
627f940fbb1SLin Ling 			*config = NULL;
628e7cbe64fSgw 			continue;
629e7cbe64fSgw 		}
630e7cbe64fSgw 
631f940fbb1SLin Ling 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
632e7cbe64fSgw 		    &txg) != 0 || txg == 0) {
633f940fbb1SLin Ling 			nvlist_free(*config);
634f940fbb1SLin Ling 			*config = NULL;
635e7cbe64fSgw 			continue;
636e7cbe64fSgw 		}
637e7cbe64fSgw 
638e7cbe64fSgw 		break;
639e7cbe64fSgw 	}
640e7cbe64fSgw 
641e7cbe64fSgw 	kmem_free(label, sizeof (vdev_label_t));
642bf82a41bSeschrock 	(void) ldi_close(vd_lh, FREAD, kcred);
64317f1e64aSEric Taylor 	if (*config == NULL)
644*be6fd75aSMatthew Ahrens 		error = SET_ERROR(EIDRM);
645bf82a41bSeschrock 
646f940fbb1SLin Ling 	return (error);
647e7cbe64fSgw }
648