xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_disk.c (revision fb02ae02)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5bef6b7d2Swebaker  * Common Development and Distribution License (the "License").
6bef6b7d2Swebaker  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
21fa9e4066Sahrens /*
22f13665b7Sbo zhou - Sun Microsystems - Beijing China  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23be6fd75aSMatthew Ahrens  * Copyright (c) 2013 by Delphix. All rights reserved.
24fa9e4066Sahrens  */
25fa9e4066Sahrens 
26fa9e4066Sahrens #include <sys/zfs_context.h>
27dcba9f3fSGeorge Wilson #include <sys/spa_impl.h>
28e7cbe64fSgw #include <sys/refcount.h>
29fa9e4066Sahrens #include <sys/vdev_disk.h>
30fa9e4066Sahrens #include <sys/vdev_impl.h>
31fa9e4066Sahrens #include <sys/fs/zfs.h>
32fa9e4066Sahrens #include <sys/zio.h>
33afefbcddSeschrock #include <sys/sunldi.h>
344263d13fSGeorge Wilson #include <sys/efi_partition.h>
3551ece835Seschrock #include <sys/fm/fs/zfs.h>
36fa9e4066Sahrens 
37fa9e4066Sahrens /*
38fa9e4066Sahrens  * Virtual device vector for disks.
39fa9e4066Sahrens  */
40fa9e4066Sahrens 
41fa9e4066Sahrens extern ldi_ident_t zfs_li;
42fa9e4066Sahrens 
43dcba9f3fSGeorge Wilson static void
44dcba9f3fSGeorge Wilson vdev_disk_hold(vdev_t *vd)
45dcba9f3fSGeorge Wilson {
46dcba9f3fSGeorge Wilson 	ddi_devid_t devid;
47dcba9f3fSGeorge Wilson 	char *minor;
48dcba9f3fSGeorge Wilson 
49dcba9f3fSGeorge Wilson 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
50dcba9f3fSGeorge Wilson 
51dcba9f3fSGeorge Wilson 	/*
52dcba9f3fSGeorge Wilson 	 * We must have a pathname, and it must be absolute.
53dcba9f3fSGeorge Wilson 	 */
54dcba9f3fSGeorge Wilson 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
55dcba9f3fSGeorge Wilson 		return;
56dcba9f3fSGeorge Wilson 
57dcba9f3fSGeorge Wilson 	/*
58dcba9f3fSGeorge Wilson 	 * Only prefetch path and devid info if the device has
59dcba9f3fSGeorge Wilson 	 * never been opened.
60dcba9f3fSGeorge Wilson 	 */
61dcba9f3fSGeorge Wilson 	if (vd->vdev_tsd != NULL)
62dcba9f3fSGeorge Wilson 		return;
63dcba9f3fSGeorge Wilson 
64dcba9f3fSGeorge Wilson 	if (vd->vdev_wholedisk == -1ULL) {
65dcba9f3fSGeorge Wilson 		size_t len = strlen(vd->vdev_path) + 3;
66dcba9f3fSGeorge Wilson 		char *buf = kmem_alloc(len, KM_SLEEP);
67dcba9f3fSGeorge Wilson 
68dcba9f3fSGeorge Wilson 		(void) snprintf(buf, len, "%ss0", vd->vdev_path);
69dcba9f3fSGeorge Wilson 
70dcba9f3fSGeorge Wilson 		(void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
71dcba9f3fSGeorge Wilson 		kmem_free(buf, len);
72dcba9f3fSGeorge Wilson 	}
73dcba9f3fSGeorge Wilson 
74dcba9f3fSGeorge Wilson 	if (vd->vdev_name_vp == NULL)
75dcba9f3fSGeorge Wilson 		(void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);
76dcba9f3fSGeorge Wilson 
77dcba9f3fSGeorge Wilson 	if (vd->vdev_devid != NULL &&
78dcba9f3fSGeorge Wilson 	    ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
79dcba9f3fSGeorge Wilson 		(void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
80dcba9f3fSGeorge Wilson 		ddi_devid_str_free(minor);
81dcba9f3fSGeorge Wilson 		ddi_devid_free(devid);
82dcba9f3fSGeorge Wilson 	}
83dcba9f3fSGeorge Wilson }
84dcba9f3fSGeorge Wilson 
85dcba9f3fSGeorge Wilson static void
86dcba9f3fSGeorge Wilson vdev_disk_rele(vdev_t *vd)
87dcba9f3fSGeorge Wilson {
88dcba9f3fSGeorge Wilson 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
89dcba9f3fSGeorge Wilson 
90dcba9f3fSGeorge Wilson 	if (vd->vdev_name_vp) {
91dcba9f3fSGeorge Wilson 		VN_RELE_ASYNC(vd->vdev_name_vp,
92dcba9f3fSGeorge Wilson 		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
93dcba9f3fSGeorge Wilson 		vd->vdev_name_vp = NULL;
94dcba9f3fSGeorge Wilson 	}
95dcba9f3fSGeorge Wilson 	if (vd->vdev_devid_vp) {
96dcba9f3fSGeorge Wilson 		VN_RELE_ASYNC(vd->vdev_devid_vp,
97dcba9f3fSGeorge Wilson 		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
98dcba9f3fSGeorge Wilson 		vd->vdev_devid_vp = NULL;
99dcba9f3fSGeorge Wilson 	}
100dcba9f3fSGeorge Wilson }
101dcba9f3fSGeorge Wilson 
1024263d13fSGeorge Wilson static uint64_t
1034263d13fSGeorge Wilson vdev_disk_get_space(vdev_t *vd, uint64_t capacity, uint_t blksz)
1044263d13fSGeorge Wilson {
1054263d13fSGeorge Wilson 	ASSERT(vd->vdev_wholedisk);
1064263d13fSGeorge Wilson 
1074263d13fSGeorge Wilson 	vdev_disk_t *dvd = vd->vdev_tsd;
1084263d13fSGeorge Wilson 	dk_efi_t dk_ioc;
1094263d13fSGeorge Wilson 	efi_gpt_t *efi;
1104263d13fSGeorge Wilson 	uint64_t avail_space = 0;
1114263d13fSGeorge Wilson 	int efisize = EFI_LABEL_SIZE * 2;
1124263d13fSGeorge Wilson 
1134263d13fSGeorge Wilson 	dk_ioc.dki_data = kmem_alloc(efisize, KM_SLEEP);
1144263d13fSGeorge Wilson 	dk_ioc.dki_lba = 1;
1154263d13fSGeorge Wilson 	dk_ioc.dki_length = efisize;
1164263d13fSGeorge Wilson 	dk_ioc.dki_data_64 = (uint64_t)(uintptr_t)dk_ioc.dki_data;
1174263d13fSGeorge Wilson 	efi = dk_ioc.dki_data;
1184263d13fSGeorge Wilson 
1194263d13fSGeorge Wilson 	if (ldi_ioctl(dvd->vd_lh, DKIOCGETEFI, (intptr_t)&dk_ioc,
1204263d13fSGeorge Wilson 	    FKIOCTL, kcred, NULL) == 0) {
1214263d13fSGeorge Wilson 		uint64_t efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA);
1224263d13fSGeorge Wilson 
1234263d13fSGeorge Wilson 		zfs_dbgmsg("vdev %s, capacity %llu, altern lba %llu",
1244263d13fSGeorge Wilson 		    vd->vdev_path, capacity, efi_altern_lba);
1254263d13fSGeorge Wilson 		if (capacity > efi_altern_lba)
1264263d13fSGeorge Wilson 			avail_space = (capacity - efi_altern_lba) * blksz;
1274263d13fSGeorge Wilson 	}
1284263d13fSGeorge Wilson 	kmem_free(dk_ioc.dki_data, efisize);
1294263d13fSGeorge Wilson 	return (avail_space);
1304263d13fSGeorge Wilson }
1314263d13fSGeorge Wilson 
132fa9e4066Sahrens static int
1334263d13fSGeorge Wilson vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
1344263d13fSGeorge Wilson     uint64_t *ashift)
135fa9e4066Sahrens {
1368ad4d6ddSJeff Bonwick 	spa_t *spa = vd->vdev_spa;
137fa9e4066Sahrens 	vdev_disk_t *dvd;
138f13665b7Sbo zhou - Sun Microsystems - Beijing China 	struct dk_minfo_ext dkmext;
1390a4e9518Sgw 	int error;
140e14bb325SJeff Bonwick 	dev_t dev;
141e14bb325SJeff Bonwick 	int otyp;
142*fb02ae02SGeorge Wilson 	boolean_t validate_devid = B_FALSE;
143*fb02ae02SGeorge Wilson 	ddi_devid_t devid;
144fa9e4066Sahrens 
145fa9e4066Sahrens 	/*
146fa9e4066Sahrens 	 * We must have a pathname, and it must be absolute.
147fa9e4066Sahrens 	 */
148fa9e4066Sahrens 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
149fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
150be6fd75aSMatthew Ahrens 		return (SET_ERROR(EINVAL));
151fa9e4066Sahrens 	}
152fa9e4066Sahrens 
153095bcd66SGeorge Wilson 	/*
154095bcd66SGeorge Wilson 	 * Reopen the device if it's not currently open. Otherwise,
155095bcd66SGeorge Wilson 	 * just update the physical size of the device.
156095bcd66SGeorge Wilson 	 */
157095bcd66SGeorge Wilson 	if (vd->vdev_tsd != NULL) {
158095bcd66SGeorge Wilson 		ASSERT(vd->vdev_reopening);
159095bcd66SGeorge Wilson 		dvd = vd->vdev_tsd;
160095bcd66SGeorge Wilson 		goto skip_open;
161095bcd66SGeorge Wilson 	}
162095bcd66SGeorge Wilson 
163fa9e4066Sahrens 	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
164fa9e4066Sahrens 
165fa9e4066Sahrens 	/*
166fa9e4066Sahrens 	 * When opening a disk device, we want to preserve the user's original
167fa9e4066Sahrens 	 * intent.  We always want to open the device by the path the user gave
1681724dc7bSJoshua M. Clulow 	 * us, even if it is one of multiple paths to the same device.  But we
169fa9e4066Sahrens 	 * also want to be able to survive disks being removed/recabled.
170fa9e4066Sahrens 	 * Therefore the sequence of opening devices is:
171fa9e4066Sahrens 	 *
172afefbcddSeschrock 	 * 1. Try opening the device by path.  For legacy pools without the
173afefbcddSeschrock 	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
174fa9e4066Sahrens 	 *
175fa9e4066Sahrens 	 * 2. If the devid of the device matches the stored value, return
176fa9e4066Sahrens 	 *    success.
177fa9e4066Sahrens 	 *
178fa9e4066Sahrens 	 * 3. Otherwise, the device may have moved.  Try opening the device
179fa9e4066Sahrens 	 *    by the devid instead.
180fa9e4066Sahrens 	 */
181fa9e4066Sahrens 	if (vd->vdev_devid != NULL) {
182fa9e4066Sahrens 		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
183fa9e4066Sahrens 		    &dvd->vd_minor) != 0) {
184fa9e4066Sahrens 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
185be6fd75aSMatthew Ahrens 			return (SET_ERROR(EINVAL));
186fa9e4066Sahrens 		}
187fa9e4066Sahrens 	}
188fa9e4066Sahrens 
189fa9e4066Sahrens 	error = EINVAL;		/* presume failure */
190fa9e4066Sahrens 
191095bcd66SGeorge Wilson 	if (vd->vdev_path != NULL) {
192fa9e4066Sahrens 
193afefbcddSeschrock 		if (vd->vdev_wholedisk == -1ULL) {
194afefbcddSeschrock 			size_t len = strlen(vd->vdev_path) + 3;
195afefbcddSeschrock 			char *buf = kmem_alloc(len, KM_SLEEP);
196afefbcddSeschrock 			ldi_handle_t lh;
197afefbcddSeschrock 
198afefbcddSeschrock 			(void) snprintf(buf, len, "%ss0", vd->vdev_path);
199afefbcddSeschrock 
2008ad4d6ddSJeff Bonwick 			if (ldi_open_by_name(buf, spa_mode(spa), kcred,
201afefbcddSeschrock 			    &lh, zfs_li) == 0) {
202afefbcddSeschrock 				spa_strfree(vd->vdev_path);
203afefbcddSeschrock 				vd->vdev_path = buf;
204afefbcddSeschrock 				vd->vdev_wholedisk = 1ULL;
2058ad4d6ddSJeff Bonwick 				(void) ldi_close(lh, spa_mode(spa), kcred);
206afefbcddSeschrock 			} else {
207afefbcddSeschrock 				kmem_free(buf, len);
208afefbcddSeschrock 			}
209afefbcddSeschrock 		}
210fa9e4066Sahrens 
2118ad4d6ddSJeff Bonwick 		error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred,
212afefbcddSeschrock 		    &dvd->vd_lh, zfs_li);
213fa9e4066Sahrens 
214fa9e4066Sahrens 		/*
215fa9e4066Sahrens 		 * Compare the devid to the stored value.
216fa9e4066Sahrens 		 */
217fa9e4066Sahrens 		if (error == 0 && vd->vdev_devid != NULL &&
218fa9e4066Sahrens 		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
219fa9e4066Sahrens 			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
220be6fd75aSMatthew Ahrens 				error = SET_ERROR(EINVAL);
2218ad4d6ddSJeff Bonwick 				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
2228ad4d6ddSJeff Bonwick 				    kcred);
223fa9e4066Sahrens 				dvd->vd_lh = NULL;
224fa9e4066Sahrens 			}
225fa9e4066Sahrens 			ddi_devid_free(devid);
226fa9e4066Sahrens 		}
227afefbcddSeschrock 
228afefbcddSeschrock 		/*
229afefbcddSeschrock 		 * If we succeeded in opening the device, but 'vdev_wholedisk'
230afefbcddSeschrock 		 * is not yet set, then this must be a slice.
231afefbcddSeschrock 		 */
232afefbcddSeschrock 		if (error == 0 && vd->vdev_wholedisk == -1ULL)
233afefbcddSeschrock 			vd->vdev_wholedisk = 0;
234fa9e4066Sahrens 	}
235fa9e4066Sahrens 
236fa9e4066Sahrens 	/*
237fa9e4066Sahrens 	 * If we were unable to open by path, or the devid check fails, open by
238fa9e4066Sahrens 	 * devid instead.
239fa9e4066Sahrens 	 */
240*fb02ae02SGeorge Wilson 	if (error != 0 && vd->vdev_devid != NULL) {
241fa9e4066Sahrens 		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
2428ad4d6ddSJeff Bonwick 		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
243*fb02ae02SGeorge Wilson 	}
244fa9e4066Sahrens 
2453d7072f8Seschrock 	/*
2463d7072f8Seschrock 	 * If all else fails, then try opening by physical path (if available)
2473d7072f8Seschrock 	 * or the logical path (if we failed due to the devid check).  While not
2483d7072f8Seschrock 	 * as reliable as the devid, this will give us something, and the higher
2493d7072f8Seschrock 	 * level vdev validation will prevent us from opening the wrong device.
2503d7072f8Seschrock 	 */
2513d7072f8Seschrock 	if (error) {
252*fb02ae02SGeorge Wilson 		if (vd->vdev_devid != NULL)
253*fb02ae02SGeorge Wilson 			validate_devid = B_TRUE;
254*fb02ae02SGeorge Wilson 
2553d7072f8Seschrock 		if (vd->vdev_physpath != NULL &&
256deb8317bSMark J Musante 		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV)
2578ad4d6ddSJeff Bonwick 			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
2583d7072f8Seschrock 			    kcred, &dvd->vd_lh, zfs_li);
2593d7072f8Seschrock 
2603d7072f8Seschrock 		/*
2613d7072f8Seschrock 		 * Note that we don't support the legacy auto-wholedisk support
2623d7072f8Seschrock 		 * as above.  This hasn't been used in a very long time and we
2633d7072f8Seschrock 		 * don't need to propagate its oddities to this edge condition.
2643d7072f8Seschrock 		 */
265095bcd66SGeorge Wilson 		if (error && vd->vdev_path != NULL)
2668ad4d6ddSJeff Bonwick 			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
2678ad4d6ddSJeff Bonwick 			    kcred, &dvd->vd_lh, zfs_li);
2683d7072f8Seschrock 	}
2693d7072f8Seschrock 
270e14bb325SJeff Bonwick 	if (error) {
271fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
272fa9e4066Sahrens 		return (error);
273e14bb325SJeff Bonwick 	}
274fa9e4066Sahrens 
275*fb02ae02SGeorge Wilson 	/*
276*fb02ae02SGeorge Wilson 	 * Now that the device has been successfully opened, update the devid
277*fb02ae02SGeorge Wilson 	 * if necessary.
278*fb02ae02SGeorge Wilson 	 */
279*fb02ae02SGeorge Wilson 	if (validate_devid && spa_writeable(spa) &&
280*fb02ae02SGeorge Wilson 	    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
281*fb02ae02SGeorge Wilson 		if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
282*fb02ae02SGeorge Wilson 			char *vd_devid;
283*fb02ae02SGeorge Wilson 
284*fb02ae02SGeorge Wilson 			vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor);
285*fb02ae02SGeorge Wilson 			zfs_dbgmsg("vdev %s: update devid from %s, "
286*fb02ae02SGeorge Wilson 			    "to %s", vd->vdev_path, vd->vdev_devid, vd_devid);
287*fb02ae02SGeorge Wilson 			spa_strfree(vd->vdev_devid);
288*fb02ae02SGeorge Wilson 			vd->vdev_devid = spa_strdup(vd_devid);
289*fb02ae02SGeorge Wilson 			ddi_devid_str_free(vd_devid);
290*fb02ae02SGeorge Wilson 		}
291*fb02ae02SGeorge Wilson 		ddi_devid_free(devid);
292*fb02ae02SGeorge Wilson 	}
293*fb02ae02SGeorge Wilson 
2943d7072f8Seschrock 	/*
2953d7072f8Seschrock 	 * Once a device is opened, verify that the physical device path (if
2963d7072f8Seschrock 	 * available) is up to date.
2973d7072f8Seschrock 	 */
2983d7072f8Seschrock 	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
2993d7072f8Seschrock 	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
3000a4e9518Sgw 		char *physpath, *minorname;
3010a4e9518Sgw 
3023d7072f8Seschrock 		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3033d7072f8Seschrock 		minorname = NULL;
3043d7072f8Seschrock 		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
3053d7072f8Seschrock 		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
3063d7072f8Seschrock 		    (vd->vdev_physpath == NULL ||
3073d7072f8Seschrock 		    strcmp(vd->vdev_physpath, physpath) != 0)) {
3083d7072f8Seschrock 			if (vd->vdev_physpath)
3093d7072f8Seschrock 				spa_strfree(vd->vdev_physpath);
3103d7072f8Seschrock 			(void) strlcat(physpath, ":", MAXPATHLEN);
3113d7072f8Seschrock 			(void) strlcat(physpath, minorname, MAXPATHLEN);
3123d7072f8Seschrock 			vd->vdev_physpath = spa_strdup(physpath);
3133d7072f8Seschrock 		}
3143d7072f8Seschrock 		if (minorname)
3153d7072f8Seschrock 			kmem_free(minorname, strlen(minorname) + 1);
3163d7072f8Seschrock 		kmem_free(physpath, MAXPATHLEN);
3173d7072f8Seschrock 	}
3183d7072f8Seschrock 
319095bcd66SGeorge Wilson skip_open:
320fa9e4066Sahrens 	/*
321fa9e4066Sahrens 	 * Determine the actual size of the device.
322fa9e4066Sahrens 	 */
323fa9e4066Sahrens 	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
324fa9e4066Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
325be6fd75aSMatthew Ahrens 		return (SET_ERROR(EINVAL));
326fa9e4066Sahrens 	}
327fa9e4066Sahrens 
328ecc2d604Sbonwick 	/*
329ecc2d604Sbonwick 	 * Determine the device's minimum transfer size.
330ecc2d604Sbonwick 	 * If the ioctl isn't supported, assume DEV_BSIZE.
331ecc2d604Sbonwick 	 */
332f13665b7Sbo zhou - Sun Microsystems - Beijing China 	if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, (intptr_t)&dkmext,
333ecc2d604Sbonwick 	    FKIOCTL, kcred, NULL) != 0)
334f13665b7Sbo zhou - Sun Microsystems - Beijing China 		dkmext.dki_pbsize = DEV_BSIZE;
335bef6b7d2Swebaker 
336f13665b7Sbo zhou - Sun Microsystems - Beijing China 	*ashift = highbit(MAX(dkmext.dki_pbsize, SPA_MINBLOCKSIZE)) - 1;
337bef6b7d2Swebaker 
3384263d13fSGeorge Wilson 	if (vd->vdev_wholedisk == 1) {
3394263d13fSGeorge Wilson 		uint64_t capacity = dkmext.dki_capacity - 1;
3404263d13fSGeorge Wilson 		uint64_t blksz = dkmext.dki_lbsize;
3414263d13fSGeorge Wilson 		int wce = 1;
3424263d13fSGeorge Wilson 
3434263d13fSGeorge Wilson 		/*
3444263d13fSGeorge Wilson 		 * If we own the whole disk, try to enable disk write caching.
3454263d13fSGeorge Wilson 		 * We ignore errors because it's OK if we can't do it.
3464263d13fSGeorge Wilson 		 */
3474263d13fSGeorge Wilson 		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
3484263d13fSGeorge Wilson 		    FKIOCTL, kcred, NULL);
3494263d13fSGeorge Wilson 
3504263d13fSGeorge Wilson 		*max_psize = *psize + vdev_disk_get_space(vd, capacity, blksz);
3514263d13fSGeorge Wilson 		zfs_dbgmsg("capacity change: vdev %s, psize %llu, "
3524263d13fSGeorge Wilson 		    "max_psize %llu", vd->vdev_path, *psize, *max_psize);
3534263d13fSGeorge Wilson 	} else {
3544263d13fSGeorge Wilson 		*max_psize = *psize;
3554263d13fSGeorge Wilson 	}
3564263d13fSGeorge Wilson 
357b468a217Seschrock 	/*
358b468a217Seschrock 	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
359b468a217Seschrock 	 * try again.
360b468a217Seschrock 	 */
361b468a217Seschrock 	vd->vdev_nowritecache = B_FALSE;
362b468a217Seschrock 
363fa9e4066Sahrens 	return (0);
364fa9e4066Sahrens }
365fa9e4066Sahrens 
366fa9e4066Sahrens static void
367fa9e4066Sahrens vdev_disk_close(vdev_t *vd)
368fa9e4066Sahrens {
369fa9e4066Sahrens 	vdev_disk_t *dvd = vd->vdev_tsd;
370fa9e4066Sahrens 
371095bcd66SGeorge Wilson 	if (vd->vdev_reopening || dvd == NULL)
372fa9e4066Sahrens 		return;
373fa9e4066Sahrens 
374fa9e4066Sahrens 	if (dvd->vd_minor != NULL)
375fa9e4066Sahrens 		ddi_devid_str_free(dvd->vd_minor);
376fa9e4066Sahrens 
377fa9e4066Sahrens 	if (dvd->vd_devid != NULL)
378fa9e4066Sahrens 		ddi_devid_free(dvd->vd_devid);
379fa9e4066Sahrens 
380fa9e4066Sahrens 	if (dvd->vd_lh != NULL)
3818ad4d6ddSJeff Bonwick 		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
382fa9e4066Sahrens 
38398d1cbfeSGeorge Wilson 	vd->vdev_delayed_close = B_FALSE;
384fa9e4066Sahrens 	kmem_free(dvd, sizeof (vdev_disk_t));
385fa9e4066Sahrens 	vd->vdev_tsd = NULL;
386fa9e4066Sahrens }
387fa9e4066Sahrens 
388e7cbe64fSgw int
389e7cbe64fSgw vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size,
390e7cbe64fSgw     uint64_t offset, int flags)
391e7cbe64fSgw {
392e7cbe64fSgw 	buf_t *bp;
393e7cbe64fSgw 	int error = 0;
394e7cbe64fSgw 
395e7cbe64fSgw 	if (vd_lh == NULL)
396be6fd75aSMatthew Ahrens 		return (SET_ERROR(EINVAL));
397e7cbe64fSgw 
398e7cbe64fSgw 	ASSERT(flags & B_READ || flags & B_WRITE);
399e7cbe64fSgw 
400e7cbe64fSgw 	bp = getrbuf(KM_SLEEP);
401e7cbe64fSgw 	bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
402e7cbe64fSgw 	bp->b_bcount = size;
403e7cbe64fSgw 	bp->b_un.b_addr = (void *)data;
404e7cbe64fSgw 	bp->b_lblkno = lbtodb(offset);
405e7cbe64fSgw 	bp->b_bufsize = size;
406e7cbe64fSgw 
407e7cbe64fSgw 	error = ldi_strategy(vd_lh, bp);
408e7cbe64fSgw 	ASSERT(error == 0);
409e7cbe64fSgw 	if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
410be6fd75aSMatthew Ahrens 		error = SET_ERROR(EIO);
411e7cbe64fSgw 	freerbuf(bp);
412e7cbe64fSgw 
413e7cbe64fSgw 	return (error);
414e7cbe64fSgw }
415e7cbe64fSgw 
416fa9e4066Sahrens static void
417fa9e4066Sahrens vdev_disk_io_intr(buf_t *bp)
418fa9e4066Sahrens {
41931d7e8faSGeorge Wilson 	vdev_buf_t *vb = (vdev_buf_t *)bp;
42031d7e8faSGeorge Wilson 	zio_t *zio = vb->vb_io;
421fa9e4066Sahrens 
42251ece835Seschrock 	/*
42351ece835Seschrock 	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
42451ece835Seschrock 	 * Rather than teach the rest of the stack about other error
42551ece835Seschrock 	 * possibilities (EFAULT, etc), we normalize the error value here.
42651ece835Seschrock 	 */
42751ece835Seschrock 	zio->io_error = (geterror(bp) != 0 ? EIO : 0);
42851ece835Seschrock 
42951ece835Seschrock 	if (zio->io_error == 0 && bp->b_resid != 0)
430be6fd75aSMatthew Ahrens 		zio->io_error = SET_ERROR(EIO);
431fa9e4066Sahrens 
43231d7e8faSGeorge Wilson 	kmem_free(vb, sizeof (vdev_buf_t));
433fa9e4066Sahrens 
434e05725b1Sbonwick 	zio_interrupt(zio);
435fa9e4066Sahrens }
436fa9e4066Sahrens 
437f4a72450SJeff Bonwick static void
438f4a72450SJeff Bonwick vdev_disk_ioctl_free(zio_t *zio)
439f4a72450SJeff Bonwick {
440f4a72450SJeff Bonwick 	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
441f4a72450SJeff Bonwick }
442f4a72450SJeff Bonwick 
44322fe2c88SJonathan Adams static const zio_vsd_ops_t vdev_disk_vsd_ops = {
44422fe2c88SJonathan Adams 	vdev_disk_ioctl_free,
44522fe2c88SJonathan Adams 	zio_vsd_default_cksum_report
44622fe2c88SJonathan Adams };
44722fe2c88SJonathan Adams 
448fa9e4066Sahrens static void
449fa9e4066Sahrens vdev_disk_ioctl_done(void *zio_arg, int error)
450fa9e4066Sahrens {
451fa9e4066Sahrens 	zio_t *zio = zio_arg;
452fa9e4066Sahrens 
453fa9e4066Sahrens 	zio->io_error = error;
454fa9e4066Sahrens 
455e05725b1Sbonwick 	zio_interrupt(zio);
456fa9e4066Sahrens }
457fa9e4066Sahrens 
458e05725b1Sbonwick static int
459fa9e4066Sahrens vdev_disk_io_start(zio_t *zio)
460fa9e4066Sahrens {
461fa9e4066Sahrens 	vdev_t *vd = zio->io_vd;
462fa9e4066Sahrens 	vdev_disk_t *dvd = vd->vdev_tsd;
46331d7e8faSGeorge Wilson 	vdev_buf_t *vb;
464e14bb325SJeff Bonwick 	struct dk_callback *dkc;
465fa9e4066Sahrens 	buf_t *bp;
466e14bb325SJeff Bonwick 	int error;
467fa9e4066Sahrens 
468fa9e4066Sahrens 	if (zio->io_type == ZIO_TYPE_IOCTL) {
469fa9e4066Sahrens 		/* XXPOLICY */
4700a4e9518Sgw 		if (!vdev_readable(vd)) {
471be6fd75aSMatthew Ahrens 			zio->io_error = SET_ERROR(ENXIO);
472e05725b1Sbonwick 			return (ZIO_PIPELINE_CONTINUE);
473fa9e4066Sahrens 		}
474fa9e4066Sahrens 
475fa9e4066Sahrens 		switch (zio->io_cmd) {
476fa9e4066Sahrens 
477fa9e4066Sahrens 		case DKIOCFLUSHWRITECACHE:
478fa9e4066Sahrens 
479a2eea2e1Sahrens 			if (zfs_nocacheflush)
480a2eea2e1Sahrens 				break;
481a2eea2e1Sahrens 
482b468a217Seschrock 			if (vd->vdev_nowritecache) {
483be6fd75aSMatthew Ahrens 				zio->io_error = SET_ERROR(ENOTSUP);
484b468a217Seschrock 				break;
485b468a217Seschrock 			}
486b468a217Seschrock 
487e14bb325SJeff Bonwick 			zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
48822fe2c88SJonathan Adams 			zio->io_vsd_ops = &vdev_disk_vsd_ops;
489e14bb325SJeff Bonwick 
490e14bb325SJeff Bonwick 			dkc->dkc_callback = vdev_disk_ioctl_done;
491e14bb325SJeff Bonwick 			dkc->dkc_flag = FLUSH_VOLATILE;
492e14bb325SJeff Bonwick 			dkc->dkc_cookie = zio;
493fa9e4066Sahrens 
494fa9e4066Sahrens 			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
495e14bb325SJeff Bonwick 			    (uintptr_t)dkc, FKIOCTL, kcred, NULL);
496fa9e4066Sahrens 
497fa9e4066Sahrens 			if (error == 0) {
498fa9e4066Sahrens 				/*
499fa9e4066Sahrens 				 * The ioctl will be done asychronously,
500fa9e4066Sahrens 				 * and will call vdev_disk_ioctl_done()
501fa9e4066Sahrens 				 * upon completion.
502fa9e4066Sahrens 				 */
503e05725b1Sbonwick 				return (ZIO_PIPELINE_STOP);
504e05725b1Sbonwick 			}
505e05725b1Sbonwick 
506e05725b1Sbonwick 			if (error == ENOTSUP || error == ENOTTY) {
507b468a217Seschrock 				/*
508d5782879Smishra 				 * If we get ENOTSUP or ENOTTY, we know that
509d5782879Smishra 				 * no future attempts will ever succeed.
510d5782879Smishra 				 * In this case we set a persistent bit so
511d5782879Smishra 				 * that we don't bother with the ioctl in the
512d5782879Smishra 				 * future.
513b468a217Seschrock 				 */
514b468a217Seschrock 				vd->vdev_nowritecache = B_TRUE;
515fa9e4066Sahrens 			}
516fa9e4066Sahrens 			zio->io_error = error;
517b468a217Seschrock 
518fa9e4066Sahrens 			break;
519fa9e4066Sahrens 
520fa9e4066Sahrens 		default:
521be6fd75aSMatthew Ahrens 			zio->io_error = SET_ERROR(ENOTSUP);
522fa9e4066Sahrens 		}
523fa9e4066Sahrens 
524e05725b1Sbonwick 		return (ZIO_PIPELINE_CONTINUE);
525fa9e4066Sahrens 	}
526fa9e4066Sahrens 
52731d7e8faSGeorge Wilson 	vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
528fa9e4066Sahrens 
52931d7e8faSGeorge Wilson 	vb->vb_io = zio;
53031d7e8faSGeorge Wilson 	bp = &vb->vb_buf;
531fa9e4066Sahrens 
532fa9e4066Sahrens 	bioinit(bp);
533e14bb325SJeff Bonwick 	bp->b_flags = B_BUSY | B_NOCACHE |
5348956713aSEric Schrock 	    (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
5358956713aSEric Schrock 	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
5368956713aSEric Schrock 		bp->b_flags |= B_FAILFAST;
537fa9e4066Sahrens 	bp->b_bcount = zio->io_size;
538fa9e4066Sahrens 	bp->b_un.b_addr = zio->io_data;
539fa9e4066Sahrens 	bp->b_lblkno = lbtodb(zio->io_offset);
540fa9e4066Sahrens 	bp->b_bufsize = zio->io_size;
541fa9e4066Sahrens 	bp->b_iodone = (int (*)())vdev_disk_io_intr;
542fa9e4066Sahrens 
543fa9e4066Sahrens 	/* ldi_strategy() will return non-zero only on programming errors */
544e14bb325SJeff Bonwick 	VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
545e05725b1Sbonwick 
546e05725b1Sbonwick 	return (ZIO_PIPELINE_STOP);
547fa9e4066Sahrens }
548fa9e4066Sahrens 
549e14bb325SJeff Bonwick static void
550fa9e4066Sahrens vdev_disk_io_done(zio_t *zio)
551fa9e4066Sahrens {
552e14bb325SJeff Bonwick 	vdev_t *vd = zio->io_vd;
553ea8dc4b6Seschrock 
5543d7072f8Seschrock 	/*
5553d7072f8Seschrock 	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
5563d7072f8Seschrock 	 * the device has been removed.  If this is the case, then we trigger an
5570a4e9518Sgw 	 * asynchronous removal of the device. Otherwise, probe the device and
5581f7ad2e1Sgw 	 * make sure it's still accessible.
5593d7072f8Seschrock 	 */
5601d713200SEric Schrock 	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
5610a4e9518Sgw 		vdev_disk_t *dvd = vd->vdev_tsd;
562e14bb325SJeff Bonwick 		int state = DKIO_NONE;
5630a4e9518Sgw 
564e14bb325SJeff Bonwick 		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
565e14bb325SJeff Bonwick 		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
5661d713200SEric Schrock 			/*
5671d713200SEric Schrock 			 * We post the resource as soon as possible, instead of
5681d713200SEric Schrock 			 * when the async removal actually happens, because the
5691d713200SEric Schrock 			 * DE is using this information to discard previous I/O
5701d713200SEric Schrock 			 * errors.
5711d713200SEric Schrock 			 */
5721d713200SEric Schrock 			zfs_post_remove(zio->io_spa, vd);
5733d7072f8Seschrock 			vd->vdev_remove_wanted = B_TRUE;
5743d7072f8Seschrock 			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
57598d1cbfeSGeorge Wilson 		} else if (!vd->vdev_delayed_close) {
57698d1cbfeSGeorge Wilson 			vd->vdev_delayed_close = B_TRUE;
5773d7072f8Seschrock 		}
5783d7072f8Seschrock 	}
579fa9e4066Sahrens }
580fa9e4066Sahrens 
581fa9e4066Sahrens vdev_ops_t vdev_disk_ops = {
582fa9e4066Sahrens 	vdev_disk_open,
583fa9e4066Sahrens 	vdev_disk_close,
584fa9e4066Sahrens 	vdev_default_asize,
585fa9e4066Sahrens 	vdev_disk_io_start,
586fa9e4066Sahrens 	vdev_disk_io_done,
587fa9e4066Sahrens 	NULL,
588dcba9f3fSGeorge Wilson 	vdev_disk_hold,
589dcba9f3fSGeorge Wilson 	vdev_disk_rele,
590fa9e4066Sahrens 	VDEV_TYPE_DISK,		/* name of this vdev type */
591fa9e4066Sahrens 	B_TRUE			/* leaf vdev */
592fa9e4066Sahrens };
593e7cbe64fSgw 
594e7cbe64fSgw /*
595051aabe6Staylor  * Given the root disk device devid or pathname, read the label from
596051aabe6Staylor  * the device, and construct a configuration nvlist.
597e7cbe64fSgw  */
598f940fbb1SLin Ling int
599f940fbb1SLin Ling vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
600e7cbe64fSgw {
601e7cbe64fSgw 	ldi_handle_t vd_lh;
602e7cbe64fSgw 	vdev_label_t *label;
603e7cbe64fSgw 	uint64_t s, size;
604e7cbe64fSgw 	int l;
605051aabe6Staylor 	ddi_devid_t tmpdevid;
606f4565e39SLin Ling 	int error = -1;
607051aabe6Staylor 	char *minor_name;
608e7cbe64fSgw 
609e7cbe64fSgw 	/*
610e7cbe64fSgw 	 * Read the device label and build the nvlist.
611e7cbe64fSgw 	 */
612f4565e39SLin Ling 	if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
613051aabe6Staylor 	    &minor_name) == 0) {
614051aabe6Staylor 		error = ldi_open_by_devid(tmpdevid, minor_name,
6158ad4d6ddSJeff Bonwick 		    FREAD, kcred, &vd_lh, zfs_li);
616051aabe6Staylor 		ddi_devid_free(tmpdevid);
617051aabe6Staylor 		ddi_devid_str_free(minor_name);
618051aabe6Staylor 	}
619051aabe6Staylor 
620f4565e39SLin Ling 	if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
621f4565e39SLin Ling 	    zfs_li)))
622f940fbb1SLin Ling 		return (error);
623e7cbe64fSgw 
624bf82a41bSeschrock 	if (ldi_get_size(vd_lh, &s)) {
625bf82a41bSeschrock 		(void) ldi_close(vd_lh, FREAD, kcred);
626be6fd75aSMatthew Ahrens 		return (SET_ERROR(EIO));
627bf82a41bSeschrock 	}
628e7cbe64fSgw 
629e7cbe64fSgw 	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
630e7cbe64fSgw 	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
631e7cbe64fSgw 
63217f1e64aSEric Taylor 	*config = NULL;
633e7cbe64fSgw 	for (l = 0; l < VDEV_LABELS; l++) {
634e7cbe64fSgw 		uint64_t offset, state, txg = 0;
635e7cbe64fSgw 
636e7cbe64fSgw 		/* read vdev label */
637e7cbe64fSgw 		offset = vdev_label_offset(size, l, 0);
638e7cbe64fSgw 		if (vdev_disk_physio(vd_lh, (caddr_t)label,
6392264ca7fSLin Ling 		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
640e7cbe64fSgw 			continue;
641e7cbe64fSgw 
642e7cbe64fSgw 		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
643f940fbb1SLin Ling 		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
644f940fbb1SLin Ling 			*config = NULL;
645e7cbe64fSgw 			continue;
646e7cbe64fSgw 		}
647e7cbe64fSgw 
648f940fbb1SLin Ling 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
649e7cbe64fSgw 		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
650f940fbb1SLin Ling 			nvlist_free(*config);
651f940fbb1SLin Ling 			*config = NULL;
652e7cbe64fSgw 			continue;
653e7cbe64fSgw 		}
654e7cbe64fSgw 
655f940fbb1SLin Ling 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
656e7cbe64fSgw 		    &txg) != 0 || txg == 0) {
657f940fbb1SLin Ling 			nvlist_free(*config);
658f940fbb1SLin Ling 			*config = NULL;
659e7cbe64fSgw 			continue;
660e7cbe64fSgw 		}
661e7cbe64fSgw 
662e7cbe64fSgw 		break;
663e7cbe64fSgw 	}
664e7cbe64fSgw 
665e7cbe64fSgw 	kmem_free(label, sizeof (vdev_label_t));
666bf82a41bSeschrock 	(void) ldi_close(vd_lh, FREAD, kcred);
66717f1e64aSEric Taylor 	if (*config == NULL)
668be6fd75aSMatthew Ahrens 		error = SET_ERROR(EIDRM);
669bf82a41bSeschrock 
670f940fbb1SLin Ling 	return (error);
671e7cbe64fSgw }
672