/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
 * Copyright 2020 Joyent, Inc.
 * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org>
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/refcount.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>
#include <sys/efi_partition.h>
#include <sys/fm/fs/zfs.h>
#include <sys/ddi.h>

/*
 * Tunable to disable TRIM in case we're using a problematic SSD.
 */
uint_t zfs_no_trim = 0;

/*
 * Tunable parameter for debugging or performance analysis. Setting this
 * will cause pool corruption on power loss if a volatile out-of-order
 * write cache is enabled.
 */
boolean_t zfs_nocacheflush = B_FALSE;
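/*
 * As a usage sketch, both tunables above can be set at boot via
 * /etc/system, e.g.:
 *
 *	set zfs:zfs_no_trim = 1
 *	set zfs:zfs_nocacheflush = 1
 */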

/*
 * Virtual device vector for disks.
 */

extern ldi_ident_t zfs_li;

static void vdev_disk_close(vdev_t *);

typedef struct vdev_disk {
	ddi_devid_t	vd_devid;	/* device id for this disk */
	char		*vd_minor;	/* minor name of the devid */
	ldi_handle_t	vd_lh;		/* LDI handle for the open device */
	list_t		vd_ldi_cbs;	/* registered LDI event callbacks */
	boolean_t	vd_ldi_offline;	/* device has gone offline */
} vdev_disk_t;

typedef struct vdev_disk_buf {
	buf_t	vdb_buf;
	zio_t	*vdb_io;
} vdev_disk_buf_t;

typedef struct vdev_disk_ldi_cb {
	list_node_t		lcb_next;
	ldi_callback_id_t	lcb_id;
} vdev_disk_ldi_cb_t;

/*
 * Bypass the devid when opening a disk vdev.
 * There have been issues where the devids of several devices were shuffled,
 * causing pool open failures. Note that this flag is intended to be used
 * for pool recovery only.
 *
 * Note that if a pool is imported with the devids bypassed, all its vdevs will
 * cease storing devid information permanently. In practice, the devid is rarely
 * useful as vdev paths do not tend to change unless the hardware is
 * reconfigured. That said, if the paths do change and a pool fails to open
 * automatically at boot, a simple zpool import should re-scan the paths and fix
 * the issue.
 */
boolean_t vdev_disk_bypass_devid = B_FALSE;

static void
vdev_disk_alloc(vdev_t *vd)
{
	vdev_disk_t *dvd;

	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
	/*
	 * Create the LDI event callback list.
	 */
	list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
	    offsetof(vdev_disk_ldi_cb_t, lcb_next));
}

static void
vdev_disk_free(vdev_t *vd)
{
	vdev_disk_t *dvd = vd->vdev_tsd;
	vdev_disk_ldi_cb_t *lcb;

	if (dvd == NULL)
		return;

	/*
	 * We have already closed the LDI handle. Clean up the LDI event
	 * callbacks and free vd->vdev_tsd.
	 */
	while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
		list_remove(&dvd->vd_ldi_cbs, lcb);
		(void) ldi_ev_remove_callbacks(lcb->lcb_id);
		kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
	}
	list_destroy(&dvd->vd_ldi_cbs);
	kmem_free(dvd, sizeof (vdev_disk_t));
	vd->vdev_tsd = NULL;
}

static int
vdev_disk_off_notify(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie,
    void *arg, void *ev_data __unused)
{
	vdev_t *vd = (vdev_t *)arg;
	vdev_disk_t *dvd = vd->vdev_tsd;

	/*
	 * Ignore events other than offline.
	 */
	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
		return (LDI_EV_SUCCESS);

	/*
	 * Tell any new threads that stumble upon this vdev that they should not
	 * try to do I/O.
	 */
	dvd->vd_ldi_offline = B_TRUE;

	/*
	 * Request that the spa_async_thread mark the device as REMOVED and
	 * notify FMA of the removal.  This should also trigger a vdev_close()
	 * in the async thread.
	 */
	zfs_post_remove(vd->vdev_spa, vd);
	vd->vdev_remove_wanted = B_TRUE;
	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);

	return (LDI_EV_SUCCESS);
}

static void
vdev_disk_off_finalize(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie,
    int ldi_result, void *arg, void *ev_data __unused)
{
	vdev_t *vd = (vdev_t *)arg;

	/*
	 * Ignore events other than offline.
	 */
	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
		return;

	/*
	 * Request that the vdev be reopened if the offline state change was
	 * unsuccessful.
	 */
	if (ldi_result != LDI_EV_SUCCESS) {
		vd->vdev_probe_wanted = B_TRUE;
		spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE);
	}
}

static ldi_ev_callback_t vdev_disk_off_callb = {
	.cb_vers = LDI_EV_CB_VERS,
	.cb_notify = vdev_disk_off_notify,
	.cb_finalize = vdev_disk_off_finalize
};

static void
vdev_disk_dgrd_finalize(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie,
    int ldi_result, void *arg, void *ev_data __unused)
{
	vdev_t *vd = (vdev_t *)arg;

	/*
	 * Ignore events other than degrade.
	 */
	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0)
		return;

	/*
	 * Degrade events always succeed. Mark the vdev as degraded.
	 * This status is purely informative for the user.
	 */
	(void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0);
}

static ldi_ev_callback_t vdev_disk_dgrd_callb = {
	.cb_vers = LDI_EV_CB_VERS,
	.cb_notify = NULL,
	.cb_finalize = vdev_disk_dgrd_finalize
};

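/*
 * Take vnode holds for the vdev's path (and devid, if present) so the
 * underlying device nodes stay available; vdev_disk_rele() drops them.
 */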
static void
vdev_disk_hold(vdev_t *vd)
{
	ddi_devid_t devid;
	char *minor;

	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

	if (vd->vdev_wholedisk == -1ULL) {
		size_t len = strlen(vd->vdev_path) + 3;
		char *buf = kmem_alloc(len, KM_SLEEP);

		(void) snprintf(buf, len, "%ss0", vd->vdev_path);

		(void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
		kmem_free(buf, len);
	}

	if (vd->vdev_name_vp == NULL)
		(void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);

	if (vd->vdev_devid != NULL &&
	    ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
		(void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
		ddi_devid_str_free(minor);
		ddi_devid_free(devid);
	}
}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	if (vd->vdev_name_vp) {
		VN_RELE_ASYNC(vd->vdev_name_vp,
		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
		vd->vdev_name_vp = NULL;
	}
	if (vd->vdev_devid_vp) {
		VN_RELE_ASYNC(vd->vdev_devid_vp,
		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
		vd->vdev_devid_vp = NULL;
	}
}

/*
 * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when
 * even a fallback to DKIOCGMEDIAINFO fails.
 */
#ifdef DEBUG
#define	VDEV_DEBUG(...)	cmn_err(CE_NOTE, __VA_ARGS__)
#else
#define	VDEV_DEBUG(...)	/* Nothing... */
#endif

static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	spa_t *spa = vd->vdev_spa;
	vdev_disk_t *dvd = vd->vdev_tsd;
	ldi_ev_cookie_t ecookie;
	vdev_disk_ldi_cb_t *lcb;
	union {
		struct dk_minfo_ext ude;
		struct dk_minfo ud;
	} dks;
	struct dk_minfo_ext *dkmext = &dks.ude;
	struct dk_minfo *dkm = &dks.ud;
	int error, can_free;
	dev_t dev;
	int otyp;
	boolean_t validate_devid = B_FALSE;
	uint64_t capacity = 0, blksz = 0, pbsize;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it's not currently open. Otherwise,
	 * just update the physical size of the device.
	 */
	if (dvd != NULL) {
		ASSERT(vd->vdev_reopening);
		goto skip_open;
	}

	/*
	 * Create vd->vdev_tsd.
	 */
	vdev_disk_alloc(vd);
	dvd = vd->vdev_tsd;

	/*
	 * Allow bypassing the devid.
	 */
	if (vd->vdev_devid != NULL && vdev_disk_bypass_devid) {
		vdev_dbgmsg(vd, "vdev_disk_open, devid %s bypassed",
		    vd->vdev_devid);
		spa_strfree(vd->vdev_devid);
		vd->vdev_devid = NULL;
	}

	/*
	 * When opening a disk device, we want to preserve the user's original
	 * intent.  We always want to open the device by the path the user gave
	 * us, even if it is one of multiple paths to the same device.  But we
	 * also want to be able to survive disks being removed/recabled.
	 * Therefore the sequence of opening devices is:
	 *
	 * 1. Try opening the device by path.  For legacy pools without the
	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
	 *
	 * 2. If the devid of the device matches the stored value, return
	 *    success.
	 *
	 * 3. Otherwise, the device may have moved.  Try opening the device
	 *    by the devid instead.
	 */
	if (vd->vdev_devid != NULL) {
		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
		    &dvd->vd_minor) != 0) {
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			vdev_dbgmsg(vd, "vdev_disk_open: invalid "
			    "vdev_devid '%s'", vd->vdev_devid);
			return (SET_ERROR(EINVAL));
		}
	}

	error = EINVAL;		/* presume failure */

	if (vd->vdev_path != NULL) {
		if (vd->vdev_wholedisk == -1ULL) {
			size_t len = strlen(vd->vdev_path) + 3;
			char *buf = kmem_alloc(len, KM_SLEEP);

			(void) snprintf(buf, len, "%ss0", vd->vdev_path);

			error = ldi_open_by_name(buf, spa_mode(spa), kcred,
			    &dvd->vd_lh, zfs_li);
			if (error == 0) {
				spa_strfree(vd->vdev_path);
				vd->vdev_path = buf;
				vd->vdev_wholedisk = 1ULL;
			} else {
				kmem_free(buf, len);
			}
		}

		/*
		 * If we have not yet opened the device, try to open it by the
		 * specified path.
		 */
		if (error != 0) {
			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);
		}

		/*
		 * Compare the devid to the stored value.
		 */
		if (error == 0 && vd->vdev_devid != NULL) {
			ddi_devid_t devid = NULL;

			if (ldi_get_devid(dvd->vd_lh, &devid) != 0) {
				/*
				 * We expected a devid on this device but it no
				 * longer appears to have one.  The validation
				 * step may need to remove it from the
				 * configuration.
				 */
				validate_devid = B_TRUE;

			} else if (ddi_devid_compare(devid, dvd->vd_devid) !=
			    0) {
				/*
				 * A mismatch here is unexpected; log it.
				 */
				char *devid_str = ddi_devid_str_encode(devid,
				    dvd->vd_minor);
				vdev_dbgmsg(vd, "vdev_disk_open: devid "
				    "mismatch: %s != %s", vd->vdev_devid,
				    devid_str);
				cmn_err(CE_NOTE, "vdev_disk_open %s: devid "
				    "mismatch: %s != %s", vd->vdev_path,
				    vd->vdev_devid, devid_str);
				ddi_devid_str_free(devid_str);

				error = SET_ERROR(EINVAL);
				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
				    kcred);
				dvd->vd_lh = NULL;
			}

			if (devid != NULL) {
				ddi_devid_free(devid);
			}
		}

		/*
		 * If we succeeded in opening the device, but 'vdev_wholedisk'
		 * is not yet set, then this must be a slice.
		 */
		if (error == 0 && vd->vdev_wholedisk == -1ULL)
			vd->vdev_wholedisk = 0;
	}

	/*
	 * If we were unable to open by path, or the devid check fails, open by
	 * devid instead.
	 */
	if (error != 0 && vd->vdev_devid != NULL) {
		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
		if (error != 0) {
			vdev_dbgmsg(vd, "Failed to open by devid (%s)",
			    vd->vdev_devid);
		}
	}

	/*
	 * If all else fails, then try opening by physical path (if available)
	 * or the logical path (if we failed due to the devid check).  While not
	 * as reliable as the devid, this will give us something, and the higher
	 * level vdev validation will prevent us from opening the wrong device.
	 */
	if (error != 0) {
		validate_devid = B_TRUE;

		if (vd->vdev_physpath != NULL &&
		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) {
			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);
		}

		/*
		 * Note that we don't support the legacy auto-wholedisk support
		 * as above.  This hasn't been used in a very long time and we
		 * don't need to propagate its oddities to this edge condition.
		 */
		if (error != 0 && vd->vdev_path != NULL) {
			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);
		}
	}

	/*
	 * If this is early in boot, a sweep of available block devices may
	 * locate an alternative path that we can try.
	 */
	if (error != 0) {
		const char *altdevpath = vdev_disk_preroot_lookup(
		    spa_guid(spa), vd->vdev_guid);

		if (altdevpath != NULL) {
			vdev_dbgmsg(vd, "Trying alternate preroot path (%s)",
			    altdevpath);

			validate_devid = B_TRUE;

			if ((error = ldi_open_by_name((char *)altdevpath,
			    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li)) != 0) {
				vdev_dbgmsg(vd, "Failed to open by preroot "
				    "path (%s)", altdevpath);
			}
		}
	}

	if (error != 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		vdev_dbgmsg(vd, "vdev_disk_open: failed to open [error=%d]",
		    error);
		return (error);
	}

	/*
	 * Now that the device has been successfully opened, update the devid
	 * if necessary.
	 */
	if (validate_devid) {
		ddi_devid_t devid = NULL;
		char *minorname = NULL;
		char *vd_devid = NULL;
		boolean_t remove = B_FALSE, update = B_FALSE;

		/*
		 * Get the current devid and minor name for the device we
		 * opened.
		 */
		if (ldi_get_devid(dvd->vd_lh, &devid) != 0 ||
		    ldi_get_minor_name(dvd->vd_lh, &minorname) != 0) {
			/*
			 * If we are unable to get the devid or the minor name
			 * for the device, we need to remove them from the
			 * configuration to prevent potential inconsistencies.
			 */
			if (dvd->vd_minor != NULL || dvd->vd_devid != NULL ||
			    vd->vdev_devid != NULL) {
				/*
				 * We only need to remove the devid if one
				 * exists.
				 */
				remove = B_TRUE;
			}

		} else if (dvd->vd_devid == NULL || dvd->vd_minor == NULL) {
			/*
			 * There was previously no devid at all so we need to
			 * add one.
			 */
			update = B_TRUE;

		} else if (ddi_devid_compare(devid, dvd->vd_devid) != 0 ||
		    strcmp(minorname, dvd->vd_minor) != 0) {
			/*
			 * The devid or minor name on file does not match the
			 * one from the opened device.
			 */
			update = B_TRUE;
		}

		if (update) {
			/*
			 * Render the new devid and minor name as a string for
			 * logging and to store in the vdev configuration.
			 */
			vd_devid = ddi_devid_str_encode(devid, minorname);
		}

		if (update || remove) {
			vdev_dbgmsg(vd, "vdev_disk_open: update devid from "
			    "'%s' to '%s'",
			    vd->vdev_devid != NULL ? vd->vdev_devid : "<none>",
			    vd_devid != NULL ? vd_devid : "<none>");
			cmn_err(CE_NOTE, "vdev_disk_open %s: update devid "
			    "from '%s' to '%s'",
			    vd->vdev_path != NULL ? vd->vdev_path : "?",
			    vd->vdev_devid != NULL ? vd->vdev_devid : "<none>",
			    vd_devid != NULL ? vd_devid : "<none>");

			/*
			 * Remove and free any existing values.
			 */
			if (dvd->vd_minor != NULL) {
				ddi_devid_str_free(dvd->vd_minor);
				dvd->vd_minor = NULL;
			}
			if (dvd->vd_devid != NULL) {
				ddi_devid_free(dvd->vd_devid);
				dvd->vd_devid = NULL;
			}
			if (vd->vdev_devid != NULL) {
				spa_strfree(vd->vdev_devid);
				vd->vdev_devid = NULL;
			}
		}

		if (update) {
			/*
			 * Install the new values.
			 */
			vd->vdev_devid = vd_devid;
			dvd->vd_minor = minorname;
			dvd->vd_devid = devid;

		} else {
			if (devid != NULL) {
				ddi_devid_free(devid);
			}
			if (minorname != NULL) {
				kmem_free(minorname, strlen(minorname) + 1);
			}
		}
	}

	/*
	 * Once a device is opened, verify that the physical device path (if
	 * available) is up to date.
	 */
	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
		char *physpath, *minorname;

		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		minorname = NULL;
		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
		    (vd->vdev_physpath == NULL ||
		    strcmp(vd->vdev_physpath, physpath) != 0)) {
			if (vd->vdev_physpath)
				spa_strfree(vd->vdev_physpath);
			(void) strlcat(physpath, ":", MAXPATHLEN);
			(void) strlcat(physpath, minorname, MAXPATHLEN);
			vd->vdev_physpath = spa_strdup(physpath);
		}
		if (minorname)
			kmem_free(minorname, strlen(minorname) + 1);
		kmem_free(physpath, MAXPATHLEN);
	}

	/*
	 * Register callbacks for the LDI offline event.
	 */
	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) ==
	    LDI_EV_SUCCESS) {
		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
		    &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id);
	}

	/*
	 * Register callbacks for the LDI degrade event.
	 */
	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) ==
	    LDI_EV_SUCCESS) {
		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
		    &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id);
	}

skip_open:
	/*
	 * Determine the actual size of the device.
	 */
	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		vdev_dbgmsg(vd, "vdev_disk_open: failed to get size");
		return (SET_ERROR(EINVAL));
	}

	*max_psize = *psize;

	/*
	 * Determine the device's minimum transfer size.
	 * If the ioctl isn't supported, assume DEV_BSIZE.
	 */
	if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT,
	    (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) {
		capacity = dkmext->dki_capacity - 1;
		blksz = dkmext->dki_lbsize;
		pbsize = dkmext->dki_pbsize;
	} else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO,
	    (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) {
		VDEV_DEBUG(
		    "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n",
		    vd->vdev_path);
		capacity = dkm->dki_capacity - 1;
		blksz = dkm->dki_lbsize;
		pbsize = blksz;
	} else {
		VDEV_DEBUG("vdev_disk_open(\"%s\"): "
		    "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n",
		    vd->vdev_path, error);
		pbsize = DEV_BSIZE;
	}

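	/*
	 * ashift is log2 of the effective sector size; e.g. a reported
	 * 4096-byte physical block yields an ashift of 12, while the
	 * SPA_MINBLOCKSIZE floor of 512 bytes yields an ashift of 9.
	 */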
	*ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1;

	if (vd->vdev_wholedisk == 1) {
		int wce = 1;

		if (error == 0) {
			/*
			 * If we have the capability to expand, we'd have
			 * found out via success from DKIOCGMEDIAINFO{,EXT}.
			 * Adjust max_psize upward accordingly since we know
			 * we own the whole disk now.
			 */
			*max_psize = capacity * blksz;
		}

		/*
		 * Since we own the whole disk, try to enable disk write
		 * caching.  We ignore errors because it's OK if we can't do it.
		 */
		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
		    FKIOCTL, kcred, NULL);
	}

	/*
	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
	 * try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

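	/*
	 * Probe for TRIM support: DKIOC_CANFREE reports whether the device
	 * can accept DKIOCFREE (block free/unmap) requests.
	 */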
	if (ldi_ioctl(dvd->vd_lh, DKIOC_CANFREE, (intptr_t)&can_free, FKIOCTL,
	    kcred, NULL) == 0 && can_free == 1) {
		vd->vdev_has_trim = B_TRUE;
	} else {
		vd->vdev_has_trim = B_FALSE;
	}

	if (zfs_no_trim == 1)
		vd->vdev_has_trim = B_FALSE;

	/* Currently only supported for ZoL. */
	vd->vdev_has_securetrim = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational */
	vd->vdev_nonrot = B_FALSE;
	if (ldi_prop_exists(dvd->vd_lh, DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
	    "device-solid-state")) {
		if (ldi_prop_get_int(dvd->vd_lh,
		    LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
		    "device-solid-state", B_FALSE) != 0)
			vd->vdev_nonrot = B_TRUE;
	}

	return (0);
}

static void
vdev_disk_close(vdev_t *vd)
{
	vdev_disk_t *dvd = vd->vdev_tsd;

	if (vd->vdev_reopening || dvd == NULL)
		return;

	if (dvd->vd_minor != NULL) {
		ddi_devid_str_free(dvd->vd_minor);
		dvd->vd_minor = NULL;
	}

	if (dvd->vd_devid != NULL) {
		ddi_devid_free(dvd->vd_devid);
		dvd->vd_devid = NULL;
	}

	if (dvd->vd_lh != NULL) {
		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
		dvd->vd_lh = NULL;
	}

	vd->vdev_delayed_close = B_FALSE;
	vdev_disk_free(vd);
}

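/*
 * Issue a synchronous read or write directly through the LDI handle,
 * bypassing the zio pipeline; used by vdev_disk_dumpio() and
 * vdev_disk_read_rootlabel() below.
 */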
static int
vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
    size_t size, uint64_t offset, int flags)
{
	buf_t *bp;
	int error = 0;

	if (vd_lh == NULL)
		return (SET_ERROR(EINVAL));

	ASSERT(flags & B_READ || flags & B_WRITE);

	bp = getrbuf(KM_SLEEP);
	bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
	bp->b_bcount = size;
	bp->b_un.b_addr = (void *)data;
	bp->b_lblkno = lbtodb(offset);
	bp->b_bufsize = size;

	error = ldi_strategy(vd_lh, bp);
	ASSERT(error == 0);
	if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
		error = SET_ERROR(EIO);
	freerbuf(bp);

	return (error);
}

static int
vdev_disk_dumpio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset __unused, boolean_t doread,
    boolean_t isdump)
{
	vdev_disk_t *dvd = vd->vdev_tsd;
	int flags = doread ? B_READ : B_WRITE;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (dvd == NULL || dvd->vd_ldi_offline) {
		return (SET_ERROR(ENXIO));
	}

	ASSERT(vd->vdev_ops == &vdev_disk_ops);

	offset += VDEV_LABEL_START_SIZE;

	/*
	 * If in the context of an active crash dump, use the ldi_dump(9F)
	 * call instead of ldi_strategy(9F) as usual.
	 */
	if (isdump) {
		ASSERT3P(dvd, !=, NULL);
		return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
		    lbtodb(size)));
	}

	return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
}

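/*
 * I/O completion callback, attached to each buf via b_iodone; invoked by
 * biodone(9F) once the underlying driver has finished the request.
 */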
static int
vdev_disk_io_intr(buf_t *bp)
{
	vdev_buf_t *vb = (vdev_buf_t *)bp;
	zio_t *zio = vb->vb_io;

	/*
	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
	 * Rather than teach the rest of the stack about other error
	 * possibilities (EFAULT, etc), we normalize the error value here.
	 */
	zio->io_error = (geterror(bp) != 0 ? EIO : 0);

	if (zio->io_error == 0 && bp->b_resid != 0)
		zio->io_error = SET_ERROR(EIO);

	if (zio->io_type == ZIO_TYPE_READ) {
		abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size);
	} else {
		abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
	}

	kmem_free(vb, sizeof (vdev_buf_t));

	zio_delay_interrupt(zio);
	return (0);
}

static void
vdev_disk_ioctl_free(zio_t *zio)
{
	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
}

static const zio_vsd_ops_t vdev_disk_vsd_ops = {
	vdev_disk_ioctl_free,
	zio_vsd_default_cksum_report
};

static void
vdev_disk_ioctl_done(void *zio_arg, int error)
{
	zio_t *zio = zio_arg;

	zio->io_error = error;

	zio_interrupt(zio);
}

static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_disk_t *dvd = vd->vdev_tsd;
	unsigned long trim_flags = 0;
	vdev_buf_t *vb;
	struct dk_callback *dkc;
	buf_t *bp;
	int error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (dvd == NULL || dvd->vd_ldi_offline) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {

		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (vd->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
			zio->io_vsd_ops = &vdev_disk_vsd_ops;

			dkc->dkc_callback = vdev_disk_ioctl_done;
			dkc->dkc_flag = FLUSH_VOLATILE;
			dkc->dkc_cookie = zio;

			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
			    (uintptr_t)dkc, FKIOCTL, kcred, NULL);

			if (error == 0) {
				/*
				 * The ioctl will be done asynchronously,
				 * and will call vdev_disk_ioctl_done()
				 * upon completion.
				 */
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		zio_execute(zio);
		return;

	case ZIO_TYPE_TRIM:
		if (zfs_no_trim == 1 || !vd->vdev_has_trim) {
			zio->io_error = SET_ERROR(ENOTSUP);
			zio_execute(zio);
			return;
		}
		/* Currently only supported on ZoL. */
		ASSERT0(zio->io_trim_flags & ZIO_TRIM_SECURE);

		/* dkioc_free_list_t is already declared to hold one entry */
		dkioc_free_list_t dfl;
		dfl.dfl_flags = 0;
		dfl.dfl_num_exts = 1;
		dfl.dfl_offset = 0;
		dfl.dfl_exts[0].dfle_start = zio->io_offset;
		dfl.dfl_exts[0].dfle_length = zio->io_size;

		zio->io_error = ldi_ioctl(dvd->vd_lh, DKIOCFREE,
		    (uintptr_t)&dfl, FKIOCTL, kcred, NULL);

		if (zio->io_error == ENOTSUP || zio->io_error == ENOTTY) {
			/*
			 * The device must have changed and now TRIM is
			 * no longer supported.
			 */
			vd->vdev_has_trim = B_FALSE;
		}

		zio_interrupt(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
	zio->io_target_timestamp = zio_handle_io_delay(zio);

	vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);

	vb->vb_io = zio;
	bp = &vb->vb_buf;

	bioinit(bp);
	bp->b_flags = B_BUSY | B_NOCACHE |
	    (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bp->b_flags |= B_FAILFAST;
	bp->b_bcount = zio->io_size;

	if (zio->io_type == ZIO_TYPE_READ) {
		bp->b_un.b_addr =
		    abd_borrow_buf(zio->io_abd, zio->io_size);
	} else {
		bp->b_un.b_addr =
		    abd_borrow_buf_copy(zio->io_abd, zio->io_size);
	}

	bp->b_lblkno = lbtodb(zio->io_offset);
	bp->b_bufsize = zio->io_size;
	bp->b_iodone = vdev_disk_io_intr;

	/*
	 * In general we would expect ldi_strategy() to return non-zero only
	 * because of programming errors, but we've also seen this fail shortly
	 * after a disk dies.
	 */
	if (ldi_strategy(dvd->vd_lh, bp) != 0) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	/*
	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
	 * the device has been removed.  If this is the case, then we trigger an
	 * asynchronous removal of the device. Otherwise, probe the device and
	 * make sure it's still accessible.
	 */
	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
		vdev_disk_t *dvd = vd->vdev_tsd;
		int state = DKIO_NONE;

		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
			/*
			 * We post the resource as soon as possible, instead of
			 * when the async removal actually happens, because the
			 * DE is using this information to discard previous I/O
			 * errors.
			 */
			zfs_post_remove(zio->io_spa, vd);
			vd->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		} else if (!vd->vdev_delayed_close) {
			vd->vdev_delayed_close = B_TRUE;
		}
	}
}

vdev_ops_t vdev_disk_ops = {
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_io_start = vdev_disk_io_start,
	.vdev_op_io_done = vdev_disk_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_disk_hold,
	.vdev_op_rele = vdev_disk_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_dumpio = vdev_disk_dumpio,
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
	.vdev_op_leaf = B_TRUE			/* leaf vdev */
};

/*
 * Given the root disk device devid or pathname, read the label from
 * the device, and construct a configuration nvlist.
 */
int
vdev_disk_read_rootlabel(const char *devpath, const char *devid,
    nvlist_t **config)
{
	ldi_handle_t vd_lh;
	vdev_label_t *label;
	uint64_t s, size;
	int l;
	ddi_devid_t tmpdevid;
	int error = -1;
	char *minor_name;

	/*
	 * Read the device label and build the nvlist.
	 */
	if (devid != NULL && ddi_devid_str_decode((char *)devid, &tmpdevid,
	    &minor_name) == 0) {
		error = ldi_open_by_devid(tmpdevid, minor_name,
		    FREAD, kcred, &vd_lh, zfs_li);
		ddi_devid_free(tmpdevid);
		ddi_devid_str_free(minor_name);
	}

	if (error != 0 && (error = ldi_open_by_name((char *)devpath, FREAD,
	    kcred, &vd_lh, zfs_li)) != 0) {
		return (error);
	}

	if (ldi_get_size(vd_lh, &s)) {
		(void) ldi_close(vd_lh, FREAD, kcred);
		return (SET_ERROR(EIO));
	}

	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);

	*config = NULL;
	for (l = 0; l < VDEV_LABELS; l++) {
		uint64_t offset, state, txg = 0;

		/* read vdev label */
		offset = vdev_label_offset(size, l, 0);
		if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
			continue;

		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
			*config = NULL;
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		break;
	}

	kmem_free(label, sizeof (vdev_label_t));
	(void) ldi_close(vd_lh, FREAD, kcred);
	if (*config == NULL)
		error = SET_ERROR(EIDRM);

	return (error);
}

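/*
 * State for the early-boot ("preroot") device sweep: a cache of pool and
 * vdev GUID to device path mappings discovered by walking all visible
 * block devices, protected by veb_lock.
 */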
struct veb {
	list_t veb_ents;
	boolean_t veb_scanned;
};

struct veb_ent {
	uint64_t vebe_pool_guid;
	uint64_t vebe_vdev_guid;

	char *vebe_devpath;

	list_node_t vebe_link;
};

static kmutex_t veb_lock;
static struct veb *veb;

static int
vdev_disk_preroot_scan_walk(const char *devpath, void *arg)
{
	int r;
	nvlist_t *cfg = NULL;
	uint64_t pguid = 0, vguid = 0;

	/*
	 * Attempt to read the label from this block device.
	 */
	if ((r = vdev_disk_read_rootlabel(devpath, NULL, &cfg)) != 0) {
		/*
		 * Many of the available block devices will represent slices or
		 * partitions of disks, or may represent disks that are not at
		 * all initialised with ZFS.  As this is a best effort
		 * mechanism to locate an alternate path to a particular vdev,
		 * we will ignore any failures and keep scanning.
		 */
		return (PREROOT_WALK_BLOCK_DEVICES_NEXT);
	}

	/*
	 * Determine the pool and vdev GUID read from the label for this
	 * device.  Both values must be present and have a non-zero value.
	 */
	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pguid) != 0 ||
	    nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_GUID, &vguid) != 0 ||
	    pguid == 0 || vguid == 0) {
		/*
		 * This label was not complete.
		 */
		goto out;
	}

	/*
	 * Keep track of all of the GUID-to-devpath mappings we find so that
	 * vdev_disk_preroot_lookup() can search them.
	 */
	struct veb_ent *vebe = kmem_zalloc(sizeof (*vebe), KM_SLEEP);
	vebe->vebe_pool_guid = pguid;
	vebe->vebe_vdev_guid = vguid;
	vebe->vebe_devpath = spa_strdup(devpath);

	list_insert_tail(&veb->veb_ents, vebe);

out:
	nvlist_free(cfg);
	return (PREROOT_WALK_BLOCK_DEVICES_NEXT);
}

const char *
vdev_disk_preroot_lookup(uint64_t pool_guid, uint64_t vdev_guid)
{
	if (pool_guid == 0 || vdev_guid == 0) {
		/*
		 * If we aren't provided both a pool and a vdev GUID, we cannot
		 * perform a lookup.
		 */
		return (NULL);
	}

	mutex_enter(&veb_lock);
	if (veb == NULL) {
		/*
		 * If vdev_disk_preroot_fini() has been called already, there
		 * is nothing we can do.
		 */
		mutex_exit(&veb_lock);
		return (NULL);
	}

	/*
	 * We want to perform at most one scan of all block devices per boot.
	 */
	if (!veb->veb_scanned) {
		cmn_err(CE_NOTE, "Performing full ZFS device scan!");

		preroot_walk_block_devices(vdev_disk_preroot_scan_walk, NULL);

		veb->veb_scanned = B_TRUE;
	}

	const char *path = NULL;
	for (struct veb_ent *vebe = list_head(&veb->veb_ents); vebe != NULL;
	    vebe = list_next(&veb->veb_ents, vebe)) {
		if (vebe->vebe_pool_guid == pool_guid &&
		    vebe->vebe_vdev_guid == vdev_guid) {
			path = vebe->vebe_devpath;
			break;
		}
	}

	mutex_exit(&veb_lock);

	return (path);
}

void
vdev_disk_preroot_init(void)
{
	mutex_init(&veb_lock, NULL, MUTEX_DEFAULT, NULL);

	VERIFY3P(veb, ==, NULL);
	veb = kmem_zalloc(sizeof (*veb), KM_SLEEP);
	list_create(&veb->veb_ents, sizeof (struct veb_ent),
	    offsetof(struct veb_ent, vebe_link));
	veb->veb_scanned = B_FALSE;
}

void
vdev_disk_preroot_fini(void)
{
	mutex_enter(&veb_lock);

	if (veb != NULL) {
		while (!list_is_empty(&veb->veb_ents)) {
			struct veb_ent *vebe = list_remove_head(&veb->veb_ents);

			spa_strfree(vebe->vebe_devpath);

			kmem_free(vebe, sizeof (*vebe));
		}

		kmem_free(veb, sizeof (*veb));
		veb = NULL;
	}

	mutex_exit(&veb_lock);
}