1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2020 Joyent, Inc.
14  * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
15  * Copyright 2024 Oxide Computer Company
16  */
17 
18 /*
19  * This file drives topo node enumeration of NVMe controllers.  A single "nvme"
20  * node is enumerated for each NVMe controller.   Child "disk" nodes are then
21  * enumerated for each active or attached NVMe namespace.
22  *
23  * nvme nodes are expected to be enumerated under either a "bay" node (for U.2
24  * devices) or a "slot" node (for M.2 devices) or a "pciexfn" node (for AIC
25  * devices).
26  *
27  * Enumeration of NVMe controllers on PCIe add-in cards is automatically driven
28  * by the pcibus topo module.
29  *
30  * In order to allow for associating a given NVMe controller with a physical
31  * location, enumeration of U.2 and M.2 devices should be driven by a
32  * platform-specific topo map which statically sets the following two
33  * properties on the parent "bay" or "slot" node:
34  *
35  * propgroup        property        description
36  * ---------        --------        ------------
37  * binding          driver          "nvme"
38  * binding          parent-device   devpath of parent PCIe device
39  *
40  * for example:
41  *
42  * <propgroup name="binding" version="1" name-stability="Private"
43  *   data-stability="Private">
44  *     <propval name="driver" type="string" value="nvme"/>
45  *     <propval name="parent-device" type="string"
46  *       value="/pci@0,0/pci8086,6f09@3,1"/>
47  * </propgroup>
48  * <dependents grouping="children">
49  *     <range name="nvme" min="0" max="0">
50  *         <enum-method name="disk" version="1"/>
51  *     </range>
52  * </dependents>
53  */
54 #include <stdlib.h>
55 #include <sys/types.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <unistd.h>
59 #include <string.h>
60 #include <strings.h>
61 #include <stdbool.h>
62 
63 #include <sys/fm/protocol.h>
64 #include <fm/topo_hc.h>
65 #include <fm/topo_mod.h>
66 #include <topo_ufm.h>
67 
68 #include <sys/dkio.h>
69 #include <sys/scsi/generic/inquiry.h>
70 
71 #include <libnvme.h>
72 #include "disk.h"
73 #include "disk_drivers.h"
74 
/*
 * State shared across the NVMe enumeration routines.  Filled in by
 * discover_nvme_ctl() and consumed by make_nvme_node() and its helpers.
 */
typedef struct nvme_enum_info {
	topo_mod_t		*nei_mod;	/* this topo module */
	di_node_t		nei_dinode;	/* controller's devinfo node */
	nvme_t			*nei_libnvme;	/* libnvme library handle */
	nvme_ctrl_t		*nei_ctrl;	/* libnvme controller handle */
	nvme_ctrl_info_t	*nei_ctrl_info;	/* controller info snapshot */
	const nvme_version_t	*nei_vers;	/* controller's NVMe version */
	tnode_t			*nei_parent;	/* parent topo node */
	tnode_t			*nei_nvme;	/* nvme node, once bound */
	nvlist_t		*nei_nvme_fmri;	/* FMRI of the nvme node */
	int			nei_fd;		/* NOTE(review): not set or */
						/* read in this file; confirm */
} nvme_enum_info_t;
87 
/*
 * Callback state for the di_devlink walk performed by get_logical_disk().
 */
typedef struct devlink_arg {
	topo_mod_t		*dla_mod;	/* this topo module */
	char			*dla_logical_disk; /* /dev/dsk path, slice trimmed */
	uint_t			dla_strsz;	/* original allocation size */
} devlink_arg_t;
93 
94 static int
devlink_cb(di_devlink_t dl,void * arg)95 devlink_cb(di_devlink_t dl, void *arg)
96 {
97 	devlink_arg_t *dlarg = (devlink_arg_t *)arg;
98 	topo_mod_t *mod = dlarg->dla_mod;
99 	const char *devpath;
100 	char *slice, *ctds;
101 
102 	if ((devpath = di_devlink_path(dl)) == NULL ||
103 	    (dlarg->dla_logical_disk = topo_mod_strdup(mod, devpath)) ==
104 	    NULL) {
105 		return (DI_WALK_TERMINATE);
106 	}
107 
108 	/*
109 	 * We need to keep track of the original string size before we
110 	 * truncate it with a NUL, so that we can free the right number of
111 	 * bytes when we're done, otherwise libumem will complain.
112 	 */
113 	dlarg->dla_strsz = strlen(dlarg->dla_logical_disk) + 1;
114 
115 	/* trim the slice off the public name */
116 	if (((ctds = strrchr(dlarg->dla_logical_disk, '/')) != NULL) &&
117 	    ((slice = strchr(ctds, 's')) != NULL))
118 		*slice = '\0';
119 
120 	return (DI_WALK_TERMINATE);
121 }
122 
123 static char *
get_logical_disk(topo_mod_t * mod,const char * devpath,uint_t * bufsz)124 get_logical_disk(topo_mod_t *mod, const char *devpath, uint_t *bufsz)
125 {
126 	di_devlink_handle_t devhdl;
127 	devlink_arg_t dlarg = { 0 };
128 	char *minorpath = NULL;
129 
130 	if (asprintf(&minorpath, "%s:a", devpath) < 0) {
131 		return (NULL);
132 	}
133 
134 	if ((devhdl = di_devlink_init(NULL, 0)) == DI_NODE_NIL) {
135 		topo_mod_dprintf(mod, "%s: di_devlink_init failed", __func__);
136 		free(minorpath);
137 		return (NULL);
138 	}
139 
140 	dlarg.dla_mod = mod;
141 
142 	(void) di_devlink_walk(devhdl, "^dsk/", minorpath, DI_PRIMARY_LINK,
143 	    &dlarg, devlink_cb);
144 
145 	(void) di_devlink_fini(&devhdl);
146 	free(minorpath);
147 
148 	*bufsz = dlarg.dla_strsz;
149 	return (dlarg.dla_logical_disk);
150 }
151 
152 static bool
disk_nvme_make_ns_serial(topo_mod_t * mod,nvme_ns_info_t * ns_info,char * buf,size_t buflen)153 disk_nvme_make_ns_serial(topo_mod_t *mod, nvme_ns_info_t *ns_info, char *buf,
154     size_t buflen)
155 {
156 	uint8_t nguid[16], eui64[8];
157 	int ret;
158 
159 	if (nvme_ns_info_nguid(ns_info, nguid)) {
160 		ret = snprintf(buf, buflen, "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X"
161 		    "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X",
162 		    nguid[0], nguid[1], nguid[2], nguid[3], nguid[4],
163 		    nguid[5], nguid[6], nguid[7], nguid[8], nguid[9],
164 		    nguid[10], nguid[11], nguid[12], nguid[13], nguid[14],
165 		    nguid[15]);
166 	} else if (nvme_ns_info_eui64(ns_info, eui64)) {
167 		ret = snprintf(buf, buflen,
168 		    "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X",
169 		    eui64[0], eui64[1], eui64[2], eui64[3], eui64[4],
170 		    eui64[5], eui64[6], eui64[7]);
171 	} else {
172 		ret = snprintf(buf, buflen, "%u", nvme_ns_info_nsid(ns_info));
173 	}
174 
175 	if ((size_t)ret >= buflen) {
176 		topo_mod_dprintf(mod, "overflowed serial number for nsid %u: "
177 		    "needed %zu bytes, got %d", nvme_ns_info_nsid(ns_info),
178 		    buflen, ret);
179 		return (false);
180 	}
181 
182 	return (true);
183 }
184 
185 /*
186  * Create the common I/O property group properties that are shared between
187  * controllers and namespaces. We assume the property group was already created.
188  */
189 static bool
disk_nvme_common_io(topo_mod_t * mod,tnode_t * tn,di_node_t di)190 disk_nvme_common_io(topo_mod_t *mod, tnode_t *tn, di_node_t di)
191 {
192 	int err;
193 	int inst = di_instance(di);
194 	const char *drv = di_driver_name(di);
195 	char *path;
196 	const char *ppaths[1];
197 
198 	if (inst != -1 && topo_prop_set_uint32(tn, TOPO_PGROUP_IO,
199 	    TOPO_IO_INSTANCE, TOPO_PROP_IMMUTABLE, (uint32_t)inst, &err) != 0) {
200 		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: "
201 		    "%s", TOPO_PGROUP_IO, TOPO_IO_INSTANCE, topo_node_name(tn),
202 		    topo_node_instance(tn), topo_strerror(err));
203 		return (false);
204 	}
205 
206 	if (drv != NULL && topo_prop_set_string(tn, TOPO_PGROUP_IO,
207 	    TOPO_IO_DRIVER, TOPO_PROP_IMMUTABLE, drv, &err) != 0) {
208 		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: "
209 		    "%s", TOPO_PGROUP_IO, TOPO_IO_DRIVER, topo_node_name(tn),
210 		    topo_node_instance(tn), topo_strerror(err));
211 		return (false);
212 	}
213 
214 	if (drv != NULL) {
215 		nvlist_t *fmri = topo_mod_modfmri(mod, FM_MOD_SCHEME_VERSION,
216 		    drv);
217 		if (mod != NULL && topo_prop_set_fmri(tn, TOPO_PGROUP_IO,
218 		    TOPO_IO_MODULE, TOPO_PROP_IMMUTABLE, fmri, &err) != 0) {
219 			topo_mod_dprintf(mod, "failed to set %s:%s on %s[%"
220 			    PRIu64 "]: %s", TOPO_PGROUP_IO, TOPO_IO_MODULE,
221 			    topo_node_name(tn), topo_node_instance(tn),
222 			    topo_strerror(err));
223 			nvlist_free(fmri);
224 			return (false);
225 		}
226 		nvlist_free(fmri);
227 	}
228 
229 	path = di_devfs_path(di);
230 	ppaths[0] = path;
231 	if (path != NULL && topo_prop_set_string(tn, TOPO_PGROUP_IO,
232 	    TOPO_IO_DEV_PATH, TOPO_PROP_IMMUTABLE, path, &err) != 0) {
233 		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: "
234 		    "%s", TOPO_PGROUP_IO, TOPO_IO_DRIVER, topo_node_name(tn),
235 		    topo_node_instance(tn), topo_strerror(err));
236 		di_devfs_path_free(path);
237 		return (false);
238 	}
239 
240 	if (path != NULL && topo_prop_set_string_array(tn, TOPO_PGROUP_IO,
241 	    TOPO_IO_PHYS_PATH, TOPO_PROP_IMMUTABLE, ppaths, 1, &err) != 0) {
242 		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: "
243 		    "%s", TOPO_PGROUP_IO, TOPO_IO_PHYS_PATH, topo_node_name(tn),
244 		    topo_node_instance(tn), topo_strerror(err));
245 		di_devfs_path_free(path);
246 		return (false);
247 	}
248 	di_devfs_path_free(path);
249 
250 	return (true);
251 }
252 
253 /*
254  * Add the various storage and I/O property group items that are appropriate
255  * given that we have a devinfo node. The storage property group has already
256  * been created, but the I/O property group has not.
257  */
258 static void
disk_nvme_make_ns_di_props(topo_mod_t * mod,tnode_t * tn,di_node_t di)259 disk_nvme_make_ns_di_props(topo_mod_t *mod, tnode_t *tn, di_node_t di)
260 {
261 	int err;
262 	char *devid, *mfg, *model, *rev, *serial, *log, *path;
263 	uint_t buflen;
264 
265 	if (di_prop_lookup_strings(DDI_DEV_T_ANY, di, DEVID_PROP_NAME,
266 	    &devid) != 1 ||
267 	    di_prop_lookup_strings(DDI_DEV_T_ANY, di, INQUIRY_VENDOR_ID,
268 	    &mfg) != 1 ||
269 	    di_prop_lookup_strings(DDI_DEV_T_ANY, di, INQUIRY_PRODUCT_ID,
270 	    &model) != 1 ||
271 	    di_prop_lookup_strings(DDI_DEV_T_ANY, di, INQUIRY_REVISION_ID,
272 	    &rev) != 1 ||
273 	    di_prop_lookup_strings(DDI_DEV_T_ANY, di, INQUIRY_SERIAL_NO,
274 	    &serial) != 1) {
275 		topo_mod_dprintf(mod, "failed to get devinfo props for %s[%"
276 		    PRIu64 "]", topo_node_name(tn), topo_node_instance(tn));
277 		return;
278 	}
279 
280 	/*
281 	 * Set the basic storage manufacturer information. Yes, this is
282 	 * information really about the NVMe controller and not the namespace.
283 	 * That's how the storage property group basically works here.
284 	 */
285 	if (topo_prop_set_string(tn, TOPO_PGROUP_STORAGE,
286 	    TOPO_STORAGE_MANUFACTURER, TOPO_PROP_IMMUTABLE, mfg, &err) != 0 ||
287 	    topo_prop_set_string(tn, TOPO_PGROUP_STORAGE,
288 	    TOPO_STORAGE_SERIAL_NUM, TOPO_PROP_IMMUTABLE, serial, &err) != 0 ||
289 	    topo_prop_set_string(tn, TOPO_PGROUP_STORAGE,
290 	    TOPO_STORAGE_FIRMWARE_REV, TOPO_PROP_IMMUTABLE, rev, &err) != 0 ||
291 	    topo_prop_set_string(tn, TOPO_PGROUP_STORAGE,
292 	    TOPO_STORAGE_MODEL, TOPO_PROP_IMMUTABLE, model, &err) != 0) {
293 		topo_mod_dprintf(mod, "failed to set storage properties on "
294 		    "%s[%" PRIu64 "]: %s", topo_node_name(tn),
295 		    topo_node_instance(tn), topo_strerror(err));
296 		return;
297 	}
298 
299 	if (topo_pgroup_create(tn, &io_pgroup, &err) != 0) {
300 		topo_mod_dprintf(mod, "failed to create I/O property "
301 		    "group on %s[%" PRIu64 "]: %s",  topo_node_name(tn),
302 		    topo_node_instance(tn), topo_strerror(err));
303 	}
304 
305 	if (!disk_nvme_common_io(mod, tn, di)) {
306 		return;
307 	}
308 
309 	/*
310 	 * The last property that we'd like to attempt to create for a namespace
311 	 * is a mapping back to its corresponding logical disk entry in /dev.
312 	 * The logical disk will be everything past the trailing /, i.e. a
313 	 * cXtXdX value.
314 	 */
315 	path = di_devfs_path(di);
316 	if (path == NULL) {
317 		return;
318 	}
319 	log = get_logical_disk(mod, path, &buflen);
320 	di_devfs_path_free(path);
321 	if (log == NULL) {
322 		return;
323 	}
324 	path = strrchr(log, '/');
325 	if (path != NULL && path[1] != '\0' &&
326 	    topo_prop_set_string(tn, TOPO_PGROUP_STORAGE,
327 	    TOPO_STORAGE_LOGICAL_DISK_NAME, TOPO_PROP_IMMUTABLE, path + 1,
328 	    &err) != 0) {
329 		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%"
330 		    PRIu64 "]: %s", TOPO_PGROUP_STORAGE,
331 		    TOPO_STORAGE_LOGICAL_DISK_NAME, topo_node_name(tn),
332 		    topo_node_instance(tn), topo_strerror(err));
333 	}
334 	topo_mod_free(mod, log, buflen);
335 }
336 
337 static void
disk_nvme_make_ns(nvme_enum_info_t * nei,nvme_ns_info_t * ns_info)338 disk_nvme_make_ns(nvme_enum_info_t *nei, nvme_ns_info_t *ns_info)
339 {
340 	topo_mod_t *mod = nei->nei_mod;
341 	nvlist_t *auth = NULL, *fmri = NULL;
342 	const uint32_t nsid = nvme_ns_info_nsid(ns_info);
343 	const topo_instance_t inst = nsid - 1;
344 	char serial[64], capstr[64];
345 	const nvme_nvm_lba_fmt_t *fmt;
346 	const char *bd_addr;
347 	uint64_t cap, blksz, capblks;
348 	tnode_t *tn;
349 	int err;
350 
351 	auth = topo_mod_auth(mod, nei->nei_nvme);
352 	if (auth == NULL) {
353 		topo_mod_dprintf(mod, "failed to get auth for nsid %u from "
354 		    "parent %s[%" PRIu64 "]: %s", nsid,
355 		    topo_node_name(nei->nei_nvme),
356 		    topo_node_instance(nei->nei_nvme), topo_mod_errmsg(mod));
357 		goto done;
358 	}
359 
360 	/*
361 	 * We want to construct the FMRI for the namespace. The namespace is a
362 	 * little awkward in terms of things like the model, revision, and
363 	 * serial. While blkdev sets up standard inquiry properties to map these
364 	 * to the parent device which makes sense in the context of trying to
365 	 * use this as a normal block device, it's not really appropriate here.
366 	 * The namespace is not the NVMe controller. We construct the namespace
367 	 * serial number from the preferential ordering of information that
368 	 * we're given of the NGUID, EUI64, and then fall back to the namespace
369 	 * number.
370 	 */
371 	if (!disk_nvme_make_ns_serial(mod, ns_info, serial, sizeof (serial))) {
372 		goto done;
373 	}
374 	fmri = topo_mod_hcfmri(mod, nei->nei_nvme, FM_HC_SCHEME_VERSION,
375 	    DISK, inst, NULL, auth, NULL, NULL, serial);
376 	if (fmri == NULL) {
377 		topo_mod_dprintf(mod, "failed to make fmri for %s[%" PRIu64
378 		    "] on nsid %u: %s", DISK, inst, nsid, topo_mod_errmsg(mod));
379 		goto done;
380 	}
381 
382 	tn = topo_node_bind(mod, nei->nei_nvme, DISK, inst, fmri);
383 	if (tn == NULL) {
384 		topo_mod_dprintf(mod, "failed to bind fmri for %s[%" PRIu64
385 		    "] on nsid %u: %s", DISK, inst, nsid, topo_mod_errmsg(mod));
386 		goto done;
387 	}
388 
389 	/*
390 	 * Always inherit our parent's FRU. The namespace is just a part of the
391 	 * device in reality.
392 	 */
393 	if (topo_node_fru_set(tn, NULL, 0, &err) != 0) {
394 		topo_mod_dprintf(mod, "failed to set FRU for %s[%" PRIu64
395 		    "] on nsid %u: %s", DISK, inst, nsid, topo_strerror(err));
396 		goto done;
397 
398 	}
399 
400 	/*
401 	 * Our namespace may or may not be attached. From the namespace we will
402 	 * always get the capacity and block information. The rest of it will
403 	 * end up being filled in if we find a devinfo node.
404 	 */
405 	if (topo_pgroup_create(tn, &storage_pgroup, &err) != 0) {
406 		topo_mod_dprintf(mod, "failed to create storage property "
407 		    "group on %s[%" PRIu64 "]: %s", DISK, inst,
408 		    topo_strerror(err));
409 	}
410 
411 	if (!nvme_ns_info_curformat(ns_info, &fmt)) {
412 		topo_mod_dprintf(mod, "failed to get current namespace "
413 		    "format: %s", nvme_ns_info_errmsg(ns_info));
414 		goto done;
415 	}
416 
417 	blksz = nvme_nvm_lba_fmt_data_size(fmt);
418 	if (topo_prop_set_uint64(tn, TOPO_PGROUP_STORAGE,
419 	    TOPO_STORAGE_LOG_BLOCK_SIZE, TOPO_PROP_IMMUTABLE, blksz, &err) !=
420 	    0) {
421 		topo_mod_dprintf(mod, "failed to create property %s:%s on %s[%"
422 		    PRIu64 "]: %s", TOPO_PGROUP_STORAGE,
423 		    TOPO_STORAGE_LOG_BLOCK_SIZE, DISK, inst,
424 		    topo_strerror(err));
425 		goto done;
426 	}
427 
428 	if (!nvme_ns_info_cap(ns_info, &capblks)) {
429 		topo_mod_dprintf(mod, "failed to get namespace capacity: %s",
430 		    nvme_ns_info_errmsg(ns_info));
431 		goto done;
432 	}
433 
434 	cap = blksz * capblks;
435 	if (snprintf(capstr, sizeof (capstr), "%" PRIu64, cap) >=
436 	    sizeof (capstr)) {
437 		topo_mod_dprintf(mod, "overflowed capacity calculation on "
438 		    "nsid %u", nsid);
439 		goto done;
440 	}
441 
442 	/*
443 	 * Finally attempt to find a child node that has a matching name and go
444 	 * from there. Sorry, this does result in node creation being O(n^2),
445 	 * but at least n is usually small today. Note, we may not have a blkdev
446 	 * address because the disk may not be attached.
447 	 */
448 	if (!nvme_ns_info_bd_addr(ns_info, &bd_addr)) {
449 		if (nvme_ns_info_err(ns_info) != NVME_INFO_ERR_NS_NO_BLKDEV) {
450 			topo_mod_dprintf(mod, "failed to get namespace blkdev "
451 			    "address: %s", nvme_ns_info_errmsg(ns_info));
452 		}
453 		goto done;
454 	}
455 
456 	for (di_node_t di = di_child_node(nei->nei_dinode); di != DI_NODE_NIL;
457 	    di = di_sibling_node(di)) {
458 		const char *addr = di_bus_addr(di);
459 		if (addr != NULL && strcmp(addr, bd_addr) == 0) {
460 			disk_nvme_make_ns_di_props(mod, tn, di);
461 		}
462 	}
463 
464 done:
465 	nvlist_free(auth);
466 	nvlist_free(fmri);
467 }
468 
469 /*
470  * Attempt to make a ufm node, but swallow the error so we can try to get as
471  * much of the disk information as possible.
472  */
473 static void
disk_nvme_make_ufm(topo_mod_t * mod,nvme_enum_info_t * nei)474 disk_nvme_make_ufm(topo_mod_t *mod, nvme_enum_info_t *nei)
475 {
476 	topo_ufm_devinfo_t tud;
477 	char *path = di_devfs_path(nei->nei_dinode);
478 	if (path == NULL) {
479 		return;
480 	}
481 
482 	tud.tud_method = TOPO_UFM_M_DEVINFO;
483 	tud.tud_path = path;
484 	if (topo_mod_load(mod, TOPO_MOD_UFM, TOPO_VERSION) == NULL) {
485 		topo_mod_dprintf(mod, "disk enum could not load ufm module");
486 		di_devfs_path_free(path);
487 		return;
488 	}
489 
490 	(void) topo_mod_enumerate(mod, nei->nei_nvme, TOPO_MOD_UFM, UFM, 0, 0,
491 	    &tud);
492 	di_devfs_path_free(path);
493 }
494 
/*
 * Property group for controller-level NVMe metadata (e.g. the NVMe version
 * property set in make_nvme_node()).
 */
static const topo_pgroup_info_t nvme_pgroup = {
	TOPO_PGROUP_NVME,
	TOPO_STABILITY_PRIVATE,
	TOPO_STABILITY_PRIVATE,
	1
};
501 
502 static int
make_nvme_node(nvme_enum_info_t * nvme_info)503 make_nvme_node(nvme_enum_info_t *nvme_info)
504 {
505 	topo_mod_t *mod = nvme_info->nei_mod;
506 	nvme_ctrl_info_t *info = nvme_info->nei_ctrl_info;
507 	nvme_ns_iter_t *iter = NULL;
508 	nvme_iter_t nret;
509 	const nvme_ns_disc_t *disc;
510 	nvlist_t *auth = NULL, *fmri = NULL, *fru;
511 	tnode_t *nvme;
512 	char *model = NULL, *serial = NULL, *vers = NULL;
513 	char *pname = topo_node_name(nvme_info->nei_parent);
514 	char *label = NULL;
515 	topo_instance_t pinst = topo_node_instance(nvme_info->nei_parent);
516 	int err = 0, ret = -1;
517 
518 	/*
519 	 * Pass the model and serial strings through a function that sanitizes
520 	 * them of any characters that can't be used in an FMRI string. Note, we
521 	 * do not use the firmware revision here because that's not really a
522 	 * device property that should be part of the FMRI (it can be changed at
523 	 * runtime).
524 	 */
525 	model = topo_mod_clean_str(mod, nvme_ctrl_info_model(info));
526 	serial = topo_mod_clean_str(mod, nvme_ctrl_info_serial(info));
527 
528 	auth = topo_mod_auth(mod, nvme_info->nei_parent);
529 	fmri = topo_mod_hcfmri(mod, nvme_info->nei_parent, FM_HC_SCHEME_VERSION,
530 	    NVME, 0, NULL, auth, model, NULL, serial);
531 
532 	if (fmri == NULL) {
533 		/* errno set */
534 		topo_mod_dprintf(mod, "%s: hcfmri failed for %s=%" PRIu64
535 		    "/%s=0", __func__, pname, pinst, NVME);
536 		goto error;
537 	}
538 
539 	/*
540 	 * If our parent is a pciexfn node, then we need to create a nvme range
541 	 * underneath it to hold the nvme hierarchy.  For other cases, where
542 	 * enumeration is being driven by a topo map file, this range will have
543 	 * already been statically defined in the XML.
544 	 */
545 	if (strcmp(pname, PCIEX_FUNCTION) == 0) {
546 		if (topo_node_range_create(mod, nvme_info->nei_parent, NVME, 0,
547 		    0) < 0) {
548 			/* errno set */
549 			topo_mod_dprintf(mod, "%s: error creating %s range",
550 			    __func__, NVME);
551 			goto error;
552 		}
553 	}
554 
555 	/*
556 	 * Create a new topo node to represent the NVMe controller and bind it
557 	 * to the parent node.
558 	 */
559 	if ((nvme = topo_node_bind(mod, nvme_info->nei_parent, NVME, 0,
560 	    fmri)) == NULL) {
561 		/* errno set */
562 		topo_mod_dprintf(mod, "%s: bind failed for %s=%" PRIu64
563 		    "/%s=0", __func__, pname, pinst, NVME);
564 		goto error;
565 	}
566 	nvme_info->nei_nvme = nvme;
567 	nvme_info->nei_nvme_fmri = fmri;
568 
569 	/*
570 	 * If our parent node is a "pciexfn" node then this is a NVMe device on
571 	 * a PCIe AIC, so we inherit our parent's FRU.  Otherwise, we set the
572 	 * FRU to ourself.
573 	 */
574 	if (strcmp(topo_node_name(nvme_info->nei_parent), PCIEX_FUNCTION) == 0)
575 		fru = NULL;
576 	else
577 		fru = fmri;
578 
579 	if (topo_node_fru_set(nvme, fru, 0, &err) != 0) {
580 		topo_mod_dprintf(mod, "%s: failed to set FRU: %s", __func__,
581 		    topo_strerror(err));
582 		(void) topo_mod_seterrno(mod, err);
583 		goto error;
584 	}
585 
586 	/*
587 	 * Clone the label from our parent node.  We can't inherit the property
588 	 * because the label prop is mutable on bay nodes and only immutable
589 	 * properties can be inherited.
590 	 */
591 	if ((topo_node_label(nvme_info->nei_parent, &label, &err) != 0 &&
592 	    err != ETOPO_PROP_NOENT) ||
593 	    topo_node_label_set(nvme, label, &err) != 0) {
594 		topo_mod_dprintf(mod, "%s: failed to set label: %s",
595 		    __func__, topo_strerror(err));
596 		(void) topo_mod_seterrno(mod, err);
597 		goto error;
598 	}
599 
600 	/*
601 	 * Ensure that we have a UFM property set based on our devinfo path.
602 	 * This is a little repetitive if our parent actually did so as well,
603 	 * but given that the majority of such nodes are under bays and slots
604 	 * right now, it's a worthwhile tradeoff.
605 	 */
606 	disk_nvme_make_ufm(mod, nvme_info);
607 
608 	if (topo_pgroup_create(nvme, &nvme_pgroup, &err) != 0) {
609 		topo_mod_dprintf(mod, "%s: failed to create %s pgroup: %s",
610 		    __func__, TOPO_PGROUP_NVME, topo_strerror(err));
611 		(void) topo_mod_seterrno(mod, err);
612 		goto error;
613 	}
614 
615 	if (asprintf(&vers, "%u.%u", nvme_info->nei_vers->v_major,
616 	    nvme_info->nei_vers->v_minor) < 0) {
617 		topo_mod_dprintf(mod, "%s: failed to alloc string", __func__);
618 		(void) topo_mod_seterrno(mod, EMOD_NOMEM);
619 		goto error;
620 	}
621 	if (topo_prop_set_string(nvme, TOPO_PGROUP_NVME, TOPO_PROP_NVME_VER,
622 	    TOPO_PROP_IMMUTABLE, vers, &err) != 0) {
623 		topo_mod_dprintf(mod, "%s: failed to set %s/%s property",
624 		    __func__, TOPO_PGROUP_NVME, TOPO_PROP_NVME_VER);
625 		(void) topo_mod_seterrno(mod, err);
626 		goto error;
627 	}
628 
629 	if (topo_pgroup_create(nvme, &io_pgroup, &err) != 0) {
630 		topo_mod_dprintf(mod, "%s: failed to create %s pgroup: %s",
631 		    __func__, TOPO_PGROUP_IO, topo_strerror(err));
632 		(void) topo_mod_seterrno(mod, err);
633 		goto error;
634 	}
635 
636 	if (!disk_nvme_common_io(mod, nvme, nvme_info->nei_dinode)) {
637 		goto error;
638 	}
639 
640 	/*
641 	 * Create a child disk node for each namespace.
642 	 */
643 	if (topo_node_range_create(mod, nvme, DISK, 0,
644 	    nvme_ctrl_info_nns(info) - 1) < 0) {
645 		/* errno set */
646 		topo_mod_dprintf(mod, "%s: error creating %s range", __func__,
647 		    DISK);
648 		goto error;
649 	}
650 
651 	/*
652 	 * Iterate over each namespace to see if it's a candidate for inclusion.
653 	 * Namespaces start at index 1 and not every namespace will be included.
654 	 * We map things such that a disk instance is always namespace - 1 to
655 	 * fit into the above mapping.
656 	 */
657 	if (!nvme_ns_discover_init(nvme_info->nei_ctrl,
658 	    NVME_NS_DISC_F_NOT_IGNORED, &iter)) {
659 		topo_mod_dprintf(mod, "failed to initialize namespace "
660 		    "discovery: %s", nvme_errmsg(nvme_info->nei_libnvme));
661 		ret = topo_mod_seterrno(mod, EMOD_UNKNOWN);
662 		goto error;
663 	}
664 
665 	for (nret = nvme_ns_discover_step(iter, &disc); nret == NVME_ITER_VALID;
666 	    nret = nvme_ns_discover_step(iter, &disc)) {
667 		nvme_ns_info_t *ns_info;
668 		uint32_t nsid = nvme_ns_disc_nsid(disc);
669 
670 		if (!nvme_ctrl_ns_info_snap(nvme_info->nei_ctrl, nsid,
671 		    &ns_info)) {
672 			topo_mod_dprintf(mod, "failed to get namespace "
673 			    "information for ns %u: %s", nsid,
674 			    nvme_errmsg(nvme_info->nei_libnvme));
675 			ret = topo_mod_seterrno(mod, EMOD_UNKNOWN);
676 			goto error;
677 		}
678 
679 		disk_nvme_make_ns(nvme_info, ns_info);
680 		nvme_ns_info_free(ns_info);
681 	}
682 
683 	if (nret == NVME_ITER_ERROR) {
684 		topo_mod_dprintf(mod, "namespace discovery failed: %s",
685 		    nvme_errmsg(nvme_info->nei_libnvme));
686 		ret = topo_mod_seterrno(mod, EMOD_UNKNOWN);
687 	}
688 	ret = 0;
689 
690 error:
691 	nvme_ns_discover_fini(iter);
692 	free(vers);
693 	nvlist_free(auth);
694 	nvlist_free(fmri);
695 	topo_mod_strfree(mod, model);
696 	topo_mod_strfree(mod, serial);
697 	topo_mod_strfree(mod, label);
698 	return (ret);
699 }
700 
701 /*
702  * This function gathers identity information from the NVMe controller and
703  * stores it in a struct.  This struct is passed to make_nvme_node(), which
704  * does the actual topo node creation.
705  */
706 static int
discover_nvme_ctl(topo_mod_t * mod,tnode_t * pnode,di_node_t dinode)707 discover_nvme_ctl(topo_mod_t *mod, tnode_t *pnode, di_node_t dinode)
708 {
709 	topo_disk_t *disk = topo_mod_getspecific(mod);
710 	nvme_enum_info_t nvme_info = { 0 };
711 	int ret;
712 
713 	nvme_info.nei_mod = mod;
714 	nvme_info.nei_dinode = dinode;
715 	nvme_info.nei_parent = pnode;
716 	nvme_info.nei_libnvme = disk->td_nvme;
717 
718 	if (!nvme_ctrl_init(disk->td_nvme, dinode, &nvme_info.nei_ctrl)) {
719 		topo_mod_dprintf(mod, "failed to initialize nvme_ctrl_t: %s",
720 		    nvme_errmsg(disk->td_nvme));
721 		return (topo_mod_seterrno(mod, EMOD_UNKNOWN));
722 	}
723 
724 	if (!nvme_ctrl_info_snap(nvme_info.nei_ctrl,
725 	    &nvme_info.nei_ctrl_info)) {
726 		topo_mod_dprintf(mod, "failed to initialize nvme_ctrl_t: %s",
727 		    nvme_errmsg(disk->td_nvme));
728 		ret = topo_mod_seterrno(mod, EMOD_UNKNOWN);
729 		goto error;
730 	}
731 
732 	nvme_info.nei_vers = nvme_ctrl_info_version(nvme_info.nei_ctrl_info);
733 
734 	if ((ret = make_nvme_node(&nvme_info)) != 0) {
735 		goto error;
736 	}
737 
738 error:
739 	if (nvme_info.nei_ctrl_info != NULL)
740 		nvme_ctrl_info_free(nvme_info.nei_ctrl_info);
741 	if (nvme_info.nei_ctrl != NULL)
742 		nvme_ctrl_fini(nvme_info.nei_ctrl);
743 	return (ret);
744 }
745 
746 int
disk_nvme_enum_disk(topo_mod_t * mod,tnode_t * pnode)747 disk_nvme_enum_disk(topo_mod_t *mod, tnode_t *pnode)
748 {
749 	char *parent = NULL;
750 	int err;
751 	di_node_t devtree;
752 	di_node_t dnode;
753 	int ret = -1;
754 
755 	/*
756 	 * Lookup a property containing the devfs path of the parent PCIe
757 	 * device of the NVMe device we're attempting to enumerate.  This
758 	 * property is hard-coded in per-platform topo XML maps that are
759 	 * delivered with the OS.  This hard-coded path allows topo to map a
760 	 * given NVMe controller to a physical location (bay or slot) on the
761 	 * platform, when generating the topo snapshot.
762 	 */
763 	if (topo_prop_get_string(pnode, TOPO_PGROUP_BINDING,
764 	    TOPO_BINDING_PARENT_DEV, &parent, &err) != 0) {
765 		topo_mod_dprintf(mod, "parent node was missing nvme binding "
766 		    "properties\n");
767 		(void) topo_mod_seterrno(mod, err);
768 		goto out;
769 	}
770 	if ((devtree = topo_mod_devinfo(mod)) == DI_NODE_NIL) {
771 		topo_mod_dprintf(mod, "failed to get devinfo snapshot");
772 		(void) topo_mod_seterrno(mod, EMOD_UNKNOWN);
773 		goto out;
774 	}
775 
776 	/*
777 	 * Walk the devinfo tree looking NVMe devices. For each NVMe device,
778 	 * check if the devfs path of the parent matches the one specified in
779 	 * TOPO_BINDING_PARENT_DEV.
780 	 */
781 	dnode = di_drv_first_node(NVME_DRV, devtree);
782 	while (dnode != DI_NODE_NIL) {
783 		char *path;
784 
785 		if ((path = di_devfs_path(di_parent_node(dnode))) == NULL) {
786 			topo_mod_dprintf(mod, "failed to get dev path");
787 			(void) topo_mod_seterrno(mod, EMOD_UNKNOWN);
788 			goto out;
789 		}
790 		if (strcmp(parent, path) == 0) {
791 			ret = discover_nvme_ctl(mod, pnode, dnode);
792 			di_devfs_path_free(path);
793 			goto out;
794 		}
795 		di_devfs_path_free(path);
796 		dnode = di_drv_next_node(dnode);
797 	}
798 	ret = 0;
799 
800 out:
801 	topo_mod_strfree(mod, parent);
802 	return (ret);
803 }
804