/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2016 The MathWorks, Inc. All rights reserved.
 * Copyright 2019 Unix Software Ltd.
 * Copyright 2020 Joyent, Inc.
 * Copyright 2020 Racktop Systems.
 * Copyright 2024 Oxide Computer Company.
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
 */

/*
 * blkdev driver for NVMe compliant storage devices
 *
 * This driver targets and is designed to support all NVMe 1.x and NVMe 2.x
 * devices. Features are added to the driver as we encounter devices that
 * require them and as our needs dictate, so some commands or log pages may not
 * take advantage of newer features that devices support at this time. When you
 * encounter such a case, it is generally fine to add that support to the driver
 * as long as you take care to ensure that the requisite device version is met
 * before using it.
 *
 * The driver has only been tested on x86 systems and will not work on big-
 * endian systems without changes to the code accessing registers and data
 * structures used by the hardware.
 *
 *
 * Interrupt Usage:
 *
 * The driver will use a single interrupt while configuring the device as the
 * specification requires, but contrary to the specification it will try to use
 * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
 * will switch to multiple-message MSI(-X) if supported. The driver wants to
 * have one interrupt vector per CPU, but it will work correctly if fewer are
 * available. Interrupts can be shared by queues; the interrupt handler will
 * iterate through the I/O queue array in steps of n_intr_cnt. Usually only
 * the admin queue will share an interrupt with one I/O queue. The interrupt
 * handler will retrieve completed commands from all queues sharing an interrupt
 * vector and will post them to a taskq for completion processing.
 *
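 * As a rough sketch (see nvme_intr() for the real logic), the per-vector
 * iteration described above has this shape, where inum is the vector number
 * passed to the handler:
 *
 *	for (qnum = inum; qnum < nvme->n_cq_count; qnum += nvme->n_intr_cnt)
 *		ccnt += nvme_process_iocq(nvme, nvme->n_cq[qnum]);
 *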
 *
 * Command Processing:
 *
 * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
 * to 65536 I/O commands. The driver will configure one I/O queue pair per
 * available interrupt vector, with the queue length usually much smaller than
 * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
 * interrupt vectors will be used.
 *
 * Additionally, the hardware provides a single special admin queue pair that
 * can hold up to 4096 admin commands.
 *
 * From the hardware perspective both queues of a queue pair are independent,
 * but they share some driver state: the command array (holding pointers to
 * commands currently being processed by the hardware) and the active command
 * counter. Access to a submission queue and the shared state is protected by
 * nq_mutex; the completion queue is protected by ncq_mutex.
 *
 * When a command is submitted to a queue pair the active command counter is
 * incremented and a pointer to the command is stored in the command array. The
 * array index is used as the command identifier (CID) in the submission queue
 * entry. Some commands may take a very long time to complete, and if the queue
 * wraps around in that time a submission may find the next array slot still in
 * use by a long-running command. In this case the array is sequentially
 * searched for the next free slot. The length of the command array is the same
 * as the configured queue length. Queue overrun is prevented by the per-queue
 * semaphore (nq_sema), so a command submission may block if the queue is full.
 *
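 * A minimal sketch of the CID allocation just described (see
 * nvme_submit_cmd_common() for the real code; the nc_sqe member name is an
 * assumption here and all locking is elided):
 *
 *	while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
 *		qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 *	cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
 *	qp->nq_cmd[qp->nq_next_cmd] = cmd;
 *	qp->nq_active_cmds++;
 *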
 *
 * Polled I/O Support:
 *
 * For kernel core dump support the driver can do polled I/O. As interrupts are
 * turned off while dumping, the driver will just submit a command in the
 * regular way and then repeatedly attempt a command retrieval until it gets the
 * command back.
 *
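 * Roughly, the polled path reduces to the following sketch (the real logic
 * lives in the blkdev command path):
 *
 *	(void) nvme_submit_io_cmd(ioq, cmd);
 *	do {
 *		ret = nvme_retrieve_cmd(nvme, ioq);
 *	} while (ret != cmd);
 *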
 *
 * Namespace Support:
 *
 * NVMe devices can have multiple namespaces, each being an independent data
 * store. The driver supports multiple namespaces and creates a blkdev interface
 * for each namespace found. Namespaces can have various attributes to support
 * protection information. This driver does not support any of these and ignores
 * namespaces that have such attributes.
 *
 * As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
 * (EUI64), and NVMe 1.2 introduced an additional 128bit Namespace Globally
 * Unique Identifier (NGUID). This driver uses either the NGUID or the EUI64
 * if present to generate the devid, and passes the EUI64 to blkdev to use it
 * in the device node names.
 *
 * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
 * single controller. This is an artificial limit imposed by the driver to be
 * able to address a reasonable number of controllers and namespaces while
 * fitting within the constraints of MAXMIN32, aka a 32-bit device number which
 * only has 18 bits for the minor number. See the minor node section for more
 * information.
 *
 *
 * Minor nodes:
 *
 * For each NVMe device the driver exposes one minor node for the controller and
 * one minor node for each namespace. The only operations supported by those
 * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the
 * primary control interface for the devices. The character device is a private
 * interface and we attempt to provide stability through libnvme and, more so,
 * nvmeadm.
 *
 * The controller minor node is much more flexible than the namespace minor node
 * and should be preferred. The controller node allows one to target any
 * namespace that the device has, while the namespace minor is limited in what
 * it can acquire. While the namespace minor exists, it should not be relied
 * upon and is not used by libnvme.
 *
 * The minor number space is split in two. We use the lower part to support the
 * controller and namespaces as described above in the 'Namespace Support'
 * section. The second set is used for cloning opens. We set aside one million
 * minors for this purpose. We utilize a cloning open so that we can have
 * per-file_t state. This is how we end up implementing and tracking locking
 * state and related matters.
 *
 * When we have this cloned open, we allocate a new nvme_minor_t which gets
 * its minor number from the nvme_open_minors id_space_t and is stored in the
 * nvme_open_minors_avl. Although open is called on a controller or namespace
 * minor, everything else occurs in the context of one of these ephemeral
 * minors.
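 *
 * Condensed, the cloning open looks roughly like the following sketch (error
 * handling is elided and member names such as nm_minor are illustrative):
 *
 *	minor = kmem_zalloc(sizeof (nvme_minor_t), KM_SLEEP);
 *	minor->nm_minor = id_alloc_nosleep(nvme_open_minors);
 *	mutex_enter(&nvme_open_minors_mutex);
 *	avl_add(&nvme_open_minors_avl, minor);
 *	mutex_exit(&nvme_open_minors_mutex);
 *	*devp = makedevice(getmajor(*devp), minor->nm_minor);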
 *
 *
 * ioctls, Errors, and Exclusive Access:
 *
 * All of the logical commands that one can issue are driven through the
 * ioctl(9E) interface. All of our ioctls have a similar shape where they
 * all include the 'nvme_ioctl_common_t' as their first member.
 *
 * This common ioctl structure is used to communicate the namespace that should
 * be targeted. When the namespace is left as 0, then that indicates that it
 * should target whatever the minor node's default is. For a namespace minor,
 * that will be transparently rewritten to the namespace's namespace id.
 *
 * In addition, the nvme_ioctl_common_t structure also has a standard error
 * return. Our goal in our ioctl path is to ensure that we have useful semantic
 * errors as much as possible. EINVAL, EIO, etc. are all overloaded. Instead, as
 * long as we can copy in our structure, we will set a semantic error. If
 * we have an error from the controller, then that will be included there.
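 *
 * Illustratively, the common shape is along these lines; the member names
 * here are simplified stand-ins rather than the authoritative definition:
 *
 *	typedef struct {
 *		uint32_t nioc_nsid;	(target namespace ID; 0 = default)
 *		uint32_t nioc_drv_err;	(semantic error set by the driver)
 *	} nvme_ioctl_common_t;
 *
 * Every ioctl argument structure embeds this as its first member, which is
 * what lets the common validation code treat them uniformly.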
 *
 * Each command has a specific policy that controls whether or not it is allowed
 * on the namespace or controller minor, whether the broadcast namespace is
 * allowed, various settings around what kind of exclusive access is allowed,
 * and more. Each of these is wrapped up in a bit of policy described by the
 * 'nvme_ioctl_check_t' structure.
 *
 * The driver provides a form of exclusion in the form of both a
 * controller-level and namespace-level read and write lock. Most operations do
 * not require a lock (e.g. get log page, identify, etc.), but a few do (e.g.
 * format nvm, firmware related activity, etc.). A read lock guarantees that you
 * can complete your operation without interference, but read locks are not
 * required. If you don't take a read lock and someone comes in with a write
 * lock, then subsequent operations will fail with a semantic error indicating
 * that you were blocked by the write lock.
 *
 * Here are some of the rules that govern our locks:
 *
 * 1. Writers starve readers. Any readers are allowed to finish when there is a
 *    pending writer; however, all subsequent readers will be blocked upon that
 *    writer.
 * 2. A controller write lock takes priority over all other locks. Put
 *    differently, a controller writer not only starves subsequent controller
 *    readers, but also all namespace read and write locks.
 * 3. Each namespace lock is independent.
 * 4. At most a single namespace lock may be owned.
 * 5. If you own a namespace lock, you may not take a controller lock (to help
 *    with lock ordering).
 * 6. In a similar spirit, if you own a controller write lock, you may not take
 *    any namespace lock. Someone with the controller write lock can perform any
 *    operations that they need to. However, if you have a controller read lock
 *    you may take any namespace lock.
 * 7. There is no ability to upgrade a read lock to a write lock.
 * 8. There is no recursive locking.
 *
 * While there's a lot there to keep track of, the goal of these rules is to
 * constrain things so as to avoid deadlock. This is more complex than the
 * original implementation in the driver, which only allowed for an exclusive
 * open that was tied to the thread. The first issue with tying this to the
 * thread was that it didn't work well for software that utilizes thread
 * pools, like complex daemons. The second issue is that we want the ability for
 * daemons, such as a FRU monitor, to be able to retain a file descriptor to the
 * device without blocking others from taking action except during critical
 * periods.
 *
 * In particular, to enable something like libnvme, we didn't want someone to
 * have to open and close the file descriptor to change what kind of exclusive
 * access they desired.
 *
 * There are two different sets of data structures that we employ for tracking
 * locking information:
 *
 * 1) The nvme_lock_t structure is contained in both the nvme_t and the
 * nvme_namespace_t and tracks the current writer, readers, and pending writers
 * and readers. The reader lists and the writer pointer all refer to our
 * second data structure.
 *
 * When a lock is owned by a single writer, then the nl_writer field is set to a
 * specific minor's lock data structure. If instead readers are present, then
 * the nl_readers list_t is not empty. An invariant of the system is that if
 * nl_writer is non-NULL, nl_readers must be empty and conversely, if nl_readers
 * is not empty, nl_writer must be NULL.
 *
 * 2) The nvme_minor_lock_info_t exists in the nvme_minor_t. There is one
 * information structure which represents the minor's controller lock and a
 * second one that represents the minor's namespace lock. The members of this
 * structure are broken into tracking what the current lock is and what it
 * targets. It also has several members that are intended for debugging
 * (nli_last_change, nli_acq_kthread, etc.).
 *
 * While the minor has two different lock information structures, our rules
 * ensure that only one of the two can be pending and that they shouldn't result
 * in a deadlock. When a lock is pending, the caller is sleeping on the minor's
 * nm_cv member.
 *
 * These relationships are represented in the following image which shows a
 * controller write lock being held with pending readers on the controller
 * lock and pending writers on one of the controller's namespaces.
 *
 * +---------+
 * | nvme_t  |
 * |         |
 * | n_lock -|-------+
 * | n_ns -+ |       |                          +-----------------------------+
 * +-------|-+   +-----------------+            | nvme_minor_t                |
 *         |     | nvme_lock_t     |            |                             |
 *         |     |                 |            |  +------------------------+ |
 *         |     |   writer      --|-------------->| nvme_minor_lock_info_t | |
 *         |     | reader list     |            |  | nm_ctrl_lock           | |
 *         |     | pending writers |            |  +------------------------+ |
 *         |     | pending readers |------+     |  +------------------------+ |
 *         |     +-----------------+      |     |  | nvme_minor_lock_info_t | |
 *         |                              |     |  | nm_ns_lock             | |
 *         |                              |     |  +------------------------+ |
 *         |                              |     +-----------------------------+
 * +------------------+                   |     +-----------------+
 * | nvme_namespace_t |                   |     | nvme_minor_t    |
 * |                  |                   |     |                 |
 * | ns_lock ---+     |                   |     | +-------------+ |
 * +------------|-----+                   +-----|>|nm_ctrl_lock | |
 *              |                               | +-------------+ |
 *              v                               +-----------------+
 * +------------------+                                 ...
 * | nvme_lock_t      |                         +-----------------+
 * |                  |                         | nvme_minor_t    |
 * | writer           |                         |                 |
 * | reader list      |                         | +-------------+ |
 * | pending writers -|-----------------+       | |nm_ctrl_lock | |
 * | pending readers  |                 |       | +-------------+ |
 * +------------------+                 |       +-----------------+
 * +-----------------------------+      |       +-----------------------------+
 * | nvme_minor_t                |      |       | nvme_minor_t                |
 * |                             |      |       |                             |
 * |  +------------------------+ |      |       |  +------------------------+ |
 * |  | nvme_minor_lock_info_t | |      |       |  | nvme_minor_lock_info_t | |
 * |  | nm_ctrl_lock           | |      |       |  | nm_ctrl_lock           | |
 * |  +------------------------+ |      |       |  +------------------------+ |
 * |  +------------------------+ |      v       |  +------------------------+ |
 * |  | nvme_minor_lock_info_t |-|------+--------->| nvme_minor_lock_info_t | |
 * |  | nm_ns_lock             | |              |  | nm_ns_lock             | |
 * |  +------------------------+ |              |  +------------------------+ |
 * +-----------------------------+              +-----------------------------+
 *
 * Blkdev Interface:
 *
 * This driver uses blkdev to do all the heavy lifting involved with presenting
 * a disk device to the system. As a result, the processing of I/O requests is
 * relatively simple as blkdev takes care of partitioning, boundary checks, DMA
 * setup, and splitting of transfers into manageable chunks.
 *
 * I/O requests coming in from blkdev are turned into NVM commands and posted to
 * an I/O queue. The queue is selected by taking the CPU id modulo the number of
 * queues. There is currently no timeout handling of I/O commands.
 *
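 * The queue selection reduces to something like the following sketch; the
 * exact expression lives in the blkdev command path and the +1 skips the
 * admin queue at index 0:
 *
 *	qp = nvme->n_ioq[(CPU->cpu_seqid % nvme->n_ioq_count) + 1];
 *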
 * Blkdev also supports querying device/media information and generating a
 * devid. The driver reports the best block size as determined by the namespace
 * format back to blkdev as physical block size to support partition and block
 * alignment. The devid is either based on the namespace GUID or EUI64, if
 * present, or composed using the device vendor ID, model number, serial number,
 * and the namespace ID.
 *
 *
 * Error Handling:
 *
 * Error handling is currently limited to detecting fatal hardware errors,
 * either by asynchronous events, or synchronously through command status or
 * admin command timeouts. In case of severe errors the device is fenced off,
 * and all further requests will return EIO. FMA is then called to fault the
 * device.
 *
 * The hardware has a limit for outstanding asynchronous event requests. Before
 * this limit is known the driver assumes it is at least 1 and posts a single
 * asynchronous request. Later, when the limit is known, more asynchronous event
 * requests are posted to allow quicker reception of error information. When an
 * asynchronous event is posted by the hardware the driver will parse the error
 * status fields and log information or fault the device, depending on the
 * severity of the asynchronous event. The asynchronous event request is then
 * reused and posted to the admin queue again.
 *
 * On command completion the command status is checked for errors. In case of
 * errors indicating a driver bug the driver panics. Almost all other error
 * status values just cause EIO to be returned.
 *
 * Command timeouts are currently detected for all admin commands except
 * asynchronous event requests. If a command times out and the hardware appears
 * to be healthy the driver attempts to abort the command. The original command
 * timeout is also applied to the abort command. If the abort times out too the
 * driver assumes the device to be dead, fences it off, and calls FMA to retire
 * it. In all other cases the aborted command should return immediately with a
 * status indicating it was aborted, and the driver will wait indefinitely for
 * that to happen. No timeout handling of normal I/O commands is presently done.
 *
 * Any command that times out due to the controller dropping dead will be put on
 * the nvme_lost_cmds list if it references DMA memory. This will prevent the
 * DMA memory from being reused by the system and later being written to by a
 * "dead" NVMe controller.
 *
 *
 * Locking:
 *
 * Each queue pair has a nq_mutex and ncq_mutex. The nq_mutex must be held
 * when accessing shared state and submission queue registers, ncq_mutex
 * is held when accessing completion queue state and registers.
 * Callers of nvme_unqueue_cmd() must make sure that nq_mutex is held, while
 * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of both
 * mutexes themselves.
 *
 * Each command also has its own nc_mutex, which is associated with the
 * condition variable nc_cv. It is only used on admin commands which are run
 * synchronously. In that case it must be held across calls to
 * nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by
 * nvme_admin_cmd(). It must also be held whenever the completion state of the
 * command is changed or while an admin command timeout is handled.
 *
 * If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired first.
 * More than one nc_mutex may only be held when aborting commands. In this case,
 * the nc_mutex of the command to be aborted must be held across the call to
 * nvme_abort_cmd() to prevent the command from completing while the abort is in
 * progress.
 *
 * If both nq_mutex and ncq_mutex need to be held, ncq_mutex must be
 * acquired first. More than one nq_mutex is never held by a single thread.
 * The ncq_mutex is only held by nvme_retrieve_cmd() and
 * nvme_process_iocq(). nvme_process_iocq() is only called from the
 * interrupt thread and nvme_retrieve_cmd() during polled I/O, so the
 * mutex is non-contentious but is required for implementation completeness
 * and safety.
 *
 * There is one mutex n_minor_mutex which protects all open flags nm_open and
 * exclusive-open thread pointers nm_oexcl of each minor node associated with a
 * controller and its namespaces.
 *
 * In addition, there is one mutex n_mgmt_mutex which must be held whenever the
 * driver state for any namespace is changed, especially across calls to
 * nvme_init_ns(), nvme_attach_ns() and nvme_detach_ns(). Except when detaching
 * nvme, it should also be held across calls that modify the blkdev handle of a
 * namespace. Command and queue mutexes may be acquired and released while
 * n_mgmt_mutex is held, but n_minor_mutex should not be.
 *
 *
 * Quiesce / Fast Reboot:
 *
 * The driver currently does not support fast reboot. A quiesce(9E) entry point
 * is still provided which is used to send a shutdown notification to the
 * device.
 *
 *
 * NVMe Hotplug:
 *
 * The driver supports hot removal. The driver uses the NDI event framework
 * to register a callback, nvme_remove_callback, to clean up when a disk is
 * removed. In particular, the driver will unqueue outstanding I/O commands and
 * set n_dead on the softstate to true so that other operations, such as ioctls
 * and command submissions, fail as well.
 *
 * While the callback registration relies on the NDI event framework, the
 * removal event itself is kicked off in the PCIe hotplug framework, when the
 * PCIe bridge driver ("pcieb") gets a hotplug interrupt indicating that a
 * device was removed from the slot.
 *
 * The NVMe driver instance itself will remain until the final close of the
 * device.
 *
 *
 * DDI UFM Support
 *
 * The driver supports the DDI UFM framework for reporting information about
 * the device's firmware image and slot configuration. This data can be
 * queried by userland software via ioctls to the ufm driver. For more
 * information, see ddi_ufm(9E).
 *
 *
 * Driver Configuration:
 *
 * The following driver properties can be changed to control some aspects of the
 * driver's operation; an example follows the list:
 * - strict-version: can be set to 0 to allow devices conforming to newer
 *   major versions to be used
 * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
 *   specific command status as a fatal error leading to device faulting
 * - admin-queue-len: the maximum length of the admin queue (16-4096)
 * - io-squeue-len: the maximum length of the I/O submission queues (16-65536)
 * - io-cqueue-len: the maximum length of the I/O completion queues (16-65536)
 * - async-event-limit: the maximum number of asynchronous event requests to be
 *   posted by the driver
 * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
 *   cache
 * - min-phys-block-size: the minimum physical block size to report to blkdev,
 *   which is among other things the basis for ZFS vdev ashift
 * - max-submission-queues: the maximum number of I/O submission queues.
 * - max-completion-queues: the maximum number of I/O completion queues,
 *   can be less than max-submission-queues, in which case the completion
 *   queues are shared.
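 *
 * For instance, a hypothetical nvme.conf fragment capping the queue counts
 * might look like this (the values are illustrative):
 *
 *	max-submission-queues=8;
 *	max-completion-queues=4;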
 *
 * In addition to the above properties, some device-specific tunables can be
 * configured using the nvme-config-list global property. The value of this
 * property is a list of triplets. The formal syntax is:
 *
 *	nvme-config-list ::= <triplet> [, <triplet>]* ;
 *	<triplet> ::= "<model>" , "<rev-list>" , "<tuple-list>"
 *	<rev-list> ::= [ <fwrev> [, <fwrev>]*]
 *	<tuple-list> ::= <tunable> [, <tunable>]*
 *	<tunable> ::= <name> : <value>
 *
 * The <model> and <fwrev> are the strings in nvme_identify_ctrl_t`id_model and
 * nvme_identify_ctrl_t`id_fwrev, respectively. The remainder of <tuple-list>
 * contains one or more tunables to apply to all controllers that match the
 * specified model number and optionally firmware revision. Each <tunable> is a
 * <name> : <value> pair. Supported tunables are:
 *
 * - ignore-unknown-vendor-status: can be set to "on" to not handle any vendor
 *   specific command status as a fatal error leading to device faulting
 *
 * - min-phys-block-size: the minimum physical block size to report to blkdev,
 *   which is among other things the basis for ZFS vdev ashift
 *
 * - volatile-write-cache: can be set to "on" or "off" to enable or disable the
 *   volatile write cache, if present (an example of the whole list follows)
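 *
 * Putting the grammar together, a hypothetical example (the model and
 * firmware revision strings here are purely illustrative):
 *
 *	nvme-config-list = "Model A", "", "ignore-unknown-vendor-status:on",
 *	    "Model B", "1.0a2,1.0a3", "min-phys-block-size:8192";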
 *
 *
 * TODO:
 * - figure out sane default for I/O queue depth reported to blkdev
 * - FMA handling of media errors
 * - support for devices supporting very large I/O requests using chained PRPs
 * - support for configuring hardware parameters like interrupt coalescing
 * - support for media formatting and hard partitioning into namespaces
 * - support for big-endian systems
 * - support for fast reboot
 * - support for NVMe Subsystem Reset (1.1)
 * - support for Scatter/Gather lists (1.1)
 * - support for Reservations (1.1)
 * - support for power management
 */

#include <sys/byteorder.h>
#ifdef _BIG_ENDIAN
#error nvme driver needs porting for big-endian platforms
#endif

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ddi.h>
#include <sys/ddi_ufm.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/varargs.h>
#include <sys/cpuvar.h>
#include <sys/disp.h>
#include <sys/blkdev.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/sata/sata_hba.h>
#include <sys/stat.h>
#include <sys/policy.h>
#include <sys/list.h>
#include <sys/dkio.h>
#include <sys/pci.h>
#include <sys/mkdev.h>

#include <sys/nvme.h>

#ifdef __x86
#include <sys/x86_archext.h>
#endif

#include "nvme_reg.h"
#include "nvme_var.h"

/*
 * Assertions to make sure that we've properly captured various aspects of the
 * packed structures and haven't broken them during updates.
 */
CTASSERT(sizeof (nvme_identify_ctrl_t) == NVME_IDENTIFY_BUFSIZE);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_oacs) == 256);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_sqes) == 512);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_oncs) == 520);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_subnqn) == 768);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_nvmof) == 1792);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_psd) == 2048);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_vs) == 3072);

CTASSERT(sizeof (nvme_identify_nsid_t) == NVME_IDENTIFY_BUFSIZE);
CTASSERT(offsetof(nvme_identify_nsid_t, id_fpi) == 32);
CTASSERT(offsetof(nvme_identify_nsid_t, id_anagrpid) == 92);
CTASSERT(offsetof(nvme_identify_nsid_t, id_nguid) == 104);
CTASSERT(offsetof(nvme_identify_nsid_t, id_lbaf) == 128);
CTASSERT(offsetof(nvme_identify_nsid_t, id_vs) == 384);

CTASSERT(sizeof (nvme_identify_nsid_list_t) == NVME_IDENTIFY_BUFSIZE);
CTASSERT(sizeof (nvme_identify_ctrl_list_t) == NVME_IDENTIFY_BUFSIZE);

CTASSERT(sizeof (nvme_identify_primary_caps_t) == NVME_IDENTIFY_BUFSIZE);
CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vqfrt) == 32);
CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vifrt) == 64);

CTASSERT(sizeof (nvme_nschange_list_t) == 4096);


/* NVMe spec version supported */
static const int nvme_version_major = 2;

/* tunable for admin command timeout in seconds, default is 1s */
uint32_t nvme_admin_cmd_timeout = 1;

/* tunable for FORMAT NVM command timeout in seconds, default is 600s */
uint32_t nvme_format_cmd_timeout = 600;

/* tunable for firmware commit with NVME_FWC_SAVE, default is 15s */
uint32_t nvme_commit_save_cmd_timeout = 15;

/*
 * tunable for the size of arbitrary vendor specific admin commands,
 * default is 16MiB.
 */
uint32_t nvme_vendor_specific_admin_cmd_size = 1 << 24;

/*
 * tunable for the max timeout of arbitrary vendor specific admin commands,
 * default is 60s.
 */
uint_t nvme_vendor_specific_admin_cmd_max_timeout = 60;

/*
 * This ID space, AVL, and lock are used for keeping track of minor state across
 * opens between different devices.
 */
static id_space_t *nvme_open_minors;
static avl_tree_t nvme_open_minors_avl;
kmutex_t nvme_open_minors_mutex;

/*
 * Removal taskq used for n_dead callback processing.
 */
taskq_t *nvme_dead_taskq;

static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
static int nvme_quiesce(dev_info_t *);
static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
static int nvme_setup_interrupts(nvme_t *, int, int);
static void nvme_release_interrupts(nvme_t *);
static uint_t nvme_intr(caddr_t, caddr_t);

static void nvme_shutdown(nvme_t *, boolean_t);
static boolean_t nvme_reset(nvme_t *, boolean_t);
static int nvme_init(nvme_t *);
static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
static void nvme_free_cmd(nvme_cmd_t *);
static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
    bd_xfer_t *);
static void nvme_admin_cmd(nvme_cmd_t *, uint32_t);
static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *);
static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *);
static nvme_cmd_t *nvme_unqueue_cmd(nvme_t *, nvme_qpair_t *, int);
static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
static void nvme_wait_cmd(nvme_cmd_t *, uint_t);
static void nvme_wakeup_cmd(void *);
static void nvme_async_event_task(void *);

static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
static int nvme_check_specific_cmd_status(nvme_cmd_t *);
static int nvme_check_generic_cmd_status(nvme_cmd_t *);
static inline int nvme_check_cmd_status(nvme_cmd_t *);
static boolean_t nvme_check_cmd_status_ioctl(nvme_cmd_t *,
    nvme_ioctl_common_t *);

static int nvme_abort_cmd(nvme_cmd_t *, uint_t);
static void nvme_async_event(nvme_t *);
static boolean_t nvme_format_nvm(nvme_t *, nvme_ioctl_format_t *);
static boolean_t nvme_get_logpage_int(nvme_t *, boolean_t, void **, size_t *,
    uint8_t);
static boolean_t nvme_identify(nvme_t *, boolean_t, nvme_ioctl_identify_t *,
    void **);
static boolean_t nvme_identify_int(nvme_t *, uint32_t, uint8_t, void **);
static int nvme_set_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t,
    uint32_t *);
static int nvme_write_cache_set(nvme_t *, boolean_t);
static int nvme_set_nqueues(nvme_t *);

static void nvme_free_dma(nvme_dma_t *);
static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
    nvme_dma_t **);
static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
    nvme_dma_t **);
static void nvme_free_qpair(nvme_qpair_t *);
static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, uint_t);
static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);

static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
static inline uint32_t nvme_get32(nvme_t *, uintptr_t);

static boolean_t nvme_check_regs_hdl(nvme_t *);
static boolean_t nvme_check_dma_hdl(nvme_dma_t *);

static int nvme_fill_prp(nvme_cmd_t *, ddi_dma_handle_t);

static void nvme_bd_xfer_done(void *);
static void nvme_bd_driveinfo(void *, bd_drive_t *);
static int nvme_bd_mediainfo(void *, bd_media_t *);
static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t);
static int nvme_bd_read(void *, bd_xfer_t *);
static int nvme_bd_write(void *, bd_xfer_t *);
static int nvme_bd_sync(void *, bd_xfer_t *);
static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);
static int nvme_bd_free_space(void *, bd_xfer_t *);

static int nvme_prp_dma_constructor(void *, void *, int);
static void nvme_prp_dma_destructor(void *, void *);

static void nvme_prepare_devid(nvme_t *, uint32_t);

/* DDI UFM callbacks */
static int nvme_ufm_fill_image(ddi_ufm_handle_t *, void *, uint_t,
    ddi_ufm_image_t *);
static int nvme_ufm_fill_slot(ddi_ufm_handle_t *, void *, uint_t, uint_t,
    ddi_ufm_slot_t *);
static int nvme_ufm_getcaps(ddi_ufm_handle_t *, void *, ddi_ufm_cap_t *);

static int nvme_open(dev_t *, int, int, cred_t *);
static int nvme_close(dev_t, int, int, cred_t *);
static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);

static int nvme_init_ns(nvme_t *, uint32_t);
static boolean_t nvme_attach_ns(nvme_t *, nvme_ioctl_common_t *);
static boolean_t nvme_detach_ns(nvme_t *, nvme_ioctl_common_t *);

static int nvme_minor_comparator(const void *, const void *);

static ddi_ufm_ops_t nvme_ufm_ops = {
	NULL,
	nvme_ufm_fill_image,
	nvme_ufm_fill_slot,
	nvme_ufm_getcaps
};

/*
 * Minor numbers are split amongst those used for controllers and for device
 * opens. The number of controller minors is limited based on MAXMIN32 per
 * the theory statement. We allocate 1 million minors as a total guess at a
 * number that'll probably be enough. The starting point of the open minors can
 * be shifted to accommodate future expansion of the NVMe device minors.
 */
#define	NVME_MINOR_INST_SHIFT	9
#define	NVME_MINOR(inst, nsid)	(((inst) << NVME_MINOR_INST_SHIFT) | (nsid))
#define	NVME_MINOR_INST(minor)	((minor) >> NVME_MINOR_INST_SHIFT)
#define	NVME_MINOR_NSID(minor)	((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1))
#define	NVME_MINOR_MAX		(NVME_MINOR(1, 0) - 2)
#define	NVME_IS_VENDOR_SPECIFIC_CMD(x)	(((x) >= 0xC0) && ((x) <= 0xFF))

#define	NVME_OPEN_NMINORS		(1024 * 1024)
#define	NVME_OPEN_MINOR_MIN		(MAXMIN32 + 1)
#define	NVME_OPEN_MINOR_MAX_EXCL	(NVME_OPEN_MINOR_MIN + \
    NVME_OPEN_NMINORS)
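
/*
 * As a worked example of the encoding above: NVME_MINOR(2, 5) yields
 * (2 << 9) | 5 == 1029, from which NVME_MINOR_INST() recovers the instance
 * number 2 and NVME_MINOR_NSID() the namespace ID 5. Cloned open minors are
 * allocated strictly above MAXMIN32 and therefore never collide with this
 * encoding.
 */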

static void *nvme_state;
static kmem_cache_t *nvme_cmd_cache;

/*
 * DMA attributes for queue DMA memory
 *
 * Queue DMA memory must be page aligned. The maximum length of a queue is
 * 65536 entries, and an entry can be 64 bytes long.
 */
static const ddi_dma_attr_t nvme_queue_dma_attr = {
	.dma_attr_version = DMA_ATTR_V0,
	.dma_attr_addr_lo = 0,
	.dma_attr_addr_hi = 0xffffffffffffffffULL,
	.dma_attr_count_max = (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1,
	.dma_attr_align = 0x1000,
	.dma_attr_burstsizes = 0x7ff,
	.dma_attr_minxfer = 0x1000,
	.dma_attr_maxxfer = (UINT16_MAX + 1) * sizeof (nvme_sqe_t),
	.dma_attr_seg = 0xffffffffffffffffULL,
	.dma_attr_sgllen = 1,
	.dma_attr_granular = 1,
	.dma_attr_flags = 0,
};

/*
 * DMA attributes for transfers using Physical Region Page (PRP) entries
 *
 * A PRP entry describes one page of DMA memory using the page size specified
 * in the controller configuration's memory page size register (CC.MPS). It uses
 * a 64bit base address aligned to this page size. There is no limitation on
 * chaining PRPs together for arbitrarily large DMA transfers. These DMA
 * attributes will be copied into the nvme_t during nvme_attach() and the
 * dma_attr_maxxfer will be updated.
 */
static const ddi_dma_attr_t nvme_prp_dma_attr = {
	.dma_attr_version = DMA_ATTR_V0,
	.dma_attr_addr_lo = 0,
	.dma_attr_addr_hi = 0xffffffffffffffffULL,
	.dma_attr_count_max = 0xfff,
	.dma_attr_align = 0x1000,
	.dma_attr_burstsizes = 0x7ff,
	.dma_attr_minxfer = 0x1000,
	.dma_attr_maxxfer = 0x1000,
	.dma_attr_seg = 0xfff,
	.dma_attr_sgllen = -1,
	.dma_attr_granular = 1,
	.dma_attr_flags = 0,
};

/*
 * DMA attributes for transfers using scatter/gather lists
 *
 * A SGL entry describes a chunk of DMA memory using a 64bit base address and a
 * 32bit length field. SGL Segment and SGL Last Segment entries require the
 * length to be a multiple of 16 bytes. While the SGL DMA attributes are copied
 * into the nvme_t, they are not currently used for any I/O.
 */
static const ddi_dma_attr_t nvme_sgl_dma_attr = {
	.dma_attr_version = DMA_ATTR_V0,
	.dma_attr_addr_lo = 0,
	.dma_attr_addr_hi = 0xffffffffffffffffULL,
	.dma_attr_count_max = 0xffffffffUL,
	.dma_attr_align = 1,
	.dma_attr_burstsizes = 0x7ff,
	.dma_attr_minxfer = 0x10,
	.dma_attr_maxxfer = 0xfffffffffULL,
	.dma_attr_seg = 0xffffffffffffffffULL,
	.dma_attr_sgllen = -1,
	.dma_attr_granular = 0x10,
	.dma_attr_flags = 0
};

static ddi_device_acc_attr_t nvme_reg_acc_attr = {
	.devacc_attr_version = DDI_DEVICE_ATTR_V0,
	.devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
	.devacc_attr_dataorder = DDI_STRICTORDER_ACC
};

/*
 * ioctl validation policies. These are policies that determine which namespaces
 * are allowed or disallowed for various operations. Note, all policy items
 * should be explicitly listed here to help make it clear what our intent is.
 * That is also why some of these are identical or repeated when they cover
 * different ioctls.
 */

/*
 * The controller information ioctl generally contains read-only information
 * about the controller that is sourced from multiple different pieces of
 * information. This does not operate on a namespace and none are accepted.
 */
static const nvme_ioctl_check_t nvme_check_ctrl_info = {
	.nck_ns_ok = B_FALSE, .nck_ns_minor_ok = B_FALSE,
	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_NONE
};

/*
 * The kernel namespace information requires a namespace ID to be specified. It
 * does not allow for the broadcast ID to be specified.
 */
static const nvme_ioctl_check_t nvme_check_ns_info = {
	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_NONE
};

/*
 * Identify commands are allowed to operate on a namespace minor. Unfortunately,
 * the namespace field in identify commands is a bit weird. In particular, some
 * commands need a valid namespace, while others are namespace listing
 * operations, which means illegal namespaces like zero are allowed.
 */
static const nvme_ioctl_check_t nvme_check_identify = {
	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
	.nck_skip_ctrl = B_TRUE, .nck_ctrl_rewrite = B_FALSE,
	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
};

/*
 * The get log page command requires the ability to specify namespaces. When
 * targeting the controller, one must use the broadcast NSID.
 */
static const nvme_ioctl_check_t nvme_check_get_logpage = {
	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_TRUE,
	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
};

/*
 * When getting a feature, we do not want rewriting behavior as most features do
 * not require a namespace to be specified. Specific instances are checked in
 * nvme_validate_get_feature().
 */
static const nvme_ioctl_check_t nvme_check_get_feature = {
	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
};

/*
 * Format commands must target a namespace. The broadcast namespace must be used
 * when referring to the controller.
 */
static const nvme_ioctl_check_t nvme_check_format = {
	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_TRUE,
	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_WRITE
};

/*
 * Attach and detach must always target a minor. However, the broadcast
 * namespace is not allowed. We still perform rewriting so that specifying
 * the controller node with 0 will be caught.
 */
static const nvme_ioctl_check_t nvme_check_attach_detach = {
	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_TRUE,
	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_WRITE
};

/*
 * Firmware operations must not target a namespace and are only allowed from the
 * controller.
 */
static const nvme_ioctl_check_t nvme_check_firmware = {
	.nck_ns_ok = B_FALSE, .nck_ns_minor_ok = B_FALSE,
	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_WRITE
};

/*
 * Passthru commands are an odd set. We only allow them from the primary
 * controller; however, we allow a namespace to be specified in them and allow
 * the broadcast namespace. We do not perform rewriting because we don't know
 * what the semantics are. We explicitly exempt passthru commands from needing
 * an exclusive lock and leave it up to them to tell us the impact of the
 * command and semantics. As this is a privileged interface and the semantics
 * are arbitrary, there's not much we can do without some assistance from the
 * consumer.
 */
static const nvme_ioctl_check_t nvme_check_passthru = {
	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_FALSE,
	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
};

/*
 * Lock operations are allowed to target a namespace, but must not be rewritten.
 * There is no support for the broadcast namespace. This is the only ioctl that
 * should skip exclusive checking as it's used to grant it.
 */
static const nvme_ioctl_check_t nvme_check_locking = {
	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_SKIP
};

static struct cb_ops nvme_cb_ops = {
	.cb_open = nvme_open,
	.cb_close = nvme_close,
	.cb_strategy = nodev,
	.cb_print = nodev,
	.cb_dump = nodev,
	.cb_read = nodev,
	.cb_write = nodev,
	.cb_ioctl = nvme_ioctl,
	.cb_devmap = nodev,
	.cb_mmap = nodev,
	.cb_segmap = nodev,
	.cb_chpoll = nochpoll,
	.cb_prop_op = ddi_prop_op,
	.cb_str = 0,
	.cb_flag = D_NEW | D_MP,
	.cb_rev = CB_REV,
	.cb_aread = nodev,
	.cb_awrite = nodev
};

static struct dev_ops nvme_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,
	.devo_getinfo = ddi_no_info,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_attach = nvme_attach,
	.devo_detach = nvme_detach,
	.devo_reset = nodev,
	.devo_cb_ops = &nvme_cb_ops,
	.devo_bus_ops = NULL,
	.devo_power = NULL,
	.devo_quiesce = nvme_quiesce,
};

static struct modldrv nvme_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "NVMe driver",
	.drv_dev_ops = &nvme_dev_ops
};

static struct modlinkage nvme_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &nvme_modldrv, NULL }
};

static bd_ops_t nvme_bd_ops = {
	.o_version = BD_OPS_CURRENT_VERSION,
	.o_drive_info = nvme_bd_driveinfo,
	.o_media_info = nvme_bd_mediainfo,
	.o_devid_init = nvme_bd_devid,
	.o_sync_cache = nvme_bd_sync,
	.o_read = nvme_bd_read,
	.o_write = nvme_bd_write,
	.o_free_space = nvme_bd_free_space,
};

/*
 * This list will hold commands that have timed out and couldn't be aborted.
 * As we don't know what the hardware may still do with the DMA memory we can't
 * free them, so we'll keep them forever on this list where we can easily look
 * at them with mdb.
 */
static struct list nvme_lost_cmds;
static kmutex_t nvme_lc_mutex;

int
_init(void)
{
	int error;

	error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
	if (error != DDI_SUCCESS)
		return (error);

	if ((nvme_open_minors = id_space_create("nvme_open_minors",
	    NVME_OPEN_MINOR_MIN, NVME_OPEN_MINOR_MAX_EXCL)) == NULL) {
		ddi_soft_state_fini(&nvme_state);
		return (ENOMEM);
	}

	nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
	    sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);

	mutex_init(&nvme_lc_mutex, NULL, MUTEX_DRIVER, NULL);
	list_create(&nvme_lost_cmds, sizeof (nvme_cmd_t),
	    offsetof(nvme_cmd_t, nc_list));

	mutex_init(&nvme_open_minors_mutex, NULL, MUTEX_DRIVER, NULL);
	avl_create(&nvme_open_minors_avl, nvme_minor_comparator,
	    sizeof (nvme_minor_t), offsetof(nvme_minor_t, nm_avl));

	nvme_dead_taskq = taskq_create("nvme_dead_taskq", 1, minclsyspri, 1, 1,
	    TASKQ_PREPOPULATE);

	bd_mod_init(&nvme_dev_ops);

	error = mod_install(&nvme_modlinkage);
	if (error != DDI_SUCCESS) {
		ddi_soft_state_fini(&nvme_state);
		id_space_destroy(nvme_open_minors);
		kmem_cache_destroy(nvme_cmd_cache);
		mutex_destroy(&nvme_lc_mutex);
		list_destroy(&nvme_lost_cmds);
		bd_mod_fini(&nvme_dev_ops);
		mutex_destroy(&nvme_open_minors_mutex);
		avl_destroy(&nvme_open_minors_avl);
		taskq_destroy(nvme_dead_taskq);
	}

	return (error);
}

int
_fini(void)
{
	int error;

	if (!list_is_empty(&nvme_lost_cmds))
		return (DDI_FAILURE);

	error = mod_remove(&nvme_modlinkage);
	if (error == DDI_SUCCESS) {
		ddi_soft_state_fini(&nvme_state);
		id_space_destroy(nvme_open_minors);
		kmem_cache_destroy(nvme_cmd_cache);
		mutex_destroy(&nvme_lc_mutex);
		list_destroy(&nvme_lost_cmds);
		bd_mod_fini(&nvme_dev_ops);
		mutex_destroy(&nvme_open_minors_mutex);
		avl_destroy(&nvme_open_minors_avl);
		taskq_destroy(nvme_dead_taskq);
	}

	return (error);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&nvme_modlinkage, modinfop));
}

static inline void
nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
{
	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
}

static inline void
nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val)
{
	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val);
}

static inline uint64_t
nvme_get64(nvme_t *nvme, uintptr_t reg)
{
	uint64_t val;

	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg));

	return (val);
}

static inline uint32_t
nvme_get32(nvme_t *nvme, uintptr_t reg)
{
	uint32_t val;

	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg));

	return (val);
}

/*
 * This is a central clearing house for marking an NVMe controller dead and/or
 * removed. This takes care of setting the flag, taking care of outstanding
 * blocked locks, and sending a DDI FMA impact. This is called from a precarious
 * place where locking is suspect. The only guarantee we have is that the nvme_t
 * is valid and won't disappear until we return.
 *
 * This should only be used after attach has been called.
 */
static void
nvme_ctrl_mark_dead(nvme_t *nvme, boolean_t removed)
{
	boolean_t was_dead;

	/*
	 * See if we win the race to set things up here. If someone beat us to
	 * it, we do not do anything.
	 */
	was_dead = atomic_cas_32((volatile uint32_t *)&nvme->n_dead, B_FALSE,
	    B_TRUE);
	if (was_dead) {
		return;
	}

	/*
	 * If this was removed, there is no reason to change the service impact.
	 * However, then we need to change our default return code that we use
	 * here to indicate that it was gone versus that it is dead.
	 */
	if (removed) {
		nvme->n_dead_status = NVME_IOCTL_E_CTRL_GONE;
	} else {
		ASSERT3U(nvme->n_dead_status, ==, NVME_IOCTL_E_CTRL_DEAD);
		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
	}

	taskq_dispatch_ent(nvme_dead_taskq, nvme_rwlock_ctrl_dead, nvme,
	    TQ_NOSLEEP, &nvme->n_dead_tqent);
}

static boolean_t
nvme_check_regs_hdl(nvme_t *nvme)
{
	ddi_fm_error_t error;

	ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION);

	if (error.fme_status != DDI_FM_OK)
		return (B_TRUE);

	return (B_FALSE);
}

static boolean_t
nvme_check_dma_hdl(nvme_dma_t *dma)
{
	ddi_fm_error_t error;

	if (dma == NULL)
		return (B_FALSE);

	ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION);

	if (error.fme_status != DDI_FM_OK)
		return (B_TRUE);

	return (B_FALSE);
}

static void
nvme_free_dma_common(nvme_dma_t *dma)
{
	if (dma->nd_dmah != NULL)
		(void) ddi_dma_unbind_handle(dma->nd_dmah);
	if (dma->nd_acch != NULL)
		ddi_dma_mem_free(&dma->nd_acch);
	if (dma->nd_dmah != NULL)
		ddi_dma_free_handle(&dma->nd_dmah);
}

static void
nvme_free_dma(nvme_dma_t *dma)
{
	nvme_free_dma_common(dma);
	kmem_free(dma, sizeof (*dma));
}

/* ARGSUSED */
static void
nvme_prp_dma_destructor(void *buf, void *private)
{
	nvme_dma_t *dma = (nvme_dma_t *)buf;

	nvme_free_dma_common(dma);
}

static int
nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma,
    size_t len, uint_t flags, ddi_dma_attr_t *dma_attr)
{
	if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL,
	    &dma->nd_dmah) != DDI_SUCCESS) {
		/*
		 * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and
		 * the only other possible error is DDI_DMA_BADATTR which
		 * indicates a driver bug which should cause a panic.
		 */
		dev_err(nvme->n_dip, CE_PANIC,
		    "!failed to get DMA handle, check DMA attributes");
		return (DDI_FAILURE);
	}

	/*
	 * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified
	 * or the flags are conflicting, which isn't the case here.
	 */
	(void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr,
	    DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp,
	    &dma->nd_len, &dma->nd_acch);

	if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp,
	    dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
	    &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to bind DMA memory");
		atomic_inc_32(&nvme->n_dma_bind_err);
		nvme_free_dma_common(dma);
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

static int
nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags,
    ddi_dma_attr_t *dma_attr, nvme_dma_t **ret)
{
	nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP);

	if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) !=
	    DDI_SUCCESS) {
		*ret = NULL;
		kmem_free(dma, sizeof (nvme_dma_t));
		return (DDI_FAILURE);
	}

	bzero(dma->nd_memp, dma->nd_len);

	*ret = dma;
	return (DDI_SUCCESS);
}

/* ARGSUSED */
static int
nvme_prp_dma_constructor(void *buf, void *private, int flags)
{
	nvme_dma_t *dma = (nvme_dma_t *)buf;
	nvme_t *nvme = (nvme_t *)private;

	dma->nd_dmah = NULL;
	dma->nd_acch = NULL;

	if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize,
	    DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) {
		return (-1);
	}

	ASSERT(dma->nd_ncookie == 1);

	dma->nd_cached = B_TRUE;

	return (0);
}

static int
nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len,
    uint_t flags, nvme_dma_t **dma)
{
	uint32_t len = nentry * qe_len;
	ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr;

	len = roundup(len, nvme->n_pagesize);

	if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma)
	    != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to get DMA memory for queue");
		goto fail;
	}

	if ((*dma)->nd_ncookie != 1) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!got too many cookies for queue DMA");
		goto fail;
	}

	return (DDI_SUCCESS);

fail:
	if (*dma) {
		nvme_free_dma(*dma);
		*dma = NULL;
	}

	return (DDI_FAILURE);
}
static void
nvme_free_cq(nvme_cq_t *cq)
{
	mutex_destroy(&cq->ncq_mutex);

	if (cq->ncq_cmd_taskq != NULL)
		taskq_destroy(cq->ncq_cmd_taskq);

	if (cq->ncq_dma != NULL)
		nvme_free_dma(cq->ncq_dma);

	kmem_free(cq, sizeof (*cq));
}

static void
nvme_free_qpair(nvme_qpair_t *qp)
{
	int i;

	mutex_destroy(&qp->nq_mutex);
	sema_destroy(&qp->nq_sema);

	if (qp->nq_sqdma != NULL)
		nvme_free_dma(qp->nq_sqdma);

	if (qp->nq_active_cmds > 0)
		for (i = 0; i != qp->nq_nentry; i++)
			if (qp->nq_cmd[i] != NULL)
				nvme_free_cmd(qp->nq_cmd[i]);

	if (qp->nq_cmd != NULL)
		kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry);

	kmem_free(qp, sizeof (nvme_qpair_t));
}

/*
 * Destroy the pre-allocated cq array, but only free individual completion
 * queues from the given starting index.
 */
static void
nvme_destroy_cq_array(nvme_t *nvme, uint_t start)
{
	uint_t i;

	for (i = start; i < nvme->n_cq_count; i++)
		if (nvme->n_cq[i] != NULL)
			nvme_free_cq(nvme->n_cq[i]);

	kmem_free(nvme->n_cq, sizeof (*nvme->n_cq) * nvme->n_cq_count);
}

static int
nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx,
    uint_t nthr)
{
	nvme_cq_t *cq = kmem_zalloc(sizeof (*cq), KM_SLEEP);
	char name[64];		/* large enough for the taskq name */

	mutex_init(&cq->ncq_mutex, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(nvme->n_intr_pri));

	if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
	    DDI_DMA_READ, &cq->ncq_dma) != DDI_SUCCESS)
		goto fail;

	cq->ncq_cq = (nvme_cqe_t *)cq->ncq_dma->nd_memp;
	cq->ncq_nentry = nentry;
	cq->ncq_id = idx;
	cq->ncq_hdbl = NVME_REG_CQHDBL(nvme, idx);

	/*
	 * Each completion queue has its own command taskq.
	 */
	(void) snprintf(name, sizeof (name), "%s%d_cmd_taskq%u",
	    ddi_driver_name(nvme->n_dip), ddi_get_instance(nvme->n_dip), idx);

	cq->ncq_cmd_taskq = taskq_create(name, nthr, minclsyspri, 64, INT_MAX,
	    TASKQ_PREPOPULATE);

	if (cq->ncq_cmd_taskq == NULL) {
		dev_err(nvme->n_dip, CE_WARN, "!failed to create cmd "
		    "taskq for cq %u", idx);
		goto fail;
	}

	*cqp = cq;
	return (DDI_SUCCESS);

fail:
	nvme_free_cq(cq);
	*cqp = NULL;

	return (DDI_FAILURE);
}

/*
 * Create the n_cq array big enough to hold "ncq" completion queues.
 * If the array already exists it will be re-sized (but only larger).
 * The admin queue is included in this array, which boosts the
 * max number of entries to UINT16_MAX + 1.
 */
static int
nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry, uint_t nthr)
{
	nvme_cq_t **cq;
	uint_t i, cq_count;

	ASSERT3U(ncq, >, nvme->n_cq_count);

	cq = nvme->n_cq;
	cq_count = nvme->n_cq_count;

	nvme->n_cq = kmem_zalloc(sizeof (*nvme->n_cq) * ncq, KM_SLEEP);
	nvme->n_cq_count = ncq;

	for (i = 0; i < cq_count; i++)
		nvme->n_cq[i] = cq[i];

	for (; i < nvme->n_cq_count; i++)
		if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i, nthr) !=
		    DDI_SUCCESS)
			goto fail;

	if (cq != NULL)
		kmem_free(cq, sizeof (*cq) * cq_count);

	return (DDI_SUCCESS);

fail:
	nvme_destroy_cq_array(nvme, cq_count);
	/*
	 * Restore the original array
	 */
	nvme->n_cq_count = cq_count;
	nvme->n_cq = cq;

	return (DDI_FAILURE);
}

1428 static int
1429 nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp,
1430 uint_t idx)
1431 {
1432 nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);
1433 uint_t cq_idx;
1434
1435 mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
1436 DDI_INTR_PRI(nvme->n_intr_pri));
1437
1438 /*
1439 * The NVMe spec defines that a full queue has one empty (unused) slot;
1440 * initialize the semaphore accordingly.
1441 */
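/* E.g. with nentry == 1024 at most 1023 commands can be outstanding. */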
1442 sema_init(&qp->nq_sema, nentry - 1, NULL, SEMA_DRIVER, NULL);
1443
1444 if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
1445 DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
1446 goto fail;
1447
1448 /*
1449 * idx == 0 is adminq, those above 0 are shared io completion queues.
1450 */
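/*
 * E.g. with n_cq_count == 4 (the admin cq plus three shared io cqs),
 * qpairs 1, 2 and 3 map to cq 1, 2 and 3, and qpair 4 wraps around to
 * share cq 1 again.
 */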
1451 cq_idx = idx == 0 ? 0 : 1 + (idx - 1) % (nvme->n_cq_count - 1);
1452 qp->nq_cq = nvme->n_cq[cq_idx];
1453 qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp;
1454 qp->nq_nentry = nentry;
1455
1456 qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx);
1457
1458 qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP);
1459 qp->nq_next_cmd = 0;
1460
1461 *nqp = qp;
1462 return (DDI_SUCCESS);
1463
1464 fail:
1465 nvme_free_qpair(qp);
1466 *nqp = NULL;
1467
1468 return (DDI_FAILURE);
1469 }
1470
1471 static nvme_cmd_t *
1472 nvme_alloc_cmd(nvme_t *nvme, int kmflag)
1473 {
1474 nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag);
1475
1476 if (cmd == NULL)
1477 return (cmd);
1478
1479 bzero(cmd, sizeof (nvme_cmd_t));
1480
1481 cmd->nc_nvme = nvme;
1482
1483 mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER,
1484 DDI_INTR_PRI(nvme->n_intr_pri));
1485 cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL);
1486
1487 return (cmd);
1488 }
1489
1490 static void
1491 nvme_free_cmd(nvme_cmd_t *cmd)
1492 {
1493 /* Don't free commands on the lost commands list. */
1494 if (list_link_active(&cmd->nc_list))
1495 return;
1496
1497 if (cmd->nc_dma) {
1498 nvme_free_dma(cmd->nc_dma);
1499 cmd->nc_dma = NULL;
1500 }
1501
1502 if (cmd->nc_prp) {
1503 kmem_cache_free(cmd->nc_nvme->n_prp_cache, cmd->nc_prp);
1504 cmd->nc_prp = NULL;
1505 }
1506
1507 cv_destroy(&cmd->nc_cv);
1508 mutex_destroy(&cmd->nc_mutex);
1509
1510 kmem_cache_free(nvme_cmd_cache, cmd);
1511 }
1512
1513 static void
1514 nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
1515 {
1516 sema_p(&qp->nq_sema);
1517 nvme_submit_cmd_common(qp, cmd);
1518 }
1519
1520 static int
1521 nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
1522 {
1523 if (cmd->nc_nvme->n_dead) {
1524 return (EIO);
1525 }
1526
1527 if (sema_tryp(&qp->nq_sema) == 0)
1528 return (EAGAIN);
1529
1530 nvme_submit_cmd_common(qp, cmd);
1531 return (0);
1532 }
1533
1534 static void
1535 nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd)
1536 {
1537 nvme_reg_sqtdbl_t tail = { 0 };
1538
1539 mutex_enter(&qp->nq_mutex);
1540 cmd->nc_completed = B_FALSE;
1541
1542 /*
1543 * Now that we hold the queue pair lock, we must check whether or not
1544 * the controller has been listed as dead (e.g. was removed due to
1545 * hotplug). This is necessary as otherwise we could race with
1546 * nvme_remove_callback(). Because this has not been enqueued, we don't
1547 * call nvme_unqueue_cmd(), which is why we must manually release the
1548 * semaphore again.
1549 */
1550 if (cmd->nc_nvme->n_dead) {
1551 taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq, cmd->nc_callback,
1552 cmd, TQ_NOSLEEP, &cmd->nc_tqent);
1553 sema_v(&qp->nq_sema);
1554 mutex_exit(&qp->nq_mutex);
1555 return;
1556 }
1557
1558 /*
1559 * Try to insert the cmd into the active cmd array at the nq_next_cmd
1560 * slot. If the slot is already occupied, advance to the next slot and
1561 * try again. This can happen for long-running commands like async event
1562 * requests.
1563 */
1564 while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
1565 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
1566 qp->nq_cmd[qp->nq_next_cmd] = cmd;
1567
1568 qp->nq_active_cmds++;
1569
1570 cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
1571 bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
1572 (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
1573 sizeof (nvme_sqe_t) * qp->nq_sqtail,
1574 sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
1575 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
1576
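/*
 * Advance the submission queue tail and ring its doorbell to hand the
 * new entry to the controller.
 */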
1577 tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
1578 nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
1579
1580 mutex_exit(&qp->nq_mutex);
1581 }
1582
1583 static nvme_cmd_t *
1584 nvme_unqueue_cmd(nvme_t *nvme, nvme_qpair_t *qp, int cid)
1585 {
1586 nvme_cmd_t *cmd;
1587
1588 ASSERT(mutex_owned(&qp->nq_mutex));
1589 ASSERT3S(cid, <, qp->nq_nentry);
1590
1591 cmd = qp->nq_cmd[cid];
1592 /*
1593 * Some controllers will erroneously add things to the completion queue
1594 * for which there is no matching outstanding command. If this happens,
1595 * it is almost certainly a controller firmware bug since nq_mutex
1596 * is held across command submission and ringing the queue doorbell,
1597 * and is also held in this function.
1598 *
1599 * If we see such an unexpected command, there is not much we can do.
1600 * These will be logged and counted in nvme_get_completed(), but
1601 * otherwise ignored.
1602 */
1603 if (cmd == NULL)
1604 return (NULL);
1605 qp->nq_cmd[cid] = NULL;
1606 ASSERT3U(qp->nq_active_cmds, >, 0);
1607 qp->nq_active_cmds--;
1608 sema_v(&qp->nq_sema);
1609
1610 ASSERT3P(cmd, !=, NULL);
1611 ASSERT3P(cmd->nc_nvme, ==, nvme);
1612 ASSERT3S(cmd->nc_sqe.sqe_cid, ==, cid);
1613
1614 return (cmd);
1615 }
1616
1617 /*
1618 * Get the command tied to the next completed cqe and advance the
1619 * completion queue head counter.
1620 */
1621 static nvme_cmd_t *
1622 nvme_get_completed(nvme_t *nvme, nvme_cq_t *cq)
1623 {
1624 nvme_qpair_t *qp;
1625 nvme_cqe_t *cqe;
1626 nvme_cmd_t *cmd;
1627
1628 ASSERT(mutex_owned(&cq->ncq_mutex));
1629
1630 retry:
1631 cqe = &cq->ncq_cq[cq->ncq_head];
1632
1633 /* Check phase tag of CQE. Hardware inverts it for new entries. */
1634 if (cqe->cqe_sf.sf_p == cq->ncq_phase)
1635 return (NULL);
1636
1637 qp = nvme->n_ioq[cqe->cqe_sqid];
1638
1639 mutex_enter(&qp->nq_mutex);
1640 cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid);
1641 mutex_exit(&qp->nq_mutex);
1642
1643 qp->nq_sqhead = cqe->cqe_sqhd;
1644 cq->ncq_head = (cq->ncq_head + 1) % cq->ncq_nentry;
1645
1646 /* Toggle phase on wrap-around. */
1647 if (cq->ncq_head == 0)
1648 cq->ncq_phase = cq->ncq_phase != 0 ? 0 : 1;
1649
1650 if (cmd == NULL) {
1651 dev_err(nvme->n_dip, CE_WARN,
1652 "!received completion for unknown cid 0x%x", cqe->cqe_cid);
1653 atomic_inc_32(&nvme->n_unknown_cid);
1654 /*
1655 * We want to ignore this unexpected completion entry as it
1656 * is most likely a result of a bug in the controller firmware.
1657 * However, if we return NULL, then callers will assume there
1658 * are no more pending commands for this wakeup. Retry to keep
1659 * enumerating commands until the phase tag indicates there are
1660 * no more and we are really done.
1661 */
1662 goto retry;
1663 }
1664
1665 ASSERT3U(cmd->nc_sqid, ==, cqe->cqe_sqid);
1666 bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
1667
1668 return (cmd);
1669 }
1670
1671 /*
1672 * Process all completed commands on the io completion queue.
1673 */
1674 static uint_t
1675 nvme_process_iocq(nvme_t *nvme, nvme_cq_t *cq)
1676 {
1677 nvme_reg_cqhdbl_t head = { 0 };
1678 nvme_cmd_t *cmd;
1679 uint_t completed = 0;
1680
1681 if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) !=
1682 DDI_SUCCESS)
1683 dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s",
1684 __func__);
1685
1686 mutex_enter(&cq->ncq_mutex);
1687
1688 while ((cmd = nvme_get_completed(nvme, cq)) != NULL) {
1689 taskq_dispatch_ent(cq->ncq_cmd_taskq, cmd->nc_callback, cmd,
1690 TQ_NOSLEEP, &cmd->nc_tqent);
1691
1692 completed++;
1693 }
1694
1695 if (completed > 0) {
1696 /*
1697 * Update the completion queue head doorbell.
1698 */
1699 head.b.cqhdbl_cqh = cq->ncq_head;
1700 nvme_put32(nvme, cq->ncq_hdbl, head.r);
1701 }
1702
1703 mutex_exit(&cq->ncq_mutex);
1704
1705 return (completed);
1706 }
1707
1708 static nvme_cmd_t *
1709 nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
1710 {
1711 nvme_cq_t *cq = qp->nq_cq;
1712 nvme_reg_cqhdbl_t head = { 0 };
1713 nvme_cmd_t *cmd;
1714
1715 if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) !=
1716 DDI_SUCCESS)
1717 dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s",
1718 __func__);
1719
1720 mutex_enter(&cq->ncq_mutex);
1721
1722 if ((cmd = nvme_get_completed(nvme, cq)) != NULL) {
1723 head.b.cqhdbl_cqh = cq->ncq_head;
1724 nvme_put32(nvme, cq->ncq_hdbl, head.r);
1725 }
1726
1727 mutex_exit(&cq->ncq_mutex);
1728
1729 return (cmd);
1730 }
1731
1732 static int
1733 nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
1734 {
1735 nvme_cqe_t *cqe = &cmd->nc_cqe;
1736
1737 dev_err(cmd->nc_nvme->n_dip, CE_WARN,
1738 "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
1739 "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
1740 cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
1741 cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
1742
1743 if (cmd->nc_xfer != NULL)
1744 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1745
1746 if (cmd->nc_nvme->n_strict_version) {
1747 nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
1748 }
1749
1750 return (EIO);
1751 }
1752
1753 static int
1754 nvme_check_vendor_cmd_status(nvme_cmd_t *cmd)
1755 {
1756 nvme_cqe_t *cqe = &cmd->nc_cqe;
1757
1758 dev_err(cmd->nc_nvme->n_dip, CE_WARN,
1759 "!vendor specific command status received: opc = %x, sqid = %d, "
1760 "cid = %d, sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
1761 cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
1762 cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
1763 if (!cmd->nc_nvme->n_ignore_unknown_vendor_status) {
1764 nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
1765 }
1766
1767 return (EIO);
1768 }
1769
1770 static int
1771 nvme_check_integrity_cmd_status(nvme_cmd_t *cmd)
1772 {
1773 nvme_cqe_t *cqe = &cmd->nc_cqe;
1774
1775 switch (cqe->cqe_sf.sf_sc) {
1776 case NVME_CQE_SC_INT_NVM_WRITE:
1777 /* write fail */
1778 /* TODO: post ereport */
1779 if (cmd->nc_xfer != NULL)
1780 bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
1781 return (EIO);
1782
1783 case NVME_CQE_SC_INT_NVM_READ:
1784 /* read fail */
1785 /* TODO: post ereport */
1786 if (cmd->nc_xfer != NULL)
1787 bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
1788 return (EIO);
1789
1790 default:
1791 return (nvme_check_unknown_cmd_status(cmd));
1792 }
1793 }
1794
1795 static int
1796 nvme_check_generic_cmd_status(nvme_cmd_t *cmd)
1797 {
1798 nvme_cqe_t *cqe = &cmd->nc_cqe;
1799
1800 switch (cqe->cqe_sf.sf_sc) {
1801 case NVME_CQE_SC_GEN_SUCCESS:
1802 return (0);
1803
1804 /*
1805 * Errors indicating a bug in the driver should cause a panic.
1806 */
1807 case NVME_CQE_SC_GEN_INV_OPC:
1808 /* Invalid Command Opcode */
1809 if (!cmd->nc_dontpanic)
1810 dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1811 "programming error: invalid opcode in cmd %p",
1812 (void *)cmd);
1813 return (EINVAL);
1814
1815 case NVME_CQE_SC_GEN_INV_FLD:
1816 /* Invalid Field in Command */
1817 if (!cmd->nc_dontpanic)
1818 dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1819 "programming error: invalid field in cmd %p",
1820 (void *)cmd);
1821 return (EIO);
1822
1823 case NVME_CQE_SC_GEN_ID_CNFL:
1824 /* Command ID Conflict */
1825 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1826 "cmd ID conflict in cmd %p", (void *)cmd);
1827 return (0);
1828
1829 case NVME_CQE_SC_GEN_INV_NS:
1830 /* Invalid Namespace or Format */
1831 if (!cmd->nc_dontpanic)
1832 dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1833 "programming error: invalid NS/format in cmd %p",
1834 (void *)cmd);
1835 return (EINVAL);
1836
1837 case NVME_CQE_SC_GEN_NVM_LBA_RANGE:
1838 /* LBA Out Of Range */
1839 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1840 "LBA out of range in cmd %p", (void *)cmd);
1841 return (0);
1842
1843 /*
1844 * Non-fatal errors, handle gracefully.
1845 */
1846 case NVME_CQE_SC_GEN_DATA_XFR_ERR:
1847 /* Data Transfer Error (DMA) */
1848 /* TODO: post ereport */
1849 atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err);
1850 if (cmd->nc_xfer != NULL)
1851 bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1852 return (EIO);
1853
1854 case NVME_CQE_SC_GEN_INTERNAL_ERR:
1855 /*
1856 * Internal Error. The spec (v1.0, section 4.5.1.2) says
1857 * detailed error information is returned as an async event,
1858 * so we pretty much ignore the error here and handle it
1859 * in the async event handler.
1860 */
1861 atomic_inc_32(&cmd->nc_nvme->n_internal_err);
1862 if (cmd->nc_xfer != NULL)
1863 bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1864 return (EIO);
1865
1866 case NVME_CQE_SC_GEN_ABORT_REQUEST:
1867 /*
1868 * Command Abort Requested. This normally happens only when a
1869 * command times out.
1870 */
1871 /* TODO: post ereport or change blkdev to handle this? */
1872 atomic_inc_32(&cmd->nc_nvme->n_abort_rq_err);
1873 return (ECANCELED);
1874
1875 case NVME_CQE_SC_GEN_ABORT_PWRLOSS:
1876 /* Command Aborted due to Power Loss Notification */
1877 nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
1878 return (EIO);
1879
1880 case NVME_CQE_SC_GEN_ABORT_SQ_DEL:
1881 /* Command Aborted due to SQ Deletion */
1882 atomic_inc_32(&cmd->nc_nvme->n_abort_sq_del);
1883 return (EIO);
1884
1885 case NVME_CQE_SC_GEN_NVM_CAP_EXC:
1886 /* Capacity Exceeded */
1887 atomic_inc_32(&cmd->nc_nvme->n_nvm_cap_exc);
1888 if (cmd->nc_xfer != NULL)
1889 bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
1890 return (EIO);
1891
1892 case NVME_CQE_SC_GEN_NVM_NS_NOTRDY:
1893 /* Namespace Not Ready */
1894 atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_notrdy);
1895 if (cmd->nc_xfer != NULL)
1896 bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1897 return (EIO);
1898
1899 case NVME_CQE_SC_GEN_NVM_FORMATTING:
1900 /* Format in progress (1.2) */
1901 if (!NVME_VERSION_ATLEAST(&cmd->nc_nvme->n_version, 1, 2))
1902 return (nvme_check_unknown_cmd_status(cmd));
1903 atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_formatting);
1904 if (cmd->nc_xfer != NULL)
1905 bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1906 return (EIO);
1907
1908 default:
1909 return (nvme_check_unknown_cmd_status(cmd));
1910 }
1911 }
1912
1913 static int
1914 nvme_check_specific_cmd_status(nvme_cmd_t *cmd)
1915 {
1916 nvme_cqe_t *cqe = &cmd->nc_cqe;
1917
1918 switch (cqe->cqe_sf.sf_sc) {
1919 case NVME_CQE_SC_SPC_INV_CQ:
1920 /* Completion Queue Invalid */
1921 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE);
1922 atomic_inc_32(&cmd->nc_nvme->n_inv_cq_err);
1923 return (EINVAL);
1924
1925 case NVME_CQE_SC_SPC_INV_QID:
1926 /* Invalid Queue Identifier */
1927 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
1928 cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_SQUEUE ||
1929 cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE ||
1930 cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
1931 atomic_inc_32(&cmd->nc_nvme->n_inv_qid_err);
1932 return (EINVAL);
1933
1934 case NVME_CQE_SC_SPC_MAX_QSZ_EXC:
1935 /* Max Queue Size Exceeded */
1936 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
1937 cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
1938 atomic_inc_32(&cmd->nc_nvme->n_max_qsz_exc);
1939 return (EINVAL);
1940
1941 case NVME_CQE_SC_SPC_ABRT_CMD_EXC:
1942 /* Abort Command Limit Exceeded */
1943 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT);
1944 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1945 "abort command limit exceeded in cmd %p", (void *)cmd);
1946 return (0);
1947
1948 case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC:
1949 /* Async Event Request Limit Exceeded */
1950 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ASYNC_EVENT);
1951 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1952 "async event request limit exceeded in cmd %p",
1953 (void *)cmd);
1954 return (0);
1955
1956 case NVME_CQE_SC_SPC_INV_INT_VECT:
1957 /* Invalid Interrupt Vector */
1958 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
1959 atomic_inc_32(&cmd->nc_nvme->n_inv_int_vect);
1960 return (EINVAL);
1961
1962 case NVME_CQE_SC_SPC_INV_LOG_PAGE:
1963 /* Invalid Log Page */
1964 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE);
1965 atomic_inc_32(&cmd->nc_nvme->n_inv_log_page);
1966 return (EINVAL);
1967
1968 case NVME_CQE_SC_SPC_INV_FORMAT:
1969 /* Invalid Format */
1970 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT);
1971 atomic_inc_32(&cmd->nc_nvme->n_inv_format);
1972 if (cmd->nc_xfer != NULL)
1973 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1974 return (EINVAL);
1975
1976 case NVME_CQE_SC_SPC_INV_Q_DEL:
1977 /* Invalid Queue Deletion */
1978 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
1979 atomic_inc_32(&cmd->nc_nvme->n_inv_q_del);
1980 return (EINVAL);
1981
1982 case NVME_CQE_SC_SPC_NVM_CNFL_ATTR:
1983 /* Conflicting Attributes */
1984 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_DSET_MGMT ||
1985 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
1986 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
1987 atomic_inc_32(&cmd->nc_nvme->n_cnfl_attr);
1988 if (cmd->nc_xfer != NULL)
1989 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1990 return (EINVAL);
1991
1992 case NVME_CQE_SC_SPC_NVM_INV_PROT:
1993 /* Invalid Protection Information */
1994 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_COMPARE ||
1995 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
1996 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
1997 atomic_inc_32(&cmd->nc_nvme->n_inv_prot);
1998 if (cmd->nc_xfer != NULL)
1999 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
2000 return (EINVAL);
2001
2002 case NVME_CQE_SC_SPC_NVM_READONLY:
2003 /* Write to Read Only Range */
2004 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
2005 atomic_inc_32(&cmd->nc_nvme->n_readonly);
2006 if (cmd->nc_xfer != NULL)
2007 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
2008 return (EROFS);
2009
2010 case NVME_CQE_SC_SPC_INV_FW_SLOT:
2011 /* Invalid Firmware Slot */
2012 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2013 return (EINVAL);
2014
2015 case NVME_CQE_SC_SPC_INV_FW_IMG:
2016 /* Invalid Firmware Image */
2017 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2018 return (EINVAL);
2019
2020 case NVME_CQE_SC_SPC_FW_RESET:
2021 /* Conventional Reset Required */
2022 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2023 return (0);
2024
2025 case NVME_CQE_SC_SPC_FW_NSSR:
2026 /* NVMe Subsystem Reset Required */
2027 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2028 return (0);
2029
2030 case NVME_CQE_SC_SPC_FW_NEXT_RESET:
2031 /* Activation Requires Reset */
2032 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2033 return (0);
2034
2035 case NVME_CQE_SC_SPC_FW_MTFA:
2036 /* Activation Requires Maximum Time Violation */
2037 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2038 return (EAGAIN);
2039
2040 case NVME_CQE_SC_SPC_FW_PROHIBITED:
2041 /* Activation Prohibited */
2042 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2043 return (EINVAL);
2044
2045 case NVME_CQE_SC_SPC_FW_OVERLAP:
2046 /* Overlapping Firmware Ranges */
2047 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_IMAGE_LOAD);
2048 return (EINVAL);
2049
2050 default:
2051 return (nvme_check_unknown_cmd_status(cmd));
2052 }
2053 }
2054
2055 static inline int
2056 nvme_check_cmd_status(nvme_cmd_t *cmd)
2057 {
2058 nvme_cqe_t *cqe = &cmd->nc_cqe;
2059
2060 /*
2061 * Take a shortcut if the controller is dead, or if
2062 * command status indicates no error.
2063 */
2064 if (cmd->nc_nvme->n_dead)
2065 return (EIO);
2066
2067 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
2068 cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
2069 return (0);
2070
2071 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC)
2072 return (nvme_check_generic_cmd_status(cmd));
2073 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC)
2074 return (nvme_check_specific_cmd_status(cmd));
2075 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY)
2076 return (nvme_check_integrity_cmd_status(cmd));
2077 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR)
2078 return (nvme_check_vendor_cmd_status(cmd));
2079
2080 return (nvme_check_unknown_cmd_status(cmd));
2081 }
2082
2083 /*
2084 * Check the command status as used by an ioctl path and do not convert it to an
2085 * errno. We still allow all the command status checking to occur, but otherwise
2086 * will pass back the controller error as is.
2087 */
2088 static boolean_t
2089 nvme_check_cmd_status_ioctl(nvme_cmd_t *cmd, nvme_ioctl_common_t *ioc)
2090 {
2091 nvme_cqe_t *cqe = &cmd->nc_cqe;
2092 nvme_t *nvme = cmd->nc_nvme;
2093
2094 if (nvme->n_dead) {
2095 return (nvme_ioctl_error(ioc, nvme->n_dead_status, 0, 0));
2096 }
2097
2098 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
2099 cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
2100 return (B_TRUE);
2101
2102 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC) {
2103 (void) nvme_check_generic_cmd_status(cmd);
2104 } else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC) {
2105 (void) nvme_check_specific_cmd_status(cmd);
2106 } else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY) {
2107 (void) nvme_check_integrity_cmd_status(cmd);
2108 } else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR) {
2109 (void) nvme_check_vendor_cmd_status(cmd);
2110 } else {
2111 (void) nvme_check_unknown_cmd_status(cmd);
2112 }
2113
2114 return (nvme_ioctl_error(ioc, NVME_IOCTL_E_CTRL_ERROR,
2115 cqe->cqe_sf.sf_sct, cqe->cqe_sf.sf_sc));
2116 }
2117
2118 static int
2119 nvme_abort_cmd(nvme_cmd_t *abort_cmd, uint_t sec)
2120 {
2121 nvme_t *nvme = abort_cmd->nc_nvme;
2122 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2123 nvme_abort_cmd_t ac = { 0 };
2124 int ret = 0;
2125
2126 sema_p(&nvme->n_abort_sema);
2127
2128 ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid;
2129 ac.b.ac_sqid = abort_cmd->nc_sqid;
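/* The command to abort is identified by its CID and SQID, passed in cdw10. */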
2130
2131 cmd->nc_sqid = 0;
2132 cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT;
2133 cmd->nc_callback = nvme_wakeup_cmd;
2134 cmd->nc_sqe.sqe_cdw10 = ac.r;
2135
2136 /*
2137 * Send the ABORT to the hardware. The ABORT command will return _after_
2138 * the aborted command has completed (aborted or otherwise), but since
2139 * we still hold the aborted command's mutex its callback hasn't been
2140 * processed yet.
2141 */
2142 nvme_admin_cmd(cmd, sec);
2143 sema_v(&nvme->n_abort_sema);
2144
2145 if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2146 dev_err(nvme->n_dip, CE_WARN,
2147 "!ABORT failed with sct = %x, sc = %x",
2148 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2149 atomic_inc_32(&nvme->n_abort_failed);
2150 } else {
2151 dev_err(nvme->n_dip, CE_WARN,
2152 "!ABORT of command %d/%d %ssuccessful",
2153 abort_cmd->nc_sqe.sqe_cid, abort_cmd->nc_sqid,
2154 cmd->nc_cqe.cqe_dw0 & 1 ? "un" : "");
2155 if ((cmd->nc_cqe.cqe_dw0 & 1) == 0)
2156 atomic_inc_32(&nvme->n_cmd_aborted);
2157 }
2158
2159 nvme_free_cmd(cmd);
2160 return (ret);
2161 }
2162
2163 /*
2164 * nvme_wait_cmd -- wait for command completion or timeout
2165 *
2166 * In case of a serious error or a timeout of the abort command the hardware
2167 * will be declared dead and FMA will be notified.
2168 */
2169 static void
2170 nvme_wait_cmd(nvme_cmd_t *cmd, uint32_t sec)
2171 {
2172 clock_t timeout = ddi_get_lbolt() + drv_usectohz((long)sec * MICROSEC);
2173 nvme_t *nvme = cmd->nc_nvme;
2174 nvme_reg_csts_t csts;
2175 nvme_qpair_t *qp;
2176
2177 ASSERT(mutex_owned(&cmd->nc_mutex));
2178
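/* cv_timedwait() returns -1 once the timeout has expired without a signal. */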
2179 while (!cmd->nc_completed) {
2180 if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1)
2181 break;
2182 }
2183
2184 if (cmd->nc_completed)
2185 return;
2186
2187 /*
2188 * The command timed out.
2189 *
2190 * Check controller for fatal status, any errors associated with the
2191 * register or DMA handle, or for a double timeout (abort command timed
2192 * out). If necessary log a warning and call FMA.
2193 */
2194 csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2195 dev_err(nvme->n_dip, CE_WARN, "!command %d/%d timeout, "
2196 "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_cid, cmd->nc_sqid,
2197 cmd->nc_sqe.sqe_opc, csts.b.csts_cfs);
2198 atomic_inc_32(&nvme->n_cmd_timeout);
2199
2200 if (csts.b.csts_cfs ||
2201 nvme_check_regs_hdl(nvme) ||
2202 nvme_check_dma_hdl(cmd->nc_dma) ||
2203 cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) {
2204 nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
2205 } else if (nvme_abort_cmd(cmd, sec) == 0) {
2206 /*
2207 * If the abort succeeded the command should complete
2208 * immediately with an appropriate status.
2209 */
2210 while (!cmd->nc_completed)
2211 cv_wait(&cmd->nc_cv, &cmd->nc_mutex);
2212
2213 return;
2214 }
2215
2216 qp = nvme->n_ioq[cmd->nc_sqid];
2217
2218 mutex_enter(&qp->nq_mutex);
2219 (void) nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid);
2220 mutex_exit(&qp->nq_mutex);
2221
2222 /*
2223 * As we don't know what the presumed dead hardware might still do with
2224 * the DMA memory, we'll put the command on the lost commands list if it
2225 * has any DMA memory.
2226 */
2227 if (cmd->nc_dma != NULL) {
2228 mutex_enter(&nvme_lc_mutex);
2229 list_insert_head(&nvme_lost_cmds, cmd);
2230 mutex_exit(&nvme_lc_mutex);
2231 }
2232 }
2233
2234 static void
2235 nvme_wakeup_cmd(void *arg)
2236 {
2237 nvme_cmd_t *cmd = arg;
2238
2239 mutex_enter(&cmd->nc_mutex);
2240 cmd->nc_completed = B_TRUE;
2241 cv_signal(&cmd->nc_cv);
2242 mutex_exit(&cmd->nc_mutex);
2243 }
2244
2245 static void
2246 nvme_async_event_task(void *arg)
2247 {
2248 nvme_cmd_t *cmd = arg;
2249 nvme_t *nvme = cmd->nc_nvme;
2250 nvme_error_log_entry_t *error_log = NULL;
2251 nvme_health_log_t *health_log = NULL;
2252 nvme_nschange_list_t *nslist = NULL;
2253 size_t logsize = 0;
2254 nvme_async_event_t event;
2255
2256 /*
2257 * Check for errors associated with the async request itself. The only
2258 * command-specific error is "async event limit exceeded", which
2259 * indicates a programming error in the driver and causes a panic in
2260 * nvme_check_cmd_status().
2261 *
2262 * Other possible errors are various scenarios where the async request
2263 * was aborted, or internal errors in the device. Internal errors are
2264 * reported to FMA, the command aborts need no special handling here.
2265 *
2266 * And finally, at least qemu's NVMe emulation does not support async
2267 * events, and will return NVME_CQE_SC_GEN_INV_OPC | DNR. If we see
2268 * that, we stop posting async event requests.
2269 */
2270
2271 if (nvme_check_cmd_status(cmd) != 0) {
2272 dev_err(cmd->nc_nvme->n_dip, CE_WARN,
2273 "!async event request returned failure, sct = 0x%x, "
2274 "sc = 0x%x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct,
2275 cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr,
2276 cmd->nc_cqe.cqe_sf.sf_m);
2277
2278 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
2279 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) {
2280 nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
2281 }
2282
2283 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
2284 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_OPC &&
2285 cmd->nc_cqe.cqe_sf.sf_dnr == 1) {
2286 nvme->n_async_event_supported = B_FALSE;
2287 }
2288
2289 nvme_free_cmd(cmd);
2290 return;
2291 }
2292
2293 event.r = cmd->nc_cqe.cqe_dw0;
2294
2295 /* Clear CQE and re-submit the async request. */
2296 bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
2297 nvme_submit_admin_cmd(nvme->n_adminq, cmd);
2298 cmd = NULL; /* cmd can no longer be used after resubmission */
2299
2300 switch (event.b.ae_type) {
2301 case NVME_ASYNC_TYPE_ERROR:
2302 if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) {
2303 if (!nvme_get_logpage_int(nvme, B_FALSE,
2304 (void **)&error_log, &logsize,
2305 NVME_LOGPAGE_ERROR)) {
2306 return;
2307 }
2308 } else {
2309 dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
2310 "async event reply: type=0x%x logpage=0x%x",
2311 event.b.ae_type, event.b.ae_logpage);
2312 atomic_inc_32(&nvme->n_wrong_logpage);
2313 return;
2314 }
2315
2316 switch (event.b.ae_info) {
2317 case NVME_ASYNC_ERROR_INV_SQ:
2318 dev_err(nvme->n_dip, CE_PANIC, "programming error: "
2319 "invalid submission queue");
2320 return;
2321
2322 case NVME_ASYNC_ERROR_INV_DBL:
2323 dev_err(nvme->n_dip, CE_PANIC, "programming error: "
2324 "invalid doorbell write value");
2325 return;
2326
2327 case NVME_ASYNC_ERROR_DIAGFAIL:
2328 dev_err(nvme->n_dip, CE_WARN, "!diagnostic failure");
2329 nvme_ctrl_mark_dead(nvme, B_FALSE);
2330 atomic_inc_32(&nvme->n_diagfail_event);
2331 break;
2332
2333 case NVME_ASYNC_ERROR_PERSISTENT:
2334 dev_err(nvme->n_dip, CE_WARN, "!persistent internal "
2335 "device error");
2336 nvme_ctrl_mark_dead(nvme, B_FALSE);
2337 atomic_inc_32(&nvme->n_persistent_event);
2338 break;
2339
2340 case NVME_ASYNC_ERROR_TRANSIENT:
2341 dev_err(nvme->n_dip, CE_WARN, "!transient internal "
2342 "device error");
2343 /* TODO: send ereport */
2344 atomic_inc_32(&nvme->n_transient_event);
2345 break;
2346
2347 case NVME_ASYNC_ERROR_FW_LOAD:
2348 dev_err(nvme->n_dip, CE_WARN,
2349 "!firmware image load error");
2350 atomic_inc_32(&nvme->n_fw_load_event);
2351 break;
2352 }
2353 break;
2354
2355 case NVME_ASYNC_TYPE_HEALTH:
2356 if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) {
2357 if (!nvme_get_logpage_int(nvme, B_FALSE,
2358 (void **)&health_log, &logsize,
2359 NVME_LOGPAGE_HEALTH)) {
2360 return;
2361 }
2362 } else {
2363 dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
2364 "async event reply: type=0x%x logpage=0x%x",
2365 event.b.ae_type, event.b.ae_logpage);
2366 atomic_inc_32(&nvme->n_wrong_logpage);
2367 return;
2368 }
2369
2370 switch (event.b.ae_info) {
2371 case NVME_ASYNC_HEALTH_RELIABILITY:
2372 dev_err(nvme->n_dip, CE_WARN,
2373 "!device reliability compromised");
2374 /* TODO: send ereport */
2375 atomic_inc_32(&nvme->n_reliability_event);
2376 break;
2377
2378 case NVME_ASYNC_HEALTH_TEMPERATURE:
2379 dev_err(nvme->n_dip, CE_WARN,
2380 "!temperature above threshold");
2381 /* TODO: send ereport */
2382 atomic_inc_32(&nvme->n_temperature_event);
2383 break;
2384
2385 case NVME_ASYNC_HEALTH_SPARE:
2386 dev_err(nvme->n_dip, CE_WARN,
2387 "!spare space below threshold");
2388 /* TODO: send ereport */
2389 atomic_inc_32(&nvme->n_spare_event);
2390 break;
2391 }
2392 break;
2393
2394 case NVME_ASYNC_TYPE_NOTICE:
2395 switch (event.b.ae_info) {
2396 case NVME_ASYNC_NOTICE_NS_CHANGE:
2397 if (event.b.ae_logpage != NVME_LOGPAGE_NSCHANGE) {
2398 dev_err(nvme->n_dip, CE_WARN,
2399 "!wrong logpage in async event reply: "
2400 "type=0x%x logpage=0x%x",
2401 event.b.ae_type, event.b.ae_logpage);
2402 atomic_inc_32(&nvme->n_wrong_logpage);
2403 break;
2404 }
2405
2406 dev_err(nvme->n_dip, CE_NOTE,
2407 "namespace attribute change event, "
2408 "logpage = 0x%x", event.b.ae_logpage);
2409 atomic_inc_32(&nvme->n_notice_event);
2410
2411 if (!nvme_get_logpage_int(nvme, B_FALSE,
2412 (void **)&nslist, &logsize,
2413 NVME_LOGPAGE_NSCHANGE)) {
2414 break;
2415 }
2416
2417 if (nslist->nscl_ns[0] == UINT32_MAX) {
2418 dev_err(nvme->n_dip, CE_CONT,
2419 "more than %u namespaces have changed.\n",
2420 NVME_NSCHANGE_LIST_SIZE);
2421 break;
2422 }
2423
2424 mutex_enter(&nvme->n_mgmt_mutex);
2425 for (uint_t i = 0; i < NVME_NSCHANGE_LIST_SIZE; i++) {
2426 uint32_t nsid = nslist->nscl_ns[i];
2427
2428 if (nsid == 0) /* end of list */
2429 break;
2430
2431 dev_err(nvme->n_dip, CE_NOTE,
2432 "!namespace nvme%d/%u has changed.",
2433 ddi_get_instance(nvme->n_dip), nsid);
2434
2435
2436 if (nvme_init_ns(nvme, nsid) != DDI_SUCCESS)
2437 continue;
2438
2439 bd_state_change(nvme_nsid2ns(nvme,
2440 nsid)->ns_bd_hdl);
2441 }
2442 mutex_exit(&nvme->n_mgmt_mutex);
2443
2444 break;
2445
2446 case NVME_ASYNC_NOTICE_FW_ACTIVATE:
2447 dev_err(nvme->n_dip, CE_NOTE,
2448 "firmware activation starting, "
2449 "logpage = 0x%x", event.b.ae_logpage);
2450 atomic_inc_32(&nvme->n_notice_event);
2451 break;
2452
2453 case NVME_ASYNC_NOTICE_TELEMETRY:
2454 dev_err(nvme->n_dip, CE_NOTE,
2455 "telemetry log changed, "
2456 "logpage = 0x%x", event.b.ae_logpage);
2457 atomic_inc_32(&nvme->n_notice_event);
2458 break;
2459
2460 case NVME_ASYNC_NOTICE_NS_ASYMM:
2461 dev_err(nvme->n_dip, CE_NOTE,
2462 "asymmetric namespace access change, "
2463 "logpage = 0x%x", event.b.ae_logpage);
2464 atomic_inc_32(&nvme->n_notice_event);
2465 break;
2466
2467 case NVME_ASYNC_NOTICE_LATENCYLOG:
2468 dev_err(nvme->n_dip, CE_NOTE,
2469 "predictable latency event aggregate log change, "
2470 "logpage = 0x%x", event.b.ae_logpage);
2471 atomic_inc_32(&nvme->n_notice_event);
2472 break;
2473
2474 case NVME_ASYNC_NOTICE_LBASTATUS:
2475 dev_err(nvme->n_dip, CE_NOTE,
2476 "LBA status information alert, "
2477 "logpage = 0x%x", event.b.ae_logpage);
2478 atomic_inc_32(&nvme->n_notice_event);
2479 break;
2480
2481 case NVME_ASYNC_NOTICE_ENDURANCELOG:
2482 dev_err(nvme->n_dip, CE_NOTE,
2483 "endurance group event aggregate log page change, "
2484 "logpage = 0x%x", event.b.ae_logpage);
2485 atomic_inc_32(&nvme->n_notice_event);
2486 break;
2487
2488 default:
2489 dev_err(nvme->n_dip, CE_WARN,
2490 "!unknown notice async event received, "
2491 "info = 0x%x, logpage = 0x%x", event.b.ae_info,
2492 event.b.ae_logpage);
2493 atomic_inc_32(&nvme->n_unknown_event);
2494 break;
2495 }
2496 break;
2497
2498 case NVME_ASYNC_TYPE_VENDOR:
2499 dev_err(nvme->n_dip, CE_WARN, "!vendor specific async event "
2500 "received, info = 0x%x, logpage = 0x%x", event.b.ae_info,
2501 event.b.ae_logpage);
2502 atomic_inc_32(&nvme->n_vendor_event);
2503 break;
2504
2505 default:
2506 dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, "
2507 "type = 0x%x, info = 0x%x, logpage = 0x%x", event.b.ae_type,
2508 event.b.ae_info, event.b.ae_logpage);
2509 atomic_inc_32(&nvme->n_unknown_event);
2510 break;
2511 }
2512
2513 if (error_log != NULL)
2514 kmem_free(error_log, logsize);
2515
2516 if (health_log != NULL)
2517 kmem_free(health_log, logsize);
2518
2519 if (nslist != NULL)
2520 kmem_free(nslist, logsize);
2521 }
2522
2523 static void
2524 nvme_admin_cmd(nvme_cmd_t *cmd, uint32_t sec)
2525 {
2526 mutex_enter(&cmd->nc_mutex);
2527 nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd);
2528 nvme_wait_cmd(cmd, sec);
2529 mutex_exit(&cmd->nc_mutex);
2530 }
2531
2532 static void
2533 nvme_async_event(nvme_t *nvme)
2534 {
2535 nvme_cmd_t *cmd;
2536
2537 cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2538 cmd->nc_sqid = 0;
2539 cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
2540 cmd->nc_callback = nvme_async_event_task;
2541 cmd->nc_dontpanic = B_TRUE;
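/*
 * The request stays outstanding until an event fires; the callback
 * then re-submits it from nvme_async_event_task().
 */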
2542
2543 nvme_submit_admin_cmd(nvme->n_adminq, cmd);
2544 }
2545
2546 /*
2547 * Commands such as format or vendor unique commands may manipulate or
2548 * destroy the data in a namespace. Before issuing one, we make sure
2549 * that none of the namespaces that would be impacted are actually attached.
2550 */
2551 static boolean_t
2552 nvme_no_blkdev_attached(nvme_t *nvme, uint32_t nsid)
2553 {
2554 ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));
2555 ASSERT3U(nsid, !=, 0);
2556
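/*
 * NVME_NSID_BCAST addresses all namespaces at once, so in that case
 * every namespace on the controller must be detached.
 */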
2557 if (nsid != NVME_NSID_BCAST) {
2558 nvme_namespace_t *ns = nvme_nsid2ns(nvme, nsid);
2559 return (!ns->ns_attached);
2560 }
2561
2562 for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
2563 nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
2564
2565 if (ns->ns_attached) {
2566 return (B_FALSE);
2567 }
2568 }
2569
2570 return (B_TRUE);
2571 }
2572
2573 static boolean_t
2574 nvme_format_nvm(nvme_t *nvme, nvme_ioctl_format_t *ioc)
2575 {
2576 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2577 nvme_format_nvm_t format_nvm = { 0 };
2578 boolean_t ret;
2579
2580 format_nvm.b.fm_lbaf = bitx32(ioc->nif_lbaf, 3, 0);
2581 format_nvm.b.fm_ses = bitx32(ioc->nif_ses, 2, 0);
2582
2583 cmd->nc_sqid = 0;
2584 cmd->nc_callback = nvme_wakeup_cmd;
2585 cmd->nc_sqe.sqe_nsid = ioc->nif_common.nioc_nsid;
2586 cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT;
2587 cmd->nc_sqe.sqe_cdw10 = format_nvm.r;
2588
2589 /*
2590 * We don't want to panic on any format commands. There are two reasons
2591 * for this:
2592 *
2593 * 1) All format commands are initiated by users. We don't want to panic
2594 * on user commands.
2595 *
2596 * 2) Several devices like the Samsung SM951 don't allow formatting of
2597 * all namespaces in one command and we'd prefer to handle that
2598 * gracefully.
2599 */
2600 cmd->nc_dontpanic = B_TRUE;
2601
2602 nvme_admin_cmd(cmd, nvme_format_cmd_timeout);
2603
2604 if (!nvme_check_cmd_status_ioctl(cmd, &ioc->nif_common)) {
2605 dev_err(nvme->n_dip, CE_WARN,
2606 "!FORMAT failed with sct = %x, sc = %x",
2607 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2608 ret = B_FALSE;
2609 goto fail;
2610 }
2611
2612 ret = B_TRUE;
2613 fail:
2614 nvme_free_cmd(cmd);
2615 return (ret);
2616 }
2617
2618 /*
2619 * Retrieve a specific log page. The contents of the log page request should
2620 * have already been validated by the system.
2621 */
2622 static boolean_t
2623 nvme_get_logpage(nvme_t *nvme, boolean_t user, nvme_ioctl_get_logpage_t *log,
2624 void **buf)
2625 {
2626 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2627 nvme_getlogpage_dw10_t dw10;
2628 uint32_t offlo, offhi;
2629 nvme_getlogpage_dw11_t dw11;
2630 nvme_getlogpage_dw14_t dw14;
2631 uint32_t ndw;
2632 boolean_t ret = B_FALSE;
2633
2634 bzero(&dw10, sizeof (dw10));
2635 bzero(&dw11, sizeof (dw11));
2636 bzero(&dw14, sizeof (dw14));
2637
2638 cmd->nc_sqid = 0;
2639 cmd->nc_callback = nvme_wakeup_cmd;
2640 cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE;
2641 cmd->nc_sqe.sqe_nsid = log->nigl_common.nioc_nsid;
2642
2643 if (user)
2644 cmd->nc_dontpanic = B_TRUE;
2645
2646 /*
2647 * The size field is the number of double words, but it is a zero-based
2648 * value; we need to store our actual value minus one.
2649 */
2650 ndw = (uint32_t)(log->nigl_len / 4);
2651 ASSERT3U(ndw, >, 0);
2652 ndw--;
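/* E.g. a 4096 byte log page is 1024 dwords and is encoded as 1023. */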
2653
2654 dw10.b.lp_lid = bitx32(log->nigl_lid, 7, 0);
2655 dw10.b.lp_lsp = bitx32(log->nigl_lsp, 6, 0);
2656 dw10.b.lp_rae = bitx32(log->nigl_lsp, 0, 0);
2657 dw10.b.lp_lnumdl = bitx32(ndw, 15, 0);
2658
2659 dw11.b.lp_numdu = bitx32(ndw, 31, 16);
2660 dw11.b.lp_lsi = bitx32(log->nigl_lsi, 15, 0);
2661
2662 offlo = bitx64(log->nigl_offset, 31, 0);
2663 offhi = bitx64(log->nigl_offset, 63, 32);
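/*
 * The 64-bit log page offset is split across cdw12 (lower 32 bits) and
 * cdw13 (upper 32 bits) below.
 */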
2664
2665 dw14.b.lp_csi = bitx32(log->nigl_csi, 7, 0);
2666
2667 cmd->nc_sqe.sqe_cdw10 = dw10.r;
2668 cmd->nc_sqe.sqe_cdw11 = dw11.r;
2669 cmd->nc_sqe.sqe_cdw12 = offlo;
2670 cmd->nc_sqe.sqe_cdw13 = offhi;
2671 cmd->nc_sqe.sqe_cdw14 = dw14.r;
2672
2673 if (nvme_zalloc_dma(nvme, log->nigl_len, DDI_DMA_READ,
2674 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
2675 dev_err(nvme->n_dip, CE_WARN,
2676 "!nvme_zalloc_dma failed for GET LOG PAGE");
2677 ret = nvme_ioctl_error(&log->nigl_common,
2678 NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
2679 goto fail;
2680 }
2681
2682 if (nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah) != 0) {
2683 ret = nvme_ioctl_error(&log->nigl_common,
2684 NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
2685 goto fail;
2686 }
2687 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2688
2689 if (!nvme_check_cmd_status_ioctl(cmd, &log->nigl_common)) {
2690 if (!user) {
2691 dev_err(nvme->n_dip, CE_WARN,
2692 "!GET LOG PAGE failed with sct = %x, sc = %x",
2693 cmd->nc_cqe.cqe_sf.sf_sct,
2694 cmd->nc_cqe.cqe_sf.sf_sc);
2695 }
2696 ret = B_FALSE;
2697 goto fail;
2698 }
2699
2700 *buf = kmem_alloc(log->nigl_len, KM_SLEEP);
2701 bcopy(cmd->nc_dma->nd_memp, *buf, log->nigl_len);
2702
2703 ret = B_TRUE;
2704 fail:
2705 nvme_free_cmd(cmd);
2706
2707 return (ret);
2708 }
2709
2710 /*
2711 * This is an internal wrapper for when the kernel wants to get a log page.
2712 * Currently this assumes that the only thing that is required is the log page
2713 * ID. If more information is required, we'll be better served to just use the
2714 * general ioctl interface.
2715 */
2716 static boolean_t
2717 nvme_get_logpage_int(nvme_t *nvme, boolean_t user, void **buf, size_t *bufsize,
2718 uint8_t lid)
2719 {
2720 const nvme_log_page_info_t *info = NULL;
2721 nvme_ioctl_get_logpage_t log;
2722 nvme_valid_ctrl_data_t data;
2723 boolean_t bret;
2724 bool var;
2725
2726 for (size_t i = 0; i < nvme_std_log_npages; i++) {
2727 if (nvme_std_log_pages[i].nlpi_lid == lid &&
2728 nvme_std_log_pages[i].nlpi_csi == NVME_CSI_NVM) {
2729 info = &nvme_std_log_pages[i];
2730 break;
2731 }
2732 }
2733
2734 if (info == NULL) {
2735 return (B_FALSE);
2736 }
2737
2738 data.vcd_vers = &nvme->n_version;
2739 data.vcd_id = nvme->n_idctl;
2740 bzero(&log, sizeof (log));
2741 log.nigl_common.nioc_nsid = NVME_NSID_BCAST;
2742 log.nigl_csi = info->nlpi_csi;
2743 log.nigl_lid = info->nlpi_lid;
2744 log.nigl_len = nvme_log_page_info_size(info, &data, &var);
2745
2746 /*
2747 * We only support getting standard fixed-length log pages through the
2748 * kernel interface at this time. If a log page either has an unknown
2749 * size or has a variable length, then we cannot get it.
2750 */
2751 if (log.nigl_len == 0 || var) {
2752 return (B_FALSE);
2753 }
2754
2755 bret = nvme_get_logpage(nvme, user, &log, buf);
2756 if (!bret) {
2757 return (B_FALSE);
2758 }
2759
2760 *bufsize = log.nigl_len;
2761 return (B_TRUE);
2762 }
2763
2764 static boolean_t
2765 nvme_identify(nvme_t *nvme, boolean_t user, nvme_ioctl_identify_t *ioc,
2766 void **buf)
2767 {
2768 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2769 boolean_t ret = B_FALSE;
2770 nvme_identify_dw10_t dw10;
2771
2772 ASSERT3P(buf, !=, NULL);
2773
2774 bzero(&dw10, sizeof (dw10));
2775
2776 cmd->nc_sqid = 0;
2777 cmd->nc_callback = nvme_wakeup_cmd;
2778 cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY;
2779 cmd->nc_sqe.sqe_nsid = ioc->nid_common.nioc_nsid;
2780
2781 dw10.b.id_cns = bitx32(ioc->nid_cns, 7, 0);
2782 dw10.b.id_cntid = bitx32(ioc->nid_ctrlid, 15, 0);
2783
2784 cmd->nc_sqe.sqe_cdw10 = dw10.r;
2785
2786 if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ,
2787 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
2788 dev_err(nvme->n_dip, CE_WARN,
2789 "!nvme_zalloc_dma failed for IDENTIFY");
2790 ret = nvme_ioctl_error(&ioc->nid_common,
2791 NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
2792 goto fail;
2793 }
2794
2795 if (cmd->nc_dma->nd_ncookie > 2) {
2796 dev_err(nvme->n_dip, CE_WARN,
2797 "!too many DMA cookies for IDENTIFY");
2798 atomic_inc_32(&nvme->n_too_many_cookies);
2799 ret = nvme_ioctl_error(&ioc->nid_common,
2800 NVME_IOCTL_E_BAD_PRP, 0, 0);
2801 goto fail;
2802 }
2803
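/*
 * The 4k IDENTIFY buffer spans at most two pages, so one or two PRP
 * entries are sufficient to describe it.
 */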
2804 cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
2805 if (cmd->nc_dma->nd_ncookie > 1) {
2806 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
2807 &cmd->nc_dma->nd_cookie);
2808 cmd->nc_sqe.sqe_dptr.d_prp[1] =
2809 cmd->nc_dma->nd_cookie.dmac_laddress;
2810 }
2811
2812 if (user)
2813 cmd->nc_dontpanic = B_TRUE;
2814
2815 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2816
2817 if (!nvme_check_cmd_status_ioctl(cmd, &ioc->nid_common)) {
2818 dev_err(nvme->n_dip, CE_WARN,
2819 "!IDENTIFY failed with sct = %x, sc = %x",
2820 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2821 ret = B_FALSE;
2822 goto fail;
2823 }
2824
2825 *buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP);
2826 bcopy(cmd->nc_dma->nd_memp, *buf, NVME_IDENTIFY_BUFSIZE);
2827 ret = B_TRUE;
2828
2829 fail:
2830 nvme_free_cmd(cmd);
2831
2832 return (ret);
2833 }
2834
2835 static boolean_t
2836 nvme_identify_int(nvme_t *nvme, uint32_t nsid, uint8_t cns, void **buf)
2837 {
2838 nvme_ioctl_identify_t id;
2839
2840 bzero(&id, sizeof (nvme_ioctl_identify_t));
2841 id.nid_common.nioc_nsid = nsid;
2842 id.nid_cns = cns;
2843
2844 return (nvme_identify(nvme, B_FALSE, &id, buf));
2845 }
2846
2847 static int
2848 nvme_set_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature,
2849 uint32_t val, uint32_t *res)
2850 {
2851 _NOTE(ARGUNUSED(nsid));
2852 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2853 int ret = EINVAL;
2854
2855 ASSERT(res != NULL);
2856
2857 cmd->nc_sqid = 0;
2858 cmd->nc_callback = nvme_wakeup_cmd;
2859 cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES;
2860 cmd->nc_sqe.sqe_cdw10 = feature;
2861 cmd->nc_sqe.sqe_cdw11 = val;
2862
2863 if (user)
2864 cmd->nc_dontpanic = B_TRUE;
2865
2866 switch (feature) {
2867 case NVME_FEAT_WRITE_CACHE:
2868 if (!nvme->n_write_cache_present)
2869 goto fail;
2870 break;
2871
2872 case NVME_FEAT_NQUEUES:
2873 break;
2874
2875 default:
2876 goto fail;
2877 }
2878
2879 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2880
2881 if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2882 dev_err(nvme->n_dip, CE_WARN,
2883 "!SET FEATURES %d failed with sct = %x, sc = %x",
2884 feature, cmd->nc_cqe.cqe_sf.sf_sct,
2885 cmd->nc_cqe.cqe_sf.sf_sc);
2886 goto fail;
2887 }
2888
2889 *res = cmd->nc_cqe.cqe_dw0;
2890
2891 fail:
2892 nvme_free_cmd(cmd);
2893 return (ret);
2894 }
2895
2896 static int
2897 nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
2898 {
2899 nvme_write_cache_t nwc = { 0 };
2900
2901 if (enable)
2902 nwc.b.wc_wce = 1;
2903
2904 /*
2905 * We've seen some cases where this fails due to us being told we've
2906 * specified an invalid namespace when operating against the Xen xcp-ng
2907 * qemu NVMe virtual device. As such, we generally ensure that trying to
2908 * enable this doesn't lead us to panic. It's not completely clear why
2909 * specifying namespace zero here fails, but not when we're setting the
2910 * number of queues below.
2911 */
2912 return (nvme_set_features(nvme, B_TRUE, 0, NVME_FEAT_WRITE_CACHE,
2913 nwc.r, &nwc.r));
2914 }
2915
2916 static int
2917 nvme_set_nqueues(nvme_t *nvme)
2918 {
2919 nvme_nqueues_t nq = { 0 };
2920 int ret;
2921
2922 /*
2923 * The default is to allocate one completion queue per vector.
2924 */
2925 if (nvme->n_completion_queues == -1)
2926 nvme->n_completion_queues = nvme->n_intr_cnt;
2927
2928 /*
2929 * There is no point in having more completion queues than
2930 * interrupt vectors.
2931 */
2932 nvme->n_completion_queues = MIN(nvme->n_completion_queues,
2933 nvme->n_intr_cnt);
2934
2935 /*
2936 * The default is to use one submission queue per completion queue.
2937 */
2938 if (nvme->n_submission_queues == -1)
2939 nvme->n_submission_queues = nvme->n_completion_queues;
2940
2941 /*
2942 * There is no point in having more completion queues than
2943 * submission queues.
2944 */
2945 nvme->n_completion_queues = MIN(nvme->n_completion_queues,
2946 nvme->n_submission_queues);
2947
2948 ASSERT(nvme->n_submission_queues > 0);
2949 ASSERT(nvme->n_completion_queues > 0);
2950
2951 nq.b.nq_nsq = nvme->n_submission_queues - 1;
2952 nq.b.nq_ncq = nvme->n_completion_queues - 1;
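/*
 * Both the queue counts requested in cdw11 and those granted in dw0 of
 * the completion are zero-based values, hence the -1 above and the +1
 * below.
 */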
2953
2954 ret = nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_NQUEUES, nq.r,
2955 &nq.r);
2956
2957 if (ret == 0) {
2958 /*
2959 * Never use more than the requested number of queues.
2960 */
2961 nvme->n_submission_queues = MIN(nvme->n_submission_queues,
2962 nq.b.nq_nsq + 1);
2963 nvme->n_completion_queues = MIN(nvme->n_completion_queues,
2964 nq.b.nq_ncq + 1);
2965 }
2966
2967 return (ret);
2968 }
2969
2970 static int
2971 nvme_create_completion_queue(nvme_t *nvme, nvme_cq_t *cq)
2972 {
2973 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2974 nvme_create_queue_dw10_t dw10 = { 0 };
2975 nvme_create_cq_dw11_t c_dw11 = { 0 };
2976 int ret;
2977
2978 dw10.b.q_qid = cq->ncq_id;
2979 dw10.b.q_qsize = cq->ncq_nentry - 1;
2980
2981 c_dw11.b.cq_pc = 1;
2982 c_dw11.b.cq_ien = 1;
2983 c_dw11.b.cq_iv = cq->ncq_id % nvme->n_intr_cnt;
2984
2985 cmd->nc_sqid = 0;
2986 cmd->nc_callback = nvme_wakeup_cmd;
2987 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE;
2988 cmd->nc_sqe.sqe_cdw10 = dw10.r;
2989 cmd->nc_sqe.sqe_cdw11 = c_dw11.r;
2990 cmd->nc_sqe.sqe_dptr.d_prp[0] = cq->ncq_dma->nd_cookie.dmac_laddress;
2991
2992 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2993
2994 if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2995 dev_err(nvme->n_dip, CE_WARN,
2996 "!CREATE CQUEUE failed with sct = %x, sc = %x",
2997 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2998 }
2999
3000 nvme_free_cmd(cmd);
3001
3002 return (ret);
3003 }
3004
3005 static int
3006 nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
3007 {
3008 nvme_cq_t *cq = qp->nq_cq;
3009 nvme_cmd_t *cmd;
3010 nvme_create_queue_dw10_t dw10 = { 0 };
3011 nvme_create_sq_dw11_t s_dw11 = { 0 };
3012 int ret;
3013
3014 /*
3015 * It is possible to have more qpairs than completion queues,
3016 * and when idx > ncq_id, that completion queue is shared
3017 * and has already been created.
3018 */
3019 if (idx <= cq->ncq_id &&
3020 nvme_create_completion_queue(nvme, cq) != DDI_SUCCESS)
3021 return (DDI_FAILURE);
3022
3023 dw10.b.q_qid = idx;
3024 dw10.b.q_qsize = qp->nq_nentry - 1;
3025
3026 s_dw11.b.sq_pc = 1;
3027 s_dw11.b.sq_cqid = cq->ncq_id;
3028
3029 cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
3030 cmd->nc_sqid = 0;
3031 cmd->nc_callback = nvme_wakeup_cmd;
3032 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE;
3033 cmd->nc_sqe.sqe_cdw10 = dw10.r;
3034 cmd->nc_sqe.sqe_cdw11 = s_dw11.r;
3035 cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress;
3036
3037 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
3038
3039 if ((ret = nvme_check_cmd_status(cmd)) != 0) {
3040 dev_err(nvme->n_dip, CE_WARN,
3041 "!CREATE SQUEUE failed with sct = %x, sc = %x",
3042 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
3043 }
3044
3045 nvme_free_cmd(cmd);
3046
3047 return (ret);
3048 }
3049
3050 static boolean_t
3051 nvme_reset(nvme_t *nvme, boolean_t quiesce)
3052 {
3053 nvme_reg_csts_t csts;
3054 int i;
3055
3056 nvme_put32(nvme, NVME_REG_CC, 0);
3057
3058 csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3059 if (csts.b.csts_rdy == 1) {
3060 nvme_put32(nvme, NVME_REG_CC, 0);
3061
3062 /*
3063 * The timeout value is from the Controller Capabilities
3064 * register (CAP.TO, section 3.1.1). This is the worst case
3065 * time to wait for CSTS.RDY to transition from 1 to 0 after
3066 * CC.EN transitions from 1 to 0.
3067 *
3068 * The timeout units are in 500 ms units, and we are delaying
3069 * in 50ms chunks, hence counting to n_timeout * 10.
3070 */
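/* E.g. CAP.TO == 20 means a 10s worst case, i.e. 200 iterations of 50ms. */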
3071 for (i = 0; i < nvme->n_timeout * 10; i++) {
3072 csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3073 if (csts.b.csts_rdy == 0)
3074 break;
3075
3076 /*
3077 * Quiescing drivers should not use locks or timeouts,
3078 * so if this is the quiesce path, use a quiesce-safe
3079 * delay.
3080 */
3081 if (quiesce) {
3082 drv_usecwait(50000);
3083 } else {
3084 delay(drv_usectohz(50000));
3085 }
3086 }
3087 }
3088
3089 nvme_put32(nvme, NVME_REG_AQA, 0);
3090 nvme_put32(nvme, NVME_REG_ASQ, 0);
3091 nvme_put32(nvme, NVME_REG_ACQ, 0);
3092
3093 csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3094 return (csts.b.csts_rdy == 0 ? B_TRUE : B_FALSE);
3095 }
3096
3097 static void
3098 nvme_shutdown(nvme_t *nvme, boolean_t quiesce)
3099 {
3100 nvme_reg_cc_t cc;
3101 nvme_reg_csts_t csts;
3102 int i;
3103
3104 cc.r = nvme_get32(nvme, NVME_REG_CC);
3105 cc.b.cc_shn = NVME_CC_SHN_NORMAL;
3106 nvme_put32(nvme, NVME_REG_CC, cc.r);
3107
3108 for (i = 0; i < 10; i++) {
3109 csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3110 if (csts.b.csts_shst == NVME_CSTS_SHN_COMPLETE)
3111 break;
3112
3113 if (quiesce) {
3114 drv_usecwait(100000);
3115 } else {
3116 delay(drv_usectohz(100000));
3117 }
3118 }
3119 }
3120
3121 /*
3122 * Return length of string without trailing spaces.
3123 */
3124 static int
3125 nvme_strlen(const char *str, int len)
3126 {
3127 if (len <= 0)
3128 return (0);
3129
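/*
 * Note that this relies on the string containing at least one
 * non-space character; the space-padded identify controller fields
 * this is used on are expected to satisfy that.
 */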
3130 while (str[--len] == ' ')
3131 ;
3132
3133 return (++len);
3134 }
3135
3136 static void
3137 nvme_config_min_block_size(nvme_t *nvme, char *model, char *val)
3138 {
3139 ulong_t bsize = 0;
3140 char *msg = "";
3141
3142 if (ddi_strtoul(val, NULL, 0, &bsize) != 0)
3143 goto err;
3144
3145 if (!ISP2(bsize)) {
3146 msg = ": not a power of 2";
3147 goto err;
3148 }
3149
3150 if (bsize < NVME_DEFAULT_MIN_BLOCK_SIZE) {
3151 msg = ": too low";
3152 goto err;
3153 }
3154
3155 nvme->n_min_block_size = bsize;
3156 return;
3157
3158 err:
3159 dev_err(nvme->n_dip, CE_WARN,
3160 "!nvme-config-list: ignoring invalid min-phys-block-size '%s' "
3161 "for model '%s'%s", val, model, msg);
3162
3163 nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE;
3164 }
3165
3166 static void
3167 nvme_config_boolean(nvme_t *nvme, char *model, char *name, char *val,
3168 boolean_t *b)
3169 {
3170 if (strcmp(val, "on") == 0 ||
3171 strcmp(val, "true") == 0)
3172 *b = B_TRUE;
3173 else if (strcmp(val, "off") == 0 ||
3174 strcmp(val, "false") == 0)
3175 *b = B_FALSE;
3176 else
3177 dev_err(nvme->n_dip, CE_WARN,
3178 "!nvme-config-list: invalid value for %s '%s'"
3179 " for model '%s', ignoring", name, val, model);
3180 }
3181
3182 static void
3183 nvme_config_list(nvme_t *nvme)
3184 {
3185 char **config_list;
3186 uint_t nelem;
3187 int rv, i;
3188
3189 /*
3190 * We're following the pattern of 'sd-config-list' here, but extend it.
3191 * Instead of two we have three separate strings for "model", "fwrev",
3192 * and "name-value-list".
3193 */
3194 rv = ddi_prop_lookup_string_array(DDI_DEV_T_ANY, nvme->n_dip,
3195 DDI_PROP_DONTPASS, "nvme-config-list", &config_list, &nelem);
3196
3197 if (rv != DDI_PROP_SUCCESS) {
3198 if (rv == DDI_PROP_CANNOT_DECODE) {
3199 dev_err(nvme->n_dip, CE_WARN,
3200 "!nvme-config-list: cannot be decoded");
3201 }
3202
3203 return;
3204 }
3205
3206 if ((nelem % 3) != 0) {
3207 dev_err(nvme->n_dip, CE_WARN, "!nvme-config-list: must be "
3208 "triplets of <model>/<fwrev>/<name-value-list> strings");
3209 goto out;
3210 }
3211
3212 for (i = 0; i < nelem; i += 3) {
3213 char *model = config_list[i];
3214 char *fwrev = config_list[i + 1];
3215 char *nvp, *save_nv;
3216 int id_model_len, id_fwrev_len;
3217
3218 id_model_len = nvme_strlen(nvme->n_idctl->id_model,
3219 sizeof (nvme->n_idctl->id_model));
3220
3221 if (strlen(model) != id_model_len)
3222 continue;
3223
3224 if (strncmp(model, nvme->n_idctl->id_model, id_model_len) != 0)
3225 continue;
3226
3227 id_fwrev_len = nvme_strlen(nvme->n_idctl->id_fwrev,
3228 sizeof (nvme->n_idctl->id_fwrev));
3229
3230 if (strlen(fwrev) != 0) {
3231 boolean_t match = B_FALSE;
3232 char *fwr, *last_fw;
3233
3234 for (fwr = strtok_r(fwrev, ",", &last_fw);
3235 fwr != NULL;
3236 fwr = strtok_r(NULL, ",", &last_fw)) {
3237 if (strlen(fwr) != id_fwrev_len)
3238 continue;
3239
3240 if (strncmp(fwr, nvme->n_idctl->id_fwrev,
3241 id_fwrev_len) == 0)
3242 match = B_TRUE;
3243 }
3244
3245 if (!match)
3246 continue;
3247 }
3248
3249 /*
3250 * We should now have a comma-separated list of name:value
3251 * pairs.
3252 */
3253 for (nvp = strtok_r(config_list[i + 2], ",", &save_nv);
3254 nvp != NULL; nvp = strtok_r(NULL, ",", &save_nv)) {
3255 char *name = nvp;
3256 char *val = strchr(nvp, ':');
3257
3258 if (val == NULL || name == val) {
3259 dev_err(nvme->n_dip, CE_WARN,
3260 "!nvme-config-list: <name-value-list> "
3261 "for model '%s' is malformed", model);
3262 goto out;
3263 }
3264
3265 /*
3266 * Null-terminate 'name', move 'val' past ':' sep.
3267 */
3268 *val++ = '\0';
3269
3270 /*
3271 * Process the name:val pairs that we know about.
3272 */
3273 if (strcmp(name, "ignore-unknown-vendor-status") == 0) {
3274 nvme_config_boolean(nvme, model, name, val,
3275 &nvme->n_ignore_unknown_vendor_status);
3276 } else if (strcmp(name, "min-phys-block-size") == 0) {
3277 nvme_config_min_block_size(nvme, model, val);
3278 } else if (strcmp(name, "volatile-write-cache") == 0) {
3279 nvme_config_boolean(nvme, model, name, val,
3280 &nvme->n_write_cache_enabled);
3281 } else {
3282 /*
3283 * Unknown 'name'.
3284 */
3285 dev_err(nvme->n_dip, CE_WARN,
3286 "!nvme-config-list: unknown config '%s' "
3287 "for model '%s', ignoring", name, model);
3288 }
3289 }
3290 }
3291
3292 out:
3293 ddi_prop_free(config_list);
3294 }
3295
3296 static void
nvme_prepare_devid(nvme_t * nvme,uint32_t nsid)3297 nvme_prepare_devid(nvme_t *nvme, uint32_t nsid)
3298 {
3299 /*
3300 * Section 7.7 of the spec describes how to get a unique ID for
3301 * the controller: the vendor ID, the model name and the serial
3302 * number shall be unique when combined.
3303 *
3304 * If a namespace has no EUI64 we use the above and add the hex
3305 * namespace ID to get a unique ID for the namespace.
3306 */
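	/*
	 * A purely hypothetical example: a controller with vendor ID 0x1234,
	 * model "EXAMPLE MODEL" and serial "SN000001" yields the devid
	 * "1234-EXAMPLE MODEL-SN000001-1" for namespace 1.
	 */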
	char model[sizeof (nvme->n_idctl->id_model) + 1];
	char serial[sizeof (nvme->n_idctl->id_serial) + 1];

	bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
	bcopy(nvme->n_idctl->id_serial, serial,
	    sizeof (nvme->n_idctl->id_serial));

	model[sizeof (nvme->n_idctl->id_model)] = '\0';
	serial[sizeof (nvme->n_idctl->id_serial)] = '\0';

	nvme_nsid2ns(nvme, nsid)->ns_devid = kmem_asprintf("%4X-%s-%s-%X",
	    nvme->n_idctl->id_vid, model, serial, nsid);
}

static nvme_identify_nsid_list_t *
nvme_update_nsid_list(nvme_t *nvme, int cns)
{
	nvme_identify_nsid_list_t *nslist;

	/*
	 * We currently don't handle cases where there are more than
	 * 1024 active namespaces, requiring several IDENTIFY commands.
	 */
	if (nvme_identify_int(nvme, 0, cns, (void **)&nslist))
		return (nslist);

	return (NULL);
}

nvme_namespace_t *
nvme_nsid2ns(nvme_t *nvme, uint32_t nsid)
{
	ASSERT3U(nsid, !=, 0);
	ASSERT3U(nsid, <=, nvme->n_namespace_count);
	return (&nvme->n_ns[nsid - 1]);
}

static boolean_t
nvme_allocated_ns(nvme_namespace_t *ns)
{
	nvme_t *nvme = ns->ns_nvme;
	uint32_t i;

	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));

	/*
	 * If supported, update the list of allocated namespace IDs.
	 */
	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 2) &&
	    nvme->n_idctl->id_oacs.oa_nsmgmt != 0) {
		nvme_identify_nsid_list_t *nslist = nvme_update_nsid_list(nvme,
		    NVME_IDENTIFY_NSID_ALLOC_LIST);
		boolean_t found = B_FALSE;

		/*
		 * When namespace management is supported, this really
		 * shouldn't be NULL. Treat all namespaces as allocated if it
		 * is.
		 */
		if (nslist == NULL)
			return (B_TRUE);

		for (i = 0; i < ARRAY_SIZE(nslist->nl_nsid); i++) {
			if (ns->ns_id == 0)
				break;

			if (ns->ns_id == nslist->nl_nsid[i])
				found = B_TRUE;
		}

		kmem_free(nslist, NVME_IDENTIFY_BUFSIZE);
		return (found);
	} else {
		/*
		 * If namespace management isn't supported, report all
		 * namespaces as allocated.
		 */
		return (B_TRUE);
	}
}

static boolean_t
nvme_active_ns(nvme_namespace_t *ns)
{
	nvme_t *nvme = ns->ns_nvme;
	uint64_t *ptr;
	uint32_t i;

	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));

	/*
	 * If supported, update the list of active namespace IDs.
	 */
	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) {
		nvme_identify_nsid_list_t *nslist = nvme_update_nsid_list(nvme,
		    NVME_IDENTIFY_NSID_LIST);
		boolean_t found = B_FALSE;

		/*
		 * When the active namespace list is supported, this really
		 * shouldn't be NULL. Treat all namespaces as active if it is.
		 */
		if (nslist == NULL)
			return (B_TRUE);

		for (i = 0; i < ARRAY_SIZE(nslist->nl_nsid); i++) {
			if (ns->ns_id == 0)
				break;

			if (ns->ns_id == nslist->nl_nsid[i])
				found = B_TRUE;
		}

		kmem_free(nslist, NVME_IDENTIFY_BUFSIZE);
		return (found);
	}

	/*
	 * Workaround for revision 1.0:
	 * Check whether the IDENTIFY NAMESPACE data is zero-filled.
	 */
	for (ptr = (uint64_t *)ns->ns_idns;
	    ptr != (uint64_t *)(ns->ns_idns + 1);
	    ptr++) {
		if (*ptr != 0) {
			return (B_TRUE);
		}
	}

	return (B_FALSE);
}

static int
nvme_init_ns(nvme_t *nvme, uint32_t nsid)
{
	nvme_namespace_t *ns = nvme_nsid2ns(nvme, nsid);
	nvme_identify_nsid_t *idns;
	boolean_t was_ignored;
	int last_rp;

	ns->ns_nvme = nvme;

	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));

	/*
	 * Because we might rescan a namespace after boot, a failure here
	 * would leave us in a bad spot. We need to do something about this
	 * longer term, but it's not clear how exactly we would recover right
	 * now.
	 */
	if (!nvme_identify_int(nvme, nsid, NVME_IDENTIFY_NSID,
	    (void **)&idns)) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to identify namespace %d", nsid);
		return (DDI_FAILURE);
	}

	if (ns->ns_idns != NULL)
		kmem_free(ns->ns_idns, sizeof (nvme_identify_nsid_t));

	ns->ns_idns = idns;
	ns->ns_id = nsid;

	ns->ns_allocated = nvme_allocated_ns(ns);
	ns->ns_active = nvme_active_ns(ns);

	ns->ns_block_count = idns->id_nsize;
	ns->ns_block_size =
	    1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads;
	ns->ns_best_block_size = ns->ns_block_size;

	/*
	 * Get the EUI64 if present.
	 */
	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1))
		bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64));

	/*
	 * Get the NGUID if present.
	 */
	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 2))
		bcopy(idns->id_nguid, ns->ns_nguid, sizeof (ns->ns_nguid));

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	if (*(uint64_t *)ns->ns_eui64 == 0)
		nvme_prepare_devid(nvme, ns->ns_id);

	(void) snprintf(ns->ns_name, sizeof (ns->ns_name), "%u", ns->ns_id);

	/*
	 * Find the LBA format with no metadata and the best relative
	 * performance. A value of 3 means "degraded", 0 is best.
	 */
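	/*
	 * For example (a hypothetical LBA format table): if a namespace
	 * offers a 512B format with relative performance 2 and a 4k format
	 * with relative performance 0, the loop below selects the 4k format
	 * for ns_best_block_size.
	 */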
	last_rp = 3;
	for (int j = 0; j <= idns->id_nlbaf; j++) {
		if (idns->id_lbaf[j].lbaf_lbads == 0)
			break;
		if (idns->id_lbaf[j].lbaf_ms != 0)
			continue;
		if (idns->id_lbaf[j].lbaf_rp >= last_rp)
			continue;
		last_rp = idns->id_lbaf[j].lbaf_rp;
		ns->ns_best_block_size =
		    1 << idns->id_lbaf[j].lbaf_lbads;
	}

	if (ns->ns_best_block_size < nvme->n_min_block_size)
		ns->ns_best_block_size = nvme->n_min_block_size;

	was_ignored = ns->ns_ignore;

	/*
	 * We currently don't support namespaces that are inactive, or use
	 * either:
	 * - protection information
	 * - illegal block size (< 512)
	 */
	if (!ns->ns_active) {
		ns->ns_ignore = B_TRUE;
	} else if (idns->id_dps.dp_pinfo) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!ignoring namespace %d, unsupported feature: "
		    "pinfo = %d", nsid, idns->id_dps.dp_pinfo);
		ns->ns_ignore = B_TRUE;
	} else if (ns->ns_block_size < 512) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!ignoring namespace %d, unsupported block size %"PRIu64,
		    nsid, (uint64_t)ns->ns_block_size);
		ns->ns_ignore = B_TRUE;
	} else {
		ns->ns_ignore = B_FALSE;
	}

	/*
	 * Keep a count of namespaces which are attachable.
	 * See comments in nvme_bd_driveinfo() to understand its effect.
	 */
	if (was_ignored) {
		/*
		 * Previously ignored, but now not. Count it.
		 */
		if (!ns->ns_ignore)
			nvme->n_namespaces_attachable++;
	} else {
		/*
		 * Wasn't ignored previously, but now needs to be.
		 * Discount it.
		 */
		if (ns->ns_ignore)
			nvme->n_namespaces_attachable--;
	}

	return (DDI_SUCCESS);
}

static boolean_t
nvme_attach_ns(nvme_t *nvme, nvme_ioctl_common_t *com)
{
	nvme_namespace_t *ns = nvme_nsid2ns(nvme, com->nioc_nsid);

	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));

	if (ns->ns_ignore) {
		return (nvme_ioctl_error(com, NVME_IOCTL_E_UNSUP_ATTACH_NS,
		    0, 0));
	}

	if (ns->ns_bd_hdl == NULL) {
		bd_ops_t ops = nvme_bd_ops;

		if (!nvme->n_idctl->id_oncs.on_dset_mgmt)
			ops.o_free_space = NULL;

		ns->ns_bd_hdl = bd_alloc_handle(ns, &ops, &nvme->n_prp_dma_attr,
		    KM_SLEEP);

		if (ns->ns_bd_hdl == NULL) {
			dev_err(nvme->n_dip, CE_WARN, "!Failed to get blkdev "
			    "handle for namespace id %u", com->nioc_nsid);
			return (nvme_ioctl_error(com,
			    NVME_IOCTL_E_BLKDEV_ATTACH, 0, 0));
		}
	}

	if (bd_attach_handle(nvme->n_dip, ns->ns_bd_hdl) != DDI_SUCCESS) {
		return (nvme_ioctl_error(com, NVME_IOCTL_E_BLKDEV_ATTACH,
		    0, 0));
	}

	ns->ns_attached = B_TRUE;

	return (B_TRUE);
}

static boolean_t
nvme_detach_ns(nvme_t *nvme, nvme_ioctl_common_t *com)
{
	nvme_namespace_t *ns = nvme_nsid2ns(nvme, com->nioc_nsid);

	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));

	if (ns->ns_ignore || !ns->ns_attached)
		return (B_TRUE);

	ASSERT3P(ns->ns_bd_hdl, !=, NULL);
	if (bd_detach_handle(ns->ns_bd_hdl) != DDI_SUCCESS) {
		return (nvme_ioctl_error(com, NVME_IOCTL_E_BLKDEV_DETACH, 0,
		    0));
	}

	ns->ns_attached = B_FALSE;
	return (B_TRUE);
}

/*
 * Rescan the namespace information associated with the namespace(s) indicated
 * by nsid. They should not be attached to blkdev right now.
 */
static void
nvme_rescan_ns(nvme_t *nvme, uint32_t nsid)
{
	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));
	ASSERT3U(nsid, !=, 0);

	if (nsid != NVME_NSID_BCAST) {
		nvme_namespace_t *ns = nvme_nsid2ns(nvme, nsid);

		ASSERT3U(ns->ns_attached, ==, B_FALSE);
		(void) nvme_init_ns(nvme, nsid);
		return;
	}

	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
		nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);

		ASSERT3U(ns->ns_attached, ==, B_FALSE);
		(void) nvme_init_ns(nvme, i);
	}
}

typedef struct nvme_quirk_table {
	uint16_t nq_vendor_id;
	uint16_t nq_device_id;
	nvme_quirk_t nq_quirks;
} nvme_quirk_table_t;

static const nvme_quirk_table_t nvme_quirks[] = {
	{ 0x1987, 0x5018, NVME_QUIRK_START_CID },	/* Phison E18 */
};

static void
nvme_detect_quirks(nvme_t *nvme)
{
	for (uint_t i = 0; i < ARRAY_SIZE(nvme_quirks); i++) {
		const nvme_quirk_table_t *nqt = &nvme_quirks[i];

		if (nqt->nq_vendor_id == nvme->n_vendor_id &&
		    nqt->nq_device_id == nvme->n_device_id) {
			nvme->n_quirks = nqt->nq_quirks;
			return;
		}
	}
}

static int
nvme_init(nvme_t *nvme)
{
	nvme_reg_cc_t cc = { 0 };
	nvme_reg_aqa_t aqa = { 0 };
	nvme_reg_asq_t asq = { 0 };
	nvme_reg_acq_t acq = { 0 };
	nvme_reg_cap_t cap;
	nvme_reg_vs_t vs;
	nvme_reg_csts_t csts;
	int i = 0;
	uint16_t nqueues;
	uint_t tq_threads;
	char model[sizeof (nvme->n_idctl->id_model) + 1];
	char *vendor, *product;
	uint32_t nsid;

	/* Check controller version */
	vs.r = nvme_get32(nvme, NVME_REG_VS);
	nvme->n_version.v_major = vs.b.vs_mjr;
	nvme->n_version.v_minor = vs.b.vs_mnr;
	dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d",
	    nvme->n_version.v_major, nvme->n_version.v_minor);

	if (nvme->n_version.v_major > nvme_version_major) {
		dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.x",
		    nvme_version_major);
		if (nvme->n_strict_version)
			goto fail;
	}

	/* retrieve controller configuration */
	cap.r = nvme_get64(nvme, NVME_REG_CAP);

	if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!NVM command set not supported by hardware");
		goto fail;
	}

	nvme->n_nssr_supported = cap.b.cap_nssrs;
	nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd;
	nvme->n_timeout = cap.b.cap_to;
	nvme->n_arbitration_mechanisms = cap.b.cap_ams;
	nvme->n_cont_queues_reqd = cap.b.cap_cqr;
	nvme->n_max_queue_entries = cap.b.cap_mqes + 1;

	/*
	 * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify
	 * the base page size of 4k (1<<12), so add 12 here to get the real
	 * page size value.
	 */
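	/*
	 * For example, with cap_mpsmin == 0 and cap_mpsmax == 0 (i.e. the
	 * controller only supports 4k pages) and PAGESHIFT == 12, this
	 * yields n_pageshift == 12 and n_pagesize == 4096.
	 */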
	nvme->n_pageshift = MIN(MAX(cap.b.cap_mpsmin + 12, PAGESHIFT),
	    cap.b.cap_mpsmax + 12);
	nvme->n_pagesize = 1UL << (nvme->n_pageshift);

	/*
	 * Set up Queue DMA to transfer at least 1 page-aligned page at a time.
	 */
	nvme->n_queue_dma_attr.dma_attr_align = nvme->n_pagesize;
	nvme->n_queue_dma_attr.dma_attr_minxfer = nvme->n_pagesize;

	/*
	 * Set up PRP DMA to transfer 1 page-aligned page at a time.
	 * Maxxfer may be increased once we have identified the controller
	 * limits.
	 */
	nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_pagesize;
	nvme->n_prp_dma_attr.dma_attr_minxfer = nvme->n_pagesize;
	nvme->n_prp_dma_attr.dma_attr_align = nvme->n_pagesize;
	nvme->n_prp_dma_attr.dma_attr_seg = nvme->n_pagesize - 1;

	/*
	 * Reset controller if it's still in ready state.
	 */
	if (nvme_reset(nvme, B_FALSE) == B_FALSE) {
		dev_err(nvme->n_dip, CE_WARN, "!unable to reset controller");
		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
		nvme->n_dead = B_TRUE;
		goto fail;
	}

	/*
	 * Create the cq array with one completion queue to be assigned
	 * to the admin queue pair and a limited number of taskqs (4).
	 */
	if (nvme_create_cq_array(nvme, 1, nvme->n_admin_queue_len, 4) !=
	    DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to pre-allocate admin completion queue");
		goto fail;
	}

	/*
	 * Create the admin queue pair.
	 */
	if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0)
	    != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!unable to allocate admin qpair");
		goto fail;
	}
	nvme->n_ioq = kmem_alloc(sizeof (nvme_qpair_t *), KM_SLEEP);
	nvme->n_ioq[0] = nvme->n_adminq;

	if (nvme->n_quirks & NVME_QUIRK_START_CID)
		nvme->n_adminq->nq_next_cmd++;

	nvme->n_progress |= NVME_ADMIN_QUEUE;

	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
	    "admin-queue-len", nvme->n_admin_queue_len);

	aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1;
	asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress;
	acq = nvme->n_adminq->nq_cq->ncq_dma->nd_cookie.dmac_laddress;

	ASSERT((asq & (nvme->n_pagesize - 1)) == 0);
	ASSERT((acq & (nvme->n_pagesize - 1)) == 0);

	nvme_put32(nvme, NVME_REG_AQA, aqa.r);
	nvme_put64(nvme, NVME_REG_ASQ, asq);
	nvme_put64(nvme, NVME_REG_ACQ, acq);

	cc.b.cc_ams = 0;	/* use Round-Robin arbitration */
	cc.b.cc_css = 0;	/* use NVM command set */
	cc.b.cc_mps = nvme->n_pageshift - 12;
	cc.b.cc_shn = 0;	/* no shutdown in progress */
	cc.b.cc_en = 1;		/* enable controller */
	cc.b.cc_iosqes = 6;	/* submission queue entry is 2^6 bytes long */
	cc.b.cc_iocqes = 4;	/* completion queue entry is 2^4 bytes long */

	nvme_put32(nvme, NVME_REG_CC, cc.r);

	/*
	 * Wait for the controller to become ready.
	 */
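	/*
	 * CAP.TO is given in units of 500ms, so polling in 50ms steps for up
	 * to n_timeout * 10 iterations covers exactly the advertised timeout.
	 */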
	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
	if (csts.b.csts_rdy == 0) {
		for (i = 0; i != nvme->n_timeout * 10; i++) {
			delay(drv_usectohz(50000));
			csts.r = nvme_get32(nvme, NVME_REG_CSTS);

			if (csts.b.csts_cfs == 1) {
				dev_err(nvme->n_dip, CE_WARN,
				    "!controller fatal status at init");
				ddi_fm_service_impact(nvme->n_dip,
				    DDI_SERVICE_LOST);
				nvme->n_dead = B_TRUE;
				goto fail;
			}

			if (csts.b.csts_rdy == 1)
				break;
		}
	}

	if (csts.b.csts_rdy == 0) {
		dev_err(nvme->n_dip, CE_WARN, "!controller not ready");
		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
		nvme->n_dead = B_TRUE;
		goto fail;
	}

	/*
	 * Assume an abort command limit of 1. We'll destroy and re-init
	 * that later when we know the true abort command limit.
	 */
	sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL);

	/*
	 * Set up initial interrupt for admin queue.
	 */
	if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1)
	    != DDI_SUCCESS) &&
	    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1)
	    != DDI_SUCCESS) &&
	    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1)
	    != DDI_SUCCESS)) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to set up initial interrupt");
		goto fail;
	}

	/*
	 * Post an asynchronous event command to catch errors.
	 * We assume the asynchronous events are supported as required by
	 * specification (Figure 40 in section 5 of NVMe 1.2).
	 * However, since at least qemu does not follow the specification,
	 * we need a mechanism to protect ourselves.
	 */
	nvme->n_async_event_supported = B_TRUE;
	nvme_async_event(nvme);

	/*
	 * Identify Controller
	 */
	if (!nvme_identify_int(nvme, 0, NVME_IDENTIFY_CTRL,
	    (void **)&nvme->n_idctl)) {
		dev_err(nvme->n_dip, CE_WARN, "!failed to identify controller");
		goto fail;
	}

	/*
	 * Get the common namespace information if available. If not, we use
	 * the information for nsid 1.
	 */
	if (nvme_ctrl_atleast(nvme, &nvme_vers_1v2) &&
	    nvme->n_idctl->id_oacs.oa_nsmgmt != 0) {
		nsid = NVME_NSID_BCAST;
	} else {
		nsid = 1;
	}

	if (!nvme_identify_int(nvme, nsid, NVME_IDENTIFY_NSID,
	    (void **)&nvme->n_idcomns)) {
		dev_err(nvme->n_dip, CE_WARN, "!failed to identify common "
		    "namespace information");
		goto fail;
	}

	/*
	 * Process nvme-config-list (if present) in nvme.conf.
	 */
	nvme_config_list(nvme);

	/*
	 * Get Vendor & Product ID
	 */
	bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
	model[sizeof (nvme->n_idctl->id_model)] = '\0';
	sata_split_model(model, &vendor, &product);

	if (vendor == NULL)
		nvme->n_vendor = strdup("NVMe");
	else
		nvme->n_vendor = strdup(vendor);

	nvme->n_product = strdup(product);

	/*
	 * Get controller limits.
	 */
	nvme->n_async_event_limit = MAX(NVME_MIN_ASYNC_EVENT_LIMIT,
	    MIN(nvme->n_admin_queue_len / 10,
	    MIN(nvme->n_idctl->id_aerl + 1, nvme->n_async_event_limit)));

	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
	    "async-event-limit", nvme->n_async_event_limit);

	nvme->n_abort_command_limit = nvme->n_idctl->id_acl + 1;

	/*
	 * Reinitialize the semaphore with the true abort command limit
	 * supported by the hardware. It's not necessary to disable interrupts
	 * as only command aborts use the semaphore, and no commands are
	 * executed or aborted while we're here.
	 */
	sema_destroy(&nvme->n_abort_sema);
	sema_init(&nvme->n_abort_sema, nvme->n_abort_command_limit - 1, NULL,
	    SEMA_DRIVER, NULL);

	nvme->n_progress |= NVME_CTRL_LIMITS;

	if (nvme->n_idctl->id_mdts == 0)
		nvme->n_max_data_transfer_size = nvme->n_pagesize * 65536;
	else
		nvme->n_max_data_transfer_size =
		    1ull << (nvme->n_pageshift + nvme->n_idctl->id_mdts);
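	/*
	 * MDTS is a power-of-two multiple of the page size used here; e.g. an
	 * id_mdts of 5 with a 4k page shift allows transfers of up to
	 * 1 << (12 + 5) = 128k.
	 */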
	nvme->n_error_log_len = nvme->n_idctl->id_elpe + 1;

	/*
	 * Limit n_max_data_transfer_size to what we can handle in one PRP.
	 * Chained PRPs are currently unsupported.
	 *
	 * This is a no-op on hardware which doesn't support a transfer size
	 * big enough to require chained PRPs.
	 */
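	/*
	 * One PRP list page holds n_pagesize / 8 entries, so with 4k pages
	 * this caps transfers at 512 * 4k = 2MB.
	 */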
	nvme->n_max_data_transfer_size = MIN(nvme->n_max_data_transfer_size,
	    (nvme->n_pagesize / sizeof (uint64_t) * nvme->n_pagesize));

	nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_max_data_transfer_size;

	/*
	 * Make sure the queue entry sizes we use fall within the
	 * minimum and maximum entry sizes supported by the controller.
	 */
	if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) ||
	    ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) ||
	    ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) ||
	    ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t)))
		goto fail;

	/*
	 * Check for the presence of a Volatile Write Cache. If present,
	 * enable or disable based on the value of the property
	 * volatile-write-cache-enable (default is enabled).
	 */
	nvme->n_write_cache_present =
	    nvme->n_idctl->id_vwc.vwc_present == 0 ? B_FALSE : B_TRUE;

	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
	    "volatile-write-cache-present",
	    nvme->n_write_cache_present ? 1 : 0);

	if (!nvme->n_write_cache_present) {
		nvme->n_write_cache_enabled = B_FALSE;
	} else if (nvme_write_cache_set(nvme, nvme->n_write_cache_enabled)
	    != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to %sable volatile write cache",
		    nvme->n_write_cache_enabled ? "en" : "dis");
		/*
		 * Assume the cache is (still) enabled.
		 */
		nvme->n_write_cache_enabled = B_TRUE;
	}

	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
	    "volatile-write-cache-enable",
	    nvme->n_write_cache_enabled ? 1 : 0);

	/*
	 * Get number of supported namespaces and allocate namespace array.
	 */
	nvme->n_namespace_count = nvme->n_idctl->id_nn;

	if (nvme->n_namespace_count == 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!controllers without namespaces are not supported");
		goto fail;
	}

	if (nvme->n_namespace_count > NVME_MINOR_MAX) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!too many namespaces: %d, limiting to %d\n",
		    nvme->n_namespace_count, NVME_MINOR_MAX);
		nvme->n_namespace_count = NVME_MINOR_MAX;
	}

	nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) *
	    nvme->n_namespace_count, KM_SLEEP);

	/*
	 * Try to set up MSI/MSI-X interrupts.
	 */
	if ((nvme->n_intr_types & (DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX))
	    != 0) {
		nvme_release_interrupts(nvme);

		nqueues = MIN(UINT16_MAX, ncpus);

		if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX,
		    nqueues) != DDI_SUCCESS) &&
		    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI,
		    nqueues) != DDI_SUCCESS)) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!failed to set up MSI/MSI-X interrupts");
			goto fail;
		}
	}

	/*
	 * Create I/O queue pairs.
	 */
	if (nvme_set_nqueues(nvme) != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to set number of I/O queues to %d",
		    nvme->n_intr_cnt);
		goto fail;
	}

	/*
	 * Reallocate I/O queue array
	 */
	kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *));
	nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) *
	    (nvme->n_submission_queues + 1), KM_SLEEP);
	nvme->n_ioq[0] = nvme->n_adminq;

	/*
	 * There should always be at least as many submission queues
	 * as completion queues.
	 */
	ASSERT(nvme->n_submission_queues >= nvme->n_completion_queues);

	nvme->n_ioq_count = nvme->n_submission_queues;

	nvme->n_io_squeue_len =
	    MIN(nvme->n_io_squeue_len, nvme->n_max_queue_entries);

	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-squeue-len",
	    nvme->n_io_squeue_len);

	/*
	 * Pre-allocate completion queues.
	 * When there are the same number of submission and completion
	 * queues there is no value in having a larger completion
	 * queue length.
	 */
	if (nvme->n_submission_queues == nvme->n_completion_queues)
		nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
		    nvme->n_io_squeue_len);

	nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
	    nvme->n_max_queue_entries);

	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-cqueue-len",
	    nvme->n_io_cqueue_len);

	/*
	 * Assign an equal number of taskq threads to each completion
	 * queue, capping the total number of threads to the number
	 * of CPUs.
	 */
	tq_threads = MIN(UINT16_MAX, ncpus) / nvme->n_completion_queues;

	/*
	 * In case the calculation above is zero, we need at least one
	 * thread per completion queue.
	 */
	tq_threads = MAX(1, tq_threads);

	if (nvme_create_cq_array(nvme, nvme->n_completion_queues + 1,
	    nvme->n_io_cqueue_len, tq_threads) != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to pre-allocate completion queues");
		goto fail;
	}

	/*
	 * If we use fewer completion queues than interrupt vectors, return
	 * some of the interrupt vectors back to the system.
	 */
	if (nvme->n_completion_queues + 1 < nvme->n_intr_cnt) {
		nvme_release_interrupts(nvme);

		if (nvme_setup_interrupts(nvme, nvme->n_intr_type,
		    nvme->n_completion_queues + 1) != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!failed to reduce number of interrupts");
			goto fail;
		}
	}

	/*
	 * Alloc & register I/O queue pairs
	 */
	for (i = 1; i != nvme->n_ioq_count + 1; i++) {
		if (nvme_alloc_qpair(nvme, nvme->n_io_squeue_len,
		    &nvme->n_ioq[i], i) != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!unable to allocate I/O qpair %d", i);
			goto fail;
		}

		if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) != 0) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!unable to create I/O qpair %d", i);
			goto fail;
		}
	}

	/*
	 * Post more asynchronous event commands to reduce event reporting
	 * latency as suggested by the spec.
	 */
	if (nvme->n_async_event_supported) {
		for (i = 1; i != nvme->n_async_event_limit; i++)
			nvme_async_event(nvme);
	}

	return (DDI_SUCCESS);

fail:
	(void) nvme_reset(nvme, B_FALSE);
	return (DDI_FAILURE);
}

static uint_t
nvme_intr(caddr_t arg1, caddr_t arg2)
{
	/*LINTED: E_PTR_BAD_CAST_ALIGN*/
	nvme_t *nvme = (nvme_t *)arg1;
	int inum = (int)(uintptr_t)arg2;
	int ccnt = 0;
	int qnum;

	if (inum >= nvme->n_intr_cnt)
		return (DDI_INTR_UNCLAIMED);

	if (nvme->n_dead)
		return (nvme->n_intr_type == DDI_INTR_TYPE_FIXED ?
		    DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED);

	/*
	 * The interrupt vector a queue uses is calculated as queue_idx %
	 * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
	 * in steps of n_intr_cnt to process all queues using this vector.
	 */
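	/*
	 * For example, with n_intr_cnt == 4, the handler for vector 1
	 * processes completion queues 1, 5, 9, and so on, up to n_cq_count.
	 */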
	for (qnum = inum;
	    qnum < nvme->n_cq_count && nvme->n_cq[qnum] != NULL;
	    qnum += nvme->n_intr_cnt) {
		ccnt += nvme_process_iocq(nvme, nvme->n_cq[qnum]);
	}

	return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
}

static void
nvme_release_interrupts(nvme_t *nvme)
{
	int i;

	for (i = 0; i < nvme->n_intr_cnt; i++) {
		if (nvme->n_inth[i] == NULL)
			break;

		if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
			(void) ddi_intr_block_disable(&nvme->n_inth[i], 1);
		else
			(void) ddi_intr_disable(nvme->n_inth[i]);

		(void) ddi_intr_remove_handler(nvme->n_inth[i]);
		(void) ddi_intr_free(nvme->n_inth[i]);
	}

	kmem_free(nvme->n_inth, nvme->n_inth_sz);
	nvme->n_inth = NULL;
	nvme->n_inth_sz = 0;

	nvme->n_progress &= ~NVME_INTERRUPTS;
}

static int
nvme_setup_interrupts(nvme_t *nvme, int intr_type, int nqpairs)
{
	int nintrs, navail, count;
	int ret;
	int i;

	if (nvme->n_intr_types == 0) {
		ret = ddi_intr_get_supported_types(nvme->n_dip,
		    &nvme->n_intr_types);
		if (ret != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!%s: ddi_intr_get_supported_types failed",
			    __func__);
			return (ret);
		}
#ifdef __x86
		if (get_hwenv() == HW_VMWARE)
			nvme->n_intr_types &= ~DDI_INTR_TYPE_MSIX;
#endif
	}

	if ((nvme->n_intr_types & intr_type) == 0)
		return (DDI_FAILURE);

	ret = ddi_intr_get_nintrs(nvme->n_dip, intr_type, &nintrs);
	if (ret != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_nintrs failed",
		    __func__);
		return (ret);
	}

	ret = ddi_intr_get_navail(nvme->n_dip, intr_type, &navail);
	if (ret != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_navail failed",
		    __func__);
		return (ret);
	}

	/* We want at most one interrupt per queue pair. */
	if (navail > nqpairs)
		navail = nqpairs;

	nvme->n_inth_sz = sizeof (ddi_intr_handle_t) * navail;
	nvme->n_inth = kmem_zalloc(nvme->n_inth_sz, KM_SLEEP);

	ret = ddi_intr_alloc(nvme->n_dip, nvme->n_inth, intr_type, 0, navail,
	    &count, 0);
	if (ret != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_alloc failed",
		    __func__);
		goto fail;
	}

	nvme->n_intr_cnt = count;

	ret = ddi_intr_get_pri(nvme->n_inth[0], &nvme->n_intr_pri);
	if (ret != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_pri failed",
		    __func__);
		goto fail;
	}

	for (i = 0; i < count; i++) {
		ret = ddi_intr_add_handler(nvme->n_inth[i], nvme_intr,
		    (void *)nvme, (void *)(uintptr_t)i);
		if (ret != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!%s: ddi_intr_add_handler failed", __func__);
			goto fail;
		}
	}

	(void) ddi_intr_get_cap(nvme->n_inth[0], &nvme->n_intr_cap);

	for (i = 0; i < count; i++) {
		if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
			ret = ddi_intr_block_enable(&nvme->n_inth[i], 1);
		else
			ret = ddi_intr_enable(nvme->n_inth[i]);

		if (ret != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!%s: enabling interrupt %d failed", __func__, i);
			goto fail;
		}
	}

	nvme->n_intr_type = intr_type;

	nvme->n_progress |= NVME_INTERRUPTS;

	return (DDI_SUCCESS);

fail:
	nvme_release_interrupts(nvme);

	return (ret);
}

static int
nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg)
{
	_NOTE(ARGUNUSED(arg));

	pci_ereport_post(dip, fm_error, NULL);
	return (fm_error->fme_status);
}

static void
nvme_remove_callback(dev_info_t *dip, ddi_eventcookie_t cookie, void *a,
    void *b)
{
	nvme_t *nvme = a;

	nvme_ctrl_mark_dead(nvme, B_TRUE);

	/*
	 * Fail all outstanding commands, including those in the admin queue
	 * (queue 0).
	 */
	for (uint_t i = 0; i < nvme->n_ioq_count + 1; i++) {
		nvme_qpair_t *qp = nvme->n_ioq[i];

		mutex_enter(&qp->nq_mutex);
		for (size_t j = 0; j < qp->nq_nentry; j++) {
			nvme_cmd_t *cmd = qp->nq_cmd[j];
			nvme_cmd_t *u_cmd;

			if (cmd == NULL) {
				continue;
			}

			/*
			 * Since we have the queue lock held the entire time we
			 * iterate over it, it's not possible for the queue to
			 * change underneath us. Thus, we don't need to check
			 * that the return value of nvme_unqueue_cmd matches the
			 * requested cmd to unqueue.
			 */
			u_cmd = nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid);
			taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq,
			    cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);

			ASSERT3P(u_cmd, ==, cmd);
		}
		mutex_exit(&qp->nq_mutex);
	}
}

/*
 * Open minor management
 */
static int
nvme_minor_comparator(const void *l, const void *r)
{
	const nvme_minor_t *lm = l;
	const nvme_minor_t *rm = r;

	if (lm->nm_minor > rm->nm_minor) {
		return (1);
	} else if (lm->nm_minor < rm->nm_minor) {
		return (-1);
	} else {
		return (0);
	}
}

static void
nvme_minor_free(nvme_minor_t *minor)
{
	if (minor->nm_minor > 0) {
		ASSERT3S(minor->nm_minor, >=, NVME_OPEN_MINOR_MIN);
		id_free(nvme_open_minors, minor->nm_minor);
		minor->nm_minor = 0;
	}
	VERIFY0(list_link_active(&minor->nm_ctrl_lock.nli_node));
	VERIFY0(list_link_active(&minor->nm_ns_lock.nli_node));
	cv_destroy(&minor->nm_cv);
	kmem_free(minor, sizeof (nvme_minor_t));
}

static nvme_minor_t *
nvme_minor_find_by_dev(dev_t dev)
{
	id_t id = (id_t)getminor(dev);
	nvme_minor_t search = { .nm_minor = id };
	nvme_minor_t *ret;

	mutex_enter(&nvme_open_minors_mutex);
	ret = avl_find(&nvme_open_minors_avl, &search, NULL);
	mutex_exit(&nvme_open_minors_mutex);

	return (ret);
}

static int
nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	nvme_t *nvme;
	int instance;
	int nregs;
	off_t regsize;
	char name[32];
	boolean_t attached_ns;

	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	instance = ddi_get_instance(dip);

	if (ddi_soft_state_zalloc(nvme_state, instance) != DDI_SUCCESS)
		return (DDI_FAILURE);

	nvme = ddi_get_soft_state(nvme_state, instance);
	ddi_set_driver_private(dip, nvme);
	nvme->n_dip = dip;

	/*
	 * Map PCI config space
	 */
	if (pci_config_setup(dip, &nvme->n_pcicfg_handle) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "!failed to map PCI config space");
		goto fail;
	}
	nvme->n_progress |= NVME_PCI_CONFIG;

	/*
	 * Get the various PCI IDs from config space
	 */
	nvme->n_vendor_id =
	    pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_VENID);
	nvme->n_device_id =
	    pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_DEVID);
	nvme->n_revision_id =
	    pci_config_get8(nvme->n_pcicfg_handle, PCI_CONF_REVID);
	nvme->n_subsystem_device_id =
	    pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_SUBSYSID);
	nvme->n_subsystem_vendor_id =
	    pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_SUBVENID);

	nvme_detect_quirks(nvme);

	/*
	 * Set up event handlers for hot removal. While npe(4D) supports the
	 * hot removal event being injected for devices, the same is not true
	 * of all of our possible parents (i.e. pci(4D) as of this writing).
	 * This most commonly shows up in some virtualization environments. We
	 * should treat this as non-fatal so that devices still work, but leave
	 * this set up in such a way that if a nexus does grow support for
	 * this we're good to go.
	 */
	if (ddi_get_eventcookie(nvme->n_dip, DDI_DEVI_REMOVE_EVENT,
	    &nvme->n_rm_cookie) == DDI_SUCCESS) {
		if (ddi_add_event_handler(nvme->n_dip, nvme->n_rm_cookie,
		    nvme_remove_callback, nvme, &nvme->n_ev_rm_cb_id) !=
		    DDI_SUCCESS) {
			goto fail;
		}
	} else {
		nvme->n_ev_rm_cb_id = NULL;
	}

	mutex_init(&nvme->n_minor_mutex, NULL, MUTEX_DRIVER, NULL);
	nvme->n_progress |= NVME_MUTEX_INIT;

	nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE;
	nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY,
	    dip, DDI_PROP_DONTPASS, "ignore-unknown-vendor-status", 0) == 1 ?
	    B_TRUE : B_FALSE;
	nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN);
	nvme->n_io_squeue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "io-squeue-len", NVME_DEFAULT_IO_QUEUE_LEN);
	/*
	 * Double up the default for completion queues in case of
	 * queue sharing.
	 */
	nvme->n_io_cqueue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "io-cqueue-len", 2 * NVME_DEFAULT_IO_QUEUE_LEN);
	nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "async-event-limit",
	    NVME_DEFAULT_ASYNC_EVENT_LIMIT);
	nvme->n_write_cache_enabled = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "volatile-write-cache-enable", 1) != 0 ?
	    B_TRUE : B_FALSE;
	nvme->n_min_block_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "min-phys-block-size",
	    NVME_DEFAULT_MIN_BLOCK_SIZE);
	nvme->n_submission_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "max-submission-queues", -1);
	nvme->n_completion_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "max-completion-queues", -1);

	if (!ISP2(nvme->n_min_block_size) ||
	    (nvme->n_min_block_size < NVME_DEFAULT_MIN_BLOCK_SIZE)) {
		dev_err(dip, CE_WARN, "!min-phys-block-size %s, "
		    "using default %d", ISP2(nvme->n_min_block_size) ?
		    "too low" : "not a power of 2",
		    NVME_DEFAULT_MIN_BLOCK_SIZE);
		nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE;
	}

	if (nvme->n_submission_queues != -1 &&
	    (nvme->n_submission_queues < 1 ||
	    nvme->n_submission_queues > UINT16_MAX)) {
		dev_err(dip, CE_WARN, "!\"submission-queues\"=%d is not "
		    "valid. Must be [1..%d]", nvme->n_submission_queues,
		    UINT16_MAX);
		nvme->n_submission_queues = -1;
	}

	if (nvme->n_completion_queues != -1 &&
	    (nvme->n_completion_queues < 1 ||
	    nvme->n_completion_queues > UINT16_MAX)) {
		dev_err(dip, CE_WARN, "!\"completion-queues\"=%d is not "
		    "valid. Must be [1..%d]", nvme->n_completion_queues,
		    UINT16_MAX);
		nvme->n_completion_queues = -1;
	}

	if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN)
		nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN;
	else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN)
		nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN;

	if (nvme->n_io_squeue_len < NVME_MIN_IO_QUEUE_LEN)
		nvme->n_io_squeue_len = NVME_MIN_IO_QUEUE_LEN;
	if (nvme->n_io_cqueue_len < NVME_MIN_IO_QUEUE_LEN)
		nvme->n_io_cqueue_len = NVME_MIN_IO_QUEUE_LEN;

	if (nvme->n_async_event_limit < 1)
		nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT;

	nvme->n_reg_acc_attr = nvme_reg_acc_attr;
	nvme->n_queue_dma_attr = nvme_queue_dma_attr;
	nvme->n_prp_dma_attr = nvme_prp_dma_attr;
	nvme->n_sgl_dma_attr = nvme_sgl_dma_attr;

	/*
	 * Set up FMA support.
	 */
	nvme->n_fm_cap = ddi_getprop(DDI_DEV_T_ANY, dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "fm-capable",
	    DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
	    DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE);

	ddi_fm_init(dip, &nvme->n_fm_cap, &nvme->n_fm_ibc);

	if (nvme->n_fm_cap) {
		if (nvme->n_fm_cap & DDI_FM_ACCCHK_CAPABLE)
			nvme->n_reg_acc_attr.devacc_attr_access =
			    DDI_FLAGERR_ACC;

		if (nvme->n_fm_cap & DDI_FM_DMACHK_CAPABLE) {
			nvme->n_prp_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
			nvme->n_sgl_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
		}

		if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
		    DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
			pci_ereport_setup(dip);

		if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
			ddi_fm_handler_register(dip, nvme_fm_errcb,
			    (void *)nvme);
	}

	nvme->n_progress |= NVME_FMA_INIT;

	/*
	 * The spec defines several register sets. Only the controller
	 * registers (set 1) are currently used.
	 */
	if (ddi_dev_nregs(dip, &nregs) == DDI_FAILURE ||
	    nregs < 2 ||
	    ddi_dev_regsize(dip, 1, &regsize) == DDI_FAILURE)
		goto fail;

	if (ddi_regs_map_setup(dip, 1, &nvme->n_regs, 0, regsize,
	    &nvme->n_reg_acc_attr, &nvme->n_regh) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "!failed to map regset 1");
		goto fail;
	}

	nvme->n_progress |= NVME_REGS_MAPPED;

	/*
	 * Create PRP DMA cache
	 */
	(void) snprintf(name, sizeof (name), "%s%d_prp_cache",
	    ddi_driver_name(dip), ddi_get_instance(dip));
	nvme->n_prp_cache = kmem_cache_create(name, sizeof (nvme_dma_t),
	    0, nvme_prp_dma_constructor, nvme_prp_dma_destructor,
	    NULL, (void *)nvme, NULL, 0);

	if (nvme_init(nvme) != DDI_SUCCESS)
		goto fail;

	/*
	 * Initialize the driver with the UFM subsystem
	 */
	if (ddi_ufm_init(dip, DDI_UFM_CURRENT_VERSION, &nvme_ufm_ops,
	    &nvme->n_ufmh, nvme) != 0) {
		dev_err(dip, CE_WARN, "!failed to initialize UFM subsystem");
		goto fail;
	}
	mutex_init(&nvme->n_fwslot_mutex, NULL, MUTEX_DRIVER, NULL);
	ddi_ufm_update(nvme->n_ufmh);
	nvme->n_progress |= NVME_UFM_INIT;

	mutex_init(&nvme->n_mgmt_mutex, NULL, MUTEX_DRIVER, NULL);
	nvme_lock_init(&nvme->n_lock);
	nvme->n_progress |= NVME_MGMT_INIT;
	nvme->n_dead_status = NVME_IOCTL_E_CTRL_DEAD;

	/*
	 * Identify namespaces.
	 */
	mutex_enter(&nvme->n_mgmt_mutex);

	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
		nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);

		nvme_lock_init(&ns->ns_lock);
		ns->ns_progress |= NVME_NS_LOCK;

		/*
		 * Namespaces start out ignored. When nvme_init_ns() checks
		 * their properties and finds they can be used, it will set
		 * ns_ignore to B_FALSE. It will also use this state change
		 * to keep an accurate count of attachable namespaces.
		 */
		ns->ns_ignore = B_TRUE;
		if (nvme_init_ns(nvme, i) != 0) {
			mutex_exit(&nvme->n_mgmt_mutex);
			goto fail;
		}

		if (ddi_create_minor_node(nvme->n_dip, ns->ns_name, S_IFCHR,
		    NVME_MINOR(ddi_get_instance(nvme->n_dip), i),
		    DDI_NT_NVME_ATTACHMENT_POINT, 0) != DDI_SUCCESS) {
			mutex_exit(&nvme->n_mgmt_mutex);
			dev_err(dip, CE_WARN,
			    "!failed to create minor node for namespace %d", i);
			goto fail;
		}
	}

	if (ddi_create_minor_node(dip, "devctl", S_IFCHR,
	    NVME_MINOR(ddi_get_instance(dip), 0), DDI_NT_NVME_NEXUS, 0)
	    != DDI_SUCCESS) {
		mutex_exit(&nvme->n_mgmt_mutex);
		dev_err(dip, CE_WARN, "nvme_attach: "
		    "cannot create devctl minor node");
		goto fail;
	}

	attached_ns = B_FALSE;
	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
		nvme_ioctl_common_t com = { .nioc_nsid = i };

		if (nvme_attach_ns(nvme, &com)) {
			attached_ns = B_TRUE;
		} else if (com.nioc_drv_err != NVME_IOCTL_E_UNSUP_ATTACH_NS) {
			dev_err(nvme->n_dip, CE_WARN, "!failed to attach "
			    "namespace %d due to blkdev error", i);
			/*
			 * Once we have successfully attached a namespace we
			 * can no longer fail the driver attach as there is now
			 * a blkdev child node linked to this device, and
			 * our node is not yet in the attached state.
			 */
			if (!attached_ns) {
				mutex_exit(&nvme->n_mgmt_mutex);
				goto fail;
			}
		}
	}

	mutex_exit(&nvme->n_mgmt_mutex);

	return (DDI_SUCCESS);

fail:
	/* attach successful anyway so that FMA can retire the device */
	if (nvme->n_dead)
		return (DDI_SUCCESS);

	(void) nvme_detach(dip, DDI_DETACH);

	return (DDI_FAILURE);
}

static int
nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int instance;
	nvme_t *nvme;

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	instance = ddi_get_instance(dip);

	nvme = ddi_get_soft_state(nvme_state, instance);

	if (nvme == NULL)
		return (DDI_FAILURE);

	/*
	 * Remove all minor nodes from the device regardless of the source in
	 * one swoop.
	 */
	ddi_remove_minor_node(dip, NULL);

	/*
	 * We need to remove the event handler as one of the first things that
	 * we do. If we proceed with other teardown without removing the event
	 * handler, we could end up in a very unfortunate race with ourselves.
	 * The DDI does not serialize these with detach (just like timeout(9F)
	 * and others).
	 */
	if (nvme->n_ev_rm_cb_id != NULL) {
		(void) ddi_remove_event_handler(nvme->n_ev_rm_cb_id);
	}
	nvme->n_ev_rm_cb_id = NULL;

	/*
	 * If the controller was marked dead, there is a slight chance that we
	 * are asynchronously processing the removal taskq. Because we have
	 * removed the callback handler above and all minor nodes and commands
	 * are closed, there is no other way to get in here. As such, we wait on
	 * the nvme_dead_taskq to complete so we can avoid tracking if it's
	 * running or not.
	 */
	taskq_wait(nvme_dead_taskq);

	if (nvme->n_ns) {
		for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
			nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);

			if (ns->ns_bd_hdl) {
				(void) bd_detach_handle(ns->ns_bd_hdl);
				bd_free_handle(ns->ns_bd_hdl);
			}

			if (ns->ns_idns)
				kmem_free(ns->ns_idns,
				    sizeof (nvme_identify_nsid_t));
			if (ns->ns_devid)
				strfree(ns->ns_devid);

			if ((ns->ns_progress & NVME_NS_LOCK) != 0)
				nvme_lock_fini(&ns->ns_lock);
		}

		kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) *
		    nvme->n_namespace_count);
	}

	if (nvme->n_progress & NVME_MGMT_INIT) {
		nvme_lock_fini(&nvme->n_lock);
		mutex_destroy(&nvme->n_mgmt_mutex);
	}

	if (nvme->n_progress & NVME_UFM_INIT) {
		ddi_ufm_fini(nvme->n_ufmh);
		mutex_destroy(&nvme->n_fwslot_mutex);
	}

	if (nvme->n_progress & NVME_INTERRUPTS)
		nvme_release_interrupts(nvme);

	for (uint_t i = 0; i < nvme->n_cq_count; i++) {
		if (nvme->n_cq[i]->ncq_cmd_taskq != NULL)
			taskq_wait(nvme->n_cq[i]->ncq_cmd_taskq);
	}

	if (nvme->n_progress & NVME_MUTEX_INIT) {
		mutex_destroy(&nvme->n_minor_mutex);
	}

	if (nvme->n_ioq_count > 0) {
		for (uint_t i = 1; i != nvme->n_ioq_count + 1; i++) {
			if (nvme->n_ioq[i] != NULL) {
				/* TODO: send destroy queue commands */
				nvme_free_qpair(nvme->n_ioq[i]);
			}
		}

		kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) *
		    (nvme->n_ioq_count + 1));
	}

	if (nvme->n_prp_cache != NULL) {
		kmem_cache_destroy(nvme->n_prp_cache);
	}

	if (nvme->n_progress & NVME_REGS_MAPPED) {
		nvme_shutdown(nvme, B_FALSE);
		(void) nvme_reset(nvme, B_FALSE);
	}

	if (nvme->n_progress & NVME_CTRL_LIMITS)
		sema_destroy(&nvme->n_abort_sema);

	if (nvme->n_progress & NVME_ADMIN_QUEUE)
		nvme_free_qpair(nvme->n_adminq);

	if (nvme->n_cq_count > 0) {
		nvme_destroy_cq_array(nvme, 0);
		nvme->n_cq = NULL;
		nvme->n_cq_count = 0;
	}

	if (nvme->n_idcomns)
		kmem_free(nvme->n_idcomns, NVME_IDENTIFY_BUFSIZE);

	if (nvme->n_idctl)
		kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE);

	if (nvme->n_progress & NVME_REGS_MAPPED)
		ddi_regs_map_free(&nvme->n_regh);

	if (nvme->n_progress & NVME_FMA_INIT) {
		if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
			ddi_fm_handler_unregister(nvme->n_dip);

		if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
		    DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
			pci_ereport_teardown(nvme->n_dip);

		ddi_fm_fini(nvme->n_dip);
	}

	if (nvme->n_progress & NVME_PCI_CONFIG)
		pci_config_teardown(&nvme->n_pcicfg_handle);

	if (nvme->n_vendor != NULL)
		strfree(nvme->n_vendor);

	if (nvme->n_product != NULL)
		strfree(nvme->n_product);

	ddi_soft_state_free(nvme_state, instance);

	return (DDI_SUCCESS);
}

static int
nvme_quiesce(dev_info_t *dip)
{
	int instance;
	nvme_t *nvme;

	instance = ddi_get_instance(dip);

	nvme = ddi_get_soft_state(nvme_state, instance);

	if (nvme == NULL)
		return (DDI_FAILURE);

	nvme_shutdown(nvme, B_TRUE);

	(void) nvme_reset(nvme, B_TRUE);

	return (DDI_SUCCESS);
}

static int
nvme_fill_prp(nvme_cmd_t *cmd, ddi_dma_handle_t dma)
{
	nvme_t *nvme = cmd->nc_nvme;
	uint_t nprp_per_page, nprp;
	uint64_t *prp;
	const ddi_dma_cookie_t *cookie;
	uint_t idx;
	uint_t ncookies = ddi_dma_ncookies(dma);

	if (ncookies == 0)
		return (DDI_FAILURE);

	if ((cookie = ddi_dma_cookie_get(dma, 0)) == NULL)
		return (DDI_FAILURE);
	cmd->nc_sqe.sqe_dptr.d_prp[0] = cookie->dmac_laddress;

	if (ncookies == 1) {
		cmd->nc_sqe.sqe_dptr.d_prp[1] = 0;
		return (DDI_SUCCESS);
	} else if (ncookies == 2) {
		if ((cookie = ddi_dma_cookie_get(dma, 1)) == NULL)
			return (DDI_FAILURE);
		cmd->nc_sqe.sqe_dptr.d_prp[1] = cookie->dmac_laddress;
		return (DDI_SUCCESS);
	}

	/*
	 * At this point, we're always operating on cookies at
	 * index >= 1 and writing the addresses of those cookies
	 * into a new page. The address of that page is stored
	 * as the second PRP entry.
	 */
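	/*
	 * For example, a transfer described by five DMA cookies places the
	 * address of cookie 0 in d_prp[0], while d_prp[1] points at a PRP
	 * list page holding the addresses of cookies 1 through 4.
	 */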
	nprp_per_page = nvme->n_pagesize / sizeof (uint64_t);
	ASSERT(nprp_per_page > 0);

	/*
	 * We currently don't support chained PRPs and set up our DMA
	 * attributes to reflect that. If we still get an I/O request
	 * that needs a chained PRP something is very wrong. Account
	 * for the first cookie here, which we've placed in d_prp[0].
	 */
	nprp = howmany(ncookies - 1, nprp_per_page);
	VERIFY(nprp == 1);

	/*
	 * Allocate a page of pointers, in which we'll write the
	 * addresses of cookies 1 to `ncookies`.
	 */
	cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, KM_SLEEP);
	bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len);
	cmd->nc_sqe.sqe_dptr.d_prp[1] = cmd->nc_prp->nd_cookie.dmac_laddress;

	prp = (uint64_t *)cmd->nc_prp->nd_memp;
	for (idx = 1; idx < ncookies; idx++) {
		if ((cookie = ddi_dma_cookie_get(dma, idx)) == NULL)
			return (DDI_FAILURE);
		*prp++ = cookie->dmac_laddress;
	}

	(void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len,
	    DDI_DMA_SYNC_FORDEV);
	return (DDI_SUCCESS);
}

/*
 * The maximum number of requests supported for a deallocate request is
 * NVME_DSET_MGMT_MAX_RANGES (256) -- this is from the NVMe 1.1 spec (and
 * unchanged through at least 1.4a). The definition of nvme_range_t is also
 * from the NVMe 1.1 spec. Together, the result is that all of the ranges for
 * a deallocate request will fit into the smallest supported namespace page
 * (4k).
 */
CTASSERT(sizeof (nvme_range_t) * NVME_DSET_MGMT_MAX_RANGES == 4096);

static int
nvme_fill_ranges(nvme_cmd_t *cmd, bd_xfer_t *xfer, uint64_t blocksize,
    int allocflag)
{
	const dkioc_free_list_t *dfl = xfer->x_dfl;
	const dkioc_free_list_ext_t *exts = dfl->dfl_exts;
	nvme_t *nvme = cmd->nc_nvme;
	nvme_range_t *ranges = NULL;
	uint_t i;

	/*
	 * The number of ranges in the request is 0s based (that is
	 * word10 == 0 -> 1 range, word10 == 1 -> 2 ranges, ...,
	 * word10 == 255 -> 256 ranges). Therefore the allowed values are
	 * [1..NVME_DSET_MGMT_MAX_RANGES]. If blkdev gives us a bad request,
	 * we either provided bad info in nvme_bd_driveinfo() or there is a bug
	 * in blkdev.
	 */
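	/*
	 * For example, a request with dfl_num_exts == 4 is encoded as
	 * sqe_cdw10 == 3 below.
	 */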
4973 VERIFY3U(dfl->dfl_num_exts, >, 0);
4974 VERIFY3U(dfl->dfl_num_exts, <=, NVME_DSET_MGMT_MAX_RANGES);
4975 cmd->nc_sqe.sqe_cdw10 = (dfl->dfl_num_exts - 1) & 0xff;
4976
4977 cmd->nc_sqe.sqe_cdw11 = NVME_DSET_MGMT_ATTR_DEALLOCATE;
4978
4979 cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, allocflag);
4980 if (cmd->nc_prp == NULL)
4981 return (DDI_FAILURE);
4982
4983 bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len);
4984 ranges = (nvme_range_t *)cmd->nc_prp->nd_memp;
4985
4986 cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_prp->nd_cookie.dmac_laddress;
4987 cmd->nc_sqe.sqe_dptr.d_prp[1] = 0;
4988
4989 for (i = 0; i < dfl->dfl_num_exts; i++) {
4990 uint64_t lba, len;
4991
4992 lba = (dfl->dfl_offset + exts[i].dfle_start) / blocksize;
4993 len = exts[i].dfle_length / blocksize;
4994
4995 VERIFY3U(len, <=, UINT32_MAX);
4996
4997 /* No context attributes for a deallocate request */
4998 ranges[i].nr_ctxattr = 0;
4999 ranges[i].nr_len = len;
5000 ranges[i].nr_lba = lba;
5001 }
5002
5003 (void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len,
5004 DDI_DMA_SYNC_FORDEV);
5005
5006 return (DDI_SUCCESS);
5007 }
5008
5009 static nvme_cmd_t *
nvme_create_nvm_cmd(nvme_namespace_t * ns,uint8_t opc,bd_xfer_t * xfer)5010 nvme_create_nvm_cmd(nvme_namespace_t *ns, uint8_t opc, bd_xfer_t *xfer)
5011 {
5012 nvme_t *nvme = ns->ns_nvme;
5013 nvme_cmd_t *cmd;
5014 int allocflag;
5015
5016 /*
5017 * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep.
5018 */
5019 allocflag = (xfer->x_flags & BD_XFER_POLL) ? KM_NOSLEEP : KM_SLEEP;
5020 cmd = nvme_alloc_cmd(nvme, allocflag);
5021
5022 if (cmd == NULL)
5023 return (NULL);
5024
5025 cmd->nc_sqe.sqe_opc = opc;
5026 cmd->nc_callback = nvme_bd_xfer_done;
5027 cmd->nc_xfer = xfer;
5028
5029 switch (opc) {
5030 case NVME_OPC_NVM_WRITE:
5031 case NVME_OPC_NVM_READ:
5032 VERIFY(xfer->x_nblks <= 0x10000);
5033
5034 cmd->nc_sqe.sqe_nsid = ns->ns_id;
5035
5036 cmd->nc_sqe.sqe_cdw10 = xfer->x_blkno & 0xffffffffu;
5037 cmd->nc_sqe.sqe_cdw11 = (xfer->x_blkno >> 32);
5038 cmd->nc_sqe.sqe_cdw12 = (uint16_t)(xfer->x_nblks - 1);
5039
5040 if (nvme_fill_prp(cmd, xfer->x_dmah) != DDI_SUCCESS)
5041 goto fail;
5042 break;
5043
5044 case NVME_OPC_NVM_FLUSH:
5045 cmd->nc_sqe.sqe_nsid = ns->ns_id;
5046 break;
5047
5048 case NVME_OPC_NVM_DSET_MGMT:
5049 cmd->nc_sqe.sqe_nsid = ns->ns_id;
5050
5051 if (nvme_fill_ranges(cmd, xfer,
5052 (uint64_t)ns->ns_block_size, allocflag) != DDI_SUCCESS)
5053 goto fail;
5054 break;
5055
5056 default:
5057 goto fail;
5058 }
5059
5060 return (cmd);
5061
5062 fail:
5063 nvme_free_cmd(cmd);
5064 return (NULL);
5065 }
5066
5067 static void
nvme_bd_xfer_done(void * arg)5068 nvme_bd_xfer_done(void *arg)
5069 {
5070 nvme_cmd_t *cmd = arg;
5071 bd_xfer_t *xfer = cmd->nc_xfer;
5072 int error = 0;
5073
5074 error = nvme_check_cmd_status(cmd);
5075 nvme_free_cmd(cmd);
5076
5077 bd_xfer_done(xfer, error);
5078 }
5079
5080 static void
nvme_bd_driveinfo(void * arg,bd_drive_t * drive)5081 nvme_bd_driveinfo(void *arg, bd_drive_t *drive)
5082 {
5083 nvme_namespace_t *ns = arg;
5084 nvme_t *nvme = ns->ns_nvme;
5085 uint_t ns_count = MAX(1, nvme->n_namespaces_attachable);
5086 boolean_t mutex_exit_needed = B_TRUE;
5087
5088 /*
5089 * nvme_bd_driveinfo is called by blkdev in two situations:
5090 * - during bd_attach_handle(), which we call with the mutex held
5091 * - during bd_attach(), which may be called with or without the
5092 * mutex held
5093 */
5094 if (mutex_owned(&nvme->n_mgmt_mutex))
5095 mutex_exit_needed = B_FALSE;
5096 else
5097 mutex_enter(&nvme->n_mgmt_mutex);
5098
5099 /*
5100 * Set the blkdev qcount to the number of submission queues.
5101 * It will then create one waitq/runq pair for each submission
5102 * queue and spread I/O requests across the queues.
5103 */
5104 drive->d_qcount = nvme->n_ioq_count;
5105
5106 /*
5107 * I/O activity to individual namespaces is distributed across
5108 * each of the d_qcount blkdev queues (which has been set to
5109 * the number of nvme submission queues). d_qsize is the number
5110 * of submitted and not completed I/Os within each queue that blkdev
5111 * will allow before it starts holding them in the waitq.
5112 *
5113 * Each namespace will create a child blkdev instance; for each one
5114 * we try to set the d_qsize so that each namespace gets an
5115 * equal portion of the submission queue.
5116 *
5117 * If, after instantiation of the nvme drive, n_namespaces_attachable
5118 * changes and a namespace is attached, a different d_qsize could be
5119 * calculated. It may even be that the sum of the d_qsizes is
5120 * now beyond the submission queue size. Should that be the case
5121 * and the I/O rate is such that blkdev attempts to submit more
5122 * I/Os than the size of the submission queue, the excess I/Os
5123 * will be held behind the semaphore nq_sema.
5124 */
5125 drive->d_qsize = nvme->n_io_squeue_len / ns_count;
5126
5127 /*
5128 * Don't let the queue size drop below the minimum, though.
5129 */
5130 drive->d_qsize = MAX(drive->d_qsize, NVME_MIN_IO_QUEUE_LEN);
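	/*
	* For example, with an I/O submission queue length of 1024 and
	* four attachable namespaces, each namespace's blkdev instance
	* would get a d_qsize of 256.
	*/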
5131
5132 /*
5133 * d_maxxfer is not set, which means the value is taken from the DMA
5134 * attributes specified to bd_alloc_handle.
5135 */
5136
5137 drive->d_removable = B_FALSE;
5138 drive->d_hotpluggable = B_FALSE;
5139
5140 bcopy(ns->ns_eui64, drive->d_eui64, sizeof (drive->d_eui64));
5141 drive->d_target = ns->ns_id;
5142 drive->d_lun = 0;
5143
5144 drive->d_model = nvme->n_idctl->id_model;
5145 drive->d_model_len = sizeof (nvme->n_idctl->id_model);
5146 drive->d_vendor = nvme->n_vendor;
5147 drive->d_vendor_len = strlen(nvme->n_vendor);
5148 drive->d_product = nvme->n_product;
5149 drive->d_product_len = strlen(nvme->n_product);
5150 drive->d_serial = nvme->n_idctl->id_serial;
5151 drive->d_serial_len = sizeof (nvme->n_idctl->id_serial);
5152 drive->d_revision = nvme->n_idctl->id_fwrev;
5153 drive->d_revision_len = sizeof (nvme->n_idctl->id_fwrev);
5154
5155 /*
5156 * If we support the dataset management command, the only restriction
5157 * on a discard request is the maximum number of ranges (segments)
5158 * in a single request.
5159 */
5160 if (nvme->n_idctl->id_oncs.on_dset_mgmt)
5161 drive->d_max_free_seg = NVME_DSET_MGMT_MAX_RANGES;
5162
5163 if (mutex_exit_needed)
5164 mutex_exit(&nvme->n_mgmt_mutex);
5165 }
5166
5167 static int
5168 nvme_bd_mediainfo(void *arg, bd_media_t *media)
5169 {
5170 nvme_namespace_t *ns = arg;
5171 nvme_t *nvme = ns->ns_nvme;
5172 boolean_t mutex_exit_needed = B_TRUE;
5173
5174 if (nvme->n_dead) {
5175 return (EIO);
5176 }
5177
5178 /*
5179 * nvme_bd_mediainfo is called by blkdev in various situations,
5180 * most of them out of our control. There's one exception though:
5181 * when we call bd_state_change() in response to a "namespace change"
5182 * notification, in which case we already hold the mutex.
5183 */
5184 if (mutex_owned(&nvme->n_mgmt_mutex))
5185 mutex_exit_needed = B_FALSE;
5186 else
5187 mutex_enter(&nvme->n_mgmt_mutex);
5188
5189 media->m_nblks = ns->ns_block_count;
5190 media->m_blksize = ns->ns_block_size;
5191 media->m_readonly = B_FALSE;
5192 media->m_solidstate = B_TRUE;
5193
5194 media->m_pblksize = ns->ns_best_block_size;
5195
5196 if (mutex_exit_needed)
5197 mutex_exit(&nvme->n_mgmt_mutex);
5198
5199 return (0);
5200 }
5201
5202 static int
5203 nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
5204 {
5205 nvme_t *nvme = ns->ns_nvme;
5206 nvme_cmd_t *cmd;
5207 nvme_qpair_t *ioq;
5208 boolean_t poll;
5209 int ret;
5210
5211 if (nvme->n_dead) {
5212 return (EIO);
5213 }
5214
5215 cmd = nvme_create_nvm_cmd(ns, opc, xfer);
5216 if (cmd == NULL)
5217 return (ENOMEM);
5218
5219 cmd->nc_sqid = xfer->x_qnum + 1;
5220 ASSERT(cmd->nc_sqid <= nvme->n_ioq_count);
5221 ioq = nvme->n_ioq[cmd->nc_sqid];
5222
5223 /*
5224 * Get the polling flag before submitting the command. The command may
5225 * complete immediately after it was submitted, which means we must
5226 * treat both cmd and xfer as if they have been freed already.
5227 */
5228 poll = (xfer->x_flags & BD_XFER_POLL) != 0;
5229
5230 ret = nvme_submit_io_cmd(ioq, cmd);
5231
5232 if (ret != 0)
5233 return (ret);
5234
5235 if (!poll)
5236 return (0);
5237
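	/*
	* Poll for completion: reap and complete commands from this queue
	* until no active commands remain, busy-waiting briefly whenever
	* nothing is ready yet.
	*/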
5238 do {
5239 cmd = nvme_retrieve_cmd(nvme, ioq);
5240 if (cmd != NULL)
5241 cmd->nc_callback(cmd);
5242 else
5243 drv_usecwait(10);
5244 } while (ioq->nq_active_cmds != 0);
5245
5246 return (0);
5247 }
5248
5249 static int
5250 nvme_bd_read(void *arg, bd_xfer_t *xfer)
5251 {
5252 nvme_namespace_t *ns = arg;
5253
5254 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ));
5255 }
5256
5257 static int
5258 nvme_bd_write(void *arg, bd_xfer_t *xfer)
5259 {
5260 nvme_namespace_t *ns = arg;
5261
5262 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_WRITE));
5263 }
5264
5265 static int
5266 nvme_bd_sync(void *arg, bd_xfer_t *xfer)
5267 {
5268 nvme_namespace_t *ns = arg;
5269
5270 if (ns->ns_nvme->n_dead)
5271 return (EIO);
5272
5273 /*
5274 * If the volatile write cache is absent FLUSH is unsupported; if it is
5275 * present but disabled FLUSH is a no-op. Either way take a shortcut.
5276 */
5277 if (!ns->ns_nvme->n_write_cache_present) {
5278 bd_xfer_done(xfer, ENOTSUP);
5279 return (0);
5280 }
5281
5282 if (!ns->ns_nvme->n_write_cache_enabled) {
5283 bd_xfer_done(xfer, 0);
5284 return (0);
5285 }
5286
5287 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH));
5288 }
5289
5290 static int
5291 nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid)
5292 {
5293 nvme_namespace_t *ns = arg;
5294 nvme_t *nvme = ns->ns_nvme;
5295
5296 if (nvme->n_dead) {
5297 return (EIO);
5298 }
5299
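	/*
	* Prefer the NGUID as the devid source when it is non-zero, then
	* the EUI64, and finally fall back to a devid derived from the
	* namespace ID.
	*/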
5300 if (*(uint64_t *)ns->ns_nguid != 0 ||
5301 *(uint64_t *)(ns->ns_nguid + 8) != 0) {
5302 return (ddi_devid_init(devinfo, DEVID_NVME_NGUID,
5303 sizeof (ns->ns_nguid), ns->ns_nguid, devid));
5304 } else if (*(uint64_t *)ns->ns_eui64 != 0) {
5305 return (ddi_devid_init(devinfo, DEVID_NVME_EUI64,
5306 sizeof (ns->ns_eui64), ns->ns_eui64, devid));
5307 } else {
5308 return (ddi_devid_init(devinfo, DEVID_NVME_NSID,
5309 strlen(ns->ns_devid), ns->ns_devid, devid));
5310 }
5311 }
5312
5313 static int
5314 nvme_bd_free_space(void *arg, bd_xfer_t *xfer)
5315 {
5316 nvme_namespace_t *ns = arg;
5317
5318 if (xfer->x_dfl == NULL)
5319 return (EINVAL);
5320
5321 if (!ns->ns_nvme->n_idctl->id_oncs.on_dset_mgmt)
5322 return (ENOTSUP);
5323
5324 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_DSET_MGMT));
5325 }
5326
5327 static int
5328 nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
5329 {
5330 #ifndef __lock_lint
5331 _NOTE(ARGUNUSED(cred_p));
5332 #endif
5333 nvme_t *nvme;
5334 nvme_minor_t *minor = NULL;
5335 uint32_t nsid;
5336 minor_t m = getminor(*devp);
5337 int rv = 0;
5338
5339 if (otyp != OTYP_CHR)
5340 return (EINVAL);
5341
5342 if (m >= NVME_OPEN_MINOR_MIN)
5343 return (ENXIO);
5344
5345 nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(m));
5346 nsid = NVME_MINOR_NSID(m);
5347
5348 if (nvme == NULL)
5349 return (ENXIO);
5350
5351 if (nsid > nvme->n_namespace_count)
5352 return (ENXIO);
5353
5354 if (nvme->n_dead)
5355 return (EIO);
5356
5357 /*
5358 * At this point, we're going to allow an open to proceed on this
5359 * device. We need to allocate a new instance for this (presuming one is
5360 * available).
5361 */
5362 minor = kmem_zalloc(sizeof (nvme_minor_t), KM_NOSLEEP_LAZY);
5363 if (minor == NULL) {
5364 return (ENOMEM);
5365 }
5366
5367 cv_init(&minor->nm_cv, NULL, CV_DRIVER, NULL);
5368 list_link_init(&minor->nm_ctrl_lock.nli_node);
5369 minor->nm_ctrl_lock.nli_nvme = nvme;
5370 minor->nm_ctrl_lock.nli_minor = minor;
5371 list_link_init(&minor->nm_ns_lock.nli_node);
5372 minor->nm_ns_lock.nli_nvme = nvme;
5373 minor->nm_ns_lock.nli_minor = minor;
5374 minor->nm_minor = id_alloc_nosleep(nvme_open_minors);
5375 if (minor->nm_minor == -1) {
5376 nvme_minor_free(minor);
5377 return (ENOSPC);
5378 }
5379
5380 minor->nm_ctrl = nvme;
5381 if (nsid != 0) {
5382 minor->nm_ns = nvme_nsid2ns(nvme, nsid);
5383 }
5384
5385 /*
5386 * Before we check for exclusive access and attempt a lock if requested,
5387 * ensure that this minor is persisted.
5388 */
5389 mutex_enter(&nvme_open_minors_mutex);
5390 avl_add(&nvme_open_minors_avl, minor);
5391 mutex_exit(&nvme_open_minors_mutex);
5392
5393 /*
5394 * A request to open this with FEXCL is translated into a non-blocking
5395 * write lock of the appropriate entity. This honors the original
5396 * semantics here. In the future, we should see if we can remove this
5397 * and turn a request for FEXCL at open into ENOTSUP.
5398 */
5399 mutex_enter(&nvme->n_minor_mutex);
5400 if ((flag & FEXCL) != 0) {
5401 nvme_ioctl_lock_t lock = {
5402 .nil_level = NVME_LOCK_L_WRITE,
5403 .nil_flags = NVME_LOCK_F_DONT_BLOCK
5404 };
5405
5406 if (minor->nm_ns != NULL) {
5407 lock.nil_ent = NVME_LOCK_E_NS;
5408 lock.nil_common.nioc_nsid = nsid;
5409 } else {
5410 lock.nil_ent = NVME_LOCK_E_CTRL;
5411 }
5412 nvme_rwlock(minor, &lock);
5413 if (lock.nil_common.nioc_drv_err != NVME_IOCTL_E_OK) {
5414 mutex_exit(&nvme->n_minor_mutex);
5415
5416 mutex_enter(&nvme_open_minors_mutex);
5417 avl_remove(&nvme_open_minors_avl, minor);
5418 mutex_exit(&nvme_open_minors_mutex);
5419
5420 nvme_minor_free(minor);
5421 return (EBUSY);
5422 }
5423 }
5424 mutex_exit(&nvme->n_minor_mutex);
5425
5426 *devp = makedevice(getmajor(*devp), (minor_t)minor->nm_minor);
5427 	return (rv);
5428 }
5430
5431 static int
5432 nvme_close(dev_t dev, int flag __unused, int otyp, cred_t *cred_p __unused)
5433 {
5434 nvme_minor_t *minor;
5435 nvme_t *nvme;
5436
5437 if (otyp != OTYP_CHR) {
5438 return (ENXIO);
5439 }
5440
5441 minor = nvme_minor_find_by_dev(dev);
5442 if (minor == NULL) {
5443 return (ENXIO);
5444 }
5445
5446 mutex_enter(&nvme_open_minors_mutex);
5447 avl_remove(&nvme_open_minors_avl, minor);
5448 mutex_exit(&nvme_open_minors_mutex);
5449
5450 /*
5451 * When this device is being closed, we must ensure that any locks held
5452 * by this minor are dealt with.
5453 */
5454 nvme = minor->nm_ctrl;
5455 mutex_enter(&nvme->n_minor_mutex);
5456 ASSERT3U(minor->nm_ctrl_lock.nli_state, !=, NVME_LOCK_STATE_BLOCKED);
5457 ASSERT3U(minor->nm_ns_lock.nli_state, !=, NVME_LOCK_STATE_BLOCKED);
5458
5459 if (minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_ACQUIRED) {
5460 VERIFY3P(minor->nm_ctrl_lock.nli_lock, !=, NULL);
5461 nvme_rwunlock(&minor->nm_ctrl_lock,
5462 minor->nm_ctrl_lock.nli_lock);
5463 }
5464
5465 if (minor->nm_ns_lock.nli_state == NVME_LOCK_STATE_ACQUIRED) {
5466 VERIFY3P(minor->nm_ns_lock.nli_lock, !=, NULL);
5467 nvme_rwunlock(&minor->nm_ns_lock, minor->nm_ns_lock.nli_lock);
5468 }
5469 mutex_exit(&nvme->n_minor_mutex);
5470
5471 nvme_minor_free(minor);
5472
5473 return (0);
5474 }
5475
5476 void
5477 nvme_ioctl_success(nvme_ioctl_common_t *ioc)
5478 {
5479 ioc->nioc_drv_err = NVME_IOCTL_E_OK;
5480 ioc->nioc_ctrl_sc = NVME_CQE_SC_GEN_SUCCESS;
5481 ioc->nioc_ctrl_sct = NVME_CQE_SCT_GENERIC;
5482 }
5483
5484 boolean_t
5485 nvme_ioctl_error(nvme_ioctl_common_t *ioc, nvme_ioctl_errno_t err, uint32_t sct,
5486 uint32_t sc)
5487 {
5488 ioc->nioc_drv_err = err;
5489 ioc->nioc_ctrl_sct = sct;
5490 ioc->nioc_ctrl_sc = sc;
5491
5492 return (B_FALSE);
5493 }
5494
5495 static int
5496 nvme_ioctl_copyout_error(nvme_ioctl_errno_t err, intptr_t uaddr, int mode)
5497 {
5498 nvme_ioctl_common_t ioc;
5499
5500 ASSERT3U(err, !=, NVME_IOCTL_E_CTRL_ERROR);
5501 bzero(&ioc, sizeof (ioc));
5502 if (ddi_copyout(&ioc, (void *)uaddr, sizeof (nvme_ioctl_common_t),
5503 mode & FKIOCTL) != 0) {
5504 return (EFAULT);
5505 }
5506 return (0);
5507 }
5508
5509
5510 /*
5511 * The companion to the namespace checking. This occurs after any rewriting
5512 * occurs. This is the primary point that we attempt to enforce any operation's
5513 * exclusivity. Note, it is theoretically possible for an operation to be
5514 * ongoing and to have someone with an exclusive lock ask to unlock it for some
5515 * reason. We make no attempt to track how many such events are going on.
5516 * While perhaps this is leaving too much up to the user, by the same token we
5517 * don't try to stop them from issuing two different format NVM commands
5518 * targeting the whole device at the same time either, even though the
5519 * controller would really rather that didn't happen.
5520 */
5521 static boolean_t
5522 nvme_ioctl_excl_check(nvme_minor_t *minor, nvme_ioctl_common_t *ioc,
5523 const nvme_ioctl_check_t *check)
5524 {
5525 nvme_t *const nvme = minor->nm_ctrl;
5526 nvme_namespace_t *ns;
5527 boolean_t have_ctrl, have_ns, ctrl_is_excl, ns_is_excl;
5528
5529 /*
5530 * If the command doesn't require anything, then we're done.
5531 */
5532 if (check->nck_excl == NVME_IOCTL_EXCL_SKIP) {
5533 return (B_TRUE);
5534 }
5535
5536 if (ioc->nioc_nsid == 0 || ioc->nioc_nsid == NVME_NSID_BCAST) {
5537 ns = NULL;
5538 } else {
5539 ns = nvme_nsid2ns(nvme, ioc->nioc_nsid);
5540 }
5541
5542 mutex_enter(&nvme->n_minor_mutex);
5543 ctrl_is_excl = nvme->n_lock.nl_writer != NULL;
5544 have_ctrl = nvme->n_lock.nl_writer == &minor->nm_ctrl_lock;
5545 if (ns != NULL) {
5546 /*
5547 * We explicitly test the namespace lock's writer versus asking
5548 * the minor because the minor's namespace lock may apply to a
5549 * different namespace.
5550 */
5551 ns_is_excl = ns->ns_lock.nl_writer != NULL;
5552 have_ns = ns->ns_lock.nl_writer == &minor->nm_ns_lock;
5553 ASSERT0(have_ctrl && have_ns);
5554 #ifdef DEBUG
5555 if (have_ns) {
5556 ASSERT3P(minor->nm_ns_lock.nli_ns, ==, ns);
5557 }
5558 #endif
5559 } else {
5560 ns_is_excl = B_FALSE;
5561 have_ns = B_FALSE;
5562 }
5563 ASSERT0(ctrl_is_excl && ns_is_excl);
5564 mutex_exit(&nvme->n_minor_mutex);
5565
5566 if (check->nck_excl == NVME_IOCTL_EXCL_WRITE) {
5567 if (ns == NULL) {
5568 if (have_ctrl) {
5569 return (B_TRUE);
5570 }
5571 return (nvme_ioctl_error(ioc,
5572 NVME_IOCTL_E_NEED_CTRL_WRLOCK, 0, 0));
5573 } else {
5574 if (have_ctrl || have_ns) {
5575 return (B_TRUE);
5576 }
5577 return (nvme_ioctl_error(ioc,
5578 NVME_IOCTL_E_NEED_NS_WRLOCK, 0, 0));
5579 }
5580 }
5581
5582 /*
5583 * Now we have an operation that does not require exclusive access. We
5584 * can proceed as long as no one else has exclusive access or, if someone
5585 * does, it is us. Regardless of the target, a controller lock stops us.
5586 */
5587 if (ctrl_is_excl && !have_ctrl) {
5588 return (nvme_ioctl_error(ioc, NVME_IOCTL_E_CTRL_LOCKED, 0, 0));
5589 }
5590
5591 /*
5592 * Only check namespace exclusivity if we are targeting one.
5593 */
5594 if (ns != NULL && ns_is_excl && !have_ns) {
5595 return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NS_LOCKED, 0, 0));
5596 }
5597
5598 return (B_TRUE);
5599 }
5600
5601 /*
5602 * Perform common checking as to whether or not an ioctl operation may proceed.
5603 * We check in this function various aspects of the namespace attributes that
5604 * it's calling on. Once the namespace attributes and any possible rewriting
5605 * have been performed, then we proceed to check whether or not the requisite
5606 * exclusive access is present in nvme_ioctl_excl_check().
5607 */
5608 static boolean_t
5609 nvme_ioctl_check(nvme_minor_t *minor, nvme_ioctl_common_t *ioc,
5610 const nvme_ioctl_check_t *check)
5611 {
5612 /*
5613 * If the minor has a namespace pointer, then it is constrained to that
5614 * namespace. If a namespace is allowed, then there are only two valid
5615 * values that we can find. The first is the nsid matching the minor.
5616 * The second is the value zero, which will be transformed into the
5617 * current namespace.
5618 */
5619 if (minor->nm_ns != NULL) {
5620 if (!check->nck_ns_ok || !check->nck_ns_minor_ok) {
5621 return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NOT_CTRL, 0,
5622 0));
5623 }
5624
5625 if (ioc->nioc_nsid == 0) {
5626 ioc->nioc_nsid = minor->nm_ns->ns_id;
5627 } else if (ioc->nioc_nsid != minor->nm_ns->ns_id) {
5628 return (nvme_ioctl_error(ioc,
5629 NVME_IOCTL_E_MINOR_WRONG_NS, 0, 0));
5630 }
5631
5632 return (nvme_ioctl_excl_check(minor, ioc, check));
5633 }
5634
5635 /*
5636 * If we've been told to skip checking the controller, here's where we
5637 * do that. This should really only be for commands which use the
5638 * namespace ID for listing purposes and therefore can have
5639 * traditionally illegal values here.
5640 */
5641 if (check->nck_skip_ctrl) {
5642 return (nvme_ioctl_excl_check(minor, ioc, check));
5643 }
5644
5645 /*
5646 * At this point, we know that we're on the controller's node. We first
5647 * deal with the simple case, is a namespace allowed at all or not. If
5648 * it is not allowed, then the only acceptable value is zero.
5649 */
5650 if (!check->nck_ns_ok) {
5651 if (ioc->nioc_nsid != 0) {
5652 return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NS_UNUSE, 0,
5653 0));
5654 }
5655
5656 return (nvme_ioctl_excl_check(minor, ioc, check));
5657 }
5658
5659 /*
5660 * At this point, we know that a controller is allowed to use a
5661 * namespace. If we haven't been given zero or the broadcast namespace,
5662 * check to see if it's actually a valid namespace ID. If it is outside
5663 * the valid range, then it is an error. Next, if we have been requested
5664 * to rewrite 0 (the "this controller" indicator) as the broadcast
5665 * namespace, do so.
5666 *
5667 * While we validate that this namespace is within the valid range, we
5668 * do not check if it is active or inactive. That is left to our callers
5669 * to determine.
5670 */
5671 if (ioc->nioc_nsid > minor->nm_ctrl->n_namespace_count &&
5672 ioc->nioc_nsid != NVME_NSID_BCAST) {
5673 return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NS_RANGE, 0, 0));
5674 }
5675
5676 if (ioc->nioc_nsid == 0 && check->nck_ctrl_rewrite) {
5677 ioc->nioc_nsid = NVME_NSID_BCAST;
5678 }
5679
5680 /*
5681 * Finally, see if we have ended up with a broadcast namespace ID
5682 * whether through specification or rewriting. If that is not allowed,
5683 * then that is an error.
5684 */
5685 if (!check->nck_bcast_ok && ioc->nioc_nsid == NVME_NSID_BCAST) {
5686 return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NO_BCAST_NS, 0, 0));
5687 }
5688
5689 return (nvme_ioctl_excl_check(minor, ioc, check));
5690 }
5691
5692 static int
5693 nvme_ioctl_ctrl_info(nvme_minor_t *minor, intptr_t arg, int mode,
5694 cred_t *cred_p)
5695 {
5696 nvme_t *const nvme = minor->nm_ctrl;
5697 nvme_ioctl_ctrl_info_t *info;
5698 nvme_reg_cap_t cap = { 0 };
5699 nvme_ioctl_identify_t id = { .nid_cns = NVME_IDENTIFY_CTRL };
5700 void *idbuf;
5701
5702 if ((mode & FREAD) == 0)
5703 return (EBADF);
5704
5705 info = kmem_alloc(sizeof (nvme_ioctl_ctrl_info_t), KM_NOSLEEP_LAZY);
5706 if (info == NULL) {
5707 return (nvme_ioctl_copyout_error(NVME_IOCTL_E_NO_KERN_MEM, arg,
5708 mode));
5709 }
5710
5711 if (ddi_copyin((void *)arg, info, sizeof (nvme_ioctl_ctrl_info_t),
5712 mode & FKIOCTL) != 0) {
5713 kmem_free(info, sizeof (nvme_ioctl_ctrl_info_t));
5714 return (EFAULT);
5715 }
5716
5717 if (!nvme_ioctl_check(minor, &info->nci_common,
5718 &nvme_check_ctrl_info)) {
5719 goto copyout;
5720 }
5721
5722 /*
5723 * We explicitly do not use the identify controller copy in the kernel
5724 * right now so that way we can get a snapshot of the controller's
5725 * current capacity and values. While it's tempting to try to use this
5726 * to refresh the kernel's version, we don't, simply to keep the rest of
5727 * the driver simpler right now.
5728 */
5729 if (!nvme_identify(nvme, B_TRUE, &id, &idbuf)) {
5730 info->nci_common = id.nid_common;
5731 goto copyout;
5732 }
5733 bcopy(idbuf, &info->nci_ctrl_id, sizeof (nvme_identify_ctrl_t));
5734 kmem_free(idbuf, NVME_IDENTIFY_BUFSIZE);
5735
5736 /*
5737 * Use the kernel's cached common namespace information for this.
5738 */
5739 bcopy(nvme->n_idcomns, &info->nci_common_ns,
5740 sizeof (nvme_identify_nsid_t));
5741
5742 info->nci_vers = nvme->n_version;
5743
5744 /*
5745 * The MPSMIN and MPSMAX fields in the CAP register use 0 to
5746 * specify the base page size of 4k (1<<12), so add 12 here to
5747 * get the real page size value.
5748 */
5749 cap.r = nvme_get64(nvme, NVME_REG_CAP);
5750 info->nci_caps.cap_mpsmax = 1 << (12 + cap.b.cap_mpsmax);
5751 info->nci_caps.cap_mpsmin = 1 << (12 + cap.b.cap_mpsmin);
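	/*
	* For example, a cap_mpsmax of 4 yields a maximum page size of
	* 1 << (12 + 4), i.e. 64k, while a value of 0 yields the 4k base.
	*/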
5752
5753 info->nci_nintrs = (uint32_t)nvme->n_intr_cnt;
5754
5755 copyout:
5756 if (ddi_copyout(info, (void *)arg, sizeof (nvme_ioctl_ctrl_info_t),
5757 mode & FKIOCTL) != 0) {
5758 kmem_free(info, sizeof (nvme_ioctl_ctrl_info_t));
5759 return (EFAULT);
5760 }
5761
5762 kmem_free(info, sizeof (nvme_ioctl_ctrl_info_t));
5763 return (0);
5764 }
5765
5766 static int
5767 nvme_ioctl_ns_info(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
5768 {
5769 nvme_t *const nvme = minor->nm_ctrl;
5770 nvme_ioctl_ns_info_t *ns_info;
5771 nvme_namespace_t *ns;
5772 nvme_ioctl_identify_t id = { .nid_cns = NVME_IDENTIFY_NSID };
5773 void *idbuf;
5774
5775 if ((mode & FREAD) == 0)
5776 return (EBADF);
5777
5778 ns_info = kmem_zalloc(sizeof (nvme_ioctl_ns_info_t), KM_NOSLEEP_LAZY);
5779 if (ns_info == NULL) {
5780 return (nvme_ioctl_copyout_error(NVME_IOCTL_E_NO_KERN_MEM, arg,
5781 mode));
5782 }
5783
5784 if (ddi_copyin((void *)arg, ns_info, sizeof (nvme_ioctl_ns_info_t),
5785 mode & FKIOCTL) != 0) {
5786 kmem_free(ns_info, sizeof (nvme_ioctl_ns_info_t));
5787 return (EFAULT);
5788 }
5789
5790 if (!nvme_ioctl_check(minor, &ns_info->nni_common,
5791 &nvme_check_ns_info)) {
5792 goto copyout;
5793 }
5794
5795 ASSERT3U(ns_info->nni_common.nioc_nsid, >, 0);
5796 ns = nvme_nsid2ns(nvme, ns_info->nni_common.nioc_nsid);
5797
5798 /*
5799 * First fetch a fresh copy of the namespace information. Most callers
5800 * are using this because they will want a mostly accurate snapshot of
5801 * capacity and utilization.
5802 */
5803 id.nid_common.nioc_nsid = ns_info->nni_common.nioc_nsid;
5804 if (!nvme_identify(nvme, B_TRUE, &id, &idbuf)) {
5805 ns_info->nni_common = id.nid_common;
5806 goto copyout;
5807 }
5808 bcopy(idbuf, &ns_info->nni_id, sizeof (nvme_identify_nsid_t));
5809 kmem_free(idbuf, NVME_IDENTIFY_BUFSIZE);
5810
5811 mutex_enter(&nvme->n_mgmt_mutex);
5812 if (ns->ns_allocated)
5813 ns_info->nni_state |= NVME_NS_STATE_ALLOCATED;
5814
5815 if (ns->ns_active)
5816 ns_info->nni_state |= NVME_NS_STATE_ACTIVE;
5817
5818 if (ns->ns_ignore)
5819 ns_info->nni_state |= NVME_NS_STATE_IGNORED;
5820
5821 if (ns->ns_attached) {
5822 const char *addr;
5823
5824 ns_info->nni_state |= NVME_NS_STATE_ATTACHED;
5825 addr = bd_address(ns->ns_bd_hdl);
5826 if (strlcpy(ns_info->nni_addr, addr,
5827 sizeof (ns_info->nni_addr)) >= sizeof (ns_info->nni_addr)) {
5828 mutex_exit(&nvme->n_mgmt_mutex);
5829 (void) nvme_ioctl_error(&ns_info->nni_common,
5830 NVME_IOCTL_E_BD_ADDR_OVER, 0, 0);
5831 goto copyout;
5832 }
5833 }
5834 mutex_exit(&nvme->n_mgmt_mutex);
5835
5836 copyout:
5837 if (ddi_copyout(ns_info, (void *)arg, sizeof (nvme_ioctl_ns_info_t),
5838 mode & FKIOCTL) != 0) {
5839 kmem_free(ns_info, sizeof (nvme_ioctl_ns_info_t));
5840 return (EFAULT);
5841 }
5842
5843 kmem_free(ns_info, sizeof (nvme_ioctl_ns_info_t));
5844 return (0);
5845 }
5846
5847 static int
5848 nvme_ioctl_identify(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
5849 {
5850 _NOTE(ARGUNUSED(cred_p));
5851 nvme_t *const nvme = minor->nm_ctrl;
5852 void *idctl;
5853 uint_t model;
5854 nvme_ioctl_identify_t id;
5855 #ifdef _MULTI_DATAMODEL
5856 nvme_ioctl_identify32_t id32;
5857 #endif
5858 boolean_t ns_minor;
5859
5860 if ((mode & FREAD) == 0)
5861 return (EBADF);
5862
5863 model = ddi_model_convert_from(mode);
5864 switch (model) {
5865 #ifdef _MULTI_DATAMODEL
5866 case DDI_MODEL_ILP32:
5867 bzero(&id, sizeof (id));
5868 if (ddi_copyin((void *)arg, &id32, sizeof (id32),
5869 mode & FKIOCTL) != 0) {
5870 return (EFAULT);
5871 }
5872 id.nid_common.nioc_nsid = id32.nid_common.nioc_nsid;
5873 id.nid_cns = id32.nid_cns;
5874 id.nid_ctrlid = id32.nid_ctrlid;
5875 id.nid_data = id32.nid_data;
5876 break;
5877 #endif /* _MULTI_DATAMODEL */
5878 case DDI_MODEL_NONE:
5879 if (ddi_copyin((void *)arg, &id, sizeof (id),
5880 mode & FKIOCTL) != 0) {
5881 return (EFAULT);
5882 }
5883 break;
5884 default:
5885 return (ENOTSUP);
5886 }
5887
5888 if (!nvme_ioctl_check(minor, &id.nid_common, &nvme_check_identify)) {
5889 goto copyout;
5890 }
5891
5892 ns_minor = minor->nm_ns != NULL;
5893 if (!nvme_validate_identify(nvme, &id, ns_minor)) {
5894 goto copyout;
5895 }
5896
5897 if (nvme_identify(nvme, B_TRUE, &id, &idctl)) {
5898 int ret = ddi_copyout(idctl, (void *)id.nid_data,
5899 NVME_IDENTIFY_BUFSIZE, mode & FKIOCTL);
5900 kmem_free(idctl, NVME_IDENTIFY_BUFSIZE);
5901 if (ret != 0) {
5902 (void) nvme_ioctl_error(&id.nid_common,
5903 NVME_IOCTL_E_BAD_USER_DATA, 0, 0);
5904 goto copyout;
5905 }
5906
5907 nvme_ioctl_success(&id.nid_common);
5908 }
5909
5910 copyout:
5911 switch (model) {
5912 #ifdef _MULTI_DATAMODEL
5913 case DDI_MODEL_ILP32:
5914 id32.nid_common = id.nid_common;
5915
5916 if (ddi_copyout(&id32, (void *)arg, sizeof (id32),
5917 mode & FKIOCTL) != 0) {
5918 return (EFAULT);
5919 }
5920 break;
5921 #endif /* _MULTI_DATAMODEL */
5922 case DDI_MODEL_NONE:
5923 if (ddi_copyout(&id, (void *)arg, sizeof (id),
5924 mode & FKIOCTL) != 0) {
5925 return (EFAULT);
5926 }
5927 break;
5928 default:
5929 return (ENOTSUP);
5930 }
5931
5932 return (0);
5933 }
5934
5935 /*
5936 * Execute commands on behalf of the various ioctls.
5937 *
5938 * If this returns true then the command completed successfully. Otherwise error
5939 * information is returned in the nvme_ioctl_common_t arguments.
5940 */
5941 typedef struct {
5942 nvme_sqe_t *ica_sqe;
5943 void *ica_data;
5944 uint32_t ica_data_len;
5945 uint_t ica_dma_flags;
5946 int ica_copy_flags;
5947 uint32_t ica_timeout;
5948 uint32_t ica_cdw0;
5949 } nvme_ioc_cmd_args_t;
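/*
* A minimal usage sketch: a caller builds an SQE, fills in the transfer
* parameters, and hands both to nvme_ioc_cmd(), e.g.
*
*	nvme_sqe_t sqe = { .sqe_opc = NVME_OPC_GET_FEATURES };
*	nvme_ioc_cmd_args_t args = { NULL };
*
*	args.ica_sqe = &sqe;
*	args.ica_timeout = nvme_admin_cmd_timeout;
*	(void) nvme_ioc_cmd(nvme, &ioc, &args);
*
* On success, completion queue entry dword 0 is returned in ica_cdw0.
*/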
5950
5951 static boolean_t
5952 nvme_ioc_cmd(nvme_t *nvme, nvme_ioctl_common_t *ioc, nvme_ioc_cmd_args_t *args)
5953 {
5954 nvme_cmd_t *cmd;
5955 boolean_t ret = B_FALSE;
5956
5957 cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
5958 cmd->nc_sqid = 0;
5959
5960 /*
5961 * This function is used to facilitate requests from
5962 * userspace, so don't panic if the command fails. This
5963 * is especially true for admin passthru commands, where
5964 * the actual command data structure is entirely defined
5965 * by userspace.
5966 */
5967 cmd->nc_dontpanic = B_TRUE;
5968
5969 cmd->nc_callback = nvme_wakeup_cmd;
5970 cmd->nc_sqe = *args->ica_sqe;
5971
5972 if ((args->ica_dma_flags & DDI_DMA_RDWR) != 0) {
5973 if (args->ica_data == NULL) {
5974 ret = nvme_ioctl_error(ioc, NVME_IOCTL_E_NO_DMA_MEM,
5975 0, 0);
5976 goto free_cmd;
5977 }
5978
5979 if (nvme_zalloc_dma(nvme, args->ica_data_len,
5980 args->ica_dma_flags, &nvme->n_prp_dma_attr, &cmd->nc_dma) !=
5981 DDI_SUCCESS) {
5982 dev_err(nvme->n_dip, CE_WARN,
5983 "!nvme_zalloc_dma failed for nvme_ioc_cmd()");
5984 ret = nvme_ioctl_error(ioc,
5985 NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
5986 goto free_cmd;
5987 }
5988
5989 if (nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah) != 0) {
5990 ret = nvme_ioctl_error(ioc,
5991 NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
5992 goto free_cmd;
5993 }
5994
5995 if ((args->ica_dma_flags & DDI_DMA_WRITE) != 0 &&
5996 ddi_copyin(args->ica_data, cmd->nc_dma->nd_memp,
5997 args->ica_data_len, args->ica_copy_flags) != 0) {
5998 ret = nvme_ioctl_error(ioc, NVME_IOCTL_E_BAD_USER_DATA,
5999 0, 0);
6000 goto free_cmd;
6001 }
6002 }
6003
6004 nvme_admin_cmd(cmd, args->ica_timeout);
6005
6006 if (!nvme_check_cmd_status_ioctl(cmd, ioc)) {
6007 ret = B_FALSE;
6008 goto free_cmd;
6009 }
6010
6011 args->ica_cdw0 = cmd->nc_cqe.cqe_dw0;
6012
6013 if ((args->ica_dma_flags & DDI_DMA_READ) != 0 &&
6014 ddi_copyout(cmd->nc_dma->nd_memp, args->ica_data,
6015 args->ica_data_len, args->ica_copy_flags) != 0) {
6016 ret = nvme_ioctl_error(ioc, NVME_IOCTL_E_BAD_USER_DATA, 0, 0);
6017 goto free_cmd;
6018 }
6019
6020 ret = B_TRUE;
6021 nvme_ioctl_success(ioc);
6022
6023 free_cmd:
6024 nvme_free_cmd(cmd);
6025
6026 return (ret);
6027 }
6028
6029 static int
6030 nvme_ioctl_get_logpage(nvme_minor_t *minor, intptr_t arg, int mode,
6031 cred_t *cred_p)
6032 {
6033 nvme_t *const nvme = minor->nm_ctrl;
6034 void *buf;
6035 nvme_ioctl_get_logpage_t log;
6036 uint_t model;
6037 #ifdef _MULTI_DATAMODEL
6038 nvme_ioctl_get_logpage32_t log32;
6039 #endif
6040
6041 if ((mode & FREAD) == 0) {
6042 return (EBADF);
6043 }
6044
6045 model = ddi_model_convert_from(mode);
6046 switch (model) {
6047 #ifdef _MULTI_DATAMODEL
6048 case DDI_MODEL_ILP32:
6049 bzero(&log, sizeof (log));
6050 if (ddi_copyin((void *)arg, &log32, sizeof (log32),
6051 mode & FKIOCTL) != 0) {
6052 return (EFAULT);
6053 }
6054
6055 log.nigl_common.nioc_nsid = log32.nigl_common.nioc_nsid;
6056 log.nigl_csi = log32.nigl_csi;
6057 log.nigl_lid = log32.nigl_lid;
6058 log.nigl_lsp = log32.nigl_lsp;
6059 log.nigl_len = log32.nigl_len;
6060 log.nigl_offset = log32.nigl_offset;
6061 log.nigl_data = log32.nigl_data;
6062 break;
6063 #endif /* _MULTI_DATAMODEL */
6064 case DDI_MODEL_NONE:
6065 if (ddi_copyin((void *)arg, &log, sizeof (log),
6066 mode & FKIOCTL) != 0) {
6067 return (EFAULT);
6068 }
6069 break;
6070 default:
6071 return (ENOTSUP);
6072 }
6073
6074 /*
6075 * Eventually we'd like to take a soft lock to keep the namespaces
6076 * from changing out from under us during this operation. But we
6077 * haven't implemented that yet.
6078 */
6079 if (!nvme_ioctl_check(minor, &log.nigl_common,
6080 &nvme_check_get_logpage)) {
6081 goto copyout;
6082 }
6083
6084 if (!nvme_validate_logpage(nvme, &log)) {
6085 goto copyout;
6086 }
6087
6088 if (nvme_get_logpage(nvme, B_TRUE, &log, &buf)) {
6089 int copy;
6090
6091 copy = ddi_copyout(buf, (void *)log.nigl_data, log.nigl_len,
6092 mode & FKIOCTL);
6093 kmem_free(buf, log.nigl_len);
6094 if (copy != 0) {
6095 (void) nvme_ioctl_error(&log.nigl_common,
6096 NVME_IOCTL_E_BAD_USER_DATA, 0, 0);
6097 goto copyout;
6098 }
6099
6100 nvme_ioctl_success(&log.nigl_common);
6101 }
6102
6103 copyout:
6104 switch (model) {
6105 #ifdef _MULTI_DATAMODEL
6106 case DDI_MODEL_ILP32:
6107 bzero(&log32, sizeof (log32));
6108
6109 log32.nigl_common = log.nigl_common;
6110 log32.nigl_csi = log.nigl_csi;
6111 log32.nigl_lid = log.nigl_lid;
6112 log32.nigl_lsp = log.nigl_lsp;
6113 log32.nigl_len = log.nigl_len;
6114 log32.nigl_offset = log.nigl_offset;
6115 log32.nigl_data = log.nigl_data;
6116 if (ddi_copyout(&log32, (void *)arg, sizeof (log32),
6117 mode & FKIOCTL) != 0) {
6118 return (EFAULT);
6119 }
6120 break;
6121 #endif /* _MULTI_DATAMODEL */
6122 case DDI_MODEL_NONE:
6123 if (ddi_copyout(&log, (void *)arg, sizeof (log),
6124 mode & FKIOCTL) != 0) {
6125 return (EFAULT);
6126 }
6127 break;
6128 default:
6129 return (ENOTSUP);
6130 }
6131
6132 return (0);
6133 }
6134
6135 static int
6136 nvme_ioctl_get_feature(nvme_minor_t *minor, intptr_t arg, int mode,
6137 cred_t *cred_p)
6138 {
6139 nvme_t *const nvme = minor->nm_ctrl;
6140 nvme_ioctl_get_feature_t feat;
6141 uint_t model;
6142 #ifdef _MULTI_DATAMODEL
6143 nvme_ioctl_get_feature32_t feat32;
6144 #endif
6145 nvme_get_features_dw10_t gf_dw10 = { 0 };
6146 nvme_ioc_cmd_args_t args = { NULL };
6147 nvme_sqe_t sqe = {
6148 .sqe_opc = NVME_OPC_GET_FEATURES
6149 };
6150
6151 if ((mode & FREAD) == 0) {
6152 return (EBADF);
6153 }
6154
6155 model = ddi_model_convert_from(mode);
6156 switch (model) {
6157 #ifdef _MULTI_DATAMODEL
6158 case DDI_MODEL_ILP32:
6159 bzero(&feat, sizeof (feat));
6160 if (ddi_copyin((void *)arg, &feat32, sizeof (feat32),
6161 mode & FKIOCTL) != 0) {
6162 return (EFAULT);
6163 }
6164
6165 feat.nigf_common.nioc_nsid = feat32.nigf_common.nioc_nsid;
6166 feat.nigf_fid = feat32.nigf_fid;
6167 feat.nigf_sel = feat32.nigf_sel;
6168 feat.nigf_cdw11 = feat32.nigf_cdw11;
6169 feat.nigf_data = feat32.nigf_data;
6170 feat.nigf_len = feat32.nigf_len;
6171 break;
6172 #endif /* _MULTI_DATAMODEL */
6173 case DDI_MODEL_NONE:
6174 if (ddi_copyin((void *)arg, &feat, sizeof (feat),
6175 mode & FKIOCTL) != 0) {
6176 return (EFAULT);
6177 }
6178 break;
6179 default:
6180 return (ENOTSUP);
6181 }
6182
6183 if (!nvme_ioctl_check(minor, &feat.nigf_common,
6184 &nvme_check_get_feature)) {
6185 goto copyout;
6186 }
6187
6188 if (!nvme_validate_get_feature(nvme, &feat)) {
6189 goto copyout;
6190 }
6191
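	/*
	* Pack Get Features dword 10: bits 7:0 carry the feature
	* identifier (FID) and the low bits of nigf_sel select which
	* value (e.g. current, default, or saved) to report.
	*/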
6192 gf_dw10.b.gt_fid = bitx32(feat.nigf_fid, 7, 0);
6193 gf_dw10.b.gt_sel = bitx32(feat.nigf_sel, 2, 0);
6194 sqe.sqe_cdw10 = gf_dw10.r;
6195 sqe.sqe_cdw11 = feat.nigf_cdw11;
6196 sqe.sqe_nsid = feat.nigf_common.nioc_nsid;
6197
6198 args.ica_sqe = &sqe;
6199 if (feat.nigf_len != 0) {
6200 args.ica_data = (void *)feat.nigf_data;
6201 args.ica_data_len = feat.nigf_len;
6202 args.ica_dma_flags = DDI_DMA_READ;
6203 }
6204 args.ica_copy_flags = mode;
6205 args.ica_timeout = nvme_admin_cmd_timeout;
6206
6207 if (!nvme_ioc_cmd(nvme, &feat.nigf_common, &args)) {
6208 goto copyout;
6209 }
6210
6211 feat.nigf_cdw0 = args.ica_cdw0;
6212
6213 copyout:
6214 switch (model) {
6215 #ifdef _MULTI_DATAMODEL
6216 case DDI_MODEL_ILP32:
6217 bzero(&feat32, sizeof (feat32));
6218
6219 feat32.nigf_common = feat.nigf_common;
6220 feat32.nigf_fid = feat.nigf_fid;
6221 feat32.nigf_sel = feat.nigf_sel;
6222 feat32.nigf_cdw11 = feat.nigf_cdw11;
6223 feat32.nigf_data = feat.nigf_data;
6224 feat32.nigf_len = feat.nigf_len;
6225 feat32.nigf_cdw0 = feat.nigf_cdw0;
6226 if (ddi_copyout(&feat32, (void *)arg, sizeof (feat32),
6227 mode & FKIOCTL) != 0) {
6228 return (EFAULT);
6229 }
6230 break;
6231 #endif /* _MULTI_DATAMODEL */
6232 case DDI_MODEL_NONE:
6233 if (ddi_copyout(&feat, (void *)arg, sizeof (feat),
6234 mode & FKIOCTL) != 0) {
6235 return (EFAULT);
6236 }
6237 break;
6238 default:
6239 return (ENOTSUP);
6240 }
6241
6242 return (0);
6243 }
6244
6245 static int
6246 nvme_ioctl_format(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
6247 {
6248 nvme_t *const nvme = minor->nm_ctrl;
6249 nvme_ioctl_format_t ioc;
6250
6251 if ((mode & FWRITE) == 0)
6252 return (EBADF);
6253
6254 if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6255 return (EPERM);
6256
6257 if (ddi_copyin((void *)(uintptr_t)arg, &ioc,
6258 sizeof (nvme_ioctl_format_t), mode & FKIOCTL) != 0)
6259 return (EFAULT);
6260
6261 if (!nvme_ioctl_check(minor, &ioc.nif_common, &nvme_check_format)) {
6262 goto copyout;
6263 }
6264
6265 if (!nvme_validate_format(nvme, &ioc)) {
6266 goto copyout;
6267 }
6268
6269 mutex_enter(&nvme->n_mgmt_mutex);
6270 if (!nvme_no_blkdev_attached(nvme, ioc.nif_common.nioc_nsid)) {
6271 mutex_exit(&nvme->n_mgmt_mutex);
6272 (void) nvme_ioctl_error(&ioc.nif_common,
6273 NVME_IOCTL_E_NS_BLKDEV_ATTACH, 0, 0);
6274 goto copyout;
6275 }
6276
6277 if (nvme_format_nvm(nvme, &ioc)) {
6278 nvme_ioctl_success(&ioc.nif_common);
6279 nvme_rescan_ns(nvme, ioc.nif_common.nioc_nsid);
6280 }
6281 mutex_exit(&nvme->n_mgmt_mutex);
6282
6283 copyout:
6284 if (ddi_copyout(&ioc, (void *)(uintptr_t)arg, sizeof (ioc),
6285 mode & FKIOCTL) != 0) {
6286 return (EFAULT);
6287 }
6288
6289 return (0);
6290 }
6291
6292 static int
6293 nvme_ioctl_detach(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
6294 {
6295 nvme_t *const nvme = minor->nm_ctrl;
6296 nvme_ioctl_common_t com;
6297
6298 if ((mode & FWRITE) == 0)
6299 return (EBADF);
6300
6301 if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6302 return (EPERM);
6303
6304 if (ddi_copyin((void *)(uintptr_t)arg, &com, sizeof (com),
6305 mode & FKIOCTL) != 0) {
6306 return (EFAULT);
6307 }
6308
6309 if (!nvme_ioctl_check(minor, &com, &nvme_check_attach_detach)) {
6310 goto copyout;
6311 }
6312
6313 mutex_enter(&nvme->n_mgmt_mutex);
6314 if (nvme_detach_ns(nvme, &com)) {
6315 nvme_ioctl_success(&com);
6316 }
6317 mutex_exit(&nvme->n_mgmt_mutex);
6318
6319 copyout:
6320 if (ddi_copyout(&com, (void *)(uintptr_t)arg, sizeof (com),
6321 mode & FKIOCTL) != 0) {
6322 return (EFAULT);
6323 }
6324
6325 return (0);
6326 }
6327
6328 static int
6329 nvme_ioctl_attach(nvme_minor_t *minor, intptr_t arg, int mode,
6330 cred_t *cred_p)
6331 {
6332 nvme_t *const nvme = minor->nm_ctrl;
6333 nvme_ioctl_common_t com;
6334 nvme_namespace_t *ns;
6335
6336 if ((mode & FWRITE) == 0)
6337 return (EBADF);
6338
6339 if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6340 return (EPERM);
6341
6342 if (ddi_copyin((void *)(uintptr_t)arg, &com, sizeof (com),
6343 mode & FKIOCTL) != 0) {
6344 return (EFAULT);
6345 }
6346
6347 if (!nvme_ioctl_check(minor, &com, &nvme_check_attach_detach)) {
6348 goto copyout;
6349 }
6350
6351 mutex_enter(&nvme->n_mgmt_mutex);
6352 ns = nvme_nsid2ns(nvme, com.nioc_nsid);
6353
6354 /*
6355 * Strictly speaking we shouldn't need to call nvme_init_ns() here as
6356 * we should be properly refreshing the internal state when we are
6357 * issuing commands that change things. However, we opt to still do so
6358 * as a bit of a safety check lest we give the kernel something bad or a
6359 * vendor unique command somehow did something behind our backs.
6360 */
6361 if (!ns->ns_attached) {
6362 (void) nvme_rescan_ns(nvme, com.nioc_nsid);
6363 if (nvme_attach_ns(nvme, &com)) {
6364 nvme_ioctl_success(&com);
6365 }
6366 } else {
6367 nvme_ioctl_success(&com);
6368 }
6369 mutex_exit(&nvme->n_mgmt_mutex);
6370
6371 copyout:
6372 if (ddi_copyout(&com, (void *)(uintptr_t)arg, sizeof (com),
6373 mode & FKIOCTL) != 0) {
6374 return (EFAULT);
6375 }
6376
6377 return (0);
6378 }
6379
6380 static void
6381 nvme_ufm_update(nvme_t *nvme)
6382 {
6383 mutex_enter(&nvme->n_fwslot_mutex);
6384 ddi_ufm_update(nvme->n_ufmh);
6385 if (nvme->n_fwslot != NULL) {
6386 kmem_free(nvme->n_fwslot, sizeof (nvme_fwslot_log_t));
6387 nvme->n_fwslot = NULL;
6388 }
6389 mutex_exit(&nvme->n_fwslot_mutex);
6390 }
6391
6392 /*
6393 * Download new firmware to the device's internal staging area. We do not call
6394 * nvme_ufm_update() here because after a firmware download, there has been no
6395 * change to any of the actual persistent firmware data. That requires a
6396 * subsequent ioctl (NVME_IOC_FIRMWARE_COMMIT) to commit the firmware to a slot
6397 * or to activate a slot.
6398 */
6399 static int
6400 nvme_ioctl_firmware_download(nvme_minor_t *minor, intptr_t arg, int mode,
6401 cred_t *cred_p)
6402 {
6403 nvme_t *const nvme = minor->nm_ctrl;
6404 nvme_ioctl_fw_load_t fw;
6405 uint64_t len, maxcopy;
6406 offset_t offset;
6407 uint32_t gran;
6408 nvme_valid_ctrl_data_t data;
6409 uintptr_t buf;
6410 nvme_sqe_t sqe = {
6411 .sqe_opc = NVME_OPC_FW_IMAGE_LOAD
6412 };
6413
6414 if ((mode & FWRITE) == 0)
6415 return (EBADF);
6416
6417 if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6418 return (EPERM);
6419
6420 if (ddi_copyin((void *)(uintptr_t)arg, &fw, sizeof (fw),
6421 mode & FKIOCTL) != 0) {
6422 return (EFAULT);
6423 }
6424
6425 if (!nvme_ioctl_check(minor, &fw.fwl_common, &nvme_check_firmware)) {
6426 goto copyout;
6427 }
6428
6429 if (!nvme_validate_fw_load(nvme, &fw)) {
6430 goto copyout;
6431 }
6432
6433 len = fw.fwl_len;
6434 offset = fw.fwl_off;
6435 buf = fw.fwl_buf;
6436
6437 /*
6438 * We need to determine the minimum and maximum amount of data that we
6439 * will send to the device in a given go. Starting in NVMe 1.3 this must
6440 * be a multiple of the firmware update granularity (FWUG), but must not
6441 * exceed the maximum data transfer that we've set. Many devices don't
6442 * report something here, which means we'll end up getting our default
6443 * value. Our policy is a little simple, but basically if the maximum
6444 * data transfer is evenly divisible by the granularity, then use it.
6445 * Otherwise we use the granularity itself. The granularity is
6446 * always in page sized units, so trying to find another optimum point
6447 * isn't worth it. If we encounter a contradiction, then we will have to
6448 * error out.
6449 */
6450 data.vcd_vers = &nvme->n_version;
6451 data.vcd_id = nvme->n_idctl;
6452 gran = nvme_fw_load_granularity(&data);
6453
6454 if ((nvme->n_max_data_transfer_size % gran) == 0) {
6455 maxcopy = nvme->n_max_data_transfer_size;
6456 } else if (gran <= nvme->n_max_data_transfer_size) {
6457 maxcopy = gran;
6458 } else {
6459 (void) nvme_ioctl_error(&fw.fwl_common,
6460 NVME_IOCTL_E_FW_LOAD_IMPOS_GRAN, 0, 0);
6461 goto copyout;
6462 }
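	/*
	* For example, with a 1M maximum data transfer and a 128k
	* granularity, 1M % 128k == 0 and we copy 1M at a time; with a
	* 192k granularity we would fall back to 192k chunks instead.
	*/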
6463
6464 while (len > 0) {
6465 nvme_ioc_cmd_args_t args = { NULL };
6466 uint64_t copylen = MIN(maxcopy, len);
6467
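		/*
		* Firmware Image Download takes a zero-based dword count
		* (NUMD) in cdw10 and a dword offset (OFST) in cdw11;
		* NVME_DWORD_SHIFT converts bytes to 4-byte dwords.
		*/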
6468 sqe.sqe_cdw10 = (uint32_t)(copylen >> NVME_DWORD_SHIFT) - 1;
6469 sqe.sqe_cdw11 = (uint32_t)(offset >> NVME_DWORD_SHIFT);
6470
6471 args.ica_sqe = &sqe;
6472 args.ica_data = (void *)buf;
6473 args.ica_data_len = copylen;
6474 args.ica_dma_flags = DDI_DMA_WRITE;
6475 args.ica_copy_flags = mode;
6476 args.ica_timeout = nvme_admin_cmd_timeout;
6477
6478 if (!nvme_ioc_cmd(nvme, &fw.fwl_common, &args)) {
6479 break;
6480 }
6481
6482 buf += copylen;
6483 offset += copylen;
6484 len -= copylen;
6485 }
6486
6487 copyout:
6488 if (ddi_copyout(&fw, (void *)(uintptr_t)arg, sizeof (fw),
6489 mode & FKIOCTL) != 0) {
6490 return (EFAULT);
6491 }
6492
6493 return (0);
6494 }
6495
6496 static int
6497 nvme_ioctl_firmware_commit(nvme_minor_t *minor, intptr_t arg, int mode,
6498 cred_t *cred_p)
6499 {
6500 nvme_t *const nvme = minor->nm_ctrl;
6501 nvme_ioctl_fw_commit_t fw;
6502 nvme_firmware_commit_dw10_t fc_dw10 = { 0 };
6503 nvme_ioc_cmd_args_t args = { NULL };
6504 nvme_sqe_t sqe = {
6505 .sqe_opc = NVME_OPC_FW_ACTIVATE
6506 };
6507
6508 if ((mode & FWRITE) == 0)
6509 return (EBADF);
6510
6511 if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6512 return (EPERM);
6513
6514 if (ddi_copyin((void *)(uintptr_t)arg, &fw, sizeof (fw),
6515 mode & FKIOCTL) != 0) {
6516 return (EFAULT);
6517 }
6518
6519 if (!nvme_ioctl_check(minor, &fw.fwc_common, &nvme_check_firmware)) {
6520 goto copyout;
6521 }
6522
6523 if (!nvme_validate_fw_commit(nvme, &fw)) {
6524 goto copyout;
6525 }
6526
6527 fc_dw10.b.fc_slot = fw.fwc_slot;
6528 fc_dw10.b.fc_action = fw.fwc_action;
6529 sqe.sqe_cdw10 = fc_dw10.r;
6530
6531 args.ica_sqe = &sqe;
6532 args.ica_timeout = nvme_commit_save_cmd_timeout;
6533
6534 /*
6535 * There are no conditional actions to take based on this succeeding or
6536 * failing. A failure is recorded in the ioctl structure returned to the
6537 * user.
6538 */
6539 (void) nvme_ioc_cmd(nvme, &fw.fwc_common, &args);
6540
6541 /*
6542 * Let the DDI UFM subsystem know that the firmware information for
6543 * this device has changed. We perform this unconditionally as an
6544 * invalidation doesn't particularly hurt us.
6545 */
6546 nvme_ufm_update(nvme);
6547
6548 copyout:
6549 if (ddi_copyout(&fw, (void *)(uintptr_t)arg, sizeof (fw),
6550 mode & FKIOCTL) != 0) {
6551 return (EFAULT);
6552 }
6553
6554 return (0);
6555 }
6556
6557 /*
6558 * Helper to copy in a passthru command from userspace, handling
6559 * different data models.
6560 */
6561 static int
6562 nvme_passthru_copyin_cmd(const void *buf, nvme_ioctl_passthru_t *cmd, int mode)
6563 {
6564 switch (ddi_model_convert_from(mode & FMODELS)) {
6565 #ifdef _MULTI_DATAMODEL
6566 case DDI_MODEL_ILP32: {
6567 nvme_ioctl_passthru32_t cmd32;
6568
6569 if (ddi_copyin(buf, (void*)&cmd32, sizeof (cmd32), mode) != 0)
6570 return (EFAULT);
6571
6572 bzero(cmd, sizeof (nvme_ioctl_passthru_t));
6573
6574 cmd->npc_common.nioc_nsid = cmd32.npc_common.nioc_nsid;
6575 cmd->npc_opcode = cmd32.npc_opcode;
6576 cmd->npc_timeout = cmd32.npc_timeout;
6577 cmd->npc_flags = cmd32.npc_flags;
6578 cmd->npc_impact = cmd32.npc_impact;
6579 cmd->npc_cdw12 = cmd32.npc_cdw12;
6580 cmd->npc_cdw13 = cmd32.npc_cdw13;
6581 cmd->npc_cdw14 = cmd32.npc_cdw14;
6582 cmd->npc_cdw15 = cmd32.npc_cdw15;
6583 cmd->npc_buflen = cmd32.npc_buflen;
6584 cmd->npc_buf = cmd32.npc_buf;
6585 break;
6586 }
6587 #endif /* _MULTI_DATAMODEL */
6588 case DDI_MODEL_NONE:
6589 if (ddi_copyin(buf, (void *)cmd, sizeof (nvme_ioctl_passthru_t),
6590 mode) != 0) {
6591 return (EFAULT);
6592 }
6593 break;
6594 default:
6595 return (ENOTSUP);
6596 }
6597
6598 return (0);
6599 }
6600
6601 /*
6602 * Helper to copy out a passthru command result to userspace, handling
6603 * different data models.
6604 */
6605 static int
6606 nvme_passthru_copyout_cmd(const nvme_ioctl_passthru_t *cmd, void *buf, int mode)
6607 {
6608 switch (ddi_model_convert_from(mode & FMODELS)) {
6609 #ifdef _MULTI_DATAMODEL
6610 case DDI_MODEL_ILP32: {
6611 nvme_ioctl_passthru32_t cmd32;
6612
6613 bzero(&cmd32, sizeof (nvme_ioctl_passthru32_t));
6614
6615 cmd32.npc_common = cmd->npc_common;
6616 cmd32.npc_opcode = cmd->npc_opcode;
6617 cmd32.npc_timeout = cmd->npc_timeout;
6618 cmd32.npc_flags = cmd->npc_flags;
6619 cmd32.npc_impact = cmd->npc_impact;
6620 cmd32.npc_cdw0 = cmd->npc_cdw0;
6621 cmd32.npc_cdw12 = cmd->npc_cdw12;
6622 cmd32.npc_cdw13 = cmd->npc_cdw13;
6623 cmd32.npc_cdw14 = cmd->npc_cdw14;
6624 cmd32.npc_cdw15 = cmd->npc_cdw15;
6625 cmd32.npc_buflen = (size32_t)cmd->npc_buflen;
6626 cmd32.npc_buf = (uintptr32_t)cmd->npc_buf;
6627 if (ddi_copyout(&cmd32, buf, sizeof (cmd32), mode) != 0)
6628 return (EFAULT);
6629 break;
6630 }
6631 #endif /* _MULTI_DATAMODEL */
6632 case DDI_MODEL_NONE:
6633 if (ddi_copyout(cmd, buf, sizeof (nvme_ioctl_passthru_t),
6634 mode) != 0) {
6635 return (EFAULT);
6636 }
6637 break;
6638 default:
6639 return (ENOTSUP);
6640 }
6641 return (0);
6642 }
6643
6644 /*
6645 * Run an arbitrary vendor-specific admin command on the device.
6646 */
6647 static int
6648 nvme_ioctl_passthru(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
6649 {
6650 nvme_t *const nvme = minor->nm_ctrl;
6651 int rv;
6652 nvme_ioctl_passthru_t pass;
6653 nvme_sqe_t sqe;
6654 nvme_ioc_cmd_args_t args = { NULL };
6655
6656 /*
6657 * Basic checks: permissions, data model, argument size.
6658 */
6659 if ((mode & FWRITE) == 0)
6660 return (EBADF);
6661
6662 if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6663 return (EPERM);
6664
6665 if ((rv = nvme_passthru_copyin_cmd((void *)(uintptr_t)arg, &pass,
6666 mode)) != 0) {
6667 return (rv);
6668 }
6669
6670 if (!nvme_ioctl_check(minor, &pass.npc_common, &nvme_check_passthru)) {
6671 goto copyout;
6672 }
6673
6674 if (!nvme_validate_vuc(nvme, &pass)) {
6675 goto copyout;
6676 }
6677
6678 mutex_enter(&nvme->n_mgmt_mutex);
6679 if ((pass.npc_impact & NVME_IMPACT_NS) != 0) {
6680 /*
6681 * We've been told this has namespace impact. Right now we force
6682 * that to mean every namespace until we have more use cases and
6683 * reason to trust the nsid field.
6684 */
6685 if (!nvme_no_blkdev_attached(nvme, NVME_NSID_BCAST)) {
6686 mutex_exit(&nvme->n_mgmt_mutex);
6687 (void) nvme_ioctl_error(&pass.npc_common,
6688 NVME_IOCTL_E_NS_BLKDEV_ATTACH, 0, 0);
6689 goto copyout;
6690 }
6691 }
6692
6693 bzero(&sqe, sizeof (sqe));
6694
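	/*
	* cdw10 carries the size of the data transfer in dwords; the
	* vendor-defined cdw12-15 values are passed through from
	* userspace untouched.
	*/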
6695 sqe.sqe_opc = pass.npc_opcode;
6696 sqe.sqe_nsid = pass.npc_common.nioc_nsid;
6697 sqe.sqe_cdw10 = (uint32_t)(pass.npc_buflen >> NVME_DWORD_SHIFT);
6698 sqe.sqe_cdw12 = pass.npc_cdw12;
6699 sqe.sqe_cdw13 = pass.npc_cdw13;
6700 sqe.sqe_cdw14 = pass.npc_cdw14;
6701 sqe.sqe_cdw15 = pass.npc_cdw15;
6702
6703 args.ica_sqe = &sqe;
6704 args.ica_data = (void *)pass.npc_buf;
6705 args.ica_data_len = pass.npc_buflen;
6706 args.ica_copy_flags = mode;
6707 args.ica_timeout = pass.npc_timeout;
6708
6709 if ((pass.npc_flags & NVME_PASSTHRU_READ) != 0)
6710 args.ica_dma_flags |= DDI_DMA_READ;
6711 else if ((pass.npc_flags & NVME_PASSTHRU_WRITE) != 0)
6712 args.ica_dma_flags |= DDI_DMA_WRITE;
6713
6714 if (nvme_ioc_cmd(nvme, &pass.npc_common, &args)) {
6715 pass.npc_cdw0 = args.ica_cdw0;
6716 if ((pass.npc_impact & NVME_IMPACT_NS) != 0) {
6717 nvme_rescan_ns(nvme, NVME_NSID_BCAST);
6718 }
6719 }
6720 mutex_exit(&nvme->n_mgmt_mutex);
6721
6722 copyout:
6723 rv = nvme_passthru_copyout_cmd(&pass, (void *)(uintptr_t)arg,
6724 mode);
6725
6726 return (rv);
6727 }
6728
6729 static int
6730 nvme_ioctl_lock(nvme_minor_t *minor, intptr_t arg, int mode,
6731 cred_t *cred_p)
6732 {
6733 nvme_ioctl_lock_t lock;
6734 const nvme_lock_flags_t all_flags = NVME_LOCK_F_DONT_BLOCK;
6735 nvme_t *nvme = minor->nm_ctrl;
6736
6737 if ((mode & FWRITE) == 0)
6738 return (EBADF);
6739
6740 if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6741 return (EPERM);
6742
6743 if (ddi_copyin((void *)(uintptr_t)arg, &lock, sizeof (lock),
6744 mode & FKIOCTL) != 0) {
6745 return (EFAULT);
6746 }
6747
6748 if (lock.nil_ent != NVME_LOCK_E_CTRL &&
6749 lock.nil_ent != NVME_LOCK_E_NS) {
6750 (void) nvme_ioctl_error(&lock.nil_common,
6751 NVME_IOCTL_E_BAD_LOCK_ENTITY, 0, 0);
6752 goto copyout;
6753 }
6754
6755 if (lock.nil_level != NVME_LOCK_L_READ &&
6756 lock.nil_level != NVME_LOCK_L_WRITE) {
6757 (void) nvme_ioctl_error(&lock.nil_common,
6758 NVME_IOCTL_E_BAD_LOCK_LEVEL, 0, 0);
6759 goto copyout;
6760 }
6761
6762 if ((lock.nil_flags & ~all_flags) != 0) {
6763 (void) nvme_ioctl_error(&lock.nil_common,
6764 NVME_IOCTL_E_BAD_LOCK_FLAGS, 0, 0);
6765 goto copyout;
6766 }
6767
6768 if (!nvme_ioctl_check(minor, &lock.nil_common, &nvme_check_locking)) {
6769 goto copyout;
6770 }
6771
6772 /*
6773 * If we're on a namespace, confirm that we're not asking for the
6774 * controller.
6775 */
6776 if (lock.nil_common.nioc_nsid != 0 &&
6777 lock.nil_ent == NVME_LOCK_E_CTRL) {
6778 (void) nvme_ioctl_error(&lock.nil_common,
6779 NVME_IOCTL_E_NS_CANNOT_LOCK_CTRL, 0, 0);
6780 goto copyout;
6781 }
6782
6783 /*
6784 * We've reached the point where we can no longer actually check things
6785 * without serializing state. First, we need to check to make sure that
6786 * none of our invariants are being broken for locking:
6787 *
6788 * 1) The caller isn't already blocking for a lock operation to
6789 * complete.
6790 *
6791 * 2) The caller is attempting to grab a lock that they already have.
6792 * While there are other rule violations that this might create, we opt
6793 * to check this ahead of it so we can have slightly better error
6794 * messages for our callers.
6795 *
6796 * 3) The caller is trying to grab a controller lock, while holding a
6797 * namespace lock.
6798 *
6799 * 4) The caller has a controller write lock and is trying to get a
6800 * namespace lock. For now, we disallow this case. Holding a controller
6801 * read lock is allowed, but the write lock allows you to operate on all
6802 * namespaces anyways. In addition, this simplifies the locking logic;
6803 * however, this constraint may be loosened in the future.
6804 *
6805 * 5) The caller is trying to acquire a second namespace lock when they
6806 * already have one.
6807 */
6808 mutex_enter(&nvme->n_minor_mutex);
6809 if (minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_BLOCKED ||
6810 minor->nm_ns_lock.nli_state == NVME_LOCK_STATE_BLOCKED) {
6811 (void) nvme_ioctl_error(&lock.nil_common,
6812 NVME_IOCTL_E_LOCK_PENDING, 0, 0);
6813 mutex_exit(&nvme->n_minor_mutex);
6814 goto copyout;
6815 }
6816
6817 if ((lock.nil_ent == NVME_LOCK_E_CTRL &&
6818 minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_ACQUIRED) ||
6819 (lock.nil_ent == NVME_LOCK_E_NS &&
6820 minor->nm_ns_lock.nli_state == NVME_LOCK_STATE_ACQUIRED &&
6821 minor->nm_ns_lock.nli_ns->ns_id == lock.nil_common.nioc_nsid)) {
6822 (void) nvme_ioctl_error(&lock.nil_common,
6823 NVME_IOCTL_E_LOCK_ALREADY_HELD, 0, 0);
6824 mutex_exit(&nvme->n_minor_mutex);
6825 goto copyout;
6826 }
6827
6828 if (lock.nil_ent == NVME_LOCK_E_CTRL &&
6829 minor->nm_ns_lock.nli_state != NVME_LOCK_STATE_UNLOCKED) {
6830 (void) nvme_ioctl_error(&lock.nil_common,
6831 NVME_IOCTL_E_LOCK_NO_CTRL_WITH_NS, 0, 0);
6832 mutex_exit(&nvme->n_minor_mutex);
6833 goto copyout;
6834 }
6835
6836 if (lock.nil_ent == NVME_LOCK_E_NS &&
6837 (minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_ACQUIRED &&
6838 minor->nm_ctrl_lock.nli_curlevel == NVME_LOCK_L_WRITE)) {
6839 (void) nvme_ioctl_error(&lock.nil_common,
6840 NVME_IOCTL_LOCK_NO_NS_WITH_CTRL_WRLOCK, 0, 0);
6841 mutex_exit(&nvme->n_minor_mutex);
6842 goto copyout;
6843 }
6844
6845 if (lock.nil_ent == NVME_LOCK_E_NS &&
6846 minor->nm_ns_lock.nli_state != NVME_LOCK_STATE_UNLOCKED) {
6847 (void) nvme_ioctl_error(&lock.nil_common,
6848 NVME_IOCTL_E_LOCK_NO_2ND_NS, 0, 0);
6849 mutex_exit(&nvme->n_minor_mutex);
6850 goto copyout;
6851 }
6852
6854 #ifdef DEBUG
6855 /*
6856 * This is a big block of sanity checks to make sure that we haven't
6857 * allowed anything bad to happen.
6858 */
6859 if (lock.nil_ent == NVME_LOCK_E_NS) {
6860 ASSERT3P(minor->nm_ns_lock.nli_lock, ==, NULL);
6861 ASSERT3U(minor->nm_ns_lock.nli_state, ==,
6862 NVME_LOCK_STATE_UNLOCKED);
6863 ASSERT3U(minor->nm_ns_lock.nli_curlevel, ==, 0);
6864 ASSERT3P(minor->nm_ns_lock.nli_ns, ==, NULL);
6865
6866 if (minor->nm_ns != NULL) {
6867 ASSERT3U(minor->nm_ns->ns_id, ==,
6868 lock.nil_common.nioc_nsid);
6869 }
6870
6871 ASSERT0(list_link_active(&minor->nm_ns_lock.nli_node));
6872 } else {
6873 ASSERT3P(minor->nm_ctrl_lock.nli_lock, ==, NULL);
6874 ASSERT3U(minor->nm_ctrl_lock.nli_state, ==,
6875 NVME_LOCK_STATE_UNLOCKED);
6876 ASSERT3U(minor->nm_ctrl_lock.nli_curlevel, ==, 0);
6877 ASSERT3P(minor->nm_ns_lock.nli_ns, ==, NULL);
6878 ASSERT0(list_link_active(&minor->nm_ctrl_lock.nli_node));
6879
6880 ASSERT3P(minor->nm_ns_lock.nli_lock, ==, NULL);
6881 ASSERT3U(minor->nm_ns_lock.nli_state, ==,
6882 NVME_LOCK_STATE_UNLOCKED);
6883 ASSERT3U(minor->nm_ns_lock.nli_curlevel, ==, 0);
6884 ASSERT3P(minor->nm_ns_lock.nli_ns, ==, NULL);
6885 ASSERT0(list_link_active(&minor->nm_ns_lock.nli_node));
6886 }
6887 #endif /* DEBUG */
6888
6889 /*
6890 * At this point we should actually attempt a locking operation.
6891 */
6892 nvme_rwlock(minor, &lock);
6893 mutex_exit(&nvme->n_minor_mutex);
6894
6895 copyout:
6896 if (ddi_copyout(&lock, (void *)(uintptr_t)arg, sizeof (lock),
6897 mode & FKIOCTL) != 0) {
6898 return (EFAULT);
6899 }
6900
6901 return (0);
6902 }
6903
6904 static int
6905 nvme_ioctl_unlock(nvme_minor_t *minor, intptr_t arg, int mode,
6906 cred_t *cred_p)
{
	nvme_ioctl_unlock_t unlock;
	nvme_t *const nvme = minor->nm_ctrl;
	boolean_t is_ctrl;
	nvme_lock_t *lock;
	nvme_minor_lock_info_t *info;

	/*
	 * Note that we explicitly don't check for privileges for unlock. The
	 * idea being that if you have the lock, that's what matters. If you
	 * don't have the lock, it doesn't matter what privileges you have at
	 * all.
	 */
	if ((mode & FWRITE) == 0)
		return (EBADF);

	if (ddi_copyin((void *)(uintptr_t)arg, &unlock, sizeof (unlock),
	    mode & FKIOCTL) != 0) {
		return (EFAULT);
	}

	if (unlock.niu_ent != NVME_LOCK_E_CTRL &&
	    unlock.niu_ent != NVME_LOCK_E_NS) {
		(void) nvme_ioctl_error(&unlock.niu_common,
		    NVME_IOCTL_E_BAD_LOCK_ENTITY, 0, 0);
		goto copyout;
	}

	if (!nvme_ioctl_check(minor, &unlock.niu_common, &nvme_check_locking)) {
		goto copyout;
	}

	/*
	 * If we're on a namespace, confirm that we're not asking for the
	 * controller.
	 */
	if (unlock.niu_common.nioc_nsid != 0 &&
	    unlock.niu_ent == NVME_LOCK_E_CTRL) {
		(void) nvme_ioctl_error(&unlock.niu_common,
		    NVME_IOCTL_E_NS_CANNOT_UNLOCK_CTRL, 0, 0);
		goto copyout;
	}

	mutex_enter(&nvme->n_minor_mutex);
	if (unlock.niu_ent == NVME_LOCK_E_CTRL) {
		if (minor->nm_ctrl_lock.nli_state != NVME_LOCK_STATE_ACQUIRED) {
			mutex_exit(&nvme->n_minor_mutex);
			(void) nvme_ioctl_error(&unlock.niu_common,
			    NVME_IOCTL_E_LOCK_NOT_HELD, 0, 0);
			goto copyout;
		}
	} else {
		if (minor->nm_ns_lock.nli_ns == NULL) {
			mutex_exit(&nvme->n_minor_mutex);
			(void) nvme_ioctl_error(&unlock.niu_common,
			    NVME_IOCTL_E_LOCK_NOT_HELD, 0, 0);
			goto copyout;
		}

		/*
		 * Check that our unlock request corresponds to the namespace
		 * ID that is currently locked. A mismatch could happen if the
		 * request came in on the controller node and specified a
		 * valid, but not locked, namespace ID.
		 */
		if (minor->nm_ns_lock.nli_ns->ns_id !=
		    unlock.niu_common.nioc_nsid) {
			mutex_exit(&nvme->n_minor_mutex);
			ASSERT3P(minor->nm_ns, ==, NULL);
			(void) nvme_ioctl_error(&unlock.niu_common,
			    NVME_IOCTL_E_LOCK_WRONG_NS, 0, 0);
			goto copyout;
		}

		if (minor->nm_ns_lock.nli_state != NVME_LOCK_STATE_ACQUIRED) {
			mutex_exit(&nvme->n_minor_mutex);
			(void) nvme_ioctl_error(&unlock.niu_common,
			    NVME_IOCTL_E_LOCK_NOT_HELD, 0, 0);
			goto copyout;
		}
	}

	/*
	 * Finally, perform the unlock.
	 */
	is_ctrl = unlock.niu_ent == NVME_LOCK_E_CTRL;
	if (is_ctrl) {
		lock = &nvme->n_lock;
		info = &minor->nm_ctrl_lock;
	} else {
		nvme_namespace_t *ns;
		const uint32_t nsid = unlock.niu_common.nioc_nsid;

		ns = nvme_nsid2ns(nvme, nsid);
		lock = &ns->ns_lock;
		info = &minor->nm_ns_lock;
		VERIFY3P(ns, ==, info->nli_ns);
	}
	nvme_rwunlock(info, lock);
	mutex_exit(&nvme->n_minor_mutex);
	nvme_ioctl_success(&unlock.niu_common);

copyout:
	if (ddi_copyout(&unlock, (void *)(uintptr_t)arg, sizeof (unlock),
	    mode & FKIOCTL) != 0) {
		return (EFAULT);
	}

	return (0);
}
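
/*
 * Continuing the hypothetical sketch above, releasing the lock only requires
 * naming the entity and namespace; as described in nvme_ioctl_unlock(), no
 * privilege check is performed because holding the lock is the only thing
 * that matters.
 *
 *	nvme_ioctl_unlock_t unlock;
 *
 *	bzero(&unlock, sizeof (unlock));
 *	unlock.niu_common.nioc_nsid = 1;
 *	unlock.niu_ent = NVME_LOCK_E_NS;
 *	if (ioctl(fd, NVME_IOC_UNLOCK, &unlock) != 0)
 *		err(EXIT_FAILURE, "NVME_IOC_UNLOCK failed");
 *
 * A mismatched namespace ID would come back as NVME_IOCTL_E_LOCK_WRONG_NS in
 * unlock.niu_common, again with ioctl(2) itself returning 0.
 */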

static int
nvme_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p,
    int *rval_p)
{
#ifndef __lock_lint
	_NOTE(ARGUNUSED(rval_p));
#endif
	nvme_minor_t *minor;
	nvme_t *nvme;

	minor = nvme_minor_find_by_dev(dev);
	if (minor == NULL) {
		return (ENXIO);
	}

	nvme = minor->nm_ctrl;
	if (nvme == NULL)
		return (ENXIO);

	if (IS_DEVCTL(cmd))
		return (ndi_devctl_ioctl(nvme->n_dip, cmd, arg, mode, 0));

	if (nvme->n_dead && (cmd != NVME_IOC_DETACH && cmd !=
	    NVME_IOC_UNLOCK)) {
		if (IS_NVME_IOC(cmd) == 0) {
			return (EIO);
		}

		return (nvme_ioctl_copyout_error(nvme->n_dead_status, arg,
		    mode));
	}

	/*
	 * ioctls that are no longer using the original ioctl structure.
	 */
	switch (cmd) {
	case NVME_IOC_CTRL_INFO:
		return (nvme_ioctl_ctrl_info(minor, arg, mode, cred_p));
	case NVME_IOC_IDENTIFY:
		return (nvme_ioctl_identify(minor, arg, mode, cred_p));
	case NVME_IOC_GET_LOGPAGE:
		return (nvme_ioctl_get_logpage(minor, arg, mode, cred_p));
	case NVME_IOC_GET_FEATURE:
		return (nvme_ioctl_get_feature(minor, arg, mode, cred_p));
	case NVME_IOC_DETACH:
		return (nvme_ioctl_detach(minor, arg, mode, cred_p));
	case NVME_IOC_ATTACH:
		return (nvme_ioctl_attach(minor, arg, mode, cred_p));
	case NVME_IOC_FORMAT:
		return (nvme_ioctl_format(minor, arg, mode, cred_p));
	case NVME_IOC_FIRMWARE_DOWNLOAD:
		return (nvme_ioctl_firmware_download(minor, arg, mode,
		    cred_p));
	case NVME_IOC_FIRMWARE_COMMIT:
		return (nvme_ioctl_firmware_commit(minor, arg, mode,
		    cred_p));
	case NVME_IOC_NS_INFO:
		return (nvme_ioctl_ns_info(minor, arg, mode, cred_p));
	case NVME_IOC_PASSTHRU:
		return (nvme_ioctl_passthru(minor, arg, mode, cred_p));
	case NVME_IOC_LOCK:
		return (nvme_ioctl_lock(minor, arg, mode, cred_p));
	case NVME_IOC_UNLOCK:
		return (nvme_ioctl_unlock(minor, arg, mode, cred_p));
	default:
		return (ENOTTY);
	}
}
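
/*
 * A note on the error model of the structured ioctls dispatched above: most
 * failures are recorded in the common ioctl structure via nvme_ioctl_error()
 * and then copied out, with the ioctl itself returning 0. Only transport and
 * argument problems (EFAULT, EBADF, ENXIO, ENOTTY) surface through errno. A
 * userland consumer therefore needs two checks; the nioc_drv_err member name
 * and the NVME_IOCTL_E_OK constant below are assumptions based on this
 * pattern, not verified against nvme_ioctl.h.
 *
 *	if (ioctl(fd, NVME_IOC_UNLOCK, &unlock) != 0) {
 *		handle errno-based failure
 *	} else if (unlock.niu_common.nioc_drv_err != NVME_IOCTL_E_OK) {
 *		handle driver-level failure copied out with the structure
 *	}
 */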

/*
 * DDI UFM Callbacks
 */
static int
nvme_ufm_fill_image(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno,
    ddi_ufm_image_t *img)
{
	nvme_t *nvme = arg;

	if (imgno != 0)
		return (EINVAL);

	ddi_ufm_image_set_desc(img, "Firmware");
	ddi_ufm_image_set_nslots(img, nvme->n_idctl->id_frmw.fw_nslot);

	return (0);
}

/*
 * Fill out firmware slot information for the requested slot. The firmware
 * slot information is gathered by requesting the Firmware Slot Information
 * log page. The format of the page is described in section 5.10.1.3 of the
 * NVMe specification.
 *
 * We lazily cache the log page on the first call and then invalidate the
 * cached data after a successful firmware download or firmware commit
 * command. The cached data is protected by a mutex as the state can change
 * asynchronously to this callback.
 */
static int
nvme_ufm_fill_slot(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno,
    uint_t slotno, ddi_ufm_slot_t *slot)
{
	nvme_t *nvme = arg;
	void *log = NULL;
	size_t bufsize;
	ddi_ufm_attr_t attr = 0;
	char fw_ver[NVME_FWVER_SZ + 1];

	if (imgno > 0 || slotno > (nvme->n_idctl->id_frmw.fw_nslot - 1))
		return (EINVAL);

	mutex_enter(&nvme->n_fwslot_mutex);
	if (nvme->n_fwslot == NULL) {
		if (!nvme_get_logpage_int(nvme, B_TRUE, &log, &bufsize,
		    NVME_LOGPAGE_FWSLOT) ||
		    bufsize != sizeof (nvme_fwslot_log_t)) {
			if (log != NULL)
				kmem_free(log, bufsize);
			mutex_exit(&nvme->n_fwslot_mutex);
			return (EIO);
		}
		nvme->n_fwslot = (nvme_fwslot_log_t *)log;
	}

	/*
	 * NVMe numbers firmware slots starting at 1.
	 */
	if (slotno == (nvme->n_fwslot->fw_afi - 1))
		attr |= DDI_UFM_ATTR_ACTIVE;

	if (slotno != 0 || nvme->n_idctl->id_frmw.fw_readonly == 0)
		attr |= DDI_UFM_ATTR_WRITEABLE;

	if (nvme->n_fwslot->fw_frs[slotno][0] == '\0') {
		attr |= DDI_UFM_ATTR_EMPTY;
	} else {
		(void) strncpy(fw_ver, nvme->n_fwslot->fw_frs[slotno],
		    NVME_FWVER_SZ);
		fw_ver[NVME_FWVER_SZ] = '\0';
		ddi_ufm_slot_set_version(slot, fw_ver);
	}
	mutex_exit(&nvme->n_fwslot_mutex);

	ddi_ufm_slot_set_attrs(slot, attr);

	return (0);
}
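
/*
 * The invalidation half of the lazy cache described above lives with the
 * firmware download and commit paths rather than in this callback. A minimal
 * sketch of what that side presumably looks like, with the helper name being
 * illustrative:
 *
 *	static void
 *	nvme_fwslot_cache_invalidate(nvme_t *nvme)
 *	{
 *		mutex_enter(&nvme->n_fwslot_mutex);
 *		if (nvme->n_fwslot != NULL) {
 *			kmem_free(nvme->n_fwslot,
 *			    sizeof (nvme_fwslot_log_t));
 *			nvme->n_fwslot = NULL;
 *		}
 *		mutex_exit(&nvme->n_fwslot_mutex);
 *	}
 *
 * Taking n_fwslot_mutex here is what lets nvme_ufm_fill_slot() safely test
 * and repopulate n_fwslot on the next call.
 */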

static int
nvme_ufm_getcaps(ddi_ufm_handle_t *ufmh, void *arg, ddi_ufm_cap_t *caps)
{
	*caps = DDI_UFM_CAP_REPORT;
	return (0);
}

boolean_t
nvme_ctrl_atleast(nvme_t *nvme, const nvme_version_t *min)
{
	return (nvme_vers_atleast(&nvme->n_version, min) ? B_TRUE : B_FALSE);
}

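/*
 * nvme_ctrl_atleast() is the helper used to gate features on the controller's
 * reported NVMe version before issuing commands or requesting log pages that
 * a given revision introduced. A minimal usage sketch; the nvme_version_t
 * field names here are assumptions:
 *
 *	static const nvme_version_t vers_1v2 = { .v_minor = 2, .v_major = 1 };
 *
 *	if (nvme_ctrl_atleast(nvme, &vers_1v2)) {
 *		issue a command introduced in NVMe 1.2
 *	}
 */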