/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2018 Nexenta Systems, Inc.
 * Copyright 2016 Tegile Systems, Inc. All rights reserved.
 * Copyright (c) 2016 The MathWorks, Inc.  All rights reserved.
 * Copyright 2020 Joyent, Inc.
 * Copyright 2019 Western Digital Corporation.
 * Copyright 2020 Racktop Systems.
 */

/*
 * blkdev driver for NVMe compliant storage devices
 *
 * This driver was written to conform to version 1.2.1 of the NVMe
 * specification.  It may work with newer versions, but that is completely
 * untested and disabled by default.
 *
 * The driver has only been tested on x86 systems and will not work on big-
 * endian systems without changes to the code accessing registers and data
 * structures used by the hardware.
 *
 *
 * Interrupt Usage:
 *
 * The driver will use a single interrupt while configuring the device as the
 * specification requires, but contrary to the specification it will try to use
 * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
 * will switch to multiple-message MSI(-X) if supported. The driver wants to
 * have one interrupt vector per CPU, but it will work correctly if fewer are
 * available. Interrupts can be shared by queues; the interrupt handler will
 * iterate through the I/O queue array in steps of n_intr_cnt. Usually only
 * the admin queue will share an interrupt with one I/O queue. The interrupt
 * handler will retrieve completed commands from all queues sharing an interrupt
 * vector and will post them to a taskq for completion processing.
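 *
 * A minimal sketch of that iteration, assuming the handler knows its vector
 * number "inum" and the total number of queues "nq" (illustrative names, not
 * necessarily those used in the code below):
 *
 *	for (qnum = inum; qnum < nq; qnum += n_intr_cnt)
 *		retrieve completed commands from queue pair qnum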
 *
 *
 * Command Processing:
 *
 * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
 * to 65536 I/O commands. The driver will configure one I/O queue pair per
 * available interrupt vector, with the queue length usually much smaller than
 * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
 * interrupt vectors will be used.
 *
 * Additionally, the hardware provides a single special admin queue pair that
 * can hold up to 4096 admin commands.
 *
 * From the hardware perspective both queues of a queue pair are independent,
 * but they share some driver state: the command array (holding pointers to
 * commands currently being processed by the hardware) and the active command
 * counter. Access to a submission queue and the shared state is protected by
 * nq_mutex; the completion queue is protected by ncq_mutex.
 *
 * When a command is submitted to a queue pair the active command counter is
 * incremented and a pointer to the command is stored in the command array. The
 * array index is used as command identifier (CID) in the submission queue
 * entry. Some commands may take a very long time to complete, and if the queue
 * wraps around in that time a submission may find the next array slot to still
 * be used by a long-running command. In this case the array is sequentially
 * searched for the next free slot. The length of the command array is the same
 * as the configured queue length. Queue overrun is prevented by the semaphore,
 * so a command submission may block if the queue is full.
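 *
 * A minimal sketch of that slot search, using illustrative field names
 * (nq_next_cmd, nq_cmd and nq_nentry are assumptions, not necessarily the
 * exact names used below); sema_p() blocks while the queue is full:
 *
 *	sema_p(&qp->nq_sema);
 *	mutex_enter(&qp->nq_mutex);
 *	while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
 *		qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 *	cid = qp->nq_next_cmd;
 *	qp->nq_cmd[cid] = cmd;
 *	qp->nq_active_cmds++;
 *	mutex_exit(&qp->nq_mutex);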
 *
 *
 * Polled I/O Support:
 *
 * For kernel core dump support the driver can do polled I/O. As interrupts are
 * turned off while dumping, the driver will just submit a command in the
 * regular way, and then repeatedly attempt a command retrieval until it gets
 * the command back.
 *
 *
 * Namespace Support:
 *
 * NVMe devices can have multiple namespaces, each being an independent data
 * store. The driver supports multiple namespaces and creates a blkdev interface
 * for each namespace found. Namespaces can have various attributes to support
 * protection information. This driver does not support any of this and ignores
 * namespaces that have these attributes.
 *
 * As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
 * (EUI64). This driver uses the EUI64 if present to generate the devid and
 * passes it to blkdev to use it in the device node names. As this is currently
 * untested, namespaces with EUI64 are ignored by default.
 *
 * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
 * single controller. This is an artificial limit imposed by the driver to be
 * able to address a reasonable number of controllers and namespaces using a
 * 32bit minor node number.
 *
 *
 * Minor nodes:
 *
 * For each NVMe device the driver exposes one minor node for the controller and
 * one minor node for each namespace. The only operations supported by those
 * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the
 * interface for the nvmeadm(1M) utility.
 *
 *
 * Blkdev Interface:
 *
 * This driver uses blkdev to do all the heavy lifting involved with presenting
 * a disk device to the system. As a result, the processing of I/O requests is
 * relatively simple as blkdev takes care of partitioning, boundary checks, DMA
 * setup, and splitting of transfers into manageable chunks.
 *
 * I/O requests coming in from blkdev are turned into NVM commands and posted to
 * an I/O queue. The queue is selected by taking the CPU id modulo the number of
 * queues. There is currently no timeout handling of I/O commands.
 *
 * Blkdev also supports querying device/media information and generating a
 * devid. The driver reports the best block size as determined by the namespace
 * format back to blkdev as physical block size to support partition and block
 * alignment. The devid is either based on the namespace EUI64, if present, or
 * composed using the device vendor ID, model number, serial number, and the
 * namespace ID.
 *
 *
 * Error Handling:
 *
 * Error handling is currently limited to detecting fatal hardware errors,
 * either by asynchronous events, or synchronously through command status or
 * admin command timeouts. In case of severe errors the device is fenced off;
 * all further requests will return EIO. FMA is then called to fault the device.
 *
 * The hardware has a limit for outstanding asynchronous event requests. Before
 * this limit is known the driver assumes it is at least 1 and posts a single
 * asynchronous request. Later, when the limit is known, more asynchronous event
 * requests are posted to allow quicker reception of error information. When an
 * asynchronous event is posted by the hardware the driver will parse the error
 * status fields and log information or fault the device, depending on the
 * severity of the asynchronous event. The asynchronous event request is then
 * reused and posted to the admin queue again.
 *
 * On command completion the command status is checked for errors. In case of
 * errors indicating a driver bug the driver panics. Almost all other error
 * status values just cause EIO to be returned.
 *
 * Command timeouts are currently detected for all admin commands except
 * asynchronous event requests. If a command times out and the hardware appears
 * to be healthy the driver attempts to abort the command. The original command
 * timeout is also applied to the abort command. If the abort times out as well,
 * the driver assumes the device is dead, fences it off, and calls FMA to retire
 * it. In all other cases the aborted command should return immediately with a
 * status indicating it was aborted, and the driver will wait indefinitely for
 * that to happen. No timeout handling of normal I/O commands is presently done.
 *
 * Any command that times out due to the controller dropping dead will be put on
 * the nvme_lost_cmds list if it references DMA memory. This will prevent the
 * DMA memory from being reused by the system and later being written to by a
 * "dead" NVMe controller.
 *
 *
 * Locking:
 *
 * Each queue pair has a nq_mutex and ncq_mutex. The nq_mutex must be held
 * when accessing shared state and submission queue registers; ncq_mutex
 * is held when accessing completion queue state and registers.
 * Callers of nvme_unqueue_cmd() must make sure that nq_mutex is held, while
 * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of both
 * mutexes themselves.
 *
 * Each command also has its own nc_mutex, which is associated with the
 * condition variable nc_cv. It is only used on admin commands which are run
 * synchronously. In that case it must be held across calls to
 * nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by
 * nvme_admin_cmd(). It must also be held whenever the completion state of the
 * command is changed or while an admin command timeout is handled.
 *
 * If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired first.
 * More than one nc_mutex may only be held when aborting commands. In this case,
 * the nc_mutex of the command to be aborted must be held across the call to
 * nvme_abort_cmd() to prevent the command from completing while the abort is in
 * progress.
 *
 * If both nq_mutex and ncq_mutex need to be held, ncq_mutex must be
 * acquired first. More than one nq_mutex is never held by a single thread.
 * The ncq_mutex is only held by nvme_retrieve_cmd() and
 * nvme_process_iocq(). nvme_process_iocq() is only called from the
 * interrupt thread and nvme_retrieve_cmd() during polled I/O, so the
 * mutex is non-contentious but is required for implementation completeness
 * and safety.
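 *
 * In short, the acquisition order implied by the rules above is: nc_mutex
 * before nq_mutex, and ncq_mutex before nq_mutex.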
 *
 * Each minor node has its own nm_mutex, which protects the open count nm_ocnt
 * and exclusive-open flag nm_oexcl.
 *
 *
 * Quiesce / Fast Reboot:
 *
 * The driver currently does not support fast reboot. A quiesce(9E) entry point
 * is still provided which is used to send a shutdown notification to the
 * device.
 *
 *
 * DDI UFM Support:
 *
 * The driver supports the DDI UFM framework for reporting information about
 * the device's firmware image and slot configuration. This data can be
 * queried by userland software via ioctls to the ufm driver. For more
 * information, see ddi_ufm(9E).
 *
 *
 * Driver Configuration:
 *
 * The following driver properties can be changed to control some aspects of the
 * driver's operation (an illustrative nvme.conf fragment follows this list):
 * - strict-version: can be set to 0 to allow devices conforming to newer
 *   major versions to be used
 * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
 *   specific command status as a fatal error leading to device faulting
 * - admin-queue-len: the maximum length of the admin queue (16-4096)
 * - io-squeue-len: the maximum length of the I/O submission queues (16-65536)
 * - io-cqueue-len: the maximum length of the I/O completion queues (16-65536)
 * - async-event-limit: the maximum number of asynchronous event requests to be
 *   posted by the driver
 * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
 *   cache
 * - min-phys-block-size: the minimum physical block size to report to blkdev,
 *   which is among other things the basis for ZFS vdev ashift
 * - max-submission-queues: the maximum number of I/O submission queues.
 * - max-completion-queues: the maximum number of I/O completion queues,
 *   can be less than max-submission-queues, in which case the completion
 *   queues are shared.
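 *
 * These properties are typically set in the driver's nvme.conf file. An
 * illustrative fragment, with example values only (not recommendations or
 * driver defaults), might look like:
 *
 *	admin-queue-len=256;
 *	io-squeue-len=1024;
 *	io-cqueue-len=2048;
 *	max-submission-queues=8;
 *	max-completion-queues=8;
 *	min-phys-block-size=4096;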
 *
 *
 * TODO:
 * - figure out sane default for I/O queue depth reported to blkdev
 * - FMA handling of media errors
 * - support for devices supporting very large I/O requests using chained PRPs
 * - support for configuring hardware parameters like interrupt coalescing
 * - support for media formatting and hard partitioning into namespaces
 * - support for big-endian systems
 * - support for fast reboot
 * - support for NVMe Subsystem Reset (1.1)
 * - support for Scatter/Gather lists (1.1)
 * - support for Reservations (1.1)
 * - support for power management
 */

#include <sys/byteorder.h>
#ifdef _BIG_ENDIAN
#error nvme driver needs porting for big-endian platforms
#endif

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ddi.h>
#include <sys/ddi_ufm.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/varargs.h>
#include <sys/cpuvar.h>
#include <sys/disp.h>
#include <sys/blkdev.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/sata/sata_hba.h>
#include <sys/stat.h>
#include <sys/policy.h>
#include <sys/list.h>
#include <sys/dkio.h>

#include <sys/nvme.h>

#ifdef __x86
#include <sys/x86_archext.h>
#endif

#include "nvme_reg.h"
#include "nvme_var.h"

/*
 * Assertions to make sure that we've properly captured various aspects of the
 * packed structures and haven't broken them during updates.
 */
CTASSERT(sizeof (nvme_identify_ctrl_t) == 0x1000);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_oacs) == 256);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_sqes) == 512);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_oncs) == 520);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_subnqn) == 768);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_nvmof) == 1792);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_psd) == 2048);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_vs) == 3072);

CTASSERT(sizeof (nvme_identify_nsid_t) == 0x1000);
CTASSERT(offsetof(nvme_identify_nsid_t, id_fpi) == 32);
CTASSERT(offsetof(nvme_identify_nsid_t, id_nguid) == 104);
CTASSERT(offsetof(nvme_identify_nsid_t, id_lbaf) == 128);
CTASSERT(offsetof(nvme_identify_nsid_t, id_vs) == 384);

CTASSERT(sizeof (nvme_identify_primary_caps_t) == 0x1000);
CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vqfrt) == 32);
CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vifrt) == 64);


/* NVMe spec version supported */
static const int nvme_version_major = 1;

/* tunable for admin command timeout in seconds, default is 1s */
int nvme_admin_cmd_timeout = 1;

/* tunable for FORMAT NVM command timeout in seconds, default is 600s */
int nvme_format_cmd_timeout = 600;

/* tunable for firmware commit with NVME_FWC_SAVE, default is 15s */
int nvme_commit_save_cmd_timeout = 15;

static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
static int nvme_quiesce(dev_info_t *);
static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
static int nvme_setup_interrupts(nvme_t *, int, int);
static void nvme_release_interrupts(nvme_t *);
static uint_t nvme_intr(caddr_t, caddr_t);

static void nvme_shutdown(nvme_t *, int, boolean_t);
static boolean_t nvme_reset(nvme_t *, boolean_t);
static int nvme_init(nvme_t *);
static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
static void nvme_free_cmd(nvme_cmd_t *);
static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
    bd_xfer_t *);
static void nvme_admin_cmd(nvme_cmd_t *, int);
static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *);
static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *);
static nvme_cmd_t *nvme_unqueue_cmd(nvme_t *, nvme_qpair_t *, int);
static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
static void nvme_wait_cmd(nvme_cmd_t *, uint_t);
static void nvme_wakeup_cmd(void *);
static void nvme_async_event_task(void *);

static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
static int nvme_check_specific_cmd_status(nvme_cmd_t *);
static int nvme_check_generic_cmd_status(nvme_cmd_t *);
static inline int nvme_check_cmd_status(nvme_cmd_t *);

static int nvme_abort_cmd(nvme_cmd_t *, uint_t);
static void nvme_async_event(nvme_t *);
static int nvme_format_nvm(nvme_t *, boolean_t, uint32_t, uint8_t, boolean_t,
    uint8_t, boolean_t, uint8_t);
static int nvme_get_logpage(nvme_t *, boolean_t, void **, size_t *, uint8_t,
    ...);
static int nvme_identify(nvme_t *, boolean_t, uint32_t, void **);
static int nvme_set_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t,
    uint32_t *);
static int nvme_get_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t *,
    void **, size_t *);
static int nvme_write_cache_set(nvme_t *, boolean_t);
static int nvme_set_nqueues(nvme_t *);

static void nvme_free_dma(nvme_dma_t *);
static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
    nvme_dma_t **);
static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
    nvme_dma_t **);
static void nvme_free_qpair(nvme_qpair_t *);
static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, uint_t);
static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);

static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
static inline uint32_t nvme_get32(nvme_t *, uintptr_t);

static boolean_t nvme_check_regs_hdl(nvme_t *);
static boolean_t nvme_check_dma_hdl(nvme_dma_t *);

static int nvme_fill_prp(nvme_cmd_t *, bd_xfer_t *);

static void nvme_bd_xfer_done(void *);
static void nvme_bd_driveinfo(void *, bd_drive_t *);
static int nvme_bd_mediainfo(void *, bd_media_t *);
static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t);
static int nvme_bd_read(void *, bd_xfer_t *);
static int nvme_bd_write(void *, bd_xfer_t *);
static int nvme_bd_sync(void *, bd_xfer_t *);
static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);
static int nvme_bd_free_space(void *, bd_xfer_t *);

static int nvme_prp_dma_constructor(void *, void *, int);
static void nvme_prp_dma_destructor(void *, void *);

static void nvme_prepare_devid(nvme_t *, uint32_t);

/* DDI UFM callbacks */
static int nvme_ufm_fill_image(ddi_ufm_handle_t *, void *, uint_t,
    ddi_ufm_image_t *);
static int nvme_ufm_fill_slot(ddi_ufm_handle_t *, void *, uint_t, uint_t,
    ddi_ufm_slot_t *);
static int nvme_ufm_getcaps(ddi_ufm_handle_t *, void *, ddi_ufm_cap_t *);

static int nvme_open(dev_t *, int, int, cred_t *);
static int nvme_close(dev_t, int, int, cred_t *);
static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);

static ddi_ufm_ops_t nvme_ufm_ops = {
	NULL,
	nvme_ufm_fill_image,
	nvme_ufm_fill_slot,
	nvme_ufm_getcaps
};

#define	NVME_MINOR_INST_SHIFT	9
#define	NVME_MINOR(inst, nsid)	(((inst) << NVME_MINOR_INST_SHIFT) | (nsid))
#define	NVME_MINOR_INST(minor)	((minor) >> NVME_MINOR_INST_SHIFT)
#define	NVME_MINOR_NSID(minor)	((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1))
#define	NVME_MINOR_MAX		(NVME_MINOR(1, 0) - 2)
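
/*
 * Example of the encoding above (purely arithmetic, derived from the macros):
 * NVME_MINOR(2, 3) == (2 << 9) | 3 == 1027, and the inverse macros recover
 * NVME_MINOR_INST(1027) == 2 and NVME_MINOR_NSID(1027) == 3.
 */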

static void *nvme_state;
static kmem_cache_t *nvme_cmd_cache;

/*
 * DMA attributes for queue DMA memory
 *
 * Queue DMA memory must be page aligned. The maximum length of a queue is
 * 65536 entries, and an entry can be 64 bytes long.
 */
static ddi_dma_attr_t nvme_queue_dma_attr = {
	.dma_attr_version	= DMA_ATTR_V0,
	.dma_attr_addr_lo	= 0,
	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
	.dma_attr_count_max	= (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1,
	.dma_attr_align		= 0x1000,
	.dma_attr_burstsizes	= 0x7ff,
	.dma_attr_minxfer	= 0x1000,
	.dma_attr_maxxfer	= (UINT16_MAX + 1) * sizeof (nvme_sqe_t),
	.dma_attr_seg		= 0xffffffffffffffffULL,
	.dma_attr_sgllen	= 1,
	.dma_attr_granular	= 1,
	.dma_attr_flags		= 0,
};

/*
 * DMA attributes for transfers using Physical Region Page (PRP) entries
 *
 * A PRP entry describes one page of DMA memory using the page size specified
 * in the controller configuration's memory page size register (CC.MPS). It uses
 * a 64bit base address aligned to this page size. There is no limitation on
 * chaining PRPs together for arbitrarily large DMA transfers.
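 *
 * For example (per the NVMe specification rather than anything specific to
 * this file), with CC.MPS set to 4k a page-aligned 8k transfer fits into the
 * two PRP entries of the command itself, while larger transfers chain
 * additional PRP list pages.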
 */
static ddi_dma_attr_t nvme_prp_dma_attr = {
	.dma_attr_version	= DMA_ATTR_V0,
	.dma_attr_addr_lo	= 0,
	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
	.dma_attr_count_max	= 0xfff,
	.dma_attr_align		= 0x1000,
	.dma_attr_burstsizes	= 0x7ff,
	.dma_attr_minxfer	= 0x1000,
	.dma_attr_maxxfer	= 0x1000,
	.dma_attr_seg		= 0xfff,
	.dma_attr_sgllen	= -1,
	.dma_attr_granular	= 1,
	.dma_attr_flags		= 0,
};

/*
 * DMA attributes for transfers using scatter/gather lists
 *
 * An SGL entry describes a chunk of DMA memory using a 64bit base address and a
 * 32bit length field. SGL Segment and SGL Last Segment entries require the
 * length to be a multiple of 16 bytes.
 */
static ddi_dma_attr_t nvme_sgl_dma_attr = {
	.dma_attr_version	= DMA_ATTR_V0,
	.dma_attr_addr_lo	= 0,
	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
	.dma_attr_count_max	= 0xffffffffUL,
	.dma_attr_align		= 1,
	.dma_attr_burstsizes	= 0x7ff,
	.dma_attr_minxfer	= 0x10,
	.dma_attr_maxxfer	= 0xfffffffffULL,
	.dma_attr_seg		= 0xffffffffffffffffULL,
	.dma_attr_sgllen	= -1,
	.dma_attr_granular	= 0x10,
	.dma_attr_flags		= 0
};

static ddi_device_acc_attr_t nvme_reg_acc_attr = {
	.devacc_attr_version	= DDI_DEVICE_ATTR_V0,
	.devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
	.devacc_attr_dataorder	= DDI_STRICTORDER_ACC
};

static struct cb_ops nvme_cb_ops = {
	.cb_open	= nvme_open,
	.cb_close	= nvme_close,
	.cb_strategy	= nodev,
	.cb_print	= nodev,
	.cb_dump	= nodev,
	.cb_read	= nodev,
	.cb_write	= nodev,
	.cb_ioctl	= nvme_ioctl,
	.cb_devmap	= nodev,
	.cb_mmap	= nodev,
	.cb_segmap	= nodev,
	.cb_chpoll	= nochpoll,
	.cb_prop_op	= ddi_prop_op,
	.cb_str		= 0,
	.cb_flag	= D_NEW | D_MP,
	.cb_rev		= CB_REV,
	.cb_aread	= nodev,
	.cb_awrite	= nodev
};

static struct dev_ops nvme_dev_ops = {
	.devo_rev	= DEVO_REV,
	.devo_refcnt	= 0,
	.devo_getinfo	= ddi_no_info,
	.devo_identify	= nulldev,
	.devo_probe	= nulldev,
	.devo_attach	= nvme_attach,
	.devo_detach	= nvme_detach,
	.devo_reset	= nodev,
	.devo_cb_ops	= &nvme_cb_ops,
	.devo_bus_ops	= NULL,
	.devo_power	= NULL,
	.devo_quiesce	= nvme_quiesce,
};

static struct modldrv nvme_modldrv = {
	.drv_modops	= &mod_driverops,
	.drv_linkinfo	= "NVMe v1.1b",
	.drv_dev_ops	= &nvme_dev_ops
};

static struct modlinkage nvme_modlinkage = {
	.ml_rev		= MODREV_1,
	.ml_linkage	= { &nvme_modldrv, NULL }
};

static bd_ops_t nvme_bd_ops = {
	.o_version	= BD_OPS_CURRENT_VERSION,
	.o_drive_info	= nvme_bd_driveinfo,
	.o_media_info	= nvme_bd_mediainfo,
	.o_devid_init	= nvme_bd_devid,
	.o_sync_cache	= nvme_bd_sync,
	.o_read		= nvme_bd_read,
	.o_write	= nvme_bd_write,
	.o_free_space	= nvme_bd_free_space,
};

/*
 * This list will hold commands that have timed out and couldn't be aborted.
 * As we don't know what the hardware may still do with the DMA memory we can't
 * free them, so we'll keep them forever on this list where we can easily look
 * at them with mdb.
 */
static struct list nvme_lost_cmds;
static kmutex_t nvme_lc_mutex;

int
_init(void)
{
	int error;

	error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
	if (error != DDI_SUCCESS)
		return (error);

	nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
	    sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);

	mutex_init(&nvme_lc_mutex, NULL, MUTEX_DRIVER, NULL);
	list_create(&nvme_lost_cmds, sizeof (nvme_cmd_t),
	    offsetof(nvme_cmd_t, nc_list));

	bd_mod_init(&nvme_dev_ops);

	error = mod_install(&nvme_modlinkage);
	if (error != DDI_SUCCESS) {
		ddi_soft_state_fini(&nvme_state);
		mutex_destroy(&nvme_lc_mutex);
		list_destroy(&nvme_lost_cmds);
		bd_mod_fini(&nvme_dev_ops);
	}

	return (error);
}

int
_fini(void)
{
	int error;

	if (!list_is_empty(&nvme_lost_cmds))
		return (DDI_FAILURE);

	error = mod_remove(&nvme_modlinkage);
	if (error == DDI_SUCCESS) {
		ddi_soft_state_fini(&nvme_state);
		kmem_cache_destroy(nvme_cmd_cache);
		mutex_destroy(&nvme_lc_mutex);
		list_destroy(&nvme_lost_cmds);
		bd_mod_fini(&nvme_dev_ops);
	}

	return (error);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&nvme_modlinkage, modinfop));
}

static inline void
nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
{
	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
}

static inline void
nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val)
{
	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val);
}

static inline uint64_t
nvme_get64(nvme_t *nvme, uintptr_t reg)
{
	uint64_t val;

	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg));

	return (val);
}

static inline uint32_t
nvme_get32(nvme_t *nvme, uintptr_t reg)
{
	uint32_t val;

	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg));

	return (val);
}

static boolean_t
nvme_check_regs_hdl(nvme_t *nvme)
{
	ddi_fm_error_t error;

	ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION);

	if (error.fme_status != DDI_FM_OK)
		return (B_TRUE);

	return (B_FALSE);
}

static boolean_t
nvme_check_dma_hdl(nvme_dma_t *dma)
{
	ddi_fm_error_t error;

	if (dma == NULL)
		return (B_FALSE);

	ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION);

	if (error.fme_status != DDI_FM_OK)
		return (B_TRUE);

	return (B_FALSE);
}

static void
nvme_free_dma_common(nvme_dma_t *dma)
{
	if (dma->nd_dmah != NULL)
		(void) ddi_dma_unbind_handle(dma->nd_dmah);
	if (dma->nd_acch != NULL)
		ddi_dma_mem_free(&dma->nd_acch);
	if (dma->nd_dmah != NULL)
		ddi_dma_free_handle(&dma->nd_dmah);
}

static void
nvme_free_dma(nvme_dma_t *dma)
{
	nvme_free_dma_common(dma);
	kmem_free(dma, sizeof (*dma));
}

/* ARGSUSED */
static void
nvme_prp_dma_destructor(void *buf, void *private)
{
	nvme_dma_t *dma = (nvme_dma_t *)buf;

	nvme_free_dma_common(dma);
}

static int
nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma,
    size_t len, uint_t flags, ddi_dma_attr_t *dma_attr)
{
	if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL,
	    &dma->nd_dmah) != DDI_SUCCESS) {
		/*
		 * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and
		 * the only other possible error is DDI_DMA_BADATTR which
		 * indicates a driver bug which should cause a panic.
		 */
		dev_err(nvme->n_dip, CE_PANIC,
		    "!failed to get DMA handle, check DMA attributes");
		return (DDI_FAILURE);
	}

	/*
	 * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified
	 * or the flags are conflicting, which isn't the case here.
	 */
	(void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr,
	    DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp,
	    &dma->nd_len, &dma->nd_acch);

	if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp,
	    dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
	    &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to bind DMA memory");
		atomic_inc_32(&nvme->n_dma_bind_err);
		nvme_free_dma_common(dma);
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

static int
nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags,
    ddi_dma_attr_t *dma_attr, nvme_dma_t **ret)
{
	nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP);

	if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) !=
	    DDI_SUCCESS) {
		*ret = NULL;
		kmem_free(dma, sizeof (nvme_dma_t));
		return (DDI_FAILURE);
	}

	bzero(dma->nd_memp, dma->nd_len);

	*ret = dma;
	return (DDI_SUCCESS);
}

/* ARGSUSED */
static int
nvme_prp_dma_constructor(void *buf, void *private, int flags)
{
	nvme_dma_t *dma = (nvme_dma_t *)buf;
	nvme_t *nvme = (nvme_t *)private;

	dma->nd_dmah = NULL;
	dma->nd_acch = NULL;

	if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize,
	    DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) {
		return (-1);
	}

	ASSERT(dma->nd_ncookie == 1);

	dma->nd_cached = B_TRUE;

	return (0);
}

static int
nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len,
    uint_t flags, nvme_dma_t **dma)
{
	uint32_t len = nentry * qe_len;
	ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr;

	len = roundup(len, nvme->n_pagesize);

	if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma)
	    != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to get DMA memory for queue");
		goto fail;
	}

	if ((*dma)->nd_ncookie != 1) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!got too many cookies for queue DMA");
		goto fail;
	}

	return (DDI_SUCCESS);

fail:
	if (*dma) {
		nvme_free_dma(*dma);
		*dma = NULL;
	}

	return (DDI_FAILURE);
}

static void
nvme_free_cq(nvme_cq_t *cq)
{
	mutex_destroy(&cq->ncq_mutex);

	if (cq->ncq_cmd_taskq != NULL)
		taskq_destroy(cq->ncq_cmd_taskq);

	if (cq->ncq_dma != NULL)
		nvme_free_dma(cq->ncq_dma);

	kmem_free(cq, sizeof (*cq));
}

static void
nvme_free_qpair(nvme_qpair_t *qp)
{
	int i;

	mutex_destroy(&qp->nq_mutex);
	sema_destroy(&qp->nq_sema);

	if (qp->nq_sqdma != NULL)
		nvme_free_dma(qp->nq_sqdma);

	if (qp->nq_active_cmds > 0)