/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2016 The MathWorks, Inc. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2019 Unix Software Ltd.
 * Copyright 2024 Oxide Computer Company.
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
 */

#ifndef _NVME_VAR_H
#define _NVME_VAR_H

/*
 * NOTE(review): the seven #include directives below carry no header names in
 * this copy of the file. The names need to be restored from the authoritative
 * source before this header can be built; they are left untouched here rather
 * than guessed at.
 */
#include
#include
#include
#include
#include
#include
#include

/*
 * NVMe driver state
 */

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Bit flags kept in the n_progress member of nvme_t; each enumerator occupies
 * its own bit so that several may be set at once.
 * NOTE(review): presumably these record which initialization steps have been
 * completed so that teardown knows what to undo -- confirm against the
 * attach/detach code.
 */
typedef enum {
	NVME_PCI_CONFIG = 1 << 0,
	NVME_FMA_INIT = 1 << 1,
	NVME_REGS_MAPPED = 1 << 2,
	NVME_ADMIN_QUEUE = 1 << 3,
	NVME_CTRL_LIMITS = 1 << 4,
	NVME_INTERRUPTS = 1 << 5,
	NVME_UFM_INIT = 1 << 6,
	NVME_MUTEX_INIT = 1 << 7,
	NVME_MGMT_INIT = 1 << 8
} nvme_progress_t;

/*
 * Bit flags kept in the ns_progress member of nvme_namespace_t.
 */
typedef enum {
	NVME_NS_LOCK = 1 << 0
} nvme_ns_progress_t;

/*
 * Bit flags kept in the n_quirks member of nvme_t, describing device-specific
 * workarounds.
 */
typedef enum {
	/*
	 * The controller fails to properly process commands on the admin queue
	 * if the first one has CID 0. Subsequent use of CID 0 doesn't present
	 * a problem.
	 */
	NVME_QUIRK_START_CID = 1 << 0,
} nvme_quirk_t;

/*
 * Minimum and default values for queue lengths, the asynchronous event limit
 * and the minimum block size.
 * NOTE(review): presumably these bound and seed the correspondingly named
 * members of nvme_t (n_admin_queue_len, n_io_squeue_len, n_io_cqueue_len,
 * n_async_event_limit, n_min_block_size) -- confirm against the driver.
 */
#define NVME_MIN_ADMIN_QUEUE_LEN 16
#define NVME_MIN_IO_QUEUE_LEN 16
#define NVME_DEFAULT_ADMIN_QUEUE_LEN 256
#define NVME_DEFAULT_IO_QUEUE_LEN 1024
#define NVME_DEFAULT_ASYNC_EVENT_LIMIT 10
#define NVME_MIN_ASYNC_EVENT_LIMIT 1
#define NVME_DEFAULT_MIN_BLOCK_SIZE 512

/*
 * Forward typedefs. The structures below refer to one another by pointer
 * before their definitions appear, so the names are introduced up front.
 */
typedef struct nvme nvme_t;
typedef struct nvme_namespace nvme_namespace_t;
typedef struct nvme_minor nvme_minor_t;
typedef struct nvme_lock nvme_lock_t;
typedef struct nvme_minor_lock_info nvme_minor_lock_info_t;
typedef struct nvme_dma nvme_dma_t;
typedef struct nvme_cmd nvme_cmd_t;
typedef struct nvme_cq nvme_cq_t;
typedef struct nvme_qpair nvme_qpair_t;
typedef struct nvme_task_arg nvme_task_arg_t;

/*
 * These states represent the minor's perspective. That is, of a minor's
 * namespace and controller lock, where is it?
 */
typedef enum {
	NVME_LOCK_STATE_UNLOCKED = 0,
	NVME_LOCK_STATE_BLOCKED,
	NVME_LOCK_STATE_ACQUIRED
} nvme_minor_lock_state_t;

/*
 * A minor's view of a single lock. Every nvme_minor_t embeds two of these:
 * one for the controller lock (nm_ctrl_lock) and one for the namespace lock
 * (nm_ns_lock).
 */
struct nvme_minor_lock_info {
	/*
	 * NOTE(review): nli_node is presumably the linkage used to place this
	 * entry on one of the list_t members of nvme_lock_t -- confirm.
	 */
	list_node_t nli_node;
	nvme_lock_t *nli_lock;
	nvme_minor_lock_state_t nli_state;
	nvme_lock_level_t nli_curlevel;
	/*
	 * While the minor points back to itself and the nvme_t should always
	 * point to the current controller, the namespace should only point to
	 * one if this is a particular namespace lock. The former two are
	 * initialized at minor initialization time.
	 */
	nvme_minor_t *nli_minor;
	nvme_t *nli_nvme;
	nvme_namespace_t *nli_ns;
	/*
	 * This is the common ioctl information that should be filled in when
	 * we're being woken up for any reason other than an interrupted signal.
	 * This should only be set while blocking.
	 */
	nvme_ioctl_common_t *nli_ioc;
	/*
	 * The following are provided for debugging purposes. In particular,
	 * information like the kthread_t and related that performed this should
	 * be considered suspect as it represents who took the operation, not
	 * who performed the operation (unless we're actively blocking).
	 */
	hrtime_t nli_last_change;
	uintptr_t nli_acq_kthread;
	pid_t nli_acq_pid;
};

/*
 * State for one open-related minor, tying it to a controller and, optionally,
 * a namespace, along with that minor's locking state.
 */
struct nvme_minor {
	/*
	 * The following three fields are set when this is created.
	 */
	id_t nm_minor;
	nvme_t *nm_ctrl;
	nvme_namespace_t *nm_ns;
	/*
	 * This link is used to index this minor on the global list of active
	 * open-related minors. This is only manipulated under the
	 * nvme_open_minors_mutex.
	 */
	avl_node_t nm_avl;
	/*
	 * Information related to locking. Note, there is no pointer to a locked
	 * controller as the only one can be the one specified here. This data
	 * is protected by the controller's n_minor_mutex.
	 */
	kcondvar_t nm_cv;
	nvme_minor_lock_info_t nm_ctrl_lock;
	nvme_minor_lock_info_t nm_ns_lock;
};

/*
 * State of a single lock. One is embedded in each nvme_t (n_lock) and one in
 * each nvme_namespace_t (ns_lock). It tracks a single writer along with lists
 * of readers, pending readers and pending writers.
 */
struct nvme_lock {
	nvme_minor_lock_info_t *nl_writer;
	list_t nl_readers;
	list_t nl_pend_readers;
	list_t nl_pend_writers;
	/*
	 * The following are stats to indicate how often certain locking
	 * activities have occurred for debugging purposes.
	 */
	uint32_t nl_nwrite_locks;
	uint32_t nl_nread_locks;
	uint32_t nl_npend_writes;
	uint32_t nl_npend_reads;
	uint32_t nl_nnonblock;
	uint32_t nl_nsignals;
	uint32_t nl_nsig_unlock;
	uint32_t nl_nsig_blocks;
	uint32_t nl_nsig_acq;
};

/*
 * Bookkeeping for one DMA buffer: the DDI DMA and access handles, cookie
 * information, and the memory pointer and length.
 * NOTE(review): nd_cached presumably marks buffers that come from a kmem
 * cache (see n_prp_cache in nvme_t) -- confirm against the allocation code.
 */
struct nvme_dma {
	ddi_dma_handle_t nd_dmah;
	ddi_acc_handle_t nd_acch;
	ddi_dma_cookie_t nd_cookie;
	uint_t nd_ncookie;
	caddr_t nd_memp;
	size_t nd_len;
	boolean_t nd_cached;
};

/*
 * State for a single command: its submission and completion queue entries,
 * an optional callback and blkdev transfer, DMA resources, and the
 * synchronization objects associated with it.
 */
struct nvme_cmd {
	struct list_node nc_list;

	nvme_sqe_t nc_sqe;
	nvme_cqe_t nc_cqe;

	void (*nc_callback)(void *);
	bd_xfer_t *nc_xfer;
	boolean_t nc_completed;
	boolean_t nc_dontpanic;
	uint16_t nc_sqid;

	nvme_dma_t *nc_dma;
	nvme_dma_t *nc_prp; /* DMA for PRP lists */

	kmutex_t nc_mutex;
	kcondvar_t nc_cv;

	taskq_ent_t nc_tqent;
	nvme_t *nc_nvme;
};

/*
 * State for one completion queue. Queue pairs refer to their completion queue
 * through the nq_cq pointer in nvme_qpair_t.
 */
struct nvme_cq {
	size_t ncq_nentry;
	uint16_t ncq_id;

	nvme_dma_t *ncq_dma;
	nvme_cqe_t *ncq_cq;
	uint_t ncq_head;
	uint_t ncq_tail;
	uintptr_t ncq_hdbl;
	int ncq_phase;

	taskq_t *ncq_cmd_taskq;

	kmutex_t ncq_mutex;
};

/*
 * State for one queue pair: a submission queue, a pointer to the completion
 * queue it uses, and the array of commands currently active on it.
 */
struct nvme_qpair {
	size_t nq_nentry;

	/* submission fields */
	nvme_dma_t *nq_sqdma;
	nvme_sqe_t *nq_sq;
	uint_t nq_sqhead;
	uint_t nq_sqtail;
	uintptr_t nq_sqtdbl;

	/* completion */
	nvme_cq_t *nq_cq;

	/* shared structures for completion and submission */
	nvme_cmd_t **nq_cmd; /* active command array */
	uint16_t nq_next_cmd; /* next potential empty queue slot */
	uint_t nq_active_cmds; /* number of active cmds */

	kmutex_t nq_mutex; /* protects shared state */
	ksema_t nq_sema; /* semaphore to ensure q always has >= 1 empty slot */
};

/*
 * State for a single NVMe controller instance.
 */
struct nvme {
	dev_info_t *n_dip;
	nvme_progress_t n_progress;
	nvme_quirk_t n_quirks;

	caddr_t n_regs;
	ddi_acc_handle_t n_regh;

	kmem_cache_t *n_cmd_cache;
	kmem_cache_t *n_prp_cache;

	size_t n_inth_sz;
	ddi_intr_handle_t *n_inth;
	int n_intr_cnt;
	uint_t n_intr_pri;
	int n_intr_cap;
	int n_intr_type;
	int n_intr_types;

	ddi_acc_handle_t n_pcicfg_handle;
	uint16_t n_vendor_id;
	uint16_t n_device_id;
	uint16_t n_subsystem_vendor_id;
	uint16_t n_subsystem_device_id;
	uint8_t n_revision_id;

	char *n_product;
	char *n_vendor;

	nvme_version_t n_version;
	boolean_t n_dead;
	nvme_ioctl_errno_t n_dead_status;
	taskq_ent_t n_dead_tqent;
	boolean_t n_strict_version;
	boolean_t n_ignore_unknown_vendor_status;
	uint32_t n_admin_queue_len;
	uint32_t n_io_squeue_len;
	uint32_t n_io_cqueue_len;
	uint16_t n_async_event_limit;
	uint_t n_min_block_size;
	uint16_t n_abort_command_limit;
	uint64_t n_max_data_transfer_size;
	boolean_t n_write_cache_present;
	boolean_t n_write_cache_enabled;
	int n_error_log_len;
	boolean_t n_async_event_supported;
	int n_submission_queues;
	int n_completion_queues;

	int n_nssr_supported;
	int n_doorbell_stride;
	int n_timeout;
	int n_arbitration_mechanisms;
	int n_cont_queues_reqd;
	int n_max_queue_entries;
	int n_pageshift;
	int n_pagesize;

	uint32_t n_namespace_count;
	uint_t n_namespaces_attachable;
	uint_t n_ioq_count;
	uint_t n_cq_count;

	/*
	 * This is cached identify controller and common namespace data that
	 * exists in the system. This generally can be used in the kernel;
	 * however, we have to be careful about what we use here because these
	 * values are not refreshed after attach. Therefore these are good for
	 * answering the question what does the controller support or what is in
	 * the common namespace information, but not otherwise. That means you
	 * shouldn't use this to try to answer how much capacity is still in the
	 * controller because this information is just cached.
	 */
	nvme_identify_ctrl_t *n_idctl;
	nvme_identify_nsid_t *n_idcomns;

	/* Pointer to the admin queue, which is always queue 0 in n_ioq. */
	nvme_qpair_t *n_adminq;
	/*
	 * All command queues, including the admin queue.
	 * Its length is: n_ioq_count + 1.
	 */
	nvme_qpair_t **n_ioq;
	nvme_cq_t **n_cq;

	nvme_namespace_t *n_ns;

	ddi_dma_attr_t n_queue_dma_attr;
	ddi_dma_attr_t n_prp_dma_attr;
	ddi_dma_attr_t n_sgl_dma_attr;
	ddi_device_acc_attr_t n_reg_acc_attr;
	ddi_iblock_cookie_t n_fm_ibc;
	int n_fm_cap;

	ksema_t n_abort_sema;

	/* protects namespace management operations */
	kmutex_t n_mgmt_mutex;

	/*
	 * This lock protects the minor node locking state across the controller
	 * and all related namespaces.
	 */
	kmutex_t n_minor_mutex;
	nvme_lock_t n_lock;

	/* errors detected by driver */
	uint32_t n_dma_bind_err;
	uint32_t n_abort_failed;
	uint32_t n_cmd_timeout;
	uint32_t n_cmd_aborted;
	uint32_t n_wrong_logpage;
	uint32_t n_unknown_logpage;
	uint32_t n_too_many_cookies;
	uint32_t n_unknown_cid;

	/* errors detected by hardware */
	uint32_t n_data_xfr_err;
	uint32_t n_internal_err;
	uint32_t n_abort_rq_err;
	uint32_t n_abort_sq_del;
	uint32_t n_nvm_cap_exc;
	uint32_t n_nvm_ns_notrdy;
	uint32_t n_nvm_ns_formatting;
	uint32_t n_inv_cq_err;
	uint32_t n_inv_qid_err;
	uint32_t n_max_qsz_exc;
	uint32_t n_inv_int_vect;
	uint32_t n_inv_log_page;
	uint32_t n_inv_format;
	uint32_t n_inv_q_del;
	uint32_t n_cnfl_attr;
	uint32_t n_inv_prot;
	uint32_t n_readonly;

	/* errors reported by asynchronous events */
	uint32_t n_diagfail_event;
	uint32_t n_persistent_event;
	uint32_t n_transient_event;
	uint32_t n_fw_load_event;
	uint32_t n_reliability_event;
	uint32_t n_temperature_event;
	uint32_t n_spare_event;
	uint32_t n_vendor_event;
	uint32_t n_notice_event;
	uint32_t n_unknown_event;

	/* hot removal NDI event handling */
	ddi_eventcookie_t n_rm_cookie;
	ddi_callback_id_t n_ev_rm_cb_id;

	/* DDI UFM handle */
	ddi_ufm_handle_t *n_ufmh;
	/* Cached Firmware Slot Information log page */
	nvme_fwslot_log_t *n_fwslot;
	/* Lock protecting the cached firmware slot info */
	kmutex_t n_fwslot_mutex;
};

/*
 * State for a single namespace. ns_nvme points back at the controller that
 * the namespace belongs to.
 */
struct nvme_namespace {
	nvme_t *ns_nvme;
	nvme_ns_progress_t ns_progress;
	uint8_t ns_eui64[8];
	uint8_t ns_nguid[16];
	char ns_name[11];

	bd_handle_t ns_bd_hdl;

	uint32_t ns_id;
	size_t ns_block_count;
	size_t ns_block_size;
	size_t ns_best_block_size;

	boolean_t ns_allocated;
	boolean_t ns_active;
	boolean_t ns_ignore;
	boolean_t ns_attached;

	nvme_identify_nsid_t *ns_idns;

	/*
	 * Namespace lock, see the theory statement for more information.
	 */
	nvme_lock_t ns_lock;

	/*
	 * If a namespace has neither NGUID nor EUI64, we create a devid in
	 * nvme_prepare_devid().
	 */
	char *ns_devid;
};

/*
 * Pairs a controller with a command.
 * NOTE(review): presumably used as the argument handed to a deferred task --
 * confirm against the users of nvme_task_arg_t.
 */
struct nvme_task_arg {
	nvme_t *nt_nvme;
	nvme_cmd_t *nt_cmd;
};

/*
 * The kind of exclusive access an ioctl requires; see the nck_excl member of
 * nvme_ioctl_check_t.
 */
typedef enum {
	/*
	 * This indicates that there is no exclusive access required for this
	 * operation. However, this operation will fail if someone attempts to
	 * perform this operation and someone else holds a write lock.
	 */
	NVME_IOCTL_EXCL_NONE = 0,
	/*
	 * This indicates that a write lock is required to perform the
	 * operation.
	 */
	NVME_IOCTL_EXCL_WRITE,
	/*
	 * This indicates that the exclusive check should be skipped. The only
	 * case this should be used in is the lock and unlock ioctls as they
	 * should be able to proceed even when the controller is being used
	 * exclusively.
	 */
	NVME_IOCTL_EXCL_SKIP
} nvme_ioctl_excl_t;

/*
 * This structure represents the set of checks that we apply to ioctl's using
 * the nvme_ioctl_common_t structure as part of validation.
 */
typedef struct nvme_ioctl_check {
	/*
	 * This indicates whether or not the command in question allows a
	 * namespace to be specified at all. If this is false, a namespace minor
	 * cannot be used and a controller minor must leave the nsid set to
	 * zero.
	 */
	boolean_t nck_ns_ok;
	/*
	 * This indicates that a minor node corresponding to a namespace is
	 * allowed to issue this.
	 */
	boolean_t nck_ns_minor_ok;
	/*
	 * This indicates that the controller should be skipped from all of the
	 * following processing behavior. That is, it's allowed to specify
	 * whatever it wants in the nsid field, regardless of whether it is
	 * valid or not. This is required for some of the Identify Command
	 * options that list endpoints. This should generally not be used and
	 * the driver should still validate the nuance here.
	 */
	boolean_t nck_skip_ctrl;
	/*
	 * This indicates that if we're on the controller's minor and we don't
	 * have an explicit namespace ID (i.e. 0), should the namespace be
	 * rewritten to be the broadcast namespace.
	 */
	boolean_t nck_ctrl_rewrite;
	/*
	 * This indicates whether or not the broadcast NSID is acceptable for
	 * the controller node.
	 */
	boolean_t nck_bcast_ok;

	/*
	 * This indicates to the lock checking code what kind of exclusive
	 * access is required. This check occurs after any namespace rewriting
	 * has occurred. When looking at exclusivity, a broadcast namespace or
	 * namespace 0 indicate that the controller is the target, otherwise the
	 * target namespace will be checked for a write lock.
	 */
	nvme_ioctl_excl_t nck_excl;
} nvme_ioctl_check_t;

/*
 * Constants
 */
extern uint_t nvme_vendor_specific_admin_cmd_max_timeout;
extern uint32_t nvme_vendor_specific_admin_cmd_size;

/*
 * Common functions.
 */
extern nvme_namespace_t *nvme_nsid2ns(nvme_t *, uint32_t);
extern boolean_t nvme_ioctl_error(nvme_ioctl_common_t *, nvme_ioctl_errno_t,
    uint32_t, uint32_t);
extern boolean_t nvme_ctrl_atleast(nvme_t *, const nvme_version_t *);
extern void nvme_ioctl_success(nvme_ioctl_common_t *);

/*
 * Validation related functions and kernel tunable limits.
 */
extern boolean_t nvme_validate_logpage(nvme_t *, nvme_ioctl_get_logpage_t *);
extern boolean_t nvme_validate_identify(nvme_t *, nvme_ioctl_identify_t *,
    boolean_t);
extern boolean_t nvme_validate_get_feature(nvme_t *,
    nvme_ioctl_get_feature_t *);
extern boolean_t nvme_validate_vuc(nvme_t *, nvme_ioctl_passthru_t *);
extern boolean_t nvme_validate_format(nvme_t *, nvme_ioctl_format_t *);
extern boolean_t nvme_validate_fw_load(nvme_t *, nvme_ioctl_fw_load_t *);
extern boolean_t nvme_validate_fw_commit(nvme_t *, nvme_ioctl_fw_commit_t *);

/*
 * Locking functions
 */
extern void nvme_rwlock(nvme_minor_t *, nvme_ioctl_lock_t *);
extern void nvme_rwunlock(nvme_minor_lock_info_t *, nvme_lock_t *);
extern void nvme_rwlock_ctrl_dead(void *);
extern void nvme_lock_init(nvme_lock_t *);
extern void nvme_lock_fini(nvme_lock_t *);

#ifdef __cplusplus
}
#endif

#endif /* _NVME_VAR_H */