/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2016 The MathWorks, Inc. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2019 Unix Software Ltd.
 * Copyright 2024 Oxide Computer Company.
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
 */

#ifndef _NVME_VAR_H
#define	_NVME_VAR_H

#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/blkdev.h>
#include <sys/taskq_impl.h>
#include <sys/list.h>
#include <sys/ddi_ufm.h>
#include <nvme_common.h>

/*
 * NVMe driver state
 */

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Bit flags held in the n_progress member of struct nvme. Every enumerator is
 * a distinct single bit, so any combination fits in one value.
 *
 * NOTE(review): presumably each flag marks an initialization step that has
 * completed, so that teardown knows what to undo -- confirm against the
 * attach/detach code.
 */
typedef enum {
	NVME_PCI_CONFIG			= 0x001,
	NVME_FMA_INIT			= 0x002,
	NVME_REGS_MAPPED		= 0x004,
	NVME_ADMIN_QUEUE		= 0x008,
	NVME_CTRL_LIMITS		= 0x010,
	NVME_INTERRUPTS			= 0x020,
	NVME_UFM_INIT			= 0x040,
	NVME_MUTEX_INIT			= 0x080,
	NVME_MGMT_INIT			= 0x100
} nvme_progress_t;

/*
 * Bit flags held in the ns_progress member of struct nvme_namespace.
 *
 * NOTE(review): presumably NVME_NS_LOCK records that the namespace's ns_lock
 * has been initialized -- confirm against the namespace setup code.
 */
typedef enum {
	NVME_NS_LOCK	= 0x1
} nvme_ns_progress_t;

/*
 * Bit flags describing controller-specific workarounds. These are held in the
 * n_quirks member of struct nvme.
 */
typedef enum {
	/*
	 * Admin queue workaround: the controller does not process admin
	 * commands correctly when the very first command submitted carries
	 * CID 0. Only the first command is affected; later use of CID 0 is
	 * fine.
	 */
	NVME_QUIRK_START_CID		= 0x1,
} nvme_quirk_t;

/*
 * Minimum and default values for the admin queue length, the I/O queue length,
 * the asynchronous event limit and the block size.
 *
 * NOTE(review): presumably these seed and bound the corresponding members of
 * struct nvme (n_admin_queue_len, n_io_squeue_len / n_io_cqueue_len,
 * n_async_event_limit, n_min_block_size) -- confirm against the driver source.
 */
#define	NVME_MIN_ADMIN_QUEUE_LEN	16
#define	NVME_MIN_IO_QUEUE_LEN		16
#define	NVME_DEFAULT_ADMIN_QUEUE_LEN	256
#define	NVME_DEFAULT_IO_QUEUE_LEN	1024
#define	NVME_DEFAULT_ASYNC_EVENT_LIMIT	10
#define	NVME_MIN_ASYNC_EVENT_LIMIT	1
#define	NVME_DEFAULT_MIN_BLOCK_SIZE	512


/*
 * Forward declarations. The structures below refer to one another through
 * pointers (for example, nvme_minor_lock_info_t points at nvme_lock_t,
 * nvme_minor_t, nvme_t and nvme_namespace_t), so the typedef names are
 * introduced before any of the structure bodies.
 */
typedef struct nvme nvme_t;
typedef struct nvme_namespace nvme_namespace_t;
typedef struct nvme_minor nvme_minor_t;
typedef struct nvme_lock nvme_lock_t;
typedef struct nvme_minor_lock_info nvme_minor_lock_info_t;
typedef struct nvme_dma nvme_dma_t;
typedef struct nvme_cmd nvme_cmd_t;
typedef struct nvme_cq nvme_cq_t;
typedef struct nvme_qpair nvme_qpair_t;
typedef struct nvme_task_arg nvme_task_arg_t;

/*
 * Lock state as seen from a minor. For each of a minor's two locks (the
 * controller lock and the namespace lock), this records where the minor
 * currently stands with respect to that lock.
 *
 * NOTE(review): the per-state descriptions below are inferred from the
 * enumerator names -- confirm against the locking implementation.
 */
typedef enum {
	/* The minor is not involved with the lock. */
	NVME_LOCK_STATE_UNLOCKED	= 0,
	/* The minor is waiting for the lock. */
	NVME_LOCK_STATE_BLOCKED		= 1,
	/* The minor holds the lock. */
	NVME_LOCK_STATE_ACQUIRED	= 2
} nvme_minor_lock_state_t;

/*
 * A minor's bookkeeping for a single lock. struct nvme_minor embeds two of
 * these: nm_ctrl_lock for the controller lock and nm_ns_lock for the
 * namespace lock.
 */
struct nvme_minor_lock_info {
	/*
	 * NOTE(review): presumably links this entry onto one of the list_t
	 * members of nvme_lock_t (nl_readers, nl_pend_readers or
	 * nl_pend_writers) -- confirm.
	 */
	list_node_t nli_node;
	/*
	 * The lock this entry refers to, the minor's state with respect to it,
	 * and the current lock level.
	 */
	nvme_lock_t *nli_lock;
	nvme_minor_lock_state_t nli_state;
	nvme_lock_level_t nli_curlevel;
	/*
	 * While the minor points back to itself and the nvme_t should always
	 * point to the current controller, the namespace should only point to
	 * one if this is a particular namespace lock. The former two are
	 * initialized at minor initialization time.
	 */
	nvme_minor_t *nli_minor;
	nvme_t *nli_nvme;
	nvme_namespace_t *nli_ns;
	/*
	 * This is the common ioctl information that should be filled in when
	 * we're being woken up for any reason other than an interrupted signal.
	 * This should only be set while blocking.
	 */
	nvme_ioctl_common_t *nli_ioc;
	/*
	 * The following are provided for debugging purposes. In particular,
	 * information like the kthread_t and related that performed this should
	 * be considered suspect as it represents who took the operation, not
	 * who performed the operation (unless we're actively blocking).
	 */
	hrtime_t nli_last_change;
	uintptr_t nli_acq_kthread;
	pid_t nli_acq_pid;
};

/*
 * State for one open-related minor: the minor number, the controller and
 * namespace it refers to, and the minor's view of the controller and
 * namespace locks.
 */
struct nvme_minor {
	/*
	 * The following three fields are set when this is created.
	 */
	id_t nm_minor;
	nvme_t *nm_ctrl;
	nvme_namespace_t *nm_ns;
	/*
	 * This link is used to index this minor on the global list of active
	 * open-related minors. This is only manipulated under the
	 * nvme_open_minors_mutex.
	 *
	 * NOTE(review): no <sys/avl.h> include is visible in this header, so
	 * avl_node_t presumably arrives through one of the other includes --
	 * confirm.
	 */
	avl_node_t nm_avl;
	/*
	 * Information related to locking. Note, there is no pointer to a locked
	 * controller as the only one can be the one specified here. This data
	 * is protected by the controller's n_minor_mutex.
	 */
	kcondvar_t nm_cv;
	nvme_minor_lock_info_t nm_ctrl_lock;
	nvme_minor_lock_info_t nm_ns_lock;
};

/*
 * State of one lock. struct nvme embeds one (n_lock) and struct nvme_namespace
 * embeds one (ns_lock). The lock tracks a single writer, a list of readers,
 * and lists of pending readers and pending writers.
 *
 * NOTE(review): the lists presumably hold nvme_minor_lock_info_t entries
 * linked through nli_node -- confirm against nvme_lock_init().
 */
struct nvme_lock {
	nvme_minor_lock_info_t *nl_writer;
	list_t nl_readers;
	list_t nl_pend_readers;
	list_t nl_pend_writers;
	/*
	 * The following are stats to indicate how often certain locking
	 * activities have occurred for debugging purposes.
	 */
	uint32_t nl_nwrite_locks;
	uint32_t nl_nread_locks;
	uint32_t nl_npend_writes;
	uint32_t nl_npend_reads;
	uint32_t nl_nnonblock;
	uint32_t nl_nsignals;
	uint32_t nl_nsig_unlock;
	uint32_t nl_nsig_blocks;
	uint32_t nl_nsig_acq;
};

/*
 * Bookkeeping for one DMA buffer: the DDI DMA and access handles, the DMA
 * cookie information, and the buffer's address and length.
 */
struct nvme_dma {
	ddi_dma_handle_t nd_dmah;
	ddi_acc_handle_t nd_acch;
	ddi_dma_cookie_t nd_cookie;
	uint_t nd_ncookie;
	caddr_t nd_memp;
	size_t nd_len;
	/*
	 * NOTE(review): presumably set when this buffer came from a kmem
	 * cache (see n_prp_cache in struct nvme) rather than a one-off
	 * allocation -- confirm.
	 */
	boolean_t nd_cached;
};

/*
 * A single command: its submission queue entry, its completion queue entry,
 * and the state used to track and complete it.
 */
struct nvme_cmd {
	struct list_node nc_list;

	nvme_sqe_t nc_sqe;
	nvme_cqe_t nc_cqe;

	/*
	 * Callback and block-device transfer associated with this command.
	 * NOTE(review): presumably nc_callback runs on completion and nc_xfer
	 * is only set for blkdev I/O -- confirm.
	 */
	void (*nc_callback)(void *);
	bd_xfer_t *nc_xfer;
	boolean_t nc_completed;
	boolean_t nc_dontpanic;
	/* NOTE(review): presumably the ID of the submission queue used. */
	uint16_t nc_sqid;

	nvme_dma_t *nc_dma;
	nvme_dma_t *nc_prp; /* DMA for PRP lists */

	/*
	 * NOTE(review): presumably used to wait for and signal completion of
	 * this command -- confirm.
	 */
	kmutex_t nc_mutex;
	kcondvar_t nc_cv;

	/* Task queue entry and back pointer to the owning controller. */
	taskq_ent_t nc_tqent;
	nvme_t *nc_nvme;
};

/*
 * State for one completion queue. A queue pair (struct nvme_qpair) refers to
 * its completion queue through nq_cq.
 */
struct nvme_cq {
	size_t ncq_nentry;
	uint16_t ncq_id;

	/* DMA memory backing the queue and the entry array within it. */
	nvme_dma_t *ncq_dma;
	nvme_cqe_t *ncq_cq;
	uint_t ncq_head;
	uint_t ncq_tail;
	/* NOTE(review): presumably the head doorbell register location. */
	uintptr_t ncq_hdbl;
	/*
	 * NOTE(review): presumably the phase tag value expected in newly
	 * posted completion entries -- confirm.
	 */
	int ncq_phase;

	taskq_t *ncq_cmd_taskq;

	kmutex_t ncq_mutex;
};

/*
 * A queue pair: the submission queue state, a pointer to the completion queue
 * it is paired with, and the array of commands currently active on it.
 */
struct nvme_qpair {
	size_t nq_nentry;

	/* submission fields */
	nvme_dma_t *nq_sqdma;
	nvme_sqe_t *nq_sq;
	uint_t nq_sqhead;
	uint_t nq_sqtail;
	/* NOTE(review): presumably the tail doorbell register location. */
	uintptr_t nq_sqtdbl;

	/* completion */
	nvme_cq_t *nq_cq;

	/* shared structures for completion and submission */
	nvme_cmd_t **nq_cmd;	/* active command array */
	uint16_t nq_next_cmd;	/* next potential empty queue slot */
	uint_t nq_active_cmds;	/* number of active cmds */

	kmutex_t nq_mutex;	/* protects shared state */
	ksema_t nq_sema; /* semaphore to ensure q always has >= 1 empty slot */
};

/*
 * Per-controller driver state. This gathers the device handles, interrupt and
 * PCI information, configuration values, queue and namespace pointers, locks,
 * and the error and event counters for a single controller.
 */
struct nvme {
	dev_info_t *n_dip;
	nvme_progress_t n_progress;
	nvme_quirk_t n_quirks;

	/* Mapped registers and the handle used to access them. */
	caddr_t n_regs;
	ddi_acc_handle_t n_regh;

	/* kmem caches. NOTE(review): presumably for nvme_cmd_t and PRP DMA. */
	kmem_cache_t *n_cmd_cache;
	kmem_cache_t *n_prp_cache;

	/* Interrupt handles and related interrupt information. */
	size_t n_inth_sz;
	ddi_intr_handle_t *n_inth;
	int n_intr_cnt;
	uint_t n_intr_pri;
	int n_intr_cap;
	int n_intr_type;
	int n_intr_types;

	/* PCI configuration space handle and device identification. */
	ddi_acc_handle_t n_pcicfg_handle;
	uint16_t n_vendor_id;
	uint16_t n_device_id;
	uint16_t n_subsystem_vendor_id;
	uint16_t n_subsystem_device_id;
	uint8_t n_revision_id;

	char *n_product;
	char *n_vendor;

	/*
	 * Controller version, dead-controller state, and assorted limits and
	 * settings. NOTE(review): which of these come from driver properties
	 * and which from the controller is not visible here -- confirm.
	 */
	nvme_version_t n_version;
	boolean_t n_dead;
	nvme_ioctl_errno_t n_dead_status;
	taskq_ent_t n_dead_tqent;
	boolean_t n_strict_version;
	boolean_t n_ignore_unknown_vendor_status;
	uint32_t n_admin_queue_len;
	uint32_t n_io_squeue_len;
	uint32_t n_io_cqueue_len;
	uint16_t n_async_event_limit;
	uint_t n_min_block_size;
	uint16_t n_abort_command_limit;
	uint64_t n_max_data_transfer_size;
	boolean_t n_write_cache_present;
	boolean_t n_write_cache_enabled;
	int n_error_log_len;
	boolean_t n_async_event_supported;
	int n_submission_queues;
	int n_completion_queues;

	/*
	 * NOTE(review): presumably values derived from the controller's
	 * capability register -- confirm.
	 */
	int n_nssr_supported;
	int n_doorbell_stride;
	int n_timeout;
	int n_arbitration_mechanisms;
	int n_cont_queues_reqd;
	int n_max_queue_entries;
	int n_pageshift;
	int n_pagesize;

	/* Namespace and queue counts. */
	uint32_t n_namespace_count;
	uint_t n_namespaces_attachable;
	uint_t n_ioq_count;
	uint_t n_cq_count;

	/*
	 * This is cached identify controller and common namespace data that
	 * exists in the system. This generally can be used in the kernel;
	 * however, we have to be careful about what we use here because these
	 * values are not refreshed after attach. Therefore these are good for
	 * answering the question what does the controller support or what is in
	 * the common namespace information, but not otherwise. That means you
	 * shouldn't use this to try to answer how much capacity is still in the
	 * controller because this information is just cached.
	 */
	nvme_identify_ctrl_t *n_idctl;
	nvme_identify_nsid_t *n_idcomns;

	/* Pointer to the admin queue, which is always queue 0 in n_ioq. */
	nvme_qpair_t *n_adminq;
	/*
	 * All command queues, including the admin queue.
	 * Its length is: n_ioq_count + 1.
	 */
	nvme_qpair_t **n_ioq;
	nvme_cq_t **n_cq;

	/* NOTE(review): presumably an array of n_namespace_count entries. */
	nvme_namespace_t *n_ns;

	/* DMA and register access attributes, and FMA state. */
	ddi_dma_attr_t n_queue_dma_attr;
	ddi_dma_attr_t n_prp_dma_attr;
	ddi_dma_attr_t n_sgl_dma_attr;
	ddi_device_acc_attr_t n_reg_acc_attr;
	ddi_iblock_cookie_t n_fm_ibc;
	int n_fm_cap;

	/*
	 * NOTE(review): presumably limits outstanding abort commands to
	 * n_abort_command_limit -- confirm.
	 */
	ksema_t n_abort_sema;

	/* protects namespace management operations */
	kmutex_t n_mgmt_mutex;

	/*
	 * This lock protects the minor node locking state across the controller
	 * and all related namespaces.
	 */
	kmutex_t n_minor_mutex;
	nvme_lock_t n_lock;

	/* errors detected by driver */
	uint32_t n_dma_bind_err;
	uint32_t n_abort_failed;
	uint32_t n_cmd_timeout;
	uint32_t n_cmd_aborted;
	uint32_t n_wrong_logpage;
	uint32_t n_unknown_logpage;
	uint32_t n_too_many_cookies;
	uint32_t n_unknown_cid;

	/* errors detected by hardware */
	uint32_t n_data_xfr_err;
	uint32_t n_internal_err;
	uint32_t n_abort_rq_err;
	uint32_t n_abort_sq_del;
	uint32_t n_nvm_cap_exc;
	uint32_t n_nvm_ns_notrdy;
	uint32_t n_nvm_ns_formatting;
	uint32_t n_inv_cq_err;
	uint32_t n_inv_qid_err;
	uint32_t n_max_qsz_exc;
	uint32_t n_inv_int_vect;
	uint32_t n_inv_log_page;
	uint32_t n_inv_format;
	uint32_t n_inv_q_del;
	uint32_t n_cnfl_attr;
	uint32_t n_inv_prot;
	uint32_t n_readonly;

	/* errors reported by asynchronous events */
	uint32_t n_diagfail_event;
	uint32_t n_persistent_event;
	uint32_t n_transient_event;
	uint32_t n_fw_load_event;
	uint32_t n_reliability_event;
	uint32_t n_temperature_event;
	uint32_t n_spare_event;
	uint32_t n_vendor_event;
	uint32_t n_notice_event;
	uint32_t n_unknown_event;

	/* hot removal NDI event handling */
	ddi_eventcookie_t n_rm_cookie;
	ddi_callback_id_t n_ev_rm_cb_id;

	/* DDI UFM handle */
	ddi_ufm_handle_t *n_ufmh;
	/* Cached Firmware Slot Information log page */
	nvme_fwslot_log_t *n_fwslot;
	/* Lock protecting the cached firmware slot info */
	kmutex_t n_fwslot_mutex;
};

/*
 * Per-namespace state: identifiers, block geometry, status flags, the cached
 * identify data and the namespace's lock.
 */
struct nvme_namespace {
	/* Back pointer to the owning controller. */
	nvme_t *ns_nvme;
	nvme_ns_progress_t ns_progress;
	uint8_t ns_eui64[8];
	uint8_t	ns_nguid[16];
	/*
	 * NOTE(review): 11 bytes presumably fits a 32-bit namespace ID printed
	 * in decimal (at most 10 digits) plus the terminating NUL -- confirm.
	 */
	char	ns_name[11];

	/* blkdev handle for this namespace. */
	bd_handle_t ns_bd_hdl;

	uint32_t ns_id;
	size_t ns_block_count;
	size_t ns_block_size;
	size_t ns_best_block_size;

	boolean_t ns_allocated;
	boolean_t ns_active;
	boolean_t ns_ignore;
	boolean_t ns_attached;

	nvme_identify_nsid_t *ns_idns;

	/*
	 * Namespace lock, see the theory statement for more information.
	 */
	nvme_lock_t ns_lock;

	/*
	 * If a namespace has neither NGUID nor EUI64, we create a devid in
	 * nvme_prepare_devid().
	 */
	char *ns_devid;
};

/*
 * Pairs a controller with a command.
 *
 * NOTE(review): presumably passed as the argument to a task queue function --
 * confirm against the users of nvme_task_arg_t.
 */
struct nvme_task_arg {
	nvme_t *nt_nvme;
	nvme_cmd_t *nt_cmd;
};

/*
 * The kind of exclusive access an ioctl needs. This is the type of the
 * nck_excl member of nvme_ioctl_check_t.
 */
typedef enum {
	/*
	 * The operation needs no exclusive access of its own. It still fails
	 * if it is attempted while someone else holds a write lock.
	 */
	NVME_IOCTL_EXCL_NONE	= 0,
	/*
	 * The operation may only be performed while holding a write lock.
	 */
	NVME_IOCTL_EXCL_WRITE	= 1,
	/*
	 * Skip the exclusive access check altogether. This is meant only for
	 * the lock and unlock ioctls, which must be able to proceed even while
	 * the controller is being used exclusively.
	 */
	NVME_IOCTL_EXCL_SKIP	= 2
} nvme_ioctl_excl_t;

/*
 * This structure represents the set of checks that we apply to ioctls using
 * the nvme_ioctl_common_t structure as part of validation.
 */
typedef struct nvme_ioctl_check {
	/*
	 * This indicates whether or not the command in question allows a
	 * namespace to be specified at all. If this is false, a namespace minor
	 * cannot be used and a controller minor must leave the nsid set to
	 * zero.
	 */
	boolean_t nck_ns_ok;
	/*
	 * This indicates that a minor node corresponding to a namespace is
	 * allowed to issue this.
	 */
	boolean_t nck_ns_minor_ok;
	/*
	 * This indicates that the controller should be skipped from all of the
	 * following processing behavior. That is, it's allowed to specify
	 * whatever it wants in the nsid field, regardless of whether it is
	 * valid or not. This is required for some of the Identify Command
	 * options that list endpoints. This should generally not be used and
	 * the driver should still validate the nuance here.
	 */
	boolean_t nck_skip_ctrl;
	/*
	 * This indicates whether, when we're on the controller's minor and no
	 * explicit namespace ID was given (i.e. 0), the namespace should be
	 * rewritten to be the broadcast namespace.
	 */
	boolean_t nck_ctrl_rewrite;
	/*
	 * This indicates whether or not the broadcast NSID is acceptable for
	 * the controller node.
	 */
	boolean_t nck_bcast_ok;

	/*
	 * This indicates to the lock checking code what kind of exclusive
	 * access is required. This check occurs after any namespace rewriting
	 * has occurred. When looking at exclusivity, a broadcast namespace or
	 * namespace 0 indicates that the controller is the target, otherwise
	 * the target namespace will be checked for a write lock.
	 */
	nvme_ioctl_excl_t nck_excl;
} nvme_ioctl_check_t;

/*
 * Constants
 *
 * NOTE(review): these are declared as ordinary (non-const) variables, so they
 * are presumably kernel tunables that bound vendor-specific admin commands
 * rather than true constants -- confirm against their definitions.
 */
extern uint_t nvme_vendor_specific_admin_cmd_max_timeout;
extern uint32_t nvme_vendor_specific_admin_cmd_size;

/*
 * Common functions.
 *
 * NOTE(review): the summaries below are inferred from the names and
 * signatures only -- confirm against the definitions.
 *
 * nvme_nsid2ns():	presumably maps a namespace ID on the given controller
 *			to its nvme_namespace_t.
 * nvme_ioctl_error():	presumably records an error (an nvme_ioctl_errno_t plus
 *			two 32-bit values) in the common ioctl structure.
 * nvme_ctrl_atleast():	presumably tests whether the controller's version is
 *			at least the given version.
 * nvme_ioctl_success(): presumably marks the common ioctl structure as
 *			successful.
 */
extern nvme_namespace_t *nvme_nsid2ns(nvme_t *, uint32_t);
extern boolean_t nvme_ioctl_error(nvme_ioctl_common_t *, nvme_ioctl_errno_t,
    uint32_t, uint32_t);
extern boolean_t nvme_ctrl_atleast(nvme_t *, const nvme_version_t *);
extern void nvme_ioctl_success(nvme_ioctl_common_t *);

/*
 * Validation related functions and kernel tunable limits.
 *
 * Each function takes the controller and the ioctl structure for one kind of
 * request and returns a boolean_t.
 *
 * NOTE(review): presumably B_TRUE means the request is valid and, on failure,
 * the error is recorded in the ioctl structure -- confirm against the
 * definitions. The meaning of the extra boolean_t argument to
 * nvme_validate_identify() is not visible here.
 */
extern boolean_t nvme_validate_logpage(nvme_t *, nvme_ioctl_get_logpage_t *);
extern boolean_t nvme_validate_identify(nvme_t *, nvme_ioctl_identify_t *,
    boolean_t);
extern boolean_t nvme_validate_get_feature(nvme_t *,
    nvme_ioctl_get_feature_t *);
extern boolean_t nvme_validate_vuc(nvme_t *, nvme_ioctl_passthru_t *);
extern boolean_t nvme_validate_format(nvme_t *, nvme_ioctl_format_t *);
extern boolean_t nvme_validate_fw_load(nvme_t *, nvme_ioctl_fw_load_t *);
extern boolean_t nvme_validate_fw_commit(nvme_t *, nvme_ioctl_fw_commit_t *);

/*
 * Locking functions
 *
 * These operate on nvme_lock_t and on a minor's nvme_minor_lock_info_t.
 *
 * NOTE(review): the summaries below are inferred from the names and
 * signatures only -- confirm against the definitions.
 *
 * nvme_rwlock():	presumably services a lock ioctl on behalf of a minor.
 * nvme_rwunlock():	presumably releases the given lock for the given
 *			minor lock info.
 * nvme_rwlock_ctrl_dead(): takes a void * argument; presumably run when the
 *			controller is marked dead (see n_dead in struct nvme).
 * nvme_lock_init() / nvme_lock_fini(): presumably set up and tear down an
 *			nvme_lock_t.
 */
extern void nvme_rwlock(nvme_minor_t *, nvme_ioctl_lock_t *);
extern void nvme_rwunlock(nvme_minor_lock_info_t *, nvme_lock_t *);
extern void nvme_rwlock_ctrl_dead(void *);
extern void nvme_lock_init(nvme_lock_t *);
extern void nvme_lock_fini(nvme_lock_t *);

#ifdef __cplusplus
}
#endif

#endif /* _NVME_VAR_H */