1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2017 Shunsuke Mie
5 * Copyright (c) 2018 Leon Dang
6 *
7 * Function crc16 Copyright (c) 2017, Fedor Uporov
8 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32/*
33 * bhyve PCIe-NVMe device emulation.
34 *
35 * options:
36 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
37 *
38 *  accepted devpath:
39 *    /dev/blockdev
40 *    /path/to/image
41 *    ram=size_in_MiB
42 *
43 *  maxq    = max number of queues
44 *  qsz     = max elements in each queue
45 *  ioslots = max number of concurrent io requests
46 *  sectsz  = sector size (defaults to blockif sector size)
47 *  ser     = serial number (20-chars max)
48 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
49 *
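 *  Example (illustrative values only):
 *    -s 4,nvme,/path/to/image,maxq=4,qsz=512,ioslots=16,sectsz=512,ser=NVME0001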
50 */
51
/* TODO:
    - create async events for SMART and log pages
    - interrupt coalescing
 */
56
57#include <sys/cdefs.h>
58__FBSDID("$FreeBSD$");
59
60#include <sys/types.h>
61#include <net/ieee_oui.h>
62#ifndef __FreeBSD__
63#include <endian.h>
64#endif
65
66#include <assert.h>
67#include <pthread.h>
68#include <semaphore.h>
69#include <stdbool.h>
70#include <stddef.h>
71#include <stdint.h>
72#include <stdio.h>
73#include <stdlib.h>
74#include <string.h>
75
76#include <machine/atomic.h>
77#include <machine/vmm.h>
78#include <vmmapi.h>
79
80#include <dev/nvme/nvme.h>
81
82#include "bhyverun.h"
83#include "block_if.h"
84#include "debug.h"
85#include "pci_emul.h"
86
87
88static int nvme_debug = 0;
89#define	DPRINTF(params) if (nvme_debug) PRINTLN params
90#define	WPRINTF(params) PRINTLN params
91
92/* defaults; can be overridden */
93#define	NVME_MSIX_BAR		4
94
95#define	NVME_IOSLOTS		8
96
97/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
98#define NVME_MMIO_SPACE_MIN	(1 << 14)
99
100#define	NVME_QUEUES		16
101#define	NVME_MAX_QENTRIES	2048
102
103#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
104#define	NVME_MAX_BLOCKIOVS	512
105
106/* This is a synthetic status code to indicate there is no status */
107#define NVME_NO_STATUS		0xffff
108#define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
109
110/* helpers */
111
112/* Convert a zero-based value into a one-based value */
113#define ONE_BASED(zero)		((zero) + 1)
114/* Convert a one-based value into a zero-based value */
115#define ZERO_BASED(one)		((one)  - 1)
116
117/* Encode number of SQ's and CQ's for Set/Get Features */
#define NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
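/*
 * e.g. with num_squeues = 4 and num_cqueues = 4 the encoded value is
 * 0x00030003 (each count is reported as a zero-based value).
 */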
121
122#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
123
124enum nvme_controller_register_offsets {
125	NVME_CR_CAP_LOW = 0x00,
126	NVME_CR_CAP_HI  = 0x04,
127	NVME_CR_VS      = 0x08,
128	NVME_CR_INTMS   = 0x0c,
129	NVME_CR_INTMC   = 0x10,
130	NVME_CR_CC      = 0x14,
131	NVME_CR_CSTS    = 0x1c,
132	NVME_CR_NSSR    = 0x20,
133	NVME_CR_AQA     = 0x24,
134	NVME_CR_ASQ_LOW = 0x28,
135	NVME_CR_ASQ_HI  = 0x2c,
136	NVME_CR_ACQ_LOW = 0x30,
137	NVME_CR_ACQ_HI  = 0x34,
138};
139
140enum nvme_cmd_cdw11 {
141	NVME_CMD_CDW11_PC  = 0x0001,
142	NVME_CMD_CDW11_IEN = 0x0002,
143	NVME_CMD_CDW11_IV  = 0xFFFF0000,
144};
145
146enum nvme_copy_dir {
147	NVME_COPY_TO_PRP,
148	NVME_COPY_FROM_PRP,
149};
150
151#define	NVME_CQ_INTEN	0x01
152#define	NVME_CQ_INTCOAL	0x02
153
154struct nvme_completion_queue {
155	struct nvme_completion *qbase;
156	uint32_t	size;
157	uint16_t	tail; /* nvme progress */
158	uint16_t	head; /* guest progress */
159	uint16_t	intr_vec;
160	uint32_t	intr_en;
161	pthread_mutex_t	mtx;
162};
163
164struct nvme_submission_queue {
165	struct nvme_command *qbase;
166	uint32_t	size;
167	uint16_t	head; /* nvme progress */
168	uint16_t	tail; /* guest progress */
169	uint16_t	cqid; /* completion queue id */
170	int		busy; /* queue is being processed */
171	int		qpriority;
172};
173
174enum nvme_storage_type {
175	NVME_STOR_BLOCKIF = 0,
176	NVME_STOR_RAM = 1,
177};
178
179struct pci_nvme_blockstore {
180	enum nvme_storage_type type;
181	void		*ctx;
182	uint64_t	size;
183	uint32_t	sectsz;
184	uint32_t	sectsz_bits;
185	uint64_t	eui64;
186	uint32_t	deallocate:1;
187};
188
189struct pci_nvme_ioreq {
190	struct pci_nvme_softc *sc;
191	STAILQ_ENTRY(pci_nvme_ioreq) link;
192	struct nvme_submission_queue *nvme_sq;
193	uint16_t	sqid;
194
195	/* command information */
196	uint16_t	opc;
197	uint16_t	cid;
198	uint32_t	nsid;
199
200	uint64_t	prev_gpaddr;
201	size_t		prev_size;
202
203	/*
204	 * lock if all iovs consumed (big IO);
205	 * complete transaction before continuing
206	 */
207	pthread_mutex_t	mtx;
208	pthread_cond_t	cv;
209
210	struct blockif_req io_req;
211
212	/* pad to fit up to 512 page descriptors from guest IO request */
213	struct iovec	iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
214};
215
216enum nvme_dsm_type {
217	/* Dataset Management bit in ONCS reflects backing storage capability */
218	NVME_DATASET_MANAGEMENT_AUTO,
219	/* Unconditionally set Dataset Management bit in ONCS */
220	NVME_DATASET_MANAGEMENT_ENABLE,
221	/* Unconditionally clear Dataset Management bit in ONCS */
222	NVME_DATASET_MANAGEMENT_DISABLE,
223};
224
225struct pci_nvme_softc {
226	struct pci_devinst *nsc_pi;
227
228	pthread_mutex_t	mtx;
229
230	struct nvme_registers regs;
231
232	struct nvme_namespace_data  nsdata;
233	struct nvme_controller_data ctrldata;
234	struct nvme_error_information_entry err_log;
235	struct nvme_health_information_page health_log;
236	struct nvme_firmware_page fw_log;
237
238	struct pci_nvme_blockstore nvstore;
239
240	uint16_t	max_qentries;	/* max entries per queue */
241	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
242	uint32_t	num_cqueues;
243	uint32_t	num_squeues;
244
245	struct pci_nvme_ioreq *ioreqs;
246	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
247	uint32_t	pending_ios;
248	uint32_t	ioslots;
249	sem_t		iosemlock;
250
251	/*
252	 * Memory mapped Submission and Completion queues
253	 * Each array includes both Admin and IO queues
254	 */
255	struct nvme_completion_queue *compl_queues;
256	struct nvme_submission_queue *submit_queues;
257
258	/* controller features */
259	uint32_t	intr_coales_aggr_time;   /* 0x08: uS to delay intr */
260	uint32_t	intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
261	uint32_t	async_ev_config;         /* 0x0B: async event config */
262
263	enum nvme_dsm_type dataset_management;
264};
265
266
267static void pci_nvme_io_partial(struct blockif_req *br, int err);
268
269/* Controller Configuration utils */
270#define	NVME_CC_GET_EN(cc) \
271	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
272#define	NVME_CC_GET_CSS(cc) \
273	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
274#define	NVME_CC_GET_SHN(cc) \
275	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
276#define	NVME_CC_GET_IOSQES(cc) \
277	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
278#define	NVME_CC_GET_IOCQES(cc) \
279	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
280
281#define	NVME_CC_WRITE_MASK \
282	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
283	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
284	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
285
286#define	NVME_CC_NEN_WRITE_MASK \
287	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
288	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
289	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
290
291/* Controller Status utils */
292#define	NVME_CSTS_GET_RDY(sts) \
293	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
294
295#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
296
297/* Completion Queue status word utils */
298#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
299#define	NVME_STATUS_MASK \
300	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
301	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
302
303#define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
304	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
305
306static __inline void
307cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
308{
309	size_t len;
310
311	len = strnlen(src, dst_size);
312	memset(dst, pad, dst_size);
313	memcpy(dst, src, len);
314}
315
316static __inline void
317pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
318{
319
320	*status &= ~NVME_STATUS_MASK;
321	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
322		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
323}
324
325static __inline void
326pci_nvme_status_genc(uint16_t *status, uint16_t code)
327{
328
329	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
330}
331
332static __inline void
333pci_nvme_toggle_phase(uint16_t *status, int prev)
334{
335
336	if (prev)
337		*status &= ~NVME_STATUS_P;
338	else
339		*status |= NVME_STATUS_P;
340}
341
342static void
343pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
344{
345	struct nvme_controller_data *cd = &sc->ctrldata;
346
347	cd->vid = 0xFB5D;
348	cd->ssvid = 0x0000;
349
350	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
351	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
352
	/* Recommended Arbitration Burst: 2^rab commands per burst */
354	cd->rab   = 4;
355
356	/* FreeBSD OUI */
357	cd->ieee[0] = 0x58;
358	cd->ieee[1] = 0x9c;
359	cd->ieee[2] = 0xfc;
360
361	cd->mic = 0;
362
363	cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
364
365	cd->ver = 0x00010300;
366
367	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
368	cd->acl = 2;
369	cd->aerl = 4;
370
371	cd->lpa = 0;	/* TODO: support some simple things like SMART */
372	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states supported (zero-based) */
374
375	/* Warning Composite Temperature Threshold */
376	cd->wctemp = 0x0157;
377
378	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
379	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
380	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
381	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
382	cd->nn = 1;	/* number of namespaces */
383
384	cd->oncs = 0;
385	switch (sc->dataset_management) {
386	case NVME_DATASET_MANAGEMENT_AUTO:
387		if (sc->nvstore.deallocate)
388			cd->oncs |= NVME_ONCS_DSM;
389		break;
390	case NVME_DATASET_MANAGEMENT_ENABLE:
391		cd->oncs |= NVME_ONCS_DSM;
392		break;
393	default:
394		break;
395	}
396
397	cd->fna = 0x03;
398
399	cd->power_state[0].mp = 10;
400}
401
402/*
403 * Calculate the CRC-16 of the given buffer
404 * See copyright attribution at top of file
405 */
406static uint16_t
407crc16(uint16_t crc, const void *buffer, unsigned int len)
408{
409	const unsigned char *cp = buffer;
410	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
411	static uint16_t const crc16_table[256] = {
412		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
413		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
414		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
415		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
416		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
417		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
418		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
419		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
420		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
421		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
422		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
423		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
424		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
425		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
426		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
427		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
428		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
429		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
430		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
431		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
432		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
433		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
434		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
435		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
436		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
437		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
438		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
439		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
440		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
441		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
442		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
443		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
444	};
445
446	while (len--)
447		crc = (((crc >> 8) & 0xffU) ^
448		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
449	return crc;
450}
451
452static void
453pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
454    struct nvme_namespace_data *nd, uint32_t nsid,
455    struct pci_nvme_blockstore *nvstore)
456{
457
458	/* Get capacity and block size information from backing store */
459	nd->nsze = nvstore->size / nvstore->sectsz;
460	nd->ncap = nd->nsze;
461	nd->nuse = nd->nsze;
462
463	if (nvstore->type == NVME_STOR_BLOCKIF)
464		nvstore->deallocate = blockif_candelete(nvstore->ctx);
465
466	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
467	nd->flbas = 0;
468
	/*
	 * Create an EUI-64 if the user did not provide one: the FreeBSD OUI
	 * plus a CRC-16 of the VM name and PCI address, with the NSID in
	 * the low 16 bits.
	 */
470	if (nvstore->eui64 == 0) {
471		char *data = NULL;
472		uint64_t eui64 = nvstore->eui64;
473
474		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
475		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
476
477		if (data != NULL) {
478			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
479			free(data);
480		}
481		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
482	}
483	be64enc(nd->eui64, nvstore->eui64);
484
485	/* LBA data-sz = 2^lbads */
486	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
487}
488
489static void
490pci_nvme_init_logpages(struct pci_nvme_softc *sc)
491{
492
493	memset(&sc->err_log, 0, sizeof(sc->err_log));
494	memset(&sc->health_log, 0, sizeof(sc->health_log));
495	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
496}
497
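/*
 * Reset controller state: reinitialize CAP/VS/CC/CSTS and the queue
 * bookkeeping. The Admin queue mapping (index 0) is preserved so the
 * emulation stays in sync with the guest; IO queue state is cleared.
 */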
498static void
499pci_nvme_reset_locked(struct pci_nvme_softc *sc)
500{
501	DPRINTF(("%s", __func__));
502
503	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
504	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
505	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
506
507	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
508
509	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
510
511	sc->regs.cc = 0;
512	sc->regs.csts = 0;
513
514	sc->num_cqueues = sc->num_squeues = sc->max_queues;
515	if (sc->submit_queues != NULL) {
516		for (int i = 0; i < sc->num_squeues + 1; i++) {
517			/*
518			 * The Admin Submission Queue is at index 0.
519			 * It must not be changed at reset otherwise the
520			 * emulation will be out of sync with the guest.
521			 */
522			if (i != 0) {
523				sc->submit_queues[i].qbase = NULL;
524				sc->submit_queues[i].size = 0;
525				sc->submit_queues[i].cqid = 0;
526			}
527			sc->submit_queues[i].tail = 0;
528			sc->submit_queues[i].head = 0;
529			sc->submit_queues[i].busy = 0;
530		}
531	} else
532		sc->submit_queues = calloc(sc->num_squeues + 1,
533		                        sizeof(struct nvme_submission_queue));
534
535	if (sc->compl_queues != NULL) {
536		for (int i = 0; i < sc->num_cqueues + 1; i++) {
537			/* See Admin Submission Queue note above */
538			if (i != 0) {
539				sc->compl_queues[i].qbase = NULL;
540				sc->compl_queues[i].size = 0;
541			}
542
543			sc->compl_queues[i].tail = 0;
544			sc->compl_queues[i].head = 0;
545		}
546	} else {
547		sc->compl_queues = calloc(sc->num_cqueues + 1,
548		                        sizeof(struct nvme_completion_queue));
549
550		for (int i = 0; i < sc->num_cqueues + 1; i++)
551			pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
552	}
553}
554
555static void
556pci_nvme_reset(struct pci_nvme_softc *sc)
557{
558	pthread_mutex_lock(&sc->mtx);
559	pci_nvme_reset_locked(sc);
560	pthread_mutex_unlock(&sc->mtx);
561}
562
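/*
 * Called when the guest enables the controller (CC.EN 0 -> 1): map the
 * Admin Submission and Completion Queues into host memory using the
 * guest-programmed AQA, ASQ and ACQ register values.
 */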
563static void
564pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
565{
566	uint16_t acqs, asqs;
567
568	DPRINTF(("%s", __func__));
569
570	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
571	sc->submit_queues[0].size = asqs;
572	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
573	            sizeof(struct nvme_command) * asqs);
574
575	DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p",
576	        __func__, sc->regs.asq, sc->submit_queues[0].qbase));
577
578	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
579	    NVME_AQA_REG_ACQS_MASK) + 1;
580	sc->compl_queues[0].size = acqs;
581	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
582	         sizeof(struct nvme_completion) * acqs);
583	DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p",
584	        __func__, sc->regs.acq, sc->compl_queues[0].qbase));
585}
586
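/*
 * Copy data between a host buffer and a guest PRP pair. Only two PRP
 * entries (no PRP list) are handled, so transfers are limited to 8KB;
 * prp2, when used, is assumed to map the start of the second page.
 */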
587static int
588nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
589	size_t len, enum nvme_copy_dir dir)
590{
591	uint8_t *p;
592	size_t bytes;
593
594	if (len > (8 * 1024)) {
595		return (-1);
596	}
597
598	/* Copy from the start of prp1 to the end of the physical page */
599	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
600	bytes = MIN(bytes, len);
601
602	p = vm_map_gpa(ctx, prp1, bytes);
603	if (p == NULL) {
604		return (-1);
605	}
606
607	if (dir == NVME_COPY_TO_PRP)
608		memcpy(p, b, bytes);
609	else
610		memcpy(b, p, bytes);
611
612	b += bytes;
613
614	len -= bytes;
615	if (len == 0) {
616		return (0);
617	}
618
619	len = MIN(len, PAGE_SIZE);
620
621	p = vm_map_gpa(ctx, prp2, len);
622	if (p == NULL) {
623		return (-1);
624	}
625
626	if (dir == NVME_COPY_TO_PRP)
627		memcpy(p, b, len);
628	else
629		memcpy(b, p, len);
630
631	return (0);
632}
633
634static int
635nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
636	struct nvme_completion* compl)
637{
638	uint16_t qid = command->cdw10 & 0xffff;
639
640	DPRINTF(("%s DELETE_IO_SQ %u", __func__, qid));
641	if (qid == 0 || qid > sc->num_squeues) {
642		WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u",
643		        __func__, qid, sc->num_squeues));
644		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
645		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
646		return (1);
647	}
648
649	sc->submit_queues[qid].qbase = NULL;
650	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
651	return (1);
652}
653
654static int
655nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
656	struct nvme_completion* compl)
657{
658	if (command->cdw11 & NVME_CMD_CDW11_PC) {
659		uint16_t qid = command->cdw10 & 0xffff;
660		struct nvme_submission_queue *nsq;
661
662		if ((qid == 0) || (qid > sc->num_squeues)) {
663			WPRINTF(("%s queue index %u > num_squeues %u",
664			        __func__, qid, sc->num_squeues));
665			pci_nvme_status_tc(&compl->status,
666			    NVME_SCT_COMMAND_SPECIFIC,
667			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
668			return (1);
669		}
670
671		nsq = &sc->submit_queues[qid];
672		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
673
674		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
675		              sizeof(struct nvme_command) * (size_t)nsq->size);
676		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
677		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
678
679		DPRINTF(("%s sq %u size %u gaddr %p cqid %u", __func__,
680		        qid, nsq->size, nsq->qbase, nsq->cqid));
681
682		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
683
684		DPRINTF(("%s completed creating IOSQ qid %u",
685		         __func__, qid));
686	} else {
687		/*
		 * Guest requested a non-contiguous Submission Queue,
		 * which this emulation does not support.
690		 */
691		WPRINTF(("%s unsupported non-contig (list-based) "
692		         "create i/o submission queue", __func__));
693
694		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
695	}
696	return (1);
697}
698
699static int
700nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
701	struct nvme_completion* compl)
702{
703	uint16_t qid = command->cdw10 & 0xffff;
704
705	DPRINTF(("%s DELETE_IO_CQ %u", __func__, qid));
706	if (qid == 0 || qid > sc->num_cqueues) {
707		WPRINTF(("%s queue index %u / num_cqueues %u",
708		        __func__, qid, sc->num_cqueues));
709		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
710		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
711		return (1);
712	}
713
714	sc->compl_queues[qid].qbase = NULL;
715	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
716	return (1);
717}
718
719static int
720nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
721	struct nvme_completion* compl)
722{
723	if (command->cdw11 & NVME_CMD_CDW11_PC) {
724		uint16_t qid = command->cdw10 & 0xffff;
725		struct nvme_completion_queue *ncq;
726
727		if ((qid == 0) || (qid > sc->num_cqueues)) {
728			WPRINTF(("%s queue index %u > num_cqueues %u",
729			        __func__, qid, sc->num_cqueues));
730			pci_nvme_status_tc(&compl->status,
731			    NVME_SCT_COMMAND_SPECIFIC,
732			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
733			return (1);
734		}
735
736		ncq = &sc->compl_queues[qid];
737		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
738		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
739		ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
740
741		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
742		             command->prp1,
		             sizeof(struct nvme_completion) * (size_t)ncq->size);
744
745		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
746	} else {
747		/*
748		 * Non-contig completion queue unsupported.
749		 */
750		WPRINTF(("%s unsupported non-contig (list-based) "
751		         "create i/o completion queue",
752		         __func__));
753
754		/* 0x12 = Invalid Use of Controller Memory Buffer */
755		pci_nvme_status_genc(&compl->status, 0x12);
756	}
757
758	return (1);
759}
760
761static int
762nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
763	struct nvme_completion* compl)
764{
765	uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2;
766	uint8_t logpage = command->cdw10 & 0xFF;
767
768	DPRINTF(("%s log page %u len %u", __func__, logpage, logsize));
769
770	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
771
772	switch (logpage) {
773	case NVME_LOG_ERROR:
774		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
775		    command->prp2, (uint8_t *)&sc->err_log, logsize,
776		    NVME_COPY_TO_PRP);
777		break;
778	case NVME_LOG_HEALTH_INFORMATION:
779		/* TODO: present some smart info */
780		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
781		    command->prp2, (uint8_t *)&sc->health_log, logsize,
782		    NVME_COPY_TO_PRP);
783		break;
784	case NVME_LOG_FIRMWARE_SLOT:
785		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
786		    command->prp2, (uint8_t *)&sc->fw_log, logsize,
787		    NVME_COPY_TO_PRP);
788		break;
789	default:
790		WPRINTF(("%s get log page %x command not supported",
791		        __func__, logpage));
792
793		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
794		    NVME_SC_INVALID_LOG_PAGE);
795	}
796
797	return (1);
798}
799
800static int
801nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
802	struct nvme_completion* compl)
803{
804	void *dest;
805
806	DPRINTF(("%s identify 0x%x nsid 0x%x", __func__,
807	        command->cdw10 & 0xFF, command->nsid));
808
809	switch (command->cdw10 & 0xFF) {
810	case 0x00: /* return Identify Namespace data structure */
811		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
812		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
813		    NVME_COPY_TO_PRP);
814		break;
815	case 0x01: /* return Identify Controller data structure */
816		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
817		    command->prp2, (uint8_t *)&sc->ctrldata,
818		    sizeof(sc->ctrldata),
819		    NVME_COPY_TO_PRP);
820		break;
821	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
822		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
823		                  sizeof(uint32_t) * 1024);
824		((uint32_t *)dest)[0] = 1;
825		((uint32_t *)dest)[1] = 0;
826		break;
827	case 0x11:
828		pci_nvme_status_genc(&compl->status,
829		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
830		return (1);
831	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
832	case 0x10:
833	case 0x12:
834	case 0x13:
835	case 0x14:
836	case 0x15:
837	default:
838		DPRINTF(("%s unsupported identify command requested 0x%x",
839		         __func__, command->cdw10 & 0xFF));
840		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
841		return (1);
842	}
843
844	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
845	return (1);
846}
847
848static int
849nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
850	struct nvme_completion* compl)
851{
852	uint16_t nqr;	/* Number of Queues Requested */
853
854	nqr = command->cdw11 & 0xFFFF;
855	if (nqr == 0xffff) {
856		WPRINTF(("%s: Illegal NSQR value %#x", __func__, nqr));
857		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
858		return (-1);
859	}
860
861	sc->num_squeues = ONE_BASED(nqr);
862	if (sc->num_squeues > sc->max_queues) {
863		DPRINTF(("NSQR=%u is greater than max %u", sc->num_squeues,
864					sc->max_queues));
865		sc->num_squeues = sc->max_queues;
866	}
867
868	nqr = (command->cdw11 >> 16) & 0xFFFF;
869	if (nqr == 0xffff) {
870		WPRINTF(("%s: Illegal NCQR value %#x", __func__, nqr));
871		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
872		return (-1);
873	}
874
875	sc->num_cqueues = ONE_BASED(nqr);
876	if (sc->num_cqueues > sc->max_queues) {
877		DPRINTF(("NCQR=%u is greater than max %u", sc->num_cqueues,
878					sc->max_queues));
879		sc->num_cqueues = sc->max_queues;
880	}
881
882	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
883
884	return (0);
885}
886
887static int
888nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
889	struct nvme_completion* compl)
890{
891	int feature = command->cdw10 & 0xFF;
892	uint32_t iv;
893
894	DPRINTF(("%s feature 0x%x", __func__, feature));
895	compl->cdw0 = 0;
896
897	switch (feature) {
898	case NVME_FEAT_ARBITRATION:
899		DPRINTF(("  arbitration 0x%x", command->cdw11));
900		break;
901	case NVME_FEAT_POWER_MANAGEMENT:
902		DPRINTF(("  power management 0x%x", command->cdw11));
903		break;
904	case NVME_FEAT_LBA_RANGE_TYPE:
905		DPRINTF(("  lba range 0x%x", command->cdw11));
906		break;
907	case NVME_FEAT_TEMPERATURE_THRESHOLD:
908		DPRINTF(("  temperature threshold 0x%x", command->cdw11));
909		break;
910	case NVME_FEAT_ERROR_RECOVERY:
911		DPRINTF(("  error recovery 0x%x", command->cdw11));
912		break;
913	case NVME_FEAT_VOLATILE_WRITE_CACHE:
914		DPRINTF(("  volatile write cache 0x%x", command->cdw11));
915		break;
916	case NVME_FEAT_NUMBER_OF_QUEUES:
		if (nvme_set_feature_queues(sc, command, compl) != 0)
			return (1);
918		break;
919	case NVME_FEAT_INTERRUPT_COALESCING:
920		DPRINTF(("  interrupt coalescing 0x%x", command->cdw11));
921
922		/* in uS */
923		sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
924
925		sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
926		break;
927	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
928		iv = command->cdw11 & 0xFFFF;
929
930		DPRINTF(("  interrupt vector configuration 0x%x",
931		        command->cdw11));
932
933		for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
934			if (sc->compl_queues[i].intr_vec == iv) {
935				if (command->cdw11 & (1 << 16))
936					sc->compl_queues[i].intr_en |=
937					                      NVME_CQ_INTCOAL;
938				else
939					sc->compl_queues[i].intr_en &=
940					                     ~NVME_CQ_INTCOAL;
941			}
942		}
943		break;
944	case NVME_FEAT_WRITE_ATOMICITY:
945		DPRINTF(("  write atomicity 0x%x", command->cdw11));
946		break;
947	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
948		DPRINTF(("  async event configuration 0x%x",
949		        command->cdw11));
950		sc->async_ev_config = command->cdw11;
951		break;
952	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
953		DPRINTF(("  software progress marker 0x%x",
954		        command->cdw11));
955		break;
956	case 0x0C:
957		DPRINTF(("  autonomous power state transition 0x%x",
958		        command->cdw11));
959		break;
960	default:
961		WPRINTF(("%s invalid feature", __func__));
962		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
963		return (1);
964	}
965
966	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
967	return (1);
968}
969
970static int
971nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
972	struct nvme_completion* compl)
973{
974	int feature = command->cdw10 & 0xFF;
975
976	DPRINTF(("%s feature 0x%x", __func__, feature));
977
978	compl->cdw0 = 0;
979
980	switch (feature) {
981	case NVME_FEAT_ARBITRATION:
982		DPRINTF(("  arbitration"));
983		break;
984	case NVME_FEAT_POWER_MANAGEMENT:
985		DPRINTF(("  power management"));
986		break;
987	case NVME_FEAT_LBA_RANGE_TYPE:
988		DPRINTF(("  lba range"));
989		break;
990	case NVME_FEAT_TEMPERATURE_THRESHOLD:
991		DPRINTF(("  temperature threshold"));
992		switch ((command->cdw11 >> 20) & 0x3) {
993		case 0:
994			/* Over temp threshold */
995			compl->cdw0 = 0xFFFF;
996			break;
997		case 1:
998			/* Under temp threshold */
999			compl->cdw0 = 0;
1000			break;
1001		default:
1002			WPRINTF(("  invalid threshold type select"));
1003			pci_nvme_status_genc(&compl->status,
1004			    NVME_SC_INVALID_FIELD);
1005			return (1);
1006		}
1007		break;
1008	case NVME_FEAT_ERROR_RECOVERY:
1009		DPRINTF(("  error recovery"));
1010		break;
1011	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1012		DPRINTF(("  volatile write cache"));
1013		break;
1014	case NVME_FEAT_NUMBER_OF_QUEUES:
1015		compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1016
1017		DPRINTF(("  number of queues (submit %u, completion %u)",
1018		        compl->cdw0 & 0xFFFF,
1019		        (compl->cdw0 >> 16) & 0xFFFF));
1020
1021		break;
1022	case NVME_FEAT_INTERRUPT_COALESCING:
1023		DPRINTF(("  interrupt coalescing"));
1024		break;
1025	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1026		DPRINTF(("  interrupt vector configuration"));
1027		break;
1028	case NVME_FEAT_WRITE_ATOMICITY:
1029		DPRINTF(("  write atomicity"));
1030		break;
1031	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1032		DPRINTF(("  async event configuration"));
		compl->cdw0 = sc->async_ev_config;
1034		break;
1035	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1036		DPRINTF(("  software progress marker"));
1037		break;
1038	case 0x0C:
1039		DPRINTF(("  autonomous power state transition"));
1040		break;
1041	default:
1042		WPRINTF(("%s invalid feature 0x%x", __func__, feature));
1043		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1044		return (1);
1045	}
1046
1047	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1048	return (1);
1049}
1050
1051static int
1052nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1053	struct nvme_completion* compl)
1054{
1055	DPRINTF(("%s submission queue %u, command ID 0x%x", __func__,
1056	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));
1057
1058	/* TODO: search for the command ID and abort it */
1059
1060	compl->cdw0 = 1;
1061	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1062	return (1);
1063}
1064
1065#ifdef __FreeBSD__
1066static int
1067nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1068	struct nvme_command* command, struct nvme_completion* compl)
1069{
1070	DPRINTF(("%s async event request 0x%x", __func__, command->cdw11));
1071
1072	/*
1073	 * TODO: raise events when they happen based on the Set Features cmd.
1074	 * These events happen async, so only set completion successful if
1075	 * there is an event reflective of the request to get event.
1076	 */
1077	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1078	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1079	return (0);
1080}
1081#else
1082/* This is kept behind an ifdef while it's unused to appease the compiler. */
1083#endif /* __FreeBSD__ */
1084
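/*
 * Process all pending entries on the Admin Submission Queue (queue 0),
 * dispatch each admin opcode to its handler, and post completions to
 * the Admin Completion Queue with the phase bit toggled as required.
 */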
1085static void
1086pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1087{
1088	struct nvme_completion compl;
1089	struct nvme_command *cmd;
1090	struct nvme_submission_queue *sq;
1091	struct nvme_completion_queue *cq;
1092	uint16_t sqhead;
1093
1094	DPRINTF(("%s index %u", __func__, (uint32_t)value));
1095
1096	sq = &sc->submit_queues[0];
1097	cq = &sc->compl_queues[0];
1098
1099	sqhead = atomic_load_acq_short(&sq->head);
1100
1101	if (atomic_testandset_int(&sq->busy, 1)) {
1102		DPRINTF(("%s SQ busy, head %u, tail %u",
1103		        __func__, sqhead, sq->tail));
1104		return;
1105	}
1106
1107	DPRINTF(("sqhead %u, tail %u", sqhead, sq->tail));
1108
1109	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1110		cmd = &(sq->qbase)[sqhead];
1111		compl.cdw0 = 0;
1112		compl.status = 0;
1113
1114		switch (cmd->opc) {
1115		case NVME_OPC_DELETE_IO_SQ:
1116			DPRINTF(("%s command DELETE_IO_SQ", __func__));
1117			nvme_opc_delete_io_sq(sc, cmd, &compl);
1118			break;
1119		case NVME_OPC_CREATE_IO_SQ:
1120			DPRINTF(("%s command CREATE_IO_SQ", __func__));
1121			nvme_opc_create_io_sq(sc, cmd, &compl);
1122			break;
1123		case NVME_OPC_DELETE_IO_CQ:
1124			DPRINTF(("%s command DELETE_IO_CQ", __func__));
1125			nvme_opc_delete_io_cq(sc, cmd, &compl);
1126			break;
1127		case NVME_OPC_CREATE_IO_CQ:
1128			DPRINTF(("%s command CREATE_IO_CQ", __func__));
1129			nvme_opc_create_io_cq(sc, cmd, &compl);
1130			break;
1131		case NVME_OPC_GET_LOG_PAGE:
1132			DPRINTF(("%s command GET_LOG_PAGE", __func__));
1133			nvme_opc_get_log_page(sc, cmd, &compl);
1134			break;
1135		case NVME_OPC_IDENTIFY:
1136			DPRINTF(("%s command IDENTIFY", __func__));
1137			nvme_opc_identify(sc, cmd, &compl);
1138			break;
1139		case NVME_OPC_ABORT:
1140			DPRINTF(("%s command ABORT", __func__));
1141			nvme_opc_abort(sc, cmd, &compl);
1142			break;
1143		case NVME_OPC_SET_FEATURES:
1144			DPRINTF(("%s command SET_FEATURES", __func__));
1145			nvme_opc_set_features(sc, cmd, &compl);
1146			break;
1147		case NVME_OPC_GET_FEATURES:
1148			DPRINTF(("%s command GET_FEATURES", __func__));
1149			nvme_opc_get_features(sc, cmd, &compl);
1150			break;
1151		case NVME_OPC_ASYNC_EVENT_REQUEST:
1152			DPRINTF(("%s command ASYNC_EVENT_REQ", __func__));
			/* XXX don't care, unhandled for now
1154			nvme_opc_async_event_req(sc, cmd, &compl);
1155			*/
1156			compl.status = NVME_NO_STATUS;
1157			break;
1158		default:
1159			WPRINTF(("0x%x command is not implemented",
1160			    cmd->opc));
1161			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1162		}
1163		sqhead = (sqhead + 1) % sq->size;
1164
1165		if (NVME_COMPLETION_VALID(compl)) {
1166			struct nvme_completion *cp;
1167			int phase;
1168
1169			cp = &(cq->qbase)[cq->tail];
1170			cp->cdw0 = compl.cdw0;
1171			cp->sqid = 0;
1172			cp->sqhd = sqhead;
1173			cp->cid = cmd->cid;
1174
1175			phase = NVME_STATUS_GET_P(cp->status);
1176			cp->status = compl.status;
1177			pci_nvme_toggle_phase(&cp->status, phase);
1178
1179			cq->tail = (cq->tail + 1) % cq->size;
1180		}
1181	}
1182
1183	DPRINTF(("setting sqhead %u", sqhead));
1184	atomic_store_short(&sq->head, sqhead);
1185	atomic_store_int(&sq->busy, 0);
1186
1187	if (cq->head != cq->tail)
1188		pci_generate_msix(sc->nsc_pi, 0);
1189
1190}
1191
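/*
 * Append a guest data page to the blockif request for this IO, merging
 * it with the previous iov when the guest pages are physically
 * contiguous. If the iov list fills up (NVME_MAX_BLOCKIOVS), the
 * accumulated request is issued as a partial IO and this thread waits
 * for it to complete before continuing. For RAM-backed storage the data
 * is copied directly instead.
 */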
1192static int
1193pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1194	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1195{
1196	int iovidx;
1197
1198	if (req != NULL) {
1199		/* concatenate contig block-iovs to minimize number of iovs */
1200		if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1201			iovidx = req->io_req.br_iovcnt - 1;
1202
1203			req->io_req.br_iov[iovidx].iov_base =
1204			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1205			                     req->prev_gpaddr, size);
1206
1207			req->prev_size += size;
1208			req->io_req.br_resid += size;
1209
1210			req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1211		} else {
1212			pthread_mutex_lock(&req->mtx);
1213
1214			iovidx = req->io_req.br_iovcnt;
1215			if (iovidx == NVME_MAX_BLOCKIOVS) {
1216				int err = 0;
1217
1218				DPRINTF(("large I/O, doing partial req"));
1219
1220				iovidx = 0;
1221				req->io_req.br_iovcnt = 0;
1222
1223				req->io_req.br_callback = pci_nvme_io_partial;
1224
1225				if (!do_write)
1226					err = blockif_read(sc->nvstore.ctx,
1227					                   &req->io_req);
1228				else
1229					err = blockif_write(sc->nvstore.ctx,
1230					                    &req->io_req);
1231
				/* wait until req completes before continuing */
1233				if (err == 0)
1234					pthread_cond_wait(&req->cv, &req->mtx);
1235			}
1236			if (iovidx == 0) {
1237				req->io_req.br_offset = lba;
1238				req->io_req.br_resid = 0;
1239				req->io_req.br_param = req;
1240			}
1241
1242			req->io_req.br_iov[iovidx].iov_base =
1243			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1244			                     gpaddr, size);
1245
1246			req->io_req.br_iov[iovidx].iov_len = size;
1247
1248			req->prev_gpaddr = gpaddr;
1249			req->prev_size = size;
1250			req->io_req.br_resid += size;
1251
1252			req->io_req.br_iovcnt++;
1253
1254			pthread_mutex_unlock(&req->mtx);
1255		}
1256	} else {
1257		/* RAM buffer: read/write directly */
1258		void *p = sc->nvstore.ctx;
1259		void *gptr;
1260
1261		if ((lba + size) > sc->nvstore.size) {
			WPRINTF(("%s request would overflow RAM backing", __func__));
1263			return (-1);
1264		}
1265
1266		p = (void *)((uintptr_t)p + (uintptr_t)lba);
1267		gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
1268		if (do_write)
1269			memcpy(p, gptr, size);
1270		else
1271			memcpy(gptr, p, size);
1272	}
1273	return (0);
1274}
1275
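/*
 * Post a completion entry to the CQ associated with the given SQ and,
 * if that queue has interrupts enabled, generate an MSI-X interrupt on
 * its vector.
 */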
1276static void
1277pci_nvme_set_completion(struct pci_nvme_softc *sc,
1278	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1279	uint32_t cdw0, uint16_t status, int ignore_busy)
1280{
1281	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1282	struct nvme_completion *compl;
1283	int phase;
1284
1285	DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1286		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1287		 NVME_STATUS_GET_SC(status)));
1288
1289	pthread_mutex_lock(&cq->mtx);
1290
1291	assert(cq->qbase != NULL);
1292
1293	compl = &cq->qbase[cq->tail];
1294
1295	compl->cdw0 = cdw0;
1296	compl->sqid = sqid;
1297	compl->sqhd = atomic_load_acq_short(&sq->head);
1298	compl->cid = cid;
1299
	/* toggle phase */
1301	phase = NVME_STATUS_GET_P(compl->status);
1302	compl->status = status;
1303	pci_nvme_toggle_phase(&compl->status, phase);
1304
1305	cq->tail = (cq->tail + 1) % cq->size;
1306
1307	pthread_mutex_unlock(&cq->mtx);
1308
1309	if (cq->head != cq->tail) {
1310		if (cq->intr_en & NVME_CQ_INTEN) {
1311			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1312		} else {
1313			DPRINTF(("%s: CQ%u interrupt disabled\n",
1314						__func__, sq->cqid));
1315		}
1316	}
1317}
1318
1319static void
1320pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1321{
1322	req->sc = NULL;
1323	req->nvme_sq = NULL;
1324	req->sqid = 0;
1325
1326	pthread_mutex_lock(&sc->mtx);
1327
1328	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1329	sc->pending_ios--;
1330
	/* with no I/O pending, set RDY if the controller is enabled but not yet ready */
1332	if (sc->pending_ios == 0 &&
1333	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1334		sc->regs.csts |= NVME_CSTS_RDY;
1335
1336	pthread_mutex_unlock(&sc->mtx);
1337
1338	sem_post(&sc->iosemlock);
1339}
1340
1341static struct pci_nvme_ioreq *
1342pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1343{
	struct pci_nvme_ioreq *req = NULL;
1345
1346	sem_wait(&sc->iosemlock);
1347	pthread_mutex_lock(&sc->mtx);
1348
1349	req = STAILQ_FIRST(&sc->ioreqs_free);
1350	assert(req != NULL);
1351	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1352
1353	req->sc = sc;
1354
1355	sc->pending_ios++;
1356
1357	pthread_mutex_unlock(&sc->mtx);
1358
1359	req->io_req.br_iovcnt = 0;
1360	req->io_req.br_offset = 0;
1361	req->io_req.br_resid = 0;
1362	req->io_req.br_param = req;
1363	req->prev_gpaddr = 0;
1364	req->prev_size = 0;
1365
1366	return req;
1367}
1368
1369static void
1370pci_nvme_io_done(struct blockif_req *br, int err)
1371{
1372	struct pci_nvme_ioreq *req = br->br_param;
1373	struct nvme_submission_queue *sq = req->nvme_sq;
1374	uint16_t code, status = 0;
1375
1376	DPRINTF(("%s error %d %s", __func__, err, strerror(err)));
1377
1378	/* TODO return correct error */
1379	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1380	pci_nvme_status_genc(&status, code);
1381
1382	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
1383	pci_nvme_release_ioreq(req->sc, req);
1384}
1385
1386static void
1387pci_nvme_io_partial(struct blockif_req *br, int err)
1388{
1389	struct pci_nvme_ioreq *req = br->br_param;
1390
1391	DPRINTF(("%s error %d %s", __func__, err, strerror(err)));
1392
1393	pthread_cond_signal(&req->cv);
1394}
1395
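/*
 * Completion callback for multi-range Dataset Management (deallocate)
 * requests. The ranges were staged in br_iov by nvme_opc_dataset_mgmt();
 * prev_gpaddr tracks the current range index and prev_size the range
 * count, so each callback issues the next blockif_delete() until all
 * ranges are done or an error occurs.
 */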
1396static void
1397pci_nvme_dealloc_sm(struct blockif_req *br, int err)
1398{
1399	struct pci_nvme_ioreq *req = br->br_param;
1400	struct pci_nvme_softc *sc = req->sc;
1401	bool done = true;
1402#ifdef __FreeBSD__
1403	uint16_t status;
1404#else
1405	uint16_t status = 0;
1406#endif
1407
1408	if (err) {
1409		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
1410	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
1411		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1412	} else {
1413		struct iovec *iov = req->io_req.br_iov;
1414
1415		req->prev_gpaddr++;
1416		iov += req->prev_gpaddr;
1417
1418		/* The iov_* values already include the sector size */
1419		req->io_req.br_offset = (off_t)iov->iov_base;
1420		req->io_req.br_resid = iov->iov_len;
1421		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
1422			pci_nvme_status_genc(&status,
1423			    NVME_SC_INTERNAL_DEVICE_ERROR);
1424		} else
1425			done = false;
1426	}
1427
1428	if (done) {
1429		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
1430		    req->cid, 0, status, 0);
1431		pci_nvme_release_ioreq(sc, req);
1432	}
1433}
1434
1435static int
1436nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
1437    struct nvme_command *cmd,
1438    struct pci_nvme_blockstore *nvstore,
1439    struct pci_nvme_ioreq *req,
1440    uint16_t *status)
1441{
1442	int err = -1;
1443
1444	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
1445		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
1446		goto out;
1447	}
1448
1449	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
1450		struct nvme_dsm_range *range;
1451		uint32_t nr, r;
1452		int sectsz = sc->nvstore.sectsz;
1453
1454		/*
1455		 * DSM calls are advisory only, and compliant controllers
1456		 * may choose to take no actions (i.e. return Success).
1457		 */
1458		if (!nvstore->deallocate) {
1459			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1460			goto out;
1461		}
1462
1463		if (req == NULL) {
1464			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1465			goto out;
1466		}
1467
1468		/* copy locally because a range entry could straddle PRPs */
1469		range = calloc(1, NVME_MAX_DSM_TRIM);
1470		if (range == NULL) {
1471			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1472			goto out;
1473		}
1474		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
1475		    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
1476
1477		req->opc = cmd->opc;
1478		req->cid = cmd->cid;
1479		req->nsid = cmd->nsid;
1480		/*
1481		 * If the request is for more than a single range, store
1482		 * the ranges in the br_iov. Optimize for the common case
1483		 * of a single range.
1484		 *
1485		 * Note that NVMe Number of Ranges is a zero based value
1486		 */
1487		nr = cmd->cdw10 & 0xff;
1488
1489		req->io_req.br_iovcnt = 0;
1490		req->io_req.br_offset = range[0].starting_lba * sectsz;
1491		req->io_req.br_resid = range[0].length * sectsz;
1492
1493		if (nr == 0) {
1494			req->io_req.br_callback = pci_nvme_io_done;
1495		} else {
1496			struct iovec *iov = req->io_req.br_iov;
1497
1498			for (r = 0; r <= nr; r++) {
1499				iov[r].iov_base = (void *)(range[r].starting_lba * sectsz);
1500				iov[r].iov_len = range[r].length * sectsz;
1501			}
1502			req->io_req.br_callback = pci_nvme_dealloc_sm;
1503
1504			/*
1505			 * Use prev_gpaddr to track the current entry and
1506			 * prev_size to track the number of entries
1507			 */
1508			req->prev_gpaddr = 0;
1509			req->prev_size = r;
1510		}
1511
1512		err = blockif_delete(nvstore->ctx, &req->io_req);
1513		if (err)
1514			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1515
1516		free(range);
1517	}
1518out:
1519	return (err);
1520}
1521
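/*
 * Process pending IO Submission Queue entries. For reads and writes,
 * PRP1/PRP2 (or the PRP list pointed to by PRP2) are walked and
 * translated into blockif iovs via pci_nvme_append_iov_req() before the
 * request is handed to the blockif layer (or copied directly for RAM
 * backed storage).
 */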
1522static void
1523pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1524{
1525	struct nvme_submission_queue *sq;
1526	uint16_t status = 0;
1527	uint16_t sqhead;
1528	int err;
1529
1530	/* handle all submissions up to sq->tail index */
1531	sq = &sc->submit_queues[idx];
1532
1533	if (atomic_testandset_int(&sq->busy, 1)) {
1534		DPRINTF(("%s sqid %u busy", __func__, idx));
1535		return;
1536	}
1537
1538	sqhead = atomic_load_acq_short(&sq->head);
1539
1540	DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p",
1541	         idx, sqhead, sq->tail, sq->qbase));
1542
1543	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1544		struct nvme_command *cmd;
1545		struct pci_nvme_ioreq *req = NULL;
1546		uint64_t lba;
1547		uint64_t nblocks, bytes, size, cpsz;
1548
1549		/* TODO: support scatter gather list handling */
1550
1551		cmd = &sq->qbase[sqhead];
1552		sqhead = (sqhead + 1) % sq->size;
1553
1554		lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1555
1556		if (cmd->opc == NVME_OPC_FLUSH) {
1557			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1558			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1559			                        status, 1);
1560
1561			continue;
1562		} else if (cmd->opc == 0x08) {
1563			/* TODO: write zeroes */
1564			WPRINTF(("%s write zeroes lba 0x%lx blocks %u",
1565			        __func__, lba, cmd->cdw12 & 0xFFFF));
1566			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1567			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1568			                        status, 1);
1569
1570			continue;
1571		}
1572
1573		if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1574			req = pci_nvme_get_ioreq(sc);
1575			req->nvme_sq = sq;
1576			req->sqid = idx;
1577		}
1578
1579		if (cmd->opc == NVME_OPC_DATASET_MANAGEMENT) {
1580			if (nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, req,
1581			    &status)) {
1582				pci_nvme_set_completion(sc, sq, idx, cmd->cid,
1583				    0, status, 1);
1584				if (req)
1585					pci_nvme_release_ioreq(sc, req);
1586			}
1587			continue;
1588		}
1589
1590		nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1591
1592		bytes = nblocks * sc->nvstore.sectsz;
1593
1594		/*
1595		 * If data starts mid-page and flows into the next page, then
1596		 * increase page count
1597		 */
1598
1599		DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1600		         "(%lu-bytes)",
1601		         sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1602		         cmd->opc == NVME_OPC_WRITE ?
1603			     "WRITE" : "READ",
1604		         lba, nblocks, bytes));
1605
1606		cmd->prp1 &= ~(0x03UL);
1607		cmd->prp2 &= ~(0x03UL);
1608
1609		DPRINTF((" prp1 0x%lx prp2 0x%lx", cmd->prp1, cmd->prp2));
1610
1611		size = bytes;
1612		lba *= sc->nvstore.sectsz;
1613
1614		cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1615
1616		if (cpsz > bytes)
1617			cpsz = bytes;
1618
1619		if (req != NULL) {
1620			req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1621			                        cmd->cdw10;
1622			req->opc = cmd->opc;
1623			req->cid = cmd->cid;
1624			req->nsid = cmd->nsid;
1625		}
1626
1627		err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1628		    cmd->opc == NVME_OPC_WRITE, lba);
1629		lba += cpsz;
1630		size -= cpsz;
1631
1632		if (size == 0)
1633			goto iodone;
1634
1635		if (size <= PAGE_SIZE) {
1636			/* prp2 is second (and final) page in transfer */
1637
1638			err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
1639			    size,
1640			    cmd->opc == NVME_OPC_WRITE,
1641			    lba);
1642		} else {
1643			uint64_t *prp_list;
1644			int i;
1645
1646			/* prp2 is pointer to a physical region page list */
1647			prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
1648			                            cmd->prp2, PAGE_SIZE);
1649
1650			i = 0;
1651			while (size != 0) {
1652				cpsz = MIN(size, PAGE_SIZE);
1653
1654				/*
1655				 * Move to linked physical region page list
1656				 * in last item.
1657				 */
1658				if (i == (NVME_PRP2_ITEMS-1) &&
1659				    size > PAGE_SIZE) {
1660					assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
1661					prp_list = paddr_guest2host(
1662					              sc->nsc_pi->pi_vmctx,
1663					              prp_list[i], PAGE_SIZE);
1664					i = 0;
1665				}
1666				if (prp_list[i] == 0) {
1667					WPRINTF(("PRP2[%d] = 0 !!!", i));
1668					err = 1;
1669					break;
1670				}
1671
1672				err = pci_nvme_append_iov_req(sc, req,
1673				    prp_list[i], cpsz,
1674				    cmd->opc == NVME_OPC_WRITE, lba);
1675				if (err)
1676					break;
1677
1678				lba += cpsz;
1679				size -= cpsz;
1680				i++;
1681			}
1682		}
1683
1684iodone:
1685		if (sc->nvstore.type == NVME_STOR_RAM) {
1686			uint16_t code, status = 0;
1687
1688			code = err ? NVME_SC_LBA_OUT_OF_RANGE :
1689			    NVME_SC_SUCCESS;
1690			pci_nvme_status_genc(&status, code);
1691
1692			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1693			                        status, 1);
1694
1695			continue;
1696		}
1697
1698
1699		if (err)
1700			goto do_error;
1701
1702		req->io_req.br_callback = pci_nvme_io_done;
1703
1704		err = 0;
1705		switch (cmd->opc) {
1706		case NVME_OPC_READ:
1707			err = blockif_read(sc->nvstore.ctx, &req->io_req);
1708			break;
1709		case NVME_OPC_WRITE:
1710			err = blockif_write(sc->nvstore.ctx, &req->io_req);
1711			break;
1712		default:
1713			WPRINTF(("%s unhandled io command 0x%x",
1714				 __func__, cmd->opc));
1715			err = 1;
1716		}
1717
1718do_error:
1719		if (err) {
1720			uint16_t status = 0;
1721
1722			pci_nvme_status_genc(&status,
1723			    NVME_SC_DATA_TRANSFER_ERROR);
1724
1725			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1726			                        status, 1);
1727			pci_nvme_release_ioreq(sc, req);
1728		}
1729	}
1730
1731	atomic_store_short(&sq->head, sqhead);
1732	atomic_store_int(&sq->busy, 0);
1733}
1734
1735static void
1736pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1737	uint64_t idx, int is_sq, uint64_t value)
1738{
1739	DPRINTF(("nvme doorbell %lu, %s, val 0x%lx",
1740	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
1741
1742	if (is_sq) {
1743		atomic_store_short(&sc->submit_queues[idx].tail,
1744		                   (uint16_t)value);
1745
1746		if (idx == 0) {
1747			pci_nvme_handle_admin_cmd(sc, value);
1748		} else {
1749			/* submission queue; handle new entries in SQ */
1750			if (idx > sc->num_squeues) {
1751				WPRINTF(("%s SQ index %lu overflow from "
1752				         "guest (max %u)",
1753				         __func__, idx, sc->num_squeues));
1754				return;
1755			}
1756			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1757		}
1758	} else {
1759		if (idx > sc->num_cqueues) {
1760			WPRINTF(("%s queue index %lu overflow from "
1761			         "guest (max %u)",
1762			         __func__, idx, sc->num_cqueues));
1763			return;
1764		}
1765
1766		sc->compl_queues[idx].head = (uint16_t)value;
1767	}
1768}
1769
1770static void
1771pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1772{
1773	const char *s = iswrite ? "WRITE" : "READ";
1774
1775	switch (offset) {
1776	case NVME_CR_CAP_LOW:
1777		DPRINTF(("%s %s NVME_CR_CAP_LOW", func, s));
1778		break;
1779	case NVME_CR_CAP_HI:
1780		DPRINTF(("%s %s NVME_CR_CAP_HI", func, s));
1781		break;
1782	case NVME_CR_VS:
1783		DPRINTF(("%s %s NVME_CR_VS", func, s));
1784		break;
1785	case NVME_CR_INTMS:
1786		DPRINTF(("%s %s NVME_CR_INTMS", func, s));
1787		break;
1788	case NVME_CR_INTMC:
1789		DPRINTF(("%s %s NVME_CR_INTMC", func, s));
1790		break;
1791	case NVME_CR_CC:
1792		DPRINTF(("%s %s NVME_CR_CC", func, s));
1793		break;
1794	case NVME_CR_CSTS:
1795		DPRINTF(("%s %s NVME_CR_CSTS", func, s));
1796		break;
1797	case NVME_CR_NSSR:
1798		DPRINTF(("%s %s NVME_CR_NSSR", func, s));
1799		break;
1800	case NVME_CR_AQA:
1801		DPRINTF(("%s %s NVME_CR_AQA", func, s));
1802		break;
1803	case NVME_CR_ASQ_LOW:
1804		DPRINTF(("%s %s NVME_CR_ASQ_LOW", func, s));
1805		break;
1806	case NVME_CR_ASQ_HI:
1807		DPRINTF(("%s %s NVME_CR_ASQ_HI", func, s));
1808		break;
1809	case NVME_CR_ACQ_LOW:
1810		DPRINTF(("%s %s NVME_CR_ACQ_LOW", func, s));
1811		break;
1812	case NVME_CR_ACQ_HI:
1813		DPRINTF(("%s %s NVME_CR_ACQ_HI", func, s));
1814		break;
1815	default:
1816		DPRINTF(("unknown nvme bar-0 offset 0x%lx", offset));
1817	}
1818
1819}
1820
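/*
 * Handle BAR0 register writes. Doorbell writes start at
 * NVME_DOORBELL_OFFSET; with a doorbell stride of 0, SQ y's tail
 * doorbell lives at NVME_DOORBELL_OFFSET + (8 * y) and CQ y's head
 * doorbell at NVME_DOORBELL_OFFSET + (8 * y) + 4, which is how idx and
 * is_sq are derived below.
 */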
1821static void
1822pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1823	uint64_t offset, int size, uint64_t value)
1824{
1825	uint32_t ccreg;
1826
1827	if (offset >= NVME_DOORBELL_OFFSET) {
1828		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* SQ tail + CQ head: two 4-byte doorbells per queue */
1830		int is_sq = (belloffset % 8) < 4;
1831
1832		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1833			WPRINTF(("guest attempted an overflow write offset "
1834			         "0x%lx, val 0x%lx in %s",
1835			         offset, value, __func__));
1836			return;
1837		}
1838
1839		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1840		return;
1841	}
1842
1843	DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx",
1844	        offset, size, value));
1845
1846	if (size != 4) {
1847		WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
1848		         "val 0x%lx) to bar0 in %s",
1849		         size, offset, value, __func__));
1850		/* TODO: shutdown device */
1851		return;
1852	}
1853
1854	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1855
1856	pthread_mutex_lock(&sc->mtx);
1857
1858	switch (offset) {
1859	case NVME_CR_CAP_LOW:
1860	case NVME_CR_CAP_HI:
1861		/* readonly */
1862		break;
1863	case NVME_CR_VS:
1864		/* readonly */
1865		break;
1866	case NVME_CR_INTMS:
1867		/* MSI-X, so ignore */
1868		break;
1869	case NVME_CR_INTMC:
1870		/* MSI-X, so ignore */
1871		break;
1872	case NVME_CR_CC:
1873		ccreg = (uint32_t)value;
1874
1875		DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1876		         "iocqes %u",
1877		        __func__,
1878			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1879			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1880			 NVME_CC_GET_IOCQES(ccreg)));
1881
1882		if (NVME_CC_GET_SHN(ccreg)) {
1883			/* perform shutdown - flush out data to backend */
1884			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1885			    NVME_CSTS_REG_SHST_SHIFT);
1886			sc->regs.csts |= NVME_SHST_COMPLETE <<
1887			    NVME_CSTS_REG_SHST_SHIFT;
1888		}
1889		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1890			if (NVME_CC_GET_EN(ccreg) == 0)
1891				/* transition 1-> causes controller reset */
1892				pci_nvme_reset_locked(sc);
1893			else
1894				pci_nvme_init_controller(ctx, sc);
1895		}
1896
1897		/* Insert the iocqes, iosqes and en bits from the write */
1898		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1899		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1900		if (NVME_CC_GET_EN(ccreg) == 0) {
1901			/* Insert the ams, mps and css bit fields */
1902			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1903			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1904			sc->regs.csts &= ~NVME_CSTS_RDY;
1905		} else if (sc->pending_ios == 0) {
1906			sc->regs.csts |= NVME_CSTS_RDY;
1907		}
1908		break;
1909	case NVME_CR_CSTS:
1910		break;
1911	case NVME_CR_NSSR:
1912		/* ignore writes; don't support subsystem reset */
1913		break;
1914	case NVME_CR_AQA:
1915		sc->regs.aqa = (uint32_t)value;
1916		break;
1917	case NVME_CR_ASQ_LOW:
1918		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1919		               (0xFFFFF000 & value);
1920		break;
1921	case NVME_CR_ASQ_HI:
1922		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1923		               (value << 32);
1924		break;
1925	case NVME_CR_ACQ_LOW:
1926		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1927		               (0xFFFFF000 & value);
1928		break;
1929	case NVME_CR_ACQ_HI:
1930		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1931		               (value << 32);
1932		break;
1933	default:
1934		DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d",
1935		         __func__, offset, value, size));
1936	}
1937	pthread_mutex_unlock(&sc->mtx);
1938}
1939
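/* PCI BAR write dispatcher: MSI-X table/PBA accesses vs. BAR0 registers. */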
static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
                int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		         " value 0x%lx", baridx, offset, size, value));

		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;

	default:
		DPRINTF(("%s unknown baridx %d, val 0x%lx",
		         __func__, baridx, value));
	}
}

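/*
 * Handle a guest read from BAR0: return the controller register bytes at
 * 'offset', masked to the access size.  Reads at or above the doorbell
 * area are invalid and return 0.
 */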
static uint64_t
pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF(("pci_nvme: read invalid offset %ld", offset));
	}

	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x",
	         offset, size, (uint32_t)value));

	return (value);
}

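/* PCI BAR read dispatcher: MSI-X table/PBA accesses vs. BAR0 registers. */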
static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		        baridx, offset, size));

		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);

	default:
		DPRINTF(("unknown bar %d, 0x%lx", baridx, offset));
	}

	return (0);
}

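/*
 * Parse the comma-separated option string passed on the bhyve command line
 * (see the option summary in the block comment at the top of this file).
 * Returns 0 on success, -1 on any invalid or missing option.
 */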
static int
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
{
	char bident[sizeof("XX:X:X")];
	char	*uopt, *xopts, *config;
	uint32_t sectsz;
	int optidx;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
	sectsz = 0;

	uopt = strdup(opts);
	optidx = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	for (xopts = strtok(uopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {

		if ((config = strchr(xopts, '=')) != NULL)
			*config++ = '\0';

		if (!strcmp("maxq", xopts)) {
			sc->max_queues = atoi(config);
		} else if (!strcmp("qsz", xopts)) {
			sc->max_qentries = atoi(config);
		} else if (!strcmp("ioslots", xopts)) {
			sc->ioslots = atoi(config);
		} else if (!strcmp("sectsz", xopts)) {
			sectsz = atoi(config);
		} else if (!strcmp("ser", xopts)) {
			/*
			 * This field indicates the Product Serial Number in
			 * 7-bit ASCII; unused bytes should be space characters.
			 * Ref: NVMe v1.3c.
			 */
			cpywithpad((char *)sc->ctrldata.sn,
			           sizeof(sc->ctrldata.sn), config, ' ');
		} else if (!strcmp("ram", xopts)) {
			/* &xopts[4] points at the MiB count that followed "ram=" */
			uint64_t sz = strtoull(&xopts[4], NULL, 10);

			sc->nvstore.type = NVME_STOR_RAM;
			sc->nvstore.size = sz * 1024 * 1024;
			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
			sc->nvstore.sectsz = 4096;
			sc->nvstore.sectsz_bits = 12;
			if (sc->nvstore.ctx == NULL) {
				perror("Unable to allocate RAM");
				free(uopt);
				return (-1);
			}
		} else if (!strcmp("eui64", xopts)) {
			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
		} else if (!strcmp("dsm", xopts)) {
			if (!strcmp("auto", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
			else if (!strcmp("enable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
			else if (!strcmp("disable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
		} else if (optidx == 0) {
			snprintf(bident, sizeof(bident), "%d:%d",
			         sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
			sc->nvstore.ctx = blockif_open(xopts, bident);
			if (sc->nvstore.ctx == NULL) {
				perror("Could not open backing file");
				free(uopt);
				return (-1);
			}
			sc->nvstore.type = NVME_STOR_BLOCKIF;
			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
		} else {
			EPRINTLN("Invalid option %s", xopts);
			free(uopt);
			return (-1);
		}

		optidx++;
	}
	free(uopt);

	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
		EPRINTLN("backing store not specified");
		return (-1);
	}
	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);

	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	if (sc->max_qentries <= 0) {
		EPRINTLN("Invalid qsz option");
		return (-1);
	}
	if (sc->ioslots <= 0) {
		EPRINTLN("Invalid ioslots option");
		return (-1);
	}

	return (0);
}

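/*
 * Device model initialization: parse options, allocate I/O request slots,
 * set up PCI config space, BAR0, MSI-X and PCIe capabilities, and reset
 * the emulated controller to its power-on state.
 */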
static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int	error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_opts(sc, opts);
	if (error < 0)
		goto done;
	else
		error = 0;

	STAILQ_INIT(&sc->ioreqs_free);
	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (int i = 0; i < sc->ioslots; i++) {
		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
		pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
		pthread_cond_init(&sc->ioreqs[i].cv, NULL);
	}
	sc->intr_coales_aggr_thresh = 1;

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of 16K.
	 * The Windows driver will refuse to start a device with a smaller
	 * window.
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);

	DPRINTF(("nvme membar size: %u", pci_membar_sz));

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF(("%s pci alloc mem bar failed", __func__));
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error) {
		WPRINTF(("%s pci add msixcap failed", __func__));
		goto done;
	}

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error) {
		WPRINTF(("%s pci add Express capability failed", __func__));
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);

	pci_nvme_reset(sc);
	/*
	 * Controller data depends on Namespace data so initialize Namespace
	 * data first.
	 */
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_logpages(sc);

	pci_lintr_request(pi);

done:
	return (error);
}

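/* bhyve PCI emulation hooks; PCI_EMUL_SET() registers the "nvme" device model. */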
struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);