xref: /illumos-gate/usr/src/cmd/bhyve/pci_nvme.c (revision b0de25cb)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = DataSet Management support. Option is one of auto, enable, disable
51  *
52  */
53 
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 #ifndef __FreeBSD__
66 #include <endian.h>
67 #endif
68 
69 #include <assert.h>
70 #include <pthread.h>
71 #include <semaphore.h>
72 #include <stdbool.h>
73 #include <stddef.h>
74 #include <stdint.h>
75 #include <stdio.h>
76 #include <stdlib.h>
77 #include <string.h>
78 
79 #include <machine/atomic.h>
80 #include <machine/vmm.h>
81 #include <vmmapi.h>
82 
83 #include <dev/nvme/nvme.h>
84 
85 #include "bhyverun.h"
86 #include "block_if.h"
87 #include "config.h"
88 #include "debug.h"
89 #include "pci_emul.h"
90 
91 
/* Debug tracing switch: DPRINTF() only emits output when this is non-zero */
static int nvme_debug = 0;
/*
 * DPRINTF: print a line via PRINTLN when nvme_debug is set.
 * Wrapped in do { } while (0) so the macro is a single statement and an
 * "else" following an unbraced "if (x) DPRINTF(...);" binds to the caller's
 * "if" rather than to the hidden one inside the macro.
 */
#define	DPRINTF(fmt, args...) \
	do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
/* WPRINTF: unconditionally print a line via PRINTLN */
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
95 
/* Default values; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* BAR0 bits 13:4 are reserved by the NVMe spec, hence a 16KiB minimum */
#define NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048
/* Value of Memory Page Size Minimum (MPSMIN) reported in the CAP register */
#define	NVME_MPSMIN		0
/* The same minimum page size expressed in bytes: 2^(12 + MPSMIN) */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

/* Count of 64-bit entries that fit in one page */
#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
/* Maximum Data Transfer Size exponent (transfer limit is 2^MDTS pages) */
#define	NVME_MDTS		9
/* The extra entry (+ 1) covers an initial descriptor that is not page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* Synthetic status value used to mean "no status has been set" */
#define NVME_NO_STATUS		0xffff
/* True when completion (c) carries a real status rather than NVME_NO_STATUS */
#define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
120 
/* helpers */

/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)		((one)  - 1)

/*
 * Encode number of SQ's and CQ's for Set/Get Features:
 * bits 15:0 hold the zero-based SQ count, bits 31:16 the zero-based CQ count.
 * The expansion is fully parenthesized and has no trailing semicolon so it
 * can be used as an ordinary expression (callers supply their own ';').
 */
#define NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))

/* Byte offset of the doorbell array within struct nvme_registers */
#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
134 
/*
 * Byte offsets of the controller registers, named after the register
 * mnemonics used in the NVMe specification.
 */
enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	/* 0x18 is not assigned a name here */
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};
150 
/* Bit masks applied to Command Dword 11 of queue creation commands */
enum nvme_cmd_cdw11 {
	/* Set when the queue memory is contiguous (not list-based) */
	NVME_CMD_CDW11_PC  = 0x0001,
	/* presumably the interrupt-enable bit -- TODO confirm against spec */
	NVME_CMD_CDW11_IEN = 0x0002,
	/* presumably the interrupt vector field -- TODO confirm against spec */
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};
156 
/* Direction selector for nvme_prp_memcpy() */
enum nvme_copy_dir {
	NVME_COPY_TO_PRP,	/* host buffer -> guest memory named by the PRPs */
	NVME_COPY_FROM_PRP,	/* guest memory named by the PRPs -> host buffer */
};
161 
/* Flag bits stored in nvme_completion_queue.intr_en */
#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

/* State kept for each Completion Queue */
struct nvme_completion_queue {
	struct nvme_completion *qbase;	/* queue entries; NULL when unused */
	pthread_mutex_t	mtx;
	uint32_t	size;		/* number of entries */
	uint16_t	tail;		/* nvme progress */
	uint16_t	head;		/* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;	/* NVME_CQ_* flags */
};
174 
/* State kept for each Submission Queue */
struct nvme_submission_queue {
	struct nvme_command *qbase;	/* queue entries; NULL when unused */
	pthread_mutex_t	mtx;
	uint32_t	size;		/* number of entries */
	uint16_t	head;		/* nvme progress */
	uint16_t	tail;		/* guest progress */
	uint16_t	cqid;		/* completion queue id */
	int		qpriority;
};
184 
/* Kind of backing store behind a namespace */
enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,	/* backed through the blockif layer */
	NVME_STOR_RAM = 1,	/* backed by memory (the "ram=" devpath) */
};
189 
190 struct pci_nvme_blockstore {
191 	enum nvme_storage_type type;
192 	void		*ctx;
193 	uint64_t	size;
194 	uint32_t	sectsz;
195 	uint32_t	sectsz_bits;
196 	uint64_t	eui64;
197 	uint32_t	deallocate:1;
198 };
199 
/*
 * Number of extra iovec entries a guest IO request needs beyond the
 * BLOCKIF_IOV_MAX entries already present in struct blockif_req, so that a
 * transfer of the advertised Max Data Transfer Size (MDTS) can be described.
 * Evaluates to 0 when struct blockif_req is already large enough.
 */
#define MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )
209 
210 struct pci_nvme_ioreq {
211 	struct pci_nvme_softc *sc;
212 	STAILQ_ENTRY(pci_nvme_ioreq) link;
213 	struct nvme_submission_queue *nvme_sq;
214 	uint16_t	sqid;
215 
216 	/* command information */
217 	uint16_t	opc;
218 	uint16_t	cid;
219 	uint32_t	nsid;
220 
221 	uint64_t	prev_gpaddr;
222 	size_t		prev_size;
223 	size_t		bytes;
224 
225 	struct blockif_req io_req;
226 
227 	struct iovec	iovpadding[MDTS_PAD_SIZE];
228 };
229 
/* Policy for the Dataset Management bit in ONCS (the "dsm" option) */
enum nvme_dsm_type {
	/* ONCS bit follows what the backing storage is capable of */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* ONCS bit is always set */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* ONCS bit is always clear */
	NVME_DATASET_MANAGEMENT_DISABLE,
};
238 
struct pci_nvme_softc;
struct nvme_feature_obj;

/*
 * Handler signature for a Set Features / Get Features operation.
 * Receives the controller, the feature's table entry, the command and the
 * completion to fill in.
 */
typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

/* Table entry describing one feature identifier */
struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;	/* Set Features handler */
	nvme_feature_cb	get;	/* Get Features handler */
	bool namespace_specific;
};
253 
/* Size of the feature table: one slot per feature id up to the last one used */
#define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
255 
/* One outstanding Asynchronous Event Request, kept on the softc's aer_list */
struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID the host used for this AER */
};
260 
261 struct pci_nvme_softc {
262 	struct pci_devinst *nsc_pi;
263 
264 	pthread_mutex_t	mtx;
265 
266 	struct nvme_registers regs;
267 
268 	struct nvme_namespace_data  nsdata;
269 	struct nvme_controller_data ctrldata;
270 	struct nvme_error_information_entry err_log;
271 	struct nvme_health_information_page health_log;
272 	struct nvme_firmware_page fw_log;
273 
274 	struct pci_nvme_blockstore nvstore;
275 
276 	uint16_t	max_qentries;	/* max entries per queue */
277 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
278 	uint32_t	num_cqueues;
279 	uint32_t	num_squeues;
280 	bool		num_q_is_set; /* Has host set Number of Queues */
281 
282 	struct pci_nvme_ioreq *ioreqs;
283 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
284 	uint32_t	pending_ios;
285 	uint32_t	ioslots;
286 	sem_t		iosemlock;
287 
288 	/*
289 	 * Memory mapped Submission and Completion queues
290 	 * Each array includes both Admin and IO queues
291 	 */
292 	struct nvme_completion_queue *compl_queues;
293 	struct nvme_submission_queue *submit_queues;
294 
295 	struct nvme_feature_obj feat[NVME_FID_MAX];
296 
297 	enum nvme_dsm_type dataset_management;
298 
299 	/* Accounting for SMART data */
300 	__uint128_t	read_data_units;
301 	__uint128_t	write_data_units;
302 	__uint128_t	read_commands;
303 	__uint128_t	write_commands;
304 	uint32_t	read_dunits_remainder;
305 	uint32_t	write_dunits_remainder;
306 
307 	STAILQ_HEAD(, pci_nvme_aer) aer_list;
308 	uint32_t	aer_count;
309 };
310 
311 
/* Forward declarations for the IO request helpers defined later in the file */
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *,
    struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);
315 
/* Controller Configuration (CC) field extractors: shift down, then mask */
#define	NVME_CC_GET_EN(cc) \
	(((cc) >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	(((cc) >> NVME_CC_REG_CSS_SHIFT) & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	(((cc) >> NVME_CC_REG_SHN_SHIFT) & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	(((cc) >> NVME_CC_REG_IOSQES_SHIFT) & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	(((cc) >> NVME_CC_REG_IOCQES_SHIFT) & NVME_CC_REG_IOCQES_MASK)

/* In-place mask covering the EN, IOSQES and IOCQES fields of CC */
#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

/*
 * In-place mask covering the CSS, MPS and AMS fields of CC.
 * NOTE(review): the "NEN" in the name suggests these are writable only while
 * the controller is not enabled -- confirm against the CC write handler.
 */
#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status (CSTS) helpers */
#define	NVME_CSTS_GET_RDY(sts) \
	(((sts) >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue Entry status word helpers */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
/* In-place mask of the Status Code Type and Status Code fields */
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

/* Dataset Management bit(s) positioned for the ONCS field */
#define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
352 
/*
 * Forward declarations of the feature handlers installed by
 * pci_nvme_init_features(); all share the nvme_feature_cb signature.
 */
static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *, struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *, struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *, struct nvme_command *,
    struct nvme_completion *);
365 
/*
 * Fill dst (dst_size bytes) with the pad character, then overlay the leading
 * bytes of the string src, copying at most dst_size of them.
 * The result is NOT NUL terminated.
 */
static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	const char *nul = memchr(src, '\0', dst_size);
	size_t ncopy = (nul != NULL) ? (size_t)(nul - src) : dst_size;

	memset(dst, pad, dst_size);
	memcpy(dst, src, ncopy);
}
375 
376 static __inline void
pci_nvme_status_tc(uint16_t * status,uint16_t type,uint16_t code)377 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
378 {
379 
380 	*status &= ~NVME_STATUS_MASK;
381 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
382 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
383 }
384 
385 static __inline void
pci_nvme_status_genc(uint16_t * status,uint16_t code)386 pci_nvme_status_genc(uint16_t *status, uint16_t code)
387 {
388 
389 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
390 }
391 
392 /*
393  * Initialize the requested number or IO Submission and Completion Queues.
394  * Admin queues are allocated implicitly.
395  */
396 static void
pci_nvme_init_queues(struct pci_nvme_softc * sc,uint32_t nsq,uint32_t ncq)397 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
398 {
399 	uint32_t i;
400 
401 	/*
402 	 * Allocate and initialize the Submission Queues
403 	 */
404 	if (nsq > NVME_QUEUES) {
405 		WPRINTF("%s: clamping number of SQ from %u to %u",
406 					__func__, nsq, NVME_QUEUES);
407 		nsq = NVME_QUEUES;
408 	}
409 
410 	sc->num_squeues = nsq;
411 
412 	sc->submit_queues = calloc(sc->num_squeues + 1,
413 				sizeof(struct nvme_submission_queue));
414 	if (sc->submit_queues == NULL) {
415 		WPRINTF("%s: SQ allocation failed", __func__);
416 		sc->num_squeues = 0;
417 	} else {
418 		struct nvme_submission_queue *sq = sc->submit_queues;
419 
420 		for (i = 0; i < sc->num_squeues; i++)
421 			pthread_mutex_init(&sq[i].mtx, NULL);
422 	}
423 
424 	/*
425 	 * Allocate and initialize the Completion Queues
426 	 */
427 	if (ncq > NVME_QUEUES) {
428 		WPRINTF("%s: clamping number of CQ from %u to %u",
429 					__func__, ncq, NVME_QUEUES);
430 		ncq = NVME_QUEUES;
431 	}
432 
433 	sc->num_cqueues = ncq;
434 
435 	sc->compl_queues = calloc(sc->num_cqueues + 1,
436 				sizeof(struct nvme_completion_queue));
437 	if (sc->compl_queues == NULL) {
438 		WPRINTF("%s: CQ allocation failed", __func__);
439 		sc->num_cqueues = 0;
440 	} else {
441 		struct nvme_completion_queue *cq = sc->compl_queues;
442 
443 		for (i = 0; i < sc->num_cqueues; i++)
444 			pthread_mutex_init(&cq[i].mtx, NULL);
445 	}
446 }
447 
448 static void
pci_nvme_init_ctrldata(struct pci_nvme_softc * sc)449 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
450 {
451 	struct nvme_controller_data *cd = &sc->ctrldata;
452 
453 	cd->vid = 0xFB5D;
454 	cd->ssvid = 0x0000;
455 
456 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
457 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
458 
459 	/* Num of submission commands that we can handle at a time (2^rab) */
460 	cd->rab   = 4;
461 
462 	/* FreeBSD OUI */
463 	cd->ieee[0] = 0x58;
464 	cd->ieee[1] = 0x9c;
465 	cd->ieee[2] = 0xfc;
466 
467 	cd->mic = 0;
468 
469 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
470 
471 	cd->ver = 0x00010300;
472 
473 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
474 	cd->acl = 2;
475 	cd->aerl = 4;
476 
477 	/* Advertise 1, Read-only firmware slot */
478 	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
479 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
480 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
481 	cd->elpe = 0;	/* max error log page entries */
482 	cd->npss = 1;	/* number of power states support */
483 
484 	/* Warning Composite Temperature Threshold */
485 	cd->wctemp = 0x0157;
486 
487 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
488 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
489 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
490 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
491 	cd->nn = 1;	/* number of namespaces */
492 
493 	cd->oncs = 0;
494 	switch (sc->dataset_management) {
495 	case NVME_DATASET_MANAGEMENT_AUTO:
496 		if (sc->nvstore.deallocate)
497 			cd->oncs |= NVME_ONCS_DSM;
498 		break;
499 	case NVME_DATASET_MANAGEMENT_ENABLE:
500 		cd->oncs |= NVME_ONCS_DSM;
501 		break;
502 	default:
503 		break;
504 	}
505 
506 	cd->fna = 0x03;
507 
508 	cd->power_state[0].mp = 10;
509 }
510 
/*
 * Compute a table-driven CRC-16 over len bytes starting at buffer,
 * continuing from the running value crc (pass 0 to start a new checksum).
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};
	const unsigned char *bytes = buffer;
	unsigned int idx;

	for (idx = 0; idx < len; idx++) {
		/* Table slot is selected by (low CRC byte XOR input byte) */
		uint16_t entry = crc16_table[(crc ^ bytes[idx]) & 0xffU];

		crc = (((crc >> 8) & 0xffU) ^ entry) & 0x0000ffffU;
	}
	return crc;
}
560 
561 static void
pci_nvme_init_nsdata(struct pci_nvme_softc * sc,struct nvme_namespace_data * nd,uint32_t nsid,struct pci_nvme_blockstore * nvstore)562 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
563     struct nvme_namespace_data *nd, uint32_t nsid,
564     struct pci_nvme_blockstore *nvstore)
565 {
566 
567 	/* Get capacity and block size information from backing store */
568 	nd->nsze = nvstore->size / nvstore->sectsz;
569 	nd->ncap = nd->nsze;
570 	nd->nuse = nd->nsze;
571 
572 	if (nvstore->type == NVME_STOR_BLOCKIF)
573 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
574 
575 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
576 	nd->flbas = 0;
577 
578 	/* Create an EUI-64 if user did not provide one */
579 	if (nvstore->eui64 == 0) {
580 		char *data = NULL;
581 		uint64_t eui64 = nvstore->eui64;
582 
583 		asprintf(&data, "%s%u%u%u", get_config_value("name"),
584 		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
585 		    sc->nsc_pi->pi_func);
586 
587 		if (data != NULL) {
588 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
589 			free(data);
590 		}
591 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
592 	}
593 	be64enc(nd->eui64, nvstore->eui64);
594 
595 	/* LBA data-sz = 2^lbads */
596 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
597 }
598 
599 static void
pci_nvme_init_logpages(struct pci_nvme_softc * sc)600 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
601 {
602 
603 	memset(&sc->err_log, 0, sizeof(sc->err_log));
604 	memset(&sc->health_log, 0, sizeof(sc->health_log));
605 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
606 
607 	/* Set read/write remainder to round up according to spec */
608 	sc->read_dunits_remainder = 999;
609 	sc->write_dunits_remainder = 999;
610 
611 	/* Set nominal Health values checked by implementations */
612 	sc->health_log.temperature = 310;
613 	sc->health_log.available_spare = 100;
614 	sc->health_log.available_spare_threshold = 10;
615 }
616 
617 static void
pci_nvme_init_features(struct pci_nvme_softc * sc)618 pci_nvme_init_features(struct pci_nvme_softc *sc)
619 {
620 
621 	sc->feat[0].set = nvme_feature_invalid_cb;
622 	sc->feat[0].get = nvme_feature_invalid_cb;
623 
624 	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
625 	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
626 	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
627 	sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
628 	    nvme_feature_iv_config;
629 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
630 	    nvme_feature_invalid_cb;
631 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
632 	    nvme_feature_invalid_cb;
633 }
634 
635 static void
pci_nvme_aer_init(struct pci_nvme_softc * sc)636 pci_nvme_aer_init(struct pci_nvme_softc *sc)
637 {
638 
639 	STAILQ_INIT(&sc->aer_list);
640 	sc->aer_count = 0;
641 }
642 
643 static void
pci_nvme_aer_destroy(struct pci_nvme_softc * sc)644 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
645 {
646 	struct pci_nvme_aer *aer = NULL;
647 
648 	while (!STAILQ_EMPTY(&sc->aer_list)) {
649 		aer = STAILQ_FIRST(&sc->aer_list);
650 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
651 		free(aer);
652 	}
653 
654 	pci_nvme_aer_init(sc);
655 }
656 
#ifdef __FreeBSD__
/* Report whether at least one AER is queued on sc->aer_list */
static bool
pci_nvme_aer_available(struct pci_nvme_softc *sc)
{
	return (STAILQ_FIRST(&sc->aer_list) != NULL);
}
#else
/* This is kept behind an ifdef while it's unused to appease the compiler. */
#endif
667 
668 static bool
pci_nvme_aer_limit_reached(struct pci_nvme_softc * sc)669 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
670 {
671 	struct nvme_controller_data *cd = &sc->ctrldata;
672 
673 	/* AERL is a zero based value while aer_count is one's based */
674 	return (sc->aer_count == (cd->aerl + 1));
675 }
676 
677 /*
678  * Add an Async Event Request
679  *
680  * Stores an AER to be returned later if the Controller needs to notify the
681  * host of an event.
682  * Note that while the NVMe spec doesn't require Controllers to return AER's
683  * in order, this implementation does preserve the order.
684  */
685 static int
pci_nvme_aer_add(struct pci_nvme_softc * sc,uint16_t cid)686 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
687 {
688 	struct pci_nvme_aer *aer = NULL;
689 
690 	if (pci_nvme_aer_limit_reached(sc))
691 		return (-1);
692 
693 	aer = calloc(1, sizeof(struct pci_nvme_aer));
694 	if (aer == NULL)
695 		return (-1);
696 
697 	sc->aer_count++;
698 
699 	/* Save the Command ID for use in the completion message */
700 	aer->cid = cid;
701 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
702 
703 	return (0);
704 }
705 
/*
 * Dequeue an Async Event Request
 *
 * Removes and returns the oldest AER submitted by the host, or returns NULL
 * when none are queued. The caller owns (and must free) the returned struct.
 */
#ifdef __FreeBSD__
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *oldest = STAILQ_FIRST(&sc->aer_list);

	if (oldest == NULL)
		return (NULL);

	STAILQ_REMOVE_HEAD(&sc->aer_list, link);
	sc->aer_count--;

	return (oldest);
}
#else
/* This is kept behind an ifdef while it's unused to appease the compiler. */
#endif
729 
730 static void
pci_nvme_reset_locked(struct pci_nvme_softc * sc)731 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
732 {
733 	uint32_t i;
734 
735 	DPRINTF("%s", __func__);
736 
737 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
738 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
739 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
740 
741 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
742 
743 	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
744 
745 	sc->regs.cc = 0;
746 	sc->regs.csts = 0;
747 
748 	assert(sc->submit_queues != NULL);
749 
750 	for (i = 0; i < sc->num_squeues + 1; i++) {
751 		sc->submit_queues[i].qbase = NULL;
752 		sc->submit_queues[i].size = 0;
753 		sc->submit_queues[i].cqid = 0;
754 		sc->submit_queues[i].tail = 0;
755 		sc->submit_queues[i].head = 0;
756 	}
757 
758 	assert(sc->compl_queues != NULL);
759 
760 	for (i = 0; i < sc->num_cqueues + 1; i++) {
761 		sc->compl_queues[i].qbase = NULL;
762 		sc->compl_queues[i].size = 0;
763 		sc->compl_queues[i].tail = 0;
764 		sc->compl_queues[i].head = 0;
765 	}
766 
767 	sc->num_q_is_set = false;
768 
769 	pci_nvme_aer_destroy(sc);
770 }
771 
772 static void
pci_nvme_reset(struct pci_nvme_softc * sc)773 pci_nvme_reset(struct pci_nvme_softc *sc)
774 {
775 	pthread_mutex_lock(&sc->mtx);
776 	pci_nvme_reset_locked(sc);
777 	pthread_mutex_unlock(&sc->mtx);
778 }
779 
780 static void
pci_nvme_init_controller(struct vmctx * ctx,struct pci_nvme_softc * sc)781 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
782 {
783 	uint16_t acqs, asqs;
784 
785 	DPRINTF("%s", __func__);
786 
787 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
788 	sc->submit_queues[0].size = asqs;
789 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
790 	            sizeof(struct nvme_command) * asqs);
791 
792 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
793 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
794 
795 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
796 	    NVME_AQA_REG_ACQS_MASK) + 1;
797 	sc->compl_queues[0].size = acqs;
798 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
799 	         sizeof(struct nvme_completion) * acqs);
800 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
801 
802 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
803 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
804 }
805 
806 static int
nvme_prp_memcpy(struct vmctx * ctx,uint64_t prp1,uint64_t prp2,uint8_t * b,size_t len,enum nvme_copy_dir dir)807 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
808 	size_t len, enum nvme_copy_dir dir)
809 {
810 	uint8_t *p;
811 	size_t bytes;
812 
813 	if (len > (8 * 1024)) {
814 		return (-1);
815 	}
816 
817 	/* Copy from the start of prp1 to the end of the physical page */
818 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
819 	bytes = MIN(bytes, len);
820 
821 	p = vm_map_gpa(ctx, prp1, bytes);
822 	if (p == NULL) {
823 		return (-1);
824 	}
825 
826 	if (dir == NVME_COPY_TO_PRP)
827 		memcpy(p, b, bytes);
828 	else
829 		memcpy(b, p, bytes);
830 
831 	b += bytes;
832 
833 	len -= bytes;
834 	if (len == 0) {
835 		return (0);
836 	}
837 
838 	len = MIN(len, PAGE_SIZE);
839 
840 	p = vm_map_gpa(ctx, prp2, len);
841 	if (p == NULL) {
842 		return (-1);
843 	}
844 
845 	if (dir == NVME_COPY_TO_PRP)
846 		memcpy(p, b, len);
847 	else
848 		memcpy(b, p, len);
849 
850 	return (0);
851 }
852 
853 /*
854  * Write a Completion Queue Entry update
855  *
856  * Write the completion and update the doorbell value
857  */
858 static void
pci_nvme_cq_update(struct pci_nvme_softc * sc,struct nvme_completion_queue * cq,uint32_t cdw0,uint16_t cid,uint16_t sqid,uint16_t status)859 pci_nvme_cq_update(struct pci_nvme_softc *sc,
860 		struct nvme_completion_queue *cq,
861 		uint32_t cdw0,
862 		uint16_t cid,
863 		uint16_t sqid,
864 		uint16_t status)
865 {
866 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
867 	struct nvme_completion *cqe;
868 
869 	assert(cq->qbase != NULL);
870 
871 	pthread_mutex_lock(&cq->mtx);
872 
873 	cqe = &cq->qbase[cq->tail];
874 
875 	/* Flip the phase bit */
876 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
877 
878 	cqe->cdw0 = cdw0;
879 	cqe->sqhd = sq->head;
880 	cqe->sqid = sqid;
881 	cqe->cid = cid;
882 	cqe->status = status;
883 
884 	cq->tail++;
885 	if (cq->tail >= cq->size) {
886 		cq->tail = 0;
887 	}
888 
889 	pthread_mutex_unlock(&cq->mtx);
890 }
891 
892 static int
nvme_opc_delete_io_sq(struct pci_nvme_softc * sc,struct nvme_command * command,struct nvme_completion * compl)893 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
894 	struct nvme_completion* compl)
895 {
896 	uint16_t qid = command->cdw10 & 0xffff;
897 
898 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
899 	if (qid == 0 || qid > sc->num_squeues ||
900 	    (sc->submit_queues[qid].qbase == NULL)) {
901 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
902 		        __func__, qid, sc->num_squeues);
903 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
904 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
905 		return (1);
906 	}
907 
908 	sc->submit_queues[qid].qbase = NULL;
909 	sc->submit_queues[qid].cqid = 0;
910 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
911 	return (1);
912 }
913 
914 static int
nvme_opc_create_io_sq(struct pci_nvme_softc * sc,struct nvme_command * command,struct nvme_completion * compl)915 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
916 	struct nvme_completion* compl)
917 {
918 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
919 		uint16_t qid = command->cdw10 & 0xffff;
920 		struct nvme_submission_queue *nsq;
921 
922 		if ((qid == 0) || (qid > sc->num_squeues) ||
923 		    (sc->submit_queues[qid].qbase != NULL)) {
924 			WPRINTF("%s queue index %u > num_squeues %u",
925 			        __func__, qid, sc->num_squeues);
926 			pci_nvme_status_tc(&compl->status,
927 			    NVME_SCT_COMMAND_SPECIFIC,
928 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
929 			return (1);
930 		}
931 
932 		nsq = &sc->submit_queues[qid];
933 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
934 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
935 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
936 			/*
937 			 * Queues must specify at least two entries
938 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
939 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
940 			 */
941 			pci_nvme_status_tc(&compl->status,
942 			    NVME_SCT_COMMAND_SPECIFIC,
943 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
944 			return (1);
945 		}
946 		nsq->head = nsq->tail = 0;
947 
948 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
949 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
950 			pci_nvme_status_tc(&compl->status,
951 			    NVME_SCT_COMMAND_SPECIFIC,
952 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
953 			return (1);
954 		}
955 
956 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
957 			pci_nvme_status_tc(&compl->status,
958 			    NVME_SCT_COMMAND_SPECIFIC,
959 			    NVME_SC_COMPLETION_QUEUE_INVALID);
960 			return (1);
961 		}
962 
963 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
964 
965 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
966 		              sizeof(struct nvme_command) * (size_t)nsq->size);
967 
968 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
969 		        qid, nsq->size, nsq->qbase, nsq->cqid);
970 
971 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
972 
973 		DPRINTF("%s completed creating IOSQ qid %u",
974 		         __func__, qid);
975 	} else {
976 		/*
977 		 * Guest sent non-cont submission queue request.
978 		 * This setting is unsupported by this emulation.
979 		 */
980 		WPRINTF("%s unsupported non-contig (list-based) "
981 		         "create i/o submission queue", __func__);
982 
983 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
984 	}
985 	return (1);
986 }
987 
988 static int
nvme_opc_delete_io_cq(struct pci_nvme_softc * sc,struct nvme_command * command,struct nvme_completion * compl)989 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
990 	struct nvme_completion* compl)
991 {
992 	uint16_t qid = command->cdw10 & 0xffff;
993 	uint16_t sqid;
994 
995 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
996 	if (qid == 0 || qid > sc->num_cqueues ||
997 	    (sc->compl_queues[qid].qbase == NULL)) {
998 		WPRINTF("%s queue index %u / num_cqueues %u",
999 		        __func__, qid, sc->num_cqueues);
1000 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1001 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1002 		return (1);
1003 	}
1004 
1005 	/* Deleting an Active CQ is an error */
1006 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1007 		if (sc->submit_queues[sqid].cqid == qid) {
1008 			pci_nvme_status_tc(&compl->status,
1009 			    NVME_SCT_COMMAND_SPECIFIC,
1010 			    NVME_SC_INVALID_QUEUE_DELETION);
1011 			return (1);
1012 		}
1013 
1014 	sc->compl_queues[qid].qbase = NULL;
1015 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1016 	return (1);
1017 }
1018 
1019 static int
nvme_opc_create_io_cq(struct pci_nvme_softc * sc,struct nvme_command * command,struct nvme_completion * compl)1020 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1021 	struct nvme_completion* compl)
1022 {
1023 	struct nvme_completion_queue *ncq;
1024 	uint16_t qid = command->cdw10 & 0xffff;
1025 
1026 	/* Only support Physically Contiguous queues */
1027 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1028 		WPRINTF("%s unsupported non-contig (list-based) "
1029 		         "create i/o completion queue",
1030 		         __func__);
1031 
1032 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1033 		return (1);
1034 	}
1035 
1036 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1037 	    (sc->compl_queues[qid].qbase != NULL)) {
1038 		WPRINTF("%s queue index %u > num_cqueues %u",
1039 			__func__, qid, sc->num_cqueues);
1040 		pci_nvme_status_tc(&compl->status,
1041 		    NVME_SCT_COMMAND_SPECIFIC,
1042 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1043 		return (1);
1044  	}
1045 
1046 	ncq = &sc->compl_queues[qid];
1047 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1048 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1049 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1050 		pci_nvme_status_tc(&compl->status,
1051 		    NVME_SCT_COMMAND_SPECIFIC,
1052 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1053 		return (1);
1054 	}
1055 
1056 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1057 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1058 		/*
1059 		 * Queues must specify at least two entries
1060 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1061 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1062 		 */
1063 		pci_nvme_status_tc(&compl->status,
1064 		    NVME_SCT_COMMAND_SPECIFIC,
1065 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1066 		return (1);
1067 	}
1068 	ncq->head = ncq->tail = 0;
1069 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1070 		     command->prp1,
1071 		     sizeof(struct nvme_command) * (size_t)ncq->size);
1072 
1073 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1074 
1075 
1076 	return (1);
1077 }
1078 
1079 static int
nvme_opc_get_log_page(struct pci_nvme_softc * sc,struct nvme_command * command,struct nvme_completion * compl)1080 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1081 	struct nvme_completion* compl)
1082 {
1083 	uint32_t logsize;
1084 	uint8_t logpage = command->cdw10 & 0xFF;
1085 
1086 #ifndef __FreeBSD__
1087 	logsize = 0;
1088 #endif
1089 
1090 	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1091 
1092 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1093 
1094 	/*
1095 	 * Command specifies the number of dwords to return in fields NUMDU
1096 	 * and NUMDL. This is a zero-based value.
1097 	 */
1098 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1099 	logsize *= sizeof(uint32_t);
1100 
1101 	switch (logpage) {
1102 	case NVME_LOG_ERROR:
1103 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1104 		    command->prp2, (uint8_t *)&sc->err_log,
1105 		    MIN(logsize, sizeof(sc->err_log)),
1106 		    NVME_COPY_TO_PRP);
1107 		break;
1108 	case NVME_LOG_HEALTH_INFORMATION:
1109 		pthread_mutex_lock(&sc->mtx);
1110 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1111 		    sizeof(sc->health_log.data_units_read));
1112 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1113 		    sizeof(sc->health_log.data_units_written));
1114 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1115 		    sizeof(sc->health_log.host_read_commands));
1116 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1117 		    sizeof(sc->health_log.host_write_commands));
1118 		pthread_mutex_unlock(&sc->mtx);
1119 
1120 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1121 		    command->prp2, (uint8_t *)&sc->health_log,
1122 		    MIN(logsize, sizeof(sc->health_log)),
1123 		    NVME_COPY_TO_PRP);
1124 		break;
1125 	case NVME_LOG_FIRMWARE_SLOT:
1126 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1127 		    command->prp2, (uint8_t *)&sc->fw_log,
1128 		    MIN(logsize, sizeof(sc->fw_log)),
1129 		    NVME_COPY_TO_PRP);
1130 		break;
1131 	default:
1132 		DPRINTF("%s get log page %x command not supported",
1133 		        __func__, logpage);
1134 
1135 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1136 		    NVME_SC_INVALID_LOG_PAGE);
1137 	}
1138 
1139 	return (1);
1140 }
1141 
1142 static int
nvme_opc_identify(struct pci_nvme_softc * sc,struct nvme_command * command,struct nvme_completion * compl)1143 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1144 	struct nvme_completion* compl)
1145 {
1146 	void *dest;
1147 	uint16_t status;
1148 
1149 #ifndef __FreeBSD__
1150 	status = 0;
1151 #endif
1152 
1153 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1154 	        command->cdw10 & 0xFF, command->nsid);
1155 
1156 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1157 
1158 	switch (command->cdw10 & 0xFF) {
1159 	case 0x00: /* return Identify Namespace data structure */
1160 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1161 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1162 		    NVME_COPY_TO_PRP);
1163 		break;
1164 	case 0x01: /* return Identify Controller data structure */
1165 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1166 		    command->prp2, (uint8_t *)&sc->ctrldata,
1167 		    sizeof(sc->ctrldata),
1168 		    NVME_COPY_TO_PRP);
1169 		break;
1170 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1171 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1172 		                  sizeof(uint32_t) * 1024);
1173 		/* All unused entries shall be zero */
1174 		bzero(dest, sizeof(uint32_t) * 1024);
1175 		((uint32_t *)dest)[0] = 1;
1176 		break;
1177 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1178 		if (command->nsid != 1) {
1179 			pci_nvme_status_genc(&status,
1180 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1181 			break;
1182 		}
1183 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1184 		                  sizeof(uint32_t) * 1024);
1185 		/* All bytes after the descriptor shall be zero */
1186 		bzero(dest, sizeof(uint32_t) * 1024);
1187 
1188 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1189 		((uint8_t *)dest)[0] = 1;
1190 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1191 		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1192 		break;
1193 	default:
1194 		DPRINTF("%s unsupported identify command requested 0x%x",
1195 		         __func__, command->cdw10 & 0xFF);
1196 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1197 		break;
1198 	}
1199 
1200 	compl->status = status;
1201 	return (1);
1202 }
1203 
1204 static const char *
nvme_fid_to_name(uint8_t fid)1205 nvme_fid_to_name(uint8_t fid)
1206 {
1207 	const char *name;
1208 
1209 	switch (fid) {
1210 	case NVME_FEAT_ARBITRATION:
1211 		name = "Arbitration";
1212 		break;
1213 	case NVME_FEAT_POWER_MANAGEMENT:
1214 		name = "Power Management";
1215 		break;
1216 	case NVME_FEAT_LBA_RANGE_TYPE:
1217 		name = "LBA Range Type";
1218 		break;
1219 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1220 		name = "Temperature Threshold";
1221 		break;
1222 	case NVME_FEAT_ERROR_RECOVERY:
1223 		name = "Error Recovery";
1224 		break;
1225 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1226 		name = "Volatile Write Cache";
1227 		break;
1228 	case NVME_FEAT_NUMBER_OF_QUEUES:
1229 		name = "Number of Queues";
1230 		break;
1231 	case NVME_FEAT_INTERRUPT_COALESCING:
1232 		name = "Interrupt Coalescing";
1233 		break;
1234 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1235 		name = "Interrupt Vector Configuration";
1236 		break;
1237 	case NVME_FEAT_WRITE_ATOMICITY:
1238 		name = "Write Atomicity Normal";
1239 		break;
1240 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1241 		name = "Asynchronous Event Configuration";
1242 		break;
1243 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1244 		name = "Autonomous Power State Transition";
1245 		break;
1246 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1247 		name = "Host Memory Buffer";
1248 		break;
1249 	case NVME_FEAT_TIMESTAMP:
1250 		name = "Timestamp";
1251 		break;
1252 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1253 		name = "Keep Alive Timer";
1254 		break;
1255 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1256 		name = "Host Controlled Thermal Management";
1257 		break;
1258 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1259 		name = "Non-Operation Power State Config";
1260 		break;
1261 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1262 		name = "Read Recovery Level Config";
1263 		break;
1264 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1265 		name = "Predictable Latency Mode Config";
1266 		break;
1267 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1268 		name = "Predictable Latency Mode Window";
1269 		break;
1270 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1271 		name = "LBA Status Information Report Interval";
1272 		break;
1273 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1274 		name = "Host Behavior Support";
1275 		break;
1276 	case NVME_FEAT_SANITIZE_CONFIG:
1277 		name = "Sanitize Config";
1278 		break;
1279 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1280 		name = "Endurance Group Event Configuration";
1281 		break;
1282 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1283 		name = "Software Progress Marker";
1284 		break;
1285 	case NVME_FEAT_HOST_IDENTIFIER:
1286 		name = "Host Identifier";
1287 		break;
1288 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1289 		name = "Reservation Notification Mask";
1290 		break;
1291 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1292 		name = "Reservation Persistence";
1293 		break;
1294 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1295 		name = "Namespace Write Protection Config";
1296 		break;
1297 	default:
1298 		name = "Unknown";
1299 		break;
1300 	}
1301 
1302 	return (name);
1303 }
1304 
1305 static void
nvme_feature_invalid_cb(struct pci_nvme_softc * sc,struct nvme_feature_obj * feat,struct nvme_command * command,struct nvme_completion * compl)1306 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1307     struct nvme_feature_obj *feat,
1308     struct nvme_command *command,
1309     struct nvme_completion *compl)
1310 {
1311 
1312 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1313 }
1314 
1315 static void
nvme_feature_iv_config(struct pci_nvme_softc * sc,struct nvme_feature_obj * feat,struct nvme_command * command,struct nvme_completion * compl)1316 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1317     struct nvme_feature_obj *feat,
1318     struct nvme_command *command,
1319     struct nvme_completion *compl)
1320 {
1321 	uint32_t i;
1322 	uint32_t cdw11 = command->cdw11;
1323 	uint16_t iv;
1324 	bool cd;
1325 
1326 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1327 
1328 	iv = cdw11 & 0xffff;
1329 	cd = cdw11 & (1 << 16);
1330 
1331 	if (iv > (sc->max_queues + 1)) {
1332 		return;
1333 	}
1334 
1335 	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1336 	if ((iv == 0) && !cd)
1337 		return;
1338 
1339 	/* Requested Interrupt Vector must be used by a CQ */
1340 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1341 		if (sc->compl_queues[i].intr_vec == iv) {
1342 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1343 		}
1344 	}
1345 
1346 }
1347 
1348 static void
nvme_feature_num_queues(struct pci_nvme_softc * sc,struct nvme_feature_obj * feat,struct nvme_command * command,struct nvme_completion * compl)1349 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1350     struct nvme_feature_obj *feat,
1351     struct nvme_command *command,
1352     struct nvme_completion *compl)
1353 {
1354 	uint16_t nqr;	/* Number of Queues Requested */
1355 
1356 	if (sc->num_q_is_set) {
1357 		WPRINTF("%s: Number of Queues already set", __func__);
1358 		pci_nvme_status_genc(&compl->status,
1359 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1360 		return;
1361 	}
1362 
1363 	nqr = command->cdw11 & 0xFFFF;
1364 	if (nqr == 0xffff) {
1365 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1366 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1367 		return;
1368 	}
1369 
1370 	sc->num_squeues = ONE_BASED(nqr);
1371 	if (sc->num_squeues > sc->max_queues) {
1372 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1373 					sc->max_queues);
1374 		sc->num_squeues = sc->max_queues;
1375 	}
1376 
1377 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1378 	if (nqr == 0xffff) {
1379 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1380 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1381 		return;
1382 	}
1383 
1384 	sc->num_cqueues = ONE_BASED(nqr);
1385 	if (sc->num_cqueues > sc->max_queues) {
1386 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1387 					sc->max_queues);
1388 		sc->num_cqueues = sc->max_queues;
1389 	}
1390 
1391 	/* Patch the command value which will be saved on callback's return */
1392 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1393 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1394 
1395 	sc->num_q_is_set = true;
1396 }
1397 
1398 static int
nvme_opc_set_features(struct pci_nvme_softc * sc,struct nvme_command * command,struct nvme_completion * compl)1399 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1400 	struct nvme_completion *compl)
1401 {
1402 	struct nvme_feature_obj *feat;
1403 	uint32_t nsid = command->nsid;
1404 	uint8_t fid = command->cdw10 & 0xFF;
1405 
1406 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1407 
1408 	if (fid >= NVME_FID_MAX) {
1409 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1410 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1411 		return (1);
1412 	}
1413 	feat = &sc->feat[fid];
1414 
1415 	if (!feat->namespace_specific &&
1416 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1417 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1418 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1419 		return (1);
1420 	}
1421 
1422 	compl->cdw0 = 0;
1423 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1424 
1425 	if (feat->set)
1426 		feat->set(sc, feat, command, compl);
1427 
1428 	if (compl->status == NVME_SC_SUCCESS)
1429 		feat->cdw11 = command->cdw11;
1430 
1431 	return (0);
1432 }
1433 
1434 static int
nvme_opc_get_features(struct pci_nvme_softc * sc,struct nvme_command * command,struct nvme_completion * compl)1435 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1436 	struct nvme_completion* compl)
1437 {
1438 	struct nvme_feature_obj *feat;
1439 	uint8_t fid = command->cdw10 & 0xFF;
1440 
1441 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1442 
1443 	if (fid >= NVME_FID_MAX) {
1444 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1445 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1446 		return (1);
1447 	}
1448 
1449 	compl->cdw0 = 0;
1450 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1451 
1452 	feat = &sc->feat[fid];
1453 	if (feat->get) {
1454 		feat->get(sc, feat, command, compl);
1455 	}
1456 
1457 	if (compl->status == NVME_SC_SUCCESS) {
1458 		compl->cdw0 = feat->cdw11;
1459 	}
1460 
1461 	return (0);
1462 }
1463 
1464 static int
nvme_opc_format_nvm(struct pci_nvme_softc * sc,struct nvme_command * command,struct nvme_completion * compl)1465 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1466 	struct nvme_completion* compl)
1467 {
1468 	uint8_t	ses, lbaf, pi;
1469 
1470 	/* Only supports Secure Erase Setting - User Data Erase */
1471 	ses = (command->cdw10 >> 9) & 0x7;
1472 	if (ses > 0x1) {
1473 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1474 		return (1);
1475 	}
1476 
1477 	/* Only supports a single LBA Format */
1478 	lbaf = command->cdw10 & 0xf;
1479 	if (lbaf != 0) {
1480 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1481 		    NVME_SC_INVALID_FORMAT);
1482 		return (1);
1483 	}
1484 
1485 	/* Doesn't support Protection Infomation */
1486 	pi = (command->cdw10 >> 5) & 0x7;
1487 	if (pi != 0) {
1488 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1489 		return (1);
1490 	}
1491 
1492 	if (sc->nvstore.type == NVME_STOR_RAM) {
1493 		if (sc->nvstore.ctx)
1494 			free(sc->nvstore.ctx);
1495 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1496 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1497 	} else {
1498 		struct pci_nvme_ioreq *req;
1499 		int err;
1500 
1501 		req = pci_nvme_get_ioreq(sc);
1502 		if (req == NULL) {
1503 			pci_nvme_status_genc(&compl->status,
1504 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1505 			WPRINTF("%s: unable to allocate IO req", __func__);
1506 			return (1);
1507 		}
1508 		req->nvme_sq = &sc->submit_queues[0];
1509 		req->sqid = 0;
1510 		req->opc = command->opc;
1511 		req->cid = command->cid;
1512 		req->nsid = command->nsid;
1513 
1514 		req->io_req.br_offset = 0;
1515 		req->io_req.br_resid = sc->nvstore.size;
1516 		req->io_req.br_callback = pci_nvme_io_done;
1517 
1518 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1519 		if (err) {
1520 			pci_nvme_status_genc(&compl->status,
1521 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1522 			pci_nvme_release_ioreq(sc, req);
1523 		}
1524 	}
1525 
1526 	return (1);
1527 }
1528 
1529 static int
nvme_opc_abort(struct pci_nvme_softc * sc,struct nvme_command * command,struct nvme_completion * compl)1530 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1531 	struct nvme_completion* compl)
1532 {
1533 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1534 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1535 
1536 	/* TODO: search for the command ID and abort it */
1537 
1538 	compl->cdw0 = 1;
1539 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1540 	return (1);
1541 }
1542 
1543 static int
nvme_opc_async_event_req(struct pci_nvme_softc * sc,struct nvme_command * command,struct nvme_completion * compl)1544 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1545 	struct nvme_command* command, struct nvme_completion* compl)
1546 {
1547 	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
1548 
1549 	/* Don't exceed the Async Event Request Limit (AERL). */
1550 	if (pci_nvme_aer_limit_reached(sc)) {
1551 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1552 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1553 		return (1);
1554 	}
1555 
1556 	if (pci_nvme_aer_add(sc, command->cid)) {
1557 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1558 				NVME_SC_INTERNAL_DEVICE_ERROR);
1559 		return (1);
1560 	}
1561 
1562 	/*
1563 	 * Raise events when they happen based on the Set Features cmd.
1564 	 * These events happen async, so only set completion successful if
1565 	 * there is an event reflective of the request to get event.
1566 	 */
1567 	compl->status = NVME_NO_STATUS;
1568 
1569 	return (0);
1570 }
1571 
1572 static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc * sc,uint64_t value)1573 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1574 {
1575 	struct nvme_completion compl;
1576 	struct nvme_command *cmd;
1577 	struct nvme_submission_queue *sq;
1578 	struct nvme_completion_queue *cq;
1579 	uint16_t sqhead;
1580 
1581 	DPRINTF("%s index %u", __func__, (uint32_t)value);
1582 
1583 	sq = &sc->submit_queues[0];
1584 	cq = &sc->compl_queues[0];
1585 
1586 	pthread_mutex_lock(&sq->mtx);
1587 
1588 	sqhead = sq->head;
1589 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1590 
1591 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1592 		cmd = &(sq->qbase)[sqhead];
1593 		compl.cdw0 = 0;
1594 		compl.status = 0;
1595 
1596 		switch (cmd->opc) {
1597 		case NVME_OPC_DELETE_IO_SQ:
1598 			DPRINTF("%s command DELETE_IO_SQ", __func__);
1599 			nvme_opc_delete_io_sq(sc, cmd, &compl);
1600 			break;
1601 		case NVME_OPC_CREATE_IO_SQ:
1602 			DPRINTF("%s command CREATE_IO_SQ", __func__);
1603 			nvme_opc_create_io_sq(sc, cmd, &compl);
1604 			break;
1605 		case NVME_OPC_DELETE_IO_CQ:
1606 			DPRINTF("%s command DELETE_IO_CQ", __func__);
1607 			nvme_opc_delete_io_cq(sc, cmd, &compl);
1608 			break;
1609 		case NVME_OPC_CREATE_IO_CQ:
1610 			DPRINTF("%s command CREATE_IO_CQ", __func__);
1611 			nvme_opc_create_io_cq(sc, cmd, &compl);
1612 			break;
1613 		case NVME_OPC_GET_LOG_PAGE:
1614 			DPRINTF("%s command GET_LOG_PAGE", __func__);
1615 			nvme_opc_get_log_page(sc, cmd, &compl);
1616 			break;
1617 		case NVME_OPC_IDENTIFY:
1618 			DPRINTF("%s command IDENTIFY", __func__);
1619 			nvme_opc_identify(sc, cmd, &compl);
1620 			break;
1621 		case NVME_OPC_ABORT:
1622 			DPRINTF("%s command ABORT", __func__);
1623 			nvme_opc_abort(sc, cmd, &compl);
1624 			break;
1625 		case NVME_OPC_SET_FEATURES:
1626 			DPRINTF("%s command SET_FEATURES", __func__);
1627 			nvme_opc_set_features(sc, cmd, &compl);
1628 			break;
1629 		case NVME_OPC_GET_FEATURES:
1630 			DPRINTF("%s command GET_FEATURES", __func__);
1631 			nvme_opc_get_features(sc, cmd, &compl);
1632 			break;
1633 		case NVME_OPC_FIRMWARE_ACTIVATE:
1634 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1635 			pci_nvme_status_tc(&compl.status,
1636 			    NVME_SCT_COMMAND_SPECIFIC,
1637 			    NVME_SC_INVALID_FIRMWARE_SLOT);
1638 			break;
1639 		case NVME_OPC_ASYNC_EVENT_REQUEST:
1640 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1641 			nvme_opc_async_event_req(sc, cmd, &compl);
1642 			break;
1643 		case NVME_OPC_FORMAT_NVM:
1644 			DPRINTF("%s command FORMAT_NVM", __func__);
1645 			if ((sc->ctrldata.oacs &
1646 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1647 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1648 			}
1649 			compl.status = NVME_NO_STATUS;
1650 			nvme_opc_format_nvm(sc, cmd, &compl);
1651 			break;
1652 		default:
1653 			DPRINTF("0x%x command is not implemented",
1654 			    cmd->opc);
1655 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1656 		}
1657 		sqhead = (sqhead + 1) % sq->size;
1658 
1659 		if (NVME_COMPLETION_VALID(compl)) {
1660 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
1661 			    compl.cdw0,
1662 			    cmd->cid,
1663 			    0,		/* SQID */
1664 			    compl.status);
1665 		}
1666 	}
1667 
1668 	DPRINTF("setting sqhead %u", sqhead);
1669 	sq->head = sqhead;
1670 
1671 	if (cq->head != cq->tail)
1672 		pci_generate_msix(sc->nsc_pi, 0);
1673 
1674 	pthread_mutex_unlock(&sq->mtx);
1675 }
1676 
1677 /*
1678  * Update the Write and Read statistics reported in SMART data
1679  *
1680  * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up.
1681  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
1682  * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999.
1683  */
1684 static void
pci_nvme_stats_write_read_update(struct pci_nvme_softc * sc,uint8_t opc,size_t bytes,uint16_t status)1685 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1686     size_t bytes, uint16_t status)
1687 {
1688 
1689 	pthread_mutex_lock(&sc->mtx);
1690 	switch (opc) {
1691 	case NVME_OPC_WRITE:
1692 		sc->write_commands++;
1693 		if (status != NVME_SC_SUCCESS)
1694 			break;
1695 		sc->write_dunits_remainder += (bytes / 512);
1696 		while (sc->write_dunits_remainder >= 1000) {
1697 			sc->write_data_units++;
1698 			sc->write_dunits_remainder -= 1000;
1699 		}
1700 		break;
1701 	case NVME_OPC_READ:
1702 		sc->read_commands++;
1703 		if (status != NVME_SC_SUCCESS)
1704 			break;
1705 		sc->read_dunits_remainder += (bytes / 512);
1706 		while (sc->read_dunits_remainder >= 1000) {
1707 			sc->read_data_units++;
1708 			sc->read_dunits_remainder -= 1000;
1709 		}
1710 		break;
1711 	default:
1712 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1713 		break;
1714 	}
1715 	pthread_mutex_unlock(&sc->mtx);
1716 }
1717 
1718 /*
1719  * Check if the combination of Starting LBA (slba) and Number of Logical
1720  * Blocks (nlb) exceeds the range of the underlying storage.
1721  *
1722  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1723  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1724  * overflow.
1725  */
1726 static bool
pci_nvme_out_of_range(struct pci_nvme_blockstore * nvstore,uint64_t slba,uint32_t nlb)1727 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1728     uint32_t nlb)
1729 {
1730 	size_t	offset, bytes;
1731 
1732 	/* Overflow check of multiplying Starting LBA by the sector size */
1733 	if (slba >> (64 - nvstore->sectsz_bits))
1734 		return (true);
1735 
1736 	offset = slba << nvstore->sectsz_bits;
1737 	bytes = nlb << nvstore->sectsz_bits;
1738 
1739 	/* Overflow check of Number of Logical Blocks */
1740 	if ((nvstore->size - offset) < bytes)
1741 		return (true);
1742 
1743 	return (false);
1744 }
1745 
1746 static int
pci_nvme_append_iov_req(struct pci_nvme_softc * sc,struct pci_nvme_ioreq * req,uint64_t gpaddr,size_t size,int do_write,uint64_t lba)1747 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1748 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1749 {
1750 	int iovidx;
1751 
1752 	if (req == NULL)
1753 		return (-1);
1754 
1755 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
1756 		return (-1);
1757 	}
1758 
1759 	/* concatenate contig block-iovs to minimize number of iovs */
1760 	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1761 		iovidx = req->io_req.br_iovcnt - 1;
1762 
1763 		req->io_req.br_iov[iovidx].iov_base =
1764 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1765 				     req->prev_gpaddr, size);
1766 
1767 		req->prev_size += size;
1768 		req->io_req.br_resid += size;
1769 
1770 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1771 	} else {
1772 		iovidx = req->io_req.br_iovcnt;
1773 		if (iovidx == 0) {
1774 			req->io_req.br_offset = lba;
1775 			req->io_req.br_resid = 0;
1776 			req->io_req.br_param = req;
1777 		}
1778 
1779 		req->io_req.br_iov[iovidx].iov_base =
1780 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1781 				     gpaddr, size);
1782 
1783 		req->io_req.br_iov[iovidx].iov_len = size;
1784 
1785 		req->prev_gpaddr = gpaddr;
1786 		req->prev_size = size;
1787 		req->io_req.br_resid += size;
1788 
1789 		req->io_req.br_iovcnt++;
1790 	}
1791 
1792 	return (0);
1793 }
1794 
1795 static void
pci_nvme_set_completion(struct pci_nvme_softc * sc,struct nvme_submission_queue * sq,int sqid,uint16_t cid,uint32_t cdw0,uint16_t status)1796 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1797 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1798 	uint32_t cdw0, uint16_t status)
1799 {
1800 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1801 
1802 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1803 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1804 		 NVME_STATUS_GET_SC(status));
1805 
1806 	pci_nvme_cq_update(sc, cq,
1807 	    0,		/* CDW0 */
1808 	    cid,
1809 	    sqid,
1810 	    status);
1811 
1812 	if (cq->head != cq->tail) {
1813 		if (cq->intr_en & NVME_CQ_INTEN) {
1814 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1815 		} else {
1816 			DPRINTF("%s: CQ%u interrupt disabled",
1817 						__func__, sq->cqid);
1818 		}
1819 	}
1820 }
1821 
1822 static void
pci_nvme_release_ioreq(struct pci_nvme_softc * sc,struct pci_nvme_ioreq * req)1823 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1824 {
1825 	req->sc = NULL;
1826 	req->nvme_sq = NULL;
1827 	req->sqid = 0;
1828 
1829 	pthread_mutex_lock(&sc->mtx);
1830 
1831 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1832 	sc->pending_ios--;
1833 
1834 	/* when no more IO pending, can set to ready if device reset/enabled */
1835 	if (sc->pending_ios == 0 &&
1836 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1837 		sc->regs.csts |= NVME_CSTS_RDY;
1838 
1839 	pthread_mutex_unlock(&sc->mtx);
1840 
1841 	sem_post(&sc->iosemlock);
1842 }
1843 
1844 static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc * sc)1845 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1846 {
1847 	struct pci_nvme_ioreq *req = NULL;
1848 
1849 	sem_wait(&sc->iosemlock);
1850 	pthread_mutex_lock(&sc->mtx);
1851 
1852 	req = STAILQ_FIRST(&sc->ioreqs_free);
1853 	assert(req != NULL);
1854 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1855 
1856 	req->sc = sc;
1857 
1858 	sc->pending_ios++;
1859 
1860 	pthread_mutex_unlock(&sc->mtx);
1861 
1862 	req->io_req.br_iovcnt = 0;
1863 	req->io_req.br_offset = 0;
1864 	req->io_req.br_resid = 0;
1865 	req->io_req.br_param = req;
1866 	req->prev_gpaddr = 0;
1867 	req->prev_size = 0;
1868 
1869 	return req;
1870 }
1871 
1872 static void
pci_nvme_io_done(struct blockif_req * br,int err)1873 pci_nvme_io_done(struct blockif_req *br, int err)
1874 {
1875 	struct pci_nvme_ioreq *req = br->br_param;
1876 	struct nvme_submission_queue *sq = req->nvme_sq;
1877 	uint16_t code, status;
1878 
1879 #ifndef __FreeBSD__
1880 	status = 0;
1881 #endif
1882 
1883 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
1884 
1885 	/* TODO return correct error */
1886 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1887 	pci_nvme_status_genc(&status, code);
1888 
1889 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1890 	pci_nvme_stats_write_read_update(req->sc, req->opc,
1891 	    req->bytes, status);
1892 	pci_nvme_release_ioreq(req->sc, req);
1893 }
1894 
1895 /*
1896  * Implements the Flush command. The specification states:
1897  *    If a volatile write cache is not present, Flush commands complete
1898  *    successfully and have no effect
1899  * in the description of the Volatile Write Cache (VWC) field of the Identify
1900  * Controller data. Therefore, set status to Success if the command is
1901  * not supported (i.e. RAM or as indicated by the blockif).
1902  */
1903 static bool
nvme_opc_flush(struct pci_nvme_softc * sc,struct nvme_command * cmd,struct pci_nvme_blockstore * nvstore,struct pci_nvme_ioreq * req,uint16_t * status)1904 nvme_opc_flush(struct pci_nvme_softc *sc,
1905     struct nvme_command *cmd,
1906     struct pci_nvme_blockstore *nvstore,
1907     struct pci_nvme_ioreq *req,
1908     uint16_t *status)
1909 {
1910 	bool pending = false;
1911 
1912 	if (nvstore->type == NVME_STOR_RAM) {
1913 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1914 	} else {
1915 		int err;
1916 
1917 		req->io_req.br_callback = pci_nvme_io_done;
1918 
1919 		err = blockif_flush(nvstore->ctx, &req->io_req);
1920 		switch (err) {
1921 		case 0:
1922 			pending = true;
1923 			break;
1924 		case EOPNOTSUPP:
1925 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1926 			break;
1927 		default:
1928 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1929 		}
1930 	}
1931 
1932 	return (pending);
1933 }
1934 
1935 static uint16_t
nvme_write_read_ram(struct pci_nvme_softc * sc,struct pci_nvme_blockstore * nvstore,uint64_t prp1,uint64_t prp2,size_t offset,uint64_t bytes,bool is_write)1936 nvme_write_read_ram(struct pci_nvme_softc *sc,
1937     struct pci_nvme_blockstore *nvstore,
1938     uint64_t prp1, uint64_t prp2,
1939     size_t offset, uint64_t bytes,
1940     bool is_write)
1941 {
1942 	uint8_t *buf = nvstore->ctx;
1943 	enum nvme_copy_dir dir;
1944 	uint16_t status;
1945 
1946 #ifndef __FreeBSD__
1947 	status = 0;
1948 #endif
1949 
1950 	if (is_write)
1951 		dir = NVME_COPY_TO_PRP;
1952 	else
1953 		dir = NVME_COPY_FROM_PRP;
1954 
1955 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
1956 	    buf + offset, bytes, dir))
1957 		pci_nvme_status_genc(&status,
1958 		    NVME_SC_DATA_TRANSFER_ERROR);
1959 	else
1960 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1961 
1962 	return (status);
1963 }
1964 
1965 static uint16_t
nvme_write_read_blockif(struct pci_nvme_softc * sc,struct pci_nvme_blockstore * nvstore,struct pci_nvme_ioreq * req,uint64_t prp1,uint64_t prp2,size_t offset,uint64_t bytes,bool is_write)1966 nvme_write_read_blockif(struct pci_nvme_softc *sc,
1967     struct pci_nvme_blockstore *nvstore,
1968     struct pci_nvme_ioreq *req,
1969     uint64_t prp1, uint64_t prp2,
1970     size_t offset, uint64_t bytes,
1971     bool is_write)
1972 {
1973 	uint64_t size;
1974 	int err;
1975 	uint16_t status = NVME_NO_STATUS;
1976 
1977 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
1978 	if (pci_nvme_append_iov_req(sc, req, prp1,
1979 	    size, is_write, offset)) {
1980 		pci_nvme_status_genc(&status,
1981 		    NVME_SC_DATA_TRANSFER_ERROR);
1982 		goto out;
1983 	}
1984 
1985 	offset += size;
1986 	bytes  -= size;
1987 
1988 	if (bytes == 0) {
1989 		;
1990 	} else if (bytes <= PAGE_SIZE) {
1991 		size = bytes;
1992 		if (pci_nvme_append_iov_req(sc, req, prp2,
1993 		    size, is_write, offset)) {
1994 			pci_nvme_status_genc(&status,
1995 			    NVME_SC_DATA_TRANSFER_ERROR);
1996 			goto out;
1997 		}
1998 	} else {
1999 		void *vmctx = sc->nsc_pi->pi_vmctx;
2000 		uint64_t *prp_list = &prp2;
2001 		uint64_t *last = prp_list;
2002 
2003 		/* PRP2 is pointer to a physical region page list */
2004 		while (bytes) {
2005 			/* Last entry in list points to the next list */
2006 			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2007 				uint64_t prp = *prp_list;
2008 
2009 				prp_list = paddr_guest2host(vmctx, prp,
2010 				    PAGE_SIZE - (prp % PAGE_SIZE));
2011 				last = prp_list + (NVME_PRP2_ITEMS - 1);
2012 			}
2013 
2014 			size = MIN(bytes, PAGE_SIZE);
2015 
2016 			if (pci_nvme_append_iov_req(sc, req, *prp_list,
2017 			    size, is_write, offset)) {
2018 				pci_nvme_status_genc(&status,
2019 				    NVME_SC_DATA_TRANSFER_ERROR);
2020 				goto out;
2021 			}
2022 
2023 			offset += size;
2024 			bytes  -= size;
2025 
2026 			prp_list++;
2027 		}
2028 	}
2029 	req->io_req.br_callback = pci_nvme_io_done;
2030 	if (is_write)
2031 		err = blockif_write(nvstore->ctx, &req->io_req);
2032 	else
2033 		err = blockif_read(nvstore->ctx, &req->io_req);
2034 
2035 	if (err)
2036 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2037 out:
2038 	return (status);
2039 }
2040 
2041 static bool
nvme_opc_write_read(struct pci_nvme_softc * sc,struct nvme_command * cmd,struct pci_nvme_blockstore * nvstore,struct pci_nvme_ioreq * req,uint16_t * status)2042 nvme_opc_write_read(struct pci_nvme_softc *sc,
2043     struct nvme_command *cmd,
2044     struct pci_nvme_blockstore *nvstore,
2045     struct pci_nvme_ioreq *req,
2046     uint16_t *status)
2047 {
2048 	uint64_t lba, nblocks, bytes;
2049 	size_t offset;
2050 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2051 	bool pending = false;
2052 
2053 #ifndef __FreeBSD__
2054 	bytes = 0;
2055 #endif
2056 
2057 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2058 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2059 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2060 		WPRINTF("%s command would exceed LBA range", __func__);
2061 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2062 		goto out;
2063 	}
2064 
2065 	bytes  = nblocks << nvstore->sectsz_bits;
2066 	if (bytes > NVME_MAX_DATA_SIZE) {
2067 		WPRINTF("%s command would exceed MDTS", __func__);
2068 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2069 		goto out;
2070 	}
2071 
2072 	offset = lba << nvstore->sectsz_bits;
2073 
2074 	req->bytes = bytes;
2075 	req->io_req.br_offset = lba;
2076 
2077 	/* PRP bits 1:0 must be zero */
2078 	cmd->prp1 &= ~0x3UL;
2079 	cmd->prp2 &= ~0x3UL;
2080 
2081 	if (nvstore->type == NVME_STOR_RAM) {
2082 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2083 		    cmd->prp2, offset, bytes, is_write);
2084 	} else {
2085 		*status = nvme_write_read_blockif(sc, nvstore, req,
2086 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2087 
2088 		if (*status == NVME_NO_STATUS)
2089 			pending = true;
2090 	}
2091 out:
2092 	if (!pending)
2093 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2094 
2095 	return (pending);
2096 }
2097 
2098 static void
pci_nvme_dealloc_sm(struct blockif_req * br,int err)2099 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2100 {
2101 	struct pci_nvme_ioreq *req = br->br_param;
2102 	struct pci_nvme_softc *sc = req->sc;
2103 	bool done = true;
2104 	uint16_t status;
2105 
2106 #ifndef __FreeBSD__
2107 	status = 0;
2108 #endif
2109 
2110 	if (err) {
2111 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2112 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2113 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2114 	} else {
2115 		struct iovec *iov = req->io_req.br_iov;
2116 
2117 		req->prev_gpaddr++;
2118 		iov += req->prev_gpaddr;
2119 
2120 		/* The iov_* values already include the sector size */
2121 		req->io_req.br_offset = (off_t)iov->iov_base;
2122 		req->io_req.br_resid = iov->iov_len;
2123 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2124 			pci_nvme_status_genc(&status,
2125 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2126 		} else
2127 			done = false;
2128 	}
2129 
2130 	if (done) {
2131 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2132 		    req->cid, 0, status);
2133 		pci_nvme_release_ioreq(sc, req);
2134 	}
2135 }
2136 
2137 static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc * sc,struct nvme_command * cmd,struct pci_nvme_blockstore * nvstore,struct pci_nvme_ioreq * req,uint16_t * status)2138 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2139     struct nvme_command *cmd,
2140     struct pci_nvme_blockstore *nvstore,
2141     struct pci_nvme_ioreq *req,
2142     uint16_t *status)
2143 {
2144 	struct nvme_dsm_range *range;
2145 	uint32_t nr, r, non_zero, dr;
2146 	int err;
2147 	bool pending = false;
2148 
2149 #ifndef __FreeBSD__
2150 	range = NULL;
2151 #endif
2152 
2153 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2154 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2155 		goto out;
2156 	}
2157 
2158 	nr = cmd->cdw10 & 0xff;
2159 
2160 	/* copy locally because a range entry could straddle PRPs */
2161 	range = calloc(1, NVME_MAX_DSM_TRIM);
2162 	if (range == NULL) {
2163 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2164 		goto out;
2165 	}
2166 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2167 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2168 
2169 	/* Check for invalid ranges and the number of non-zero lengths */
2170 	non_zero = 0;
2171 	for (r = 0; r <= nr; r++) {
2172 		if (pci_nvme_out_of_range(nvstore,
2173 		    range[r].starting_lba, range[r].length)) {
2174 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2175 			goto out;
2176 		}
2177 		if (range[r].length != 0)
2178 			non_zero++;
2179 	}
2180 
2181 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2182 		size_t offset, bytes;
2183 		int sectsz_bits = sc->nvstore.sectsz_bits;
2184 
2185 		/*
2186 		 * DSM calls are advisory only, and compliant controllers
2187 		 * may choose to take no actions (i.e. return Success).
2188 		 */
2189 		if (!nvstore->deallocate) {
2190 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2191 			goto out;
2192 		}
2193 
2194 		/* If all ranges have a zero length, return Success */
2195 		if (non_zero == 0) {
2196 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2197 			goto out;
2198 		}
2199 
2200 		if (req == NULL) {
2201 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2202 			goto out;
2203 		}
2204 
2205 		offset = range[0].starting_lba << sectsz_bits;
2206 		bytes = range[0].length << sectsz_bits;
2207 
2208 		/*
2209 		 * If the request is for more than a single range, store
2210 		 * the ranges in the br_iov. Optimize for the common case
2211 		 * of a single range.
2212 		 *
2213 		 * Note that NVMe Number of Ranges is a zero based value
2214 		 */
2215 		req->io_req.br_iovcnt = 0;
2216 		req->io_req.br_offset = offset;
2217 		req->io_req.br_resid = bytes;
2218 
2219 		if (nr == 0) {
2220 			req->io_req.br_callback = pci_nvme_io_done;
2221 		} else {
2222 			struct iovec *iov = req->io_req.br_iov;
2223 
2224 			for (r = 0, dr = 0; r <= nr; r++) {
2225 				offset = range[r].starting_lba << sectsz_bits;
2226 				bytes = range[r].length << sectsz_bits;
2227 				if (bytes == 0)
2228 					continue;
2229 
2230 				if ((nvstore->size - offset) < bytes) {
2231 					pci_nvme_status_genc(status,
2232 					    NVME_SC_LBA_OUT_OF_RANGE);
2233 					goto out;
2234 				}
2235 				iov[dr].iov_base = (void *)offset;
2236 				iov[dr].iov_len = bytes;
2237 				dr++;
2238 			}
2239 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2240 
2241 			/*
2242 			 * Use prev_gpaddr to track the current entry and
2243 			 * prev_size to track the number of entries
2244 			 */
2245 			req->prev_gpaddr = 0;
2246 			req->prev_size = dr;
2247 		}
2248 
2249 		err = blockif_delete(nvstore->ctx, &req->io_req);
2250 		if (err)
2251 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2252 		else
2253 			pending = true;
2254 	}
2255 out:
2256 	free(range);
2257 	return (pending);
2258 }
2259 
2260 static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc * sc,uint16_t idx)2261 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2262 {
2263 	struct nvme_submission_queue *sq;
2264 	uint16_t status;
2265 	uint16_t sqhead;
2266 
2267 #ifndef __FreeBSD__
2268 	status = 0;
2269 #endif
2270 
2271 	/* handle all submissions up to sq->tail index */
2272 	sq = &sc->submit_queues[idx];
2273 
2274 	pthread_mutex_lock(&sq->mtx);
2275 
2276 	sqhead = sq->head;
2277 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2278 	         idx, sqhead, sq->tail, sq->qbase);
2279 
2280 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2281 		struct nvme_command *cmd;
2282 		struct pci_nvme_ioreq *req;
2283 		uint32_t nsid;
2284 		bool pending;
2285 
2286 		pending = false;
2287 		req = NULL;
2288 		status = 0;
2289 
2290 		cmd = &sq->qbase[sqhead];
2291 		sqhead = (sqhead + 1) % sq->size;
2292 
2293 		nsid = le32toh(cmd->nsid);
2294 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2295 			pci_nvme_status_genc(&status,
2296 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2297 			status |=
2298 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2299 			goto complete;
2300  		}
2301 
2302 		req = pci_nvme_get_ioreq(sc);
2303 		if (req == NULL) {
2304 			pci_nvme_status_genc(&status,
2305 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2306 			WPRINTF("%s: unable to allocate IO req", __func__);
2307 			goto complete;
2308 		}
2309 		req->nvme_sq = sq;
2310 		req->sqid = idx;
2311 		req->opc = cmd->opc;
2312 		req->cid = cmd->cid;
2313 		req->nsid = cmd->nsid;
2314 
2315 		switch (cmd->opc) {
2316 		case NVME_OPC_FLUSH:
2317 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2318 			    req, &status);
2319  			break;
2320 		case NVME_OPC_WRITE:
2321 		case NVME_OPC_READ:
2322 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2323 			    req, &status);
2324 			break;
2325 		case NVME_OPC_WRITE_ZEROES:
2326 			/* TODO: write zeroes
2327 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2328 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2329 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2330 			break;
2331 		case NVME_OPC_DATASET_MANAGEMENT:
2332  			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2333 			    req, &status);
2334 			break;
2335  		default:
2336  			WPRINTF("%s unhandled io command 0x%x",
2337 			    __func__, cmd->opc);
2338 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2339 		}
2340 complete:
2341 		if (!pending) {
2342 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2343 			    status);
2344 			if (req != NULL)
2345 				pci_nvme_release_ioreq(sc, req);
2346 		}
2347 	}
2348 
2349 	sq->head = sqhead;
2350 
2351 	pthread_mutex_unlock(&sq->mtx);
2352 }
2353 
2354 static void
pci_nvme_handle_doorbell(struct vmctx * ctx,struct pci_nvme_softc * sc,uint64_t idx,int is_sq,uint64_t value)2355 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2356 	uint64_t idx, int is_sq, uint64_t value)
2357 {
2358 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2359 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2360 
2361 	if (is_sq) {
2362 		if (idx > sc->num_squeues) {
2363 			WPRINTF("%s queue index %lu overflow from "
2364 			         "guest (max %u)",
2365 			         __func__, idx, sc->num_squeues);
2366 			return;
2367 		}
2368 
2369 		atomic_store_short(&sc->submit_queues[idx].tail,
2370 		                   (uint16_t)value);
2371 
2372 		if (idx == 0) {
2373 			pci_nvme_handle_admin_cmd(sc, value);
2374 		} else {
2375 			/* submission queue; handle new entries in SQ */
2376 			if (idx > sc->num_squeues) {
2377 				WPRINTF("%s SQ index %lu overflow from "
2378 				         "guest (max %u)",
2379 				         __func__, idx, sc->num_squeues);
2380 				return;
2381 			}
2382 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2383 		}
2384 	} else {
2385 		if (idx > sc->num_cqueues) {
2386 			WPRINTF("%s queue index %lu overflow from "
2387 			         "guest (max %u)",
2388 			         __func__, idx, sc->num_cqueues);
2389 			return;
2390 		}
2391 
2392 		atomic_store_short(&sc->compl_queues[idx].head,
2393 				(uint16_t)value);
2394 	}
2395 }
2396 
2397 static void
pci_nvme_bar0_reg_dumps(const char * func,uint64_t offset,int iswrite)2398 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2399 {
2400 	const char *s = iswrite ? "WRITE" : "READ";
2401 
2402 	switch (offset) {
2403 	case NVME_CR_CAP_LOW:
2404 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2405 		break;
2406 	case NVME_CR_CAP_HI:
2407 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2408 		break;
2409 	case NVME_CR_VS:
2410 		DPRINTF("%s %s NVME_CR_VS", func, s);
2411 		break;
2412 	case NVME_CR_INTMS:
2413 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2414 		break;
2415 	case NVME_CR_INTMC:
2416 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2417 		break;
2418 	case NVME_CR_CC:
2419 		DPRINTF("%s %s NVME_CR_CC", func, s);
2420 		break;
2421 	case NVME_CR_CSTS:
2422 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2423 		break;
2424 	case NVME_CR_NSSR:
2425 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2426 		break;
2427 	case NVME_CR_AQA:
2428 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2429 		break;
2430 	case NVME_CR_ASQ_LOW:
2431 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2432 		break;
2433 	case NVME_CR_ASQ_HI:
2434 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2435 		break;
2436 	case NVME_CR_ACQ_LOW:
2437 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2438 		break;
2439 	case NVME_CR_ACQ_HI:
2440 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2441 		break;
2442 	default:
2443 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2444 	}
2445 
2446 }
2447 
2448 static void
pci_nvme_write_bar_0(struct vmctx * ctx,struct pci_nvme_softc * sc,uint64_t offset,int size,uint64_t value)2449 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2450 	uint64_t offset, int size, uint64_t value)
2451 {
2452 	uint32_t ccreg;
2453 
2454 	if (offset >= NVME_DOORBELL_OFFSET) {
2455 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2456 		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
2457 		int is_sq = (belloffset % 8) < 4;
2458 
2459 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2460 			WPRINTF("guest attempted an overflow write offset "
2461 			         "0x%lx, val 0x%lx in %s",
2462 			         offset, value, __func__);
2463 			return;
2464 		}
2465 
2466 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2467 		return;
2468 	}
2469 
2470 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2471 	        offset, size, value);
2472 
2473 	if (size != 4) {
2474 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2475 		         "val 0x%lx) to bar0 in %s",
2476 		         size, offset, value, __func__);
2477 		/* TODO: shutdown device */
2478 		return;
2479 	}
2480 
2481 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2482 
2483 	pthread_mutex_lock(&sc->mtx);
2484 
2485 	switch (offset) {
2486 	case NVME_CR_CAP_LOW:
2487 	case NVME_CR_CAP_HI:
2488 		/* readonly */
2489 		break;
2490 	case NVME_CR_VS:
2491 		/* readonly */
2492 		break;
2493 	case NVME_CR_INTMS:
2494 		/* MSI-X, so ignore */
2495 		break;
2496 	case NVME_CR_INTMC:
2497 		/* MSI-X, so ignore */
2498 		break;
2499 	case NVME_CR_CC:
2500 		ccreg = (uint32_t)value;
2501 
2502 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2503 		         "iocqes %u",
2504 		        __func__,
2505 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2506 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2507 			 NVME_CC_GET_IOCQES(ccreg));
2508 
2509 		if (NVME_CC_GET_SHN(ccreg)) {
2510 			/* perform shutdown - flush out data to backend */
2511 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2512 			    NVME_CSTS_REG_SHST_SHIFT);
2513 			sc->regs.csts |= NVME_SHST_COMPLETE <<
2514 			    NVME_CSTS_REG_SHST_SHIFT;
2515 		}
2516 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2517 			if (NVME_CC_GET_EN(ccreg) == 0)
2518 				/* transition 1-> causes controller reset */
2519 				pci_nvme_reset_locked(sc);
2520 			else
2521 				pci_nvme_init_controller(ctx, sc);
2522 		}
2523 
2524 		/* Insert the iocqes, iosqes and en bits from the write */
2525 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2526 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2527 		if (NVME_CC_GET_EN(ccreg) == 0) {
2528 			/* Insert the ams, mps and css bit fields */
2529 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2530 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2531 			sc->regs.csts &= ~NVME_CSTS_RDY;
2532 		} else if (sc->pending_ios == 0) {
2533 			sc->regs.csts |= NVME_CSTS_RDY;
2534 		}
2535 		break;
2536 	case NVME_CR_CSTS:
2537 		break;
2538 	case NVME_CR_NSSR:
2539 		/* ignore writes; don't support subsystem reset */
2540 		break;
2541 	case NVME_CR_AQA:
2542 		sc->regs.aqa = (uint32_t)value;
2543 		break;
2544 	case NVME_CR_ASQ_LOW:
2545 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2546 		               (0xFFFFF000 & value);
2547 		break;
2548 	case NVME_CR_ASQ_HI:
2549 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2550 		               (value << 32);
2551 		break;
2552 	case NVME_CR_ACQ_LOW:
2553 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2554 		               (0xFFFFF000 & value);
2555 		break;
2556 	case NVME_CR_ACQ_HI:
2557 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2558 		               (value << 32);
2559 		break;
2560 	default:
2561 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2562 		         __func__, offset, value, size);
2563 	}
2564 	pthread_mutex_unlock(&sc->mtx);
2565 }
2566 
2567 static void
pci_nvme_write(struct vmctx * ctx,int vcpu,struct pci_devinst * pi,int baridx,uint64_t offset,int size,uint64_t value)2568 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2569                 int baridx, uint64_t offset, int size, uint64_t value)
2570 {
2571 	struct pci_nvme_softc* sc = pi->pi_arg;
2572 
2573 	if (baridx == pci_msix_table_bar(pi) ||
2574 	    baridx == pci_msix_pba_bar(pi)) {
2575 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2576 		         " value 0x%lx", baridx, offset, size, value);
2577 
2578 		pci_emul_msix_twrite(pi, offset, size, value);
2579 		return;
2580 	}
2581 
2582 	switch (baridx) {
2583 	case 0:
2584 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2585 		break;
2586 
2587 	default:
2588 		DPRINTF("%s unknown baridx %d, val 0x%lx",
2589 		         __func__, baridx, value);
2590 	}
2591 }
2592 
pci_nvme_read_bar_0(struct pci_nvme_softc * sc,uint64_t offset,int size)2593 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2594 	uint64_t offset, int size)
2595 {
2596 	uint64_t value;
2597 
2598 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2599 
2600 	if (offset < NVME_DOORBELL_OFFSET) {
2601 		void *p = &(sc->regs);
2602 		pthread_mutex_lock(&sc->mtx);
2603 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
2604 		pthread_mutex_unlock(&sc->mtx);
2605 	} else {
2606 		value = 0;
2607                 WPRINTF("pci_nvme: read invalid offset %ld", offset);
2608 	}
2609 
2610 	switch (size) {
2611 	case 1:
2612 		value &= 0xFF;
2613 		break;
2614 	case 2:
2615 		value &= 0xFFFF;
2616 		break;
2617 	case 4:
2618 		value &= 0xFFFFFFFF;
2619 		break;
2620 	}
2621 
2622 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2623 	         offset, size, (uint32_t)value);
2624 
2625 	return (value);
2626 }
2627 
2628 
2629 
2630 static uint64_t
pci_nvme_read(struct vmctx * ctx,int vcpu,struct pci_devinst * pi,int baridx,uint64_t offset,int size)2631 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2632     uint64_t offset, int size)
2633 {
2634 	struct pci_nvme_softc* sc = pi->pi_arg;
2635 
2636 	if (baridx == pci_msix_table_bar(pi) ||
2637 	    baridx == pci_msix_pba_bar(pi)) {
2638 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2639 		        baridx, offset, size);
2640 
2641 		return pci_emul_msix_tread(pi, offset, size);
2642 	}
2643 
2644 	switch (baridx) {
2645 	case 0:
2646        		return pci_nvme_read_bar_0(sc, offset, size);
2647 
2648 	default:
2649 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2650 	}
2651 
2652 	return (0);
2653 }
2654 
2655 static int
pci_nvme_parse_config(struct pci_nvme_softc * sc,nvlist_t * nvl)2656 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
2657 {
2658 	char bident[sizeof("XX:X:X")];
2659 	const char *value;
2660 	uint32_t sectsz;
2661 
2662 	sc->max_queues = NVME_QUEUES;
2663 	sc->max_qentries = NVME_MAX_QENTRIES;
2664 	sc->ioslots = NVME_IOSLOTS;
2665 	sc->num_squeues = sc->max_queues;
2666 	sc->num_cqueues = sc->max_queues;
2667 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2668 	sectsz = 0;
2669 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2670 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2671 
2672 	value = get_config_value_node(nvl, "maxq");
2673 	if (value != NULL)
2674 		sc->max_queues = atoi(value);
2675 	value = get_config_value_node(nvl, "qsz");
2676 	if (value != NULL) {
2677 		sc->max_qentries = atoi(value);
2678 		if (sc->max_qentries <= 0) {
2679 			EPRINTLN("nvme: Invalid qsz option %d",
2680 			    sc->max_qentries);
2681 			return (-1);
2682 		}
2683 	}
2684 	value = get_config_value_node(nvl, "ioslots");
2685 	if (value != NULL) {
2686 		sc->ioslots = atoi(value);
2687 		if (sc->ioslots <= 0) {
2688 			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
2689 			return (-1);
2690 		}
2691 	}
2692 	value = get_config_value_node(nvl, "sectsz");
2693 	if (value != NULL)
2694 		sectsz = atoi(value);
2695 	value = get_config_value_node(nvl, "ser");
2696 	if (value != NULL) {
2697 		/*
2698 		 * This field indicates the Product Serial Number in
2699 		 * 7-bit ASCII, unused bytes should be space characters.
2700 		 * Ref: NVMe v1.3c.
2701 		 */
2702 		cpywithpad((char *)sc->ctrldata.sn,
2703 		    sizeof(sc->ctrldata.sn), value, ' ');
2704 	}
2705 	value = get_config_value_node(nvl, "eui64");
2706 	if (value != NULL)
2707 		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
2708 	value = get_config_value_node(nvl, "dsm");
2709 	if (value != NULL) {
2710 		if (strcmp(value, "auto") == 0)
2711 			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2712 		else if (strcmp(value, "enable") == 0)
2713 			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2714 		else if (strcmp(value, "disable") == 0)
2715 			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2716 	}
2717 
2718 	value = get_config_value_node(nvl, "ram");
2719 	if (value != NULL) {
2720 		uint64_t sz = strtoull(value, NULL, 10);
2721 
2722 		sc->nvstore.type = NVME_STOR_RAM;
2723 		sc->nvstore.size = sz * 1024 * 1024;
2724 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2725 		sc->nvstore.sectsz = 4096;
2726 		sc->nvstore.sectsz_bits = 12;
2727 		if (sc->nvstore.ctx == NULL) {
2728 			EPRINTLN("nvme: Unable to allocate RAM");
2729 			return (-1);
2730 		}
2731 	} else {
2732 		snprintf(bident, sizeof(bident), "%d:%d",
2733 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2734 		sc->nvstore.ctx = blockif_open(nvl, bident);
2735 		if (sc->nvstore.ctx == NULL) {
2736 			EPRINTLN("nvme: Could not open backing file: %s",
2737 			    strerror(errno));
2738 			return (-1);
2739 		}
2740 		sc->nvstore.type = NVME_STOR_BLOCKIF;
2741 		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2742 	}
2743 
2744 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2745 		sc->nvstore.sectsz = sectsz;
2746 	else if (sc->nvstore.type != NVME_STOR_RAM)
2747 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2748 	for (sc->nvstore.sectsz_bits = 9;
2749 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2750 	     sc->nvstore.sectsz_bits++);
2751 
2752 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2753 		sc->max_queues = NVME_QUEUES;
2754 
2755 	return (0);
2756 }
2757 
2758 static int
pci_nvme_init(struct vmctx * ctx,struct pci_devinst * pi,nvlist_t * nvl)2759 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
2760 {
2761 	struct pci_nvme_softc *sc;
2762 	uint32_t pci_membar_sz;
2763 	int	error;
2764 
2765 	error = 0;
2766 
2767 	sc = calloc(1, sizeof(struct pci_nvme_softc));
2768 	pi->pi_arg = sc;
2769 	sc->nsc_pi = pi;
2770 
2771 	error = pci_nvme_parse_config(sc, nvl);
2772 	if (error < 0)
2773 		goto done;
2774 	else
2775 		error = 0;
2776 
2777 	STAILQ_INIT(&sc->ioreqs_free);
2778 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2779 	for (int i = 0; i < sc->ioslots; i++) {
2780 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2781 	}
2782 
2783 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2784 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2785 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2786 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2787 	pci_set_cfgdata8(pi, PCIR_PROGIF,
2788 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2789 
2790 	/*
2791 	 * Allocate size of NVMe registers + doorbell space for all queues.
2792 	 *
2793 	 * The specification requires a minimum memory I/O window size of 16K.
2794 	 * The Windows driver will refuse to start a device with a smaller
2795 	 * window.
2796 	 */
2797 	pci_membar_sz = sizeof(struct nvme_registers) +
2798 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
2799 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2800 
2801 	DPRINTF("nvme membar size: %u", pci_membar_sz);
2802 
2803 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2804 	if (error) {
2805 		WPRINTF("%s pci alloc mem bar failed", __func__);
2806 		goto done;
2807 	}
2808 
2809 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2810 	if (error) {
2811 		WPRINTF("%s pci add msixcap failed", __func__);
2812 		goto done;
2813 	}
2814 
2815 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2816 	if (error) {
2817 		WPRINTF("%s pci add Express capability failed", __func__);
2818 		goto done;
2819 	}
2820 
2821 	pthread_mutex_init(&sc->mtx, NULL);
2822 	sem_init(&sc->iosemlock, 0, sc->ioslots);
2823 
2824 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2825 	/*
2826 	 * Controller data depends on Namespace data so initialize Namespace
2827 	 * data first.
2828 	 */
2829 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2830 	pci_nvme_init_ctrldata(sc);
2831 	pci_nvme_init_logpages(sc);
2832 	pci_nvme_init_features(sc);
2833 
2834 	pci_nvme_aer_init(sc);
2835 
2836 	pci_nvme_reset(sc);
2837 
2838 	pci_lintr_request(pi);
2839 
2840 done:
2841 	return (error);
2842 }
2843 
2844 static int
pci_nvme_legacy_config(nvlist_t * nvl,const char * opts)2845 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
2846 {
2847 	char *cp, *ram;
2848 
2849 	if (opts == NULL)
2850 		return (0);
2851 
2852 	if (strncmp(opts, "ram=", 4) == 0) {
2853 		cp = strchr(opts, ',');
2854 		if (cp == NULL) {
2855 			set_config_value_node(nvl, "ram", opts + 4);
2856 			return (0);
2857 		}
2858 		ram = strndup(opts + 4, cp - opts - 4);
2859 		set_config_value_node(nvl, "ram", ram);
2860 		free(ram);
2861 		return (pci_parse_legacy_config(nvl, cp + 1));
2862 	} else
2863 		return (blockif_legacy_config(nvl, opts));
2864 }
2865 
2866 struct pci_devemu pci_de_nvme = {
2867 	.pe_emu =	"nvme",
2868 	.pe_init =	pci_nvme_init,
2869 	.pe_legacy_config = pci_nvme_legacy_config,
2870 	.pe_barwrite =	pci_nvme_write,
2871 	.pe_barread =	pci_nvme_read
2872 };
2873 PCI_EMUL_SET(pci_de_nvme);
2874