1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * Copyright (c) 2020 Chuck Tuffli 7 * 8 * Function crc16 Copyright (c) 2017, Fedor Uporov 9 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 /* 34 * bhyve PCIe-NVMe device emulation. 35 * 36 * options: 37 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt> 38 * 39 * accepted devpath: 40 * /dev/blockdev 41 * /path/to/image 42 * ram=size_in_MiB 43 * 44 * maxq = max number of queues 45 * qsz = max elements in each queue 46 * ioslots = max number of concurrent io requests 47 * sectsz = sector size (defaults to blockif sector size) 48 * ser = serial number (20-chars max) 49 * eui64 = IEEE Extended Unique Identifier (8 byte value) 50 * dsm = DataSet Management support. Option is one of auto, enable,disable 51 * 52 */ 53 54 /* TODO: 55 - create async event for smart and log 56 - intr coalesce 57 */ 58 59 #include <sys/cdefs.h> 60 __FBSDID("$FreeBSD$"); 61 62 #include <sys/errno.h> 63 #include <sys/types.h> 64 #include <net/ieee_oui.h> 65 #ifndef __FreeBSD__ 66 #include <endian.h> 67 #endif 68 69 #include <assert.h> 70 #include <pthread.h> 71 #include <semaphore.h> 72 #include <stdbool.h> 73 #include <stddef.h> 74 #include <stdint.h> 75 #include <stdio.h> 76 #include <stdlib.h> 77 #include <string.h> 78 79 #include <machine/atomic.h> 80 #include <machine/vmm.h> 81 #include <vmmapi.h> 82 83 #include <dev/nvme/nvme.h> 84 85 #include "bhyverun.h" 86 #include "block_if.h" 87 #include "config.h" 88 #include "debug.h" 89 #include "pci_emul.h" 90 91 92 static int nvme_debug = 0; 93 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) 94 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args) 95 96 /* defaults; can be overridden */ 97 #define NVME_MSIX_BAR 4 98 99 #define NVME_IOSLOTS 8 100 101 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 102 #define NVME_MMIO_SPACE_MIN (1 << 14) 103 104 #define NVME_QUEUES 16 105 #define NVME_MAX_QENTRIES 2048 106 /* Memory Page size Minimum reported in CAP register */ 107 #define NVME_MPSMIN 0 108 /* MPSMIN converted to bytes */ 109 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) 110 111 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 112 #define NVME_MDTS 9 113 /* Note the + 1 allows for the initial descriptor to not be page aligned */ 114 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) 115 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) 116 117 /* This is a synthetic status code to indicate there is no status */ 118 #define NVME_NO_STATUS 0xffff 119 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 120 121 /* helpers */ 122 123 /* Convert a zero-based value into a one-based value */ 124 #define ONE_BASED(zero) ((zero) + 1) 125 /* Convert a one-based value into a zero-based value */ 126 #define ZERO_BASED(one) ((one) - 1) 127 128 /* Encode number of SQ's and CQ's for Set/Get Features */ 129 #define NVME_FEATURE_NUM_QUEUES(sc) \ 130 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 131 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; 132 133 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 134 135 enum nvme_controller_register_offsets { 136 NVME_CR_CAP_LOW = 0x00, 137 NVME_CR_CAP_HI = 0x04, 138 NVME_CR_VS = 0x08, 139 NVME_CR_INTMS = 0x0c, 140 NVME_CR_INTMC = 0x10, 141 NVME_CR_CC = 0x14, 142 NVME_CR_CSTS = 0x1c, 143 NVME_CR_NSSR = 0x20, 144 NVME_CR_AQA = 0x24, 145 NVME_CR_ASQ_LOW = 0x28, 146 NVME_CR_ASQ_HI = 0x2c, 147 NVME_CR_ACQ_LOW = 0x30, 148 NVME_CR_ACQ_HI = 0x34, 149 }; 150 151 enum nvme_cmd_cdw11 { 152 NVME_CMD_CDW11_PC = 0x0001, 153 NVME_CMD_CDW11_IEN = 0x0002, 154 NVME_CMD_CDW11_IV = 0xFFFF0000, 155 }; 156 157 enum nvme_copy_dir { 158 NVME_COPY_TO_PRP, 159 NVME_COPY_FROM_PRP, 160 }; 161 162 #define NVME_CQ_INTEN 0x01 163 #define NVME_CQ_INTCOAL 0x02 164 165 struct nvme_completion_queue { 166 struct nvme_completion *qbase; 167 pthread_mutex_t mtx; 168 uint32_t size; 169 uint16_t tail; /* nvme progress */ 170 uint16_t head; /* guest progress */ 171 uint16_t intr_vec; 172 uint32_t intr_en; 173 }; 174 175 struct nvme_submission_queue { 176 struct nvme_command *qbase; 177 pthread_mutex_t mtx; 178 uint32_t size; 179 uint16_t head; /* nvme progress */ 180 uint16_t tail; /* guest progress */ 181 uint16_t cqid; /* completion queue id */ 182 int qpriority; 183 }; 184 185 enum nvme_storage_type { 186 NVME_STOR_BLOCKIF = 0, 187 NVME_STOR_RAM = 1, 188 }; 189 190 struct pci_nvme_blockstore { 191 enum nvme_storage_type type; 192 void *ctx; 193 uint64_t size; 194 uint32_t sectsz; 195 uint32_t sectsz_bits; 196 uint64_t eui64; 197 uint32_t deallocate:1; 198 }; 199 200 /* 201 * Calculate the number of additional page descriptors for guest IO requests 202 * based on the advertised Max Data Transfer (MDTS) and given the number of 203 * default iovec's in a struct blockif_req. 204 */ 205 #define MDTS_PAD_SIZE \ 206 ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \ 207 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 208 0 ) 209 210 struct pci_nvme_ioreq { 211 struct pci_nvme_softc *sc; 212 STAILQ_ENTRY(pci_nvme_ioreq) link; 213 struct nvme_submission_queue *nvme_sq; 214 uint16_t sqid; 215 216 /* command information */ 217 uint16_t opc; 218 uint16_t cid; 219 uint32_t nsid; 220 221 uint64_t prev_gpaddr; 222 size_t prev_size; 223 size_t bytes; 224 225 struct blockif_req io_req; 226 227 struct iovec iovpadding[MDTS_PAD_SIZE]; 228 }; 229 230 enum nvme_dsm_type { 231 /* Dataset Management bit in ONCS reflects backing storage capability */ 232 NVME_DATASET_MANAGEMENT_AUTO, 233 /* Unconditionally set Dataset Management bit in ONCS */ 234 NVME_DATASET_MANAGEMENT_ENABLE, 235 /* Unconditionally clear Dataset Management bit in ONCS */ 236 NVME_DATASET_MANAGEMENT_DISABLE, 237 }; 238 239 struct pci_nvme_softc; 240 struct nvme_feature_obj; 241 242 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, 243 struct nvme_feature_obj *, 244 struct nvme_command *, 245 struct nvme_completion *); 246 247 struct nvme_feature_obj { 248 uint32_t cdw11; 249 nvme_feature_cb set; 250 nvme_feature_cb get; 251 bool namespace_specific; 252 }; 253 254 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) 255 256 struct pci_nvme_aer { 257 STAILQ_ENTRY(pci_nvme_aer) link; 258 uint16_t cid; /* Command ID of the submitted AER */ 259 }; 260 261 struct pci_nvme_softc { 262 struct pci_devinst *nsc_pi; 263 264 pthread_mutex_t mtx; 265 266 struct nvme_registers regs; 267 268 struct nvme_namespace_data nsdata; 269 struct nvme_controller_data ctrldata; 270 struct nvme_error_information_entry err_log; 271 struct nvme_health_information_page health_log; 272 struct nvme_firmware_page fw_log; 273 274 struct pci_nvme_blockstore nvstore; 275 276 uint16_t max_qentries; /* max entries per queue */ 277 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 278 uint32_t num_cqueues; 279 uint32_t num_squeues; 280 bool num_q_is_set; /* Has host set Number of Queues */ 281 282 struct pci_nvme_ioreq *ioreqs; 283 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ 284 uint32_t pending_ios; 285 uint32_t ioslots; 286 sem_t iosemlock; 287 288 /* 289 * Memory mapped Submission and Completion queues 290 * Each array includes both Admin and IO queues 291 */ 292 struct nvme_completion_queue *compl_queues; 293 struct nvme_submission_queue *submit_queues; 294 295 struct nvme_feature_obj feat[NVME_FID_MAX]; 296 297 enum nvme_dsm_type dataset_management; 298 299 /* Accounting for SMART data */ 300 __uint128_t read_data_units; 301 __uint128_t write_data_units; 302 __uint128_t read_commands; 303 __uint128_t write_commands; 304 uint32_t read_dunits_remainder; 305 uint32_t write_dunits_remainder; 306 307 STAILQ_HEAD(, pci_nvme_aer) aer_list; 308 uint32_t aer_count; 309 }; 310 311 312 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); 313 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); 314 static void pci_nvme_io_done(struct blockif_req *, int); 315 316 /* Controller Configuration utils */ 317 #define NVME_CC_GET_EN(cc) \ 318 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 319 #define NVME_CC_GET_CSS(cc) \ 320 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 321 #define NVME_CC_GET_SHN(cc) \ 322 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 323 #define NVME_CC_GET_IOSQES(cc) \ 324 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) 325 #define NVME_CC_GET_IOCQES(cc) \ 326 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) 327 328 #define NVME_CC_WRITE_MASK \ 329 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ 330 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ 331 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) 332 333 #define NVME_CC_NEN_WRITE_MASK \ 334 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ 335 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ 336 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) 337 338 /* Controller Status utils */ 339 #define NVME_CSTS_GET_RDY(sts) \ 340 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) 341 342 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) 343 344 /* Completion Queue status word utils */ 345 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) 346 #define NVME_STATUS_MASK \ 347 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ 348 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) 349 350 #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ 351 NVME_CTRLR_DATA_ONCS_DSM_SHIFT) 352 353 static void nvme_feature_invalid_cb(struct pci_nvme_softc *, 354 struct nvme_feature_obj *, 355 struct nvme_command *, 356 struct nvme_completion *); 357 static void nvme_feature_num_queues(struct pci_nvme_softc *, 358 struct nvme_feature_obj *, 359 struct nvme_command *, 360 struct nvme_completion *); 361 static void nvme_feature_iv_config(struct pci_nvme_softc *, 362 struct nvme_feature_obj *, 363 struct nvme_command *, 364 struct nvme_completion *); 365 366 static __inline void 367 cpywithpad(char *dst, size_t dst_size, const char *src, char pad) 368 { 369 size_t len; 370 371 len = strnlen(src, dst_size); 372 memset(dst, pad, dst_size); 373 memcpy(dst, src, len); 374 } 375 376 static __inline void 377 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) 378 { 379 380 *status &= ~NVME_STATUS_MASK; 381 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | 382 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; 383 } 384 385 static __inline void 386 pci_nvme_status_genc(uint16_t *status, uint16_t code) 387 { 388 389 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); 390 } 391 392 /* 393 * Initialize the requested number or IO Submission and Completion Queues. 394 * Admin queues are allocated implicitly. 395 */ 396 static void 397 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) 398 { 399 uint32_t i; 400 401 /* 402 * Allocate and initialize the Submission Queues 403 */ 404 if (nsq > NVME_QUEUES) { 405 WPRINTF("%s: clamping number of SQ from %u to %u", 406 __func__, nsq, NVME_QUEUES); 407 nsq = NVME_QUEUES; 408 } 409 410 sc->num_squeues = nsq; 411 412 sc->submit_queues = calloc(sc->num_squeues + 1, 413 sizeof(struct nvme_submission_queue)); 414 if (sc->submit_queues == NULL) { 415 WPRINTF("%s: SQ allocation failed", __func__); 416 sc->num_squeues = 0; 417 } else { 418 struct nvme_submission_queue *sq = sc->submit_queues; 419 420 for (i = 0; i < sc->num_squeues; i++) 421 pthread_mutex_init(&sq[i].mtx, NULL); 422 } 423 424 /* 425 * Allocate and initialize the Completion Queues 426 */ 427 if (ncq > NVME_QUEUES) { 428 WPRINTF("%s: clamping number of CQ from %u to %u", 429 __func__, ncq, NVME_QUEUES); 430 ncq = NVME_QUEUES; 431 } 432 433 sc->num_cqueues = ncq; 434 435 sc->compl_queues = calloc(sc->num_cqueues + 1, 436 sizeof(struct nvme_completion_queue)); 437 if (sc->compl_queues == NULL) { 438 WPRINTF("%s: CQ allocation failed", __func__); 439 sc->num_cqueues = 0; 440 } else { 441 struct nvme_completion_queue *cq = sc->compl_queues; 442 443 for (i = 0; i < sc->num_cqueues; i++) 444 pthread_mutex_init(&cq[i].mtx, NULL); 445 } 446 } 447 448 static void 449 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 450 { 451 struct nvme_controller_data *cd = &sc->ctrldata; 452 453 cd->vid = 0xFB5D; 454 cd->ssvid = 0x0000; 455 456 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 457 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 458 459 /* Num of submission commands that we can handle at a time (2^rab) */ 460 cd->rab = 4; 461 462 /* FreeBSD OUI */ 463 cd->ieee[0] = 0x58; 464 cd->ieee[1] = 0x9c; 465 cd->ieee[2] = 0xfc; 466 467 cd->mic = 0; 468 469 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ 470 471 cd->ver = 0x00010300; 472 473 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; 474 cd->acl = 2; 475 cd->aerl = 4; 476 477 /* Advertise 1, Read-only firmware slot */ 478 cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK | 479 (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); 480 cd->lpa = 0; /* TODO: support some simple things like SMART */ 481 cd->elpe = 0; /* max error log page entries */ 482 cd->npss = 1; /* number of power states support */ 483 484 /* Warning Composite Temperature Threshold */ 485 cd->wctemp = 0x0157; 486 487 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | 488 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); 489 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | 490 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); 491 cd->nn = 1; /* number of namespaces */ 492 493 cd->oncs = 0; 494 switch (sc->dataset_management) { 495 case NVME_DATASET_MANAGEMENT_AUTO: 496 if (sc->nvstore.deallocate) 497 cd->oncs |= NVME_ONCS_DSM; 498 break; 499 case NVME_DATASET_MANAGEMENT_ENABLE: 500 cd->oncs |= NVME_ONCS_DSM; 501 break; 502 default: 503 break; 504 } 505 506 cd->fna = 0x03; 507 508 cd->power_state[0].mp = 10; 509 } 510 511 /* 512 * Calculate the CRC-16 of the given buffer 513 * See copyright attribution at top of file 514 */ 515 static uint16_t 516 crc16(uint16_t crc, const void *buffer, unsigned int len) 517 { 518 const unsigned char *cp = buffer; 519 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */ 520 static uint16_t const crc16_table[256] = { 521 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 522 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 523 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 524 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 525 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 526 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 527 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 528 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 529 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 530 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 531 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 532 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 533 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 534 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 535 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 536 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 537 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 538 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 539 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 540 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 541 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 542 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 543 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 544 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 545 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 546 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 547 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 548 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 549 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 550 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 551 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 552 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 553 }; 554 555 while (len--) 556 crc = (((crc >> 8) & 0xffU) ^ 557 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; 558 return crc; 559 } 560 561 static void 562 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 563 struct nvme_namespace_data *nd, uint32_t nsid, 564 struct pci_nvme_blockstore *nvstore) 565 { 566 567 /* Get capacity and block size information from backing store */ 568 nd->nsze = nvstore->size / nvstore->sectsz; 569 nd->ncap = nd->nsze; 570 nd->nuse = nd->nsze; 571 572 if (nvstore->type == NVME_STOR_BLOCKIF) 573 nvstore->deallocate = blockif_candelete(nvstore->ctx); 574 575 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ 576 nd->flbas = 0; 577 578 /* Create an EUI-64 if user did not provide one */ 579 if (nvstore->eui64 == 0) { 580 char *data = NULL; 581 uint64_t eui64 = nvstore->eui64; 582 583 asprintf(&data, "%s%u%u%u", get_config_value("name"), 584 sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, 585 sc->nsc_pi->pi_func); 586 587 if (data != NULL) { 588 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 589 free(data); 590 } 591 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); 592 } 593 be64enc(nd->eui64, nvstore->eui64); 594 595 /* LBA data-sz = 2^lbads */ 596 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; 597 } 598 599 static void 600 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 601 { 602 603 memset(&sc->err_log, 0, sizeof(sc->err_log)); 604 memset(&sc->health_log, 0, sizeof(sc->health_log)); 605 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 606 607 /* Set read/write remainder to round up according to spec */ 608 sc->read_dunits_remainder = 999; 609 sc->write_dunits_remainder = 999; 610 611 /* Set nominal Health values checked by implementations */ 612 sc->health_log.temperature = 310; 613 sc->health_log.available_spare = 100; 614 sc->health_log.available_spare_threshold = 10; 615 } 616 617 static void 618 pci_nvme_init_features(struct pci_nvme_softc *sc) 619 { 620 621 sc->feat[0].set = nvme_feature_invalid_cb; 622 sc->feat[0].get = nvme_feature_invalid_cb; 623 624 sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true; 625 sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true; 626 sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues; 627 sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set = 628 nvme_feature_iv_config; 629 sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get = 630 nvme_feature_invalid_cb; 631 sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get = 632 nvme_feature_invalid_cb; 633 } 634 635 static void 636 pci_nvme_aer_init(struct pci_nvme_softc *sc) 637 { 638 639 STAILQ_INIT(&sc->aer_list); 640 sc->aer_count = 0; 641 } 642 643 static void 644 pci_nvme_aer_destroy(struct pci_nvme_softc *sc) 645 { 646 struct pci_nvme_aer *aer = NULL; 647 648 while (!STAILQ_EMPTY(&sc->aer_list)) { 649 aer = STAILQ_FIRST(&sc->aer_list); 650 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 651 free(aer); 652 } 653 654 pci_nvme_aer_init(sc); 655 } 656 657 #ifdef __FreeBSD__ 658 static bool 659 pci_nvme_aer_available(struct pci_nvme_softc *sc) 660 { 661 662 return (!STAILQ_EMPTY(&sc->aer_list)); 663 } 664 #else 665 /* This is kept behind an ifdef while it's unused to appease the compiler. */ 666 #endif 667 668 static bool 669 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) 670 { 671 struct nvme_controller_data *cd = &sc->ctrldata; 672 673 /* AERL is a zero based value while aer_count is one's based */ 674 return (sc->aer_count == (cd->aerl + 1)); 675 } 676 677 /* 678 * Add an Async Event Request 679 * 680 * Stores an AER to be returned later if the Controller needs to notify the 681 * host of an event. 682 * Note that while the NVMe spec doesn't require Controllers to return AER's 683 * in order, this implementation does preserve the order. 684 */ 685 static int 686 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) 687 { 688 struct pci_nvme_aer *aer = NULL; 689 690 if (pci_nvme_aer_limit_reached(sc)) 691 return (-1); 692 693 aer = calloc(1, sizeof(struct pci_nvme_aer)); 694 if (aer == NULL) 695 return (-1); 696 697 sc->aer_count++; 698 699 /* Save the Command ID for use in the completion message */ 700 aer->cid = cid; 701 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); 702 703 return (0); 704 } 705 706 /* 707 * Get an Async Event Request structure 708 * 709 * Returns a pointer to an AER previously submitted by the host or NULL if 710 * no AER's exist. Caller is responsible for freeing the returned struct. 711 */ 712 #ifdef __FreeBSD__ 713 static struct pci_nvme_aer * 714 pci_nvme_aer_get(struct pci_nvme_softc *sc) 715 { 716 struct pci_nvme_aer *aer = NULL; 717 718 aer = STAILQ_FIRST(&sc->aer_list); 719 if (aer != NULL) { 720 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 721 sc->aer_count--; 722 } 723 724 return (aer); 725 } 726 #else 727 /* This is kept behind an ifdef while it's unused to appease the compiler. */ 728 #endif 729 730 static void 731 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 732 { 733 uint32_t i; 734 735 DPRINTF("%s", __func__); 736 737 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 738 (1 << NVME_CAP_LO_REG_CQR_SHIFT) | 739 (60 << NVME_CAP_LO_REG_TO_SHIFT); 740 741 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; 742 743 sc->regs.vs = 0x00010300; /* NVMe v1.3 */ 744 745 sc->regs.cc = 0; 746 sc->regs.csts = 0; 747 748 assert(sc->submit_queues != NULL); 749 750 for (i = 0; i < sc->num_squeues + 1; i++) { 751 sc->submit_queues[i].qbase = NULL; 752 sc->submit_queues[i].size = 0; 753 sc->submit_queues[i].cqid = 0; 754 sc->submit_queues[i].tail = 0; 755 sc->submit_queues[i].head = 0; 756 } 757 758 assert(sc->compl_queues != NULL); 759 760 for (i = 0; i < sc->num_cqueues + 1; i++) { 761 sc->compl_queues[i].qbase = NULL; 762 sc->compl_queues[i].size = 0; 763 sc->compl_queues[i].tail = 0; 764 sc->compl_queues[i].head = 0; 765 } 766 767 sc->num_q_is_set = false; 768 769 pci_nvme_aer_destroy(sc); 770 } 771 772 static void 773 pci_nvme_reset(struct pci_nvme_softc *sc) 774 { 775 pthread_mutex_lock(&sc->mtx); 776 pci_nvme_reset_locked(sc); 777 pthread_mutex_unlock(&sc->mtx); 778 } 779 780 static void 781 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) 782 { 783 uint16_t acqs, asqs; 784 785 DPRINTF("%s", __func__); 786 787 asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; 788 sc->submit_queues[0].size = asqs; 789 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, 790 sizeof(struct nvme_command) * asqs); 791 792 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", 793 __func__, sc->regs.asq, sc->submit_queues[0].qbase); 794 795 acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 796 NVME_AQA_REG_ACQS_MASK) + 1; 797 sc->compl_queues[0].size = acqs; 798 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, 799 sizeof(struct nvme_completion) * acqs); 800 sc->compl_queues[0].intr_en = NVME_CQ_INTEN; 801 802 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", 803 __func__, sc->regs.acq, sc->compl_queues[0].qbase); 804 } 805 806 static int 807 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, 808 size_t len, enum nvme_copy_dir dir) 809 { 810 uint8_t *p; 811 size_t bytes; 812 813 if (len > (8 * 1024)) { 814 return (-1); 815 } 816 817 /* Copy from the start of prp1 to the end of the physical page */ 818 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 819 bytes = MIN(bytes, len); 820 821 p = vm_map_gpa(ctx, prp1, bytes); 822 if (p == NULL) { 823 return (-1); 824 } 825 826 if (dir == NVME_COPY_TO_PRP) 827 memcpy(p, b, bytes); 828 else 829 memcpy(b, p, bytes); 830 831 b += bytes; 832 833 len -= bytes; 834 if (len == 0) { 835 return (0); 836 } 837 838 len = MIN(len, PAGE_SIZE); 839 840 p = vm_map_gpa(ctx, prp2, len); 841 if (p == NULL) { 842 return (-1); 843 } 844 845 if (dir == NVME_COPY_TO_PRP) 846 memcpy(p, b, len); 847 else 848 memcpy(b, p, len); 849 850 return (0); 851 } 852 853 /* 854 * Write a Completion Queue Entry update 855 * 856 * Write the completion and update the doorbell value 857 */ 858 static void 859 pci_nvme_cq_update(struct pci_nvme_softc *sc, 860 struct nvme_completion_queue *cq, 861 uint32_t cdw0, 862 uint16_t cid, 863 uint16_t sqid, 864 uint16_t status) 865 { 866 struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; 867 struct nvme_completion *cqe; 868 869 assert(cq->qbase != NULL); 870 871 pthread_mutex_lock(&cq->mtx); 872 873 cqe = &cq->qbase[cq->tail]; 874 875 /* Flip the phase bit */ 876 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; 877 878 cqe->cdw0 = cdw0; 879 cqe->sqhd = sq->head; 880 cqe->sqid = sqid; 881 cqe->cid = cid; 882 cqe->status = status; 883 884 cq->tail++; 885 if (cq->tail >= cq->size) { 886 cq->tail = 0; 887 } 888 889 pthread_mutex_unlock(&cq->mtx); 890 } 891 892 static int 893 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 894 struct nvme_completion* compl) 895 { 896 uint16_t qid = command->cdw10 & 0xffff; 897 898 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); 899 if (qid == 0 || qid > sc->num_squeues || 900 (sc->submit_queues[qid].qbase == NULL)) { 901 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", 902 __func__, qid, sc->num_squeues); 903 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 904 NVME_SC_INVALID_QUEUE_IDENTIFIER); 905 return (1); 906 } 907 908 sc->submit_queues[qid].qbase = NULL; 909 sc->submit_queues[qid].cqid = 0; 910 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 911 return (1); 912 } 913 914 static int 915 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 916 struct nvme_completion* compl) 917 { 918 if (command->cdw11 & NVME_CMD_CDW11_PC) { 919 uint16_t qid = command->cdw10 & 0xffff; 920 struct nvme_submission_queue *nsq; 921 922 if ((qid == 0) || (qid > sc->num_squeues) || 923 (sc->submit_queues[qid].qbase != NULL)) { 924 WPRINTF("%s queue index %u > num_squeues %u", 925 __func__, qid, sc->num_squeues); 926 pci_nvme_status_tc(&compl->status, 927 NVME_SCT_COMMAND_SPECIFIC, 928 NVME_SC_INVALID_QUEUE_IDENTIFIER); 929 return (1); 930 } 931 932 nsq = &sc->submit_queues[qid]; 933 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 934 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); 935 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { 936 /* 937 * Queues must specify at least two entries 938 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 939 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 940 */ 941 pci_nvme_status_tc(&compl->status, 942 NVME_SCT_COMMAND_SPECIFIC, 943 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 944 return (1); 945 } 946 nsq->head = nsq->tail = 0; 947 948 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 949 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { 950 pci_nvme_status_tc(&compl->status, 951 NVME_SCT_COMMAND_SPECIFIC, 952 NVME_SC_INVALID_QUEUE_IDENTIFIER); 953 return (1); 954 } 955 956 if (sc->compl_queues[nsq->cqid].qbase == NULL) { 957 pci_nvme_status_tc(&compl->status, 958 NVME_SCT_COMMAND_SPECIFIC, 959 NVME_SC_COMPLETION_QUEUE_INVALID); 960 return (1); 961 } 962 963 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 964 965 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 966 sizeof(struct nvme_command) * (size_t)nsq->size); 967 968 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, 969 qid, nsq->size, nsq->qbase, nsq->cqid); 970 971 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 972 973 DPRINTF("%s completed creating IOSQ qid %u", 974 __func__, qid); 975 } else { 976 /* 977 * Guest sent non-cont submission queue request. 978 * This setting is unsupported by this emulation. 979 */ 980 WPRINTF("%s unsupported non-contig (list-based) " 981 "create i/o submission queue", __func__); 982 983 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 984 } 985 return (1); 986 } 987 988 static int 989 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 990 struct nvme_completion* compl) 991 { 992 uint16_t qid = command->cdw10 & 0xffff; 993 uint16_t sqid; 994 995 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); 996 if (qid == 0 || qid > sc->num_cqueues || 997 (sc->compl_queues[qid].qbase == NULL)) { 998 WPRINTF("%s queue index %u / num_cqueues %u", 999 __func__, qid, sc->num_cqueues); 1000 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1001 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1002 return (1); 1003 } 1004 1005 /* Deleting an Active CQ is an error */ 1006 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) 1007 if (sc->submit_queues[sqid].cqid == qid) { 1008 pci_nvme_status_tc(&compl->status, 1009 NVME_SCT_COMMAND_SPECIFIC, 1010 NVME_SC_INVALID_QUEUE_DELETION); 1011 return (1); 1012 } 1013 1014 sc->compl_queues[qid].qbase = NULL; 1015 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1016 return (1); 1017 } 1018 1019 static int 1020 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1021 struct nvme_completion* compl) 1022 { 1023 struct nvme_completion_queue *ncq; 1024 uint16_t qid = command->cdw10 & 0xffff; 1025 1026 /* Only support Physically Contiguous queues */ 1027 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { 1028 WPRINTF("%s unsupported non-contig (list-based) " 1029 "create i/o completion queue", 1030 __func__); 1031 1032 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1033 return (1); 1034 } 1035 1036 if ((qid == 0) || (qid > sc->num_cqueues) || 1037 (sc->compl_queues[qid].qbase != NULL)) { 1038 WPRINTF("%s queue index %u > num_cqueues %u", 1039 __func__, qid, sc->num_cqueues); 1040 pci_nvme_status_tc(&compl->status, 1041 NVME_SCT_COMMAND_SPECIFIC, 1042 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1043 return (1); 1044 } 1045 1046 ncq = &sc->compl_queues[qid]; 1047 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 1048 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 1049 if (ncq->intr_vec > (sc->max_queues + 1)) { 1050 pci_nvme_status_tc(&compl->status, 1051 NVME_SCT_COMMAND_SPECIFIC, 1052 NVME_SC_INVALID_INTERRUPT_VECTOR); 1053 return (1); 1054 } 1055 1056 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1057 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { 1058 /* 1059 * Queues must specify at least two entries 1060 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1061 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1062 */ 1063 pci_nvme_status_tc(&compl->status, 1064 NVME_SCT_COMMAND_SPECIFIC, 1065 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1066 return (1); 1067 } 1068 ncq->head = ncq->tail = 0; 1069 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1070 command->prp1, 1071 sizeof(struct nvme_command) * (size_t)ncq->size); 1072 1073 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1074 1075 1076 return (1); 1077 } 1078 1079 static int 1080 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 1081 struct nvme_completion* compl) 1082 { 1083 uint32_t logsize; 1084 uint8_t logpage = command->cdw10 & 0xFF; 1085 1086 #ifndef __FreeBSD__ 1087 logsize = 0; 1088 #endif 1089 1090 DPRINTF("%s log page %u len %u", __func__, logpage, logsize); 1091 1092 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1093 1094 /* 1095 * Command specifies the number of dwords to return in fields NUMDU 1096 * and NUMDL. This is a zero-based value. 1097 */ 1098 logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; 1099 logsize *= sizeof(uint32_t); 1100 1101 switch (logpage) { 1102 case NVME_LOG_ERROR: 1103 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1104 command->prp2, (uint8_t *)&sc->err_log, 1105 MIN(logsize, sizeof(sc->err_log)), 1106 NVME_COPY_TO_PRP); 1107 break; 1108 case NVME_LOG_HEALTH_INFORMATION: 1109 pthread_mutex_lock(&sc->mtx); 1110 memcpy(&sc->health_log.data_units_read, &sc->read_data_units, 1111 sizeof(sc->health_log.data_units_read)); 1112 memcpy(&sc->health_log.data_units_written, &sc->write_data_units, 1113 sizeof(sc->health_log.data_units_written)); 1114 memcpy(&sc->health_log.host_read_commands, &sc->read_commands, 1115 sizeof(sc->health_log.host_read_commands)); 1116 memcpy(&sc->health_log.host_write_commands, &sc->write_commands, 1117 sizeof(sc->health_log.host_write_commands)); 1118 pthread_mutex_unlock(&sc->mtx); 1119 1120 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1121 command->prp2, (uint8_t *)&sc->health_log, 1122 MIN(logsize, sizeof(sc->health_log)), 1123 NVME_COPY_TO_PRP); 1124 break; 1125 case NVME_LOG_FIRMWARE_SLOT: 1126 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1127 command->prp2, (uint8_t *)&sc->fw_log, 1128 MIN(logsize, sizeof(sc->fw_log)), 1129 NVME_COPY_TO_PRP); 1130 break; 1131 default: 1132 DPRINTF("%s get log page %x command not supported", 1133 __func__, logpage); 1134 1135 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1136 NVME_SC_INVALID_LOG_PAGE); 1137 } 1138 1139 return (1); 1140 } 1141 1142 static int 1143 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 1144 struct nvme_completion* compl) 1145 { 1146 void *dest; 1147 uint16_t status; 1148 1149 #ifndef __FreeBSD__ 1150 status = 0; 1151 #endif 1152 1153 DPRINTF("%s identify 0x%x nsid 0x%x", __func__, 1154 command->cdw10 & 0xFF, command->nsid); 1155 1156 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1157 1158 switch (command->cdw10 & 0xFF) { 1159 case 0x00: /* return Identify Namespace data structure */ 1160 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1161 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), 1162 NVME_COPY_TO_PRP); 1163 break; 1164 case 0x01: /* return Identify Controller data structure */ 1165 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1166 command->prp2, (uint8_t *)&sc->ctrldata, 1167 sizeof(sc->ctrldata), 1168 NVME_COPY_TO_PRP); 1169 break; 1170 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 1171 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1172 sizeof(uint32_t) * 1024); 1173 /* All unused entries shall be zero */ 1174 bzero(dest, sizeof(uint32_t) * 1024); 1175 ((uint32_t *)dest)[0] = 1; 1176 break; 1177 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 1178 if (command->nsid != 1) { 1179 pci_nvme_status_genc(&status, 1180 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1181 break; 1182 } 1183 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1184 sizeof(uint32_t) * 1024); 1185 /* All bytes after the descriptor shall be zero */ 1186 bzero(dest, sizeof(uint32_t) * 1024); 1187 1188 /* Return NIDT=1 (i.e. EUI64) descriptor */ 1189 ((uint8_t *)dest)[0] = 1; 1190 ((uint8_t *)dest)[1] = sizeof(uint64_t); 1191 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t)); 1192 break; 1193 default: 1194 DPRINTF("%s unsupported identify command requested 0x%x", 1195 __func__, command->cdw10 & 0xFF); 1196 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); 1197 break; 1198 } 1199 1200 compl->status = status; 1201 return (1); 1202 } 1203 1204 static const char * 1205 nvme_fid_to_name(uint8_t fid) 1206 { 1207 const char *name; 1208 1209 switch (fid) { 1210 case NVME_FEAT_ARBITRATION: 1211 name = "Arbitration"; 1212 break; 1213 case NVME_FEAT_POWER_MANAGEMENT: 1214 name = "Power Management"; 1215 break; 1216 case NVME_FEAT_LBA_RANGE_TYPE: 1217 name = "LBA Range Type"; 1218 break; 1219 case NVME_FEAT_TEMPERATURE_THRESHOLD: 1220 name = "Temperature Threshold"; 1221 break; 1222 case NVME_FEAT_ERROR_RECOVERY: 1223 name = "Error Recovery"; 1224 break; 1225 case NVME_FEAT_VOLATILE_WRITE_CACHE: 1226 name = "Volatile Write Cache"; 1227 break; 1228 case NVME_FEAT_NUMBER_OF_QUEUES: 1229 name = "Number of Queues"; 1230 break; 1231 case NVME_FEAT_INTERRUPT_COALESCING: 1232 name = "Interrupt Coalescing"; 1233 break; 1234 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 1235 name = "Interrupt Vector Configuration"; 1236 break; 1237 case NVME_FEAT_WRITE_ATOMICITY: 1238 name = "Write Atomicity Normal"; 1239 break; 1240 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 1241 name = "Asynchronous Event Configuration"; 1242 break; 1243 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 1244 name = "Autonomous Power State Transition"; 1245 break; 1246 case NVME_FEAT_HOST_MEMORY_BUFFER: 1247 name = "Host Memory Buffer"; 1248 break; 1249 case NVME_FEAT_TIMESTAMP: 1250 name = "Timestamp"; 1251 break; 1252 case NVME_FEAT_KEEP_ALIVE_TIMER: 1253 name = "Keep Alive Timer"; 1254 break; 1255 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: 1256 name = "Host Controlled Thermal Management"; 1257 break; 1258 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: 1259 name = "Non-Operation Power State Config"; 1260 break; 1261 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: 1262 name = "Read Recovery Level Config"; 1263 break; 1264 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 1265 name = "Predictable Latency Mode Config"; 1266 break; 1267 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: 1268 name = "Predictable Latency Mode Window"; 1269 break; 1270 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1271 name = "LBA Status Information Report Interval"; 1272 break; 1273 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1274 name = "Host Behavior Support"; 1275 break; 1276 case NVME_FEAT_SANITIZE_CONFIG: 1277 name = "Sanitize Config"; 1278 break; 1279 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1280 name = "Endurance Group Event Configuration"; 1281 break; 1282 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1283 name = "Software Progress Marker"; 1284 break; 1285 case NVME_FEAT_HOST_IDENTIFIER: 1286 name = "Host Identifier"; 1287 break; 1288 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1289 name = "Reservation Notification Mask"; 1290 break; 1291 case NVME_FEAT_RESERVATION_PERSISTENCE: 1292 name = "Reservation Persistence"; 1293 break; 1294 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1295 name = "Namespace Write Protection Config"; 1296 break; 1297 default: 1298 name = "Unknown"; 1299 break; 1300 } 1301 1302 return (name); 1303 } 1304 1305 static void 1306 nvme_feature_invalid_cb(struct pci_nvme_softc *sc, 1307 struct nvme_feature_obj *feat, 1308 struct nvme_command *command, 1309 struct nvme_completion *compl) 1310 { 1311 1312 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1313 } 1314 1315 static void 1316 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1317 struct nvme_feature_obj *feat, 1318 struct nvme_command *command, 1319 struct nvme_completion *compl) 1320 { 1321 uint32_t i; 1322 uint32_t cdw11 = command->cdw11; 1323 uint16_t iv; 1324 bool cd; 1325 1326 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1327 1328 iv = cdw11 & 0xffff; 1329 cd = cdw11 & (1 << 16); 1330 1331 if (iv > (sc->max_queues + 1)) { 1332 return; 1333 } 1334 1335 /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */ 1336 if ((iv == 0) && !cd) 1337 return; 1338 1339 /* Requested Interrupt Vector must be used by a CQ */ 1340 for (i = 0; i < sc->num_cqueues + 1; i++) { 1341 if (sc->compl_queues[i].intr_vec == iv) { 1342 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1343 } 1344 } 1345 1346 } 1347 1348 static void 1349 nvme_feature_num_queues(struct pci_nvme_softc *sc, 1350 struct nvme_feature_obj *feat, 1351 struct nvme_command *command, 1352 struct nvme_completion *compl) 1353 { 1354 uint16_t nqr; /* Number of Queues Requested */ 1355 1356 if (sc->num_q_is_set) { 1357 WPRINTF("%s: Number of Queues already set", __func__); 1358 pci_nvme_status_genc(&compl->status, 1359 NVME_SC_COMMAND_SEQUENCE_ERROR); 1360 return; 1361 } 1362 1363 nqr = command->cdw11 & 0xFFFF; 1364 if (nqr == 0xffff) { 1365 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); 1366 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1367 return; 1368 } 1369 1370 sc->num_squeues = ONE_BASED(nqr); 1371 if (sc->num_squeues > sc->max_queues) { 1372 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, 1373 sc->max_queues); 1374 sc->num_squeues = sc->max_queues; 1375 } 1376 1377 nqr = (command->cdw11 >> 16) & 0xFFFF; 1378 if (nqr == 0xffff) { 1379 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); 1380 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1381 return; 1382 } 1383 1384 sc->num_cqueues = ONE_BASED(nqr); 1385 if (sc->num_cqueues > sc->max_queues) { 1386 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, 1387 sc->max_queues); 1388 sc->num_cqueues = sc->max_queues; 1389 } 1390 1391 /* Patch the command value which will be saved on callback's return */ 1392 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); 1393 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 1394 1395 sc->num_q_is_set = true; 1396 } 1397 1398 static int 1399 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, 1400 struct nvme_completion *compl) 1401 { 1402 struct nvme_feature_obj *feat; 1403 uint32_t nsid = command->nsid; 1404 uint8_t fid = command->cdw10 & 0xFF; 1405 1406 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1407 1408 if (fid >= NVME_FID_MAX) { 1409 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1410 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1411 return (1); 1412 } 1413 feat = &sc->feat[fid]; 1414 1415 if (!feat->namespace_specific && 1416 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { 1417 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1418 NVME_SC_FEATURE_NOT_NS_SPECIFIC); 1419 return (1); 1420 } 1421 1422 compl->cdw0 = 0; 1423 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1424 1425 if (feat->set) 1426 feat->set(sc, feat, command, compl); 1427 1428 if (compl->status == NVME_SC_SUCCESS) 1429 feat->cdw11 = command->cdw11; 1430 1431 return (0); 1432 } 1433 1434 static int 1435 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 1436 struct nvme_completion* compl) 1437 { 1438 struct nvme_feature_obj *feat; 1439 uint8_t fid = command->cdw10 & 0xFF; 1440 1441 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1442 1443 if (fid >= NVME_FID_MAX) { 1444 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1445 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1446 return (1); 1447 } 1448 1449 compl->cdw0 = 0; 1450 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1451 1452 feat = &sc->feat[fid]; 1453 if (feat->get) { 1454 feat->get(sc, feat, command, compl); 1455 } 1456 1457 if (compl->status == NVME_SC_SUCCESS) { 1458 compl->cdw0 = feat->cdw11; 1459 } 1460 1461 return (0); 1462 } 1463 1464 static int 1465 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, 1466 struct nvme_completion* compl) 1467 { 1468 uint8_t ses, lbaf, pi; 1469 1470 /* Only supports Secure Erase Setting - User Data Erase */ 1471 ses = (command->cdw10 >> 9) & 0x7; 1472 if (ses > 0x1) { 1473 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1474 return (1); 1475 } 1476 1477 /* Only supports a single LBA Format */ 1478 lbaf = command->cdw10 & 0xf; 1479 if (lbaf != 0) { 1480 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1481 NVME_SC_INVALID_FORMAT); 1482 return (1); 1483 } 1484 1485 /* Doesn't support Protection Infomation */ 1486 pi = (command->cdw10 >> 5) & 0x7; 1487 if (pi != 0) { 1488 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1489 return (1); 1490 } 1491 1492 if (sc->nvstore.type == NVME_STOR_RAM) { 1493 if (sc->nvstore.ctx) 1494 free(sc->nvstore.ctx); 1495 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1496 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1497 } else { 1498 struct pci_nvme_ioreq *req; 1499 int err; 1500 1501 req = pci_nvme_get_ioreq(sc); 1502 if (req == NULL) { 1503 pci_nvme_status_genc(&compl->status, 1504 NVME_SC_INTERNAL_DEVICE_ERROR); 1505 WPRINTF("%s: unable to allocate IO req", __func__); 1506 return (1); 1507 } 1508 req->nvme_sq = &sc->submit_queues[0]; 1509 req->sqid = 0; 1510 req->opc = command->opc; 1511 req->cid = command->cid; 1512 req->nsid = command->nsid; 1513 1514 req->io_req.br_offset = 0; 1515 req->io_req.br_resid = sc->nvstore.size; 1516 req->io_req.br_callback = pci_nvme_io_done; 1517 1518 err = blockif_delete(sc->nvstore.ctx, &req->io_req); 1519 if (err) { 1520 pci_nvme_status_genc(&compl->status, 1521 NVME_SC_INTERNAL_DEVICE_ERROR); 1522 pci_nvme_release_ioreq(sc, req); 1523 } 1524 } 1525 1526 return (1); 1527 } 1528 1529 static int 1530 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, 1531 struct nvme_completion* compl) 1532 { 1533 DPRINTF("%s submission queue %u, command ID 0x%x", __func__, 1534 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); 1535 1536 /* TODO: search for the command ID and abort it */ 1537 1538 compl->cdw0 = 1; 1539 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1540 return (1); 1541 } 1542 1543 static int 1544 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 1545 struct nvme_command* command, struct nvme_completion* compl) 1546 { 1547 DPRINTF("%s async event request 0x%x", __func__, command->cdw11); 1548 1549 /* Don't exceed the Async Event Request Limit (AERL). */ 1550 if (pci_nvme_aer_limit_reached(sc)) { 1551 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1552 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 1553 return (1); 1554 } 1555 1556 if (pci_nvme_aer_add(sc, command->cid)) { 1557 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 1558 NVME_SC_INTERNAL_DEVICE_ERROR); 1559 return (1); 1560 } 1561 1562 /* 1563 * Raise events when they happen based on the Set Features cmd. 1564 * These events happen async, so only set completion successful if 1565 * there is an event reflective of the request to get event. 1566 */ 1567 compl->status = NVME_NO_STATUS; 1568 1569 return (0); 1570 } 1571 1572 static void 1573 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 1574 { 1575 struct nvme_completion compl; 1576 struct nvme_command *cmd; 1577 struct nvme_submission_queue *sq; 1578 struct nvme_completion_queue *cq; 1579 uint16_t sqhead; 1580 1581 DPRINTF("%s index %u", __func__, (uint32_t)value); 1582 1583 sq = &sc->submit_queues[0]; 1584 cq = &sc->compl_queues[0]; 1585 1586 pthread_mutex_lock(&sq->mtx); 1587 1588 sqhead = sq->head; 1589 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 1590 1591 while (sqhead != atomic_load_acq_short(&sq->tail)) { 1592 cmd = &(sq->qbase)[sqhead]; 1593 compl.cdw0 = 0; 1594 compl.status = 0; 1595 1596 switch (cmd->opc) { 1597 case NVME_OPC_DELETE_IO_SQ: 1598 DPRINTF("%s command DELETE_IO_SQ", __func__); 1599 nvme_opc_delete_io_sq(sc, cmd, &compl); 1600 break; 1601 case NVME_OPC_CREATE_IO_SQ: 1602 DPRINTF("%s command CREATE_IO_SQ", __func__); 1603 nvme_opc_create_io_sq(sc, cmd, &compl); 1604 break; 1605 case NVME_OPC_DELETE_IO_CQ: 1606 DPRINTF("%s command DELETE_IO_CQ", __func__); 1607 nvme_opc_delete_io_cq(sc, cmd, &compl); 1608 break; 1609 case NVME_OPC_CREATE_IO_CQ: 1610 DPRINTF("%s command CREATE_IO_CQ", __func__); 1611 nvme_opc_create_io_cq(sc, cmd, &compl); 1612 break; 1613 case NVME_OPC_GET_LOG_PAGE: 1614 DPRINTF("%s command GET_LOG_PAGE", __func__); 1615 nvme_opc_get_log_page(sc, cmd, &compl); 1616 break; 1617 case NVME_OPC_IDENTIFY: 1618 DPRINTF("%s command IDENTIFY", __func__); 1619 nvme_opc_identify(sc, cmd, &compl); 1620 break; 1621 case NVME_OPC_ABORT: 1622 DPRINTF("%s command ABORT", __func__); 1623 nvme_opc_abort(sc, cmd, &compl); 1624 break; 1625 case NVME_OPC_SET_FEATURES: 1626 DPRINTF("%s command SET_FEATURES", __func__); 1627 nvme_opc_set_features(sc, cmd, &compl); 1628 break; 1629 case NVME_OPC_GET_FEATURES: 1630 DPRINTF("%s command GET_FEATURES", __func__); 1631 nvme_opc_get_features(sc, cmd, &compl); 1632 break; 1633 case NVME_OPC_FIRMWARE_ACTIVATE: 1634 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 1635 pci_nvme_status_tc(&compl.status, 1636 NVME_SCT_COMMAND_SPECIFIC, 1637 NVME_SC_INVALID_FIRMWARE_SLOT); 1638 break; 1639 case NVME_OPC_ASYNC_EVENT_REQUEST: 1640 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 1641 nvme_opc_async_event_req(sc, cmd, &compl); 1642 break; 1643 case NVME_OPC_FORMAT_NVM: 1644 DPRINTF("%s command FORMAT_NVM", __func__); 1645 if ((sc->ctrldata.oacs & 1646 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 1647 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 1648 } 1649 compl.status = NVME_NO_STATUS; 1650 nvme_opc_format_nvm(sc, cmd, &compl); 1651 break; 1652 default: 1653 DPRINTF("0x%x command is not implemented", 1654 cmd->opc); 1655 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 1656 } 1657 sqhead = (sqhead + 1) % sq->size; 1658 1659 if (NVME_COMPLETION_VALID(compl)) { 1660 pci_nvme_cq_update(sc, &sc->compl_queues[0], 1661 compl.cdw0, 1662 cmd->cid, 1663 0, /* SQID */ 1664 compl.status); 1665 } 1666 } 1667 1668 DPRINTF("setting sqhead %u", sqhead); 1669 sq->head = sqhead; 1670 1671 if (cq->head != cq->tail) 1672 pci_generate_msix(sc->nsc_pi, 0); 1673 1674 pthread_mutex_unlock(&sq->mtx); 1675 } 1676 1677 /* 1678 * Update the Write and Read statistics reported in SMART data 1679 * 1680 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. 1681 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 1682 * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999. 1683 */ 1684 static void 1685 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 1686 size_t bytes, uint16_t status) 1687 { 1688 1689 pthread_mutex_lock(&sc->mtx); 1690 switch (opc) { 1691 case NVME_OPC_WRITE: 1692 sc->write_commands++; 1693 if (status != NVME_SC_SUCCESS) 1694 break; 1695 sc->write_dunits_remainder += (bytes / 512); 1696 while (sc->write_dunits_remainder >= 1000) { 1697 sc->write_data_units++; 1698 sc->write_dunits_remainder -= 1000; 1699 } 1700 break; 1701 case NVME_OPC_READ: 1702 sc->read_commands++; 1703 if (status != NVME_SC_SUCCESS) 1704 break; 1705 sc->read_dunits_remainder += (bytes / 512); 1706 while (sc->read_dunits_remainder >= 1000) { 1707 sc->read_data_units++; 1708 sc->read_dunits_remainder -= 1000; 1709 } 1710 break; 1711 default: 1712 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 1713 break; 1714 } 1715 pthread_mutex_unlock(&sc->mtx); 1716 } 1717 1718 /* 1719 * Check if the combination of Starting LBA (slba) and Number of Logical 1720 * Blocks (nlb) exceeds the range of the underlying storage. 1721 * 1722 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 1723 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 1724 * overflow. 1725 */ 1726 static bool 1727 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 1728 uint32_t nlb) 1729 { 1730 size_t offset, bytes; 1731 1732 /* Overflow check of multiplying Starting LBA by the sector size */ 1733 if (slba >> (64 - nvstore->sectsz_bits)) 1734 return (true); 1735 1736 offset = slba << nvstore->sectsz_bits; 1737 bytes = nlb << nvstore->sectsz_bits; 1738 1739 /* Overflow check of Number of Logical Blocks */ 1740 if ((nvstore->size - offset) < bytes) 1741 return (true); 1742 1743 return (false); 1744 } 1745 1746 static int 1747 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, 1748 uint64_t gpaddr, size_t size, int do_write, uint64_t lba) 1749 { 1750 int iovidx; 1751 1752 if (req == NULL) 1753 return (-1); 1754 1755 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 1756 return (-1); 1757 } 1758 1759 /* concatenate contig block-iovs to minimize number of iovs */ 1760 if ((req->prev_gpaddr + req->prev_size) == gpaddr) { 1761 iovidx = req->io_req.br_iovcnt - 1; 1762 1763 req->io_req.br_iov[iovidx].iov_base = 1764 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 1765 req->prev_gpaddr, size); 1766 1767 req->prev_size += size; 1768 req->io_req.br_resid += size; 1769 1770 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 1771 } else { 1772 iovidx = req->io_req.br_iovcnt; 1773 if (iovidx == 0) { 1774 req->io_req.br_offset = lba; 1775 req->io_req.br_resid = 0; 1776 req->io_req.br_param = req; 1777 } 1778 1779 req->io_req.br_iov[iovidx].iov_base = 1780 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 1781 gpaddr, size); 1782 1783 req->io_req.br_iov[iovidx].iov_len = size; 1784 1785 req->prev_gpaddr = gpaddr; 1786 req->prev_size = size; 1787 req->io_req.br_resid += size; 1788 1789 req->io_req.br_iovcnt++; 1790 } 1791 1792 return (0); 1793 } 1794 1795 static void 1796 pci_nvme_set_completion(struct pci_nvme_softc *sc, 1797 struct nvme_submission_queue *sq, int sqid, uint16_t cid, 1798 uint32_t cdw0, uint16_t status) 1799 { 1800 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 1801 1802 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 1803 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 1804 NVME_STATUS_GET_SC(status)); 1805 1806 pci_nvme_cq_update(sc, cq, 1807 0, /* CDW0 */ 1808 cid, 1809 sqid, 1810 status); 1811 1812 if (cq->head != cq->tail) { 1813 if (cq->intr_en & NVME_CQ_INTEN) { 1814 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 1815 } else { 1816 DPRINTF("%s: CQ%u interrupt disabled", 1817 __func__, sq->cqid); 1818 } 1819 } 1820 } 1821 1822 static void 1823 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 1824 { 1825 req->sc = NULL; 1826 req->nvme_sq = NULL; 1827 req->sqid = 0; 1828 1829 pthread_mutex_lock(&sc->mtx); 1830 1831 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 1832 sc->pending_ios--; 1833 1834 /* when no more IO pending, can set to ready if device reset/enabled */ 1835 if (sc->pending_ios == 0 && 1836 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 1837 sc->regs.csts |= NVME_CSTS_RDY; 1838 1839 pthread_mutex_unlock(&sc->mtx); 1840 1841 sem_post(&sc->iosemlock); 1842 } 1843 1844 static struct pci_nvme_ioreq * 1845 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 1846 { 1847 struct pci_nvme_ioreq *req = NULL; 1848 1849 sem_wait(&sc->iosemlock); 1850 pthread_mutex_lock(&sc->mtx); 1851 1852 req = STAILQ_FIRST(&sc->ioreqs_free); 1853 assert(req != NULL); 1854 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 1855 1856 req->sc = sc; 1857 1858 sc->pending_ios++; 1859 1860 pthread_mutex_unlock(&sc->mtx); 1861 1862 req->io_req.br_iovcnt = 0; 1863 req->io_req.br_offset = 0; 1864 req->io_req.br_resid = 0; 1865 req->io_req.br_param = req; 1866 req->prev_gpaddr = 0; 1867 req->prev_size = 0; 1868 1869 return req; 1870 } 1871 1872 static void 1873 pci_nvme_io_done(struct blockif_req *br, int err) 1874 { 1875 struct pci_nvme_ioreq *req = br->br_param; 1876 struct nvme_submission_queue *sq = req->nvme_sq; 1877 uint16_t code, status; 1878 1879 #ifndef __FreeBSD__ 1880 status = 0; 1881 #endif 1882 1883 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 1884 1885 /* TODO return correct error */ 1886 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 1887 pci_nvme_status_genc(&status, code); 1888 1889 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status); 1890 pci_nvme_stats_write_read_update(req->sc, req->opc, 1891 req->bytes, status); 1892 pci_nvme_release_ioreq(req->sc, req); 1893 } 1894 1895 /* 1896 * Implements the Flush command. The specification states: 1897 * If a volatile write cache is not present, Flush commands complete 1898 * successfully and have no effect 1899 * in the description of the Volatile Write Cache (VWC) field of the Identify 1900 * Controller data. Therefore, set status to Success if the command is 1901 * not supported (i.e. RAM or as indicated by the blockif). 1902 */ 1903 static bool 1904 nvme_opc_flush(struct pci_nvme_softc *sc, 1905 struct nvme_command *cmd, 1906 struct pci_nvme_blockstore *nvstore, 1907 struct pci_nvme_ioreq *req, 1908 uint16_t *status) 1909 { 1910 bool pending = false; 1911 1912 if (nvstore->type == NVME_STOR_RAM) { 1913 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 1914 } else { 1915 int err; 1916 1917 req->io_req.br_callback = pci_nvme_io_done; 1918 1919 err = blockif_flush(nvstore->ctx, &req->io_req); 1920 switch (err) { 1921 case 0: 1922 pending = true; 1923 break; 1924 case EOPNOTSUPP: 1925 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 1926 break; 1927 default: 1928 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 1929 } 1930 } 1931 1932 return (pending); 1933 } 1934 1935 static uint16_t 1936 nvme_write_read_ram(struct pci_nvme_softc *sc, 1937 struct pci_nvme_blockstore *nvstore, 1938 uint64_t prp1, uint64_t prp2, 1939 size_t offset, uint64_t bytes, 1940 bool is_write) 1941 { 1942 uint8_t *buf = nvstore->ctx; 1943 enum nvme_copy_dir dir; 1944 uint16_t status; 1945 1946 #ifndef __FreeBSD__ 1947 status = 0; 1948 #endif 1949 1950 if (is_write) 1951 dir = NVME_COPY_TO_PRP; 1952 else 1953 dir = NVME_COPY_FROM_PRP; 1954 1955 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 1956 buf + offset, bytes, dir)) 1957 pci_nvme_status_genc(&status, 1958 NVME_SC_DATA_TRANSFER_ERROR); 1959 else 1960 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1961 1962 return (status); 1963 } 1964 1965 static uint16_t 1966 nvme_write_read_blockif(struct pci_nvme_softc *sc, 1967 struct pci_nvme_blockstore *nvstore, 1968 struct pci_nvme_ioreq *req, 1969 uint64_t prp1, uint64_t prp2, 1970 size_t offset, uint64_t bytes, 1971 bool is_write) 1972 { 1973 uint64_t size; 1974 int err; 1975 uint16_t status = NVME_NO_STATUS; 1976 1977 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 1978 if (pci_nvme_append_iov_req(sc, req, prp1, 1979 size, is_write, offset)) { 1980 pci_nvme_status_genc(&status, 1981 NVME_SC_DATA_TRANSFER_ERROR); 1982 goto out; 1983 } 1984 1985 offset += size; 1986 bytes -= size; 1987 1988 if (bytes == 0) { 1989 ; 1990 } else if (bytes <= PAGE_SIZE) { 1991 size = bytes; 1992 if (pci_nvme_append_iov_req(sc, req, prp2, 1993 size, is_write, offset)) { 1994 pci_nvme_status_genc(&status, 1995 NVME_SC_DATA_TRANSFER_ERROR); 1996 goto out; 1997 } 1998 } else { 1999 void *vmctx = sc->nsc_pi->pi_vmctx; 2000 uint64_t *prp_list = &prp2; 2001 uint64_t *last = prp_list; 2002 2003 /* PRP2 is pointer to a physical region page list */ 2004 while (bytes) { 2005 /* Last entry in list points to the next list */ 2006 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2007 uint64_t prp = *prp_list; 2008 2009 prp_list = paddr_guest2host(vmctx, prp, 2010 PAGE_SIZE - (prp % PAGE_SIZE)); 2011 last = prp_list + (NVME_PRP2_ITEMS - 1); 2012 } 2013 2014 size = MIN(bytes, PAGE_SIZE); 2015 2016 if (pci_nvme_append_iov_req(sc, req, *prp_list, 2017 size, is_write, offset)) { 2018 pci_nvme_status_genc(&status, 2019 NVME_SC_DATA_TRANSFER_ERROR); 2020 goto out; 2021 } 2022 2023 offset += size; 2024 bytes -= size; 2025 2026 prp_list++; 2027 } 2028 } 2029 req->io_req.br_callback = pci_nvme_io_done; 2030 if (is_write) 2031 err = blockif_write(nvstore->ctx, &req->io_req); 2032 else 2033 err = blockif_read(nvstore->ctx, &req->io_req); 2034 2035 if (err) 2036 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2037 out: 2038 return (status); 2039 } 2040 2041 static bool 2042 nvme_opc_write_read(struct pci_nvme_softc *sc, 2043 struct nvme_command *cmd, 2044 struct pci_nvme_blockstore *nvstore, 2045 struct pci_nvme_ioreq *req, 2046 uint16_t *status) 2047 { 2048 uint64_t lba, nblocks, bytes; 2049 size_t offset; 2050 bool is_write = cmd->opc == NVME_OPC_WRITE; 2051 bool pending = false; 2052 2053 #ifndef __FreeBSD__ 2054 bytes = 0; 2055 #endif 2056 2057 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2058 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2059 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2060 WPRINTF("%s command would exceed LBA range", __func__); 2061 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2062 goto out; 2063 } 2064 2065 bytes = nblocks << nvstore->sectsz_bits; 2066 if (bytes > NVME_MAX_DATA_SIZE) { 2067 WPRINTF("%s command would exceed MDTS", __func__); 2068 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2069 goto out; 2070 } 2071 2072 offset = lba << nvstore->sectsz_bits; 2073 2074 req->bytes = bytes; 2075 req->io_req.br_offset = lba; 2076 2077 /* PRP bits 1:0 must be zero */ 2078 cmd->prp1 &= ~0x3UL; 2079 cmd->prp2 &= ~0x3UL; 2080 2081 if (nvstore->type == NVME_STOR_RAM) { 2082 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2083 cmd->prp2, offset, bytes, is_write); 2084 } else { 2085 *status = nvme_write_read_blockif(sc, nvstore, req, 2086 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2087 2088 if (*status == NVME_NO_STATUS) 2089 pending = true; 2090 } 2091 out: 2092 if (!pending) 2093 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2094 2095 return (pending); 2096 } 2097 2098 static void 2099 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2100 { 2101 struct pci_nvme_ioreq *req = br->br_param; 2102 struct pci_nvme_softc *sc = req->sc; 2103 bool done = true; 2104 uint16_t status; 2105 2106 #ifndef __FreeBSD__ 2107 status = 0; 2108 #endif 2109 2110 if (err) { 2111 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2112 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2113 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2114 } else { 2115 struct iovec *iov = req->io_req.br_iov; 2116 2117 req->prev_gpaddr++; 2118 iov += req->prev_gpaddr; 2119 2120 /* The iov_* values already include the sector size */ 2121 req->io_req.br_offset = (off_t)iov->iov_base; 2122 req->io_req.br_resid = iov->iov_len; 2123 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2124 pci_nvme_status_genc(&status, 2125 NVME_SC_INTERNAL_DEVICE_ERROR); 2126 } else 2127 done = false; 2128 } 2129 2130 if (done) { 2131 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, 2132 req->cid, 0, status); 2133 pci_nvme_release_ioreq(sc, req); 2134 } 2135 } 2136 2137 static bool 2138 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2139 struct nvme_command *cmd, 2140 struct pci_nvme_blockstore *nvstore, 2141 struct pci_nvme_ioreq *req, 2142 uint16_t *status) 2143 { 2144 struct nvme_dsm_range *range; 2145 uint32_t nr, r, non_zero, dr; 2146 int err; 2147 bool pending = false; 2148 2149 #ifndef __FreeBSD__ 2150 range = NULL; 2151 #endif 2152 2153 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2154 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2155 goto out; 2156 } 2157 2158 nr = cmd->cdw10 & 0xff; 2159 2160 /* copy locally because a range entry could straddle PRPs */ 2161 range = calloc(1, NVME_MAX_DSM_TRIM); 2162 if (range == NULL) { 2163 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2164 goto out; 2165 } 2166 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2167 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2168 2169 /* Check for invalid ranges and the number of non-zero lengths */ 2170 non_zero = 0; 2171 for (r = 0; r <= nr; r++) { 2172 if (pci_nvme_out_of_range(nvstore, 2173 range[r].starting_lba, range[r].length)) { 2174 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2175 goto out; 2176 } 2177 if (range[r].length != 0) 2178 non_zero++; 2179 } 2180 2181 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2182 size_t offset, bytes; 2183 int sectsz_bits = sc->nvstore.sectsz_bits; 2184 2185 /* 2186 * DSM calls are advisory only, and compliant controllers 2187 * may choose to take no actions (i.e. return Success). 2188 */ 2189 if (!nvstore->deallocate) { 2190 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2191 goto out; 2192 } 2193 2194 /* If all ranges have a zero length, return Success */ 2195 if (non_zero == 0) { 2196 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2197 goto out; 2198 } 2199 2200 if (req == NULL) { 2201 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2202 goto out; 2203 } 2204 2205 offset = range[0].starting_lba << sectsz_bits; 2206 bytes = range[0].length << sectsz_bits; 2207 2208 /* 2209 * If the request is for more than a single range, store 2210 * the ranges in the br_iov. Optimize for the common case 2211 * of a single range. 2212 * 2213 * Note that NVMe Number of Ranges is a zero based value 2214 */ 2215 req->io_req.br_iovcnt = 0; 2216 req->io_req.br_offset = offset; 2217 req->io_req.br_resid = bytes; 2218 2219 if (nr == 0) { 2220 req->io_req.br_callback = pci_nvme_io_done; 2221 } else { 2222 struct iovec *iov = req->io_req.br_iov; 2223 2224 for (r = 0, dr = 0; r <= nr; r++) { 2225 offset = range[r].starting_lba << sectsz_bits; 2226 bytes = range[r].length << sectsz_bits; 2227 if (bytes == 0) 2228 continue; 2229 2230 if ((nvstore->size - offset) < bytes) { 2231 pci_nvme_status_genc(status, 2232 NVME_SC_LBA_OUT_OF_RANGE); 2233 goto out; 2234 } 2235 iov[dr].iov_base = (void *)offset; 2236 iov[dr].iov_len = bytes; 2237 dr++; 2238 } 2239 req->io_req.br_callback = pci_nvme_dealloc_sm; 2240 2241 /* 2242 * Use prev_gpaddr to track the current entry and 2243 * prev_size to track the number of entries 2244 */ 2245 req->prev_gpaddr = 0; 2246 req->prev_size = dr; 2247 } 2248 2249 err = blockif_delete(nvstore->ctx, &req->io_req); 2250 if (err) 2251 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2252 else 2253 pending = true; 2254 } 2255 out: 2256 free(range); 2257 return (pending); 2258 } 2259 2260 static void 2261 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2262 { 2263 struct nvme_submission_queue *sq; 2264 uint16_t status; 2265 uint16_t sqhead; 2266 2267 #ifndef __FreeBSD__ 2268 status = 0; 2269 #endif 2270 2271 /* handle all submissions up to sq->tail index */ 2272 sq = &sc->submit_queues[idx]; 2273 2274 pthread_mutex_lock(&sq->mtx); 2275 2276 sqhead = sq->head; 2277 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2278 idx, sqhead, sq->tail, sq->qbase); 2279 2280 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2281 struct nvme_command *cmd; 2282 struct pci_nvme_ioreq *req; 2283 uint32_t nsid; 2284 bool pending; 2285 2286 pending = false; 2287 req = NULL; 2288 status = 0; 2289 2290 cmd = &sq->qbase[sqhead]; 2291 sqhead = (sqhead + 1) % sq->size; 2292 2293 nsid = le32toh(cmd->nsid); 2294 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2295 pci_nvme_status_genc(&status, 2296 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2297 status |= 2298 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2299 goto complete; 2300 } 2301 2302 req = pci_nvme_get_ioreq(sc); 2303 if (req == NULL) { 2304 pci_nvme_status_genc(&status, 2305 NVME_SC_INTERNAL_DEVICE_ERROR); 2306 WPRINTF("%s: unable to allocate IO req", __func__); 2307 goto complete; 2308 } 2309 req->nvme_sq = sq; 2310 req->sqid = idx; 2311 req->opc = cmd->opc; 2312 req->cid = cmd->cid; 2313 req->nsid = cmd->nsid; 2314 2315 switch (cmd->opc) { 2316 case NVME_OPC_FLUSH: 2317 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2318 req, &status); 2319 break; 2320 case NVME_OPC_WRITE: 2321 case NVME_OPC_READ: 2322 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2323 req, &status); 2324 break; 2325 case NVME_OPC_WRITE_ZEROES: 2326 /* TODO: write zeroes 2327 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2328 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2329 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2330 break; 2331 case NVME_OPC_DATASET_MANAGEMENT: 2332 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2333 req, &status); 2334 break; 2335 default: 2336 WPRINTF("%s unhandled io command 0x%x", 2337 __func__, cmd->opc); 2338 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2339 } 2340 complete: 2341 if (!pending) { 2342 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 2343 status); 2344 if (req != NULL) 2345 pci_nvme_release_ioreq(sc, req); 2346 } 2347 } 2348 2349 sq->head = sqhead; 2350 2351 pthread_mutex_unlock(&sq->mtx); 2352 } 2353 2354 static void 2355 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, 2356 uint64_t idx, int is_sq, uint64_t value) 2357 { 2358 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2359 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2360 2361 if (is_sq) { 2362 if (idx > sc->num_squeues) { 2363 WPRINTF("%s queue index %lu overflow from " 2364 "guest (max %u)", 2365 __func__, idx, sc->num_squeues); 2366 return; 2367 } 2368 2369 atomic_store_short(&sc->submit_queues[idx].tail, 2370 (uint16_t)value); 2371 2372 if (idx == 0) { 2373 pci_nvme_handle_admin_cmd(sc, value); 2374 } else { 2375 /* submission queue; handle new entries in SQ */ 2376 if (idx > sc->num_squeues) { 2377 WPRINTF("%s SQ index %lu overflow from " 2378 "guest (max %u)", 2379 __func__, idx, sc->num_squeues); 2380 return; 2381 } 2382 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2383 } 2384 } else { 2385 if (idx > sc->num_cqueues) { 2386 WPRINTF("%s queue index %lu overflow from " 2387 "guest (max %u)", 2388 __func__, idx, sc->num_cqueues); 2389 return; 2390 } 2391 2392 atomic_store_short(&sc->compl_queues[idx].head, 2393 (uint16_t)value); 2394 } 2395 } 2396 2397 static void 2398 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2399 { 2400 const char *s = iswrite ? "WRITE" : "READ"; 2401 2402 switch (offset) { 2403 case NVME_CR_CAP_LOW: 2404 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2405 break; 2406 case NVME_CR_CAP_HI: 2407 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2408 break; 2409 case NVME_CR_VS: 2410 DPRINTF("%s %s NVME_CR_VS", func, s); 2411 break; 2412 case NVME_CR_INTMS: 2413 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2414 break; 2415 case NVME_CR_INTMC: 2416 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2417 break; 2418 case NVME_CR_CC: 2419 DPRINTF("%s %s NVME_CR_CC", func, s); 2420 break; 2421 case NVME_CR_CSTS: 2422 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2423 break; 2424 case NVME_CR_NSSR: 2425 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2426 break; 2427 case NVME_CR_AQA: 2428 DPRINTF("%s %s NVME_CR_AQA", func, s); 2429 break; 2430 case NVME_CR_ASQ_LOW: 2431 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2432 break; 2433 case NVME_CR_ASQ_HI: 2434 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2435 break; 2436 case NVME_CR_ACQ_LOW: 2437 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2438 break; 2439 case NVME_CR_ACQ_HI: 2440 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2441 break; 2442 default: 2443 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2444 } 2445 2446 } 2447 2448 static void 2449 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 2450 uint64_t offset, int size, uint64_t value) 2451 { 2452 uint32_t ccreg; 2453 2454 if (offset >= NVME_DOORBELL_OFFSET) { 2455 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2456 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2457 int is_sq = (belloffset % 8) < 4; 2458 2459 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2460 WPRINTF("guest attempted an overflow write offset " 2461 "0x%lx, val 0x%lx in %s", 2462 offset, value, __func__); 2463 return; 2464 } 2465 2466 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 2467 return; 2468 } 2469 2470 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2471 offset, size, value); 2472 2473 if (size != 4) { 2474 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2475 "val 0x%lx) to bar0 in %s", 2476 size, offset, value, __func__); 2477 /* TODO: shutdown device */ 2478 return; 2479 } 2480 2481 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2482 2483 pthread_mutex_lock(&sc->mtx); 2484 2485 switch (offset) { 2486 case NVME_CR_CAP_LOW: 2487 case NVME_CR_CAP_HI: 2488 /* readonly */ 2489 break; 2490 case NVME_CR_VS: 2491 /* readonly */ 2492 break; 2493 case NVME_CR_INTMS: 2494 /* MSI-X, so ignore */ 2495 break; 2496 case NVME_CR_INTMC: 2497 /* MSI-X, so ignore */ 2498 break; 2499 case NVME_CR_CC: 2500 ccreg = (uint32_t)value; 2501 2502 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2503 "iocqes %u", 2504 __func__, 2505 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2506 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2507 NVME_CC_GET_IOCQES(ccreg)); 2508 2509 if (NVME_CC_GET_SHN(ccreg)) { 2510 /* perform shutdown - flush out data to backend */ 2511 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 2512 NVME_CSTS_REG_SHST_SHIFT); 2513 sc->regs.csts |= NVME_SHST_COMPLETE << 2514 NVME_CSTS_REG_SHST_SHIFT; 2515 } 2516 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 2517 if (NVME_CC_GET_EN(ccreg) == 0) 2518 /* transition 1-> causes controller reset */ 2519 pci_nvme_reset_locked(sc); 2520 else 2521 pci_nvme_init_controller(ctx, sc); 2522 } 2523 2524 /* Insert the iocqes, iosqes and en bits from the write */ 2525 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 2526 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 2527 if (NVME_CC_GET_EN(ccreg) == 0) { 2528 /* Insert the ams, mps and css bit fields */ 2529 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 2530 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 2531 sc->regs.csts &= ~NVME_CSTS_RDY; 2532 } else if (sc->pending_ios == 0) { 2533 sc->regs.csts |= NVME_CSTS_RDY; 2534 } 2535 break; 2536 case NVME_CR_CSTS: 2537 break; 2538 case NVME_CR_NSSR: 2539 /* ignore writes; don't support subsystem reset */ 2540 break; 2541 case NVME_CR_AQA: 2542 sc->regs.aqa = (uint32_t)value; 2543 break; 2544 case NVME_CR_ASQ_LOW: 2545 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 2546 (0xFFFFF000 & value); 2547 break; 2548 case NVME_CR_ASQ_HI: 2549 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 2550 (value << 32); 2551 break; 2552 case NVME_CR_ACQ_LOW: 2553 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 2554 (0xFFFFF000 & value); 2555 break; 2556 case NVME_CR_ACQ_HI: 2557 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 2558 (value << 32); 2559 break; 2560 default: 2561 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 2562 __func__, offset, value, size); 2563 } 2564 pthread_mutex_unlock(&sc->mtx); 2565 } 2566 2567 static void 2568 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, 2569 int baridx, uint64_t offset, int size, uint64_t value) 2570 { 2571 struct pci_nvme_softc* sc = pi->pi_arg; 2572 2573 if (baridx == pci_msix_table_bar(pi) || 2574 baridx == pci_msix_pba_bar(pi)) { 2575 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 2576 " value 0x%lx", baridx, offset, size, value); 2577 2578 pci_emul_msix_twrite(pi, offset, size, value); 2579 return; 2580 } 2581 2582 switch (baridx) { 2583 case 0: 2584 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 2585 break; 2586 2587 default: 2588 DPRINTF("%s unknown baridx %d, val 0x%lx", 2589 __func__, baridx, value); 2590 } 2591 } 2592 2593 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 2594 uint64_t offset, int size) 2595 { 2596 uint64_t value; 2597 2598 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 2599 2600 if (offset < NVME_DOORBELL_OFFSET) { 2601 void *p = &(sc->regs); 2602 pthread_mutex_lock(&sc->mtx); 2603 memcpy(&value, (void *)((uintptr_t)p + offset), size); 2604 pthread_mutex_unlock(&sc->mtx); 2605 } else { 2606 value = 0; 2607 WPRINTF("pci_nvme: read invalid offset %ld", offset); 2608 } 2609 2610 switch (size) { 2611 case 1: 2612 value &= 0xFF; 2613 break; 2614 case 2: 2615 value &= 0xFFFF; 2616 break; 2617 case 4: 2618 value &= 0xFFFFFFFF; 2619 break; 2620 } 2621 2622 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 2623 offset, size, (uint32_t)value); 2624 2625 return (value); 2626 } 2627 2628 2629 2630 static uint64_t 2631 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, 2632 uint64_t offset, int size) 2633 { 2634 struct pci_nvme_softc* sc = pi->pi_arg; 2635 2636 if (baridx == pci_msix_table_bar(pi) || 2637 baridx == pci_msix_pba_bar(pi)) { 2638 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 2639 baridx, offset, size); 2640 2641 return pci_emul_msix_tread(pi, offset, size); 2642 } 2643 2644 switch (baridx) { 2645 case 0: 2646 return pci_nvme_read_bar_0(sc, offset, size); 2647 2648 default: 2649 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 2650 } 2651 2652 return (0); 2653 } 2654 2655 static int 2656 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 2657 { 2658 char bident[sizeof("XX:X:X")]; 2659 const char *value; 2660 uint32_t sectsz; 2661 2662 sc->max_queues = NVME_QUEUES; 2663 sc->max_qentries = NVME_MAX_QENTRIES; 2664 sc->ioslots = NVME_IOSLOTS; 2665 sc->num_squeues = sc->max_queues; 2666 sc->num_cqueues = sc->max_queues; 2667 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 2668 sectsz = 0; 2669 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 2670 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 2671 2672 value = get_config_value_node(nvl, "maxq"); 2673 if (value != NULL) 2674 sc->max_queues = atoi(value); 2675 value = get_config_value_node(nvl, "qsz"); 2676 if (value != NULL) { 2677 sc->max_qentries = atoi(value); 2678 if (sc->max_qentries <= 0) { 2679 EPRINTLN("nvme: Invalid qsz option %d", 2680 sc->max_qentries); 2681 return (-1); 2682 } 2683 } 2684 value = get_config_value_node(nvl, "ioslots"); 2685 if (value != NULL) { 2686 sc->ioslots = atoi(value); 2687 if (sc->ioslots <= 0) { 2688 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 2689 return (-1); 2690 } 2691 } 2692 value = get_config_value_node(nvl, "sectsz"); 2693 if (value != NULL) 2694 sectsz = atoi(value); 2695 value = get_config_value_node(nvl, "ser"); 2696 if (value != NULL) { 2697 /* 2698 * This field indicates the Product Serial Number in 2699 * 7-bit ASCII, unused bytes should be space characters. 2700 * Ref: NVMe v1.3c. 2701 */ 2702 cpywithpad((char *)sc->ctrldata.sn, 2703 sizeof(sc->ctrldata.sn), value, ' '); 2704 } 2705 value = get_config_value_node(nvl, "eui64"); 2706 if (value != NULL) 2707 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 2708 value = get_config_value_node(nvl, "dsm"); 2709 if (value != NULL) { 2710 if (strcmp(value, "auto") == 0) 2711 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 2712 else if (strcmp(value, "enable") == 0) 2713 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 2714 else if (strcmp(value, "disable") == 0) 2715 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 2716 } 2717 2718 value = get_config_value_node(nvl, "ram"); 2719 if (value != NULL) { 2720 uint64_t sz = strtoull(value, NULL, 10); 2721 2722 sc->nvstore.type = NVME_STOR_RAM; 2723 sc->nvstore.size = sz * 1024 * 1024; 2724 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 2725 sc->nvstore.sectsz = 4096; 2726 sc->nvstore.sectsz_bits = 12; 2727 if (sc->nvstore.ctx == NULL) { 2728 EPRINTLN("nvme: Unable to allocate RAM"); 2729 return (-1); 2730 } 2731 } else { 2732 snprintf(bident, sizeof(bident), "%d:%d", 2733 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 2734 sc->nvstore.ctx = blockif_open(nvl, bident); 2735 if (sc->nvstore.ctx == NULL) { 2736 EPRINTLN("nvme: Could not open backing file: %s", 2737 strerror(errno)); 2738 return (-1); 2739 } 2740 sc->nvstore.type = NVME_STOR_BLOCKIF; 2741 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 2742 } 2743 2744 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 2745 sc->nvstore.sectsz = sectsz; 2746 else if (sc->nvstore.type != NVME_STOR_RAM) 2747 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 2748 for (sc->nvstore.sectsz_bits = 9; 2749 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 2750 sc->nvstore.sectsz_bits++); 2751 2752 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 2753 sc->max_queues = NVME_QUEUES; 2754 2755 return (0); 2756 } 2757 2758 static int 2759 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) 2760 { 2761 struct pci_nvme_softc *sc; 2762 uint32_t pci_membar_sz; 2763 int error; 2764 2765 error = 0; 2766 2767 sc = calloc(1, sizeof(struct pci_nvme_softc)); 2768 pi->pi_arg = sc; 2769 sc->nsc_pi = pi; 2770 2771 error = pci_nvme_parse_config(sc, nvl); 2772 if (error < 0) 2773 goto done; 2774 else 2775 error = 0; 2776 2777 STAILQ_INIT(&sc->ioreqs_free); 2778 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 2779 for (int i = 0; i < sc->ioslots; i++) { 2780 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 2781 } 2782 2783 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 2784 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 2785 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 2786 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 2787 pci_set_cfgdata8(pi, PCIR_PROGIF, 2788 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 2789 2790 /* 2791 * Allocate size of NVMe registers + doorbell space for all queues. 2792 * 2793 * The specification requires a minimum memory I/O window size of 16K. 2794 * The Windows driver will refuse to start a device with a smaller 2795 * window. 2796 */ 2797 pci_membar_sz = sizeof(struct nvme_registers) + 2798 2 * sizeof(uint32_t) * (sc->max_queues + 1); 2799 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 2800 2801 DPRINTF("nvme membar size: %u", pci_membar_sz); 2802 2803 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 2804 if (error) { 2805 WPRINTF("%s pci alloc mem bar failed", __func__); 2806 goto done; 2807 } 2808 2809 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 2810 if (error) { 2811 WPRINTF("%s pci add msixcap failed", __func__); 2812 goto done; 2813 } 2814 2815 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 2816 if (error) { 2817 WPRINTF("%s pci add Express capability failed", __func__); 2818 goto done; 2819 } 2820 2821 pthread_mutex_init(&sc->mtx, NULL); 2822 sem_init(&sc->iosemlock, 0, sc->ioslots); 2823 2824 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 2825 /* 2826 * Controller data depends on Namespace data so initialize Namespace 2827 * data first. 2828 */ 2829 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 2830 pci_nvme_init_ctrldata(sc); 2831 pci_nvme_init_logpages(sc); 2832 pci_nvme_init_features(sc); 2833 2834 pci_nvme_aer_init(sc); 2835 2836 pci_nvme_reset(sc); 2837 2838 pci_lintr_request(pi); 2839 2840 done: 2841 return (error); 2842 } 2843 2844 static int 2845 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 2846 { 2847 char *cp, *ram; 2848 2849 if (opts == NULL) 2850 return (0); 2851 2852 if (strncmp(opts, "ram=", 4) == 0) { 2853 cp = strchr(opts, ','); 2854 if (cp == NULL) { 2855 set_config_value_node(nvl, "ram", opts + 4); 2856 return (0); 2857 } 2858 ram = strndup(opts + 4, cp - opts - 4); 2859 set_config_value_node(nvl, "ram", ram); 2860 free(ram); 2861 return (pci_parse_legacy_config(nvl, cp + 1)); 2862 } else 2863 return (blockif_legacy_config(nvl, opts)); 2864 } 2865 2866 struct pci_devemu pci_de_nvme = { 2867 .pe_emu = "nvme", 2868 .pe_init = pci_nvme_init, 2869 .pe_legacy_config = pci_nvme_legacy_config, 2870 .pe_barwrite = pci_nvme_write, 2871 .pe_barread = pci_nvme_read 2872 }; 2873 PCI_EMUL_SET(pci_de_nvme); 2874