1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * tavor_cq.c 29 * Tavor Completion Queue Processing Routines 30 * 31 * Implements all the routines necessary for allocating, freeing, resizing, 32 * and handling the completion type events that the Tavor hardware can 33 * generate. 
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>

#include <sys/ib/adapters/tavor/tavor.h>

/*
 * Used by tavor_cq_numcalc() below to fill in the "unconstrained" portion
 * of Tavor completion queue number
 */
static uint_t tavor_debug_cqnum_cnt = 0x00000000;

/* Forward declarations for the static helpers defined later in this file */
static void tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd,
    uint32_t cqn, uint32_t cq_param);
#pragma inline(tavor_cq_doorbell)
static int tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
static int tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
static void tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
    uint_t flag);
static void tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
    uint32_t old_cons_indx, uint32_t num_newcqe);
static void tavor_cq_numcalc(tavor_state_t *state, uint32_t indx,
    uint32_t *key);


/*
 * tavor_cq_alloc()
 *    Context: Can be called only from user or kernel context.
 *
 *    Allocates all resources for a new completion queue: a CQC context
 *    entry, the software CQ handle, the CQE buffer (and its memory
 *    registration), and finally passes ownership of the CQC to the
 *    hardware via the SW2HW_CQ firmware command.  On success, the actual
 *    (usable) CQ size is returned through "actual_size" and the new
 *    handle through "cqhdl".  On failure, all partially-allocated
 *    resources are unwound via the "cqalloc_fail*" labels at the bottom.
 *
 *    NOTE(review): "status" and "errormsg" are assigned by the
 *    TAVOR_TNF_FAIL() macro at each failure site (per the "Set status
 *    and errormsg" comments) -- confirm against the macro definition.
 */
int
tavor_cq_alloc(tavor_state_t *state, ibt_cq_hdl_t ibt_cqhdl,
    ibt_cq_attr_t *cq_attr, uint_t *actual_size, tavor_cqhdl_t *cqhdl,
    uint_t sleepflag)
{
	tavor_rsrc_t		*cqc, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_hw_cqc_t		cqc_entry;
	tavor_cqhdl_t		cq;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	op;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_hw_cqe_t		*buf;
	uint64_t		addr, value;
	uint32_t		log_cq_size, lkey, uarpg;
	uint_t			dma_xfer_mode, cq_sync, cq_is_umap;
	int			status, i, flag;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_cq_alloc);

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq_attr))

	/*
	 * Determine whether CQ is being allocated for userland access or
	 * whether it is being allocated for kernel access.  If the CQ is
	 * being allocated for userland access, then lookup the UAR doorbell
	 * page number for the current process.  Note:  If this is not found
	 * (e.g. if the process has not previously open()'d the Tavor driver),
	 * then an error is returned.
	 */
	cq_is_umap = (cq_attr->cq_flags & IBT_CQ_USER_MAP) ? 1 : 0;
	if (cq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
		if (status != DDI_SUCCESS) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
			goto cqalloc_fail;
		}
		uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
	}

	/* Use the internal protection domain (PD) for setting up CQs */
	pd = state->ts_pdhdl_internal;

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate an CQ context entry.  This will be filled in with all
	 * the necessary parameters to define the Completion Queue.  And then
	 * ownership will be passed to the hardware in the final step
	 * below.  If we fail here, we must undo the protection domain
	 * reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_CQC, 1, sleepflag, &cqc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ context");
		goto cqalloc_fail1;
	}

	/*
	 * Allocate the software structure for tracking the completion queue
	 * (i.e. the Tavor Completion Queue handle).  If we fail here, we must
	 * undo the protection domain reference count and the previous
	 * resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_CQHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ handle");
		goto cqalloc_fail2;
	}
	cq = (tavor_cqhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))
	cq->cq_is_umap = cq_is_umap;

	/*
	 * Calculate the CQ number from CQC index.  In much the same way
	 * as we create keys for memory regions (see tavor_mr.c), this CQ
	 * number is constructed from a "constrained" portion (which depends
	 * on the CQC index) and an "unconstrained" portion (which is
	 * arbitrarily chosen).
	 */
	tavor_cq_numcalc(state, cqc->tr_indx, &cq->cq_cqnum);

	/*
	 * If this will be a user-mappable CQ, then allocate an entry for
	 * the "userland resources database".  This will later be added to
	 * the database (after all further CQ operations are successful).
	 * If we fail here, we must undo the reference counts and the
	 * previous resource allocation.
	 */
	if (cq->cq_is_umap) {
		umapdb = tavor_umap_db_alloc(state->ts_instance, cq->cq_cqnum,
		    MLNX_UMAP_CQMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
			goto cqalloc_fail3;
		}
	}

	/*
	 * Calculate the appropriate size for the completion queue.
	 * Note:  All Tavor CQs must be a power-of-2 minus 1 in size.  Also
	 * they may not be any smaller than TAVOR_CQ_MIN_SIZE.  This step is
	 * to round the requested size up to the next highest power-of-2
	 */
	cq_attr->cq_size = max(cq_attr->cq_size, TAVOR_CQ_MIN_SIZE);
	log_cq_size = highbit(cq_attr->cq_size);

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits)
	 */
	if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size");
		goto cqalloc_fail4;
	}

	/*
	 * Allocate the memory for Completion Queue.
	 *
	 * Note: Although we use the common queue allocation routine, we
	 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
	 * kernel system memory) for kernel CQs because it would be
	 * inefficient to have CQs located in DDR memory.  This is primarily
	 * because CQs are read from (by software) more than they are written
	 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
	 * user-mappable CQs for a similar reason.)
	 * It is also worth noting that, unlike Tavor QP work queues,
	 * completion queues do not have the same strict alignment
	 * requirements.  It is sufficient for the CQ memory to be both
	 * aligned to and bound to addresses which are a multiple of CQE size.
	 */
	cq->cq_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
	cq->cq_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
	cq->cq_cqinfo.qa_bind_align = sizeof (tavor_hw_cqe_t);
	if (cq->cq_is_umap) {
		cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
	}
	status = tavor_queue_alloc(state, &cq->cq_cqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue");
		goto cqalloc_fail4;
	}
	buf = (tavor_hw_cqe_t *)cq->cq_cqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Initialize each of the Completion Queue Entries (CQE) by setting
	 * their ownership to hardware ("owner" bit set to HW).  This is in
	 * preparation for the final transfer of ownership (below) of the
	 * CQ context itself.
	 */
	for (i = 0; i < (1 << log_cq_size); i++) {
		TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
	}

	/*
	 * Register the memory for the CQ.  The memory for the CQ must
	 * be registered in the Tavor TPT tables.  This gives us the LKey
	 * to specify in the CQ context below.  Note: If this is a user-
	 * mappable CQ, then we will force DDI_DMA_CONSISTENT mapping.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
	mr_attr.mr_len = cq->cq_cqinfo.qa_size;
	mr_attr.mr_as = NULL;
	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
	if (cq->cq_is_umap) {
		dma_xfer_mode = DDI_DMA_CONSISTENT;
	} else {
		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
	}
	if (dma_xfer_mode == DDI_DMA_STREAMING) {
		mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
	}
	op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
	op.mro_bind_dmahdl = cq->cq_cqinfo.qa_dmahdl;
	op.mro_bind_override_addr = 0;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
		goto cqalloc_fail5;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
	addr = mr->mr_bindinfo.bi_addr;
	lkey = mr->mr_lkey;

	/* Determine if later ddi_dma_sync will be necessary */
	cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, cq->cq_cqinfo);

	/* Sync entire CQ for use by the hardware (if necessary). */
	if (cq_sync) {
		(void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
		    cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Fill in the CQC entry.  This is the final step before passing
	 * ownership of the CQC entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the CQC.  Note: If this CQ is going to be
	 * used for userland access, then we need to set the UAR page number
	 * appropriately (otherwise it's a "don't care")
	 */
	bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
	cq->cq_eqnum = TAVOR_CQ_EQNUM_GET(cq->cq_cqnum);
	cq->cq_erreqnum = TAVOR_CQ_ERREQNUM_GET(cq->cq_cqnum);
	cqc_entry.xlat = TAVOR_VA2PA_XLAT_ENABLED;
	cqc_entry.state = TAVOR_CQ_DISARMED;
	cqc_entry.start_addr_h = (addr >> 32);
	cqc_entry.start_addr_l = (addr & 0xFFFFFFFF);
	cqc_entry.log_cq_sz = log_cq_size;
	if (cq->cq_is_umap) {
		cqc_entry.usr_page = uarpg;
	} else {
		cqc_entry.usr_page = 0;
	}
	cqc_entry.pd = pd->pd_pdnum;
	cqc_entry.lkey = lkey;
	cqc_entry.e_eqn = cq->cq_erreqnum;
	cqc_entry.c_eqn = cq->cq_eqnum;
	cqc_entry.cqn = cq->cq_cqnum;

	/*
	 * Write the CQC entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware (using the Tavor SW2HW_CQ firmware
	 * command).  Note: In general, this operation shouldn't fail.  But
	 * if it does, we have to undo everything we've done above before
	 * returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_CQ, &cqc_entry,
	    sizeof (tavor_hw_cqc_t), cq->cq_cqnum, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_CQ command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_cq_alloc_sw2hw_cq_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "tavor SW2HW_CQ command");
		goto cqalloc_fail6;
	}

	/*
	 * Fill in the rest of the Tavor Completion Queue handle.  Having
	 * successfully transferred ownership of the CQC, we can update the
	 * following fields for use in further operations on the CQ.
	 */
	cq->cq_cqcrsrcp = cqc;
	cq->cq_rsrcp = rsrc;
	cq->cq_consindx = 0;
	cq->cq_buf = buf;
	cq->cq_bufsz = (1 << log_cq_size);
	cq->cq_mrhdl = mr;
	cq->cq_sync = cq_sync;
	cq->cq_refcnt = 0;
	cq->cq_is_special = 0;
	/*
	 * NOTE(review): "uarpg" is only initialized in the user-mappable
	 * path at the top of this routine; for kernel CQs this stores an
	 * indeterminate stack value.  Confirm that cq_uarpg is never read
	 * unless cq_is_umap is set.
	 */
	cq->cq_uarpg = uarpg;
	cq->cq_umap_dhp = (devmap_cookie_t)NULL;
	avl_create(&cq->cq_wrid_wqhdr_avl_tree, tavor_wrid_wqhdr_compare,
	    sizeof (struct tavor_workq_hdr_s),
	    offsetof(struct tavor_workq_hdr_s, wq_avl_link));

	cq->cq_wrid_reap_head = NULL;
	cq->cq_wrid_reap_tail = NULL;
	cq->cq_hdlrarg = (void *)ibt_cqhdl;

	/*
	 * Put CQ handle in Tavor CQNum-to-CQHdl list.  Then fill in the
	 * "actual_size" and "cqhdl" and return success
	 */
	ASSERT(state->ts_cqhdl[cqc->tr_indx] == NULL);
	state->ts_cqhdl[cqc->tr_indx] = cq;

	/*
	 * If this is a user-mappable CQ, then we need to insert the previously
	 * allocated entry into the "userland resources database".  This will
	 * allow for later lookup during devmap() (i.e. mmap()) calls.
	 */
	if (cq->cq_is_umap) {
		tavor_umap_db_add(umapdb);
	}

	/*
	 * Fill in the return arguments (if necessary).  This includes the
	 * real completion queue size.
	 */
	if (actual_size != NULL) {
		*actual_size = (1 << log_cq_size) - 1;
	}
	*cqhdl = cq;

	TAVOR_TNF_EXIT(tavor_cq_alloc);
	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
cqalloc_fail6:
	if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag) != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister CQ memory");
	}
cqalloc_fail5:
	tavor_queue_free(state, &cq->cq_cqinfo);
cqalloc_fail4:
	if (cq_is_umap) {
		tavor_umap_db_free(umapdb);
	}
cqalloc_fail3:
	tavor_rsrc_free(state, &rsrc);
cqalloc_fail2:
	tavor_rsrc_free(state, &cqc);
cqalloc_fail1:
	tavor_pd_refcnt_dec(pd);
cqalloc_fail:
	TNF_PROBE_1(tavor_cq_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_cq_alloc);
	return (status);
}


/*
 * tavor_cq_free()
 *    Context: Can be called only from user or kernel context.
 *
 *    Frees a completion queue previously allocated by tavor_cq_alloc():
 *    reclaims CQC ownership from the hardware (HW2SW_CQ), deregisters and
 *    frees the CQE memory, releases the CQC and handle resources, and
 *    drops the internal PD reference.  Fails with IBT_CQ_BUSY if work
 *    queues still reference the CQ.
 */
/* ARGSUSED */
int
tavor_cq_free(tavor_state_t *state, tavor_cqhdl_t *cqhdl, uint_t sleepflag)
{
	tavor_rsrc_t		*cqc, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_hw_cqc_t		cqc_entry;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_cqhdl_t		cq;
	uint32_t		cqnum;
	uint64_t		value;
	uint_t			maxprot;
	int			status;

	TAVOR_TNF_ENTER(tavor_cq_free);

	/*
	 * Pull all the necessary information from the Tavor Completion Queue
	 * handle.  This is necessary here because the resource for the
	 * CQ handle is going to be freed up as part of this operation.
	 */
	cq = *cqhdl;
	mutex_enter(&cq->cq_lock);
	cqc = cq->cq_cqcrsrcp;
	rsrc = cq->cq_rsrcp;
	pd = state->ts_pdhdl_internal;
	mr = cq->cq_mrhdl;
	cqnum = cq->cq_cqnum;

	/*
	 * If there are work queues still associated with the CQ, then return
	 * an error.  Otherwise, we will be holding the CQ lock.
	 */
	if (cq->cq_refcnt != 0) {
		mutex_exit(&cq->cq_lock);
		/*
		 * NOTE(review): cq_refcnt is re-read here after cq_lock has
		 * been dropped, so the probed value may be stale; it is used
		 * only for the trace probe, not for the decision above.
		 */
		TNF_PROBE_1(tavor_cq_free_refcnt_fail, TAVOR_TNF_ERROR, "",
		    tnf_int, refcnt, cq->cq_refcnt);
		TAVOR_TNF_EXIT(tavor_cq_free);
		return (IBT_CQ_BUSY);
	}

	/*
	 * If this was a user-mappable CQ, then we need to remove its entry
	 * from the "userland resources database".  If it is also currently
	 * mmap()'d out to a user process, then we need to call
	 * devmap_devmem_remap() to remap the CQ memory to an invalid mapping.
	 * We also need to invalidate the CQ tracking information for the
	 * user mapping.
	 */
	if (cq->cq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, cqnum,
		    MLNX_UMAP_CQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
		    &umapdb);
		if (status != DDI_SUCCESS) {
			mutex_exit(&cq->cq_lock);
			TAVOR_WARNING(state, "failed to find in database");
			TAVOR_TNF_EXIT(tavor_cq_free);
			return (ibc_get_ci_failure(0));
		}
		tavor_umap_db_free(umapdb);
		if (cq->cq_umap_dhp != NULL) {
			maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
			status = devmap_devmem_remap(cq->cq_umap_dhp,
			    state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size,
			    maxprot, DEVMAP_MAPPING_INVALID, NULL);
			if (status != DDI_SUCCESS) {
				mutex_exit(&cq->cq_lock);
				TAVOR_WARNING(state, "failed in CQ memory "
				    "devmap_devmem_remap()");
				TAVOR_TNF_EXIT(tavor_cq_free);
				return (ibc_get_ci_failure(0));
			}
			cq->cq_umap_dhp = (devmap_cookie_t)NULL;
		}
	}

	/*
	 * Put NULL into the Tavor CQNum-to-CQHdl list.  This will allow any
	 * in-progress events to detect that the CQ corresponding to this
	 * number has been freed.
	 */
	state->ts_cqhdl[cqc->tr_indx] = NULL;

	/*
	 * While we hold the CQ lock, do a "forced reap" of the workQ WRID
	 * list.  This cleans up all the structures associated with the WRID
	 * processing for this CQ.  Once we complete, drop the lock and finish
	 * the deallocation of the CQ.
	 */
	tavor_wrid_cq_force_reap(cq);

	mutex_exit(&cq->cq_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))

	/*
	 * Reclaim CQC entry from hardware (using the Tavor HW2SW_CQ
	 * firmware command).  If the ownership transfer fails for any reason,
	 * then it is an indication that something (either in HW or SW) has
	 * gone seriously wrong.
	 */
	status = tavor_cmn_ownership_cmd_post(state, HW2SW_CQ, &cqc_entry,
	    sizeof (tavor_hw_cqc_t), cqnum, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		TAVOR_WARNING(state, "failed to reclaim CQC ownership");
		cmn_err(CE_CONT, "Tavor: HW2SW_CQ command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_cq_free_hw2sw_cq_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		TAVOR_TNF_EXIT(tavor_cq_free);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Deregister the memory for the Completion Queue.  If this fails
	 * for any reason, then it is an indication that something (either
	 * in HW or SW) has gone seriously wrong.  So we print a warning
	 * message and return.
	 */
	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister CQ memory");
		TNF_PROBE_0(tavor_cq_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_cq_free);
		return (ibc_get_ci_failure(0));
	}

	/* Free the memory for the CQ */
	tavor_queue_free(state, &cq->cq_cqinfo);

	/* Free the Tavor Completion Queue handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free up the CQC entry resource */
	tavor_rsrc_free(state, &cqc);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the cqhdl pointer to NULL and return success */
	*cqhdl = NULL;

	TAVOR_TNF_EXIT(tavor_cq_free);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_resize()
 *    Context: Can be called only from user or kernel context.
 *
 *    Resizes an existing CQ: allocates and registers a new CQE buffer,
 *    issues the RESIZE_CQ firmware command, copies any un-polled CQEs
 *    from the old buffer into the new one, and releases the old
 *    resources.  The new usable size is returned through "actual_size".
 */
int
tavor_cq_resize(tavor_state_t *state, tavor_cqhdl_t cq, uint_t req_size,
    uint_t *actual_size, uint_t sleepflag)
{
	tavor_hw_cqc_t		cqc_entry;
	tavor_qalloc_info_t	new_cqinfo, old_cqinfo;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	op;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr, mr_old;
	tavor_hw_cqe_t		*buf;
	uint32_t		new_prod_indx, old_cons_indx;
	uint_t			dma_xfer_mode, cq_sync, log_cq_size, maxprot;
	int			status, i, flag;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_cq_resize);

	/* Use the internal protection domain (PD) for CQs */
	pd = state->ts_pdhdl_internal;

	/*
	 * Calculate the appropriate size for the new resized completion queue.
	 * Note:  All Tavor CQs must be a power-of-2 minus 1 in size.  Also
	 * they may not be any smaller than TAVOR_CQ_MIN_SIZE.
This step is
	 * to round the requested size up to the next highest power-of-2
	 */
	req_size = max(req_size, TAVOR_CQ_MIN_SIZE);
	log_cq_size = highbit(req_size);

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits)
	 */
	if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size");
		goto cqresize_fail;
	}

	/*
	 * Allocate the memory for newly resized Completion Queue.
	 *
	 * Note: Although we use the common queue allocation routine, we
	 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
	 * kernel system memory) for kernel CQs because it would be
	 * inefficient to have CQs located in DDR memory.  This is the same
	 * as we do when we first allocate completion queues primarily
	 * because CQs are read from (by software) more than they are written
	 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
	 * user-mappable CQs for a similar reason.)
	 * It is also worth noting that, unlike Tavor QP work queues,
	 * completion queues do not have the same strict alignment
	 * requirements.  It is sufficient for the CQ memory to be both
	 * aligned to and bound to addresses which are a multiple of CQE size.
	 */
	new_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
	new_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
	new_cqinfo.qa_bind_align = sizeof (tavor_hw_cqe_t);
	if (cq->cq_is_umap) {
		new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
	}
	status = tavor_queue_alloc(state, &new_cqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue");
		goto cqresize_fail;
	}
	buf = (tavor_hw_cqe_t *)new_cqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Initialize each of the Completion Queue Entries (CQE) by setting
	 * their ownership to hardware ("owner" bit set to HW).  This is in
	 * preparation for the final resize operation (below).
	 */
	for (i = 0; i < (1 << log_cq_size); i++) {
		TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
	}

	/*
	 * Register the memory for the CQ.  The memory for the CQ must
	 * be registered in the Tavor TPT tables.  This gives us the LKey
	 * to specify in the CQ context below.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
	mr_attr.mr_len = new_cqinfo.qa_size;
	mr_attr.mr_as = NULL;
	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
	if (cq->cq_is_umap) {
		dma_xfer_mode = DDI_DMA_CONSISTENT;
	} else {
		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
	}
	if (dma_xfer_mode == DDI_DMA_STREAMING) {
		mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
	}
	op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
	op.mro_bind_dmahdl = new_cqinfo.qa_dmahdl;
	op.mro_bind_override_addr = 0;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
	if (status != DDI_SUCCESS) {
		tavor_queue_free(state, &new_cqinfo);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
		goto cqresize_fail;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))

	/* Determine if later ddi_dma_sync will be necessary */
	cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, new_cqinfo);

	/* Sync entire "new" CQ for use by hardware (if necessary) */
	if (cq_sync) {
		(void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
		    new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Now we grab the CQ lock.  Since we will be updating the actual
	 * CQ location and the producer/consumer indexes, we should hold
	 * the lock.
	 *
	 * We do a TAVOR_NOSLEEP here (and below), though, because we are
	 * holding the "cq_lock" and if we got raised to interrupt level
	 * by priority inversion, we would not want to block in this routine
	 * waiting for success.
	 */
	mutex_enter(&cq->cq_lock);

	/*
	 * Determine the current CQ "consumer index".
	 *
	 * Note:  This will depend on whether the CQ had previously been
	 * mapped for user access or whether it is a kernel CQ.  If this
	 * is a kernel CQ, then all PollCQ() operations have come through
	 * the IBTF and, hence, the driver's CQ state structure will
	 * contain the current consumer index.  If, however, the user has
	 * accessed this CQ by bypassing the driver (OS-bypass), then we
	 * need to query the firmware to determine the current CQ consumer
	 * index.  This also assumes that the user process will not continue
	 * to consume entries while at the same time doing the ResizeCQ()
	 * operation.  If the user process does not guarantee this, then it
	 * may see duplicate or missed completions.  But under no
	 * circumstances should this panic the system.
	 */
	if (cq->cq_is_umap) {
		status = tavor_cmn_query_cmd_post(state, QUERY_CQ,
		    cq->cq_cqnum, &cqc_entry, sizeof (tavor_hw_cqc_t),
		    TAVOR_NOSLEEP);
		if (status != TAVOR_CMD_SUCCESS) {
			/* Query CQ has failed, drop CQ lock and cleanup */
			mutex_exit(&cq->cq_lock);
			if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
			    sleepflag) != DDI_SUCCESS) {
				TAVOR_WARNING(state, "failed to deregister "
				    "CQ memory");
			}
			tavor_queue_free(state, &new_cqinfo);
			TAVOR_WARNING(state, "failed to find in database");

			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
			    "failed umap lookup");
			goto cqresize_fail;
		}
		old_cons_indx = cqc_entry.cons_indx;
	} else {
		old_cons_indx = cq->cq_consindx;
	}

	/*
	 * Fill in the CQC entry.  For the resize operation this is the
	 * final step before attempting the resize operation on the CQC entry.
	 * We use all of the information collected/calculated above to fill
	 * in the requisite portions of the CQC.
	 */
	bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
	cqc_entry.start_addr_h = (mr->mr_bindinfo.bi_addr >> 32);
	cqc_entry.start_addr_l = (mr->mr_bindinfo.bi_addr & 0xFFFFFFFF);
	cqc_entry.log_cq_sz = log_cq_size;
	cqc_entry.lkey = mr->mr_lkey;

	/*
	 * Write the CQC entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware (using the Tavor RESIZE_CQ firmware
	 * command).  Note: In general, this operation shouldn't fail.  But
	 * if it does, we have to undo everything we've done above before
	 * returning error.  Also note that the status returned may indicate
	 * the code to return to the IBTF.
	 */
	status = tavor_resize_cq_cmd_post(state, &cqc_entry, cq->cq_cqnum,
	    &new_prod_indx, TAVOR_CMD_NOSLEEP_SPIN);
	if (status != TAVOR_CMD_SUCCESS) {
		/* Resize attempt has failed, drop CQ lock and cleanup */
		mutex_exit(&cq->cq_lock);
		if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
		    sleepflag) != DDI_SUCCESS) {
			TAVOR_WARNING(state, "failed to deregister CQ memory");
		}
		tavor_queue_free(state, &new_cqinfo);
		if (status == TAVOR_CMD_BAD_SIZE) {
			TAVOR_TNF_EXIT(tavor_cq_resize);
			return (IBT_CQ_SZ_INSUFFICIENT);
		} else {
			cmn_err(CE_CONT, "Tavor: RESIZE_CQ command failed: "
			    "%08x\n", status);
			TNF_PROBE_1(tavor_cq_resize_cq_cmd_fail,
			    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
			TAVOR_TNF_EXIT(tavor_cq_resize);
			return (ibc_get_ci_failure(0));
		}
	}

	/*
	 * The CQ resize attempt was successful.  Before dropping the CQ lock,
	 * copy all of the CQEs from the "old" CQ into the "new" CQ.  Note:
	 * the Tavor firmware guarantees us that sufficient space is set aside
	 * in the "new" CQ to handle any un-polled CQEs from the "old" CQ.
	 * The two parameters to this helper function ("old_cons_indx" and
	 * "new_prod_indx") essentially indicate the starting index and number
	 * of any CQEs that might remain in the "old" CQ memory.
	 */
	tavor_cq_resize_helper(cq, buf, old_cons_indx, new_prod_indx);

	/* Sync entire "new" CQ for use by hardware (if necessary) */
	if (cq_sync) {
		(void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
		    new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Update the Tavor Completion Queue handle with all the new
	 * information.  At the same time, save away all the necessary
	 * information for freeing up the old resources
	 */
	mr_old = cq->cq_mrhdl;
	old_cqinfo = cq->cq_cqinfo;
	cq->cq_cqinfo = new_cqinfo;
	cq->cq_consindx = 0;
	cq->cq_buf = buf;
	cq->cq_bufsz = (1 << log_cq_size);
	cq->cq_mrhdl = mr;
	cq->cq_sync = cq_sync;

	/*
	 * If "old" CQ was a user-mappable CQ that is currently mmap()'d out
	 * to a user process, then we need to call devmap_devmem_remap() to
	 * invalidate the mapping to the CQ memory.  We also need to
	 * invalidate the CQ tracking information for the user mapping.
	 */
	if ((cq->cq_is_umap) && (cq->cq_umap_dhp != NULL)) {
		maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
		status = devmap_devmem_remap(cq->cq_umap_dhp,
		    state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size, maxprot,
		    DEVMAP_MAPPING_INVALID, NULL);
		if (status != DDI_SUCCESS) {
			mutex_exit(&cq->cq_lock);
			TAVOR_WARNING(state, "failed in CQ memory "
			    "devmap_devmem_remap()");
			/*
			 * NOTE(review): this exit probe is tagged
			 * tavor_cq_free rather than tavor_cq_resize --
			 * looks like a copy/paste slip in the probe tag.
			 * Also, the old CQ resources (mr_old/old_cqinfo)
			 * are not released on this path; confirm that the
			 * leak is intentional.
			 */
			TAVOR_TNF_EXIT(tavor_cq_free);
			return (ibc_get_ci_failure(0));
		}
		cq->cq_umap_dhp = (devmap_cookie_t)NULL;
	}

	/*
	 * Drop the CQ lock now.  The only thing left to do is to free up
	 * the old resources.
	 */
	mutex_exit(&cq->cq_lock);

	/*
	 * Deregister the memory for the old Completion Queue.  Note: We
	 * really can't return error here because we have no good way to
	 * cleanup.  Plus, the deregistration really shouldn't ever happen.
	 * So, if it does, it is an indication that something has gone
	 * seriously wrong.  So we print a warning message and return error
	 * (knowing, of course, that the "old" CQ memory will be leaked)
	 */
	status = tavor_mr_deregister(state, &mr_old, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister old CQ memory");
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "failed deregister mr (old)");
		goto cqresize_fail;
	}

	/* Free the memory for the old CQ */
	tavor_queue_free(state, &old_cqinfo);

	/*
	 * Fill in the return arguments (if necessary).  This includes the
	 * real new completion queue size.
	 */
	if (actual_size != NULL) {
		*actual_size = (1 << log_cq_size) - 1;
	}

	TAVOR_TNF_EXIT(tavor_cq_resize);
	return (DDI_SUCCESS);

cqresize_fail:
	TNF_PROBE_1(tavor_cq_resize_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_cq_resize);
	return (status);
}


/*
 * tavor_cq_notify()
 *    Context: Can be called from interrupt or base context.
 *
 *    Arms the CQ for an interrupt on either the next completion
 *    (IBT_NEXT_COMPLETION) or the next solicited completion
 *    (IBT_NEXT_SOLICITED) by ringing the appropriate CQ doorbell.
 */
int
tavor_cq_notify(tavor_state_t *state, tavor_cqhdl_t cq,
    ibt_cq_notify_flags_t flags)
{
	uint_t cqnum;

	TAVOR_TNF_ENTER(tavor_cq_notify);

	/*
	 * Determine if we are trying to get the next completion or the next
	 * "solicited" completion.  Then hit the appropriate doorbell.
	 *
	 * NOTE: Please see the comment in tavor_event.c:tavor_eq_poll
	 * regarding why we do not have to do an extra PIO read here, and we
	 * will not lose an event after writing this doorbell.
894 */ 895 cqnum = cq->cq_cqnum; 896 if (flags == IBT_NEXT_COMPLETION) { 897 tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ, cqnum, 898 TAVOR_CQDB_DEFAULT_PARAM); 899 900 } else if (flags == IBT_NEXT_SOLICITED) { 901 tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ_SOLICIT, 902 cqnum, TAVOR_CQDB_DEFAULT_PARAM); 903 904 } else { 905 TNF_PROBE_1(tavor_cq_notify_invflags_fail, TAVOR_TNF_ERROR, "", 906 tnf_int, flags, flags); 907 TAVOR_TNF_EXIT(tavor_cq_notify); 908 return (IBT_CQ_NOTIFY_TYPE_INVALID); 909 } 910 911 TAVOR_TNF_EXIT(tavor_cq_notify); 912 return (DDI_SUCCESS); 913 } 914 915 916 /* 917 * tavor_cq_poll() 918 * Context: Can be called from interrupt or base context. 919 */ 920 int 921 tavor_cq_poll(tavor_state_t *state, tavor_cqhdl_t cq, ibt_wc_t *wc_p, 922 uint_t num_wc, uint_t *num_polled) 923 { 924 tavor_hw_cqe_t *cqe; 925 uint32_t cons_indx, wrap_around_mask; 926 uint32_t polled_cnt, num_to_increment; 927 int status; 928 929 TAVOR_TNF_ENTER(tavor_cq_poll); 930 931 /* 932 * Check for user-mappable CQ memory. Note: We do not allow kernel 933 * clients to poll CQ memory that is accessible directly by the user. 934 * If the CQ memory is user accessible, then return an error. 935 */ 936 if (cq->cq_is_umap) { 937 TNF_PROBE_0(tavor_cq_poll_inv_usrmapped_type, 938 TAVOR_TNF_ERROR, ""); 939 TAVOR_TNF_EXIT(tavor_cq_poll); 940 return (IBT_CQ_HDL_INVALID); 941 } 942 943 mutex_enter(&cq->cq_lock); 944 945 /* Get the consumer index */ 946 cons_indx = cq->cq_consindx; 947 948 /* 949 * Calculate the wrap around mask. Note: This operation only works 950 * because all Tavor completion queues have power-of-2 sizes 951 */ 952 wrap_around_mask = (cq->cq_bufsz - 1); 953 954 /* Calculate the pointer to the first CQ entry */ 955 cqe = &cq->cq_buf[cons_indx]; 956 957 /* Sync the current CQE to read */ 958 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU); 959 960 /* 961 * Keep pulling entries from the CQ until we find an entry owned by 962 * the hardware. 
	 * As long as there are CQEs owned by SW, process each
	 * entry by calling tavor_cq_cqe_consume() and updating the CQ
	 * consumer index.  Note:  We only update the consumer index if
	 * tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.  Otherwise,
	 * it indicates that we are going to "recycle" the CQE (probably
	 * because it is an error CQE and corresponds to more than one
	 * completion).
	 */
	polled_cnt = 0;
	while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
		status = tavor_cq_cqe_consume(state, cq, cqe,
		    &wc_p[polled_cnt++]);
		if (status == TAVOR_CQ_SYNC_AND_DB) {
			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cq, cqe);

			/* Sync the current CQE for device */
			tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORDEV);

			/* Increment the consumer index */
			cons_indx = (cons_indx + 1) & wrap_around_mask;

			/* Update the pointer to the next CQ entry */
			cqe = &cq->cq_buf[cons_indx];

			/* Sync the next CQE to read */
			tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
		}

		/*
		 * If we have run out of space to store work completions,
		 * then stop and return the ones we have pulled off the CQ.
		 */
		if (polled_cnt >= num_wc) {
			break;
		}
	}

	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we have, for example,
	 * pulled from a CQE that we are still in the process of "recycling"
	 * for error purposes, then we would not update the consumer index.
	 */
	if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Post doorbell to update the consumer index.  Doorbell
		 * value indicates number of entries consumed (minus 1)
		 */
		if (cons_indx > cq->cq_consindx) {
			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
		} else {
			/* consumer index wrapped around the end of the CQ */
			num_to_increment = ((cons_indx + cq->cq_bufsz) -
			    cq->cq_consindx) - 1;
		}
		cq->cq_consindx = cons_indx;
		tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
		    cq->cq_cqnum, num_to_increment);

	} else if (polled_cnt == 0) {
		/*
		 * If the CQ is empty, we can try to free up some of the WRID
		 * list containers.  See tavor_wr.c for more details on this
		 * operation.
		 */
		tavor_wrid_cq_reap(cq);
	}

	mutex_exit(&cq->cq_lock);

	/* Set "num_polled" (if necessary) */
	if (num_polled != NULL) {
		*num_polled = polled_cnt;
	}

	/* Set CQ_EMPTY condition if needed, otherwise return success */
	if (polled_cnt == 0) {
		status = IBT_CQ_EMPTY;
	} else {
		status = DDI_SUCCESS;
	}

	/*
	 * Check if the system is currently panicking.  If it is, then call
	 * the Tavor interrupt service routine.  This step is necessary here
	 * because we might be in a polled I/O mode and without the call to
	 * tavor_isr() - and its subsequent calls to poll and rearm each
	 * event queue - we might overflow our EQs and render the system
	 * unable to sync/dump.
	 */
	if (ddi_in_panic() != 0) {
		(void) tavor_isr((caddr_t)state, (caddr_t)NULL);
	}

	TAVOR_TNF_EXIT(tavor_cq_poll);
	return (status);
}


/*
 * tavor_cq_handler()
 *    Context: Only called from interrupt context
 *
 *    Handles a completion event ("eqe") from the given event queue:
 *    disarms the CQ and, when the CQ handle is still valid, invokes the
 *    IBTF completion callback.  EQ overflow events are diverted to
 *    tavor_eq_overflow_handler() and reported as DDI_FAILURE.
 */
int
tavor_cq_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
	tavor_cqhdl_t		cq;
	uint_t			cqnum;
	uint_t			eqe_evttype;

	TAVOR_TNF_ENTER(tavor_cq_handler);

	eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

	ASSERT(eqe_evttype == TAVOR_EVT_COMPLETION ||
	    eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

	if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
		TNF_PROBE_0(tavor_cq_handler_eq_overflow_condition,
		    TAVOR_TNF_ERROR, "");
		tavor_eq_overflow_handler(state, eq, eqe);

		TAVOR_TNF_EXIT(tavor_cq_handler);
		return (DDI_FAILURE);
	}


	/* Get the CQ handle from CQ number in event descriptor */
	cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
	cq = tavor_cqhdl_from_cqnum(state, cqnum);

	/*
	 * Post the EQ doorbell to move the CQ to the "disarmed" state.
	 * This operation is to enable subsequent CQ doorbells (e.g. those
	 * that can be rung by tavor_cq_notify() above) to rearm the CQ.
	 */
	tavor_eq_doorbell(state, TAVOR_EQDB_DISARM_CQ, eq->eq_eqnum, cqnum);

	/*
	 * If the CQ handle is NULL, this is probably an indication
	 * that the CQ has been freed already.  In which case, we
	 * should not deliver this event.
	 *
	 * We also check that the CQ number in the handle is the
	 * same as the CQ number in the event queue entry.  This
	 * extra check allows us to handle the case where a CQ was
	 * freed and then allocated again in the time it took to
	 * handle the event queue processing.
By constantly incrementing 1111 * the non-constrained portion of the CQ number every time 1112 * a new CQ is allocated, we mitigate (somewhat) the chance 1113 * that a stale event could be passed to the client's CQ 1114 * handler. 1115 * 1116 * Lastly, we check if "ts_ibtfpriv" is NULL. If it is then it 1117 * means that we've have either received this event before we 1118 * finished attaching to the IBTF or we've received it while we 1119 * are in the process of detaching. 1120 */ 1121 if ((cq != NULL) && (cq->cq_cqnum == cqnum) && 1122 (state->ts_ibtfpriv != NULL)) { 1123 TAVOR_DO_IBTF_CQ_CALLB(state, cq); 1124 } else { 1125 TNF_PROBE_2(tavor_cq_handler_dropped_event, 1126 TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum, 1127 tnf_uint, hdl_cqnum, cqnum); 1128 } 1129 1130 TAVOR_TNF_EXIT(tavor_cq_handler); 1131 return (DDI_SUCCESS); 1132 } 1133 1134 1135 /* 1136 * tavor_cq_err_handler() 1137 * Context: Only called from interrupt context 1138 */ 1139 int 1140 tavor_cq_err_handler(tavor_state_t *state, tavor_eqhdl_t eq, 1141 tavor_hw_eqe_t *eqe) 1142 { 1143 tavor_cqhdl_t cq; 1144 uint_t cqnum; 1145 ibc_async_event_t event; 1146 ibt_async_code_t type; 1147 uint_t eqe_evttype; 1148 1149 TAVOR_TNF_ENTER(tavor_cq_err_handler); 1150 1151 eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe); 1152 1153 ASSERT(eqe_evttype == TAVOR_EVT_CQ_ERRORS || 1154 eqe_evttype == TAVOR_EVT_EQ_OVERFLOW); 1155 1156 if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) { 1157 TNF_PROBE_0(tavor_cq_err_handler_eq_overflow_condition, 1158 TAVOR_TNF_ERROR, ""); 1159 tavor_eq_overflow_handler(state, eq, eqe); 1160 1161 TAVOR_TNF_EXIT(tavor_cq_err_handler); 1162 return (DDI_FAILURE); 1163 } 1164 1165 /* cmn_err(CE_CONT, "CQ Error handler\n"); */ 1166 1167 /* Get the CQ handle from CQ number in event descriptor */ 1168 cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe); 1169 cq = tavor_cqhdl_from_cqnum(state, cqnum); 1170 1171 /* 1172 * If the CQ handle is NULL, this is probably an indication 1173 * that the CQ has been freed 
already. In which case, we 1174 * should not deliver this event. 1175 * 1176 * We also check that the CQ number in the handle is the 1177 * same as the CQ number in the event queue entry. This 1178 * extra check allows us to handle the case where a CQ was 1179 * freed and then allocated again in the time it took to 1180 * handle the event queue processing. By constantly incrementing 1181 * the non-constrained portion of the CQ number every time 1182 * a new CQ is allocated, we mitigate (somewhat) the chance 1183 * that a stale event could be passed to the client's CQ 1184 * handler. 1185 * 1186 * And then we check if "ts_ibtfpriv" is NULL. If it is then it 1187 * means that we've have either received this event before we 1188 * finished attaching to the IBTF or we've received it while we 1189 * are in the process of detaching. 1190 */ 1191 if ((cq != NULL) && (cq->cq_cqnum == cqnum) && 1192 (state->ts_ibtfpriv != NULL)) { 1193 event.ev_cq_hdl = (ibt_cq_hdl_t)cq->cq_hdlrarg; 1194 type = IBT_ERROR_CQ; 1195 1196 TAVOR_DO_IBTF_ASYNC_CALLB(state, type, &event); 1197 } else { 1198 TNF_PROBE_2(tavor_cq_err_handler_dropped_event, 1199 TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum, 1200 tnf_uint, hdl_cqnum, cqnum); 1201 } 1202 1203 TAVOR_TNF_EXIT(tavor_cq_err_handler); 1204 return (DDI_SUCCESS); 1205 } 1206 1207 1208 /* 1209 * tavor_cq_refcnt_inc() 1210 * Context: Can be called from interrupt or base context. 1211 */ 1212 int 1213 tavor_cq_refcnt_inc(tavor_cqhdl_t cq, uint_t is_special) 1214 { 1215 /* 1216 * Increment the completion queue's reference count. Note: In order 1217 * to ensure compliance with IBA C11-15, we must ensure that a given 1218 * CQ is not used for both special (SMI/GSI) QP and non-special QP. 1219 * This is accomplished here by keeping track of how the referenced 1220 * CQ is being used. 
1221 */ 1222 mutex_enter(&cq->cq_lock); 1223 TNF_PROBE_1_DEBUG(tavor_cq_refcnt_inc, TAVOR_TNF_TRACE, "", 1224 tnf_uint, refcnt, cq->cq_refcnt); 1225 if (cq->cq_refcnt == 0) { 1226 cq->cq_is_special = is_special; 1227 } else { 1228 if (cq->cq_is_special != is_special) { 1229 mutex_exit(&cq->cq_lock); 1230 return (DDI_FAILURE); 1231 } 1232 } 1233 cq->cq_refcnt++; 1234 mutex_exit(&cq->cq_lock); 1235 return (DDI_SUCCESS); 1236 } 1237 1238 1239 /* 1240 * tavor_cq_refcnt_dec() 1241 * Context: Can be called from interrupt or base context. 1242 */ 1243 void 1244 tavor_cq_refcnt_dec(tavor_cqhdl_t cq) 1245 { 1246 /* Decrement the completion queue's reference count */ 1247 mutex_enter(&cq->cq_lock); 1248 cq->cq_refcnt--; 1249 TNF_PROBE_1_DEBUG(tavor_cq_refcnt_dec, TAVOR_TNF_TRACE, "", 1250 tnf_uint, refcnt, cq->cq_refcnt); 1251 mutex_exit(&cq->cq_lock); 1252 } 1253 1254 1255 /* 1256 * tavor_cq_doorbell() 1257 * Context: Can be called from interrupt or base context. 1258 */ 1259 static void 1260 tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd, uint32_t cqn, 1261 uint32_t cq_param) 1262 { 1263 uint64_t doorbell = 0; 1264 1265 /* Build the doorbell from the parameters */ 1266 doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) | 1267 ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param; 1268 1269 TNF_PROBE_1_DEBUG(tavor_cq_doorbell, TAVOR_TNF_TRACE, "", 1270 tnf_ulong, doorbell, doorbell); 1271 1272 /* Write the doorbell to UAR */ 1273 TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->cq, 1274 doorbell); 1275 } 1276 1277 1278 /* 1279 * tavor_cqhdl_from_cqnum() 1280 * Context: Can be called from interrupt or base context. 1281 * 1282 * This routine is important because changing the unconstrained 1283 * portion of the CQ number is critical to the detection of a 1284 * potential race condition in the CQ handler code (i.e. the case 1285 * where a CQ is freed and alloc'd again before an event for the 1286 * "old" CQ can be handled). 
 *
 *    While this is not a perfect solution (not sure that one exists)
 *    it does help to mitigate the chance that this race condition will
 *    cause us to deliver a "stale" event to the new CQ owner.  Note:
 *    this solution does not scale well because the number of constrained
 *    bits increases (and, hence, the number of unconstrained bits
 *    decreases) as the number of supported CQs grows.  For small and
 *    intermediate values, it should hopefully provide sufficient
 *    protection.
 */
tavor_cqhdl_t
tavor_cqhdl_from_cqnum(tavor_state_t *state, uint_t cqnum)
{
	uint_t	cqindx, cqmask;

	/*
	 * Calculate the CQ table index from the cqnum: only the low
	 * ("constrained") cp_log_num_cq bits index the handle table.
	 */
	cqmask = (1 << state->ts_cfg_profile->cp_log_num_cq) - 1;
	cqindx = cqnum & cqmask;
	return (state->ts_cqhdl[cqindx]);
}


/*
 * tavor_cq_cqe_consume()
 *    Context: Can be called from interrupt or base context.
 *
 *    Translates one successful CQE into the caller's "wc" work
 *    completion (type, flags, status, and the remaining fields).  Error
 *    CQEs are handed off to tavor_cq_errcqe_consume().  Returns
 *    TAVOR_CQ_SYNC_AND_DB when the CQE has been fully consumed (caller
 *    should return it to HW ownership and advance the consumer index),
 *    or whatever tavor_cq_errcqe_consume() returns for error CQEs
 *    (possibly TAVOR_CQ_RECYCLE_ENTRY).
 */
static int
tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
{
	uint_t		flags, type, opcode, qpnum, qp1_indx;
	int		status;

	TAVOR_TNF_ENTER(tavor_cq_cqe_consume);

	/*
	 * Determine if this is an "error" CQE by examining "opcode".  If it
	 * is an error CQE, then call tavor_cq_errcqe_consume() and return
	 * whatever status it returns.  Otherwise, this is a successful
	 * completion.
	 */
	opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
	if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
	    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
		status = tavor_cq_errcqe_consume(state, cq, cqe, wc);
		TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
		return (status);
	}

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = tavor_wrid_get_entry(cq, cqe, NULL);

	/*
	 * Parse the CQE opcode to determine completion type.  This will set
	 * not only the type of the completion, but also any flags that might
	 * be associated with it (e.g. whether immediate data is present).
	 */
	flags = IBT_WC_NO_FLAGS;
	if (TAVOR_CQE_SENDRECV_GET(cq, cqe) != TAVOR_COMPLETION_RECV) {

		/* Send CQE */
		switch (opcode) {
		case TAVOR_CQE_SND_RDMAWR_IMM:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			/* FALLTHROUGH */
		case TAVOR_CQE_SND_RDMAWR:
			type = IBT_WRC_RDMAW;
			break;

		case TAVOR_CQE_SND_SEND_IMM:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			/* FALLTHROUGH */
		case TAVOR_CQE_SND_SEND:
			type = IBT_WRC_SEND;
			break;

		case TAVOR_CQE_SND_RDMARD:
			type = IBT_WRC_RDMAR;
			break;

		case TAVOR_CQE_SND_ATOMIC_CS:
			type = IBT_WRC_CSWAP;
			break;

		case TAVOR_CQE_SND_ATOMIC_FA:
			type = IBT_WRC_FADD;
			break;

		case TAVOR_CQE_SND_BIND_MW:
			type = IBT_WRC_BIND;
			break;

		default:
			TAVOR_WARNING(state, "unknown send CQE type");
			wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
			TNF_PROBE_1(tavor_cq_cqe_consume_unknown_send_type,
			    TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
			TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	} else {

		/* Receive CQE */
		switch (opcode & 0x1F) {
		case TAVOR_CQE_RCV_RECV_IMM:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV_IMM2:
			/*
			 * Note:  According to the Tavor PRM, all QP1 recv
			 * completions look like the result of a Send with
			 * Immediate.  They are not, however, (MADs are Send
			 * Only) so we need to check the QP number and set
			 * the flag only if it is non-QP1.
			 */
			qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
			qp1_indx = state->ts_spec_qp1->tr_indx;
			if ((qpnum < qp1_indx) || (qpnum > qp1_indx + 1)) {
				flags |= IBT_WC_IMMED_DATA_PRESENT;
			}
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV2:
			type = IBT_WRC_RECV;
			break;

		case TAVOR_CQE_RCV_RDMAWR_IMM:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RDMAWR_IMM2:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			type = IBT_WRC_RECV_RDMAWI;
			break;

		default:
			TAVOR_WARNING(state, "unknown recv CQE type");
			wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
			TNF_PROBE_1(tavor_cq_cqe_consume_unknown_rcv_type,
			    TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
			TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	}
	wc->wc_type = type;

	/*
	 * Check for GRH, update the flags, then fill in "wc_flags" field
	 * in the work completion
	 */
	if (TAVOR_CQE_GRH_GET(cq, cqe) != 0) {
		flags |= IBT_WC_GRH_PRESENT;
	}
	wc->wc_flags = flags;

	/* If we got here, completion status must be success */
	wc->wc_status = IBT_WC_SUCCESS;

	/*
	 * Parse the remaining contents of the CQE into the work completion.
	 * This means filling in SL, QP number, SLID, immediate data, etc.
	 * Note:  Not all of these fields are valid in a given completion.
	 * Many of them depend on the actual type of completion.  So we fill
	 * in all of the fields and leave it up to the IBTF and consumer to
	 * sort out which are valid based on their context.
	 */
	wc->wc_sl = TAVOR_CQE_SL_GET(cq, cqe);
	wc->wc_immed_data = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
	wc->wc_qpn = TAVOR_CQE_DQPN_GET(cq, cqe);
	wc->wc_res_hash = 0;
	wc->wc_slid = TAVOR_CQE_DLID_GET(cq, cqe);
	wc->wc_ethertype = (wc->wc_immed_data & 0xFFFF);
	wc->wc_pkey_ix = (wc->wc_immed_data >> 16);

	/*
	 * Depending on whether the completion was a receive or a send
	 * completion, fill in "bytes transferred" as appropriate.  Also,
	 * if necessary, fill in the "path bits" field.
	 */
	if (TAVOR_CQE_SENDRECV_GET(cq, cqe) == TAVOR_COMPLETION_RECV) {
		wc->wc_path_bits = TAVOR_CQE_PATHBITS_GET(cq, cqe);
		wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);

	} else if ((wc->wc_type == IBT_WRC_RDMAR) ||
	    (wc->wc_type == IBT_WRC_CSWAP) || (wc->wc_type == IBT_WRC_FADD)) {
		wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
	}

	TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
	return (TAVOR_CQ_SYNC_AND_DB);
}


/*
 * tavor_cq_errcqe_consume()
 *    Context: Can be called from interrupt or base context.
 *
 *    Translates an error CQE into the caller's "wc", mapping the
 *    hardware error status to the corresponding ibt_wc_status_t.  An
 *    error CQE can represent multiple completions; see the "recycling"
 *    logic below for how that is handled via the return value.
 */
static int
tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
{
	uint64_t		next_wqeaddr;
	uint32_t		imm_eth_pkey_cred;
	uint_t			nextwqesize, dbd;
	uint_t			doorbell_cnt, status;
	tavor_wrid_entry_t	wre;

	TAVOR_TNF_ENTER(tavor_cq_errcqe_consume);

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = tavor_wrid_get_entry(cq, cqe, &wre);

	/*
	 * Parse the CQE opcode to determine completion type.  We know that
	 * the CQE is an error completion, so we extract only the completion
	 * status here.
	 */
	imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
	status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
	switch (status) {
	case TAVOR_CQE_LOC_LEN_ERR:
		status = IBT_WC_LOCAL_LEN_ERR;
		break;

	case TAVOR_CQE_LOC_OP_ERR:
		status = IBT_WC_LOCAL_QP_OP_ERR;
		break;

	case TAVOR_CQE_LOC_PROT_ERR:
		status = IBT_WC_LOCAL_PROTECT_ERR;
		break;

	case TAVOR_CQE_WR_FLUSHED_ERR:
		status = IBT_WC_WR_FLUSHED_ERR;
		break;

	case TAVOR_CQE_MW_BIND_ERR:
		status = IBT_WC_MEM_WIN_BIND_ERR;
		break;

	case TAVOR_CQE_BAD_RESPONSE_ERR:
		status = IBT_WC_BAD_RESPONSE_ERR;
		break;

	case TAVOR_CQE_LOCAL_ACCESS_ERR:
		status = IBT_WC_LOCAL_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_INV_REQ_ERR:
		status = IBT_WC_REMOTE_INVALID_REQ_ERR;
		break;

	case TAVOR_CQE_REM_ACC_ERR:
		status = IBT_WC_REMOTE_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_OP_ERR:
		status = IBT_WC_REMOTE_OP_ERR;
		break;

	case TAVOR_CQE_TRANS_TO_ERR:
		status = IBT_WC_TRANS_TIMEOUT_ERR;
		break;

	case TAVOR_CQE_RNRNAK_TO_ERR:
		status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
		break;

	/*
	 * The following error codes are not supported in the Tavor driver
	 * as they relate only to Reliable Datagram completion statuses:
	 *    case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
	 *    case TAVOR_CQE_REM_INV_RD_REQ_ERR:
	 *    case TAVOR_CQE_EEC_REM_ABORTED_ERR:
	 *    case TAVOR_CQE_INV_EEC_NUM_ERR:
	 *    case TAVOR_CQE_INV_EEC_STATE_ERR:
	 *    case TAVOR_CQE_LOC_EEC_ERR:
	 */

	default:
		TAVOR_WARNING(state, "unknown error CQE status");
		status = IBT_WC_LOCAL_QP_OP_ERR;
		TNF_PROBE_1(tavor_cq_errcqe_consume_unknown_status,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		break;
	}
	wc->wc_status = status;

	/*
	 * Now we do all the checking that's necessary to handle completion
	 * queue entry "recycling"
	 *
	 * It is not necessary here to try to sync the WQE as we are only
	 * attempting to read from the Work Queue (and hardware does not
	 * write to it).
	 */

	/*
	 * We can get doorbell info, WQE address, size for the next WQE
	 * from the "wre" (which was filled in above in the call to the
	 * tavor_wrid_get_entry() routine)
	 */
	dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
	next_wqeaddr = wre.wr_wqeaddrsz;
	nextwqesize = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;

	/*
	 * Get the doorbell count from the CQE.  This indicates how many
	 * completions this one CQE represents.
	 */
	doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;

	/*
	 * Determine if we're ready to consume this CQE yet or not.  If the
	 * next WQE has size zero (i.e. no next WQE) or if the doorbell count
	 * is down to zero, then this is the last/only completion represented
	 * by the current CQE (return TAVOR_CQ_SYNC_AND_DB).  Otherwise, the
	 * current CQE needs to be recycled (see below).
	 */
	if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
		/*
		 * Consume the CQE
		 *    Return status to indicate that doorbell and sync may be
		 *    necessary.
		 */
		TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
		return (TAVOR_CQ_SYNC_AND_DB);

	} else {
		/*
		 * Recycle the CQE for use in the next PollCQ() call
		 *    Decrement the doorbell count, modify the error status,
		 *    and update the WQE address and size (to point to the
		 *    next WQE on the chain).  Put these updated entries back
		 *    into the CQE.
		 *    Despite the fact that we have updated the CQE, it is not
		 *    necessary for us to attempt to sync this entry just yet
		 *    as we have not changed the "hardware's view" of the
		 *    entry (i.e. we have not modified the "owner" bit - which
		 *    is all that the Tavor hardware really cares about).
		 */
		doorbell_cnt = doorbell_cnt - dbd;
		TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cq, cqe,
		    ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
		    (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
		TAVOR_CQE_WQEADDRSZ_SET(cq, cqe,
		    TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));

		TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
		return (TAVOR_CQ_RECYCLE_ENTRY);
	}
}


/*
 * tavor_cqe_sync()
 *    Context: Can be called from interrupt or base context.
 *
 *    Syncs a single CQE's DMA memory in the direction given by "flag"
 *    (DDI_DMA_SYNC_FORCPU before reading, DDI_DMA_SYNC_FORDEV after
 *    writing).  A no-op for CQs that do not require syncing.
 */
static void
tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe, uint_t flag)
{
	ddi_dma_handle_t	dmahdl;
	off_t			offset;
	int			status;

	TAVOR_TNF_ENTER(tavor_cqe_sync);

	/* Determine if CQ needs to be synced or not */
	if (cq->cq_sync == 0) {
		TAVOR_TNF_EXIT(tavor_cqe_sync);
		return;
	}

	/* Get the DMA handle from CQ context */
	dmahdl = cq->cq_mrhdl->mr_bindinfo.bi_dmahdl;

	/* Calculate byte offset of this CQE within the CQ buffer */
	offset = (off_t)((uintptr_t)cqe - (uintptr_t)&cq->cq_buf[0]);
	status = ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_cqe_t), flag);
	if (status != DDI_SUCCESS) {
		TNF_PROBE_0(tavor_cqe_sync_getnextentry_fail,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_cqe_sync);
		return;
	}

	TAVOR_TNF_EXIT(tavor_cqe_sync);
}


/*
 * tavor_cq_resize_helper()
 *    Context: Can be called only from user or kernel context.
1684 */ 1685 static void 1686 tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf, 1687 uint32_t old_cons_indx, uint32_t num_newcqe) 1688 { 1689 tavor_hw_cqe_t *old_cqe, *new_cqe; 1690 uint32_t new_cons_indx, wrap_around_mask; 1691 int i; 1692 1693 TAVOR_TNF_ENTER(tavor_cq_resize_helper); 1694 1695 ASSERT(MUTEX_HELD(&cq->cq_lock)); 1696 1697 /* Get the consumer index */ 1698 new_cons_indx = 0; 1699 1700 /* 1701 * Calculate the wrap around mask. Note: This operation only works 1702 * because all Tavor completion queues have power-of-2 sizes 1703 */ 1704 wrap_around_mask = (cq->cq_bufsz - 1); 1705 1706 /* 1707 * Calculate the pointers to the first CQ entry (in the "old" CQ) 1708 * and the first CQ entry in the "new" CQ 1709 */ 1710 old_cqe = &cq->cq_buf[old_cons_indx]; 1711 new_cqe = &new_cqbuf[new_cons_indx]; 1712 1713 /* Sync entire "old" CQ for use by software (if necessary). */ 1714 if (cq->cq_sync) { 1715 (void) ddi_dma_sync(cq->cq_mrhdl->mr_bindinfo.bi_dmahdl, 1716 0, cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORCPU); 1717 } 1718 1719 /* 1720 * Keep pulling entries from the "old" CQ until we find an entry owned 1721 * by the hardware. Process each entry by copying it into the "new" 1722 * CQ and updating respective indices and pointers in the "old" CQ. 1723 */ 1724 for (i = 0; i < num_newcqe; i++) { 1725 1726 /* Copy this old CQE into the "new_cqe" pointer */ 1727 bcopy(old_cqe, new_cqe, sizeof (tavor_hw_cqe_t)); 1728 1729 /* Increment the consumer index (for both CQs) */ 1730 old_cons_indx = (old_cons_indx + 1) & wrap_around_mask; 1731 new_cons_indx = (new_cons_indx + 1); 1732 1733 /* Update the pointer to the next CQ entry */ 1734 old_cqe = &cq->cq_buf[old_cons_indx]; 1735 new_cqe = &new_cqbuf[new_cons_indx]; 1736 } 1737 1738 TAVOR_TNF_EXIT(tavor_cq_resize_helper); 1739 } 1740 1741 1742 /* 1743 * tavor_cq_numcalc() 1744 * Context: Can be called from interrupt or base context. 
 */
static void
tavor_cq_numcalc(tavor_state_t *state, uint32_t indx, uint32_t *key)
{
	uint32_t	tmp, log_num_cq;

	/*
	 * Generate a simple key from counter.  Note:  We increment this
	 * static variable _intentionally_ without any kind of mutex around
	 * it.  First, single-threading all operations through a single lock
	 * would be a bad idea (from a performance point-of-view).  Second,
	 * the upper "unconstrained" bits don't really have to be unique
	 * because the lower bits are guaranteed to be (although we do make a
	 * best effort to ensure that they are).  Third, the window for the
	 * race (where both threads read and update the counter at the same
	 * time) is incredibly small.
	 */
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(tavor_debug_cqnum_cnt))
	log_num_cq = state->ts_cfg_profile->cp_log_num_cq;
	tmp = (tavor_debug_cqnum_cnt++) << log_num_cq;
	*key = (tmp | indx) & TAVOR_CQ_MAXNUMBER_MSK;
}

/*
 * tavor_cq_srq_entries_flush()
 *    Context: Can be called from interrupt or base context.
 *
 *    Removes from the QP's receive CQ all software-owned receive
 *    completions belonging to "qp" (whose receive work queue is backed
 *    by an SRQ), returning the corresponding entries to the SRQ free
 *    list and compacting the remaining CQEs.  Caller must hold the
 *    CQ lock (asserted below).
 */
void
tavor_cq_srq_entries_flush(tavor_state_t *state, tavor_qphdl_t qp)
{
	tavor_cqhdl_t		cq;
	tavor_workq_hdr_t	*wqhdr;
	tavor_hw_cqe_t		*cqe;
	tavor_hw_cqe_t		*next_cqe;
	uint32_t		cons_indx, tail_cons_indx, wrap_around_mask;
	uint32_t		new_indx, check_indx, indx;
	uint32_t		num_to_increment;
	int			cqe_qpnum, cqe_type;
	int			outstanding_cqes, removed_cqes;
	int			i;

	ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock));

	cq = qp->qp_rq_cqhdl;
	wqhdr = qp->qp_rq_wqhdr;

	/* This routine is only meaningful for SRQ-backed receive queues */
	ASSERT(wqhdr->wq_wrid_post != NULL);
	ASSERT(wqhdr->wq_wrid_post->wl_srq_en != 0);

	/*
	 * Check for user-mapped CQ memory.  Note:  We do not allow kernel
	 * clients to modify any userland mapping CQ.  If the CQ is
	 * user-mapped, then we simply return here, and this "flush" function
	 * becomes a NO-OP in this case.
	 */
	if (cq->cq_is_umap) {
		return;
	}

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_bufsz - 1);

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_buf[cons_indx];

	/* Sync the current CQE to read */
	tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);

	/*
	 * Loop through the CQ looking for entries owned by software.  If an
	 * entry is owned by software then we increment an 'outstanding_cqes'
	 * count to know how many entries total we have on our CQ.  We use
	 * this value further down to know how many entries to loop through
	 * looking for our same QP number.
	 */
	outstanding_cqes = 0;
	tail_cons_indx = cons_indx;
	while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
		/* increment total cqes count */
		outstanding_cqes++;

		/* increment the consumer index */
		tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;

		/* update the pointer to the next cq entry */
		cqe = &cq->cq_buf[tail_cons_indx];

		/* sync the next cqe to read */
		tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
	}

	/*
	 * Using the 'tail_cons_indx' that was just set, we now know how many
	 * total CQEs possible there are.  Set the 'check_indx' and the
	 * 'new_indx' to the last entry identified by 'tail_cons_indx'.
	 * The scan below walks BACKWARD from the tail toward 'cons_indx'.
	 */
	check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;

	for (i = 0; i < outstanding_cqes; i++) {
		cqe = &cq->cq_buf[check_indx];

		/* Grab QP number from CQE */
		cqe_qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
		cqe_type = TAVOR_CQE_SENDRECV_GET(cq, cqe);

		/*
		 * If the QP number is the same in the CQE as the QP that we
		 * have on this SRQ, then we must free up the entry off the
		 * SRQ.  We also make sure that the completion type is of the
		 * 'TAVOR_COMPLETION_RECV' type.  So any send completions on
		 * this CQ will be left as-is.  The handling of returning
		 * entries back to HW ownership happens further down.
		 */
		if (cqe_qpnum == qp->qp_qpnum &&
		    cqe_type == TAVOR_COMPLETION_RECV) {

			/* Add back to SRQ free list */
			(void) tavor_wrid_find_match_srq(wqhdr->wq_wrid_post,
			    cq, cqe);
		} else {
			/*
			 * Keep this CQE: slide it down toward the tail so
			 * that kept entries stay contiguous.  Copy only when
			 * the source and destination slots differ.
			 */
			if (check_indx != new_indx) {
				next_cqe = &cq->cq_buf[new_indx];

				/*
				 * Copy the CQE into the "next_cqe"
				 * pointer.
				 */
				bcopy(cqe, next_cqe, sizeof (tavor_hw_cqe_t));
			}
			new_indx = (new_indx - 1) & wrap_around_mask;
		}
		/* Move index to next CQE to check */
		check_indx = (check_indx - 1) & wrap_around_mask;
	}

	/* Initialize removed cqes count */
	removed_cqes = 0;

	/* If an entry was removed */
	if (check_indx != new_indx) {

		/*
		 * Set current pointer back to the beginning consumer index.
		 * At this point, all unclaimed entries have been copied to
		 * the index specified by 'new_indx'.  This 'new_indx' will
		 * be used as the new consumer index after we mark all freed
		 * entries as having HW ownership.  We do that here.
		 *
		 * NOTE(review): the "indx <= new_indx" termination test
		 * below does not appear to handle the case where the range
		 * from 'cons_indx' to 'new_indx' wraps past the end of the
		 * CQ buffer (i.e. cons_indx > new_indx numerically) -- in
		 * that case the loop body never executes.  Confirm that
		 * callers can never present a wrapped range here.
		 */

		/* Loop through all entries until we reach our new pointer */
		for (indx = cons_indx; indx <= new_indx;
		    indx = (indx + 1) & wrap_around_mask) {
			removed_cqes++;
			cqe = &cq->cq_buf[indx];

			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cq, cqe);
		}
	}

	/*
	 * Update consumer index to be the 'new_indx'.  This moves it past
	 * all removed entries.  Because 'new_indx' is pointing to the last
	 * previously valid SW owned entry, we add 1 to point the cons_indx
	 * to the first HW owned entry.
	 */
	cons_indx = (new_indx + 1) & wrap_around_mask;

	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we found no QP number
	 * matches above, then we would not have removed anything.  So only
	 * if something was removed do we ring the doorbell.
	 */
	if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Post doorbell to update the consumer index.  Doorbell
		 * value indicates number of entries consumed (minus 1)
		 */
		if (cons_indx > cq->cq_consindx) {
			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
		} else {
			/* consumer index wrapped around the end of the CQ */
			num_to_increment = ((cons_indx + cq->cq_bufsz) -
			    cq->cq_consindx) - 1;
		}
		cq->cq_consindx = cons_indx;

		tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
		    cq->cq_cqnum, num_to_increment);
	}
}