/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_cq.c
 *    Tavor Completion Queue Processing Routines
 *
 *    Implements all the routines necessary for allocating, freeing, resizing,
 *    and handling the completion type events that the Tavor hardware can
 *    generate.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>

#include <sys/ib/adapters/tavor/tavor.h>

/*
 * Used by tavor_cq_numcalc() below to fill in the "unconstrained" portion
 * of the Tavor completion queue number
 */
static uint_t tavor_debug_cqnum_cnt = 0x00000000;

static void tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd,
    uint32_t cqn, uint32_t cq_param);
#pragma inline(tavor_cq_doorbell)
static int tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
static int tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
static void tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
    uint_t flag);
static void tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
    uint32_t old_cons_indx, uint32_t num_newcqe);
static void tavor_cq_numcalc(tavor_state_t *state, uint32_t indx,
    uint32_t *key);

/*
 * tavor_cq_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_cq_alloc(tavor_state_t *state, ibt_cq_hdl_t ibt_cqhdl,
    ibt_cq_attr_t *cq_attr, uint_t *actual_size, tavor_cqhdl_t *cqhdl,
    uint_t sleepflag)
{
	tavor_rsrc_t		*cqc, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_hw_cqc_t		cqc_entry;
	tavor_cqhdl_t		cq;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	op;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_hw_cqe_t		*buf;
	uint64_t		addr, value;
	uint32_t		log_cq_size, lkey, uarpg;
	uint_t			dma_xfer_mode, cq_sync, cq_is_umap;
	int			status, i, flag;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_cq_alloc);

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq_attr))

	/*
	 * Determine whether the CQ is being allocated for userland access
	 * or for kernel access.  If the CQ is being allocated for userland
	 * access, then look up the UAR doorbell page number for the current
	 * process.  Note:  If this is not found (e.g. if the process has
	 * not previously open()'d the Tavor driver), then an error is
	 * returned.
	 */
	cq_is_umap = (cq_attr->cq_flags & IBT_CQ_USER_MAP) ? 1 : 0;
	if (cq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
		if (status != DDI_SUCCESS) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
			goto cqalloc_fail;
		}
		uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
	}

	/* Use the internal protection domain (PD) for setting up CQs */
	pd = state->ts_pdhdl_internal;

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate a CQ context entry.  This will be filled in with all
	 * the necessary parameters to define the Completion Queue.  And then
	 * ownership will be passed to the hardware in the final step
	 * below.  If we fail here, we must undo the protection domain
	 * reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_CQC, 1, sleepflag, &cqc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ context");
		goto cqalloc_fail1;
	}

	/*
	 * Allocate the software structure for tracking the completion queue
	 * (i.e. the Tavor Completion Queue handle).  If we fail here, we must
	 * undo the protection domain reference count and the previous
	 * resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_CQHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ handle");
		goto cqalloc_fail2;
	}
	cq = (tavor_cqhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))
	cq->cq_is_umap = cq_is_umap;

	/*
	 * Calculate the CQ number from the CQC index.  In much the same way
	 * as we create keys for memory regions (see tavor_mr.c), this CQ
	 * number is constructed from a "constrained" portion (which depends
	 * on the CQC index) and an "unconstrained" portion (which is
	 * arbitrarily chosen).
	 */
	tavor_cq_numcalc(state, cqc->tr_indx, &cq->cq_cqnum);
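
	/*
	 * Illustration (values hypothetical): if the device were configured
	 * for 2^16 CQs, the low 16 bits of cq_cqnum would be "constrained"
	 * to equal the CQC index, while the upper bits would be filled from
	 * a rolling counter (tavor_debug_cqnum_cnt above) so that a
	 * freed-and-reallocated CQC index yields a different CQ number.
	 * See tavor_cqhdl_from_cqnum() below for the reverse mapping.
	 */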

	/*
	 * If this will be a user-mappable CQ, then allocate an entry for
	 * the "userland resources database".  This will later be added to
	 * the database (after all further CQ operations are successful).
	 * If we fail here, we must undo the reference counts and the
	 * previous resource allocation.
	 */
	if (cq->cq_is_umap) {
		umapdb = tavor_umap_db_alloc(state->ts_instance, cq->cq_cqnum,
		    MLNX_UMAP_CQMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
			goto cqalloc_fail3;
		}
	}

	/*
	 * Calculate the appropriate size for the completion queue.
	 * Note:  All Tavor CQs must be a power-of-2 minus 1 in size.  Also
	 * they may not be any smaller than TAVOR_CQ_MIN_SIZE.  This step
	 * rounds the requested size up to the next highest power-of-2.
	 */
	cq_attr->cq_size = max(cq_attr->cq_size, TAVOR_CQ_MIN_SIZE);
	log_cq_size = highbit(cq_attr->cq_size);
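
	/*
	 * Worked example (hypothetical request): cq_size = 100 stays at 100
	 * after the max() with TAVOR_CQ_MIN_SIZE, and highbit(100) = 7, so
	 * the queue is built with 1 << 7 = 128 entries and the caller is
	 * later told the usable size is 127 (see "actual_size" below).
	 */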

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits)
	 */
	if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size");
		goto cqalloc_fail4;
	}

	/*
	 * Allocate the memory for the Completion Queue.
	 *
	 * Note: Although we use the common queue allocation routine, we
	 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
	 * kernel system memory) for kernel CQs because it would be
	 * inefficient to have CQs located in DDR memory.  This is primarily
	 * because CQs are read from (by software) more than they are written
	 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
	 * user-mappable CQs for a similar reason.)
	 * It is also worth noting that, unlike Tavor QP work queues,
	 * completion queues do not have the same strict alignment
	 * requirements.  It is sufficient for the CQ memory to be both
	 * aligned to and bound to addresses which are a multiple of CQE size.
	 */
	cq->cq_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
	cq->cq_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
	cq->cq_cqinfo.qa_bind_align  = sizeof (tavor_hw_cqe_t);
	if (cq->cq_is_umap) {
		cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
	}
	status = tavor_queue_alloc(state, &cq->cq_cqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue");
		goto cqalloc_fail4;
	}
	buf = (tavor_hw_cqe_t *)cq->cq_cqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Initialize each of the Completion Queue Entries (CQE) by setting
	 * their ownership to hardware ("owner" bit set to HW).  This is in
	 * preparation for the final transfer of ownership (below) of the
	 * CQ context itself.
	 */
	for (i = 0; i < (1 << log_cq_size); i++) {
		TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
	}

	/*
	 * Register the memory for the CQ.  The memory for the CQ must
	 * be registered in the Tavor TPT tables.  This gives us the LKey
	 * to specify in the CQ context below.  Note: If this is a user-
	 * mappable CQ, then we will force DDI_DMA_CONSISTENT mapping.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
	mr_attr.mr_len	 = cq->cq_cqinfo.qa_size;
	mr_attr.mr_as	 = NULL;
	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
	if (cq->cq_is_umap) {
		dma_xfer_mode = DDI_DMA_CONSISTENT;
	} else {
		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
	}
	if (dma_xfer_mode == DDI_DMA_STREAMING) {
		mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
	}
	op.mro_bind_type   = state->ts_cfg_profile->cp_iommu_bypass;
	op.mro_bind_dmahdl = cq->cq_cqinfo.qa_dmahdl;
	op.mro_bind_override_addr = 0;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
		goto cqalloc_fail5;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
	addr = mr->mr_bindinfo.bi_addr;
	lkey = mr->mr_lkey;

	/* Determine if later ddi_dma_sync will be necessary */
	cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, cq->cq_cqinfo);

	/* Sync entire CQ for use by the hardware (if necessary). */
	if (cq_sync) {
		(void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
		    cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Fill in the CQC entry.  This is the final step before passing
	 * ownership of the CQC entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the CQC.  Note: If this CQ is going to be
	 * used for userland access, then we need to set the UAR page number
	 * appropriately (otherwise it's a "don't care")
	 */
	bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
	cq->cq_eqnum		= TAVOR_CQ_EQNUM_GET(cq->cq_cqnum);
	cq->cq_erreqnum		= TAVOR_CQ_ERREQNUM_GET(cq->cq_cqnum);
	cqc_entry.xlat		= TAVOR_VA2PA_XLAT_ENABLED;
	cqc_entry.state		= TAVOR_CQ_DISARMED;
	cqc_entry.start_addr_h	= (addr >> 32);
	cqc_entry.start_addr_l	= (addr & 0xFFFFFFFF);
	cqc_entry.log_cq_sz	= log_cq_size;
	if (cq->cq_is_umap) {
		cqc_entry.usr_page = uarpg;
	} else {
		cqc_entry.usr_page = 0;
	}
	cqc_entry.pd		= pd->pd_pdnum;
	cqc_entry.lkey		= lkey;
	cqc_entry.e_eqn		= cq->cq_erreqnum;
	cqc_entry.c_eqn		= cq->cq_eqnum;
	cqc_entry.cqn		= cq->cq_cqnum;

	/*
	 * Write the CQC entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware (using the Tavor SW2HW_CQ firmware
	 * command).  Note: In general, this operation shouldn't fail.  But
	 * if it does, we have to undo everything we've done above before
	 * returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_CQ, &cqc_entry,
	    sizeof (tavor_hw_cqc_t), cq->cq_cqnum, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_CQ command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_cq_alloc_sw2hw_cq_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "tavor SW2HW_CQ command");
		goto cqalloc_fail6;
	}

	/*
	 * Fill in the rest of the Tavor Completion Queue handle.  Having
	 * successfully transferred ownership of the CQC, we can update the
	 * following fields for use in further operations on the CQ.
	 */
	cq->cq_cqcrsrcp	  = cqc;
	cq->cq_rsrcp	  = rsrc;
	cq->cq_consindx	  = 0;
	cq->cq_buf	  = buf;
	cq->cq_bufsz	  = (1 << log_cq_size);
	cq->cq_mrhdl	  = mr;
	cq->cq_sync	  = cq_sync;
	cq->cq_refcnt	  = 0;
	cq->cq_is_special = 0;
	cq->cq_uarpg	  = uarpg;
	cq->cq_umap_dhp	  = (devmap_cookie_t)NULL;
	avl_create(&cq->cq_wrid_wqhdr_avl_tree, tavor_wrid_wqhdr_compare,
	    sizeof (struct tavor_workq_hdr_s),
	    offsetof(struct tavor_workq_hdr_s, wq_avl_link));

	cq->cq_wrid_reap_head  = NULL;
	cq->cq_wrid_reap_tail  = NULL;
	cq->cq_hdlrarg	  = (void *)ibt_cqhdl;

	/*
	 * Put the CQ handle in the Tavor CQNum-to-CQHdl list.  Then fill in
	 * the "actual_size" and "cqhdl" arguments and return success.
	 */
	ASSERT(state->ts_cqhdl[cqc->tr_indx] == NULL);
	state->ts_cqhdl[cqc->tr_indx] = cq;

	/*
	 * If this is a user-mappable CQ, then we need to insert the previously
	 * allocated entry into the "userland resources database".  This will
	 * allow for later lookup during devmap() (i.e. mmap()) calls.
	 */
	if (cq->cq_is_umap) {
		tavor_umap_db_add(umapdb);
	}

	/*
	 * Fill in the return arguments (if necessary).  This includes the
	 * real completion queue size.
	 */
	if (actual_size != NULL) {
		*actual_size = (1 << log_cq_size) - 1;
	}
	*cqhdl = cq;

	TAVOR_TNF_EXIT(tavor_cq_alloc);
	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
cqalloc_fail6:
	if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag) != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister CQ memory");
	}
cqalloc_fail5:
	tavor_queue_free(state, &cq->cq_cqinfo);
cqalloc_fail4:
	if (cq_is_umap) {
		tavor_umap_db_free(umapdb);
	}
cqalloc_fail3:
	tavor_rsrc_free(state, &rsrc);
cqalloc_fail2:
	tavor_rsrc_free(state, &cqc);
cqalloc_fail1:
	tavor_pd_refcnt_dec(pd);
cqalloc_fail:
	TNF_PROBE_1(tavor_cq_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_cq_alloc);
	return (status);
}


/*
 * tavor_cq_free()
 *    Context: Can be called only from user or kernel context.
 */
/* ARGSUSED */
int
tavor_cq_free(tavor_state_t *state, tavor_cqhdl_t *cqhdl, uint_t sleepflag)
{
	tavor_rsrc_t		*cqc, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_hw_cqc_t		cqc_entry;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_cqhdl_t		cq;
	uint32_t		cqnum;
	uint64_t		value;
	uint_t			maxprot;
	int			status;

	TAVOR_TNF_ENTER(tavor_cq_free);

	/*
	 * Pull all the necessary information from the Tavor Completion Queue
	 * handle.  This is necessary here because the resource for the
	 * CQ handle is going to be freed up as part of this operation.
	 */
	cq	= *cqhdl;
	mutex_enter(&cq->cq_lock);
	cqc	= cq->cq_cqcrsrcp;
	rsrc	= cq->cq_rsrcp;
	pd	= state->ts_pdhdl_internal;
	mr	= cq->cq_mrhdl;
	cqnum	= cq->cq_cqnum;

	/*
	 * If there are work queues still associated with the CQ, then return
	 * an error.  Otherwise, we will be holding the CQ lock.
	 */
	if (cq->cq_refcnt != 0) {
		mutex_exit(&cq->cq_lock);
		TNF_PROBE_1(tavor_cq_free_refcnt_fail, TAVOR_TNF_ERROR, "",
		    tnf_int, refcnt, cq->cq_refcnt);
		TAVOR_TNF_EXIT(tavor_cq_free);
		return (IBT_CQ_BUSY);
	}

	/*
	 * If this was a user-mappable CQ, then we need to remove its entry
	 * from the "userland resources database".  If it is also currently
	 * mmap()'d out to a user process, then we need to call
	 * devmap_devmem_remap() to remap the CQ memory to an invalid mapping.
	 * We also need to invalidate the CQ tracking information for the
	 * user mapping.
	 */
	if (cq->cq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, cqnum,
		    MLNX_UMAP_CQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
		    &umapdb);
		if (status != DDI_SUCCESS) {
			mutex_exit(&cq->cq_lock);
			TAVOR_WARNING(state, "failed to find in database");
			TAVOR_TNF_EXIT(tavor_cq_free);
			return (ibc_get_ci_failure(0));
		}
		tavor_umap_db_free(umapdb);
		if (cq->cq_umap_dhp != NULL) {
			maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
			status = devmap_devmem_remap(cq->cq_umap_dhp,
			    state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size,
			    maxprot, DEVMAP_MAPPING_INVALID, NULL);
			if (status != DDI_SUCCESS) {
				mutex_exit(&cq->cq_lock);
				TAVOR_WARNING(state, "failed in CQ memory "
				    "devmap_devmem_remap()");
				TAVOR_TNF_EXIT(tavor_cq_free);
				return (ibc_get_ci_failure(0));
			}
			cq->cq_umap_dhp = (devmap_cookie_t)NULL;
		}
	}

	/*
	 * Put NULL into the Tavor CQNum-to-CQHdl list.  This will allow any
	 * in-progress events to detect that the CQ corresponding to this
	 * number has been freed.
	 */
	state->ts_cqhdl[cqc->tr_indx] = NULL;

	/*
	 * While we hold the CQ lock, do a "forced reap" of the workQ WRID
	 * list.  This cleans up all the structures associated with the WRID
	 * processing for this CQ.  Once we complete, drop the lock and finish
	 * the deallocation of the CQ.
	 */
	tavor_wrid_cq_force_reap(cq);

	mutex_exit(&cq->cq_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))

	/*
	 * Reclaim CQC entry from hardware (using the Tavor HW2SW_CQ
	 * firmware command).  If the ownership transfer fails for any reason,
	 * then it is an indication that something (either in HW or SW) has
	 * gone seriously wrong.
	 */
	status = tavor_cmn_ownership_cmd_post(state, HW2SW_CQ, &cqc_entry,
	    sizeof (tavor_hw_cqc_t), cqnum, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		TAVOR_WARNING(state, "failed to reclaim CQC ownership");
		cmn_err(CE_CONT, "Tavor: HW2SW_CQ command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_cq_free_hw2sw_cq_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		TAVOR_TNF_EXIT(tavor_cq_free);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Deregister the memory for the Completion Queue.  If this fails
	 * for any reason, then it is an indication that something (either
	 * in HW or SW) has gone seriously wrong.  So we print a warning
	 * message and return.
	 */
	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister CQ memory");
		TNF_PROBE_0(tavor_cq_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_cq_free);
		return (ibc_get_ci_failure(0));
	}

	/* Free the memory for the CQ */
	tavor_queue_free(state, &cq->cq_cqinfo);

	/* Free the Tavor Completion Queue handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free up the CQC entry resource */
	tavor_rsrc_free(state, &cqc);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the cqhdl pointer to NULL and return success */
	*cqhdl = NULL;

	TAVOR_TNF_EXIT(tavor_cq_free);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_resize()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_cq_resize(tavor_state_t *state, tavor_cqhdl_t cq, uint_t req_size,
    uint_t *actual_size, uint_t sleepflag)
{
	tavor_hw_cqc_t		cqc_entry;
	tavor_qalloc_info_t	new_cqinfo, old_cqinfo;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	op;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr, mr_old;
	tavor_hw_cqe_t		*buf;
	uint32_t		new_prod_indx, old_cons_indx;
	uint_t			dma_xfer_mode, cq_sync, log_cq_size, maxprot;
	int			status, i, flag;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_cq_resize);

	/* Use the internal protection domain (PD) for CQs */
	pd = state->ts_pdhdl_internal;

	/*
	 * Calculate the appropriate size for the new resized completion queue.
	 * Note:  All Tavor CQs must be a power-of-2 minus 1 in size.  Also
	 * they may not be any smaller than TAVOR_CQ_MIN_SIZE.  This step
	 * rounds the requested size up to the next highest power-of-2.
	 */
	req_size = max(req_size, TAVOR_CQ_MIN_SIZE);
	log_cq_size = highbit(req_size);

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits)
	 */
	if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size");
		goto cqresize_fail;
	}

	/*
	 * Allocate the memory for the newly resized Completion Queue.
	 *
	 * Note: Although we use the common queue allocation routine, we
	 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
	 * kernel system memory) for kernel CQs because it would be
	 * inefficient to have CQs located in DDR memory.  This is the same
	 * as we do when we first allocate completion queues, primarily
	 * because CQs are read from (by software) more than they are written
	 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
	 * user-mappable CQs for a similar reason.)
	 * It is also worth noting that, unlike Tavor QP work queues,
	 * completion queues do not have the same strict alignment
	 * requirements.  It is sufficient for the CQ memory to be both
	 * aligned to and bound to addresses which are a multiple of CQE size.
	 */
	new_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
	new_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
	new_cqinfo.qa_bind_align  = sizeof (tavor_hw_cqe_t);
	if (cq->cq_is_umap) {
		new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
	}
	status = tavor_queue_alloc(state, &new_cqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue");
		goto cqresize_fail;
	}
	buf = (tavor_hw_cqe_t *)new_cqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Initialize each of the Completion Queue Entries (CQE) by setting
	 * their ownership to hardware ("owner" bit set to HW).  This is in
	 * preparation for the final resize operation (below).
	 */
	for (i = 0; i < (1 << log_cq_size); i++) {
		TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
	}

	/*
	 * Register the memory for the CQ.  The memory for the CQ must
	 * be registered in the Tavor TPT tables.  This gives us the LKey
	 * to specify in the CQ context below.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
	mr_attr.mr_len	 = new_cqinfo.qa_size;
	mr_attr.mr_as	 = NULL;
	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
	if (cq->cq_is_umap) {
		dma_xfer_mode = DDI_DMA_CONSISTENT;
	} else {
		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
	}
	if (dma_xfer_mode == DDI_DMA_STREAMING) {
		mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
	}
	op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
	op.mro_bind_dmahdl = new_cqinfo.qa_dmahdl;
	op.mro_bind_override_addr = 0;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
	if (status != DDI_SUCCESS) {
		tavor_queue_free(state, &new_cqinfo);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
		goto cqresize_fail;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))

	/* Determine if later ddi_dma_sync will be necessary */
	cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, new_cqinfo);

	/* Sync entire "new" CQ for use by hardware (if necessary) */
	if (cq_sync) {
		(void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
		    new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Now we grab the CQ lock.  Since we will be updating the actual
	 * CQ location and the producer/consumer indexes, we should hold
	 * the lock.
	 *
	 * We do a TAVOR_NOSLEEP here (and below), though, because we are
	 * holding the "cq_lock" and if we got raised to interrupt level
	 * by priority inversion, we would not want to block in this routine
	 * waiting for success.
	 */
	mutex_enter(&cq->cq_lock);

	/*
	 * Determine the current CQ "consumer index".
	 *
	 * Note:  This will depend on whether the CQ had previously been
	 * mapped for user access or whether it is a kernel CQ.  If this
	 * is a kernel CQ, then all PollCQ() operations have come through
	 * the IBTF and, hence, the driver's CQ state structure will
	 * contain the current consumer index.  If, however, the user has
	 * accessed this CQ by bypassing the driver (OS-bypass), then we
	 * need to query the firmware to determine the current CQ consumer
	 * index.  This also assumes that the user process will not continue
	 * to consume entries while at the same time doing the ResizeCQ()
	 * operation.  If the user process does not guarantee this, then it
	 * may see duplicate or missed completions.  But under no
	 * circumstances should this panic the system.
	 */
	if (cq->cq_is_umap) {
		status = tavor_cmn_query_cmd_post(state, QUERY_CQ,
		    cq->cq_cqnum, &cqc_entry, sizeof (tavor_hw_cqc_t),
		    TAVOR_NOSLEEP);
		if (status != TAVOR_CMD_SUCCESS) {
			/* Query CQ has failed, drop CQ lock and cleanup */
			mutex_exit(&cq->cq_lock);
			if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
			    sleepflag) != DDI_SUCCESS) {
				TAVOR_WARNING(state, "failed to deregister "
				    "CQ memory");
			}
			tavor_queue_free(state, &new_cqinfo);
			TAVOR_WARNING(state, "failed to query CQ context");

			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
			    "failed QUERY_CQ command");
			goto cqresize_fail;
		}
		old_cons_indx = cqc_entry.cons_indx;
	} else {
		old_cons_indx = cq->cq_consindx;
	}

	/*
	 * Fill in the CQC entry.  For the resize operation this is the
	 * final step before attempting the resize operation on the CQC entry.
	 * We use all of the information collected/calculated above to fill
	 * in the requisite portions of the CQC.
	 */
	bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
	cqc_entry.start_addr_h	= (mr->mr_bindinfo.bi_addr >> 32);
	cqc_entry.start_addr_l	= (mr->mr_bindinfo.bi_addr & 0xFFFFFFFF);
	cqc_entry.log_cq_sz	= log_cq_size;
	cqc_entry.lkey		= mr->mr_lkey;

	/*
	 * Write the CQC entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware (using the Tavor RESIZE_CQ firmware
	 * command).  Note: In general, this operation shouldn't fail.  But
	 * if it does, we have to undo everything we've done above before
	 * returning error.  Also note that the status returned may indicate
	 * the code to return to the IBTF.
	 */
	status = tavor_resize_cq_cmd_post(state, &cqc_entry, cq->cq_cqnum,
	    &new_prod_indx, TAVOR_CMD_NOSLEEP_SPIN);
	if (status != TAVOR_CMD_SUCCESS) {
		/* Resize attempt has failed, drop CQ lock and cleanup */
		mutex_exit(&cq->cq_lock);
		if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
		    sleepflag) != DDI_SUCCESS) {
			TAVOR_WARNING(state, "failed to deregister CQ memory");
		}
		tavor_queue_free(state, &new_cqinfo);
		if (status == TAVOR_CMD_BAD_SIZE) {
			TAVOR_TNF_EXIT(tavor_cq_resize);
			return (IBT_CQ_SZ_INSUFFICIENT);
		} else {
			cmn_err(CE_CONT, "Tavor: RESIZE_CQ command failed: "
			    "%08x\n", status);
			TNF_PROBE_1(tavor_cq_resize_cq_cmd_fail,
			    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
			TAVOR_TNF_EXIT(tavor_cq_resize);
			return (ibc_get_ci_failure(0));
		}
	}

	/*
	 * The CQ resize attempt was successful.  Before dropping the CQ lock,
	 * copy all of the CQEs from the "old" CQ into the "new" CQ.  Note:
	 * the Tavor firmware guarantees us that sufficient space is set aside
	 * in the "new" CQ to handle any un-polled CQEs from the "old" CQ.
	 * The two parameters to this helper function ("old_cons_indx" and
	 * "new_prod_indx") essentially indicate the starting index and number
	 * of any CQEs that might remain in the "old" CQ memory.
	 */
	tavor_cq_resize_helper(cq, buf, old_cons_indx, new_prod_indx);

	/* Sync entire "new" CQ for use by hardware (if necessary) */
	if (cq_sync) {
		(void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
		    new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Update the Tavor Completion Queue handle with all the new
	 * information.  At the same time, save away all the necessary
	 * information for freeing up the old resources
	 */
	mr_old		 = cq->cq_mrhdl;
	old_cqinfo	 = cq->cq_cqinfo;
	cq->cq_cqinfo	 = new_cqinfo;
	cq->cq_consindx	 = 0;
	cq->cq_buf	 = buf;
	cq->cq_bufsz	 = (1 << log_cq_size);
	cq->cq_mrhdl	 = mr;
	cq->cq_sync	 = cq_sync;

	/*
	 * If the "old" CQ was a user-mappable CQ that is currently mmap()'d
	 * out to a user process, then we need to call devmap_devmem_remap()
	 * to invalidate the mapping to the CQ memory.  We also need to
	 * invalidate the CQ tracking information for the user mapping.
	 */
	if ((cq->cq_is_umap) && (cq->cq_umap_dhp != NULL)) {
		maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
		status = devmap_devmem_remap(cq->cq_umap_dhp,
		    state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size, maxprot,
		    DEVMAP_MAPPING_INVALID, NULL);
		if (status != DDI_SUCCESS) {
			mutex_exit(&cq->cq_lock);
			TAVOR_WARNING(state, "failed in CQ memory "
			    "devmap_devmem_remap()");
			TAVOR_TNF_EXIT(tavor_cq_resize);
			return (ibc_get_ci_failure(0));
		}
		cq->cq_umap_dhp = (devmap_cookie_t)NULL;
	}

	/*
	 * Drop the CQ lock now.  The only thing left to do is to free up
	 * the old resources.
	 */
	mutex_exit(&cq->cq_lock);

	/*
	 * Deregister the memory for the old Completion Queue.  Note: We
	 * really can't return error here because we have no good way to
	 * cleanup.  Plus, the deregistration really shouldn't ever fail.
	 * So, if it does, it is an indication that something has gone
	 * seriously wrong.  So we print a warning message and return error
	 * (knowing, of course, that the "old" CQ memory will be leaked)
	 */
	status = tavor_mr_deregister(state, &mr_old, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister old CQ memory");
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "failed deregister mr (old)");
		goto cqresize_fail;
	}

	/* Free the memory for the old CQ */
	tavor_queue_free(state, &old_cqinfo);

	/*
	 * Fill in the return arguments (if necessary).  This includes the
	 * real new completion queue size.
	 */
	if (actual_size != NULL) {
		*actual_size = (1 << log_cq_size) - 1;
	}

	TAVOR_TNF_EXIT(tavor_cq_resize);
	return (DDI_SUCCESS);

cqresize_fail:
	TNF_PROBE_1(tavor_cq_resize_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_cq_resize);
	return (status);
}


/*
 * tavor_cq_notify()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_notify(tavor_state_t *state, tavor_cqhdl_t cq,
    ibt_cq_notify_flags_t flags)
{
	uint_t		cqnum;

	TAVOR_TNF_ENTER(tavor_cq_notify);

	/*
	 * Determine if we are trying to get the next completion or the next
	 * "solicited" completion.  Then hit the appropriate doorbell.
	 *
	 * NOTE: Please see the comment in tavor_event.c:tavor_eq_poll
	 * regarding why we do not have to do an extra PIO read here, and we
	 * will not lose an event after writing this doorbell.
	 */
	cqnum = cq->cq_cqnum;
	if (flags == IBT_NEXT_COMPLETION) {
		tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ, cqnum,
		    TAVOR_CQDB_DEFAULT_PARAM);

	} else if (flags == IBT_NEXT_SOLICITED) {
		tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ_SOLICIT,
		    cqnum, TAVOR_CQDB_DEFAULT_PARAM);

	} else {
		TNF_PROBE_1(tavor_cq_notify_invflags_fail, TAVOR_TNF_ERROR, "",
		    tnf_int, flags, flags);
		TAVOR_TNF_EXIT(tavor_cq_notify);
		return (IBT_CQ_NOTIFY_TYPE_INVALID);
	}

	TAVOR_TNF_EXIT(tavor_cq_notify);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_poll()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_poll(tavor_state_t *state, tavor_cqhdl_t cq, ibt_wc_t *wc_p,
    uint_t num_wc, uint_t *num_polled)
{
	tavor_hw_cqe_t	*cqe;
	uint32_t	cons_indx, wrap_around_mask;
	uint32_t	polled_cnt, num_to_increment;
	int		status;

	TAVOR_TNF_ENTER(tavor_cq_poll);

	/*
	 * Check for user-mappable CQ memory.  Note:  We do not allow kernel
	 * clients to poll CQ memory that is accessible directly by the user.
	 * If the CQ memory is user accessible, then return an error.
	 */
	if (cq->cq_is_umap) {
		TNF_PROBE_0(tavor_cq_poll_inv_usrmapped_type,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_cq_poll);
		return (IBT_CQ_HDL_INVALID);
	}

	mutex_enter(&cq->cq_lock);

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_bufsz - 1);
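
	/*
	 * For example (sketch): with cq_bufsz == 128 the mask is 0x7F, so
	 * advancing the consumer index past the last entry wraps it back
	 * to zero: (127 + 1) & 0x7F == 0.
	 */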

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_buf[cons_indx];

	/* Sync the current CQE to read */
	tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);

	/*
	 * Keep pulling entries from the CQ until we find an entry owned by
	 * the hardware.  As long as the CQEs are owned by SW, process each
	 * entry by calling tavor_cq_cqe_consume() and updating the CQ
	 * consumer index.  Note:  We only update the consumer index if
	 * tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.  Otherwise,
	 * it indicates that we are going to "recycle" the CQE (probably
	 * because it is an error CQE and corresponds to more than one
	 * completion).
	 */
	polled_cnt = 0;
	while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
		status = tavor_cq_cqe_consume(state, cq, cqe,
		    &wc_p[polled_cnt++]);
		if (status == TAVOR_CQ_SYNC_AND_DB) {
			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cq, cqe);

			/* Sync the current CQE for device */
			tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORDEV);

			/* Increment the consumer index */
			cons_indx = (cons_indx + 1) & wrap_around_mask;

			/* Update the pointer to the next CQ entry */
			cqe = &cq->cq_buf[cons_indx];

			/* Sync the next CQE to read */
			tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
		}

		/*
		 * If we have run out of space to store work completions,
		 * then stop and return the ones we have pulled off the CQ.
		 */
		if (polled_cnt >= num_wc) {
			break;
		}
	}

	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we have, for example,
	 * pulled from a CQE that we are still in the process of "recycling"
	 * for error purposes, then we would not update the consumer index.
	 */
	if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Post doorbell to update the consumer index.  Doorbell
		 * value indicates number of entries consumed (minus 1)
		 */
		if (cons_indx > cq->cq_consindx) {
			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
		} else {
			num_to_increment = ((cons_indx + cq->cq_bufsz) -
			    cq->cq_consindx) - 1;
		}
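
		/*
		 * Worked example (sketch): with cq_bufsz == 128, an old
		 * consumer index of 120 and a new index of 5 gives
		 * ((5 + 128) - 120) - 1 == 12, i.e. thirteen entries were
		 * consumed and the doorbell reports that count minus one.
		 */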
		cq->cq_consindx = cons_indx;
		tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
		    cq->cq_cqnum, num_to_increment);

	} else if (polled_cnt == 0) {
		/*
		 * If the CQ is empty, we can try to free up some of the WRID
		 * list containers.  See tavor_wr.c for more details on this
		 * operation.
		 */
		tavor_wrid_cq_reap(cq);
	}

	mutex_exit(&cq->cq_lock);

	/* Set "num_polled" (if necessary) */
	if (num_polled != NULL) {
		*num_polled = polled_cnt;
	}

	/* Set CQ_EMPTY condition if needed, otherwise return success */
	if (polled_cnt == 0) {
		status = IBT_CQ_EMPTY;
	} else {
		status = DDI_SUCCESS;
	}

	/*
	 * Check if the system is currently panicking.  If it is, then call
	 * the Tavor interrupt service routine.  This step is necessary here
	 * because we might be in a polled I/O mode and without the call to
	 * tavor_isr() - and its subsequent calls to poll and rearm each
	 * event queue - we might overflow our EQs and render the system
	 * unable to sync/dump.
	 */
	if (ddi_in_panic() != 0) {
		(void) tavor_isr((caddr_t)state, (caddr_t)NULL);
	}

	TAVOR_TNF_EXIT(tavor_cq_poll);
	return (status);
}


/*
 * tavor_cq_handler()
 *    Context: Only called from interrupt context
 */
int
tavor_cq_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
	tavor_cqhdl_t		cq;
	uint_t			cqnum;
	uint_t			eqe_evttype;

	TAVOR_TNF_ENTER(tavor_cq_handler);

	eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

	ASSERT(eqe_evttype == TAVOR_EVT_COMPLETION ||
	    eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

	if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
		TNF_PROBE_0(tavor_cq_handler_eq_overflow_condition,
		    TAVOR_TNF_ERROR, "");
		tavor_eq_overflow_handler(state, eq, eqe);

		TAVOR_TNF_EXIT(tavor_cq_handler);
		return (DDI_FAILURE);
	}

	/* Get the CQ handle from the CQ number in the event descriptor */
	cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
	cq = tavor_cqhdl_from_cqnum(state, cqnum);

	/*
	 * Post the EQ doorbell to move the CQ to the "disarmed" state.
	 * This operation is to enable subsequent CQ doorbells (e.g. those
	 * that can be rung by tavor_cq_notify() above) to rearm the CQ.
	 */
	tavor_eq_doorbell(state, TAVOR_EQDB_DISARM_CQ, eq->eq_eqnum, cqnum);

	/*
	 * If the CQ handle is NULL, it is probably an indication that the
	 * CQ has already been freed, in which case we should not deliver
	 * this event.
	 *
	 * We also check that the CQ number in the handle is the
	 * same as the CQ number in the event queue entry.  This
	 * extra check allows us to handle the case where a CQ was
	 * freed and then allocated again in the time it took to
	 * handle the event queue processing.  By constantly incrementing
	 * the non-constrained portion of the CQ number every time
	 * a new CQ is allocated, we mitigate (somewhat) the chance
	 * that a stale event could be passed to the client's CQ
	 * handler.
	 *
	 * Lastly, we check if "ts_ibtfpriv" is NULL.  If it is, it means
	 * that we have either received this event before we finished
	 * attaching to the IBTF or received it while we are in the
	 * process of detaching.
	 */
	if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
	    (state->ts_ibtfpriv != NULL)) {
		TAVOR_DO_IBTF_CQ_CALLB(state, cq);
	} else {
		TNF_PROBE_2(tavor_cq_handler_dropped_event,
		    TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum,
		    tnf_uint, hdl_cqnum, cqnum);
	}

	TAVOR_TNF_EXIT(tavor_cq_handler);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_err_handler()
 *    Context: Only called from interrupt context
 */
int
tavor_cq_err_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
	tavor_cqhdl_t		cq;
	uint_t			cqnum;
	ibc_async_event_t	event;
	ibt_async_code_t	type;
	uint_t			eqe_evttype;

	TAVOR_TNF_ENTER(tavor_cq_err_handler);

	eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

	ASSERT(eqe_evttype == TAVOR_EVT_CQ_ERRORS ||
	    eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

	if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
		TNF_PROBE_0(tavor_cq_err_handler_eq_overflow_condition,
		    TAVOR_TNF_ERROR, "");
		tavor_eq_overflow_handler(state, eq, eqe);

		TAVOR_TNF_EXIT(tavor_cq_err_handler);
		return (DDI_FAILURE);
	}

	/* cmn_err(CE_CONT, "CQ Error handler\n"); */

	/* Get the CQ handle from the CQ number in the event descriptor */
	cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
	cq = tavor_cqhdl_from_cqnum(state, cqnum);

	/*
	 * If the CQ handle is NULL, it is probably an indication that the
	 * CQ has already been freed, in which case we should not deliver
	 * this event.
	 *
	 * We also check that the CQ number in the handle is the
	 * same as the CQ number in the event queue entry.  This
	 * extra check allows us to handle the case where a CQ was
	 * freed and then allocated again in the time it took to
	 * handle the event queue processing.  By constantly incrementing
	 * the non-constrained portion of the CQ number every time
	 * a new CQ is allocated, we mitigate (somewhat) the chance
	 * that a stale event could be passed to the client's CQ
	 * handler.
	 *
	 * And then we check if "ts_ibtfpriv" is NULL.  If it is, it means
	 * that we have either received this event before we finished
	 * attaching to the IBTF or received it while we are in the
	 * process of detaching.
	 */
	if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
	    (state->ts_ibtfpriv != NULL)) {
		event.ev_cq_hdl = (ibt_cq_hdl_t)cq->cq_hdlrarg;
		type		= IBT_ERROR_CQ;

		TAVOR_DO_IBTF_ASYNC_CALLB(state, type, &event);
	} else {
		TNF_PROBE_2(tavor_cq_err_handler_dropped_event,
		    TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum,
		    tnf_uint, hdl_cqnum, cqnum);
	}

	TAVOR_TNF_EXIT(tavor_cq_err_handler);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_refcnt_inc(tavor_cqhdl_t cq, uint_t is_special)
{
	/*
	 * Increment the completion queue's reference count.  Note: In order
	 * to ensure compliance with IBA C11-15, we must ensure that a given
	 * CQ is not used for both special (SMI/GSI) QP and non-special QP.
	 * This is accomplished here by keeping track of how the referenced
	 * CQ is being used.
	 */
	mutex_enter(&cq->cq_lock);
	TNF_PROBE_1_DEBUG(tavor_cq_refcnt_inc, TAVOR_TNF_TRACE, "",
	    tnf_uint, refcnt, cq->cq_refcnt);
	if (cq->cq_refcnt == 0) {
		cq->cq_is_special = is_special;
	} else {
		if (cq->cq_is_special != is_special) {
			mutex_exit(&cq->cq_lock);
			return (DDI_FAILURE);
		}
	}
	cq->cq_refcnt++;
	mutex_exit(&cq->cq_lock);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_cq_refcnt_dec(tavor_cqhdl_t cq)
{
	/* Decrement the completion queue's reference count */
	mutex_enter(&cq->cq_lock);
	cq->cq_refcnt--;
	TNF_PROBE_1_DEBUG(tavor_cq_refcnt_dec, TAVOR_TNF_TRACE, "",
	    tnf_uint, refcnt, cq->cq_refcnt);
	mutex_exit(&cq->cq_lock);
}


/*
 * tavor_cq_doorbell()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd, uint32_t cqn,
    uint32_t cq_param)
{
	uint64_t	doorbell = 0;

	/* Build the doorbell from the parameters */
	doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
	    ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;
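
	/*
	 * The single 64-bit doorbell word thus carries the command code
	 * (e.g. notify/arm or consumer-index increment), the target CQ
	 * number, and the command-specific parameter (for
	 * TAVOR_CQDB_INCR_CONSINDX, the number of entries consumed minus
	 * one; see tavor_cq_poll() above).
	 */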

	TNF_PROBE_1_DEBUG(tavor_cq_doorbell, TAVOR_TNF_TRACE, "",
	    tnf_ulong, doorbell, doorbell);

	/* Write the doorbell to UAR */
	TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->cq,
	    doorbell);
}


/*
 * tavor_cqhdl_from_cqnum()
 *    Context: Can be called from interrupt or base context.
 *
 *    This routine is important because changing the unconstrained
 *    portion of the CQ number is critical to the detection of a
 *    potential race condition in the CQ handler code (i.e. the case
 *    where a CQ is freed and alloc'd again before an event for the
 *    "old" CQ can be handled).
 *
 *    While this is not a perfect solution (not sure that one exists)
 *    it does help to mitigate the chance that this race condition will
 *    cause us to deliver a "stale" event to the new CQ owner.  Note:
 *    this solution does not scale well because the number of constrained
 *    bits increases (and, hence, the number of unconstrained bits
 *    decreases) as the number of supported CQs grows.  For small and
 *    intermediate values, it should hopefully provide sufficient
 *    protection.
 */
tavor_cqhdl_t
tavor_cqhdl_from_cqnum(tavor_state_t *state, uint_t cqnum)
{
	uint_t	cqindx, cqmask;

	/* Calculate the CQ table index from the cqnum */
	cqmask = (1 << state->ts_cfg_profile->cp_log_num_cq) - 1;
	cqindx = cqnum & cqmask;
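
	/*
	 * Example (hypothetical configuration): with cp_log_num_cq == 16,
	 * cqmask is 0xFFFF, so a CQ number of 0x12345 selects table index
	 * 0x2345.  The unconstrained upper bits are ignored here but are
	 * compared against cq_cqnum by the event handlers above.
	 */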
	return (state->ts_cqhdl[cqindx]);
}


/*
 * tavor_cq_cqe_consume()
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
{
	uint_t		flags, type, opcode, qpnum, qp1_indx;
	int		status;

	TAVOR_TNF_ENTER(tavor_cq_cqe_consume);

	/*
	 * Determine if this is an "error" CQE by examining "opcode".  If it
	 * is an error CQE, then call tavor_cq_errcqe_consume() and return
	 * whatever status it returns.  Otherwise, this is a successful
	 * completion.
	 */
	opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
	if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
	    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
		status = tavor_cq_errcqe_consume(state, cq, cqe, wc);
		TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
		return (status);
	}

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = tavor_wrid_get_entry(cq, cqe, NULL);

	/*
	 * Parse the CQE opcode to determine completion type.  This will set
	 * not only the type of the completion, but also any flags that might
	 * be associated with it (e.g. whether immediate data is present).
	 */
	flags = IBT_WC_NO_FLAGS;
	if (TAVOR_CQE_SENDRECV_GET(cq, cqe) != TAVOR_COMPLETION_RECV) {

		/* Send CQE */
		switch (opcode) {
		case TAVOR_CQE_SND_RDMAWR_IMM:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			/* FALLTHROUGH */
		case TAVOR_CQE_SND_RDMAWR:
			type = IBT_WRC_RDMAW;
			break;

		case TAVOR_CQE_SND_SEND_IMM:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			/* FALLTHROUGH */
		case TAVOR_CQE_SND_SEND:
			type = IBT_WRC_SEND;
			break;

		case TAVOR_CQE_SND_RDMARD:
			type = IBT_WRC_RDMAR;
			break;

		case TAVOR_CQE_SND_ATOMIC_CS:
			type = IBT_WRC_CSWAP;
			break;

		case TAVOR_CQE_SND_ATOMIC_FA:
			type = IBT_WRC_FADD;
			break;

		case TAVOR_CQE_SND_BIND_MW:
			type = IBT_WRC_BIND;
			break;

		default:
			TAVOR_WARNING(state, "unknown send CQE type");
			wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
			TNF_PROBE_1(tavor_cq_cqe_consume_unknown_send_type,
			    TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
			TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	} else {

		/* Receive CQE */
		switch (opcode & 0x1F) {
		case TAVOR_CQE_RCV_RECV_IMM:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV_IMM2:
			/*
			 * Note:  According to the Tavor PRM, all QP1 recv
			 * completions look like the result of a Send with
			 * Immediate.  They are not, however (MADs are Send
			 * Only), so we need to check the QP number and set
			 * the flag only if it is non-QP1.
			 */
			qpnum	 = TAVOR_CQE_QPNUM_GET(cq, cqe);
			qp1_indx = state->ts_spec_qp1->tr_indx;
			if ((qpnum < qp1_indx) || (qpnum > qp1_indx + 1)) {
				flags |= IBT_WC_IMMED_DATA_PRESENT;
			}
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV2:
			type = IBT_WRC_RECV;
			break;

		case TAVOR_CQE_RCV_RDMAWR_IMM:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RDMAWR_IMM2:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			type = IBT_WRC_RECV_RDMAWI;
			break;

		default:
			TAVOR_WARNING(state, "unknown recv CQE type");
			wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
			TNF_PROBE_1(tavor_cq_cqe_consume_unknown_rcv_type,
			    TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
			TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	}
	wc->wc_type = type;

	/*
	 * Check for GRH, update the flags, then fill in "wc_flags" field
	 * in the work completion
	 */
	if (TAVOR_CQE_GRH_GET(cq, cqe) != 0) {
		flags |= IBT_WC_GRH_PRESENT;
	}
	wc->wc_flags = flags;

	/* If we got here, completion status must be success */
	wc->wc_status = IBT_WC_SUCCESS;

	/*
	 * Parse the remaining contents of the CQE into the work completion.
	 * This means filling in SL, QP number, SLID, immediate data, etc.
	 * Note:  Not all of these fields are valid in a given completion.
	 * Many of them depend on the actual type of completion.  So we fill
	 * in all of the fields and leave it up to the IBTF and consumer to
	 * sort out which are valid based on their context.
	 */
	wc->wc_sl	  = TAVOR_CQE_SL_GET(cq, cqe);
	wc->wc_immed_data = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
	wc->wc_qpn	  = TAVOR_CQE_DQPN_GET(cq, cqe);
	wc->wc_res_hash	  = 0;
	wc->wc_slid	  = TAVOR_CQE_DLID_GET(cq, cqe);
	wc->wc_ethertype  = (wc->wc_immed_data & 0xFFFF);
	wc->wc_pkey_ix	  = (wc->wc_immed_data >> 16);

	/*
	 * Depending on whether the completion was a receive or a send
	 * completion, fill in "bytes transferred" as appropriate.  Also,
	 * if necessary, fill in the "path bits" field.
	 */
	if (TAVOR_CQE_SENDRECV_GET(cq, cqe) == TAVOR_COMPLETION_RECV) {
		wc->wc_path_bits = TAVOR_CQE_PATHBITS_GET(cq, cqe);
		wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);

	} else if ((wc->wc_type == IBT_WRC_RDMAR) ||
	    (wc->wc_type == IBT_WRC_CSWAP) || (wc->wc_type == IBT_WRC_FADD)) {
		wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
	}

	TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
	return (TAVOR_CQ_SYNC_AND_DB);
}
1479 
1480 
1481 /*
1482  * tavor_cq_errcqe_consume()
1483  *    Context: Can be called from interrupt or base context.
1484  */
1485 static int
1486 tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
1487     tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
1488 {
1489 	uint64_t		next_wqeaddr;
1490 	uint32_t		imm_eth_pkey_cred;
1491 	uint_t			nextwqesize, dbd;
1492 	uint_t			doorbell_cnt, status;
1493 	tavor_wrid_entry_t	wre;
1494 
1495 	TAVOR_TNF_ENTER(tavor_cq_errcqe_consume);
1496 
1497 	/*
1498 	 * Fetch the Work Request ID using the information in the CQE.
1499 	 * See tavor_wr.c for more details.
1500 	 */
1501 	wc->wc_id = tavor_wrid_get_entry(cq, cqe, &wre);
1502 
1503 	/*
1504 	 * Parse the CQE opcode to determine completion type.  We know that
1505 	 * the CQE is an error completion, so we extract only the completion
1506 	 * status here.
1507 	 */
1508 	imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
1509 	status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
1510 	switch (status) {
1511 	case TAVOR_CQE_LOC_LEN_ERR:
1512 		status = IBT_WC_LOCAL_LEN_ERR;
1513 		break;
1514 
1515 	case TAVOR_CQE_LOC_OP_ERR:
1516 		status = IBT_WC_LOCAL_QP_OP_ERR;
1517 		break;
1518 
1519 	case TAVOR_CQE_LOC_PROT_ERR:
1520 		status = IBT_WC_LOCAL_PROTECT_ERR;
1521 		break;
1522 
1523 	case TAVOR_CQE_WR_FLUSHED_ERR:
1524 		status = IBT_WC_WR_FLUSHED_ERR;
1525 		break;
1526 
1527 	case TAVOR_CQE_MW_BIND_ERR:
1528 		status = IBT_WC_MEM_WIN_BIND_ERR;
1529 		break;
1530 
1531 	case TAVOR_CQE_BAD_RESPONSE_ERR:
1532 		status = IBT_WC_BAD_RESPONSE_ERR;
1533 		break;
1534 
1535 	case TAVOR_CQE_LOCAL_ACCESS_ERR:
1536 		status = IBT_WC_LOCAL_ACCESS_ERR;
1537 		break;
1538 
1539 	case TAVOR_CQE_REM_INV_REQ_ERR:
1540 		status = IBT_WC_REMOTE_INVALID_REQ_ERR;
1541 		break;
1542 
1543 	case TAVOR_CQE_REM_ACC_ERR:
1544 		status = IBT_WC_REMOTE_ACCESS_ERR;
1545 		break;
1546 
1547 	case TAVOR_CQE_REM_OP_ERR:
1548 		status = IBT_WC_REMOTE_OP_ERR;
1549 		break;
1550 
1551 	case TAVOR_CQE_TRANS_TO_ERR:
1552 		status = IBT_WC_TRANS_TIMEOUT_ERR;
1553 		break;
1554 
1555 	case TAVOR_CQE_RNRNAK_TO_ERR:
1556 		status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
1557 		break;
1558 
1559 	/*
1560 	 * The following error codes are not supported in the Tavor driver
1561 	 * as they relate only to Reliable Datagram completion statuses:
1562 	 *    case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
1563 	 *    case TAVOR_CQE_REM_INV_RD_REQ_ERR:
1564 	 *    case TAVOR_CQE_EEC_REM_ABORTED_ERR:
1565 	 *    case TAVOR_CQE_INV_EEC_NUM_ERR:
1566 	 *    case TAVOR_CQE_INV_EEC_STATE_ERR:
1567 	 *    case TAVOR_CQE_LOC_EEC_ERR:
1568 	 */
1569 
1570 	default:
1571 		TAVOR_WARNING(state, "unknown error CQE status");
1572 		status = IBT_WC_LOCAL_QP_OP_ERR;
1573 		TNF_PROBE_1(tavor_cq_errcqe_consume_unknown_status,
1574 		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
1575 		break;
1576 	}
1577 	wc->wc_status = status;
1578 
1579 	/*
1580 	 * Now we do all the checking that's necessary to handle completion
1581 	 * queue entry "recycling".
1582 	 *
1583 	 * It is not necessary here to try to sync the WQE as we are only
1584 	 * attempting to read from the Work Queue (and hardware does not
1585 	 * write to it).
1586 	 */
1587 
1588 	/*
1589 	 * We can get the doorbell info, WQE address, and size for the next
1590 	 * WQE from the "wre" (which was filled in above in the call to the
1591 	 * tavor_wrid_get_entry() routine).
1592 	 */
1593 	dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
1594 	next_wqeaddr = wre.wr_wqeaddrsz;
1595 	nextwqesize  = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;
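	/*
	 * The "wr_wqeaddrsz" value packs both pieces of information:  the
	 * next WQE's address in its upper bits and the size in the bits
	 * covered by TAVOR_WQE_NDS_MASK.  A size of zero is how the code
	 * below detects that no next WQE exists.
	 */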
1596 
1597 	/*
1598 	 * Get the doorbell count from the CQE.  This indicates how many
1599 	 * completions this one CQE represents.
1600 	 */
1601 	doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
1602 
1603 	/*
1604 	 * Determine if we're ready to consume this CQE yet or not.  If the
1605 	 * next WQE has size zero (i.e. no next WQE) or if the doorbell count
1606 	 * is down to zero, then this is the last/only completion represented
1607 	 * by the current CQE (return TAVOR_CQ_SYNC_AND_DB).  Otherwise, the
1608 	 * current CQE needs to be recycled (see below).
1609 	 */
1610 	if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
1611 		/*
1612 		 * Consume the CQE
1613 		 *    Return status to indicate that doorbell and sync may be
1614 		 *    necessary.
1615 		 */
1616 		TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
1617 		return (TAVOR_CQ_SYNC_AND_DB);
1618 
1619 	} else {
1620 		/*
1621 		 * Recycle the CQE for use in the next PollCQ() call
1622 		 *    Decrement the doorbell count, modify the error status,
1623 		 *    and update the WQE address and size (to point to the
1624 		 *    next WQE on the chain).  Put these updated entries back
1625 		 *    into the CQE.
1626 		 *    Despite the fact that we have updated the CQE, it is not
1627 		 *    necessary for us to attempt to sync this entry just yet
1628 		 *    as we have not changed the "hardware's view" of the
1629 		 *    entry (i.e. we have not modified the "owner" bit, which
1630 		 *    is all that the Tavor hardware really cares about).
1631 		 */
1632 		doorbell_cnt = doorbell_cnt - dbd;
1633 		TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cq, cqe,
1634 		    ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
1635 		    (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
1636 		TAVOR_CQE_WQEADDRSZ_SET(cq, cqe,
1637 		    TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));
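		/*
		 * To make the recycling concrete:  if this CQE arrived with
		 * a (hypothetical) doorbell count of 3 and the "dbd" bit
		 * set, the lines above rewrite it as a flushed-error CQE
		 * with a count of 2, still pointing to the next WQE on the
		 * chain.  The next PollCQ() call consumes it again, and so
		 * on until the count (and the chain) runs out.
		 */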
1638 
1639 		TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
1640 		return (TAVOR_CQ_RECYCLE_ENTRY);
1641 	}
1642 }
1643 
1644 
1645 /*
1646  * tavor_cqe_sync()
1647  *    Context: Can be called from interrupt or base context.
1648  */
1649 static void
1650 tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe, uint_t flag)
1651 {
1652 	ddi_dma_handle_t	dmahdl;
1653 	off_t			offset;
1654 	int			status;
1655 
1656 	TAVOR_TNF_ENTER(tavor_cqe_sync);
1657 
1658 	/* Determine if CQ needs to be synced or not */
1659 	if (cq->cq_sync == 0) {
1660 		TAVOR_TNF_EXIT(tavor_cqe_sync);
1661 		return;
1662 	}
1663 
1664 	/* Get the DMA handle from CQ context */
1665 	dmahdl = cq->cq_mrhdl->mr_bindinfo.bi_dmahdl;
1666 
1667 	/* Calculate offset of next CQE */
1668 	offset = (off_t)((uintptr_t)cqe - (uintptr_t)&cq->cq_buf[0]);
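	/*
	 * The pointer subtraction above yields the byte offset of this one
	 * CQE within the CQ buffer, so the ddi_dma_sync() below covers
	 * exactly one sizeof (tavor_hw_cqe_t) worth of memory rather than
	 * the entire queue.
	 */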
1669 	status = ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_cqe_t), flag);
1670 	if (status != DDI_SUCCESS) {
1671 		TNF_PROBE_0(tavor_cqe_sync_getnextentry_fail,
1672 		    TAVOR_TNF_ERROR, "");
1673 		TAVOR_TNF_EXIT(tavor_cqe_sync);
1674 		return;
1675 	}
1676 
1677 	TAVOR_TNF_EXIT(tavor_cqe_sync);
1678 }
1679 
1680 
1681 /*
1682  * tavor_cq_resize_helper()
1683  *    Context: Can be called only from user or kernel context.
1684  */
1685 static void
1686 tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
1687     uint32_t old_cons_indx, uint32_t num_newcqe)
1688 {
1689 	tavor_hw_cqe_t	*old_cqe, *new_cqe;
1690 	uint32_t	new_cons_indx, wrap_around_mask;
1691 	int		i;
1692 
1693 	TAVOR_TNF_ENTER(tavor_cq_resize_helper);
1694 
1695 	ASSERT(MUTEX_HELD(&cq->cq_lock));
1696 
1697 	/* Get the consumer index */
1698 	new_cons_indx = 0;
1699 
1700 	/*
1701 	 * Calculate the wrap around mask.  Note: This operation only works
1702 	 * because all Tavor completion queues have power-of-2 sizes
1703 	 */
1704 	wrap_around_mask = (cq->cq_bufsz - 1);
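	/*
	 * For example, with a (hypothetical) 256-entry CQ the mask is 0xFF,
	 * so incrementing a consumer index of 255 yields (255 + 1) & 0xFF,
	 * i.e. zero, and the index wraps back to the start of the buffer.
	 */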
1705 
1706 	/*
1707 	 * Calculate the pointers to the first CQ entry (in the "old" CQ)
1708 	 * and the first CQ entry in the "new" CQ
1709 	 */
1710 	old_cqe = &cq->cq_buf[old_cons_indx];
1711 	new_cqe = &new_cqbuf[new_cons_indx];
1712 
1713 	/* Sync entire "old" CQ for use by software (if necessary). */
1714 	if (cq->cq_sync) {
1715 		(void) ddi_dma_sync(cq->cq_mrhdl->mr_bindinfo.bi_dmahdl,
1716 		    0, cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORCPU);
1717 	}
1718 
1719 	/*
1720 	 * Keep pulling entries from the "old" CQ until we find an entry owned
1721 	 * by the hardware.  Process each entry by copying it into the "new"
1722 	 * CQ and updating respective indices and pointers in the "old" CQ.
1723 	 */
1724 	for (i = 0; i < num_newcqe; i++) {
1725 
1726 		/* Copy this old CQE into the "new_cqe" pointer */
1727 		bcopy(old_cqe, new_cqe, sizeof (tavor_hw_cqe_t));
1728 
1729 		/* Increment the consumer index (for both CQs) */
1730 		old_cons_indx = (old_cons_indx + 1) & wrap_around_mask;
1731 		new_cons_indx = (new_cons_indx + 1);
1732 
1733 		/* Update the pointer to the next CQ entry */
1734 		old_cqe = &cq->cq_buf[old_cons_indx];
1735 		new_cqe = &new_cqbuf[new_cons_indx];
1736 	}
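	/*
	 * Note that 'new_cons_indx' is never masked above:  copying starts
	 * at slot zero of the new buffer, and the caller is expected to
	 * pass a 'num_newcqe' that fits within it, so no wrap-around can
	 * occur on the destination side.
	 */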
1737 
1738 	TAVOR_TNF_EXIT(tavor_cq_resize_helper);
1739 }
1740 
1741 
1742 /*
1743  * tavor_cq_numcalc()
1744  *    Context: Can be called from interrupt or base context.
1745  */
1746 static void
1747 tavor_cq_numcalc(tavor_state_t *state, uint32_t indx, uint32_t *key)
1748 {
1749 	uint32_t	tmp, log_num_cq;
1750 
1751 	/*
1752 	 * Generate a simple key from counter.  Note:  We increment this
1753 	 * static variable _intentionally_ without any kind of mutex around
1754 	 * it.  First, single-threading all operations through a single lock
1755 	 * would be a bad idea (from a performance point-of-view).  Second,
1756 	 * the upper "unconstrained" bits don't really have to be unique
1757 	 * because the lower bits are guaranteed to be (although we do make a
1758 	 * best effort to ensure that they are).  Third, the window for the
1759 	 * race (where both threads read and update the counter at the same
1760 	 * time) is incredibly small.
1761 	 */
1762 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(tavor_debug_cqnum_cnt))
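	/*
	 * To make the construction below concrete (with hypothetical
	 * numbers):  if cp_log_num_cq is 16, the counter is 2, and "indx"
	 * is 0x1234, the result is (2 << 16) | 0x1234, or 0x21234, subject
	 * to the final TAVOR_CQ_MAXNUMBER_MSK masking.  Only the low
	 * log_num_cq bits (the index itself) are guaranteed to be unique.
	 */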
1763 	log_num_cq = state->ts_cfg_profile->cp_log_num_cq;
1764 	tmp = (tavor_debug_cqnum_cnt++) << log_num_cq;
1765 	*key = (tmp | indx) & TAVOR_CQ_MAXNUMBER_MSK;
1766 }
1767 
1768 /*
1769  * tavor_cq_srq_entries_flush()
1770  *    Context: Can be called from interrupt or base context.
1771  */
1772 void
1773 tavor_cq_srq_entries_flush(tavor_state_t *state, tavor_qphdl_t qp)
1774 {
1775 	tavor_cqhdl_t		cq;
1776 	tavor_workq_hdr_t	*wqhdr;
1777 	tavor_hw_cqe_t		*cqe;
1778 	tavor_hw_cqe_t		*next_cqe;
1779 	uint32_t		cons_indx, tail_cons_indx, wrap_around_mask;
1780 	uint32_t		new_indx, check_indx, indx;
1781 	uint32_t		num_to_increment;
1782 	int			cqe_qpnum, cqe_type;
1783 	int			outstanding_cqes, removed_cqes;
1784 	int			i;
1785 
1786 	ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock));
1787 
1788 	cq = qp->qp_rq_cqhdl;
1789 	wqhdr = qp->qp_rq_wqhdr;
1790 
1791 	ASSERT(wqhdr->wq_wrid_post != NULL);
1792 	ASSERT(wqhdr->wq_wrid_post->wl_srq_en != 0);
1793 
1794 	/*
1795 	 * Check for user-mapped CQ memory.  Note:  We do not allow kernel
1796 	 * clients to modify any userland-mapped CQ.  If the CQ is
1797 	 * user-mapped, then we simply return here, and this "flush" function
1798 	 * becomes a NO-OP in this case.
1799 	 */
1800 	if (cq->cq_is_umap) {
1801 		return;
1802 	}
1803 
1804 	/* Get the consumer index */
1805 	cons_indx = cq->cq_consindx;
1806 
1807 	/*
1808 	 * Calculate the wrap around mask.  Note: This operation only works
1809 	 * because all Tavor completion queues have power-of-2 sizes
1810 	 */
1811 	wrap_around_mask = (cq->cq_bufsz - 1);
1812 
1813 	/* Calculate the pointer to the first CQ entry */
1814 	cqe = &cq->cq_buf[cons_indx];
1815 
1816 	/* Sync the current CQE to read */
1817 	tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
1818 
1819 	/*
1820 	 * Loop through the CQ looking for entries owned by software.  If an
1821 	 * entry is owned by software then we increment an 'outstanding_cqes'
1822 	 * count to know how many entries total we have on our CQ.  We use this
1823 	 * value further down to know how many entries to loop through when
1824 	 * looking for CQEs that carry this QP's number.
1825 	 */
1826 	outstanding_cqes = 0;
1827 	tail_cons_indx = cons_indx;
1828 	while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
1829 		/* increment total cqes count */
1830 		outstanding_cqes++;
1831 
1832 		/* increment the consumer index */
1833 		tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;
1834 
1835 		/* update the pointer to the next cq entry */
1836 		cqe = &cq->cq_buf[tail_cons_indx];
1837 
1838 		/* sync the next cqe to read */
1839 		tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
1840 	}
1841 
1842 	/*
1843 	 * Using the 'tail_cons_indx' that was just set, we now know the
1844 	 * total number of CQEs present.  Set the 'check_indx' and the
1845 	 * 'new_indx' to the last entry identified by 'tail_cons_indx'
1846 	 */
1847 	check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;
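	/*
	 * Note that the loop below walks *backward* from the last
	 * software-owned entry toward the original consumer index.
	 * Entries that belong to this QP are released back to the SRQ
	 * free list, while all other entries are compacted toward the
	 * tail (via 'new_indx') so that no unrelated completion is lost.
	 */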
1848 
1849 	for (i = 0; i < outstanding_cqes; i++) {
1850 		cqe = &cq->cq_buf[check_indx];
1851 
1852 		/* Grab QP number from CQE */
1853 		cqe_qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
1854 		cqe_type = TAVOR_CQE_SENDRECV_GET(cq, cqe);
1855 
1856 		/*
1857 		 * If the QP number is the same in the CQE as the QP that we
1858 		 * have on this SRQ, then we must free up the entry off the
1859 		 * SRQ.  We also make sure that the completion type is of the
1860 		 * 'TAVOR_COMPLETION_RECV' type.  So any send completions on
1861 		 * this CQ will be left as-is.  The handling of returning
1862 		 * entries back to HW ownership happens further down.
1863 		 */
1864 		if (cqe_qpnum == qp->qp_qpnum &&
1865 		    cqe_type == TAVOR_COMPLETION_RECV) {
1866 
1867 			/* Add back to SRQ free list */
1868 			(void) tavor_wrid_find_match_srq(wqhdr->wq_wrid_post,
1869 			    cq, cqe);
1870 		} else {
1871 			/* Do Copy */
1872 			if (check_indx != new_indx) {
1873 				next_cqe = &cq->cq_buf[new_indx];
1874 
1875 				/*
1876 				 * Copy the CQE into the "next_cqe"
1877 				 * pointer.
1878 				 */
1879 				bcopy(cqe, next_cqe, sizeof (tavor_hw_cqe_t));
1880 			}
1881 			new_indx = (new_indx - 1) & wrap_around_mask;
1882 		}
1883 		/* Move index to next CQE to check */
1884 		check_indx = (check_indx - 1) & wrap_around_mask;
1885 	}
1886 
1887 	/* Initialize removed cqes count */
1888 	removed_cqes = 0;
1889 
1890 	/* If an entry was removed */
1891 	if (check_indx != new_indx) {
1892 
1893 		/*
1894 		 * Set current pointer back to the beginning consumer index.
1895 		 * At this point, all unclaimed entries have been copied to the
1896 		 * index specified by 'new_indx'.  This 'new_indx' will be used
1897 		 * as the new consumer index after we mark all freed entries as
1898 		 * having HW ownership.  We do that here.
1899 		 */
1900 
1901 		/* Loop through all entries until we reach our new pointer */
1902 		for (indx = cons_indx; indx <= new_indx;
1903 		    indx = (indx + 1) & wrap_around_mask) {
1904 			removed_cqes++;
1905 			cqe = &cq->cq_buf[indx];
1906 
1907 			/* Reset entry to hardware ownership */
1908 			TAVOR_CQE_OWNER_SET_HW(cq, cqe);
1909 		}
1910 	}
1911 
1912 	/*
1913 	 * Update consumer index to be the 'new_indx'.  This moves it past all
1914 	 * removed entries.  Because 'new_indx' is pointing to the last
1915 	 * previously valid SW owned entry, we add 1 to point the cons_indx to
1916 	 * the first HW owned entry.
1917 	 */
1918 	cons_indx = (new_indx + 1) & wrap_around_mask;
1919 
1920 	/*
1921 	 * Now we only ring the doorbell (to update the consumer index) if
1922 	 * we've actually consumed a CQ entry.  If we found no QP number
1923 	 * matches above, then we would not have removed anything.  So only if
1924 	 * something was removed do we ring the doorbell.
1925 	 */
1926 	if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
1927 		/*
1928 		 * Post doorbell to update the consumer index.  Doorbell
1929 		 * value indicates number of entries consumed (minus 1)
1930 		 */
1931 		if (cons_indx > cq->cq_consindx) {
1932 			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
1933 		} else {
1934 			num_to_increment = ((cons_indx + cq->cq_bufsz) -
1935 			    cq->cq_consindx) - 1;
1936 		}
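		/*
		 * A small worked example with a (hypothetical) 256-entry
		 * CQ:  if cq_consindx was 250 and the new cons_indx is 2,
		 * the computation above gives ((2 + 256) - 250) - 1, or 7,
		 * i.e. eight entries were consumed and the doorbell value
		 * reports "entries minus one" per the convention above.
		 */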
1937 		cq->cq_consindx = cons_indx;
1938 
1939 		tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
1940 		    cq->cq_cqnum, num_to_increment);
1941 	}
1942 }
1943