/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_cq.c
 *    Tavor Completion Queue Processing Routines
 *
 *    Implements all the routines necessary for allocating, freeing, resizing,
 *    and handling the completion type events that the Tavor hardware can
 *    generate.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>

#include <sys/ib/adapters/tavor/tavor.h>

static void tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd,
    uint32_t cqn, uint32_t cq_param);
#pragma inline(tavor_cq_doorbell)
static int tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
static int tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
static void tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
    uint_t flag);
static void tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
    uint32_t old_cons_indx, uint32_t num_newcqe);

/*
 * tavor_cq_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_cq_alloc(tavor_state_t *state, ibt_cq_hdl_t ibt_cqhdl,
    ibt_cq_attr_t *cq_attr, uint_t *actual_size, tavor_cqhdl_t *cqhdl,
    uint_t sleepflag)
{
	tavor_rsrc_t		*cqc, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_hw_cqc_t		cqc_entry;
	tavor_cqhdl_t		cq;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	op;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_hw_cqe_t		*buf;
	uint64_t		addr, value;
	uint32_t		log_cq_size, lkey, uarpg;
	uint_t			dma_xfer_mode, cq_sync, cq_is_umap;
	int			status, i, flag;

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq_attr))

	/*
	 * Determine whether CQ is being allocated for userland access or
	 * whether it is being allocated for kernel access.  If the CQ is
	 * being allocated for userland access, then look up the UAR doorbell
	 * page number for the current process.  Note:  If this is not found
	 * (e.g. if the process has not previously open()'d the Tavor driver),
	 * then an error is returned.
	 */
	cq_is_umap = (cq_attr->cq_flags & IBT_CQ_USER_MAP) ? 1 : 0;
	if (cq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
		if (status != DDI_SUCCESS) {
			goto cqalloc_fail;
		}
		uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
	}

	/* Use the internal protection domain (PD) for setting up CQs */
	pd = state->ts_pdhdl_internal;

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate a CQ context entry.  This will be filled in with all
	 * the necessary parameters to define the Completion Queue.  Ownership
	 * will then be passed to the hardware in the final step
	 * below.  If we fail here, we must undo the protection domain
	 * reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_CQC, 1, sleepflag, &cqc);
	if (status != DDI_SUCCESS) {
		goto cqalloc_fail1;
	}

	/*
	 * Allocate the software structure for tracking the completion queue
	 * (i.e. the Tavor Completion Queue handle).  If we fail here, we must
	 * undo the protection domain reference count and the previous
	 * resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_CQHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		goto cqalloc_fail2;
	}
	cq = (tavor_cqhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))
	cq->cq_is_umap = cq_is_umap;

	/* Use the index as CQ number */
	cq->cq_cqnum = cqc->tr_indx;

	/*
	 * If this will be a user-mappable CQ, then allocate an entry for
	 * the "userland resources database".  This will later be added to
	 * the database (after all further CQ operations are successful).
	 * If we fail here, we must undo the reference counts and the
	 * previous resource allocation.
	 */
	if (cq->cq_is_umap) {
		umapdb = tavor_umap_db_alloc(state->ts_instance, cq->cq_cqnum,
		    MLNX_UMAP_CQMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			goto cqalloc_fail3;
		}
	}

	/*
	 * Calculate the appropriate size for the completion queue.
	 * Note:  All Tavor CQs must be a power-of-2 minus 1 in size.  Also
	 * they may not be any smaller than TAVOR_CQ_MIN_SIZE.  This step is
	 * to round the requested size up to the next highest power-of-2
	 */
	cq_attr->cq_size = max(cq_attr->cq_size, TAVOR_CQ_MIN_SIZE);
	log_cq_size = highbit(cq_attr->cq_size);
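
	/*
	 * Worked example (a sketch, assuming the standard illumos highbit()
	 * semantics, where highbit(n) is the 1-based index of the highest
	 * set bit): a request for 4 CQEs yields log_cq_size = 3, so the
	 * buffer holds (1 << 3) = 8 entries, of which (8 - 1) = 7 are
	 * usable -- the smallest power-of-2-minus-1 size satisfying the
	 * request.
	 */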

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits)
	 */
	if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
		goto cqalloc_fail4;
	}

	/*
	 * Allocate the memory for Completion Queue.
	 *
	 * Note: Although we use the common queue allocation routine, we
	 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
	 * kernel system memory) for kernel CQs because it would be
	 * inefficient to have CQs located in DDR memory.  This is primarily
	 * because CQs are read from (by software) more than they are written
	 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
	 * user-mappable CQs for a similar reason.)
	 * It is also worth noting that, unlike Tavor QP work queues,
	 * completion queues do not have the same strict alignment
	 * requirements.  It is sufficient for the CQ memory to be both
	 * aligned to and bound to addresses which are a multiple of CQE size.
	 */
	cq->cq_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
	cq->cq_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
	cq->cq_cqinfo.qa_bind_align  = sizeof (tavor_hw_cqe_t);
	if (cq->cq_is_umap) {
		cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
	}
	status = tavor_queue_alloc(state, &cq->cq_cqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		goto cqalloc_fail4;
	}
	buf = (tavor_hw_cqe_t *)cq->cq_cqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Initialize each of the Completion Queue Entries (CQE) by setting
	 * their ownership to hardware ("owner" bit set to HW).  This is in
	 * preparation for the final transfer of ownership (below) of the
	 * CQ context itself.
	 */
	for (i = 0; i < (1 << log_cq_size); i++) {
		TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
	}

	/*
	 * Register the memory for the CQ.  The memory for the CQ must
	 * be registered in the Tavor TPT tables.  This gives us the LKey
	 * to specify in the CQ context below.  Note: If this is a user-
	 * mappable CQ, then we will force DDI_DMA_CONSISTENT mapping.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
	mr_attr.mr_len	 = cq->cq_cqinfo.qa_size;
	mr_attr.mr_as	 = NULL;
	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
	if (cq->cq_is_umap) {
		dma_xfer_mode = DDI_DMA_CONSISTENT;
	} else {
		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
	}
	if (dma_xfer_mode == DDI_DMA_STREAMING) {
		mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
	}
	op.mro_bind_type   = state->ts_cfg_profile->cp_iommu_bypass;
	op.mro_bind_dmahdl = cq->cq_cqinfo.qa_dmahdl;
	op.mro_bind_override_addr = 0;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
	if (status != DDI_SUCCESS) {
		goto cqalloc_fail5;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
	addr = mr->mr_bindinfo.bi_addr;
	lkey = mr->mr_lkey;

	/* Determine if later ddi_dma_sync will be necessary */
	cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, cq->cq_cqinfo);

	/* Sync entire CQ for use by the hardware (if necessary). */
	if (cq_sync) {
		(void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
		    cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Fill in the CQC entry.  This is the final step before passing
	 * ownership of the CQC entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the CQC.  Note: If this CQ is going to be
	 * used for userland access, then we need to set the UAR page number
	 * appropriately (otherwise it's a "don't care")
	 */
	bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
	cq->cq_eqnum		= TAVOR_CQ_EQNUM_GET(cq->cq_cqnum);
	cq->cq_erreqnum		= TAVOR_CQ_ERREQNUM_GET(cq->cq_cqnum);
	cqc_entry.xlat		= TAVOR_VA2PA_XLAT_ENABLED;
	cqc_entry.state		= TAVOR_CQ_DISARMED;
	cqc_entry.start_addr_h	= (addr >> 32);
	cqc_entry.start_addr_l	= (addr & 0xFFFFFFFF);
	cqc_entry.log_cq_sz	= log_cq_size;
	if (cq->cq_is_umap) {
		cqc_entry.usr_page = uarpg;
	} else {
		cqc_entry.usr_page = 0;
	}
	cqc_entry.pd		= pd->pd_pdnum;
	cqc_entry.lkey		= lkey;
	cqc_entry.e_eqn		= cq->cq_erreqnum;
	cqc_entry.c_eqn		= cq->cq_eqnum;
	cqc_entry.cqn		= cq->cq_cqnum;

	/*
	 * Write the CQC entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware (using the Tavor SW2HW_CQ firmware
	 * command).  Note: In general, this operation shouldn't fail.  But
	 * if it does, we have to undo everything we've done above before
	 * returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_CQ, &cqc_entry,
	    sizeof (tavor_hw_cqc_t), cq->cq_cqnum, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_CQ command failed: %08x\n",
		    status);
		goto cqalloc_fail6;
	}

	/*
	 * Fill in the rest of the Tavor Completion Queue handle.  Having
	 * successfully transferred ownership of the CQC, we can update the
	 * following fields for use in further operations on the CQ.
	 */
	cq->cq_cqcrsrcp	  = cqc;
	cq->cq_rsrcp	  = rsrc;
	cq->cq_consindx	  = 0;
	cq->cq_buf	  = buf;
	cq->cq_bufsz	  = (1 << log_cq_size);
	cq->cq_mrhdl	  = mr;
	cq->cq_sync	  = cq_sync;
	cq->cq_refcnt	  = 0;
	cq->cq_is_special = 0;
	cq->cq_uarpg	  = uarpg;
	cq->cq_umap_dhp	  = (devmap_cookie_t)NULL;
	avl_create(&cq->cq_wrid_wqhdr_avl_tree, tavor_wrid_wqhdr_compare,
	    sizeof (struct tavor_workq_hdr_s),
	    offsetof(struct tavor_workq_hdr_s, wq_avl_link));

	cq->cq_wrid_reap_head  = NULL;
	cq->cq_wrid_reap_tail  = NULL;
	cq->cq_hdlrarg	  = (void *)ibt_cqhdl;

	/*
	 * Put CQ handle in Tavor CQNum-to-CQHdl list.  Then fill in the
	 * "actual_size" and "cqhdl" and return success
	 */
	ASSERT(state->ts_cqhdl[cqc->tr_indx] == NULL);
	state->ts_cqhdl[cqc->tr_indx] = cq;

	/*
	 * If this is a user-mappable CQ, then we need to insert the previously
	 * allocated entry into the "userland resources database".  This will
	 * allow for later lookup during devmap() (i.e. mmap()) calls.
	 */
	if (cq->cq_is_umap) {
		tavor_umap_db_add(umapdb);
	}

	/*
	 * Fill in the return arguments (if necessary).  This includes the
	 * real completion queue size.
	 */
	if (actual_size != NULL) {
		*actual_size = (1 << log_cq_size) - 1;
	}
	*cqhdl = cq;

	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
cqalloc_fail6:
	if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag) != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister CQ memory");
	}
cqalloc_fail5:
	tavor_queue_free(state, &cq->cq_cqinfo);
cqalloc_fail4:
	if (cq_is_umap) {
		tavor_umap_db_free(umapdb);
	}
cqalloc_fail3:
	tavor_rsrc_free(state, &rsrc);
cqalloc_fail2:
	tavor_rsrc_free(state, &cqc);
cqalloc_fail1:
	tavor_pd_refcnt_dec(pd);
cqalloc_fail:
	return (status);
}


/*
 * tavor_cq_free()
 *    Context: Can be called only from user or kernel context.
 */
/* ARGSUSED */
int
tavor_cq_free(tavor_state_t *state, tavor_cqhdl_t *cqhdl, uint_t sleepflag)
{
	tavor_rsrc_t		*cqc, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_hw_cqc_t		cqc_entry;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_cqhdl_t		cq;
	uint32_t		cqnum;
	uint64_t		value;
	uint_t			maxprot;
	int			status;

	/*
	 * Pull all the necessary information from the Tavor Completion Queue
	 * handle.  This is necessary here because the resource for the
	 * CQ handle is going to be freed up as part of this operation.
	 */
	cq	= *cqhdl;
	mutex_enter(&cq->cq_lock);
	cqc	= cq->cq_cqcrsrcp;
	rsrc	= cq->cq_rsrcp;
	pd	= state->ts_pdhdl_internal;
	mr	= cq->cq_mrhdl;
	cqnum	= cq->cq_cqnum;

	/*
	 * If there are work queues still associated with the CQ, then return
	 * an error.  Otherwise, we continue with the CQ lock held.
	 */
	if (cq->cq_refcnt != 0) {
		mutex_exit(&cq->cq_lock);
		return (IBT_CQ_BUSY);
	}

	/*
	 * If this was a user-mappable CQ, then we need to remove its entry
	 * from the "userland resources database".  If it is also currently
	 * mmap()'d out to a user process, then we need to call
	 * devmap_devmem_remap() to remap the CQ memory to an invalid mapping.
	 * We also need to invalidate the CQ tracking information for the
	 * user mapping.
	 */
	if (cq->cq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, cqnum,
		    MLNX_UMAP_CQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
		    &umapdb);
		if (status != DDI_SUCCESS) {
			mutex_exit(&cq->cq_lock);
			TAVOR_WARNING(state, "failed to find in database");
			return (ibc_get_ci_failure(0));
		}
		tavor_umap_db_free(umapdb);
		if (cq->cq_umap_dhp != NULL) {
			maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
			status = devmap_devmem_remap(cq->cq_umap_dhp,
			    state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size,
			    maxprot, DEVMAP_MAPPING_INVALID, NULL);
			if (status != DDI_SUCCESS) {
				mutex_exit(&cq->cq_lock);
				TAVOR_WARNING(state, "failed in CQ memory "
				    "devmap_devmem_remap()");
				return (ibc_get_ci_failure(0));
			}
			cq->cq_umap_dhp = (devmap_cookie_t)NULL;
		}
	}

	/*
	 * Put NULL into the Tavor CQNum-to-CQHdl list.  This will allow any
	 * in-progress events to detect that the CQ corresponding to this
	 * number has been freed.
	 */
	state->ts_cqhdl[cqc->tr_indx] = NULL;

	/*
	 * While we hold the CQ lock, do a "forced reap" of the workQ WRID
	 * list.  This cleans up all the structures associated with the WRID
	 * processing for this CQ.  Once we complete, drop the lock and finish
	 * the deallocation of the CQ.
	 */
	tavor_wrid_cq_force_reap(cq);

	mutex_exit(&cq->cq_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))

	/*
	 * Reclaim CQC entry from hardware (using the Tavor HW2SW_CQ
	 * firmware command).  If the ownership transfer fails for any reason,
	 * then it is an indication that something (either in HW or SW) has
	 * gone seriously wrong.
	 */
	status = tavor_cmn_ownership_cmd_post(state, HW2SW_CQ, &cqc_entry,
	    sizeof (tavor_hw_cqc_t), cqnum, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		TAVOR_WARNING(state, "failed to reclaim CQC ownership");
		cmn_err(CE_CONT, "Tavor: HW2SW_CQ command failed: %08x\n",
		    status);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Deregister the memory for the Completion Queue.  If this fails
	 * for any reason, then it is an indication that something (either
	 * in HW or SW) has gone seriously wrong.  So we print a warning
	 * message and return.
	 */
	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister CQ memory");
		return (ibc_get_ci_failure(0));
	}

	/* Free the memory for the CQ */
	tavor_queue_free(state, &cq->cq_cqinfo);

	/* Free the Tavor Completion Queue handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free up the CQC entry resource */
	tavor_rsrc_free(state, &cqc);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the cqhdl pointer to NULL and return success */
	*cqhdl = NULL;

	return (DDI_SUCCESS);
}


/*
 * tavor_cq_resize()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_cq_resize(tavor_state_t *state, tavor_cqhdl_t cq, uint_t req_size,
    uint_t *actual_size, uint_t sleepflag)
{
	tavor_hw_cqc_t		cqc_entry;
	tavor_qalloc_info_t	new_cqinfo, old_cqinfo;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	op;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr, mr_old;
	tavor_hw_cqe_t		*buf;
	uint32_t		new_prod_indx, old_cons_indx;
	uint_t			dma_xfer_mode, cq_sync, log_cq_size, maxprot;
	int			status, i, flag;

	/* Use the internal protection domain (PD) for CQs */
	pd = state->ts_pdhdl_internal;

	/*
	 * Calculate the appropriate size for the new resized completion queue.
	 * Note:  All Tavor CQs must be a power-of-2 minus 1 in size.  Also
	 * they may not be any smaller than TAVOR_CQ_MIN_SIZE.  This step is
	 * to round the requested size up to the next highest power-of-2
	 */
	req_size = max(req_size, TAVOR_CQ_MIN_SIZE);
	log_cq_size = highbit(req_size);

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits)
	 */
	if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
		goto cqresize_fail;
	}

	/*
	 * Allocate the memory for newly resized Completion Queue.
	 *
	 * Note: Although we use the common queue allocation routine, we
	 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
	 * kernel system memory) for kernel CQs because it would be
	 * inefficient to have CQs located in DDR memory.  This is the same
	 * as we do when we first allocate completion queues primarily
	 * because CQs are read from (by software) more than they are written
	 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
	 * user-mappable CQs for a similar reason.)
	 * It is also worth noting that, unlike Tavor QP work queues,
	 * completion queues do not have the same strict alignment
	 * requirements.  It is sufficient for the CQ memory to be both
	 * aligned to and bound to addresses which are a multiple of CQE size.
	 */
	new_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
	new_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
	new_cqinfo.qa_bind_align  = sizeof (tavor_hw_cqe_t);
	if (cq->cq_is_umap) {
		new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
	}
	status = tavor_queue_alloc(state, &new_cqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		goto cqresize_fail;
	}
	buf = (tavor_hw_cqe_t *)new_cqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Initialize each of the Completion Queue Entries (CQE) by setting
	 * their ownership to hardware ("owner" bit set to HW).  This is in
	 * preparation for the final resize operation (below).
	 */
	for (i = 0; i < (1 << log_cq_size); i++) {
		TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
	}

	/*
	 * Register the memory for the CQ.  The memory for the CQ must
	 * be registered in the Tavor TPT tables.  This gives us the LKey
	 * to specify in the CQ context below.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
	mr_attr.mr_len	 = new_cqinfo.qa_size;
	mr_attr.mr_as	 = NULL;
	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
	if (cq->cq_is_umap) {
		dma_xfer_mode = DDI_DMA_CONSISTENT;
	} else {
		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
	}
	if (dma_xfer_mode == DDI_DMA_STREAMING) {
		mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
	}
	op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
	op.mro_bind_dmahdl = new_cqinfo.qa_dmahdl;
	op.mro_bind_override_addr = 0;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
	if (status != DDI_SUCCESS) {
		tavor_queue_free(state, &new_cqinfo);
		goto cqresize_fail;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))

	/* Determine if later ddi_dma_sync will be necessary */
	cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, new_cqinfo);

	/* Sync entire "new" CQ for use by hardware (if necessary) */
	if (cq_sync) {
		(void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
		    new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Now we grab the CQ lock.  Since we will be updating the actual
	 * CQ location and the producer/consumer indexes, we should hold
	 * the lock.
	 *
	 * We do a TAVOR_NOSLEEP here (and below), though, because we are
	 * holding the "cq_lock" and if we got raised to interrupt level
	 * by priority inversion, we would not want to block in this routine
	 * waiting for success.
	 */
	mutex_enter(&cq->cq_lock);

	/*
	 * Determine the current CQ "consumer index".
	 *
	 * Note:  This will depend on whether the CQ had previously been
	 * mapped for user access or whether it is a kernel CQ.  If this
	 * is a kernel CQ, then all PollCQ() operations have come through
	 * the IBTF and, hence, the driver's CQ state structure will
	 * contain the current consumer index.  If, however, the user has
	 * accessed this CQ by bypassing the driver (OS-bypass), then we
	 * need to query the firmware to determine the current CQ consumer
	 * index.  This also assumes that the user process will not continue
	 * to consume entries while at the same time doing the ResizeCQ()
	 * operation.  If the user process does not guarantee this, then it
	 * may see duplicate or missed completions.  But under no
	 * circumstances should this panic the system.
	 */
	if (cq->cq_is_umap) {
		status = tavor_cmn_query_cmd_post(state, QUERY_CQ,
		    cq->cq_cqnum, &cqc_entry, sizeof (tavor_hw_cqc_t),
		    TAVOR_NOSLEEP);
		if (status != TAVOR_CMD_SUCCESS) {
			/* Query CQ has failed, drop CQ lock and cleanup */
			mutex_exit(&cq->cq_lock);
			if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
			    sleepflag) != DDI_SUCCESS) {
				TAVOR_WARNING(state, "failed to deregister "
				    "CQ memory");
			}
			tavor_queue_free(state, &new_cqinfo);
			TAVOR_WARNING(state, "failed to find in database");

			goto cqresize_fail;
		}
		old_cons_indx = cqc_entry.cons_indx;
	} else {
		old_cons_indx = cq->cq_consindx;
	}

	/*
	 * Fill in the CQC entry.  For the resize operation this is the
	 * final step before attempting the resize operation on the CQC entry.
	 * We use all of the information collected/calculated above to fill
	 * in the requisite portions of the CQC.
	 */
	bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
	cqc_entry.start_addr_h	= (mr->mr_bindinfo.bi_addr >> 32);
	cqc_entry.start_addr_l	= (mr->mr_bindinfo.bi_addr & 0xFFFFFFFF);
	cqc_entry.log_cq_sz	= log_cq_size;
	cqc_entry.lkey		= mr->mr_lkey;

	/*
	 * Write the CQC entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware (using the Tavor RESIZE_CQ firmware
	 * command).  Note: In general, this operation shouldn't fail.  But
	 * if it does, we have to undo everything we've done above before
	 * returning error.  Also note that the status returned may indicate
	 * the code to return to the IBTF.
	 */
	status = tavor_resize_cq_cmd_post(state, &cqc_entry, cq->cq_cqnum,
	    &new_prod_indx, TAVOR_CMD_NOSLEEP_SPIN);
	if (status != TAVOR_CMD_SUCCESS) {
		/* Resize attempt has failed, drop CQ lock and cleanup */
		mutex_exit(&cq->cq_lock);
		if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
		    sleepflag) != DDI_SUCCESS) {
			TAVOR_WARNING(state, "failed to deregister CQ memory");
		}
		tavor_queue_free(state, &new_cqinfo);
		if (status == TAVOR_CMD_BAD_SIZE) {
			return (IBT_CQ_SZ_INSUFFICIENT);
		} else {
			cmn_err(CE_CONT, "Tavor: RESIZE_CQ command failed: "
			    "%08x\n", status);
			return (ibc_get_ci_failure(0));
		}
	}

	/*
	 * The CQ resize attempt was successful.  Before dropping the CQ lock,
	 * copy all of the CQEs from the "old" CQ into the "new" CQ.  Note:
	 * the Tavor firmware guarantees us that sufficient space is set aside
	 * in the "new" CQ to handle any un-polled CQEs from the "old" CQ.
	 * The two parameters to this helper function ("old_cons_indx" and
	 * "new_prod_indx") essentially indicate the starting index and number
	 * of any CQEs that might remain in the "old" CQ memory.
	 */
	tavor_cq_resize_helper(cq, buf, old_cons_indx, new_prod_indx);

	/* Sync entire "new" CQ for use by hardware (if necessary) */
	if (cq_sync) {
		(void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
		    new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Update the Tavor Completion Queue handle with all the new
	 * information.  At the same time, save away all the necessary
	 * information for freeing up the old resources
	 */
	mr_old		 = cq->cq_mrhdl;
	old_cqinfo	 = cq->cq_cqinfo;
	cq->cq_cqinfo	 = new_cqinfo;
	cq->cq_consindx	 = 0;
	cq->cq_buf	 = buf;
	cq->cq_bufsz	 = (1 << log_cq_size);
	cq->cq_mrhdl	 = mr;
	cq->cq_sync	 = cq_sync;

	/*
	 * If "old" CQ was a user-mappable CQ that is currently mmap()'d out
	 * to a user process, then we need to call devmap_devmem_remap() to
	 * invalidate the mapping to the CQ memory.  We also need to
	 * invalidate the CQ tracking information for the user mapping.
	 */
	if ((cq->cq_is_umap) && (cq->cq_umap_dhp != NULL)) {
		maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
		status = devmap_devmem_remap(cq->cq_umap_dhp,
		    state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size, maxprot,
		    DEVMAP_MAPPING_INVALID, NULL);
		if (status != DDI_SUCCESS) {
			mutex_exit(&cq->cq_lock);
			TAVOR_WARNING(state, "failed in CQ memory "
			    "devmap_devmem_remap()");
			return (ibc_get_ci_failure(0));
		}
		cq->cq_umap_dhp = (devmap_cookie_t)NULL;
	}

	/*
	 * Drop the CQ lock now.  The only thing left to do is to free up
	 * the old resources.
	 */
	mutex_exit(&cq->cq_lock);

	/*
	 * Deregister the memory for the old Completion Queue.  Note: We
	 * really can't return error here because we have no good way to
	 * cleanup.  Plus, a deregistration failure really shouldn't ever
	 * happen.  So, if it does, it is an indication that something has
	 * gone seriously wrong.  So we print a warning message and return
	 * error (knowing, of course, that the "old" CQ memory will be leaked)
	 */
	status = tavor_mr_deregister(state, &mr_old, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister old CQ memory");
		goto cqresize_fail;
	}

	/* Free the memory for the old CQ */
	tavor_queue_free(state, &old_cqinfo);

	/*
	 * Fill in the return arguments (if necessary).  This includes the
	 * real new completion queue size.
	 */
	if (actual_size != NULL) {
		*actual_size = (1 << log_cq_size) - 1;
	}

	return (DDI_SUCCESS);

cqresize_fail:
	return (status);
}


/*
 * tavor_cq_notify()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_notify(tavor_state_t *state, tavor_cqhdl_t cq,
    ibt_cq_notify_flags_t flags)
{
	uint_t		cqnum;

	/*
	 * Determine if we are trying to get the next completion or the next
	 * "solicited" completion.  Then hit the appropriate doorbell.
	 *
	 * NOTE: Please see the comment in tavor_event.c:tavor_eq_poll
	 * regarding why we do not have to do an extra PIO read here, and we
	 * will not lose an event after writing this doorbell.
	 */
	cqnum = cq->cq_cqnum;
	if (flags == IBT_NEXT_COMPLETION) {
		tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ, cqnum,
		    TAVOR_CQDB_DEFAULT_PARAM);

	} else if (flags == IBT_NEXT_SOLICITED) {
		tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ_SOLICIT,
		    cqnum, TAVOR_CQDB_DEFAULT_PARAM);

	} else {
		return (IBT_CQ_NOTIFY_TYPE_INVALID);
	}

	return (DDI_SUCCESS);
}


/*
 * tavor_cq_poll()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_poll(tavor_state_t *state, tavor_cqhdl_t cq, ibt_wc_t *wc_p,
    uint_t num_wc, uint_t *num_polled)
{
	tavor_hw_cqe_t	*cqe;
	uint32_t	cons_indx, wrap_around_mask;
	uint32_t	polled_cnt, num_to_increment;
	int		status;

	/*
	 * Check for user-mappable CQ memory.  Note:  We do not allow kernel
	 * clients to poll CQ memory that is accessible directly by the user.
	 * If the CQ memory is user accessible, then return an error.
	 */
	if (cq->cq_is_umap) {
		return (IBT_CQ_HDL_INVALID);
	}

	mutex_enter(&cq->cq_lock);

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_bufsz - 1);
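
	/*
	 * For example: a CQ with cq_bufsz = 256 yields a mask of 0xFF, so
	 * "(indx + 1) & wrap_around_mask" walks the buffer modulo 256
	 * without any conditional wrap-around check.
	 */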

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_buf[cons_indx];

	/* Sync the current CQE to read */
	tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);

	/*
	 * Keep pulling entries from the CQ until we find an entry owned by
	 * the hardware.  As long as the CQEs are owned by SW, process each
	 * entry by calling tavor_cq_cqe_consume() and updating the CQ
	 * consumer index.  Note:  We only update the consumer index if
	 * tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.  Otherwise,
	 * it indicates that we are going to "recycle" the CQE (probably
	 * because it is an error CQE and corresponds to more than one
	 * completion).
	 */
	polled_cnt = 0;
	while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
		status = tavor_cq_cqe_consume(state, cq, cqe,
		    &wc_p[polled_cnt++]);
		if (status == TAVOR_CQ_SYNC_AND_DB) {
			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cq, cqe);

			/* Sync the current CQE for device */
			tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORDEV);

			/* Increment the consumer index */
			cons_indx = (cons_indx + 1) & wrap_around_mask;

			/* Update the pointer to the next CQ entry */
			cqe = &cq->cq_buf[cons_indx];

			/* Sync the next CQE to read */
			tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
		}

		/*
		 * If we have run out of space to store work completions,
		 * then stop and return the ones we have pulled off the CQ.
		 */
		if (polled_cnt >= num_wc) {
			break;
		}
	}

	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we have, for example,
	 * pulled from a CQE that we are still in the process of "recycling"
	 * for error purposes, then we would not update the consumer index.
	 */
	if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Post doorbell to update the consumer index.  Doorbell
		 * value indicates number of entries consumed (minus 1)
		 */
		if (cons_indx > cq->cq_consindx) {
			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
		} else {
			num_to_increment = ((cons_indx + cq->cq_bufsz) -
			    cq->cq_consindx) - 1;
		}
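		/*
		 * Worked example (sketch): with cq_bufsz = 8, an old
		 * cq_consindx of 6 and a new (wrapped) cons_indx of 2,
		 * four entries (6, 7, 0, 1) were consumed, so the doorbell
		 * value is ((2 + 8) - 6) - 1 = 3, i.e. "entries consumed
		 * minus one".
		 */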
		cq->cq_consindx = cons_indx;
		tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
		    cq->cq_cqnum, num_to_increment);

	} else if (polled_cnt == 0) {
		/*
		 * If the CQ is empty, we can try to free up some of the WRID
		 * list containers.  See tavor_wr.c for more details on this
		 * operation.
		 */
		tavor_wrid_cq_reap(cq);
	}

	mutex_exit(&cq->cq_lock);

	/* Set "num_polled" (if necessary) */
	if (num_polled != NULL) {
		*num_polled = polled_cnt;
	}

	/* Set CQ_EMPTY condition if needed, otherwise return success */
	if (polled_cnt == 0) {
		status = IBT_CQ_EMPTY;
	} else {
		status = DDI_SUCCESS;
	}

	/*
	 * Check if the system is currently panicking.  If it is, then call
	 * the Tavor interrupt service routine.  This step is necessary here
	 * because we might be in a polled I/O mode and without the call to
	 * tavor_isr() - and its subsequent calls to poll and rearm each
	 * event queue - we might overflow our EQs and render the system
	 * unable to sync/dump.
	 */
	if (ddi_in_panic() != 0) {
		(void) tavor_isr((caddr_t)state, (caddr_t)NULL);
	}

	return (status);
}


/*
 * tavor_cq_handler()
 *    Context: Only called from interrupt context
 */
int
tavor_cq_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
	tavor_cqhdl_t		cq;
	uint_t			cqnum;
	uint_t			eqe_evttype;

	eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

	ASSERT(eqe_evttype == TAVOR_EVT_COMPLETION ||
	    eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

	if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
		tavor_eq_overflow_handler(state, eq, eqe);

		return (DDI_FAILURE);
	}

	/* Get the CQ handle from CQ number in event descriptor */
	cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
	cq = tavor_cqhdl_from_cqnum(state, cqnum);

	/*
	 * Post the EQ doorbell to move the CQ to the "disarmed" state.
	 * This operation is to enable subsequent CQ doorbells (e.g. those
	 * that can be rung by tavor_cq_notify() above) to rearm the CQ.
	 */
	tavor_eq_doorbell(state, TAVOR_EQDB_DISARM_CQ, eq->eq_eqnum, cqnum);

	/*
	 * If the CQ handle is NULL, this is probably an indication
	 * that the CQ has been freed already, in which case we
	 * should not deliver this event.
	 *
	 * We also check that the CQ number in the handle is the
	 * same as the CQ number in the event queue entry.  This
	 * extra check allows us to handle the case where a CQ was
	 * freed and then allocated again in the time it took to
	 * handle the event queue processing.  By constantly incrementing
	 * the non-constrained portion of the CQ number every time
	 * a new CQ is allocated, we mitigate (somewhat) the chance
	 * that a stale event could be passed to the client's CQ
	 * handler.
	 *
	 * Lastly, we check if "ts_ibtfpriv" is NULL.  If it is, it
	 * means that we have either received this event before we
	 * finished attaching to the IBTF or we have received it while we
	 * are in the process of detaching.
	 */
	if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
	    (state->ts_ibtfpriv != NULL)) {
		TAVOR_DO_IBTF_CQ_CALLB(state, cq);
	}

	return (DDI_SUCCESS);
}


/*
 * tavor_cq_err_handler()
 *    Context: Only called from interrupt context
 */
int
tavor_cq_err_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
	tavor_cqhdl_t		cq;
	uint_t			cqnum;
	ibc_async_event_t	event;
	ibt_async_code_t	type;
	uint_t			eqe_evttype;

	eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

	ASSERT(eqe_evttype == TAVOR_EVT_CQ_ERRORS ||
	    eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

	if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
		tavor_eq_overflow_handler(state, eq, eqe);

		return (DDI_FAILURE);
	}

	/* cmn_err(CE_CONT, "CQ Error handler\n"); */

	/* Get the CQ handle from CQ number in event descriptor */
	cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
	cq = tavor_cqhdl_from_cqnum(state, cqnum);

	/*
	 * If the CQ handle is NULL, this is probably an indication
	 * that the CQ has been freed already, in which case we
	 * should not deliver this event.
	 *
	 * We also check that the CQ number in the handle is the
	 * same as the CQ number in the event queue entry.  This
	 * extra check allows us to handle the case where a CQ was
	 * freed and then allocated again in the time it took to
	 * handle the event queue processing.  By constantly incrementing
	 * the non-constrained portion of the CQ number every time
	 * a new CQ is allocated, we mitigate (somewhat) the chance
	 * that a stale event could be passed to the client's CQ
	 * handler.
	 *
	 * And then we check if "ts_ibtfpriv" is NULL.  If it is, it
	 * means that we have either received this event before we
	 * finished attaching to the IBTF or we have received it while we
	 * are in the process of detaching.
	 */
	if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
	    (state->ts_ibtfpriv != NULL)) {
		event.ev_cq_hdl = (ibt_cq_hdl_t)cq->cq_hdlrarg;
		type		= IBT_ERROR_CQ;

		TAVOR_DO_IBTF_ASYNC_CALLB(state, type, &event);
	}

	return (DDI_SUCCESS);
}


/*
 * tavor_cq_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_refcnt_inc(tavor_cqhdl_t cq, uint_t is_special)
{
	/*
	 * Increment the completion queue's reference count.  Note: In order
	 * to ensure compliance with IBA C11-15, we must ensure that a given
	 * CQ is not used for both special (SMI/GSI) QP and non-special QP.
	 * This is accomplished here by keeping track of how the referenced
	 * CQ is being used.
	 */
	mutex_enter(&cq->cq_lock);
	if (cq->cq_refcnt == 0) {
		cq->cq_is_special = is_special;
	} else {
		if (cq->cq_is_special != is_special) {
			mutex_exit(&cq->cq_lock);
			return (DDI_FAILURE);
		}
	}
	cq->cq_refcnt++;
	mutex_exit(&cq->cq_lock);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_cq_refcnt_dec(tavor_cqhdl_t cq)
{
	/* Decrement the completion queue's reference count */
	mutex_enter(&cq->cq_lock);
	cq->cq_refcnt--;
	mutex_exit(&cq->cq_lock);
}


/*
 * tavor_cq_doorbell()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd, uint32_t cqn,
    uint32_t cq_param)
{
	uint64_t	doorbell = 0;

	/* Build the doorbell from the parameters */
	doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
	    ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;

	/* Write the doorbell to UAR */
	TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->cq,
	    doorbell);
}


/*
 * tavor_cqhdl_from_cqnum()
 *    Context: Can be called from interrupt or base context.
 *
 *    This routine is important because changing the unconstrained
 *    portion of the CQ number is critical to the detection of a
 *    potential race condition in the CQ handler code (i.e. the case
 *    where a CQ is freed and alloc'd again before an event for the
 *    "old" CQ can be handled).
 *
 *    While this is not a perfect solution (not sure that one exists)
 *    it does help to mitigate the chance that this race condition will
 *    cause us to deliver a "stale" event to the new CQ owner.  Note:
 *    this solution does not scale well because the number of constrained
 *    bits increases (and, hence, the number of unconstrained bits
 *    decreases) as the number of supported CQs grows.  For small and
 *    intermediate values, it should hopefully provide sufficient
 *    protection.
 */
tavor_cqhdl_t
tavor_cqhdl_from_cqnum(tavor_state_t *state, uint_t cqnum)
{
	uint_t	cqindx, cqmask;

	/* Calculate the CQ table index from the cqnum */
	cqmask = (1 << state->ts_cfg_profile->cp_log_num_cq) - 1;
	cqindx = cqnum & cqmask;
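
	/*
	 * Illustration (a sketch, with an assumed cp_log_num_cq of 16):
	 * the mask would be 0xFFFF, so only the low 16 bits select the
	 * table slot; the remaining high bits are the "unconstrained"
	 * portion that changes across allocations to help detect stale
	 * CQ numbers (see the block comment above).
	 */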
	return (state->ts_cqhdl[cqindx]);
}


/*
 * tavor_cq_cqe_consume()
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
{
	uint_t		flags, type, opcode, qpnum, qp1_indx;
	int		status;

	/*
	 * Determine if this is an "error" CQE by examining "opcode".  If it
	 * is an error CQE, then call tavor_cq_errcqe_consume() and return
	 * whatever status it returns.  Otherwise, this is a successful
	 * completion.
	 */
	opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
	if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
	    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
		status = tavor_cq_errcqe_consume(state, cq, cqe, wc);
		return (status);
	}

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = tavor_wrid_get_entry(cq, cqe, NULL);

	/*
	 * Parse the CQE opcode to determine completion type.  This will set
	 * not only the type of the completion, but also any flags that might
	 * be associated with it (e.g. whether immediate data is present).
	 */
	flags = IBT_WC_NO_FLAGS;
	if (TAVOR_CQE_SENDRECV_GET(cq, cqe) != TAVOR_COMPLETION_RECV) {

		/* Send CQE */
		switch (opcode) {
		case TAVOR_CQE_SND_RDMAWR_IMM:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			/* FALLTHROUGH */
		case TAVOR_CQE_SND_RDMAWR:
			type = IBT_WRC_RDMAW;
			break;

		case TAVOR_CQE_SND_SEND_IMM:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			/* FALLTHROUGH */
		case TAVOR_CQE_SND_SEND:
			type = IBT_WRC_SEND;
			break;

		case TAVOR_CQE_SND_RDMARD:
			type = IBT_WRC_RDMAR;
			break;

		case TAVOR_CQE_SND_ATOMIC_CS:
			type = IBT_WRC_CSWAP;
			break;

		case TAVOR_CQE_SND_ATOMIC_FA:
			type = IBT_WRC_FADD;
			break;

		case TAVOR_CQE_SND_BIND_MW:
			type = IBT_WRC_BIND;
			break;

		default:
			TAVOR_WARNING(state, "unknown send CQE type");
			wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	} else {

		/* Receive CQE */
		switch (opcode & 0x1F) {
		case TAVOR_CQE_RCV_RECV_IMM:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV_IMM2:
			/*
			 * Note:  According to the Tavor PRM, all QP1 recv
			 * completions look like the result of a Send with
			 * Immediate.  They are not, however (MADs are Send
			 * Only), so we need to check the QP number and set
			 * the flag only if it is non-QP1.
			 */
			qpnum	 = TAVOR_CQE_QPNUM_GET(cq, cqe);
			qp1_indx = state->ts_spec_qp1->tr_indx;
			if ((qpnum < qp1_indx) || (qpnum > qp1_indx + 1)) {
				flags |= IBT_WC_IMMED_DATA_PRESENT;
			}
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV2:
			type = IBT_WRC_RECV;
			break;

		case TAVOR_CQE_RCV_RDMAWR_IMM:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RDMAWR_IMM2:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			type = IBT_WRC_RECV_RDMAWI;
			break;

		default:
			TAVOR_WARNING(state, "unknown recv CQE type");
			wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	}
	wc->wc_type = type;

	/*
	 * Check for GRH, update the flags, then fill in "wc_flags" field
	 * in the work completion
	 */
	if (TAVOR_CQE_GRH_GET(cq, cqe) != 0) {
		flags |= IBT_WC_GRH_PRESENT;
	}
	wc->wc_flags = flags;

	/* If we got here, completion status must be success */
	wc->wc_status = IBT_WC_SUCCESS;

	/*
	 * Parse the remaining contents of the CQE into the work completion.
	 * This means filling in SL, QP number, SLID, immediate data, etc.
	 * Note:  Not all of these fields are valid in a given completion.
	 * Many of them depend on the actual type of completion.  So we fill
	 * in all of the fields and leave it up to the IBTF and consumer to
	 * sort out which are valid based on their context.
	 */
	wc->wc_sl	  = TAVOR_CQE_SL_GET(cq, cqe);
	wc->wc_immed_data = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
	wc->wc_qpn	  = TAVOR_CQE_DQPN_GET(cq, cqe);
	wc->wc_res_hash	  = 0;
	wc->wc_slid	  = TAVOR_CQE_DLID_GET(cq, cqe);
	wc->wc_ethertype  = (wc->wc_immed_data & 0xFFFF);
	wc->wc_pkey_ix	  = (wc->wc_immed_data >> 16);

	/*
	 * Depending on whether the completion was a receive or a send
	 * completion, fill in "bytes transferred" as appropriate.  Also,
	 * if necessary, fill in the "path bits" field.
	 */
	if (TAVOR_CQE_SENDRECV_GET(cq, cqe) == TAVOR_COMPLETION_RECV) {
		wc->wc_path_bits = TAVOR_CQE_PATHBITS_GET(cq, cqe);
		wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);

	} else if ((wc->wc_type == IBT_WRC_RDMAR) ||
	    (wc->wc_type == IBT_WRC_CSWAP) || (wc->wc_type == IBT_WRC_FADD)) {
		wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
	}

	return (TAVOR_CQ_SYNC_AND_DB);
}


/*
 * tavor_cq_errcqe_consume()
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
{
	uint64_t		next_wqeaddr;
	uint32_t		imm_eth_pkey_cred;
	uint_t			nextwqesize, dbd;
	uint_t			doorbell_cnt, status;
	tavor_wrid_entry_t	wre;

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = tavor_wrid_get_entry(cq, cqe, &wre);

	/*
	 * Parse the CQE opcode to determine completion type.  We know that
	 * the CQE is an error completion, so we extract only the completion
	 * status here.
	 */
	imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
	status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
	switch (status) {
	case TAVOR_CQE_LOC_LEN_ERR:
		status = IBT_WC_LOCAL_LEN_ERR;
		break;

	case TAVOR_CQE_LOC_OP_ERR:
		status = IBT_WC_LOCAL_QP_OP_ERR;
		break;

	case TAVOR_CQE_LOC_PROT_ERR:
		status = IBT_WC_LOCAL_PROTECT_ERR;
		break;

	case TAVOR_CQE_WR_FLUSHED_ERR:
		status = IBT_WC_WR_FLUSHED_ERR;
		break;

	case TAVOR_CQE_MW_BIND_ERR:
		status = IBT_WC_MEM_WIN_BIND_ERR;
		break;

	case TAVOR_CQE_BAD_RESPONSE_ERR:
		status = IBT_WC_BAD_RESPONSE_ERR;
		break;

	case TAVOR_CQE_LOCAL_ACCESS_ERR:
		status = IBT_WC_LOCAL_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_INV_REQ_ERR:
		status = IBT_WC_REMOTE_INVALID_REQ_ERR;
		break;

	case TAVOR_CQE_REM_ACC_ERR:
		status = IBT_WC_REMOTE_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_OP_ERR:
		status = IBT_WC_REMOTE_OP_ERR;
		break;

	case TAVOR_CQE_TRANS_TO_ERR:
		status = IBT_WC_TRANS_TIMEOUT_ERR;
		break;

	case TAVOR_CQE_RNRNAK_TO_ERR:
		status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
		break;

	/*
	 * The following error codes are not supported in the Tavor driver
	 * as they relate only to Reliable Datagram completion statuses:
	 *    case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
	 *    case TAVOR_CQE_REM_INV_RD_REQ_ERR:
	 *    case TAVOR_CQE_EEC_REM_ABORTED_ERR:
	 *    case TAVOR_CQE_INV_EEC_NUM_ERR:
	 *    case TAVOR_CQE_INV_EEC_STATE_ERR:
	 *    case TAVOR_CQE_LOC_EEC_ERR:
	 */

	default:
		TAVOR_WARNING(state, "unknown error CQE status");
		status = IBT_WC_LOCAL_QP_OP_ERR;
		break;
	}
	wc->wc_status = status;

	/*
	 * Now we do all the checking that's necessary to handle completion
	 * queue entry "recycling"
	 *
	 * It is not necessary here to try to sync the WQE as we are only
	 * attempting to read from the Work Queue (and hardware does not
	 * write to it).
	 */

	/*
	 * We can get the doorbell info, as well as the WQE address and size
	 * for the next WQE, from the "wre" (which was filled in above in the
	 * call to the tavor_wrid_get_entry() routine)
	 */
	dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
	next_wqeaddr = wre.wr_wqeaddrsz;
	nextwqesize  = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;

	/*
	 * Get the doorbell count from the CQE.  This indicates how many
	 * completions this one CQE represents.
	 */
	doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
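
	/*
	 * Recycling sketch: an error CQE with a nonzero doorbell count and
	 * a valid next WQE is returned as TAVOR_CQ_RECYCLE_ENTRY (and
	 * re-examined on the next PollCQ() pass) until the count is
	 * exhausted or the WQE chain ends, at which point it is consumed
	 * with TAVOR_CQ_SYNC_AND_DB.
	 */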

	/*
	 * Determine if we're ready to consume this CQE yet or not.  If the
	 * next WQE has size zero (i.e. no next WQE) or if the doorbell count
	 * is down to zero, then this is the last/only completion represented
	 * by the current CQE (return TAVOR_CQ_SYNC_AND_DB).  Otherwise, the
	 * current CQE needs to be recycled (see below).
	 */
	if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
		/*
		 * Consume the CQE
		 *    Return status to indicate that doorbell and sync may be
		 *    necessary.
		 */
		return (TAVOR_CQ_SYNC_AND_DB);

	} else {
		/*
		 * Recycle the CQE for use in the next PollCQ() call
		 *    Decrement the doorbell count, modify the error status,
		 *    and update the WQE address and size (to point to the
		 *    next WQE on the chain).  Put these updated entries back
		 *    into the CQE.
		 *    Despite the fact that we have updated the CQE, it is not
		 *    necessary for us to attempt to sync this entry just yet
		 *    as we have not changed the "hardware's view" of the
		 *    entry (i.e. we have not modified the "owner" bit - which
		 *    is all that the Tavor hardware really cares about).
		 */
		doorbell_cnt = doorbell_cnt - dbd;
		TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cq, cqe,
		    ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
		    (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
		TAVOR_CQE_WQEADDRSZ_SET(cq, cqe,
		    TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));

		return (TAVOR_CQ_RECYCLE_ENTRY);
	}
}


/*
 * tavor_cqe_sync()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe, uint_t flag)
{
	ddi_dma_handle_t	dmahdl;
	off_t			offset;

	/* Determine if CQ needs to be synced or not */
	if (cq->cq_sync == 0)
		return;

	/* Get the DMA handle from CQ context */
	dmahdl = cq->cq_mrhdl->mr_bindinfo.bi_dmahdl;

	/* Calculate the offset of the given CQE within the CQ buffer */
	offset = (off_t)((uintptr_t)cqe - (uintptr_t)&cq->cq_buf[0]);
	(void) ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_cqe_t), flag);
}


/*
 * tavor_cq_resize_helper()
 *    Context: Can be called only from user or kernel context.
 */
static void
tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
    uint32_t old_cons_indx, uint32_t num_newcqe)
{
	tavor_hw_cqe_t	*old_cqe, *new_cqe;
	uint32_t	new_cons_indx, wrap_around_mask;
	int		i;

	ASSERT(MUTEX_HELD(&cq->cq_lock));

	/* Get the consumer index */
	new_cons_indx = 0;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_bufsz - 1);

	/*
	 * Calculate the pointers to the first CQ entry (in the "old" CQ)
	 * and the first CQ entry in the "new" CQ
	 */
	old_cqe = &cq->cq_buf[old_cons_indx];
	new_cqe = &new_cqbuf[new_cons_indx];

	/* Sync entire "old" CQ for use by software (if necessary). */
	if (cq->cq_sync) {
		(void) ddi_dma_sync(cq->cq_mrhdl->mr_bindinfo.bi_dmahdl,
		    0, cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORCPU);
	}

	/*
	 * Keep pulling entries from the "old" CQ until we find an entry owned
	 * by the hardware.  Process each entry by copying it into the "new"
	 * CQ and updating respective indices and pointers in the "old" CQ.
	 */
1578 	for (i = 0; i < num_newcqe; i++) {
1579 
1580 		/* Copy this old CQE into the "new_cqe" pointer */
1581 		bcopy(old_cqe, new_cqe, sizeof (tavor_hw_cqe_t));
1582 
1583 		/* Increment the consumer index (for both CQs) */
1584 		old_cons_indx = (old_cons_indx + 1) & wrap_around_mask;
1585 		new_cons_indx = (new_cons_indx + 1);
1586 
1587 		/* Update the pointer to the next CQ entry */
1588 		old_cqe = &cq->cq_buf[old_cons_indx];
1589 		new_cqe = &new_cqbuf[new_cons_indx];
1590 	}
1591 }
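
/*
 * Caller sketch (hypothetical, for illustration only): a resize path is
 * expected to hold the CQ lock, copy the outstanding entries into the new
 * buffer, and then install that buffer, roughly:
 *
 *	mutex_enter(&cq->cq_lock);
 *	tavor_cq_resize_helper(cq, new_cqbuf, cq->cq_consindx, num_newcqe);
 *	cq->cq_buf = new_cqbuf;
 *	(reset the consumer index and free the old buffer as appropriate)
 *	mutex_exit(&cq->cq_lock);
 *
 * The variable names above are assumptions; the actual resize entry point
 * would also need to update the CQ context in hardware before switching
 * buffers.
 */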

/*
 * tavor_cq_srq_entries_flush()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_cq_srq_entries_flush(tavor_state_t *state, tavor_qphdl_t qp)
{
	tavor_cqhdl_t		cq;
	tavor_workq_hdr_t	*wqhdr;
	tavor_hw_cqe_t		*cqe;
	tavor_hw_cqe_t		*next_cqe;
	uint32_t		cons_indx, tail_cons_indx, wrap_around_mask;
	uint32_t		new_indx, check_indx, indx;
	uint32_t		num_to_increment;
	int			cqe_qpnum, cqe_type;
	int			outstanding_cqes, removed_cqes;
	int			i;

	ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock));

	cq = qp->qp_rq_cqhdl;
	wqhdr = qp->qp_rq_wqhdr;

	ASSERT(wqhdr->wq_wrid_post != NULL);
	ASSERT(wqhdr->wq_wrid_post->wl_srq_en != 0);

	/*
	 * Check for user-mapped CQ memory.  Note:  We do not allow kernel
	 * clients to modify any user-mapped CQ.  If the CQ is user-mapped,
	 * then we simply return here, making this "flush" function a no-op
	 * in that case.
	 */
	if (cq->cq_is_umap) {
		return;
	}

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_bufsz - 1);

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_buf[cons_indx];

	/* Sync the current CQE to read */
	tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);

	/*
	 * Loop through the CQ looking for entries owned by software.  Each
	 * software-owned entry increments the 'outstanding_cqes' count, so
	 * that we know the total number of entries currently on the CQ.  We
	 * use this value further down as the number of entries to examine
	 * when searching for completions belonging to this QP.
	 */
	outstanding_cqes = 0;
	tail_cons_indx = cons_indx;
	while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
		/* increment total cqes count */
		outstanding_cqes++;

		/* increment the consumer index */
		tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;

		/* update the pointer to the next cq entry */
		cqe = &cq->cq_buf[tail_cons_indx];

		/* sync the next cqe to read */
		tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
	}

	/*
	 * Using the 'tail_cons_indx' that was just set, we now know the
	 * total number of outstanding CQEs.  Set both 'check_indx' and
	 * 'new_indx' to the last entry identified by 'tail_cons_indx'.
	 */
	check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;

	for (i = 0; i < outstanding_cqes; i++) {
		cqe = &cq->cq_buf[check_indx];

		/* Grab the QP number and completion type from the CQE */
		cqe_qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
		cqe_type = TAVOR_CQE_SENDRECV_GET(cq, cqe);

		/*
		 * If the QP number in the CQE matches the QP on this SRQ,
		 * then we must free up the entry off the SRQ.  We also
		 * verify that the completion type is TAVOR_COMPLETION_RECV,
		 * so any send completions on this CQ are left as-is.  The
		 * handling of returning entries back to HW ownership happens
		 * further down.
		 */
		if (cqe_qpnum == qp->qp_qpnum &&
		    cqe_type == TAVOR_COMPLETION_RECV) {

			/* Add back to SRQ free list */
			(void) tavor_wrid_find_match_srq(wqhdr->wq_wrid_post,
			    cq, cqe);
		} else {
			/* Do Copy */
			if (check_indx != new_indx) {
				next_cqe = &cq->cq_buf[new_indx];

				/*
				 * Copy the CQE into the "next_cqe"
				 * pointer.
				 */
				bcopy(cqe, next_cqe, sizeof (tavor_hw_cqe_t));
			}
			new_indx = (new_indx - 1) & wrap_around_mask;
		}
		/* Move index to next CQE to check */
		check_indx = (check_indx - 1) & wrap_around_mask;
	}
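
	/*
	 * Worked example (illustrative, hypothetical contents): suppose four
	 * outstanding CQEs [A, B, C, D] sit at indices 0-3, with only B
	 * belonging to the flushed QP.  Scanning backward from index 3, D
	 * and C stay in place, B is released to the SRQ free list, and A is
	 * copied up into B's old slot.  The kept entries end up compacted at
	 * indices 1-3, 'new_indx' lands on 0, and slot 0 is handed back to
	 * the hardware below.
	 */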

	/* Initialize removed cqes count */
	removed_cqes = 0;

	/* If an entry was removed */
	if (check_indx != new_indx) {

		/*
		 * Set current pointer back to the beginning consumer index.
		 * At this point, all unclaimed entries have been copied to the
		 * index specified by 'new_indx'.  This 'new_indx' will be used
		 * as the new consumer index after we mark all freed entries as
		 * having HW ownership.  We do that here.
		 */

		/* Loop through all entries until we reach our new pointer */
		for (indx = cons_indx; indx <= new_indx;
		    indx = (indx + 1) & wrap_around_mask) {
			removed_cqes++;
			cqe = &cq->cq_buf[indx];

			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cq, cqe);
		}
	}

	/*
	 * Update the consumer index to be 'new_indx' + 1.  This moves it
	 * past all of the removed entries.  Because 'new_indx' points at the
	 * last of the slots just returned to hardware ownership, adding 1
	 * points 'cons_indx' at the first remaining SW-owned entry (i.e. the
	 * next entry to be consumed).
	 */
	cons_indx = (new_indx + 1) & wrap_around_mask;

	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we found no QP number
	 * matches above, then we would not have removed anything.  So only if
	 * something was removed do we ring the doorbell.
	 */
	if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Post doorbell to update the consumer index.  Doorbell
		 * value indicates number of entries consumed (minus 1)
		 */
		if (cons_indx > cq->cq_consindx) {
			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
		} else {
			num_to_increment = ((cons_indx + cq->cq_bufsz) -
			    cq->cq_consindx) - 1;
		}
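
		/*
		 * For example (illustrative values): moving the consumer
		 * index from 254 to 2 on a 256-entry CQ consumes four
		 * entries (254, 255, 0, and 1), so the 'num_to_increment'
		 * computed above is 3.
		 */
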
		cq->cq_consindx = cons_indx;

		tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
		    cq->cq_cqnum, num_to_increment);
	}
}
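
/*
 * Caller sketch (hypothetical, for illustration only): this flush routine
 * expects the receive-CQ lock to be held, per the ASSERT above, so a
 * caller tearing down an SRQ-attached QP would do roughly:
 *
 *	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
 *	tavor_cq_srq_entries_flush(state, qp);
 *	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
 */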