xref: /illumos-gate/usr/src/uts/common/io/ib/adapters/tavor/tavor_wr.c (revision 86644ba28c50af32651bfc09efc3bd975a003ac1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * tavor_wr.c
29  *    Tavor Work Request Processing Routines
30  *
31  *    Implements all the routines necessary to provide the PostSend(),
32  *    PostRecv() and PostSRQ() verbs.  Also contains all the code
33  *    necessary to implement the Tavor WRID tracking mechanism.
34  */
35 
36 #include <sys/types.h>
37 #include <sys/conf.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/modctl.h>
41 #include <sys/avl.h>
42 
43 #include <sys/ib/adapters/tavor/tavor.h>
44 
45 static void tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda,
46     uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode);
47 #pragma inline(tavor_qp_send_doorbell)
48 static void tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda,
49     uint32_t nds, uint32_t qpn, uint32_t credits);
50 #pragma inline(tavor_qp_recv_doorbell)
51 static uint32_t tavor_wr_get_immediate(ibt_send_wr_t *wr);
52 static int tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr);
53 static int tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
54     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
55 static void tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr,
56     ibt_send_wr_t *prev_wr, uint64_t *curr_desc, uint_t curr_descsz,
57     uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp);
58 static int tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
59     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
60 static void tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
61     uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
62     tavor_qphdl_t qp);
63 static int tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
64     ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size);
65 static void tavor_wqe_recv_linknext(uint64_t *desc, uint_t desc_sz,
66     uint64_t *prev, tavor_qphdl_t qp);
67 static int tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
68     ibt_recv_wr_t *wr, uint64_t *desc);
69 static void tavor_wqe_srq_linknext(uint64_t *desc, uint64_t *prev,
70     tavor_srqhdl_t srq);
71 static void tavor_wqe_sync(void *hdl, uint_t sync_from,
72     uint_t sync_to, uint_t sync_type, uint_t flag);
73 static tavor_wrid_entry_t *tavor_wrid_find_match(tavor_workq_hdr_t *wq,
74     tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe);
75 static void tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq);
76 static tavor_workq_hdr_t *tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn,
77     uint_t send_or_recv);
78 static tavor_workq_hdr_t *tavor_wrid_wqhdr_create(tavor_state_t *state,
79     tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type, uint_t create_wql);
80 static uint32_t tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq);
81 static void tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
82     tavor_wrid_list_hdr_t *wrid_list);
83 static void tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
84     tavor_wrid_list_hdr_t *wrid_list);
85 static tavor_workq_hdr_t *tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wq);
86 static void tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp);
87 static void tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp);
88 static void tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
89 static void tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
90 
91 /*
92  * tavor_post_send()
93  *    Context: Can be called from interrupt or base context.
94  */
95 int
96 tavor_post_send(tavor_state_t *state, tavor_qphdl_t qp,
97     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
98 {
99 	tavor_sw_wqe_dbinfo_t		dbinfo;
100 	tavor_wrid_list_hdr_t		*wridlist;
101 	tavor_wrid_entry_t		*wre_last;
102 	uint64_t			*desc, *prev, *first;
103 	uint32_t			desc_sz, first_sz;
104 	uint32_t			wqeaddrsz, signaled_dbd;
105 	uint32_t			head, tail, next_tail, qsize_msk;
106 	uint32_t			sync_from, sync_to;
107 	uint_t				currindx, wrindx, numremain;
108 	uint_t				chainlen, chainbegin, posted_cnt;
109 	uint_t				maxdb = TAVOR_QP_MAXDESC_PER_DB;
110 	int				status;
111 
112 	TAVOR_TNF_ENTER(tavor_post_send);
113 
114 	/*
115 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
116 	 * clients to post to QP memory that is accessible directly by the
117 	 * user.  If the QP memory is user accessible, then return an error.
118 	 */
119 	if (qp->qp_is_umap) {
120 		TNF_PROBE_0(tavor_post_send_inv_usrmapped_type,
121 		    TAVOR_TNF_ERROR, "");
122 		TAVOR_TNF_EXIT(tavor_post_send);
123 		return (IBT_QP_HDL_INVALID);
124 	}
125 
126 	/* Initialize posted_cnt */
127 	posted_cnt = 0;
128 
129 	mutex_enter(&qp->qp_lock);
130 
131 	/*
132 	 * Check QP state.  Can not post Send requests from the "Reset",
133 	 * "Init", or "RTR" states
134 	 */
135 	if ((qp->qp_state == TAVOR_QP_RESET) ||
136 	    (qp->qp_state == TAVOR_QP_INIT) ||
137 	    (qp->qp_state == TAVOR_QP_RTR)) {
138 		mutex_exit(&qp->qp_lock);
139 		TNF_PROBE_0(tavor_post_send_inv_qpstate_fail,
140 		    TAVOR_TNF_ERROR, "");
141 		TAVOR_TNF_EXIT(tavor_post_send);
142 		return (IBT_QP_STATE_INVALID);
143 	}
144 
145 	/* Grab the lock for the WRID list */
146 	mutex_enter(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
147 	wridlist  = qp->qp_sq_wqhdr->wq_wrid_post;
148 
149 	/* Save away some initial QP state */
150 	qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
151 	tail	  = qp->qp_sq_wqhdr->wq_tail;
152 	head	  = qp->qp_sq_wqhdr->wq_head;
153 
154 	/*
155 	 * For each ibt_send_wr_t in the wr[] list passed in, parse the
156 	 * request and build a Send WQE.  Note:  Because we are potentially
157 	 * building a chain of WQEs, we want to link them all together.
158 	 * However, we do not want to link the first one to the previous
159 	 * WQE until the entire chain has been linked.  Then in the last
160 	 * step we ring the appropriate doorbell.  Note:  It is possible for
161 	 * more Work Requests to be posted than the HW will support at one
162 	 * shot.  If this happens, we need to be able to post and ring
163 	 * several chains here until the entire request is complete.
164 	 */
165 	wrindx = 0;
166 	numremain = num_wr;
167 	status	  = DDI_SUCCESS;
168 	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
169 		/*
170 		 * For the first WQE on a new chain we need "prev" to point
171 		 * to the current descriptor.  As we begin to process
172 		 * further, "prev" will be updated to point to the previous
173 		 * WQE on the current chain (see below).
174 		 */
175 		prev = TAVOR_QP_SQ_ENTRY(qp, tail);
176 
177 		/*
178 		 * Before we begin, save the current "tail index" for later
179 		 * DMA sync
180 		 */
181 		sync_from = tail;
182 
183 		/*
184 		 * Break the request up into chains that are less than or
185 		 * equal to the maximum number of WQEs that can be posted
186 		 * per doorbell ring
187 		 */
188 		chainlen   = (numremain > maxdb) ? maxdb : numremain;
189 		numremain -= chainlen;
190 		chainbegin = wrindx;
191 		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
192 			/*
193 			 * Check for "queue full" condition.  If the queue
194 			 * is already full, then no more WQEs can be posted.
195 			 * So break out, ring a doorbell (if necessary) and
196 			 * return an error
197 			 */
198 			if (qp->qp_sq_wqhdr->wq_full != 0) {
199 				status = IBT_QP_FULL;
200 				TNF_PROBE_0_DEBUG(tavor_post_send_sqfull,
201 				    TAVOR_TNF_TRACE, "");
202 				break;
203 			}
204 
205 			/*
206 			 * Increment the "tail index" and check for "queue
207 			 * full" condition.  If we detect that the current
208 			 * work request is going to fill the work queue, then
209 			 * we mark this condition and continue.
210 			 */
211 			next_tail = (tail + 1) & qsize_msk;
212 			if (next_tail == head) {
213 				qp->qp_sq_wqhdr->wq_full = 1;
214 			}
215 
216 			/*
217 			 * Get the address of the location where the next
218 			 * Send WQE should be built
219 			 */
220 			desc = TAVOR_QP_SQ_ENTRY(qp, tail);
221 
222 			/*
223 			 * Call tavor_wqe_send_build() to build the WQE
224 			 * at the given address.  This routine uses the
225 			 * information in the ibt_send_wr_t list (wr[]) and
226 			 * returns the size of the WQE when it returns.
227 			 */
228 			status = tavor_wqe_send_build(state, qp,
229 			    &wr[wrindx], desc, &desc_sz);
230 			if (status != DDI_SUCCESS) {
231 				TNF_PROBE_0(tavor_post_send_bldwqe_fail,
232 				    TAVOR_TNF_ERROR, "");
233 				break;
234 			}
235 
236 			/*
237 			 * Add a WRID entry to the WRID list.  Need to
238 			 * calculate the "wqeaddrsz" and "signaled_dbd"
239 			 * values to pass to tavor_wrid_add_entry()
240 			 */
241 			wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
242 			    ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
243 			    desc_sz);
244 			if ((qp->qp_sq_sigtype == TAVOR_QP_SQ_ALL_SIGNALED) ||
245 			    (wr[wrindx].wr_flags & IBT_WR_SEND_SIGNAL)) {
246 				signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
247 			} else {
248 				signaled_dbd = 0;
249 			}
250 			tavor_wrid_add_entry(qp->qp_sq_wqhdr,
251 			    wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);
252 
253 			/*
254 			 * If this is not the first descriptor on the current
255 			 * chain, then link it to the previous WQE.  Otherwise,
256 			 * save the address and size of this descriptor (in
257 			 * "first" and "first_sz" respectively) and continue.
258 			 * Note: Linking a WQE to the previous one will
259 			 * depend on whether the two WQEs are from "special
260 			 * QPs" (i.e. MLX transport WQEs) or whether they are
261 			 * normal Send WQEs.
262 			 */
263 			if (currindx != 0) {
264 				if (qp->qp_is_special) {
265 					tavor_wqe_mlx_linknext(&wr[wrindx - 1],
266 					    desc, desc_sz, prev, NULL, qp);
267 				} else {
268 					tavor_wqe_send_linknext(&wr[wrindx],
269 					    &wr[wrindx - 1], desc, desc_sz,
270 					    prev, NULL, qp);
271 				}
272 				prev = desc;
273 			} else {
274 				first	 = desc;
275 				first_sz = desc_sz;
276 			}
277 
278 			/*
279 			 * Update the current "tail index" and increment
280 			 * "posted_cnt"
281 			 */
282 			tail = next_tail;
283 			posted_cnt++;
284 		}
285 
286 		/*
287 		 * If we reach here and there are one or more WQEs which have
288 		 * been successfully chained together, then we need to link
289 		 * the current chain to the previously executing chain of
290 		 * descriptors (if there is one) and ring the doorbell for the
291 		 * send work queue.
292 		 */
293 		if (currindx != 0) {
294 			/*
295 			 * Before we link the chain, we need to ensure that the
296 			 * "next" field on the last WQE is set to NULL (to
297 			 * indicate the end of the chain).  Note: Just as it
298 			 * did above, the format for the "next" fields in a
299 			 * given WQE depends on whether the WQE is MLX
300 			 * transport or not.
301 			 */
302 			if (qp->qp_is_special) {
303 				tavor_wqe_mlx_linknext(&wr[chainbegin +
304 				    currindx - 1], NULL, 0, prev, NULL, qp);
305 			} else {
306 				tavor_wqe_send_linknext(NULL,
307 				    &wr[chainbegin + currindx - 1], NULL, 0,
308 				    prev, NULL, qp);
309 			}
310 
311 			/* Save away updated "tail index" for the DMA sync */
312 			sync_to = tail;
313 
314 			/* Do a DMA sync for current send WQE(s) */
315 			tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_SEND,
316 			    DDI_DMA_SYNC_FORDEV);
317 
318 			/*
319 			 * Now link the chain to the old chain (if there was
320 			 * one).  Note: still need to pay attention to whether
321 			 * the QP used MLX transport WQEs or not.
322 			 */
323 			if (qp->qp_is_special) {
324 				tavor_wqe_mlx_linknext(NULL, first, first_sz,
325 				    qp->qp_sq_lastwqeaddr, &dbinfo, qp);
326 			} else {
327 				tavor_wqe_send_linknext(&wr[chainbegin], NULL,
328 				    first, first_sz, qp->qp_sq_lastwqeaddr,
329 				    &dbinfo, qp);
330 			}
331 
332 			/*
333 			 * If there was a valid previous WQE (i.e. non-NULL),
334 			 * then sync it too.  This is because we have updated
335 			 * its "next" fields and we want to ensure that the
336 			 * hardware can see the changes.
337 			 */
338 			if (qp->qp_sq_lastwqeaddr != NULL) {
339 				sync_to   = sync_from;
340 				sync_from = (sync_from - 1) & qsize_msk;
341 				tavor_wqe_sync(qp, sync_from, sync_to,
342 				    TAVOR_WR_SEND, DDI_DMA_SYNC_FORDEV);
343 			}
344 
345 			/*
346 			 * Now if the WRID tail entry is non-NULL, then this
347 			 * represents the entry to which we are chaining the
348 			 * new entries.  Since we are going to ring the
349 			 * doorbell for this WQE, we want to set its "dbd" bit.
350 			 *
351 			 * On the other hand, if the tail is NULL, even though
352 			 * we will have rung the doorbell for the previous WQE
353 			 * (for the hardware's sake) it is irrelevant to our
354 			 * purposes (for tracking WRIDs) because we know the
355 			 * request must have already completed.
356 			 */
357 			wre_last = wridlist->wl_wre_old_tail;
358 			if (wre_last != NULL) {
359 				wre_last->wr_signaled_dbd |=
360 				    TAVOR_WRID_ENTRY_DOORBELLED;
361 			}
362 
363 			/* Update some of the state in the QP */
364 			qp->qp_sq_lastwqeaddr	 = desc;
365 			qp->qp_sq_wqhdr->wq_tail = tail;
366 
367 			/* Ring the doorbell */
368 			tavor_qp_send_doorbell(state,
369 			    (uint32_t)((uintptr_t)first - qp->qp_desc_off),
370 			    first_sz, qp->qp_qpnum, dbinfo.db_fence,
371 			    dbinfo.db_nopcode);
372 		}
373 	}
374 
375 	/*
376 	 * Update the "num_posted" return value (if necessary).  Then drop
377 	 * the locks and return the status.
378 	 */
379 	if (num_posted != NULL) {
380 		*num_posted = posted_cnt;
381 	}
382 
383 	mutex_exit(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
384 	mutex_exit(&qp->qp_lock);
385 
386 	TAVOR_TNF_EXIT(tavor_post_send);
387 	return (status);
388 }
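/*
 * Illustrative sketch, not part of the original driver: how a kernel
 * client might drive tavor_post_send() above.  The "state" and "qp"
 * handles and the caller-registered SGL entries are assumed to exist;
 * all values here are hypothetical.
 */
#ifdef TAVOR_WR_EXAMPLE
static int
tavor_example_post_send_chain(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_wr_ds_t *sgl)
{
	ibt_send_wr_t	wr[2];
	uint_t		posted = 0;
	int		status;

	bzero(wr, sizeof (wr));

	/* First WR: unsignaled RC Send with one data segment */
	wr[0].wr_id	= 0x1001;
	wr[0].wr_trans	= IBT_RC_SRV;
	wr[0].wr_opcode	= IBT_WRC_SEND;
	wr[0].wr_nds	= 1;
	wr[0].wr_sgl	= &sgl[0];

	/* Second WR: signaled, so it will generate a completion */
	wr[1] = wr[0];
	wr[1].wr_id	= 0x1002;
	wr[1].wr_flags	= IBT_WR_SEND_SIGNAL;
	wr[1].wr_sgl	= &sgl[1];

	status = tavor_post_send(state, qp, wr, 2, &posted);

	/* On IBT_QP_FULL, "posted" says how many WQEs were accepted */
	return (status);
}
#endif	/* TAVOR_WR_EXAMPLE */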
389 
390 
391 /*
392  * tavor_post_recv()
393  *    Context: Can be called from interrupt or base context.
394  */
395 int
396 tavor_post_recv(tavor_state_t *state, tavor_qphdl_t qp,
397     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
398 {
399 	uint64_t			*desc, *prev, *first;
400 	uint32_t			desc_sz, first_sz;
401 	uint32_t			wqeaddrsz, signaled_dbd;
402 	uint32_t			head, tail, next_tail, qsize_msk;
403 	uint32_t			sync_from, sync_to;
404 	uint_t				currindx, wrindx, numremain;
405 	uint_t				chainlen, posted_cnt;
406 	uint_t				maxdb = TAVOR_QP_MAXDESC_PER_DB;
407 	int				status;
408 
409 	TAVOR_TNF_ENTER(tavor_post_recv);
410 
411 	/*
412 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
413 	 * clients to post to QP memory that is accessible directly by the
414 	 * user.  If the QP memory is user accessible, then return an error.
415 	 */
416 	if (qp->qp_is_umap) {
417 		TNF_PROBE_0(tavor_post_recv_inv_usrmapped_type,
418 		    TAVOR_TNF_ERROR, "");
419 		TAVOR_TNF_EXIT(tavor_post_recv);
420 		return (IBT_QP_HDL_INVALID);
421 	}
422 
423 	/* Initialize posted_cnt */
424 	posted_cnt = 0;
425 
426 	mutex_enter(&qp->qp_lock);
427 
428 	/*
429 	 * Check if QP is associated with an SRQ
430 	 */
431 	if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
432 		mutex_exit(&qp->qp_lock);
433 		TNF_PROBE_0(tavor_post_recv_fail_qp_on_srq,
434 		    TAVOR_TNF_ERROR, "");
435 		TAVOR_TNF_EXIT(tavor_post_recv);
436 		return (IBT_SRQ_IN_USE);
437 	}
438 
439 	/*
440 	 * Check QP state.  Can not post Recv requests from the "Reset" state
441 	 */
442 	if (qp->qp_state == TAVOR_QP_RESET) {
443 		mutex_exit(&qp->qp_lock);
444 		TNF_PROBE_0(tavor_post_recv_inv_qpstate_fail,
445 		    TAVOR_TNF_ERROR, "");
446 		TAVOR_TNF_EXIT(tavor_post_recv);
447 		return (IBT_QP_STATE_INVALID);
448 	}
449 
450 	/* Grab the lock for the WRID list */
451 	mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
452 
453 	/* Save away some initial QP state */
454 	qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
455 	tail	  = qp->qp_rq_wqhdr->wq_tail;
456 	head	  = qp->qp_rq_wqhdr->wq_head;
457 
458 	/*
459 	 * For each ibt_recv_wr_t in the wr[] list passed in, parse the
460 	 * request and build a Recv WQE.  Note:  Because we are potentially
461 	 * building a chain of WQEs, we want to link them all together.
462 	 * However, we do not want to link the first one to the previous
463 	 * WQE until the entire chain has been linked.  Then in the last
464 	 * step we ring the appropriate doorbell.  Note:  It is possible for
465 	 * more Work Requests to be posted than the HW will support at one
466 	 * shot.  If this happens, we need to be able to post and ring
467 	 * several chains here until the entire request is complete.
468 	 */
469 	wrindx = 0;
470 	numremain = num_wr;
471 	status	  = DDI_SUCCESS;
472 	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
473 		/*
474 		 * For the first WQE on a new chain we need "prev" to point
475 		 * to the current descriptor.  As we begin to process
476 		 * further, "prev" will be updated to point to the previous
477 		 * WQE on the current chain (see below).
478 		 */
479 		prev = TAVOR_QP_RQ_ENTRY(qp, tail);
480 
481 		/*
482 		 * Before we begin, save the current "tail index" for later
483 		 * DMA sync
484 		 */
485 		sync_from = tail;
486 
487 		/*
488 		 * Break the request up into chains that are less than or
489 		 * equal to the maximum number of WQEs that can be posted
490 		 * per doorbell ring
491 		 */
492 		chainlen = (numremain > maxdb) ? maxdb : numremain;
493 		numremain -= chainlen;
494 		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
495 			/*
496 			 * Check for "queue full" condition.  If the queue
497 			 * is already full, then no more WQEs can be posted.
498 			 * So break out, ring a doorbell (if necessary) and
499 			 * return an error
500 			 */
501 			if (qp->qp_rq_wqhdr->wq_full != 0) {
502 				status = IBT_QP_FULL;
503 				TNF_PROBE_0_DEBUG(tavor_post_recv_rqfull,
504 				    TAVOR_TNF_TRACE, "");
505 				break;
506 			}
507 
508 			/*
509 			 * Increment the "tail index" and check for "queue
510 			 * full" condition.  If we detect that the current
511 			 * work request is going to fill the work queue, then
512 			 * we mark this condition and continue.
513 			 */
514 			next_tail = (tail + 1) & qsize_msk;
515 			if (next_tail == head) {
516 				qp->qp_rq_wqhdr->wq_full = 1;
517 			}
518 
519 			/*
520 			 * Get the address of the location where the next
521 			 * Recv WQE should be built
522 			 */
523 			desc = TAVOR_QP_RQ_ENTRY(qp, tail);
524 
525 			/*
526 			 * Call tavor_wqe_recv_build() to build the WQE
527 			 * at the given address.  This routine uses the
528 			 * information in the ibt_recv_wr_t list (wr[]) and
529 			 * returns the size of the WQE when it returns.
530 			 */
531 			status = tavor_wqe_recv_build(state, qp, &wr[wrindx],
532 			    desc, &desc_sz);
533 			if (status != DDI_SUCCESS) {
534 				TNF_PROBE_0(tavor_post_recv_bldwqe_fail,
535 				    TAVOR_TNF_ERROR, "");
536 				break;
537 			}
538 
539 			/*
540 			 * Add a WRID entry to the WRID list.  Need to
541 			 * calculate the "wqeaddrsz" and "signaled_dbd"
542 			 * values to pass to tavor_wrid_add_entry().  Note:
543 			 * all Recv WQEs are essentially "signaled" and
544 			 * "doorbelled" (since Tavor HW requires all
545 			 * RecvWQE's to have their "DBD" bits set).
546 			 */
547 			wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
548 			    ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
549 			    desc_sz);
550 			signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED |
551 			    TAVOR_WRID_ENTRY_DOORBELLED;
552 			tavor_wrid_add_entry(qp->qp_rq_wqhdr,
553 			    wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);
554 
555 			/*
556 			 * If this is not the first descriptor on the current
557 			 * chain, then link it to the previous WQE.  Otherwise,
558 			 * save the address and size of this descriptor (in
559 			 * "first" and "first_sz" respectively) and continue.
560 			 */
561 			if (currindx != 0) {
562 				tavor_wqe_recv_linknext(desc, desc_sz, prev,
563 				    qp);
564 				prev = desc;
565 			} else {
566 				first	 = desc;
567 				first_sz = desc_sz;
568 			}
569 
570 			/*
571 			 * Update the current "tail index" and increment
572 			 * "posted_cnt"
573 			 */
574 			tail = next_tail;
575 			posted_cnt++;
576 		}
577 
578 		/*
579 		 * If we reach here and there are one or more WQEs which have
580 		 * been successfully chained together, then we need to link
581 		 * the current chain to the previously executing chain of
582 		 * descriptors (if there is one) and ring the doorbell for the
583 		 * recv work queue.
584 		 */
585 		if (currindx != 0) {
586 			/*
587 			 * Before we link the chain, we need to ensure that the
588 			 * "next" field on the last WQE is set to NULL (to
589 			 * indicate the end of the chain).
590 			 */
591 			tavor_wqe_recv_linknext(NULL, 0, prev, qp);
592 
593 			/* Save away updated "tail index" for the DMA sync */
594 			sync_to = tail;
595 
596 			/* Do a DMA sync for current recv WQE(s) */
597 			tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_RECV,
598 			    DDI_DMA_SYNC_FORDEV);
599 
600 			/*
601 			 * Now link the chain to the old chain (if there was
602 			 * one).
603 			 */
604 			tavor_wqe_recv_linknext(first, first_sz,
605 			    qp->qp_rq_lastwqeaddr, qp);
606 
607 			/*
608 			 * If there was a valid previous WQE (i.e. non-NULL),
609 			 * then sync it too.  This is because we have updated
610 			 * its "next" fields and we want to ensure that the
611 			 * hardware can see the changes.
612 			 */
613 			if (qp->qp_rq_lastwqeaddr != NULL) {
614 				sync_to	  = sync_from;
615 				sync_from = (sync_from - 1) & qsize_msk;
616 				tavor_wqe_sync(qp, sync_from, sync_to,
617 				    TAVOR_WR_RECV, DDI_DMA_SYNC_FORDEV);
618 			}
619 
620 			/* Update some of the state in the QP */
621 			qp->qp_rq_lastwqeaddr	 = desc;
622 			qp->qp_rq_wqhdr->wq_tail = tail;
623 
624 			/* Ring the doorbell */
625 			tavor_qp_recv_doorbell(state,
626 			    (uint32_t)((uintptr_t)first - qp->qp_desc_off),
627 			    first_sz, qp->qp_qpnum, (chainlen % maxdb));
628 		}
629 	}
630 
631 	/*
632 	 * Update the "num_posted" return value (if necessary).  Then drop
633 	 * the locks and return the status.
634 	 */
635 	if (num_posted != NULL) {
636 		*num_posted = posted_cnt;
637 	}
638 
639 	mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
640 	mutex_exit(&qp->qp_lock);
641 
642 	TAVOR_TNF_EXIT(tavor_post_recv);
643 	return (status);
644 }
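/*
 * Minimal sketch, not part of the original driver, of the work queue
 * index arithmetic used by tavor_post_send() and tavor_post_recv()
 * above.  The queues are power-of-two sized, so wraparound is a mask
 * ("qsize_msk") rather than a modulo.  Note that the WQE being posted
 * when the advanced tail meets the head still fits; the "full" flag
 * only fails the next post attempt.
 */
#ifdef TAVOR_WR_EXAMPLE
static int
tavor_example_wq_advance(uint32_t wq_size, uint32_t head, uint32_t *tail)
{
	uint32_t	qsize_msk = wq_size - 1;	/* wq_size is 2^n */
	uint32_t	next_tail = (*tail + 1) & qsize_msk;
	int		full	  = (next_tail == head);

	*tail = next_tail;
	return (full);		/* caller sets wq_full to stop later posts */
}
#endif	/* TAVOR_WR_EXAMPLE */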
645 
646 /*
647  * tavor_post_srq()
648  *    Context: Can be called from interrupt or base context.
649  */
650 int
651 tavor_post_srq(tavor_state_t *state, tavor_srqhdl_t srq,
652     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
653 {
654 	uint64_t			*desc, *prev, *first, *last_wqe_addr;
655 	uint32_t			signaled_dbd;
656 	uint32_t			sync_indx;
657 	uint_t				currindx, wrindx, numremain;
658 	uint_t				chainlen, posted_cnt;
659 	uint_t				maxdb = TAVOR_QP_MAXDESC_PER_DB;
660 	int				status;
661 
662 	TAVOR_TNF_ENTER(tavor_post_srq);
663 
664 	/*
665 	 * Check for user-mappable SRQ memory.  Note:  We do not allow kernel
666 	 * clients to post to SRQ memory that is accessible directly by the
667 	 * user.  If the SRQ memory is user accessible, then return an error.
668 	 */
669 	if (srq->srq_is_umap) {
670 		TNF_PROBE_0(tavor_post_srq_inv_usrmapped_type,
671 		    TAVOR_TNF_ERROR, "");
672 		TAVOR_TNF_EXIT(tavor_post_srq);
673 		return (IBT_SRQ_HDL_INVALID);
674 	}
675 
676 	/* Initialize posted_cnt */
677 	posted_cnt = 0;
678 
679 	mutex_enter(&srq->srq_lock);
680 
681 	/*
682 	 * Check SRQ state.  Can not post Recv requests when SRQ is in error
683 	 */
684 	if (srq->srq_state == TAVOR_SRQ_STATE_ERROR) {
685 		mutex_exit(&srq->srq_lock);
686 		TNF_PROBE_0(tavor_post_srq_inv_srqstate_fail,
687 		    TAVOR_TNF_ERROR, "");
688 		TAVOR_TNF_EXIT(tavor_post_srq);
689 		return (IBT_QP_STATE_INVALID);
690 	}
691 
692 	/* Grab the lock for the WRID list */
693 	mutex_enter(&srq->srq_wrid_wql->wql_lock);
694 
695 	/*
696 	 * For each ibt_recv_wr_t in the wr[] list passed in, parse the
697 	 * request and build a Recv WQE.  Note:  Because we are potentially
698 	 * building a chain of WQEs, we want to link them all together.
699 	 * However, we do not want to link the first one to the previous
700 	 * WQE until the entire chain has been linked.  Then in the last
701 	 * step we ring the appropriate doorbell.  Note:  It is possible for
702 	 * more Work Requests to be posted than the HW will support at one
703 	 * shot.  If this happens, we need to be able to post and ring
704 	 * several chains here until the entire request is complete.
705 	 */
706 	wrindx = 0;
707 	numremain = num_wr;
708 	status	  = DDI_SUCCESS;
709 	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
710 		/*
711 		 * For the first WQE on a new chain we need "prev" to point
712 		 * to the current descriptor.  As we begin to process
713 		 * further, "prev" will be updated to point to the previous
714 		 * WQE on the current chain (see below).
715 		 */
716 		if (srq->srq_wq_lastwqeindx == -1) {
717 			prev = NULL;
718 		} else {
719 			prev = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wq_lastwqeindx);
720 		}
721 
722 		/*
723 		 * Break the request up into chains that are less than or
724 		 * equal to the maximum number of WQEs that can be posted
725 		 * per doorbell ring
726 		 */
727 		chainlen = (numremain > maxdb) ? maxdb : numremain;
728 		numremain -= chainlen;
729 		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
730 
731 			/*
732 			 * Check for "queue full" condition.  If the queue
733 			 * is already full, then no more WQEs can be posted.
734 			 * So break out, ring a doorbell (if necessary) and
735 			 * return an error
736 			 */
737 			if (srq->srq_wridlist->wl_free_list_indx == -1) {
738 				status = IBT_QP_FULL;
739 				TNF_PROBE_0_DEBUG(tavor_post_srq_wqfull,
740 				    TAVOR_TNF_TRACE, "");
741 				break;
742 			}
743 
744 			/*
745 			 * Get the address of the location where the next
746 			 * Recv WQE should be built
747 			 */
748 			desc = TAVOR_SRQ_WQE_ADDR(srq,
749 			    srq->srq_wridlist->wl_free_list_indx);
750 
751 			/*
752 			 * Add a WRID entry to the WRID list.  Need to
753 			 * set the "signaled_dbd" values to pass to
754 			 * tavor_wrid_add_entry().  Note: all Recv WQEs are
755 			 * essentially "signaled"
756 			 *
757 			 * The 'size' is stored at srq_alloc time, in the
758 			 * srq_wq_stride.  This is a constant value required
759 			 * for SRQ.
760 			 */
761 			signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
762 			tavor_wrid_add_entry_srq(srq, wr[wrindx].wr_id,
763 			    signaled_dbd);
764 
765 			/*
766 			 * Call tavor_wqe_srq_build() to build the WQE
767 			 * at the given address.  This routine uses the
768 			 * information in the ibt_recv_wr_t list (wr[]) and
769 			 * returns the size of the WQE when it returns.
770 			 */
771 			status = tavor_wqe_srq_build(state, srq, &wr[wrindx],
772 			    desc);
773 			if (status != DDI_SUCCESS) {
774 				TNF_PROBE_0(tavor_post_recv_bldwqe_fail,
775 				    TAVOR_TNF_ERROR, "");
776 				break;
777 			}
778 
779 			/*
780 			 * If this is not the first descriptor on the current
781 			 * chain, then link it to the previous WQE.  Otherwise,
782 			 * save the address of this descriptor (in "first") and
783 			 * continue.
784 			 */
785 			if (currindx != 0) {
786 				tavor_wqe_srq_linknext(desc, prev, srq);
787 				sync_indx = TAVOR_SRQ_WQE_INDEX(
788 				    srq->srq_wq_buf, prev,
789 				    srq->srq_wq_log_wqesz);
790 
791 				/* Do a DMA sync for previous recv WQE */
792 				tavor_wqe_sync(srq, sync_indx, sync_indx+1,
793 				    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
794 
795 				prev = desc;
796 			} else {
797 
798 				/*
799 				 * In this case, the last WQE on the chain is
800 				 * also considered 'first'.  So set prev to
801 				 * first, here.
802 				 */
803 				first = prev = desc;
804 			}
805 
806 			/*
807 			 * Increment "posted_cnt"
808 			 */
809 			posted_cnt++;
810 		}
811 
812 		/*
813 		 * If we reach here and there are one or more WQEs which have
814 		 * been successfully chained together, then we need to link
815 		 * the current chain to the previously executing chain of
816 		 * descriptors (if there is one) and ring the doorbell for the
817 		 * recv work queue.
818 		 */
819 		if (currindx != 0) {
820 			/*
821 			 * Before we link the chain, we need to ensure that the
822 			 * "next" field on the last WQE is set to NULL (to
823 			 * indicate the end of the chain).
824 			 */
825 			tavor_wqe_srq_linknext(NULL, prev, srq);
826 
827 			sync_indx = TAVOR_SRQ_WQE_INDEX(srq->srq_wq_buf, prev,
828 			    srq->srq_wq_log_wqesz);
829 
830 			/* Do a DMA sync for current recv WQE */
831 			tavor_wqe_sync(srq, sync_indx, sync_indx+1,
832 			    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
833 
834 			/*
835 			 * Now link the chain to the old chain (if there was
836 			 * one).
837 			 */
838 			if (srq->srq_wq_lastwqeindx == -1) {
839 				last_wqe_addr = NULL;
840 			} else {
841 				last_wqe_addr = TAVOR_SRQ_WQE_ADDR(srq,
842 				    srq->srq_wq_lastwqeindx);
843 			}
844 			tavor_wqe_srq_linknext(first, last_wqe_addr, srq);
845 
846 			/*
847 			 * If there was a valid previous WQE (i.e. valid index),
848 			 * then sync it too.  This is because we have updated
849 			 * its "next" fields and we want to ensure that the
850 			 * hardware can see the changes.
851 			 */
852 			if (srq->srq_wq_lastwqeindx != -1) {
853 				sync_indx = srq->srq_wq_lastwqeindx;
854 				tavor_wqe_sync(srq, sync_indx, sync_indx+1,
855 				    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
856 			}
857 
858 			/* Update some of the state in the SRQ */
859 			srq->srq_wq_lastwqeindx = TAVOR_SRQ_WQE_INDEX(
860 			    srq->srq_wq_buf, desc,
861 			    srq->srq_wq_log_wqesz);
862 
863 			/* Ring the doorbell */
864 			/* SRQ needs NDS of 0 */
865 			tavor_qp_recv_doorbell(state,
866 			    (uint32_t)((uintptr_t)first - srq->srq_desc_off),
867 			    0, srq->srq_srqnum, (chainlen % maxdb));
868 		}
869 	}
870 
871 	/*
872 	 * Update the "num_posted" return value (if necessary).  Then drop
873 	 * the locks and return the status.
874 	 */
875 	if (num_posted != NULL) {
876 		*num_posted = posted_cnt;
877 	}
878 
879 	mutex_exit(&srq->srq_wrid_wql->wql_lock);
880 	mutex_exit(&srq->srq_lock);
881 
882 	TAVOR_TNF_EXIT(tavor_post_srq);
883 	return (status);
884 }
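/*
 * Sketch with hypothetical field names, not original driver code: the
 * SRQ path above does not use head/tail indexes.  Instead, free WQEs
 * are tracked through a free list in the WRID list structure, and
 * "wl_free_list_indx == -1" is the queue-full condition.  Conceptually
 * the allocation step looks like this:
 */
#ifdef TAVOR_WR_EXAMPLE
typedef struct example_srq_freelist_s {
	int32_t		fl_head;	/* -1 when no free WQE remains */
	int32_t		*fl_next;	/* per-WQE link to next free index */
} example_srq_freelist_t;

static int32_t
tavor_example_srq_wqe_alloc(example_srq_freelist_t *fl)
{
	int32_t	indx = fl->fl_head;

	if (indx != -1) {
		fl->fl_head = fl->fl_next[indx];	/* pop the free WQE */
	}
	return (indx);		/* -1 maps to IBT_QP_FULL above */
}
#endif	/* TAVOR_WR_EXAMPLE */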
885 
886 
887 /*
888  * tavor_qp_send_doorbell()
889  *    Context: Can be called from interrupt or base context.
890  */
891 static void
892 tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
893     uint32_t qpn, uint32_t fence, uint32_t nopcode)
894 {
895 	uint64_t	doorbell = 0;
896 
897 	/* Build the doorbell from the parameters */
898 	doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
899 	    TAVOR_QPSNDDB_NDA_SHIFT) |
900 	    ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
901 	    ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
902 	    ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds;
903 
904 	TNF_PROBE_1_DEBUG(tavor_qp_send_doorbell, TAVOR_TNF_TRACE, "",
905 	    tnf_ulong, doorbell, doorbell);
906 
907 	/* Write the doorbell to UAR */
908 	TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->send,
909 	    doorbell);
910 }
911 
912 
913 /*
914  * tavor_qp_recv_doorbell()
915  *    Context: Can be called from interrupt or base context.
916  */
917 static void
918 tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
919     uint32_t qpn, uint32_t credits)
920 {
921 	uint64_t	doorbell = 0;
922 
923 	/* Build the doorbell from the parameters */
924 	doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<
925 	    TAVOR_QPRCVDB_NDA_SHIFT) |
926 	    ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |
927 	    ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits;
928 
929 	TNF_PROBE_1_DEBUG(tavor_qp_recv_doorbell, TAVOR_TNF_TRACE, "",
930 	    tnf_ulong, doorbell, doorbell);
931 
932 	/* Write the doorbell to UAR */
933 	TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->recv,
934 	    doorbell);
935 }
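/*
 * Sketch, not part of the original driver: the send doorbell value
 * built above, with each field packed into a single 64-bit quantity
 * (the receive doorbell is analogous).  Writing the doorbell as one
 * 64-bit UAR store is what lets the hardware see the next-descriptor
 * address, opcode/NDS, and QP number atomically.  The masks and shifts
 * are the same tavor_hw.h definitions used in tavor_qp_send_doorbell().
 */
#ifdef TAVOR_WR_EXAMPLE
static uint64_t
tavor_example_send_db_value(uint32_t nda, uint32_t nds, uint32_t qpn,
    uint32_t fence, uint32_t nopcode)
{
	return ((((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
	    TAVOR_QPSNDDB_NDA_SHIFT) |
	    ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
	    ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
	    ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds);
}
#endif	/* TAVOR_WR_EXAMPLE */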
936 
937 
938 /*
939  * tavor_wqe_send_build()
940  *    Context: Can be called from interrupt or base context.
941  */
942 static int
943 tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
944     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
945 {
946 	tavor_hw_snd_wqe_ud_t		*ud;
947 	tavor_hw_snd_wqe_remaddr_t	*rc;
948 	tavor_hw_snd_wqe_atomic_t	*at;
949 	tavor_hw_snd_wqe_remaddr_t	*uc;
950 	tavor_hw_snd_wqe_bind_t		*bn;
951 	tavor_hw_wqe_sgl_t		*ds;
952 	ibt_wr_ds_t			*sgl;
953 	tavor_ahhdl_t			ah;
954 	uint32_t			nds;
955 	int				i, num_ds, status;
956 
957 	TAVOR_TNF_ENTER(tavor_wqe_send_build);
958 
959 	ASSERT(MUTEX_HELD(&qp->qp_lock));
960 
961 	/* Initialize the information for the Data Segments */
962 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
963 	    sizeof (tavor_hw_snd_wqe_nextctrl_t));
964 	nds = wr->wr_nds;
965 	sgl = wr->wr_sgl;
966 	num_ds = 0;
967 
968 	/*
969 	 * Building a Send WQE depends first and foremost on the transport
970 	 * type of the Work Request (i.e. UD, RC, or UC)
971 	 */
972 	switch (wr->wr_trans) {
973 	case IBT_UD_SRV:
974 		/* Ensure that work request transport type matches QP type */
975 		if (qp->qp_serv_type != TAVOR_QP_UD) {
976 			TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
977 			    TAVOR_TNF_ERROR, "");
978 			TAVOR_TNF_EXIT(tavor_wqe_send_build);
979 			return (IBT_QP_SRV_TYPE_INVALID);
980 		}
981 
982 		/*
983 		 * Validate the operation type.  For UD requests, only the
984 		 * "Send" operation is valid
985 		 */
986 		if (wr->wr_opcode != IBT_WRC_SEND) {
987 			TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
988 			    TAVOR_TNF_ERROR, "");
989 			TAVOR_TNF_EXIT(tavor_wqe_send_build);
990 			return (IBT_QP_OP_TYPE_INVALID);
991 		}
992 
993 		/*
994 		 * If this is a Special QP (QP0 or QP1), then we need to
995 		 * build MLX WQEs instead.  So jump to tavor_wqe_mlx_build()
996 		 * and return whatever status it returns
997 		 */
998 		if (qp->qp_is_special) {
999 			status = tavor_wqe_mlx_build(state, qp, wr, desc, size);
1000 			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1001 			return (status);
1002 		}
1003 
1004 		/*
1005 		 * Otherwise, if this is a normal UD Send request, then fill
1006 		 * all the fields in the Tavor UD header for the WQE.  Note:
1007 		 * to do this we'll need to extract some information from the
1008 		 * Address Handle passed with the work request.
1009 		 */
1010 		ud = (tavor_hw_snd_wqe_ud_t *)((uintptr_t)desc +
1011 		    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1012 		ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1013 		if (ah == NULL) {
1014 			TNF_PROBE_0(tavor_wqe_send_build_invahhdl_fail,
1015 			    TAVOR_TNF_ERROR, "");
1016 			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1017 			return (IBT_AH_HDL_INVALID);
1018 		}
1019 
1020 		/*
1021 		 * Build the Unreliable Datagram Segment for the WQE, using
1022 		 * the information from the address handle and the work
1023 		 * request.
1024 		 */
1025 		mutex_enter(&ah->ah_lock);
1026 		TAVOR_WQE_BUILD_UD(qp, ud, ah, wr);
1027 		mutex_exit(&ah->ah_lock);
1028 
1029 		/* Update "ds" for filling in Data Segments (below) */
1030 		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)ud +
1031 		    sizeof (tavor_hw_snd_wqe_ud_t));
1032 		break;
1033 
1034 	case IBT_RC_SRV:
1035 		/* Ensure that work request transport type matches QP type */
1036 		if (qp->qp_serv_type != TAVOR_QP_RC) {
1037 			TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
1038 			    TAVOR_TNF_ERROR, "");
1039 			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1040 			return (IBT_QP_SRV_TYPE_INVALID);
1041 		}
1042 
1043 		/*
1044 		 * Validate the operation type.  For RC requests, we allow
1045 		 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
1046 		 * operations, and memory window "Bind"
1047 		 */
1048 		if ((wr->wr_opcode != IBT_WRC_SEND) &&
1049 		    (wr->wr_opcode != IBT_WRC_RDMAR) &&
1050 		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
1051 		    (wr->wr_opcode != IBT_WRC_CSWAP) &&
1052 		    (wr->wr_opcode != IBT_WRC_FADD) &&
1053 		    (wr->wr_opcode != IBT_WRC_BIND)) {
1054 			TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
1055 			    TAVOR_TNF_ERROR, "");
1056 			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1057 			return (IBT_QP_OP_TYPE_INVALID);
1058 		}
1059 
1060 		/*
1061 		 * If this is a Send request, then all we need to do is break
1062 		 * out here and begin the Data Segment processing below
1063 		 */
1064 		if (wr->wr_opcode == IBT_WRC_SEND) {
1065 			break;
1066 		}
1067 
1068 		/*
1069 		 * If this is an RDMA Read or RDMA Write request, then fill
1070 		 * in the "Remote Address" header fields.
1071 		 */
1072 		if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
1073 		    (wr->wr_opcode == IBT_WRC_RDMAW)) {
1074 			rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1075 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1076 
1077 			/*
1078 			 * Build the Remote Address Segment for the WQE, using
1079 			 * the information from the RC work request.
1080 			 */
1081 			TAVOR_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
1082 
1083 			/* Update "ds" for filling in Data Segments (below) */
1084 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
1085 			    sizeof (tavor_hw_snd_wqe_remaddr_t));
1086 			break;
1087 		}
1088 
1089 		/*
1090 		 * If this is one of the Atomic type operations (i.e
1091 		 * Compare-Swap or Fetch-Add), then fill in both the "Remote
1092 		 * Address" header fields and the "Atomic" header fields.
1093 		 */
1094 		if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
1095 		    (wr->wr_opcode == IBT_WRC_FADD)) {
1096 			rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1097 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1098 			at = (tavor_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
1099 			    sizeof (tavor_hw_snd_wqe_remaddr_t));
1100 
1101 			/*
1102 			 * Build the Remote Address and Atomic Segments for
1103 			 * the WQE, using the information from the RC Atomic
1104 			 * work request.
1105 			 */
1106 			TAVOR_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
1107 			TAVOR_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
1108 
1109 			/* Update "ds" for filling in Data Segments (below) */
1110 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)at +
1111 			    sizeof (tavor_hw_snd_wqe_atomic_t));
1112 
1113 			/*
1114 			 * Update "nds" and "sgl" because Atomic requests have
1115 			 * only a single Data Segment (and they are encoded
1116 			 * somewhat differently in the work request).
1117 			 */
1118 			nds = 1;
1119 			sgl = wr->wr_sgl;
1120 			break;
1121 		}
1122 
1123 		/*
1124 		 * If this is a memory window Bind operation, then we call the
1125 		 * tavor_wr_bind_check() routine to validate the request and
1126 		 * to generate the updated RKey.  If this is successful, then
1127 		 * we fill in the WQE's "Bind" header fields.
1128 		 */
1129 		if (wr->wr_opcode == IBT_WRC_BIND) {
1130 			status = tavor_wr_bind_check(state, wr);
1131 			if (status != DDI_SUCCESS) {
1132 				TNF_PROBE_0(tavor_wqe_send_build_bind_fail,
1133 				    TAVOR_TNF_ERROR, "");
1134 				TAVOR_TNF_EXIT(tavor_wqe_send_build);
1135 				return (status);
1136 			}
1137 
1138 			bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1139 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1140 
1141 			/*
1142 			 * Build the Bind Memory Window Segments for the WQE,
1143 			 * using the information from the RC Bind memory
1144 			 * window work request.
1145 			 */
1146 			TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
1147 
1148 			/*
1149 			 * Update the "ds" pointer.  Even though the "bind"
1150 			 * operation requires no SGLs, this is necessary to
1151 			 * facilitate the correct descriptor size calculations
1152 			 * (below).
1153 			 */
1154 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1155 			    sizeof (tavor_hw_snd_wqe_bind_t));
1156 			nds = 0;
1157 		}
1158 		break;
1159 
1160 	case IBT_UC_SRV:
1161 		/* Ensure that work request transport type matches QP type */
1162 		if (qp->qp_serv_type != TAVOR_QP_UC) {
1163 			TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
1164 			    TAVOR_TNF_ERROR, "");
1165 			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1166 			return (IBT_QP_SRV_TYPE_INVALID);
1167 		}
1168 
1169 		/*
1170 		 * Validate the operation type.  For UC requests, we only
1171 		 * allow "Send", "RDMA Write", and memory window "Bind".
1172 		 * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
1173 		 * operations
1174 		 */
1175 		if ((wr->wr_opcode != IBT_WRC_SEND) &&
1176 		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
1177 		    (wr->wr_opcode != IBT_WRC_BIND)) {
1178 			TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
1179 			    TAVOR_TNF_ERROR, "");
1180 			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1181 			return (IBT_QP_OP_TYPE_INVALID);
1182 		}
1183 
1184 		/*
1185 		 * If this is a Send request, then all we need to do is break
1186 		 * out here and begin the Data Segment processing below
1187 		 */
1188 		if (wr->wr_opcode == IBT_WRC_SEND) {
1189 			break;
1190 		}
1191 
1192 		/*
1193 		 * If this is an RDMA Write request, then fill in the "Remote
1194 		 * Address" header fields.
1195 		 */
1196 		if (wr->wr_opcode == IBT_WRC_RDMAW) {
1197 			uc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1198 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1199 
1200 			/*
1201 			 * Build the Remote Address Segment for the WQE, using
1202 			 * the information from the UC work request.
1203 			 */
1204 			TAVOR_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);
1205 
1206 			/* Update "ds" for filling in Data Segments (below) */
1207 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)uc +
1208 			    sizeof (tavor_hw_snd_wqe_remaddr_t));
1209 			break;
1210 		}
1211 
1212 		/*
1213 		 * If this is a memory window Bind operation, then we call the
1214 		 * tavor_wr_bind_check() routine to validate the request and
1215 		 * to generate the updated RKey.  If this is successful, then
1216 		 * we fill in the WQE's "Bind" header fields.
1217 		 */
1218 		if (wr->wr_opcode == IBT_WRC_BIND) {
1219 			status = tavor_wr_bind_check(state, wr);
1220 			if (status != DDI_SUCCESS) {
1221 				TNF_PROBE_0(tavor_wqe_send_build_bind_fail,
1222 				    TAVOR_TNF_ERROR, "");
1223 				TAVOR_TNF_EXIT(tavor_wqe_send_build);
1224 				return (status);
1225 			}
1226 
1227 			bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1228 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1229 
1230 			/*
1231 			 * Build the Bind Memory Window Segments for the WQE,
1232 			 * using the information from the UC Bind memory
1233 			 * window work request.
1234 			 */
1235 			TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);
1236 
1237 			/*
1238 			 * Update the "ds" pointer.  Even though the "bind"
1239 			 * operation requires no SGLs, this is necessary to
1240 			 * facilitate the correct descriptor size calculations
1241 			 * (below).
1242 			 */
1243 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1244 			    sizeof (tavor_hw_snd_wqe_bind_t));
1245 			nds = 0;
1246 		}
1247 		break;
1248 
1249 	default:
1250 		TNF_PROBE_0(tavor_wqe_send_build_inv_tranport_fail,
1251 		    TAVOR_TNF_ERROR, "");
1252 		TAVOR_TNF_EXIT(tavor_wqe_send_build);
1253 		return (IBT_QP_SRV_TYPE_INVALID);
1254 	}
1255 
1256 	/*
1257 	 * Now fill in the Data Segments (SGL) for the Send WQE based on
1258 	 * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
1259 	 * Start by checking for a valid number of SGL entries
1260 	 */
1261 	if (nds > qp->qp_sq_sgl) {
1262 		TNF_PROBE_0(tavor_wqe_send_build_toomanysgl_fail,
1263 		    TAVOR_TNF_ERROR, "");
1264 		TAVOR_TNF_EXIT(tavor_wqe_send_build);
1265 		return (IBT_QP_SGL_LEN_INVALID);
1266 	}
1267 
1268 	/*
1269 	 * For each SGL in the Send Work Request, fill in the Send WQE's data
1270 	 * segments.  Note: We skip any SGL with zero size because Tavor
1271 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1272 	 * the encoding for zero means a 2GB transfer.  Because of this special
1273 	 * encoding in the hardware, we mask the requested length with
1274 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1275 	 * zero.)
1276 	 */
1277 	for (i = 0; i < nds; i++) {
1278 		if (sgl[i].ds_len == 0) {
1279 			continue;
1280 		}
1281 
1282 		/*
1283 		 * Fill in the Data Segment(s) for the current WQE, using the
1284 		 * information contained in the scatter-gather list of the
1285 		 * work request.
1286 		 */
1287 		TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1288 		num_ds++;
1289 	}
1290 
1291 	/* Return the size of descriptor (in 16-byte chunks) */
1292 	*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 4;
1293 
1294 	TAVOR_TNF_EXIT(tavor_wqe_send_build);
1295 	return (DDI_SUCCESS);
1296 }
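/*
 * Worked sketch, not part of the original driver: the "size" returned
 * by tavor_wqe_send_build() is expressed in 16-byte units.  For
 * example, a UD Send carrying two non-zero SGL entries occupies the
 * next/ctrl segment, the UD address segment, and two data segments:
 */
#ifdef TAVOR_WR_EXAMPLE
static uint_t
tavor_example_ud_send_wqe_size(void)
{
	size_t	bytes = sizeof (tavor_hw_snd_wqe_nextctrl_t) +
	    sizeof (tavor_hw_snd_wqe_ud_t) +
	    (2 * sizeof (tavor_hw_wqe_sgl_t));

	return ((uint_t)(bytes >> 4));	/* descriptor size in 16B chunks */
}
#endif	/* TAVOR_WR_EXAMPLE */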
1297 
1298 
1299 /*
1300  * tavor_wqe_send_linknext()
1301  *    Context: Can be called from interrupt or base context.
1302  */
1303 static void
1304 tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, ibt_send_wr_t *prev_wr,
1305     uint64_t *curr_desc, uint_t curr_descsz, uint64_t *prev_desc,
1306     tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp)
1307 {
1308 	uint64_t	next, ctrl;
1309 	uint32_t	nopcode, fence;
1310 
1311 	/*
1312 	 * Calculate the "next" field of the descriptor.  This amounts to
1313 	 * setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
1314 	 * fields (see tavor_hw.h for more).  Note:  If there is no next
1315 	 * descriptor (i.e. if the current descriptor is the last WQE on
1316 	 * the chain), then set "next" to zero.
1317 	 */
1318 	if (curr_desc != NULL) {
1319 		/*
1320 		 * Determine the value for the Tavor WQE "nopcode" field
1321 		 * by using the IBTF opcode from the work request
1322 		 */
1323 		switch (curr_wr->wr_opcode) {
1324 		case IBT_WRC_RDMAW:
1325 			if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1326 				nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAWI;
1327 			} else {
1328 				nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
1329 			}
1330 			break;
1331 
1332 		case IBT_WRC_SEND:
1333 			if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1334 				nopcode = TAVOR_WQE_SEND_NOPCODE_SENDI;
1335 			} else {
1336 				nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1337 			}
1338 			break;
1339 
1340 		case IBT_WRC_RDMAR:
1341 			nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
1342 			break;
1343 
1344 		case IBT_WRC_CSWAP:
1345 			nopcode = TAVOR_WQE_SEND_NOPCODE_ATMCS;
1346 			break;
1347 
1348 		case IBT_WRC_FADD:
1349 			nopcode = TAVOR_WQE_SEND_NOPCODE_ATMFA;
1350 			break;
1351 
1352 		case IBT_WRC_BIND:
1353 			nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
1354 			break;
1355 		}
1356 
1357 		curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc
1358 		    - qp->qp_desc_off);
1359 		next  = ((uint64_t)(uintptr_t)curr_desc &
1360 		    TAVOR_WQE_NDA_MASK) << 32;
1361 		next  = next | ((uint64_t)nopcode << 32);
1362 		fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
1363 		if (fence) {
1364 			next = next | TAVOR_WQE_SEND_FENCE_MASK;
1365 		}
1366 		next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1367 
1368 		/*
1369 		 * If a send queue doorbell will be rung for the next
1370 		 * WQE on the chain, then set the current WQE's "dbd" bit.
1371 		 * Note: We also update the "dbinfo" structure here to pass
1372 		 * back information about what should (later) be included
1373 		 * in the send queue doorbell.
1374 		 */
1375 		if (dbinfo) {
1376 			next = next | TAVOR_WQE_DBD_MASK;
1377 			dbinfo->db_nopcode = nopcode;
1378 			dbinfo->db_fence   = fence;
1379 		}
1380 	} else {
1381 		next = 0;
1382 	}
1383 
1384 	/*
1385 	 * If this WQE is supposed to be linked to the previous descriptor,
1386 	 * then we need to update not only the previous WQE's "next" fields
1387 	 * but we must also update its "ctrl" fields (i.e. the "c", "e",
1388 	 * "s", "i" and "immediate" fields - see tavor_hw.h for more).  Note:
1389 	 * the "e" bit is always hardcoded to zero.
1390 	 */
1391 	if (prev_desc != NULL) {
1392 		/*
1393 		 * If a send queue doorbell will be rung for the next WQE on
1394 		 * the chain, then update the current WQE's "next" field and
1395 		 * return.
1396 		 * Note: We don't want to modify the "ctrl" field here because
1397 		 * that portion of the previous WQE has already been set
1398 		 * correctly at some previous point in time.
1399 		 */
1400 		if (dbinfo) {
1401 			TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1402 			return;
1403 		}
1404 
1405 		ctrl = 0;
1406 
1407 		/* Set the "c" (i.e. "signaled") bit appropriately */
1408 		if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1409 			ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK;
1410 		}
1411 
1412 		/* Set the "s" (i.e. "solicited") bit appropriately */
1413 		if (prev_wr->wr_flags & IBT_WR_SEND_SOLICIT) {
1414 			ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK;
1415 		}
1416 
1417 		/* Set the "i" bit and the immediate data appropriately */
1418 		if (prev_wr->wr_flags & IBT_WR_SEND_IMMED) {
1419 			ctrl = ctrl | TAVOR_WQE_SEND_IMMEDIATE_MASK;
1420 			ctrl = ctrl | tavor_wr_get_immediate(prev_wr);
1421 		}
1422 
1423 		TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1424 	}
1425 }
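/*
 * Sketch, not part of the original driver, of the 64-bit "next" word
 * assembled in tavor_wqe_send_linknext() above: the (queue-relative)
 * address of the next WQE and its nopcode share the high 32 bits,
 * while the fence, "dbd", and NDS indications live in the low word.
 * The masks are the same tavor_hw.h definitions used above.
 */
#ifdef TAVOR_WR_EXAMPLE
static uint64_t
tavor_example_build_next(uint64_t nda, uint32_t nopcode, uint_t fence,
    uint_t dbd, uint_t nds)
{
	uint64_t	next;

	next = (nda & TAVOR_WQE_NDA_MASK) << 32;	/* next WQE address */
	next |= ((uint64_t)nopcode << 32);		/* next WQE opcode */
	if (fence) {
		next |= TAVOR_WQE_SEND_FENCE_MASK;	/* "f" bit */
	}
	if (dbd) {
		next |= TAVOR_WQE_DBD_MASK;		/* doorbelled bit */
	}
	return (next | (nds & TAVOR_WQE_NDS_MASK));	/* size in 16B units */
}
#endif	/* TAVOR_WR_EXAMPLE */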
1426 
1427 
1428 /*
1429  * tavor_wqe_mlx_build()
1430  *    Context: Can be called from interrupt or base context.
1431  */
1432 static int
1433 tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
1434     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1435 {
1436 	tavor_hw_udav_t		udav;
1437 	tavor_ahhdl_t		ah;
1438 	ib_lrh_hdr_t		*lrh;
1439 	ib_grh_t		*grh;
1440 	ib_bth_hdr_t		*bth;
1441 	ib_deth_hdr_t		*deth;
1442 	tavor_hw_wqe_sgl_t	*ds;
1443 	ibt_wr_ds_t		*sgl;
1444 	uint8_t			*mgmtclass, *hpoint, *hcount;
1445 	uint64_t		data;
1446 	uint32_t		nds, offset, pktlen;
1447 	uint32_t		desc_sz, udav_sz;
1448 	int			i, num_ds;
1449 
1450 	TAVOR_TNF_ENTER(tavor_wqe_mlx_build);
1451 
1452 	ASSERT(MUTEX_HELD(&qp->qp_lock));
1453 
1454 	/* Initialize the information for the Data Segments */
1455 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1456 	    sizeof (tavor_hw_mlx_wqe_nextctrl_t));
1457 
1458 	/*
1459 	 * Pull the address handle from the work request and read in
1460 	 * the contents of the UDAV.  This will be used to answer some
1461 	 * questions about the request.
1462 	 */
1463 	ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1464 	if (ah == NULL) {
1465 		TNF_PROBE_0(tavor_wqe_mlx_build_invahhdl_fail,
1466 		    TAVOR_TNF_ERROR, "");
1467 		TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1468 		return (IBT_AH_HDL_INVALID);
1469 	}
1470 	mutex_enter(&ah->ah_lock);
1471 	udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1472 	for (i = 0; i < udav_sz; i++) {
1473 		data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1474 		    ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1475 		((uint64_t *)&udav)[i] = data;
1476 	}
1477 	mutex_exit(&ah->ah_lock);
1478 
1479 	/*
1480 	 * If the request is for QP1 and the destination LID is equal to
1481 	 * the Permissive LID, then return an error.  This combination is
1482 	 * not allowed
1483 	 */
1484 	if ((udav.rlid == IB_LID_PERMISSIVE) &&
1485 	    (qp->qp_is_special == TAVOR_QP_GSI)) {
1486 		TNF_PROBE_0(tavor_wqe_mlx_build_permissiveLIDonQP1_fail,
1487 		    TAVOR_TNF_ERROR, "");
1488 		TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1489 		return (IBT_AH_HDL_INVALID);
1490 	}
1491 
1492 	/*
1493 	 * Calculate the size of the packet headers, including the GRH
1494 	 * (if necessary)
1495 	 */
1496 	desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
1497 	    sizeof (ib_deth_hdr_t);
1498 	if (udav.grh) {
1499 		desc_sz += sizeof (ib_grh_t);
1500 	}
1501 
1502 	/*
1503 	 * Begin to build the first "inline" data segment for the packet
1504 	 * headers.  Note:  By specifying "inline" we can build the contents
1505 	 * of the MAD packet headers directly into the work queue (as part
1506 	 * descriptor).  This has the advantage of both speeding things up
1507 	 * of the descriptor).  This has the advantage of both speeding things up
1508 	 * memory for the packet headers.
1509 	 */
1510 	TAVOR_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
1511 	desc_sz += 4;
1512 
1513 	/*
1514 	 * Build Local Route Header (LRH)
1515 	 *    We start here by building the LRH into a temporary location.
1516 	 *    When we have finished we copy the LRH data into the descriptor.
1517 	 *
1518 	 *    Notice that the VL values are hardcoded.  This is not a problem
1519 	 *    because VL15 is decided later based on the value in the MLX
1520 	 *    transport "next/ctrl" header (see the "vl15" bit below), and it
1521 	 *    is otherwise (meaning for QP1) chosen from the SL-to-VL table
1522 	 *    values.  This rule does not hold for loopback packets however
1523 	 *    (all of which bypass the SL-to-VL tables) and it is the reason
1524 	 *    that non-QP0 MADs are set up with VL hardcoded to zero below.
1525 	 *
1526 	 *    Notice also that Source LID is hardcoded to the Permissive LID
1527 	 *    (0xFFFF).  This is also not a problem because if the Destination
1528 	 *    LID is not the Permissive LID, then the "slr" value in the MLX
1529 	 *    transport "next/ctrl" header will be set to zero and the hardware
1530 	 *    will pull the LID from the value in the port.
1531 	 */
1532 	lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
1533 	pktlen = (desc_sz + 0x100) >> 2;
1534 	TAVOR_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
1535 
1536 	/*
1537 	 * Build Global Route Header (GRH)
1538 	 *    This is only built if necessary as defined by the "grh" bit in
1539 	 *    the address vector.  Note:  We also calculate the offset to the
1540 	 *    next header (BTH) based on whether or not the "grh" bit is set.
1541 	 */
1542 	if (udav.grh) {
1543 		/*
1544 		 * If the request is for QP0, then return an error.  The
1545 		 * combination of global routing (GRH) and QP0 is not allowed.
1546 		 */
1547 		if (qp->qp_is_special == TAVOR_QP_SMI) {
1548 			TNF_PROBE_0(tavor_wqe_mlx_build_GRHonQP0_fail,
1549 			    TAVOR_TNF_ERROR, "");
1550 			TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1551 			return (IBT_AH_HDL_INVALID);
1552 		}
1553 		grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1554 		TAVOR_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);
1555 
1556 		bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
1557 	} else {
1558 		bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1559 	}
1560 
1561 
1562 	/*
1563 	 * Build Base Transport Header (BTH)
1564 	 *    Notice that the M, PadCnt, and TVer fields are all set
1565 	 *    to zero implicitly.  This is true for all Management Datagrams
1566 	 *    (MADs), whether GSI or SMI.
1567 	 */
1568 	TAVOR_WQE_BUILD_MLX_BTH(state, bth, qp, wr);
1569 
1570 	/*
1571 	 * Build Datagram Extended Transport Header (DETH)
1572 	 */
1573 	deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
1574 	TAVOR_WQE_BUILD_MLX_DETH(deth, qp);
1575 
1576 	/* Ensure that the Data Segment is aligned on a 16-byte boundary */
1577 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
1578 	ds = (tavor_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
1579 	nds = wr->wr_nds;
1580 	sgl = wr->wr_sgl;
1581 	num_ds = 0;
1582 
1583 	/*
1584 	 * Now fill in the Data Segments (SGL) for the MLX WQE based on the
1585 	 * values set up above (i.e. "sgl", "nds", and the "ds" pointer).
1586 	 * Start by checking for a valid number of SGL entries
1587 	 */
1588 	if (nds > qp->qp_sq_sgl) {
1589 		TNF_PROBE_0(tavor_wqe_mlx_build_toomanysgl_fail,
1590 		    TAVOR_TNF_ERROR, "");
1591 		TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1592 		return (IBT_QP_SGL_LEN_INVALID);
1593 	}
1594 
1595 	/*
1596 	 * For each SGL in the Send Work Request, fill in the MLX WQE's data
1597 	 * segments.  Note: We skip any SGL with zero size because Tavor
1598 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1599 	 * the encoding for zero means a 2GB transfer.  Because of this special
1600 	 * encoding in the hardware, we mask the requested length with
1601 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1602 	 * zero.)
1603 	 */
1604 	mgmtclass = hpoint = hcount = NULL;
1605 	offset = 0;
1606 	for (i = 0; i < nds; i++) {
1607 		if (sgl[i].ds_len == 0) {
1608 			continue;
1609 		}
1610 
1611 		/*
1612 		 * Fill in the Data Segment(s) for the MLX send WQE, using
1613 		 * the information contained in the scatter-gather list of
1614 		 * the work request.
1615 		 */
1616 		TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1617 
1618 		/*
1619 		 * Search through the contents of all MADs posted to QP0 to
1620 		 * initialize pointers to the places where Directed Route "hop
1621 		 * pointer", "hop count", and "mgmtclass" would be.  Tavor
1622 		 * needs these updated (i.e. incremented or decremented, as
1623 		 * necessary) by software.
1624 		 */
1625 		if (qp->qp_is_special == TAVOR_QP_SMI) {
1626 
1627 			TAVOR_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
1628 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1629 
1630 			TAVOR_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
1631 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1632 
1633 			TAVOR_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
1634 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1635 
1636 			offset += sgl[i].ds_len;
1637 		}
1638 		num_ds++;
1639 	}
1640 
1641 	/*
1642 	 * Tavor's Directed Route MADs need to have the "hop pointer"
1643 	 * incremented/decremented (as necessary) depending on whether it is
1644 	 * currently less than or greater than the "hop count" (i.e. whether
1645 	 * the MAD is a request or a response.)
1646 	 */
1647 	if (qp->qp_is_special == TAVOR_QP_SMI) {
1648 		TAVOR_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
1649 		    *hpoint, *hcount);
1650 	}
1651 
1652 	/*
1653 	 * Now fill in the ICRC Data Segment.  This data segment is inlined
1654 	 * just like the packet headers above, but it is only four bytes and
1655 	 * set to zero (to indicate that we wish the hardware to generate the ICRC).
1656 	 */
1657 	TAVOR_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
1658 	num_ds++;
1659 
1660 	/* Return the size of the descriptor (in 16-byte chunks) */
1661 	*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1662 
1663 	TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1664 	return (DDI_SUCCESS);
1665 }
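
/*
 * For reference, a sketch (not authoritative) of the inline MLX send
 * WQE as laid out above, where the GRH is present only when the "grh"
 * bit is set in the UDAV:
 *
 *	[ next/ctrl ][ LRH (GRH) BTH DETH ][ data seg(s) ][ inline ICRC ]
 *
 * The first data segment is aligned on a 16-byte boundary, and the
 * final four-byte inline segment is zeroed so that the hardware will
 * generate the ICRC.
 */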
1666 
1667 
1668 /*
1669  * tavor_wqe_mlx_linknext()
1670  *    Context: Can be called from interrupt or base context.
1671  */
1672 static void
1673 tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
1674     uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
1675     tavor_qphdl_t qp)
1676 {
1677 	tavor_hw_udav_t		udav;
1678 	tavor_ahhdl_t		ah;
1679 	uint64_t		next, ctrl, data;
1680 	uint_t			nopcode;
1681 	uint_t			udav_sz;
1682 	int			i;
1683 
1684 	/*
1685 	 * Calculate the "next" field of the descriptor.  This amounts to
1686 	 * setting up the "next_wqe_addr", "nopcode", and "nds" fields (see
1687 	 * tavor_hw.h for more).  Note:  If there is no next descriptor (i.e.
1688 	 * if the current descriptor is the last WQE on the chain), then set
1689 	 * "next" to zero.
1690 	 */
1691 	if (curr_desc != NULL) {
1692 		/*
1693 		 * The only valid Tavor WQE "nopcode" for MLX transport
1694 		 * requests is the "Send" code.
1695 		 */
1696 		nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1697 		curr_desc = (uint64_t *)(uintptr_t)((uint64_t)
1698 		    (uintptr_t)curr_desc - qp->qp_desc_off);
1699 		next = (uint64_t)((uintptr_t)curr_desc &
1700 		    TAVOR_WQE_NDA_MASK) << 32;
1701 		next = next | ((uint64_t)nopcode << 32);
1702 		next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1703 
1704 		/*
1705 		 * If a send queue doorbell will be rung for the next
1706 		 * WQE on the chain, then set the current WQE's "dbd" bit.
1707 		 * Note: We also update the "dbinfo" structure here to pass
1708 		 * back information about what should (later) be included
1709 		 * in the send queue doorbell.
1710 		 */
1711 		if (dbinfo) {
1712 			next = next | TAVOR_WQE_DBD_MASK;
1713 			dbinfo->db_nopcode = nopcode;
1714 			dbinfo->db_fence   = 0;
1715 		}
1716 	} else {
1717 		next = 0;
1718 	}
1719 
1720 	/*
1721 	 * If this WQE is supposed to be linked to the previous descriptor,
1722 	 * then we need to update not only the previous WQE's "next" fields
1723 	 * but we must also update this WQE's "ctrl" fields (i.e. the "vl15",
1724 	 * "slr", "max_srate", "sl", "c", "e", "rlid", and "vcrc" fields -
1725 	 * see tavor_hw.h for more) Note: the "e" bit and "vcrc" fields are
1726 	 * see tavor_hw.h for more).  Note: the "e" bit and "vcrc" fields are
1727 	 */
1728 	if (prev_desc != NULL) {
1729 		/*
1730 		 * If a send queue doorbell will be rung for the next WQE on
1731 		 * the chain, then update the current WQE's "next" field and
1732 		 * return.
1733 		 * Note: We don't want to modify the "ctrl" field here because
1734 		 * that portion of the previous WQE has already been set
1735 		 * correctly at some previous point in time.
1736 		 */
1737 		if (dbinfo) {
1738 			TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1739 			return;
1740 		}
1741 
1742 		/*
1743 		 * Pull the address handle from the work request and read in
1744 		 * the contents of the UDAV.  This will be used to answer some
1745 		 * questions about the request.
1746 		 */
1747 		ah = (tavor_ahhdl_t)prev_wr->wr.ud.udwr_dest->ud_ah;
1748 		mutex_enter(&ah->ah_lock);
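		/* UDAV size in 64-bit words; the loop below copies it out under the AH lock */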
1749 		udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1750 		for (i = 0; i < udav_sz; i++) {
1751 			data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1752 			    ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1753 			((uint64_t *)&udav)[i] = data;
1754 		}
1755 		mutex_exit(&ah->ah_lock);
1756 
1757 		ctrl = 0;
1758 
1759 		/* Only QP0 uses VL15, otherwise use VL in the packet */
1760 		if (qp->qp_is_special == TAVOR_QP_SMI) {
1761 			ctrl = ctrl | TAVOR_WQE_MLXHDR_VL15_MASK;
1762 		}
1763 
1764 		/*
1765 		 * The SLR (Source LID Replace) bit determines whether the
1766 		 * source LID for an outgoing MLX packet should come from the
1767 		 * PortInfo (SLR = 0) or should be left as it is in the
1768 		 * descriptor (SLR = 1).  The latter is necessary for packets
1769 		 * to be sent with the Permissive LID.
1770 		 */
1771 		if (udav.rlid == IB_LID_PERMISSIVE) {
1772 			ctrl = ctrl | TAVOR_WQE_MLXHDR_SLR_MASK;
1773 		}
1774 
1775 		/* Fill in the max static rate from the address handle */
1776 		ctrl = ctrl | ((uint64_t)udav.max_stat_rate <<
1777 		    TAVOR_WQE_MLXHDR_SRATE_SHIFT);
1778 
1779 		/* All VL15 (i.e. SMI) traffic is required to use SL 0 */
1780 		if (qp->qp_is_special != TAVOR_QP_SMI) {
1781 			ctrl = ctrl | ((uint64_t)udav.sl <<
1782 			    TAVOR_WQE_MLXHDR_SL_SHIFT);
1783 		}
1784 
1785 		/* Set the "c" (i.e. "signaled") bit appropriately */
1786 		if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1787 			ctrl = ctrl | TAVOR_WQE_MLXHDR_SIGNALED_MASK;
1788 		}
1789 
1790 		/* Fill in the destination LID from the address handle */
1791 		ctrl = ctrl | ((uint64_t)udav.rlid <<
1792 		    TAVOR_WQE_MLXHDR_RLID_SHIFT);
1793 
1794 		TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1795 	}
1796 }
1797 
1798 
1799 /*
1800  * tavor_wqe_recv_build()
1801  *    Context: Can be called from interrupt or base context.
1802  */
1803 /* ARGSUSED */
1804 static int
1805 tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
1806     ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size)
1807 {
1808 	tavor_hw_wqe_sgl_t	*ds;
1809 	int			i, num_ds;
1810 
1811 	TAVOR_TNF_ENTER(tavor_wqe_recv_build);
1812 
1813 	ASSERT(MUTEX_HELD(&qp->qp_lock));
1814 
1815 	/* Check that work request transport type is valid */
1816 	if ((qp->qp_serv_type != TAVOR_QP_UD) &&
1817 	    (qp->qp_serv_type != TAVOR_QP_RC) &&
1818 	    (qp->qp_serv_type != TAVOR_QP_UC)) {
1819 		TNF_PROBE_0(tavor_build_recv_wqe_inv_servtype_fail,
1820 		    TAVOR_TNF_ERROR, "");
1821 		TAVOR_TNF_EXIT(tavor_wqe_recv_build);
1822 		return (IBT_QP_SRV_TYPE_INVALID);
1823 	}
1824 
1825 	/* Fill in the Data Segments (SGL) for the Recv WQE */
1826 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1827 	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1828 	num_ds = 0;
1829 
1830 	/* Check for valid number of SGL entries */
1831 	if (wr->wr_nds > qp->qp_rq_sgl) {
1832 		TNF_PROBE_0(tavor_wqe_recv_build_toomanysgl_fail,
1833 		    TAVOR_TNF_ERROR, "");
1834 		TAVOR_TNF_EXIT(tavor_wqe_recv_build);
1835 		return (IBT_QP_SGL_LEN_INVALID);
1836 	}
1837 
1838 	/*
1839 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1840 	 * segments.  Note: We skip any SGL with zero size because Tavor
1841 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1842 	 * the encoding for zero means a 2GB transfer.  Because of this special
1843 	 * encoding in the hardware, we mask the requested length with
1844 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1845 	 * zero.)
1846 	 */
1847 	for (i = 0; i < wr->wr_nds; i++) {
1848 		if (wr->wr_sgl[i].ds_len == 0) {
1849 			continue;
1850 		}
1851 
1852 		/*
1853 		 * Fill in the Data Segment(s) for the receive WQE, using the
1854 		 * information contained in the scatter-gather list of the
1855 		 * work request.
1856 		 */
1857 		TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &wr->wr_sgl[i]);
1858 		num_ds++;
1859 	}
1860 
1861 	/* Return the size of the descriptor (in 16-byte chunks) */
1862 	*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1863 
1864 	TAVOR_TNF_EXIT(tavor_wqe_recv_build);
1865 	return (DDI_SUCCESS);
1866 }
1867 
1868 
1869 /*
1870  * tavor_wqe_recv_linknext()
1871  *    Context: Can be called from interrupt or base context.
1872  */
1873 static void
1874 tavor_wqe_recv_linknext(uint64_t *curr_desc, uint_t curr_descsz,
1875     uint64_t *prev_desc, tavor_qphdl_t qp)
1876 {
1877 	uint64_t	next;
1878 
1879 	/*
1880 	 * Calculate the "next" field of the descriptor.  This amounts to
1881 	 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1882 	 * tavor_hw.h for more).  Note:  If there is no next descriptor (i.e.
1883 	 * if the current descriptor is the last WQE on the chain), then set
1884 	 * the "next" field to TAVOR_WQE_DBD_MASK.  This is because the Tavor
1885 	 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
1886 	 * In either case, we must add a single bit in the "reserved" field
1887 	 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
1888 	 * workaround for a known Tavor errata that can cause Recv WQEs with
1889 	 * zero in the NDA field to behave improperly.
1890 	 */
1891 	if (curr_desc != NULL) {
1892 		curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
1893 		    qp->qp_desc_off);
1894 		next = (uint64_t)((uintptr_t)curr_desc &
1895 		    TAVOR_WQE_NDA_MASK) << 32;
1896 		next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) |
1897 		    TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1898 	} else {
1899 		next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1900 	}
1901 
1902 	/*
1903 	 * If this WQE is supposed to be linked to the previous descriptor,
1904 	 * then we need to update not only the previous WQE's "next" fields
1905 	 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
1906 	 * "e" bits - see tavor_hw.h for more).  Note: both the "c" and "e"
1907 	 * bits are always hardcoded to zero.
1908 	 */
1909 	if (prev_desc != NULL) {
1910 		TAVOR_WQE_LINKNEXT(qp, prev_desc, 0, next);
1911 	}
1912 }
1913 
1914 
1915 /*
1916  * tavor_wqe_srq_build()
1917  *    Context: Can be called from interrupt or base context.
1918  */
1919 /* ARGSUSED */
1920 static int
1921 tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
1922     ibt_recv_wr_t *wr, uint64_t *desc)
1923 {
1924 	tavor_hw_wqe_sgl_t	*ds;
1925 	ibt_wr_ds_t		end_sgl;
1926 	int			i, num_ds;
1927 
1928 	TAVOR_TNF_ENTER(tavor_wqe_srq_build);
1929 
1930 	ASSERT(MUTEX_HELD(&srq->srq_lock));
1931 
1932 	/* Fill in the Data Segments (SGL) for the Recv WQE */
1933 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1934 	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1935 	num_ds = 0;
1936 
1937 	/* Check for valid number of SGL entries */
1938 	if (wr->wr_nds > srq->srq_wq_sgl) {
1939 		TNF_PROBE_0(tavor_wqe_srq_build_toomanysgl_fail,
1940 		    TAVOR_TNF_ERROR, "");
1941 		TAVOR_TNF_EXIT(tavor_wqe_srq_build);
1942 		return (IBT_QP_SGL_LEN_INVALID);
1943 	}
1944 
1945 	/*
1946 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1947 	 * segments.  Note: We skip any SGL with zero size because Tavor
1948 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1949 	 * the encoding for zero means a 2GB transfer.  Because of this special
1950 	 * encoding in the hardware, we mask the requested length with
1951 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1952 	 * zero.)
1953 	 */
1954 	for (i = 0; i < wr->wr_nds; i++) {
1955 		if (wr->wr_sgl[i].ds_len == 0) {
1956 			continue;
1957 		}
1958 
1959 		/*
1960 		 * Fill in the Data Segment(s) for the receive WQE, using the
1961 		 * information contained in the scatter-gather list of the
1962 		 * work request.
1963 		 */
1964 		TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &wr->wr_sgl[i]);
1965 		num_ds++;
1966 	}
1967 
1968 	/*
1969 	 * For SRQ, if the number of data segments is less than the maximum
1970 	 * specified at alloc, then we have to fill in a special "key" entry in
1971 	 * the sgl entry after the last valid one in this post request.  We do
1972 	 * that here.
1973 	 */
1974 	if (num_ds < srq->srq_wq_sgl) {
1975 		end_sgl.ds_va  = 0;
1976 		end_sgl.ds_len = 0;
1977 		end_sgl.ds_key = 0x1;
1978 		TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &end_sgl);
1979 	}
1980 
1981 	TAVOR_TNF_EXIT(tavor_wqe_srq_build);
1982 	return (DDI_SUCCESS);
1983 }
1984 
1985 
1986 /*
1987  * tavor_wqe_srq_linknext()
1988  *    Context: Can be called from interrupt or base context.
1989  */
1990 static void
1991 tavor_wqe_srq_linknext(uint64_t *curr_desc, uint64_t *prev_desc,
1992     tavor_srqhdl_t srq)
1993 {
1994 	uint64_t	next;
1995 
1996 	/*
1997 	 * Calculate the "next" field of the descriptor.  This amounts to
1998 	 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1999 	 * tavor_hw.h for more).  Note:  If there is no next descriptor (i.e.
2000 	 * if the current descriptor is the last WQE on the chain), then set
2001 	 * the "next" field to TAVOR_WQE_DBD_MASK.  This is because the Tavor
2002 	 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
2003 	 * In either case, we must add a single bit in the "reserved" field
2004 	 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
2005 	 * workaround for a known Tavor errata that can cause Recv WQEs with
2006 	 * zero in the NDA field to behave improperly.
2007 	 */
2008 	if (curr_desc != NULL) {
2009 		curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
2010 		    srq->srq_desc_off);
2011 		next = (uint64_t)((uintptr_t)curr_desc &
2012 		    TAVOR_WQE_NDA_MASK) << 32;
2013 		next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
2014 	} else {
2015 		next = TAVOR_RCV_WQE_NDA0_WA_MASK;
2016 	}
2017 
2018 	/*
2019 	 * If this WQE is supposed to be linked to the previous descriptor,
2020 	 * then we need to update not only the previous WQE's "next" fields
2021 	 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
2022 	 * "e" bits - see tavor_hw.h for more).  Note: both the "c" and "e"
2023 	 * bits are always hardcoded to zero.
2024 	 */
2025 	if (prev_desc != NULL) {
2026 		TAVOR_WQE_LINKNEXT_SRQ(srq, prev_desc, 0, next);
2027 	}
2028 }
2029 
2030 
2031 /*
2032  * tavor_wr_get_immediate()
2033  *    Context: Can be called from interrupt or base context.
2034  */
2035 static uint32_t
2036 tavor_wr_get_immediate(ibt_send_wr_t *wr)
2037 {
2038 	/*
2039 	 * This routine extracts the "immediate data" from the appropriate
2040 	 * location in the IBTF work request.  Because of the way the
2041 	 * work request structure is defined, the location for this data
2042 	 * depends on the actual work request operation type.
2043 	 */
2044 
2045 	/* For RDMA Write, test if RC or UC */
2046 	if (wr->wr_opcode == IBT_WRC_RDMAW) {
2047 		if (wr->wr_trans == IBT_RC_SRV) {
2048 			return (wr->wr.rc.rcwr.rdma.rdma_immed);
2049 		} else {  /* IBT_UC_SRV */
2050 			return (wr->wr.uc.ucwr.rdma.rdma_immed);
2051 		}
2052 	}
2053 
2054 	/* For Send, test if RC, UD, or UC */
2055 	if (wr->wr_opcode == IBT_WRC_SEND) {
2056 		if (wr->wr_trans == IBT_RC_SRV) {
2057 			return (wr->wr.rc.rcwr.send_immed);
2058 		} else if (wr->wr_trans == IBT_UD_SRV) {
2059 			return (wr->wr.ud.udwr_immed);
2060 		} else {  /* IBT_UC_SRV */
2061 			return (wr->wr.uc.ucwr.send_immed);
2062 		}
2063 	}
2064 
2065 	/*
2066 	 * If any other type of request, then immediate is undefined
2067 	 */
2068 	return (0);
2069 }
2070 
2071 
2072 /*
2073  * tavor_wqe_sync()
2074  *    Context: Can be called from interrupt or base context.
2075  */
2076 static void
2077 tavor_wqe_sync(void *hdl, uint_t sync_from, uint_t sync_to,
2078     uint_t sync_type, uint_t flag)
2079 {
2080 	tavor_qphdl_t		qp;
2081 	tavor_srqhdl_t		srq;
2082 	uint_t			is_sync_req;
2083 	uint64_t		*wqe_from, *wqe_to, *wqe_base, *wqe_top;
2084 	ddi_dma_handle_t	dmahdl;
2085 	off_t			offset;
2086 	size_t			length;
2087 	uint32_t		qsize;
2088 	int			status;
2089 
2090 	TAVOR_TNF_ENTER(tavor_wqe_sync);
2091 
2092 	if (sync_type == TAVOR_WR_SRQ) {
2093 		srq = (tavor_srqhdl_t)hdl;
2094 		is_sync_req = srq->srq_sync;
2095 		/* Get the DMA handle from SRQ context */
2096 		dmahdl = srq->srq_mrhdl->mr_bindinfo.bi_dmahdl;
2097 	} else {
2098 		qp = (tavor_qphdl_t)hdl;
2099 		is_sync_req = qp->qp_sync;
2100 		/* Get the DMA handle from QP context */
2101 		dmahdl = qp->qp_mrhdl->mr_bindinfo.bi_dmahdl;
2102 	}
2103 
2104 	/* Determine if the work queues need to be synced or not */
2105 	if (is_sync_req == 0) {
2106 		TAVOR_TNF_EXIT(tavor_wqe_sync);
2107 		return;
2108 	}
2109 
2110 	/*
2111 	 * Depending on the type of the work queue, we grab information
2112 	 * about the address ranges we need to DMA sync.
2113 	 */
2114 	if (sync_type == TAVOR_WR_SEND) {
2115 		wqe_from = TAVOR_QP_SQ_ENTRY(qp, sync_from);
2116 		wqe_to   = TAVOR_QP_SQ_ENTRY(qp, sync_to);
2117 		qsize	 = qp->qp_sq_bufsz;
2118 
2119 		wqe_base = TAVOR_QP_SQ_ENTRY(qp, 0);
2120 		wqe_top	 = TAVOR_QP_SQ_ENTRY(qp, qsize);
2121 	} else if (sync_type == TAVOR_WR_RECV) {
2122 		wqe_from = TAVOR_QP_RQ_ENTRY(qp, sync_from);
2123 		wqe_to   = TAVOR_QP_RQ_ENTRY(qp, sync_to);
2124 		qsize	 = qp->qp_rq_bufsz;
2125 
2126 		wqe_base = TAVOR_QP_RQ_ENTRY(qp, 0);
2127 		wqe_top	 = TAVOR_QP_RQ_ENTRY(qp, qsize);
2128 	} else {
2129 		wqe_from = TAVOR_SRQ_WQ_ENTRY(srq, sync_from);
2130 		wqe_to   = TAVOR_SRQ_WQ_ENTRY(srq, sync_to);
2131 		qsize	 = srq->srq_wq_bufsz;
2132 
2133 		wqe_base = TAVOR_SRQ_WQ_ENTRY(srq, 0);
2134 		wqe_top	 = TAVOR_SRQ_WQ_ENTRY(srq, qsize);
2135 	}
2136 
2137 	/*
2138 	 * There are two possible cases for the beginning and end of the WQE
2139 	 * chain we are trying to sync.  Either this is the simple case, where
2140 	 * the end of the chain is below the beginning of the chain, or it is
2141 	 * the "wrap-around" case, where the end of the chain has wrapped over
2142 	 * the end of the queue.  In the former case, we simply need to
2143 	 * calculate the span from beginning to end and sync it.  In the latter
2144 	 * case, however, we need to calculate the span from the top of the
2145 	 * work queue to the end of the chain and sync that, and then we need
2146 	 * to find the other portion (from beginning of chain to end of queue)
2147 	 * and sync that as well.  Note: if the "top to end" span is actually
2148 	 * zero length, then we don't do a DMA sync because a zero length DMA
2149 	 * sync unnecessarily syncs the entire work queue.
2150 	 */
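	/*
	 * Worked example (hypothetical numbers): for a queue of eight
	 * 64-byte WQEs, a chain covering entries 6 and 7 and then entries
	 * 0 and 1 gives "sync_from" == 6 and "sync_to" == 2.  Since wqe_to
	 * falls below wqe_from, the wrap-around case below syncs offset 0
	 * for 128 bytes ("top to end") and then offset 384 for 128 bytes
	 * ("beginning to bottom").
	 */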
2151 	if (wqe_to > wqe_from) {
2152 		/* "From Beginning to End" */
2153 		offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2154 		length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_from);
2155 
2156 		status = ddi_dma_sync(dmahdl, offset, length, flag);
2157 		if (status != DDI_SUCCESS) {
2158 			TNF_PROBE_0(tavor_wqe_sync_fail, TAVOR_TNF_ERROR, "");
2159 			TAVOR_TNF_EXIT(tavor_wqe_sync);
2160 			return;
2161 		}
2162 	} else {
2163 		/* "From Top to End" */
2164 		offset = (off_t)0;
2165 		length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_base);
2166 		if (length) {
2167 			status = ddi_dma_sync(dmahdl, offset, length, flag);
2168 			if (status != DDI_SUCCESS) {
2169 				TNF_PROBE_0(tavor_wqe_sync_fail,
2170 				    TAVOR_TNF_ERROR, "");
2171 				TAVOR_TNF_EXIT(tavor_wqe_sync);
2172 				return;
2173 			}
2174 		}
2175 
2176 		/* "From Beginning to Bottom" */
2177 		offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2178 		length = (size_t)((uintptr_t)wqe_top - (uintptr_t)wqe_from);
2179 		status = ddi_dma_sync(dmahdl, offset, length, flag);
2180 		if (status != DDI_SUCCESS) {
2181 			TNF_PROBE_0(tavor_wqe_sync_fail, TAVOR_TNF_ERROR, "");
2182 			TAVOR_TNF_EXIT(tavor_wqe_sync);
2183 			return;
2184 		}
2185 	}
2186 
2187 	TAVOR_TNF_EXIT(tavor_wqe_sync);
2188 }
2189 
2190 
2191 /*
2192  * tavor_wr_bind_check()
2193  *    Context: Can be called from interrupt or base context.
2194  */
2195 static int
2196 tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr)
2197 {
2198 	ibt_bind_flags_t	bind_flags;
2199 	uint64_t		vaddr, len;
2200 	uint64_t		reg_start_addr, reg_end_addr;
2201 	tavor_mwhdl_t		mw;
2202 	tavor_mrhdl_t		mr;
2203 	tavor_rsrc_t		*mpt;
2204 	uint32_t		new_rkey;
2205 
2206 	TAVOR_TNF_ENTER(tavor_wr_bind_check);
2207 
2208 	/* Check for a valid Memory Window handle in the WR */
2209 	mw = (tavor_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
2210 	if (mw == NULL) {
2211 		TNF_PROBE_0(tavor_wr_bind_check_invmwhdl_fail,
2212 		    TAVOR_TNF_ERROR, "");
2213 		TAVOR_TNF_EXIT(tavor_wr_bind_check);
2214 		return (IBT_MW_HDL_INVALID);
2215 	}
2216 
2217 	/* Check for a valid Memory Region handle in the WR */
2218 	mr = (tavor_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
2219 	if (mr == NULL) {
2220 		TNF_PROBE_0(tavor_wr_bind_check_invmrhdl_fail,
2221 		    TAVOR_TNF_ERROR, "");
2222 		TAVOR_TNF_EXIT(tavor_wr_bind_check);
2223 		return (IBT_MR_HDL_INVALID);
2224 	}
2225 
2226 	mutex_enter(&mr->mr_lock);
2227 	mutex_enter(&mw->mr_lock);
2228 
2229 	/*
2230 	 * Check here to see if the memory region has already been partially
2231 	 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
2232 	 * If so, this is an error, return failure.
2233 	 */
2234 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
2235 		mutex_exit(&mr->mr_lock);
2236 		mutex_exit(&mw->mr_lock);
2237 		TNF_PROBE_0(tavor_wr_bind_check_invmrhdl2_fail,
2238 		    TAVOR_TNF_ERROR, "");
2239 		TAVOR_TNF_EXIT(tavor_wr_bind_check);
2240 		return (IBT_MR_HDL_INVALID);
2241 	}
2242 
2243 	/* Check for a valid Memory Window RKey (i.e. a matching RKey) */
2244 	if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
2245 		mutex_exit(&mr->mr_lock);
2246 		mutex_exit(&mw->mr_lock);
2247 		TNF_PROBE_0(tavor_wr_bind_check_invrkey_fail,
2248 		    TAVOR_TNF_ERROR, "");
2249 		TAVOR_TNF_EXIT(tavor_wr_bind_check);
2250 		return (IBT_MR_RKEY_INVALID);
2251 	}
2252 
2253 	/* Check for a valid Memory Region LKey (i.e. a matching LKey) */
2254 	if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
2255 		mutex_exit(&mr->mr_lock);
2256 		mutex_exit(&mw->mr_lock);
2257 		TNF_PROBE_0(tavor_wr_bind_check_invlkey_fail,
2258 		    TAVOR_TNF_ERROR, "");
2259 		TAVOR_TNF_EXIT(tavor_wr_bind_check);
2260 		return (IBT_MR_LKEY_INVALID);
2261 	}
2262 
2263 	/*
2264 	 * Now check for valid "vaddr" and "len".  Note:  We don't check the
2265 	 * "vaddr" range when "len == 0" (i.e. on unbind operations).
2266 	 */
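	/*
	 * Example (hypothetical values): binding len 0x1000 at vaddr 0x2000
	 * against a region registered at 0x2000 with length 0x1000 passes
	 * both checks below, since the last bound byte (vaddr + len - 1 ==
	 * 0x2FFF) equals the last registered byte (reg_end_addr).
	 */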
2267 	len = wr->wr.rc.rcwr.bind->bind_len;
2268 	if (len != 0) {
2269 		vaddr = wr->wr.rc.rcwr.bind->bind_va;
2270 		reg_start_addr = mr->mr_bindinfo.bi_addr;
2271 		reg_end_addr   = mr->mr_bindinfo.bi_addr +
2272 		    (mr->mr_bindinfo.bi_len - 1);
2273 		if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
2274 			mutex_exit(&mr->mr_lock);
2275 			mutex_exit(&mw->mr_lock);
2276 			TNF_PROBE_0(tavor_wr_bind_check_inv_vaddr_fail,
2277 			    TAVOR_TNF_ERROR, "");
2278 			TAVOR_TNF_EXIT(tavor_wr_bind_check);
2279 			return (IBT_MR_VA_INVALID);
2280 		}
2281 		vaddr = (vaddr + len) - 1;
2282 		if (vaddr > reg_end_addr) {
2283 			mutex_exit(&mr->mr_lock);
2284 			mutex_exit(&mw->mr_lock);
2285 			TNF_PROBE_0(tavor_wr_bind_check_invlen_fail,
2286 			    TAVOR_TNF_ERROR, "");
2287 			TAVOR_TNF_EXIT(tavor_wr_bind_check);
2288 			return (IBT_MR_LEN_INVALID);
2289 		}
2290 	}
2291 
2292 	/*
2293 	 * Validate the bind access flags.  Remote Write and Atomic access for
2294 	 * the Memory Window require that Local Write access be set in the
2295 	 * corresponding Memory Region.
2296 	 */
2297 	bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
2298 	if (((bind_flags & IBT_WR_BIND_WRITE) ||
2299 	    (bind_flags & IBT_WR_BIND_ATOMIC)) &&
2300 	    !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
2301 		mutex_exit(&mr->mr_lock);
2302 		mutex_exit(&mw->mr_lock);
2303 		TNF_PROBE_0(tavor_wr_bind_check_invflags_fail,
2304 		    TAVOR_TNF_ERROR, "");
2305 		TAVOR_TNF_EXIT(tavor_wr_bind_check);
2306 		return (IBT_MR_ACCESS_REQ_INVALID);
2307 	}
2308 
2309 	/* Calculate the new RKey for the Memory Window */
2310 	mpt = mw->mr_mptrsrcp;
2311 	tavor_mr_keycalc(state, mpt->tr_indx, &new_rkey);
2312 
2313 	wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
2314 	mw->mr_rkey = new_rkey;
2315 
2316 	mutex_exit(&mr->mr_lock);
2317 	mutex_exit(&mw->mr_lock);
2318 	TAVOR_TNF_EXIT(tavor_wr_bind_check);
2319 	return (DDI_SUCCESS);
2320 }
2321 
2322 
2323 /*
2324  * tavor_wrid_from_reset_handling()
2325  *    Context: Can be called from interrupt or base context.
2326  */
2327 int
2328 tavor_wrid_from_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2329 {
2330 	tavor_workq_hdr_t	*swq, *rwq;
2331 	tavor_wrid_list_hdr_t	*s_wridlist, *r_wridlist;
2332 	uint_t			create_new_swq = 0, create_new_rwq = 0;
2333 	uint_t			create_wql = 0;
2334 	uint_t			qp_srq_en;
2335 
2336 	TAVOR_TNF_ENTER(tavor_wrid_from_reset_handling);
2337 
2338 	/*
2339 	 * For each of this QP's Work Queues, make sure we have a (properly
2340 	 * initialized) Work Request ID list attached to the relevant
2341 	 * completion queue.  Grab the CQ lock(s) before manipulating the
2342 	 * lists.
2343 	 */
2344 	tavor_wrid_wqhdr_lock_both(qp);
2345 	swq = tavor_wrid_wqhdr_find(qp->qp_sq_cqhdl, qp->qp_qpnum,
2346 	    TAVOR_WR_SEND);
2347 	if (swq == NULL) {
2348 		/* Couldn't find matching work queue header, create it */
2349 		create_new_swq = create_wql = 1;
2350 		swq = tavor_wrid_wqhdr_create(state, qp->qp_sq_cqhdl,
2351 		    qp->qp_qpnum, TAVOR_WR_SEND, create_wql);
2352 		if (swq == NULL) {
2353 			/*
2354 			 * If we couldn't find/allocate space for the workq
2355 			 * header, then drop the lock(s) and return failure.
2356 			 */
2357 			tavor_wrid_wqhdr_unlock_both(qp);
2358 			TNF_PROBE_0(tavor_wrid_from_reset_handling_wqhdr_fail,
2359 			    TAVOR_TNF_ERROR, "");
2360 			TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2361 			return (ibc_get_ci_failure(0));
2362 		}
2363 	}
2364 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swq))
2365 	qp->qp_sq_wqhdr = swq;
2366 	swq->wq_size = qp->qp_sq_bufsz;
2367 	swq->wq_head = 0;
2368 	swq->wq_tail = 0;
2369 	swq->wq_full = 0;
2370 
2371 	/*
2372 	 * Allocate space for the tavor_wrid_entry_t container
2373 	 */
2374 	s_wridlist = tavor_wrid_get_list(swq->wq_size);
2375 	if (s_wridlist == NULL) {
2376 		/*
2377 		 * If we couldn't allocate space for tracking the WRID
2378 		 * entries, then clean up the workq header from above (if
2379 		 * necessary, i.e. if we created the workq header).  Then
2380 		 * drop the lock(s) and return failure.
2381 		 */
2382 		if (create_new_swq) {
2383 			tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2384 		}
2385 
2386 		tavor_wrid_wqhdr_unlock_both(qp);
2387 		TNF_PROBE_0(tavor_wrid_from_reset_handling_wridlist_fail,
2388 		    TAVOR_TNF_ERROR, "");
2389 		TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2390 		return (ibc_get_ci_failure(0));
2391 	}
2392 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*s_wridlist))
2393 	s_wridlist->wl_wqhdr = swq;
2394 
2395 	/* Chain the new WRID list container to the workq hdr list */
2396 	mutex_enter(&swq->wq_wrid_wql->wql_lock);
2397 	tavor_wrid_wqhdr_add(swq, s_wridlist);
2398 	mutex_exit(&swq->wq_wrid_wql->wql_lock);
2399 
2400 	qp_srq_en = qp->qp_srq_en;
2401 
2402 #ifdef __lock_lint
2403 	mutex_enter(&qp->qp_srqhdl->srq_lock);
2404 #else
2405 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2406 		mutex_enter(&qp->qp_srqhdl->srq_lock);
2407 	}
2408 #endif
2409 	/*
2410 	 * Now we repeat all the above operations for the receive work queue,
2411 	 * or shared receive work queue.
2412 	 *
2413 	 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
2414 	 */
2415 	rwq = tavor_wrid_wqhdr_find(qp->qp_rq_cqhdl, qp->qp_qpnum,
2416 	    TAVOR_WR_RECV);
2417 	if (rwq == NULL) {
2418 		create_new_rwq = create_wql = 1;
2419 
2420 		/*
2421 		 * If this QP is associated with an SRQ, and this isn't the
2422 		 * first QP on the SRQ, then the 'srq_wrid_wql' will already be
2423 		 * created.  Since the WQL is created at 'wqhdr_create' time, we
2424 		 * pass in the 'create_wql' flag as 0 here if it has already
2425 		 * been created.  Later on below, we set up the WQL and rwq
2426 		 * information based on the existing SRQ info.
2427 		 */
2428 		if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2429 		    qp->qp_srqhdl->srq_wrid_wql != NULL) {
2430 			create_wql = 0;
2431 		}
2432 
2433 		rwq = tavor_wrid_wqhdr_create(state, qp->qp_rq_cqhdl,
2434 		    qp->qp_qpnum, TAVOR_WR_RECV, create_wql);
2435 		if (rwq == NULL) {
2436 			/*
2437 			 * If we couldn't find/allocate space for the workq
2438 			 * header, then free all the send queue resources we
2439 			 * just allocated and setup (above), drop the lock(s)
2440 			 * and return failure.
2441 			 */
2442 			mutex_enter(&swq->wq_wrid_wql->wql_lock);
2443 			tavor_wrid_wqhdr_remove(swq, s_wridlist);
2444 			mutex_exit(&swq->wq_wrid_wql->wql_lock);
2445 			if (create_new_swq) {
2446 				tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl,
2447 				    swq);
2448 			}
2449 
2450 #ifdef __lock_lint
2451 			mutex_exit(&qp->qp_srqhdl->srq_lock);
2452 #else
2453 			if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2454 				mutex_exit(&qp->qp_srqhdl->srq_lock);
2455 			}
2456 #endif
2457 
2458 			tavor_wrid_wqhdr_unlock_both(qp);
2459 			TNF_PROBE_0(tavor_wrid_from_reset_handling_wqhdr_fail,
2460 			    TAVOR_TNF_ERROR, "");
2461 			TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2462 			return (ibc_get_ci_failure(0));
2463 		}
2464 	}
2465 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*rwq))
2466 
2467 	/*
2468 	 * Setup receive workq hdr
2469 	 *
2470 	 * If the QP is on an SRQ, we set up the SRQ-specific fields:
2471 	 * keeping a copy of the rwq pointer, setting the rwq bufsize
2472 	 * appropriately, and initializing our part of the WQLock.
2473 	 *
2474 	 * In the normal QP case, the QP recv queue bufsize is used.
2475 	 */
2476 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2477 		rwq->wq_size = qp->qp_srqhdl->srq_wq_bufsz;
2478 		if (qp->qp_srqhdl->srq_wrid_wql == NULL) {
2479 			qp->qp_srqhdl->srq_wrid_wql = rwq->wq_wrid_wql;
2480 		} else {
2481 			rwq->wq_wrid_wql = qp->qp_srqhdl->srq_wrid_wql;
2482 		}
2483 		tavor_wql_refcnt_inc(qp->qp_srqhdl->srq_wrid_wql);
2484 
2485 	} else {
2486 		rwq->wq_size = qp->qp_rq_bufsz;
2487 	}
2488 
2489 	qp->qp_rq_wqhdr = rwq;
2490 	rwq->wq_head = 0;
2491 	rwq->wq_tail = 0;
2492 	rwq->wq_full = 0;
2493 
2494 	/*
2495 	 * Allocate space for the tavor_wrid_entry_t container.
2496 	 *
2497 	 * If the QP is on an SRQ and the srq_wridlist is NULL, then we must
2498 	 * allocate the wridlist normally.  However, if the srq_wridlist is
2499 	 * not NULL, then we know this SRQ has already been initialized, thus the
2500 	 * wridlist has already been initialized.  So we re-use the
2501 	 * srq_wridlist as the r_wridlist for this QP in this case.
2502 	 */
2503 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2504 	    qp->qp_srqhdl->srq_wridlist != NULL) {
2505 		/* Use existing srq_wridlist pointer */
2506 		r_wridlist = qp->qp_srqhdl->srq_wridlist;
2507 		ASSERT(r_wridlist != NULL);
2508 	} else {
2509 		/* Allocate memory for the r_wridlist */
2510 		r_wridlist = tavor_wrid_get_list(rwq->wq_size);
2511 	}
2512 
2513 	/*
2514 	 * If the memory allocation failed for r_wridlist (or the SRQ pointer
2515 	 * is mistakenly NULL), we clean up our previous swq allocation from
2516 	 * above
2517 	 */
2518 	if (r_wridlist == NULL) {
2519 		/*
2520 		 * If we couldn't allocate space for tracking the WRID
2521 		 * entries, then clean up everything allocated above.  Then
2522 		 * drop the lock(s) and return failure.
2523 		 */
2524 		mutex_enter(&swq->wq_wrid_wql->wql_lock);
2525 		tavor_wrid_wqhdr_remove(swq, s_wridlist);
2526 		mutex_exit(&swq->wq_wrid_wql->wql_lock);
2527 		if (create_new_swq) {
2528 			tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2529 		}
2530 		if (create_new_rwq) {
2531 			tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, rwq);
2532 		}
2533 
2534 #ifdef __lock_lint
2535 		mutex_exit(&qp->qp_srqhdl->srq_lock);
2536 #else
2537 		if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2538 			mutex_exit(&qp->qp_srqhdl->srq_lock);
2539 		}
2540 #endif
2541 
2542 		tavor_wrid_wqhdr_unlock_both(qp);
2543 		TNF_PROBE_0(tavor_wrid_from_reset_handling_wridlist_fail,
2544 		    TAVOR_TNF_ERROR, "");
2545 		TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2546 		return (ibc_get_ci_failure(0));
2547 	}
2548 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*r_wridlist))
2549 
2550 	/*
2551 	 * Initialize the wridlist
2552 	 *
2553 	 * In the normal QP case, there is no special initialization needed.
2554 	 * We simply setup the wridlist backpointer to be the receive wqhdr
2555 	 * (rwq).
2556 	 *
2557 	 * But in the SRQ case, there is no backpointer to the wqhdr possible.
2558 	 * Instead we set 'wl_srq_en', specifying this wridlist is on an SRQ
2559 	 * and thus potentially shared across multiple QPs with the SRQ.  We
2560 	 * also set up the srq_wridlist pointer to be the r_wridlist, and
2561 	 * initialize the freelist to an invalid index.  This srq_wridlist
2562 	 * pointer is used above on future transitions from reset to let us
2563 	 * know that the srq_wridlist has been initialized already.
2564 	 *
2565 	 * And finally, if we are in a non-UMAP case, we set up the srq wrid
2566 	 * free list.
2567 	 */
2568 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2569 	    qp->qp_srqhdl->srq_wridlist == NULL) {
2570 		r_wridlist->wl_srq_en = 1;
2571 		r_wridlist->wl_free_list_indx = -1;
2572 		qp->qp_srqhdl->srq_wridlist = r_wridlist;
2573 
2574 		/* Initialize srq wrid free list */
2575 		if (qp->qp_srqhdl->srq_is_umap == 0) {
2576 			mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2577 			tavor_wrid_list_srq_init(r_wridlist, qp->qp_srqhdl, 0);
2578 			mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2579 		}
2580 	} else {
2581 		r_wridlist->wl_wqhdr = rwq;
2582 	}
2583 
2584 	/* Chain the WRID list "container" to the workq hdr list */
2585 	mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2586 	tavor_wrid_wqhdr_add(rwq, r_wridlist);
2587 	mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2588 
2589 #ifdef __lock_lint
2590 	mutex_exit(&qp->qp_srqhdl->srq_lock);
2591 #else
2592 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2593 		mutex_exit(&qp->qp_srqhdl->srq_lock);
2594 	}
2595 #endif
2596 
2597 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*r_wridlist))
2598 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*rwq))
2599 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*s_wridlist))
2600 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*swq))
2601 
2602 	tavor_wrid_wqhdr_unlock_both(qp);
2603 	TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2604 	return (DDI_SUCCESS);
2605 }
2606 
2607 
2608 /*
2609  * tavor_wrid_to_reset_handling()
2610  *    Context: Can be called from interrupt or base context.
2611  */
2612 void
2613 tavor_wrid_to_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2614 {
2615 	uint_t		free_wqhdr = 0;
2616 
2617 	TAVOR_TNF_ENTER(tavor_wrid_to_reset_handling);
2618 
2619 	/*
2620 	 * For each of this QP's Work Queues, move the WRID "container" to
2621 	 * the "reapable" list.  Although there may still be unpolled
2622 	 * entries in these containers, it is not a big deal.  We will not
2623 	 * reap the list until either the Poll CQ command detects an empty
2624 	 * condition or the CQ itself is freed.  Grab the CQ lock(s) before
2625 	 * manipulating the lists.
2626 	 */
2627 	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2628 	tavor_wrid_wqhdr_lock_both(qp);
2629 	tavor_wrid_reaplist_add(qp->qp_sq_cqhdl, qp->qp_sq_wqhdr);
2630 
2631 	/*
2632 	 * Add the receive work queue header on to the reaplist.  But if we are
2633 	 * on an SRQ, then don't add anything to the reaplist.  Instead we flush
2634 	 * the SRQ entries on the CQ, remove wridlist from WQHDR, and free the
2635 	 * WQHDR (if needed).  We must hold the WQL for these operations, yet
2636 	 * the call to tavor_cq_wqhdr_remove grabs the WQL internally.  So we
2637 	 * drop WQL before that call.  Then release the CQ WQHDR locks and the
2638 	 * CQ lock and return.
2639 	 */
2640 	if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2641 
2642 		/*
2643 		 * Pull off all (if any) entries for this QP from the CQ.  This
2644 		 * only includes entries that have not yet been polled
2645 		 */
2646 		mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2647 		tavor_cq_srq_entries_flush(state, qp);
2648 
2649 		/* Remove wridlist from WQHDR */
2650 		tavor_wrid_wqhdr_remove(qp->qp_rq_wqhdr,
2651 		    qp->qp_rq_wqhdr->wq_wrid_post);
2652 
2653 		/* If wridlist chain is now empty, remove the wqhdr as well */
2654 		if (qp->qp_rq_wqhdr->wq_wrid_post == NULL) {
2655 			free_wqhdr = 1;
2656 		} else {
2657 			free_wqhdr = 0;
2658 		}
2659 
2660 		mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2661 
2662 		/* Free the WQHDR */
2663 		if (free_wqhdr) {
2664 			tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2665 		}
2666 	} else {
2667 		tavor_wrid_reaplist_add(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2668 	}
2669 	tavor_wrid_wqhdr_unlock_both(qp);
2670 	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2671 
2672 	TAVOR_TNF_EXIT(tavor_wrid_to_reset_handling);
2673 }
2674 
2675 
2676 /*
2677  * tavor_wrid_add_entry()
2678  *    Context: Can be called from interrupt or base context.
2679  */
2680 void
2681 tavor_wrid_add_entry(tavor_workq_hdr_t *wq, uint64_t wrid, uint32_t wqeaddrsz,
2682     uint_t signaled_dbd)
2683 {
2684 	tavor_wrid_entry_t	*wre_tmp;
2685 	uint32_t		head, tail, size;
2686 
2687 	TAVOR_TNF_ENTER(tavor_wrid_add_entry);
2688 
2689 	ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2690 
2691 	/*
2692 	 * Find the entry in the container pointed to by the "tail" index.
2693 	 * Add all of the relevant information to that entry, including WRID,
2694 	 * "wqeaddrsz" parameter, and whether it was signaled/unsignaled
2695 	 * and/or doorbelled.
2696 	 */
2697 	head = wq->wq_wrid_post->wl_head;
2698 	tail = wq->wq_wrid_post->wl_tail;
2699 	size = wq->wq_wrid_post->wl_size;
2700 	wre_tmp = &wq->wq_wrid_post->wl_wre[tail];
2701 	wre_tmp->wr_wrid	  = wrid;
2702 	wre_tmp->wr_wqeaddrsz	  = wqeaddrsz;
2703 	wre_tmp->wr_signaled_dbd  = signaled_dbd;
2704 
2705 	/*
2706 	 * Update the "wrid_old_tail" pointer to point to the entry we just
2707 	 * inserted into the queue.  By tracking this pointer (the pointer to
2708 	 * the most recently inserted entry) it will be possible later in the
2709 	 * PostSend() and PostRecv() code paths to find the entry that needs
2710 	 * its "doorbelled" flag set (see comment in tavor_post_recv() and/or
2711 	 * tavor_post_send()).
2712 	 */
2713 	wq->wq_wrid_post->wl_wre_old_tail = wre_tmp;
2714 
2715 	/* Update the tail index */
2716 	tail = ((tail + 1) & (size - 1));
2717 	wq->wq_wrid_post->wl_tail = tail;
2718 
2719 	/*
2720 	 * If the "tail" index has just wrapped over into the "head" index,
2721 	 * then we have filled the container.  We use the "full" flag to
2722 	 * indicate this condition and to distinguish it from the "empty"
2723 	 * condition (where head and tail are also equal).
2724 	 */
2725 	if (head == tail) {
2726 		wq->wq_wrid_post->wl_full = 1;
2727 	}
2728 	TAVOR_TNF_EXIT(tavor_wrid_add_entry);
2729 }
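
/*
 * Illustrative sketch (not driver code): because "size" is a power of
 * two, the index arithmetic above implements a simple ring buffer in
 * which the "full" flag disambiguates the head == tail case:
 *
 *	tail = (tail + 1) & (size - 1);    advance past the new entry
 *	if (head == tail)
 *		full = 1;                  tail has wrapped onto head
 *
 * The container is empty when head == tail and "full" is clear.
 */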
2730 
2731 /*
2732  * tavor_wrid_add_entry_srq()
2733  * Context: Can be called from interrupt or base context
2734  */
2735 void
2736 tavor_wrid_add_entry_srq(tavor_srqhdl_t srq, uint64_t wrid, uint_t signaled_dbd)
2737 {
2738 	tavor_wrid_entry_t	*wre;
2739 	uint64_t		*wl_wqe;
2740 	uint32_t		wqe_index;
2741 
2742 	TAVOR_TNF_ENTER(tavor_wrid_add_entry_srq);
2743 
2744 	/*
2745 	 * Find the next available WQE from the SRQ free_list.  Then update the
2746 	 * free_list to point to the next entry.
2747 	 */
2748 	wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wridlist->wl_free_list_indx);
2749 
2750 	wqe_index = srq->srq_wridlist->wl_free_list_indx;
2751 
2752 	/* ASSERT on impossible wqe_index values */
2753 	ASSERT(wqe_index < srq->srq_wq_bufsz);
2754 
2755 	/*
2756 	 * Setup the WRE.
2757 	 *
2758 	 * Given the 'wqe_index' value, we store the WRID at this WRE offset.
2759 	 * And we set the WRE to be signaled_dbd so that at poll CQ time we can
2760 	 * find this information and associate the WRID with the WQE found in the CQE.
2761 	 */
2762 	wre = &srq->srq_wridlist->wl_wre[wqe_index];
2763 	wre->wr_wrid = wrid;
2764 	wre->wr_signaled_dbd  = signaled_dbd;
2765 
2766 	/* Update the free list index */
2767 	srq->srq_wridlist->wl_free_list_indx = ddi_get32(
2768 	    srq->srq_wridlist->wl_acchdl, (uint32_t *)wl_wqe);
2769 
2770 	TAVOR_TNF_EXIT(tavor_wrid_add_entry_srq);
2771 }
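
/*
 * Note that the free list used above is, in effect, a stack threaded
 * through the WQE memory itself: the ddi_get32() above pops the index
 * of the next free WQE out of the entry being consumed, while
 * tavor_wrid_find_match_srq() pushes indices back onto the list as
 * completions are polled.
 */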
2772 
2773 
2774 /*
2775  * tavor_wrid_get_entry()
2776  *    Context: Can be called from interrupt or base context.
2777  */
2778 uint64_t
2779 tavor_wrid_get_entry(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
2780     tavor_wrid_entry_t *wre)
2781 {
2782 	tavor_workq_hdr_t	*wq;
2783 	tavor_wrid_entry_t	*wre_tmp;
2784 	uint64_t		wrid;
2785 	uint_t			send_or_recv, qpnum, error, opcode;
2786 
2787 	TAVOR_TNF_ENTER(tavor_wrid_get_entry);
2788 
2789 	/* Lock the list of work queues associated with this CQ */
2790 	mutex_enter(&cq->cq_wrid_wqhdr_lock);
2791 
2792 	/*
2793 	 * Determine whether this CQE is a send or receive completion (and
2794 	 * whether it was a "successful" completion or not)
2795 	 */
2796 	opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
2797 	if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
2798 	    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
2799 		error = 1;
2800 		send_or_recv = (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ?
2801 		    TAVOR_COMPLETION_SEND : TAVOR_COMPLETION_RECV;
2802 	} else {
2803 		error = 0;
2804 		send_or_recv = TAVOR_CQE_SENDRECV_GET(cq, cqe);
2805 	}
2806 
2807 	/* Find the work queue for this QP number (send or receive side) */
2808 	qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
2809 	wq = tavor_wrid_wqhdr_find(cq, qpnum, send_or_recv);
2810 	ASSERT(wq != NULL);
2811 
2812 	/*
2813 	 * Regardless of whether the completion is the result of a "success"
2814 	 * or a "failure", we lock the list of "containers" and attempt to
2815 	 * search for the first matching completion (i.e. the first WR
2816 	 * with a matching WQE addr and size).  Once we find it, we pull out
2817 	 * the "wrid" field and return it (see below).  Note: One possible
2818 	 * future enhancement would be to enable this routine to skip over
2819 	 * any "unsignaled" completions to go directly to the next "signaled"
2820 	 * entry on success. XXX
2821 	 */
2822 	mutex_enter(&wq->wq_wrid_wql->wql_lock);
2823 	wre_tmp = tavor_wrid_find_match(wq, cq, cqe);
2824 
2825 	/*
2826 	 * If this is a "successful" completion, then we assert that this
2827 	 * completion must be a "signaled" completion.
2828 	 */
2829 	ASSERT(error || (wre_tmp->wr_signaled_dbd & TAVOR_WRID_ENTRY_SIGNALED));
2830 
2831 	/*
2832 	 * If the completion is a "failed" completion, then we save away the
2833 	 * contents of the entry (into the "wre" field passed in) for use
2834 	 * in later CQE processing. Note: We use the tavor_wrid_get_wqeaddrsz()
2835 	 * function to grab "wqeaddrsz" from the next entry in the container.
2836 	 * This is required for error processing (where updating these fields
2837 	 * properly is necessary for correct handling of the "error" CQE)
2838 	 */
2839 	if (error && (wre != NULL)) {
2840 		*wre = *wre_tmp;
2841 		wre->wr_wqeaddrsz = tavor_wrid_get_wqeaddrsz(wq);
2842 	}
2843 
2844 	/* Pull out the WRID and return it */
2845 	wrid = wre_tmp->wr_wrid;
2846 
2847 	mutex_exit(&wq->wq_wrid_wql->wql_lock);
2848 	mutex_exit(&cq->cq_wrid_wqhdr_lock);
2849 
2850 	TAVOR_TNF_EXIT(tavor_wrid_get_entry);
2851 	return (wrid);
2852 }
2853 
2854 
2855 /*
2856  * tavor_wrid_find_match()
2857  *    Context: Can be called from interrupt or base context.
2858  */
2859 static tavor_wrid_entry_t *
2860 tavor_wrid_find_match(tavor_workq_hdr_t *wq, tavor_cqhdl_t cq,
2861     tavor_hw_cqe_t *cqe)
2862 {
2863 	tavor_wrid_entry_t	*curr = NULL;
2864 	tavor_wrid_list_hdr_t	*container;
2865 	uint32_t		wqeaddr_size;
2866 	uint32_t		head, tail, size;
2867 	int			found = 0, last_container;
2868 
2869 	TAVOR_TNF_ENTER(tavor_wrid_find_match);
2870 
2871 	ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2872 
2873 	/* Pull the "wqeaddrsz" information from the CQE */
2874 	wqeaddr_size = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe);
2875 
2876 	/*
2877 	 * Walk the "containers" list(s), find first WR with a matching WQE
2878 	 * addr.  If the current "container" is not the last one on the list,
2879 	 * i.e. not the current one to which we are posting new WRID entries,
2880 	 * then we do not attempt to update the "q_head", "q_tail", and
2881 	 * "q_full" indicators on the main work queue header.  We do, however,
2882 	 * update the "head" and "full" indicators on the individual containers
2883 	 * as we go.  This is imperative because we need to be able to
2884 	 * determine when the current container has been emptied (so that we
2885 	 * can move on to the next container).
2886 	 */
2887 	container = wq->wq_wrid_poll;
2888 	while (container != NULL) {
2889 		/* Is this the last/only "container" on the list */
2890 		last_container = (container != wq->wq_wrid_post) ? 0 : 1;
2891 
2892 		/*
2893 		 * First check if we are on an SRQ.  If so, we grab the entry
2894 		 * and break out.  Since SRQ wridlists are never added to the
2895 		 * reaplist, they can only be the last container.
2896 		 */
2897 		if (container->wl_srq_en) {
2898 			ASSERT(last_container == 1);
2899 			curr = tavor_wrid_find_match_srq(container, cq, cqe);
2900 			break;
2901 		}
2902 
2903 		/*
2904 		 * Grab the current "head", "tail" and "size" fields before
2905 		 * walking the list in the current container. Note: the "size"
2906 		 * field here must always be a power-of-2.  The "full"
2907 		 * parameter is checked (and updated) here to distinguish the
2908 		 * "queue full" condition from "queue empty".
2909 		 */
2910 		head = container->wl_head;
2911 		tail = container->wl_tail;
2912 		size = container->wl_size;
2913 		while ((head != tail) || (container->wl_full)) {
2914 			container->wl_full = 0;
2915 			curr = &container->wl_wre[head];
2916 			head = ((head + 1) & (size - 1));
2917 
2918 			/*
2919 			 * If the current entry's "wqeaddrsz" matches the one
2920 			 * we're searching for, then this must correspond to
2921 			 * the work request that caused the completion.  Set
2922 			 * the "found" flag and bail out.
2923 			 */
2924 			if (curr->wr_wqeaddrsz == wqeaddr_size) {
2925 				found = 1;
2926 				break;
2927 			}
2928 		}
2929 
2930 		/*
2931 		 * If the current container is empty (having reached here the
2932 		 * "head == tail" condition can only mean that the container
2933 		 * is empty), then NULL out the "wrid_old_tail" field (see
2934 		 * tavor_post_send() and tavor_post_recv() for more details)
2935 		 * and (potentially) remove the current container from future
2936 		 * searches.
2937 		 */
2938 		if (head == tail) {
2939 
2940 			container->wl_wre_old_tail = NULL;
2941 			/*
2942 			 * If this wasn't the last "container" on the chain,
2943 			 * i.e. the one to which new WRID entries will be
2944 			 * added, then remove it from the list.
2945 			 * Note: we don't "lose" the memory pointed to by this
2946 			 * because we should have already put this container
2947 			 * on the "reapable" list (from where it will later be
2948 			 * pulled).
2949 			 */
2950 			if (!last_container) {
2951 				wq->wq_wrid_poll = container->wl_next;
2952 			}
2953 		}
2954 
2955 		/* Update the head index for the container */
2956 		container->wl_head = head;
2957 
2958 		/*
2959 		 * If the entry was found in this container, then bail out.
2960 		 * Else reset the "curr" pointer and move on to the
2961 		 * next container (if there is one).  Note: the only real
2962 		 * reason for setting "curr = NULL" here is so that the ASSERT
2963 		 * below can catch the case where no matching entry was found
2964 		 * on any of the lists.
2965 		 */
2966 		if (found) {
2967 			break;
2968 		} else {
2969 			curr = NULL;
2970 			container = container->wl_next;
2971 		}
2972 	}
2973 
2974 	/*
2975 	 * Update work queue header's "head" and "full" conditions to match
2976 	 * the last entry on the container list.  (Note: Only if we're pulling
2977 	 * entries from the last work queue portion of the list, i.e. not from
2978 	 * the previous portions that may be the "reapable" list.)
2979 	 */
2980 	if (last_container) {
2981 		wq->wq_head = wq->wq_wrid_post->wl_head;
2982 		wq->wq_full = wq->wq_wrid_post->wl_full;
2983 	}
2984 
2985 	/* Ensure that we've actually found what we were searching for */
2986 	ASSERT(curr != NULL);
2987 
2988 	TAVOR_TNF_EXIT(tavor_wrid_find_match);
2989 	return (curr);
2990 }
2991 
2992 
2993 /*
2994  * tavor_wrid_find_match_srq()
2995  *    Context: Can be called from interrupt or base context.
2996  */
2997 tavor_wrid_entry_t *
2998 tavor_wrid_find_match_srq(tavor_wrid_list_hdr_t *wl, tavor_cqhdl_t cq,
2999     tavor_hw_cqe_t *cqe)
3000 {
3001 	tavor_wrid_entry_t	*wre;
3002 	uint64_t		*wl_wqe;
3003 	uint32_t		wqe_index;
3004 	uint64_t		wqe_addr;
3005 	uint32_t		cqe_wqe_addr;
3006 
3007 	/* Grab the WQE addr out of the CQE */
3008 	cqe_wqe_addr = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe) & 0xFFFFFFC0;
3009 
3010 	/*
3011 	 * Use the WQE addr as the lower 32 bits; we add back the
3012 	 * 'wl_srq_desc_off' because we have a zero-based queue.  OR'ing on
3013 	 * the upper 32 bits of 'wl_srq_wq_buf' then gives us the WQE addr in
3014 	 * the SRQ Work Queue itself.  We use this address as the index to find
3015 	 * out which Work Queue Entry this CQE corresponds with.
3016 	 *
3017 	 * We also use this address below to add the WQE back on to the free
3018 	 * list.
3019 	 */
3020 	wqe_addr = ((uintptr_t)wl->wl_srq_wq_buf & 0xFFFFFFFF00000000ull) |
3021 	    (cqe_wqe_addr + wl->wl_srq_desc_off);
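
	/*
	 * Sketch (hypothetical addresses): if 'wl_srq_wq_buf' were
	 * 0x0000080330001000 with 'wl_srq_desc_off' 0x30001000, a CQE
	 * reporting zero-based WQE addr 0x40 would yield a wqe_addr of
	 * 0x0000080330001040, i.e. the second 64-byte WQE in the buffer.
	 */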
3022 
3023 	/*
3024 	 * Given the 'wqe_addr' just calculated and the srq buf address, we
3025 	 * find the 'wqe_index'.  The 'wre' returned below contains the WRID
3026 	 * that we are looking for.  This indexes into the wre_list for this
3027 	 * specific WQE.
3028 	 */
3029 	wqe_index = TAVOR_SRQ_WQE_INDEX(wl->wl_srq_wq_buf, wqe_addr,
3030 	    wl->wl_srq_log_wqesz);
3031 
3032 	/* ASSERT on impossible wqe_index values */
3033 	ASSERT(wqe_index < wl->wl_srq_wq_bufsz);
3034 
3035 	/* Get the pointer to this WQE */
3036 	wl_wqe = (uint64_t *)(uintptr_t)wqe_addr;
3037 
3038 	/* Put this WQE index back on the free list */
3039 	ddi_put32(wl->wl_acchdl, (uint32_t *)wl_wqe, wl->wl_free_list_indx);
3040 	wl->wl_free_list_indx = wqe_index;
3041 
3042 	/* Using the index, return the Work Request ID Entry (wre) */
3043 	wre = &wl->wl_wre[wqe_index];
3044 
3045 	return (wre);
3046 }
3047 
3048 
3049 /*
3050  * tavor_wrid_cq_reap()
3051  *    Context: Can be called from interrupt or base context.
3052  */
3053 void
3054 tavor_wrid_cq_reap(tavor_cqhdl_t cq)
3055 {
3056 	tavor_workq_hdr_t	*consume_wqhdr;
3057 	tavor_wrid_list_hdr_t	*container, *to_free;
3058 
3059 	ASSERT(MUTEX_HELD(&cq->cq_lock));
3060 
3061 	TAVOR_TNF_ENTER(tavor_wrid_cq_reap);
3062 
3063 	/* Lock the list of work queues associated with this CQ */
3064 	mutex_enter(&cq->cq_wrid_wqhdr_lock);
3065 
3066 	/* Walk the "reapable" list and free up containers */
3067 	container = cq->cq_wrid_reap_head;
3068 	while (container != NULL) {
3069 		to_free	  = container;
3070 		container = container->wl_reap_next;
3071 		/*
3072 		 * If reaping the WRID list containers pulls the last
3073 		 * container from the given work queue header, then we free
3074 		 * the work queue header as well.
3075 		 */
3076 		consume_wqhdr = tavor_wrid_list_reap(to_free);
3077 		if (consume_wqhdr != NULL) {
3078 			tavor_cq_wqhdr_remove(cq, consume_wqhdr);
3079 		}
3080 	}
3081 
3082 	/* Once finished reaping, we reset the CQ's reap list */
3083 	cq->cq_wrid_reap_head = cq->cq_wrid_reap_tail = NULL;
3084 
3085 	mutex_exit(&cq->cq_wrid_wqhdr_lock);
3086 	TAVOR_TNF_EXIT(tavor_wrid_cq_reap);
3087 }
3088 
3089 
3090 /*
3091  * tavor_wrid_cq_force_reap()
3092  *    Context: Can be called from interrupt or base context.
3093  */
3094 void
3095 tavor_wrid_cq_force_reap(tavor_cqhdl_t cq)
3096 {
3097 	tavor_workq_hdr_t	*curr;
3098 	tavor_wrid_list_hdr_t	*container, *to_free;
3099 	avl_tree_t		*treep;
3100 	void			*cookie = NULL;
3101 
3102 	ASSERT(MUTEX_HELD(&cq->cq_lock));
3103 
3104 	TAVOR_TNF_ENTER(tavor_wrid_cq_reap);
3105 
3106 	/*
3107 	 * The first step is to walk the "reapable" list and free up those
3108 	 * containers.  This is necessary because the containers on the
3109 	 * reapable list are not otherwise connected to the work queue headers
3110 	 * anymore.
3111 	 */
3112 	tavor_wrid_cq_reap(cq);
3113 
3114 	/* Now lock the list of work queues associated with this CQ */
3115 	mutex_enter(&cq->cq_wrid_wqhdr_lock);
3116 
3117 	/*
3118 	 * Walk the list of work queue headers and free up all the WRID list
3119 	 * containers chained to it.  Note: We don't need to grab the locks
3120 	 * for each of the individual WRID lists here because the only way
3121 	 * things can be added or removed from the list at this point would be
3122 	 * through posting a work request to a QP.  But if we've come this far,
3123 	 * then we can be assured that there are no longer any QPs associated
3124 	 * with the CQ that we are trying to free.
3125 	 */
3126 #ifdef __lock_lint
3127 	tavor_wrid_wqhdr_compare(NULL, NULL);
3128 #endif
3129 	treep = &cq->cq_wrid_wqhdr_avl_tree;
3130 	while ((curr = avl_destroy_nodes(treep, &cookie)) != NULL) {
3131 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*curr))
3132 		container = curr->wq_wrid_poll;
3133 		while (container != NULL) {
3134 			to_free	  = container;
3135 			container = container->wl_next;
3136 			/*
3137 			 * If reaping the WRID list containers pulls the last
3138 			 * container from the given work queue header, then
3139 			 * we free the work queue header as well.  Note: we
3140 			 * ignore the return value because we know that the
3141 			 * work queue header should always be freed once the
3142 			 * list of containers has come to an end.
3143 			 */
3144 			(void) tavor_wrid_list_reap(to_free);
3145 			if (container == NULL) {
3146 				tavor_cq_wqhdr_remove(cq, curr);
3147 			}
3148 		}
3149 	}
3150 	avl_destroy(treep);
3151 
3152 	mutex_exit(&cq->cq_wrid_wqhdr_lock);
3153 	TAVOR_TNF_EXIT(tavor_wrid_cq_reap);
3154 }
3155 
3156 
3157 /*
3158  * tavor_wrid_get_list()
3159  *    Context: Can be called from interrupt or base context.
3160  */
3161 tavor_wrid_list_hdr_t *
3162 tavor_wrid_get_list(uint32_t qsize)
3163 {
3164 	tavor_wrid_list_hdr_t	*wridlist;
3165 	uint32_t		size;
3166 
3167 	/*
3168 	 * The WRID list "container" consists of the tavor_wrid_list_hdr_t,
3169 	 * which holds the pointers necessary for maintaining the "reapable"
3170 	 * list, chaining together multiple "containers" old and new, and
3171 	 * tracking the head, tail, size, etc. for each container.
3172 	 *
3173 	 * The "container" also holds all the tavor_wrid_entry_t's, which are
3174 	 * allocated separately, one for each entry on the corresponding work
3175 	 * queue.
3176 	 */
3177 	size = sizeof (tavor_wrid_list_hdr_t);
3178 
3179 	/*
3180 	 * Note that this allocation has to be a NOSLEEP operation here
3181 	 * because we are holding the "wqhdr_list_lock" and, therefore,
3182 	 * could get raised to the interrupt level.
3183 	 */
3184 	wridlist = (tavor_wrid_list_hdr_t *)kmem_zalloc(size, KM_NOSLEEP);
3185 	if (wridlist == NULL) {
3186 		return (NULL);
3187 	}
3188 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wridlist))
3189 
3190 	/* Complete the "container" initialization */
3191 	wridlist->wl_size = qsize;
3192 	wridlist->wl_full = 0;
3193 	wridlist->wl_head = 0;
3194 	wridlist->wl_tail = 0;
3195 	wridlist->wl_wre = (tavor_wrid_entry_t *)kmem_zalloc(qsize *
3196 	    sizeof (tavor_wrid_entry_t), KM_NOSLEEP);
3197 	if (wridlist->wl_wre == NULL) {
3198 		kmem_free(wridlist, size);
3199 		return (NULL);
3200 	}
3201 	wridlist->wl_wre_old_tail  = NULL;
3202 	wridlist->wl_reap_next = NULL;
3203 	wridlist->wl_next  = NULL;
3204 	wridlist->wl_prev  = NULL;
3205 	wridlist->wl_srq_en = 0;
3206 
3207 	return (wridlist);
3208 }
3209 
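
/*
 * Note: a typical caller pairs tavor_wrid_get_list() with
 * tavor_wrid_wqhdr_add() below, holding the WQL lock across the chaining
 * step.  A minimal sketch (assuming a work queue header "wqhdr" already
 * associated with the CQ):
 *
 *	wridlist = tavor_wrid_get_list(qsize);
 *	if (wridlist != NULL) {
 *		mutex_enter(&wqhdr->wq_wrid_wql->wql_lock);
 *		tavor_wrid_wqhdr_add(wqhdr, wridlist);
 *		mutex_exit(&wqhdr->wq_wrid_wql->wql_lock);
 *	}
 */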
3210 /*
3211  * tavor_wrid_list_srq_init()
3212  * Context: Can be called from interrupt or base context
3213  */
3214 void
3215 tavor_wrid_list_srq_init(tavor_wrid_list_hdr_t *wridlist, tavor_srqhdl_t srq,
3216     uint_t wq_start)
3217 {
3218 	uint64_t *wl_wqe;
3219 	int wqe_index;
3220 
3221 	ASSERT(MUTEX_HELD(&srq->srq_wrid_wql->wql_lock));
3222 
3223 	/* Setup pointers for use later when we are polling the CQ */
3224 	wridlist->wl_srq_wq_buf = srq->srq_wq_buf;
3225 	wridlist->wl_srq_wq_bufsz = srq->srq_wq_bufsz;
3226 	wridlist->wl_srq_log_wqesz = srq->srq_wq_log_wqesz;
3227 	wridlist->wl_srq_desc_off = srq->srq_desc_off;
3228 	wridlist->wl_acchdl = srq->srq_wqinfo.qa_acchdl;
3229 
3230 	/* Verify that the given wq_start index falls within the WQ buffer */
3231 	ASSERT(wq_start < srq->srq_wq_bufsz);
3232 
3233 	/*
3234 	 * Initialize wridlist free list
3235 	 *
3236 	 * For each WQE, up to the size of our queue, we store an index in the WQ
3237 	 * memory itself, representing the next available free entry.  The
3238 	 * 'wl_free_list_indx' always holds the index of the next available
3239 	 * free entry in the WQ.  If 'wl_free_list_indx' is -1, then we are
3240 	 * completely full.  This gives us the advantage of being able to have
3241 	 * entries complete or be polled off the WQ out-of-order.
3242 	 *
3243 	 * For now, we write the free_list entries inside the WQ itself.  It
3244 	 * may be useful in the future to store this information in a separate
3245 	 * structure for debugging purposes.
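	 *
	 * For example, with wq_start == 0 and srq_wq_bufsz == 4, the loop
	 * below stores the prior head index into WQE[0] and leaves
	 * WQE[1] = 0, WQE[2] = 1, WQE[3] = 2, with 'wl_free_list_indx' = 3;
	 * consuming free entries then walks the chain 3 -> 2 -> 1 -> 0.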
3246 	 */
3247 	for (wqe_index = wq_start; wqe_index < srq->srq_wq_bufsz; wqe_index++) {
3248 		wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, wqe_index);
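		/*
		 * Store the previous free-list head index in the WQE memory
		 * itself, then make this WQE's index the new head.
		 */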
3249 		ddi_put32(wridlist->wl_acchdl, (uint32_t *)wl_wqe,
3250 		    wridlist->wl_free_list_indx);
3251 		wridlist->wl_free_list_indx = wqe_index;
3252 	}
3253 }
3254 
3255 
3256 /*
3257  * tavor_wrid_reaplist_add()
3258  *    Context: Can be called from interrupt or base context.
3259  */
3260 static void
3261 tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq)
3262 {
3263 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3264 
3265 	TAVOR_TNF_ENTER(tavor_wrid_reaplist_add);
3266 
3267 	mutex_enter(&wq->wq_wrid_wql->wql_lock);
3268 
3269 	/*
3270 	 * Add the "post" container (the last one on the current chain) to
3271 	 * the CQ's "reapable" list
3272 	 */
3273 	if ((cq->cq_wrid_reap_head == NULL) &&
3274 	    (cq->cq_wrid_reap_tail == NULL)) {
3275 		cq->cq_wrid_reap_head = wq->wq_wrid_post;
3276 		cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3277 	} else {
3278 		cq->cq_wrid_reap_tail->wl_reap_next = wq->wq_wrid_post;
3279 		cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3280 	}
3281 
3282 	mutex_exit(&wq->wq_wrid_wql->wql_lock);

	TAVOR_TNF_EXIT(tavor_wrid_reaplist_add);
3283 }
3284 
3285 
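/*
 * tavor_wrid_wqhdr_compare()
 *    Context: Can be called from interrupt or base context.
 *
 *    AVL tree comparison routine for the CQ's work queue header tree.
 *    Entries are ordered first by QP number and then by work queue type.
 */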
3286 int
3287 tavor_wrid_wqhdr_compare(const void *p1, const void *p2)
3288 {
3289 	tavor_workq_compare_t	*cmpp;
3290 	tavor_workq_hdr_t	*curr;
3291 
3292 	cmpp = (tavor_workq_compare_t *)p1;
3293 	curr = (tavor_workq_hdr_t *)p2;
3294 
3295 	if (cmpp->cmp_qpn < curr->wq_qpn)
3296 		return (-1);
3297 	else if (cmpp->cmp_qpn > curr->wq_qpn)
3298 		return (+1);
3299 	else if (cmpp->cmp_type < curr->wq_type)
3300 		return (-1);
3301 	else if (cmpp->cmp_type > curr->wq_type)
3302 		return (+1);
3303 	else
3304 		return (0);
3305 }
3306 
3307 
3308 /*
3309  * tavor_wrid_wqhdr_find()
3310  *    Context: Can be called from interrupt or base context.
3311  */
3312 static tavor_workq_hdr_t *
3313 tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type)
3314 {
3315 	tavor_workq_hdr_t	*curr;
3316 	tavor_workq_compare_t	cmp;
3317 
3318 	TAVOR_TNF_ENTER(tavor_wrid_wqhdr_find);
3319 
3320 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3321 
3322 	/*
3323 	 * Walk the CQ's work queue list, trying to find a send or recv queue
3324 	 * with the same QP number.  We do this even if we are going to later
3325 	 * create a new entry because it helps us easily find the end of the
3326 	 * list.
3327 	 */
3328 	cmp.cmp_qpn = qpn;
3329 	cmp.cmp_type = wq_type;
3330 #ifdef __lock_lint
3331 	tavor_wrid_wqhdr_compare(NULL, NULL);
3332 #endif
3333 	curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
3334 
3335 	TAVOR_TNF_EXIT(tavor_wrid_wqhdr_find);
3336 	return (curr);
3337 }
3338 
3339 
3340 /*
3341  * tavor_wrid_wqhdr_create()
3342  *    Context: Can be called from interrupt or base context.
3343  */
3344 static tavor_workq_hdr_t *
3345 tavor_wrid_wqhdr_create(tavor_state_t *state, tavor_cqhdl_t cq, uint_t qpn,
3346     uint_t wq_type, uint_t create_wql)
3347 {
3348 	tavor_workq_hdr_t	*wqhdr_tmp;
3349 
3350 	TAVOR_TNF_ENTER(tavor_wrid_wqhdr_create);
3351 
3352 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3353 
3354 	/*
3355 	 * Allocate space for a work queue header structure and initialize it.
3356 	 * Each work queue header structure includes a "wq_wrid_wql"
3357 	 * which needs to be initialized.  Note that this allocation has to be
3358 	 * a NOSLEEP operation because we are holding the "cq_wrid_wqhdr_lock"
3359 	 * and, therefore, could get raised to the interrupt level.
3360 	 */
3361 	wqhdr_tmp = (tavor_workq_hdr_t *)kmem_zalloc(
3362 	    sizeof (tavor_workq_hdr_t), KM_NOSLEEP);
3363 	if (wqhdr_tmp == NULL) {
3364 		TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3365 		return (NULL);
3366 	}
3367 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr_tmp))
3368 	wqhdr_tmp->wq_qpn	= qpn;
3369 	wqhdr_tmp->wq_type	= wq_type;
3370 
3371 	if (create_wql) {
3372 		wqhdr_tmp->wq_wrid_wql = tavor_wrid_wql_create(state);
3373 		if (wqhdr_tmp->wq_wrid_wql == NULL) {
3374 			kmem_free(wqhdr_tmp, sizeof (tavor_workq_hdr_t));
3375 			TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3376 			return (NULL);
3377 		}
3378 	}
3379 
3380 	wqhdr_tmp->wq_wrid_poll = NULL;
3381 	wqhdr_tmp->wq_wrid_post = NULL;
3382 
3383 	/* Chain the newly allocated work queue header to the CQ's list */
3384 	tavor_cq_wqhdr_add(cq, wqhdr_tmp);
3385 
3386 	TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3387 	return (wqhdr_tmp);
3388 }
3389 
3390 
3391 /*
3392  * tavor_wrid_wql_create()
3393  *    Context: Can be called from interrupt or base context.
3394  */
3395 tavor_wq_lock_t *
3396 tavor_wrid_wql_create(tavor_state_t *state)
3397 {
3398 	tavor_wq_lock_t *wql;
3399 
3400 	TAVOR_TNF_ENTER(tavor_wrid_wql_create);
3401 
3402 	/*
3403 	 * Allocate the WQL and initialize it.
3404 	 */
3405 	wql = kmem_zalloc(sizeof (tavor_wq_lock_t), KM_NOSLEEP);
3406 	if (wql == NULL) {
3407 		TAVOR_TNF_EXIT(tavor_wrid_wql_create);
3408 		return (NULL);
3409 	}
3410 
3411 	mutex_init(&wql->wql_lock, NULL, MUTEX_DRIVER,
3412 	    DDI_INTR_PRI(state->ts_intrmsi_pri));
3413 
3414 	/* Add refcount to WQL */
3415 	tavor_wql_refcnt_inc(wql);
3416 
3417 	TAVOR_TNF_EXIT(tavor_wrid_wql_create);
3418 	return (wql);
3419 }
3420 
3421 
3422 /*
3423  * tavor_wrid_get_wqeaddrsz()
3424  *    Context: Can be called from interrupt or base context.
3425  */
3426 static uint32_t
3427 tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq)
3428 {
3429 	tavor_wrid_entry_t	*wre;
3430 	uint32_t		wqeaddrsz;
3431 	uint32_t		head;
3432 
3433 	/*
3434 	 * If the container is empty, then there is no next entry. So just
3435 	 * return zero.  Note: the "head == tail" condition here can only
3436 	 * mean that the container is empty because we have previously pulled
3437 	 * something from the container.
3438 	 *
3439 	 * If the container is not empty, then find the next entry and return
3440 	 * the contents of its "wqeaddrsz" field.
3441 	 */
3442 	if (wq->wq_wrid_poll->wl_head == wq->wq_wrid_poll->wl_tail) {
3443 		wqeaddrsz = 0;
3444 	} else {
3445 		/*
3446 		 * We don't need to calculate the "next" head pointer here
3447 		 * because "head" should already point to the next entry on
3448 		 * the list (since tavor_wrid_find_match() just pulled an entry
3449 		 * off and moved the head index forward).
3450 		 */
3451 		head = wq->wq_wrid_poll->wl_head;
3452 		wre = &wq->wq_wrid_poll->wl_wre[head];
3453 		wqeaddrsz = wre->wr_wqeaddrsz;
3454 	}
3455 	return (wqeaddrsz);
3456 }
3457 
3458 
3459 /*
3460  * tavor_wrid_wqhdr_add()
3461  *    Context: Can be called from interrupt or base context.
3462  */
3463 static void
3464 tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
3465     tavor_wrid_list_hdr_t *wridlist)
3466 {
3467 	ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3468 
3469 	/* Chain the new WRID list "container" to the work queue list */
3470 	if ((wqhdr->wq_wrid_post == NULL) &&
3471 	    (wqhdr->wq_wrid_poll == NULL)) {
3472 		wqhdr->wq_wrid_poll = wridlist;
3473 		wqhdr->wq_wrid_post = wridlist;
3474 	} else {
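		/* Otherwise, append at the "post" (tail) end of the chain */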
3475 		wqhdr->wq_wrid_post->wl_next = wridlist;
3476 		wridlist->wl_prev = wqhdr->wq_wrid_post;
3477 		wqhdr->wq_wrid_post = wridlist;
3478 	}
3479 }
3480 
3481 
3482 /*
3483  * tavor_wrid_wqhdr_remove()
3484  *    Context: Can be called from interrupt or base context.
3485  *
3486  *    Note: this is only called to remove the most recently added WRID list
3487  *    container (i.e. in tavor_wrid_from_reset() above)
3488  */
3489 static void
3490 tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
3491     tavor_wrid_list_hdr_t *wridlist)
3492 {
3493 	tavor_wrid_list_hdr_t	*prev, *next;
3494 
3495 	ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3496 
3497 	/* Unlink the WRID list "container" from the work queue list */
3498 	prev = wridlist->wl_prev;
3499 	next = wridlist->wl_next;
3500 	if (prev != NULL) {
3501 		prev->wl_next = next;
3502 	}
3503 	if (next != NULL) {
3504 		next->wl_prev = prev;
3505 	}
3506 
3507 	/*
3508 	 * Update any pointers in the work queue hdr that may point to this
3509 	 * WRID list container
3510 	 */
3511 	if (wqhdr->wq_wrid_post == wridlist) {
3512 		wqhdr->wq_wrid_post = prev;
3513 	}
3514 	if (wqhdr->wq_wrid_poll == wridlist) {
3515 		wqhdr->wq_wrid_poll = NULL;
3516 	}
3517 }
3518 
3519 
3520 /*
3521  * tavor_wrid_list_reap()
3522  *    Context: Can be called from interrupt or base context.
3523  *    Note: The "wqhdr_list_lock" must be held.
3524  */
3525 static tavor_workq_hdr_t *
3526 tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wridlist)
3527 {
3528 	tavor_workq_hdr_t	*wqhdr, *consume_wqhdr = NULL;
3529 	tavor_wrid_list_hdr_t	*prev, *next;
3530 	uint32_t		size;
3531 
3532 	TAVOR_TNF_ENTER(tavor_wrid_list_reap);
3533 
3534 	/* Get the back pointer to the work queue header (see below) */
3535 	wqhdr = wridlist->wl_wqhdr;
3536 	mutex_enter(&wqhdr->wq_wrid_wql->wql_lock);
3537 
3538 	/* Unlink the WRID list "container" from the work queue list */
3539 	prev = wridlist->wl_prev;
3540 	next = wridlist->wl_next;
3541 	if (prev != NULL) {
3542 		prev->wl_next = next;
3543 	}
3544 	if (next != NULL) {
3545 		next->wl_prev = prev;
3546 	}
3547 
3548 	/*
3549 	 * If the back pointer to the work queue header shows that it
3550 	 * was pointing to the entry we are about to remove, then the work
3551 	 * queue header is reapable as well.
3552 	 */
3553 	if ((wqhdr->wq_wrid_poll == wridlist) &&
3554 	    (wqhdr->wq_wrid_post == wridlist)) {
3555 		consume_wqhdr = wqhdr;
3556 	}
3557 
3558 	/* Be sure to update the "poll" and "post" container pointers */
3559 	if (wqhdr->wq_wrid_poll == wridlist) {
3560 		wqhdr->wq_wrid_poll = next;
3561 	}
3562 	if (wqhdr->wq_wrid_post == wridlist) {
3563 		wqhdr->wq_wrid_post = NULL;
3564 	}
3565 
3566 	/* Calculate the size and free the container */
3567 	size = (wridlist->wl_size * sizeof (tavor_wrid_entry_t));
3568 	kmem_free(wridlist->wl_wre, size);
3569 	kmem_free(wridlist, sizeof (tavor_wrid_list_hdr_t));
3570 
3571 	mutex_exit(&wqhdr->wq_wrid_wql->wql_lock);
3572 
3573 	TAVOR_TNF_EXIT(tavor_wrid_list_reap);
3574 	return (consume_wqhdr);
3575 }
3576 
3577 
3578 /*
3579  * tavor_wrid_wqhdr_lock_both()
3580  *    Context: Can be called from interrupt or base context.
3581  */
3582 static void
3583 tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp)
3584 {
3585 	tavor_cqhdl_t	sq_cq, rq_cq;
3586 
3587 	sq_cq = qp->qp_sq_cqhdl;
3588 	rq_cq = qp->qp_rq_cqhdl;
3589 
3590 _NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3591 _NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3592 
3593 	/*
3594 	 * If both work queues (send and recv) share a completion queue, then
3595 	 * grab the common lock.  If they use different CQs (hence different
3596 	 * "cq_wrid_wqhdr_list" locks), then grab the send one first, then the
3597 	 * receive.  We do this consistently and correctly in
3598 	 * tavor_wrid_wqhdr_unlock_both() below to avoid introducing any kind
3599 	 * of deadlock condition.  Note:  We add the "__lock_lint" code here
3600 	 * to fake out warlock into thinking we've grabbed both locks (when,
3601 	 * in fact, we only needed the one).
3602 	 */
3603 	if (sq_cq == rq_cq) {
3604 		mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3605 #ifdef	__lock_lint
3606 		mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3607 #endif
3608 	} else {
3609 		mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3610 		mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3611 	}
3612 }
3613 
3614 /*
3615  * tavor_wrid_wqhdr_unlock_both()
3616  *    Context: Can be called from interrupt or base context.
3617  */
3618 static void
3619 tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp)
3620 {
3621 	tavor_cqhdl_t	sq_cq, rq_cq;
3622 
3623 	sq_cq = qp->qp_sq_cqhdl;
3624 	rq_cq = qp->qp_rq_cqhdl;
3625 
3626 _NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3627 _NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3628 
3629 	/*
3630 	 * See tavor_wrid_wqhdr_lock_both() above for more detail
3631 	 */
3632 	if (sq_cq == rq_cq) {
3633 #ifdef	__lock_lint
3634 		mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3635 #endif
3636 		mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3637 	} else {
3638 		mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3639 		mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3640 	}
3641 }
3642 
3643 
3644 /*
3645  * tavor_cq_wqhdr_add()
3646  *    Context: Can be called from interrupt or base context.
3647  */
3648 static void
3649 tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3650 {
3651 	tavor_workq_compare_t	cmp;
3652 	avl_index_t		where;
3653 
3654 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3655 
3656 	cmp.cmp_qpn = wqhdr->wq_qpn;
3657 	cmp.cmp_type = wqhdr->wq_type;
3658 #ifdef __lock_lint
3659 	tavor_wrid_wqhdr_compare(NULL, NULL);
3660 #endif
3661 	(void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
3662 	/*
3663 	 * The avl_find() above is done only to compute 'where'; the entry is
3664 	 * known not to exist, so insert it at that position in the tree.
3665 	 */
3666 	avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqhdr, where);
3667 }
3668 
3669 
3670 /*
3671  * tavor_cq_wqhdr_remove()
3672  *    Context: Can be called from interrupt or base context.
3673  */
3674 static void
3675 tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3676 {
3677 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3678 
3679 #ifdef __lock_lint
3680 	tavor_wrid_wqhdr_compare(NULL, NULL);
3681 #endif
3682 	/* Remove "wqhdr" from the work queue header list on "cq" */
3683 	avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqhdr);
3684 
3685 	/*
3686 	 * Release reference to WQL; If this is the last reference, this call
3687 	 * also has the side effect of freeing up the 'wq_wrid_wql' memory.
3688 	 */
3689 	tavor_wql_refcnt_dec(wqhdr->wq_wrid_wql);
3690 
3691 	/* Free the memory associated with "wqhdr" */
3692 	kmem_free(wqhdr, sizeof (tavor_workq_hdr_t));
3693 }
3694 
3695 
3696 /*
3697  * tavor_wql_refcnt_inc()
3698  * Context: Can be called from interrupt or base context
3699  */
3700 void
3701 tavor_wql_refcnt_inc(tavor_wq_lock_t *wql)
3702 {
3703 	ASSERT(wql != NULL);
3704 
3705 	mutex_enter(&wql->wql_lock);
3706 	wql->wql_refcnt++;
3707 	mutex_exit(&wql->wql_lock);
3708 }
3709 
3710 /*
3711  * tavor_wql_refcnt_dec()
3712  * Context: Can be called from interrupt or base context
3713  */
3714 void
3715 tavor_wql_refcnt_dec(tavor_wq_lock_t *wql)
3716 {
3717 	int	refcnt;
3718 
3719 	ASSERT(wql != NULL);
3720 
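	/* Drop the reference and sample the new count under the lock */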
3721 	mutex_enter(&wql->wql_lock);
3722 	wql->wql_refcnt--;
3723 	refcnt = wql->wql_refcnt;
3724 	mutex_exit(&wql->wql_lock);
3725 
3726 	/*
3727 	 * Free up the WQL memory if ours was the last reference to this
3728 	 * structure.  The count is sampled while the lock is held (above)
3729 	 * because the lock must be dropped before it can be destroyed.
3730 	 */
3731 	if (refcnt == 0) {
3732 		mutex_destroy(&wql->wql_lock);
3733 		kmem_free(wql, sizeof (tavor_wq_lock_t));
3734 	}
3735 }
3736