1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * tavor_wr.c
29  *    Tavor Work Request Processing Routines
30  *
31  *    Implements all the routines necessary to provide the PostSend(),
32  *    PostRecv() and PostSRQ() verbs.  Also contains all the code
33  *    necessary to implement the Tavor WRID tracking mechanism.
34  */
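/*
 * Illustrative usage sketch (not part of the driver):  a kernel IBTF
 * client reaches tavor_post_send() below by filling in an ibt_send_wr_t
 * and calling ibt_post_send(), which the IBTF framework dispatches to
 * this HCA driver through its CI ops vector.  The handles and values
 * shown ("chanhdl", "dest", "lkey", "buf", "len") are assumed to have
 * been set up elsewhere.
 *
 *	ibt_send_wr_t	wr;
 *	ibt_wr_ds_t	sgl;
 *	uint_t		posted;
 *
 *	bzero(&wr, sizeof (wr));
 *	sgl.ds_va	= (ib_vaddr_t)(uintptr_t)buf;
 *	sgl.ds_key	= lkey;
 *	sgl.ds_len	= len;
 *	wr.wr_id	= (ibt_wrid_t)(uintptr_t)buf;
 *	wr.wr_flags	= IBT_WR_SEND_SIGNAL;
 *	wr.wr_trans	= IBT_UD_SRV;
 *	wr.wr_opcode	= IBT_WRC_SEND;
 *	wr.wr.ud.udwr_dest = dest;
 *	wr.wr_nds	= 1;
 *	wr.wr_sgl	= &sgl;
 *	(void) ibt_post_send(chanhdl, &wr, 1, &posted);
 */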
35 
36 #include <sys/types.h>
37 #include <sys/conf.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/modctl.h>
41 #include <sys/avl.h>
42 
43 #include <sys/ib/adapters/tavor/tavor.h>
44 
45 static void tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda,
46     uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode);
47 #pragma inline(tavor_qp_send_doorbell)
48 static void tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda,
49     uint32_t nds, uint32_t qpn, uint32_t credits);
50 #pragma inline(tavor_qp_recv_doorbell)
51 static uint32_t tavor_wr_get_immediate(ibt_send_wr_t *wr);
52 static int tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr);
53 static int tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
54     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
55 static void tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr,
56     ibt_send_wr_t *prev_wr, uint64_t *curr_desc, uint_t curr_descsz,
57     uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp);
58 static int tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
59     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
60 static void tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
61     uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
62     tavor_qphdl_t qp);
63 static int tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
64     ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size);
65 static void tavor_wqe_recv_linknext(uint64_t *desc, uint_t desc_sz,
66     uint64_t *prev, tavor_qphdl_t qp);
67 static int tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
68     ibt_recv_wr_t *wr, uint64_t *desc);
69 static void tavor_wqe_srq_linknext(uint64_t *desc, uint64_t *prev,
70     tavor_srqhdl_t srq);
71 static void tavor_wqe_sync(void *hdl, uint_t sync_from,
72     uint_t sync_to, uint_t sync_type, uint_t flag);
73 static tavor_wrid_entry_t *tavor_wrid_find_match(tavor_workq_hdr_t *wq,
74     tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe);
75 static void tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq);
76 static tavor_workq_hdr_t *tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn,
77     uint_t send_or_recv);
78 static tavor_workq_hdr_t *tavor_wrid_wqhdr_create(tavor_state_t *state,
79     tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type, uint_t create_wql);
80 static uint32_t tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq);
81 static void tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
82     tavor_wrid_list_hdr_t *wrid_list);
83 static void tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
84     tavor_wrid_list_hdr_t *wrid_list);
85 static tavor_workq_hdr_t *tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wq);
86 static void tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp);
87 static void tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp);
88 static void tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
89 static void tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
90 
91 /*
92  * tavor_post_send()
93  *    Context: Can be called from interrupt or base context.
94  */
95 int
96 tavor_post_send(tavor_state_t *state, tavor_qphdl_t qp,
97     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
98 {
99 	tavor_sw_wqe_dbinfo_t		dbinfo;
100 	tavor_wrid_list_hdr_t		*wridlist;
101 	tavor_wrid_entry_t		*wre_last;
102 	uint64_t			*desc, *prev, *first;
103 	uint32_t			desc_sz, first_sz;
104 	uint32_t			wqeaddrsz, signaled_dbd;
105 	uint32_t			head, tail, next_tail, qsize_msk;
106 	uint32_t			sync_from, sync_to;
107 	uint_t				currindx, wrindx, numremain;
108 	uint_t				chainlen, chainbegin, posted_cnt;
109 	uint_t				maxdb = TAVOR_QP_MAXDESC_PER_DB;
110 	int				status;
111 
112 	/*
113 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
114 	 * clients to post to QP memory that is accessible directly by the
115 	 * user.  If the QP memory is user accessible, then return an error.
116 	 */
117 	if (qp->qp_is_umap) {
118 		return (IBT_QP_HDL_INVALID);
119 	}
120 
121 	/* Initialize posted_cnt */
122 	posted_cnt = 0;
123 
124 	mutex_enter(&qp->qp_lock);
125 
126 	/*
127 	 * Check QP state.  Can not post Send requests from the "Reset",
128 	 * "Init", or "RTR" states
129 	 */
130 	if ((qp->qp_state == TAVOR_QP_RESET) ||
131 	    (qp->qp_state == TAVOR_QP_INIT) ||
132 	    (qp->qp_state == TAVOR_QP_RTR)) {
133 		mutex_exit(&qp->qp_lock);
134 		return (IBT_QP_STATE_INVALID);
135 	}
136 
137 	/* Grab the lock for the WRID list */
138 	mutex_enter(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
139 	wridlist  = qp->qp_sq_wqhdr->wq_wrid_post;
140 
141 	/* Save away some initial QP state */
142 	qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
143 	tail	  = qp->qp_sq_wqhdr->wq_tail;
144 	head	  = qp->qp_sq_wqhdr->wq_head;
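	/*
	 * Note: the work queue size is a power of two, so "qsize_msk"
	 * (wq_size - 1) can be used below to wrap the "tail" index around
	 * the end of the queue with a simple bitwise AND.
	 */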
145 
146 	/*
147 	 * For each ibt_send_wr_t in the wr[] list passed in, parse the
148 	 * request and build a Send WQE.  Note:  Because we are potentially
149 	 * building a chain of WQEs, we want to link them all together.
150 	 * However, we do not want to link the first one to the previous
151 	 * WQE until the entire chain has been linked.  Then in the last
152 	 * step we ring the appropriate doorbell.  Note:  It is possible for
153 	 * more Work Requests to be posted than the HW will support at one
154 	 * shot.  If this happens, we need to be able to post and ring
155 	 * several chains here until the entire request is complete.
156 	 */
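	/*
	 * For example (assuming TAVOR_QP_MAXDESC_PER_DB were 256), a call
	 * with num_wr == 600 would be posted as three separate chains of
	 * 256, 256, and 88 WQEs, with the doorbell rung once per chain.
	 */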
157 	wrindx = 0;
158 	numremain = num_wr;
159 	status	  = DDI_SUCCESS;
160 	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
161 		/*
162 		 * For the first WQE on a new chain we need "prev" to point
163 		 * to the current descriptor.  As we begin to process
164 		 * further, "prev" will be updated to point to the previous
165 		 * WQE on the current chain (see below).
166 		 */
167 		prev = TAVOR_QP_SQ_ENTRY(qp, tail);
168 
169 		/*
170 		 * Before we begin, save the current "tail index" for later
171 		 * DMA sync
172 		 */
173 		sync_from = tail;
174 
175 		/*
176 		 * Break the request up into chains that are less than or
177 		 * equal to the maximum number of WQEs that can be posted
178 		 * per doorbell ring
179 		 */
180 		chainlen   = (numremain > maxdb) ? maxdb : numremain;
181 		numremain -= chainlen;
182 		chainbegin = wrindx;
183 		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
184 			/*
185 			 * Check for "queue full" condition.  If the queue
186 			 * is already full, then no more WQEs can be posted.
187 			 * So break out, ring a doorbell (if necessary) and
188 			 * return an error
189 			 */
190 			if (qp->qp_sq_wqhdr->wq_full != 0) {
191 				status = IBT_QP_FULL;
192 				break;
193 			}
194 
195 			/*
196 			 * Increment the "tail index" and check for "queue
197 			 * full" condition.  If we detect that the current
198 			 * work request is going to fill the work queue, then
199 			 * we mark this condition and continue.
200 			 */
201 			next_tail = (tail + 1) & qsize_msk;
202 			if (next_tail == head) {
203 				qp->qp_sq_wqhdr->wq_full = 1;
204 			}
205 
206 			/*
207 			 * Get the address of the location where the next
208 			 * Send WQE should be built
209 			 */
210 			desc = TAVOR_QP_SQ_ENTRY(qp, tail);
211 
212 			/*
213 			 * Call tavor_wqe_send_build() to build the WQE
214 			 * at the given address.  This routine uses the
215 			 * information in the ibt_send_wr_t list (wr[]) and
216 			 * returns the size of the WQE when it returns.
217 			 */
218 			status = tavor_wqe_send_build(state, qp,
219 			    &wr[wrindx], desc, &desc_sz);
220 			if (status != DDI_SUCCESS) {
221 				break;
222 			}
223 
224 			/*
225 			 * Add a WRID entry to the WRID list.  Need to
226 			 * calculate the "wqeaddrsz" and "signaled_dbd"
227 			 * values to pass to tavor_wrid_add_entry()
228 			 */
229 			wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
230 			    ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
231 			    desc_sz);
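			/*
			 * Note: "wqeaddrsz" packs the queue-relative address
			 * of this descriptor together with its size.  It is
			 * saved in the WRID entry so that, at completion
			 * time, the WRID tracking code can associate a CQE
			 * with the work request that produced it (see
			 * tavor_wrid_find_match()).
			 */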
232 			if ((qp->qp_sq_sigtype == TAVOR_QP_SQ_ALL_SIGNALED) ||
233 			    (wr[wrindx].wr_flags & IBT_WR_SEND_SIGNAL)) {
234 				signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
235 			} else {
236 				signaled_dbd = 0;
237 			}
238 			tavor_wrid_add_entry(qp->qp_sq_wqhdr,
239 			    wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);
240 
241 			/*
242 			 * If this is not the first descriptor on the current
243 			 * chain, then link it to the previous WQE.  Otherwise,
244 			 * save the address and size of this descriptor (in
245 			 * "first" and "first_sz" respectively) and continue.
246 			 * Note: Linking a WQE to the previous one will
247 			 * depend on whether the two WQEs are from "special
248 			 * QPs" (i.e. MLX transport WQEs) or whether they are
249 			 * normal Send WQEs.
250 			 */
251 			if (currindx != 0) {
252 				if (qp->qp_is_special) {
253 					tavor_wqe_mlx_linknext(&wr[wrindx - 1],
254 					    desc, desc_sz, prev, NULL, qp);
255 				} else {
256 					tavor_wqe_send_linknext(&wr[wrindx],
257 					    &wr[wrindx - 1], desc, desc_sz,
258 					    prev, NULL, qp);
259 				}
260 				prev = desc;
261 			} else {
262 				first	 = desc;
263 				first_sz = desc_sz;
264 			}
265 
266 			/*
267 			 * Update the current "tail index" and increment
268 			 * "posted_cnt"
269 			 */
270 			tail = next_tail;
271 			posted_cnt++;
272 		}
273 
274 		/*
275 		 * If we reach here and there are one or more WQEs which have
276 		 * been successfully chained together, then we need to link
277 		 * the current chain to the previously executing chain of
278 		 * descriptors (if there is one) and ring the doorbell for the
279 		 * send work queue.
280 		 */
281 		if (currindx != 0) {
282 			/*
283 			 * Before we link the chain, we need to ensure that the
284 			 * "next" field on the last WQE is set to NULL (to
285 			 * indicate the end of the chain).  Note: Just as it
286 			 * did above, the format for the "next" fields in a
287 			 * given WQE depend on whether the WQE is MLX
288 			 * transport or not.
289 			 */
290 			if (qp->qp_is_special) {
291 				tavor_wqe_mlx_linknext(&wr[chainbegin +
292 				    currindx - 1], NULL, 0, prev, NULL, qp);
293 			} else {
294 				tavor_wqe_send_linknext(NULL,
295 				    &wr[chainbegin + currindx - 1], NULL, 0,
296 				    prev, NULL, qp);
297 			}
298 
299 			/* Save away updated "tail index" for the DMA sync */
300 			sync_to = tail;
301 
302 			/* Do a DMA sync for current send WQE(s) */
303 			tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_SEND,
304 			    DDI_DMA_SYNC_FORDEV);
305 
306 			/*
307 			 * Now link the chain to the old chain (if there was
308 			 * one).  Note: we still need to pay attention to whether
309 			 * the QP used MLX transport WQEs or not.
310 			 */
311 			if (qp->qp_is_special) {
312 				tavor_wqe_mlx_linknext(NULL, first, first_sz,
313 				    qp->qp_sq_lastwqeaddr, &dbinfo, qp);
314 			} else {
315 				tavor_wqe_send_linknext(&wr[chainbegin], NULL,
316 				    first, first_sz, qp->qp_sq_lastwqeaddr,
317 				    &dbinfo, qp);
318 			}
319 
320 			/*
321 			 * If there was a valid previous WQE (i.e. non-NULL),
322 			 * then sync it too.  This is because we have updated
323 			 * its "next" fields and we want to ensure that the
324 			 * hardware can see the changes.
325 			 */
326 			if (qp->qp_sq_lastwqeaddr != NULL) {
327 				sync_to   = sync_from;
328 				sync_from = (sync_from - 1) & qsize_msk;
329 				tavor_wqe_sync(qp, sync_from, sync_to,
330 				    TAVOR_WR_SEND, DDI_DMA_SYNC_FORDEV);
331 			}
332 
333 			/*
334 			 * Now if the WRID tail entry is non-NULL, then this
335 			 * represents the entry to which we are chaining the
336 			 * new entries.  Since we are going to ring the
337 			 * doorbell for this WQE, we want to set its "dbd" bit.
338 			 *
339 			 * On the other hand, if the tail is NULL, even though
340 			 * we will have rung the doorbell for the previous WQE
341 			 * (for the hardware's sake) it is irrelevant to our
342 			 * purposes (for tracking WRIDs) because we know the
343 			 * request must have already completed.
344 			 */
345 			wre_last = wridlist->wl_wre_old_tail;
346 			if (wre_last != NULL) {
347 				wre_last->wr_signaled_dbd |=
348 				    TAVOR_WRID_ENTRY_DOORBELLED;
349 			}
350 
351 			/* Update some of the state in the QP */
352 			qp->qp_sq_lastwqeaddr	 = desc;
353 			qp->qp_sq_wqhdr->wq_tail = tail;
354 
355 			/* Ring the doorbell */
356 			tavor_qp_send_doorbell(state,
357 			    (uint32_t)((uintptr_t)first - qp->qp_desc_off),
358 			    first_sz, qp->qp_qpnum, dbinfo.db_fence,
359 			    dbinfo.db_nopcode);
360 		}
361 	}
362 
363 	/*
364 	 * Update the "num_posted" return value (if necessary).  Then drop
365 	 * the locks and return success.
366 	 */
367 	if (num_posted != NULL) {
368 		*num_posted = posted_cnt;
369 	}
370 
371 	mutex_exit(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
372 	mutex_exit(&qp->qp_lock);
373 
374 	return (status);
375 }
376 
377 
378 /*
379  * tavor_post_recv()
380  *    Context: Can be called from interrupt or base context.
381  */
382 int
383 tavor_post_recv(tavor_state_t *state, tavor_qphdl_t qp,
384     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
385 {
386 	uint64_t			*desc, *prev, *first;
387 	uint32_t			desc_sz, first_sz;
388 	uint32_t			wqeaddrsz, signaled_dbd;
389 	uint32_t			head, tail, next_tail, qsize_msk;
390 	uint32_t			sync_from, sync_to;
391 	uint_t				currindx, wrindx, numremain;
392 	uint_t				chainlen, posted_cnt;
393 	uint_t				maxdb = TAVOR_QP_MAXDESC_PER_DB;
394 	int				status;
395 
396 	/*
397 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
398 	 * clients to post to QP memory that is accessible directly by the
399 	 * user.  If the QP memory is user accessible, then return an error.
400 	 */
401 	if (qp->qp_is_umap) {
402 		return (IBT_QP_HDL_INVALID);
403 	}
404 
405 	/* Initialize posted_cnt */
406 	posted_cnt = 0;
407 
408 	mutex_enter(&qp->qp_lock);
409 
410 	/*
411 	 * Check if QP is associated with an SRQ
412 	 */
413 	if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
414 		mutex_exit(&qp->qp_lock);
415 		return (IBT_SRQ_IN_USE);
416 	}
417 
418 	/*
419 	 * Check QP state.  Can not post Recv requests from the "Reset" state
420 	 */
421 	if (qp->qp_state == TAVOR_QP_RESET) {
422 		mutex_exit(&qp->qp_lock);
423 		return (IBT_QP_STATE_INVALID);
424 	}
425 
426 	/* Grab the lock for the WRID list */
427 	mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
428 
429 	/* Save away some initial QP state */
430 	qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
431 	tail	  = qp->qp_rq_wqhdr->wq_tail;
432 	head	  = qp->qp_rq_wqhdr->wq_head;
433 
434 	/*
435 	 * For each ibt_recv_wr_t in the wr[] list passed in, parse the
436 	 * request and build a Recv WQE.  Note:  Because we are potentially
437 	 * building a chain of WQEs, we want to link them all together.
438 	 * However, we do not want to link the first one to the previous
439 	 * WQE until the entire chain has been linked.  Then in the last
440 	 * step we ring the appropriate doorbell.  Note:  It is possible for
441 	 * more Work Requests to be posted than the HW will support at one
442 	 * shot.  If this happens, we need to be able to post and ring
443 	 * several chains here until the entire request is complete.
444 	 */
445 	wrindx = 0;
446 	numremain = num_wr;
447 	status	  = DDI_SUCCESS;
448 	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
449 		/*
450 		 * For the first WQE on a new chain we need "prev" to point
451 		 * to the current descriptor.  As we begin to process
452 		 * further, "prev" will be updated to point to the previous
453 		 * WQE on the current chain (see below).
454 		 */
455 		prev = TAVOR_QP_RQ_ENTRY(qp, tail);
456 
457 		/*
458 		 * Before we begin, save the current "tail index" for later
459 		 * DMA sync
460 		 */
461 		sync_from = tail;
462 
463 		/*
464 		 * Break the request up into chains that are less than or
465 		 * equal to the maximum number of WQEs that can be posted
466 		 * per doorbell ring
467 		 */
468 		chainlen = (numremain > maxdb) ? maxdb : numremain;
469 		numremain -= chainlen;
470 		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
471 			/*
472 			 * Check for "queue full" condition.  If the queue
473 			 * is already full, then no more WQEs can be posted.
474 			 * So break out, ring a doorbell (if necessary) and
475 			 * return an error
476 			 */
477 			if (qp->qp_rq_wqhdr->wq_full != 0) {
478 				status = IBT_QP_FULL;
479 				break;
480 			}
481 
482 			/*
483 			 * Increment the "tail index" and check for "queue
484 			 * full" condition.  If we detect that the current
485 			 * work request is going to fill the work queue, then
486 			 * we mark this condition and continue.
487 			 */
488 			next_tail = (tail + 1) & qsize_msk;
489 			if (next_tail == head) {
490 				qp->qp_rq_wqhdr->wq_full = 1;
491 			}
492 
493 			/*
494 			 * Get the address of the location where the next
495 			 * Recv WQE should be built
496 			 */
497 			desc = TAVOR_QP_RQ_ENTRY(qp, tail);
498 
499 			/*
500 			 * Call tavor_wqe_recv_build() to build the WQE
501 			 * at the given address.  This routine uses the
502 			 * information in the ibt_recv_wr_t list (wr[]) and
503 			 * returns the size of the WQE when it returns.
504 			 */
505 			status = tavor_wqe_recv_build(state, qp, &wr[wrindx],
506 			    desc, &desc_sz);
507 			if (status != DDI_SUCCESS) {
508 				break;
509 			}
510 
511 			/*
512 			 * Add a WRID entry to the WRID list.  Need to
513 			 * calculate the "wqeaddrsz" and "signaled_dbd"
514 			 * values to pass to tavor_wrid_add_entry().  Note:
515 			 * all Recv WQEs are essentially "signaled" and
516 			 * "doorbelled" (since Tavor HW requires all
517 			 * RecvWQE's to have their "DBD" bits set).
518 			 */
519 			wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
520 			    ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
521 			    desc_sz);
522 			signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED |
523 			    TAVOR_WRID_ENTRY_DOORBELLED;
524 			tavor_wrid_add_entry(qp->qp_rq_wqhdr,
525 			    wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);
526 
527 			/*
528 			 * If this is not the first descriptor on the current
529 			 * chain, then link it to the previous WQE.  Otherwise,
530 			 * save the address and size of this descriptor (in
531 			 * "first" and "first_sz" respectively) and continue.
532 			 */
533 			if (currindx != 0) {
534 				tavor_wqe_recv_linknext(desc, desc_sz, prev,
535 				    qp);
536 				prev = desc;
537 			} else {
538 				first	 = desc;
539 				first_sz = desc_sz;
540 			}
541 
542 			/*
543 			 * Update the current "tail index" and increment
544 			 * "posted_cnt"
545 			 */
546 			tail = next_tail;
547 			posted_cnt++;
548 		}
549 
550 		/*
551 		 * If we reach here and there are one or more WQEs which have
552 		 * been successfully chained together, then we need to link
553 		 * the current chain to the previously executing chain of
554 		 * descriptors (if there is one) and ring the doorbell for the
555 		 * recv work queue.
556 		 */
557 		if (currindx != 0) {
558 			/*
559 			 * Before we link the chain, we need to ensure that the
560 			 * "next" field on the last WQE is set to NULL (to
561 			 * indicate the end of the chain).
562 			 */
563 			tavor_wqe_recv_linknext(NULL, 0, prev, qp);
564 
565 			/* Save away updated "tail index" for the DMA sync */
566 			sync_to = tail;
567 
568 			/* Do a DMA sync for current recv WQE(s) */
569 			tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_RECV,
570 			    DDI_DMA_SYNC_FORDEV);
571 
572 			/*
573 			 * Now link the chain to the old chain (if there was
574 			 * one).
575 			 */
576 			tavor_wqe_recv_linknext(first, first_sz,
577 			    qp->qp_rq_lastwqeaddr, qp);
578 
579 			/*
580 			 * If there was a valid previous WQE (i.e. non-NULL),
581 			 * then sync it too.  This is because we have updated
582 			 * its "next" fields and we want to ensure that the
583 			 * hardware can see the changes.
584 			 */
585 			if (qp->qp_rq_lastwqeaddr != NULL) {
586 				sync_to	  = sync_from;
587 				sync_from = (sync_from - 1) & qsize_msk;
588 				tavor_wqe_sync(qp, sync_from, sync_to,
589 				    TAVOR_WR_RECV, DDI_DMA_SYNC_FORDEV);
590 			}
591 
592 			/* Update some of the state in the QP */
593 			qp->qp_rq_lastwqeaddr	 = desc;
594 			qp->qp_rq_wqhdr->wq_tail = tail;
595 
596 			/* Ring the doorbell */
597 			tavor_qp_recv_doorbell(state,
598 			    (uint32_t)((uintptr_t)first - qp->qp_desc_off),
599 			    first_sz, qp->qp_qpnum, (chainlen % maxdb));
600 		}
601 	}
602 
603 	/*
604 	 * Update the "num_posted" return value (if necessary).  Then drop
605 	 * the locks and return success.
606 	 */
607 	if (num_posted != NULL) {
608 		*num_posted = posted_cnt;
609 	}
610 
611 	mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
612 	mutex_exit(&qp->qp_lock);
613 
614 	return (status);
615 }
616 
617 /*
618  * tavor_post_srq()
619  *    Context: Can be called from interrupt or base context.
620  */
621 int
622 tavor_post_srq(tavor_state_t *state, tavor_srqhdl_t srq,
623     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
624 {
625 	uint64_t			*desc, *prev, *first, *last_wqe_addr;
626 	uint32_t			signaled_dbd;
627 	uint32_t			sync_indx;
628 	uint_t				currindx, wrindx, numremain;
629 	uint_t				chainlen, posted_cnt;
630 	uint_t				maxdb = TAVOR_QP_MAXDESC_PER_DB;
631 	int				status;
632 
633 	/*
634 	 * Check for user-mappable SRQ memory.  Note:  We do not allow kernel
635 	 * clients to post to SRQ memory that is accessible directly by the
636 	 * user.  If the SRQ memory is user accessible, then return an error.
637 	 */
638 	if (srq->srq_is_umap) {
639 		return (IBT_SRQ_HDL_INVALID);
640 	}
641 
642 	/* Initialize posted_cnt */
643 	posted_cnt = 0;
644 
645 	mutex_enter(&srq->srq_lock);
646 
647 	/*
648 	 * Check SRQ state.  Can not post Recv requests when SRQ is in error
649 	 */
650 	if (srq->srq_state == TAVOR_SRQ_STATE_ERROR) {
651 		mutex_exit(&srq->srq_lock);
652 		return (IBT_QP_STATE_INVALID);
653 	}
654 
655 	/* Grab the lock for the WRID list */
656 	mutex_enter(&srq->srq_wrid_wql->wql_lock);
657 
658 	/*
659 	 * For each ibt_recv_wr_t in the wr[] list passed in, parse the
660 	 * request and build a Recv WQE.  Note:  Because we are potentially
661 	 * building a chain of WQEs, we want to link them all together.
662 	 * However, we do not want to link the first one to the previous
663 	 * WQE until the entire chain has been linked.  Then in the last
664 	 * step we ring the appropriate doorbell.  Note:  It is possible for
665 	 * more Work Requests to be posted than the HW will support at one
666 	 * shot.  If this happens, we need to be able to post and ring
667 	 * several chains here until the entire request is complete.
668 	 */
669 	wrindx = 0;
670 	numremain = num_wr;
671 	status	  = DDI_SUCCESS;
672 	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
673 		/*
674 		 * For the first WQE on a new chain we need "prev" to point
675 		 * to the current descriptor.  As we begin to process
676 		 * further, "prev" will be updated to point to the previous
677 		 * WQE on the current chain (see below).
678 		 */
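		/*
		 * Note: an "srq_wq_lastwqeindx" of -1 is the sentinel value
		 * indicating that there is no previously posted WQE to chain
		 * from (e.g. nothing has yet been posted to this SRQ).
		 */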
679 		if (srq->srq_wq_lastwqeindx == -1) {
680 			prev = NULL;
681 		} else {
682 			prev = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wq_lastwqeindx);
683 		}
684 
685 		/*
686 		 * Break the request up into chains that are less than or
687 		 * equal to the maximum number of WQEs that can be posted
688 		 * per doorbell ring
689 		 */
690 		chainlen = (numremain > maxdb) ? maxdb : numremain;
691 		numremain -= chainlen;
692 		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
693 
694 			/*
695 			 * Check for "queue full" condition.  If the queue
696 			 * is already full, then no more WQEs can be posted.
697 			 * So break out, ring a doorbell (if necessary) and
698 			 * return an error
699 			 */
700 			if (srq->srq_wridlist->wl_free_list_indx == -1) {
701 				status = IBT_QP_FULL;
702 				break;
703 			}
704 
705 			/*
706 			 * Get the address of the location where the next
707 			 * Recv WQE should be built
708 			 */
709 			desc = TAVOR_SRQ_WQE_ADDR(srq,
710 			    srq->srq_wridlist->wl_free_list_indx);
711 
712 			/*
713 			 * Add a WRID entry to the WRID list.  Need to
714 			 * set the "signaled_dbd" values to pass to
715 			 * tavor_wrid_add_entry().  Note: all Recv WQEs are
716 			 * essentially "signaled"
717 			 *
718 			 * The 'size' is stored at srq_alloc time, in the
719 			 * srq_wq_stride.  This is a constant value required
720 			 * for SRQ.
721 			 */
722 			signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
723 			tavor_wrid_add_entry_srq(srq, wr[wrindx].wr_id,
724 			    signaled_dbd);
725 
726 			/*
727 			 * Call tavor_wqe_srq_build() to build the WQE
728 			 * at the given address.  This routine uses the
729 			 * information in the ibt_recv_wr_t list (wr[]) and
730 			 * returns the size of the WQE when it returns.
731 			 */
732 			status = tavor_wqe_srq_build(state, srq, &wr[wrindx],
733 			    desc);
734 			if (status != DDI_SUCCESS) {
735 				break;
736 			}
737 
738 			/*
739 			 * If this is not the first descriptor on the current
740 			 * chain, then link it to the previous WQE.  Otherwise,
741 			 * save the address of this descriptor (in "first") and
742 			 * continue.
743 			 */
744 			if (currindx != 0) {
745 				tavor_wqe_srq_linknext(desc, prev, srq);
746 				sync_indx = TAVOR_SRQ_WQE_INDEX(
747 				    srq->srq_wq_buf, prev,
748 				    srq->srq_wq_log_wqesz);
749 
750 				/* Do a DMA sync for previous recv WQE */
751 				tavor_wqe_sync(srq, sync_indx, sync_indx+1,
752 				    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
753 
754 				prev = desc;
755 			} else {
756 
757 				/*
758 				 * In this case, the last WQE on the chain is
759 				 * also considered 'first'.  So set prev to
760 				 * first, here.
761 				 */
762 				first = prev = desc;
763 			}
764 
765 			/*
766 			 * Increment "posted_cnt"
767 			 */
768 			posted_cnt++;
769 		}
770 
771 		/*
772 		 * If we reach here and there are one or more WQEs which have
773 		 * been successfully chained together, then we need to link
774 		 * the current chain to the previously executing chain of
775 		 * descriptors (if there is one) and ring the doorbell for the
776 		 * recv work queue.
777 		 */
778 		if (currindx != 0) {
779 			/*
780 			 * Before we link the chain, we need to ensure that the
781 			 * "next" field on the last WQE is set to NULL (to
782 			 * indicate the end of the chain).
783 			 */
784 			tavor_wqe_srq_linknext(NULL, prev, srq);
785 
786 			sync_indx = TAVOR_SRQ_WQE_INDEX(srq->srq_wq_buf, prev,
787 			    srq->srq_wq_log_wqesz);
788 
789 			/* Do a DMA sync for current recv WQE */
790 			tavor_wqe_sync(srq, sync_indx, sync_indx+1,
791 			    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
792 
793 			/*
794 			 * Now link the chain to the old chain (if there was
795 			 * one).
796 			 */
797 			if (srq->srq_wq_lastwqeindx == -1) {
798 				last_wqe_addr = NULL;
799 			} else {
800 				last_wqe_addr = TAVOR_SRQ_WQE_ADDR(srq,
801 				    srq->srq_wq_lastwqeindx);
802 			}
803 			tavor_wqe_srq_linknext(first, last_wqe_addr, srq);
804 
805 			/*
806 			 * If there was a valid previous WQE (i.e. valid index),
807 			 * then sync it too.  This is because we have updated
808 			 * its "next" fields and we want to ensure that the
809 			 * hardware can see the changes.
810 			 */
811 			if (srq->srq_wq_lastwqeindx != -1) {
812 				sync_indx = srq->srq_wq_lastwqeindx;
813 				tavor_wqe_sync(srq, sync_indx, sync_indx+1,
814 				    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
815 			}
816 
817 			/* Update some of the state in the SRQ */
818 			srq->srq_wq_lastwqeindx = TAVOR_SRQ_WQE_INDEX(
819 			    srq->srq_wq_buf, desc,
820 			    srq->srq_wq_log_wqesz);
821 
822 			/* Ring the doorbell */
823 			/* SRQ needs NDS of 0 */
824 			tavor_qp_recv_doorbell(state,
825 			    (uint32_t)((uintptr_t)first - srq->srq_desc_off),
826 			    0, srq->srq_srqnum, (chainlen % maxdb));
827 		}
828 	}
829 
830 	/*
831 	 * Update the "num_posted" return value (if necessary).  Then drop
832 	 * the locks and return success.
833 	 */
834 	if (num_posted != NULL) {
835 		*num_posted = posted_cnt;
836 	}
837 
838 	mutex_exit(&srq->srq_wrid_wql->wql_lock);
839 	mutex_exit(&srq->srq_lock);
840 
841 	return (status);
842 }
843 
844 
845 /*
846  * tavor_qp_send_doorbell()
847  *    Context: Can be called from interrupt or base context.
848  */
849 static void
850 tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
851     uint32_t qpn, uint32_t fence, uint32_t nopcode)
852 {
853 	uint64_t	doorbell = 0;
854 
855 	/* Build the doorbell from the parameters */
856 	doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
857 	    TAVOR_QPSNDDB_NDA_SHIFT) |
858 	    ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
859 	    ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
860 	    ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds;
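	/*
	 * Note: "nda" and "nds" are the queue-relative address and size of
	 * the first WQE in the newly linked chain (see tavor_post_send()
	 * above); the exact bit positions of each field are given by the
	 * TAVOR_QPSNDDB_* shifts and masks used here.
	 */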
861 
862 	/* Write the doorbell to UAR */
863 	TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->send,
864 	    doorbell);
865 }
866 
867 
868 /*
869  * tavor_qp_recv_doorbell()
870  *    Context: Can be called from interrupt or base context.
871  */
872 static void
873 tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
874     uint32_t qpn, uint32_t credits)
875 {
876 	uint64_t	doorbell = 0;
877 
878 	/* Build the doorbell from the parameters */
879 	doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<
880 	    TAVOR_QPRCVDB_NDA_SHIFT) |
881 	    ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |
882 	    ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits;
883 
884 	/* Write the doorbell to UAR */
885 	TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->recv,
886 	    doorbell);
887 }
888 
889 
890 /*
891  * tavor_wqe_send_build()
892  *    Context: Can be called from interrupt or base context.
893  */
894 static int
895 tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
896     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
897 {
898 	tavor_hw_snd_wqe_ud_t		*ud;
899 	tavor_hw_snd_wqe_remaddr_t	*rc;
900 	tavor_hw_snd_wqe_atomic_t	*at;
901 	tavor_hw_snd_wqe_remaddr_t	*uc;
902 	tavor_hw_snd_wqe_bind_t		*bn;
903 	tavor_hw_wqe_sgl_t		*ds;
904 	ibt_wr_ds_t			*sgl;
905 	tavor_ahhdl_t			ah;
906 	uint32_t			nds;
907 	int				i, num_ds, status;
908 
909 	ASSERT(MUTEX_HELD(&qp->qp_lock));
910 
911 	/* Initialize the information for the Data Segments */
912 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
913 	    sizeof (tavor_hw_snd_wqe_nextctrl_t));
914 	nds = wr->wr_nds;
915 	sgl = wr->wr_sgl;
916 	num_ds = 0;
917 
918 	/*
919 	 * Building a Send WQE depends first and foremost on the transport
920 	 * type of Work Request (i.e. UD, RC, or UC)
921 	 */
922 	switch (wr->wr_trans) {
923 	case IBT_UD_SRV:
924 		/* Ensure that work request transport type matches QP type */
925 		if (qp->qp_serv_type != TAVOR_QP_UD) {
926 			return (IBT_QP_SRV_TYPE_INVALID);
927 		}
928 
929 		/*
930 		 * Validate the operation type.  For UD requests, only the
931 		 * "Send" operation is valid
932 		 */
933 		if (wr->wr_opcode != IBT_WRC_SEND) {
934 			return (IBT_QP_OP_TYPE_INVALID);
935 		}
936 
937 		/*
938 		 * If this is a Special QP (QP0 or QP1), then we need to
939 		 * build MLX WQEs instead.  So jump to tavor_wqe_mlx_build()
940 		 * and return whatever status it returns
941 		 */
942 		if (qp->qp_is_special) {
943 			status = tavor_wqe_mlx_build(state, qp, wr, desc, size);
944 			return (status);
945 		}
946 
947 		/*
948 		 * Otherwise, if this is a normal UD Send request, then fill
949 		 * all the fields in the Tavor UD header for the WQE.  Note:
950 		 * to do this we'll need to extract some information from the
951 		 * Address Handle passed with the work request.
952 		 */
953 		ud = (tavor_hw_snd_wqe_ud_t *)((uintptr_t)desc +
954 		    sizeof (tavor_hw_snd_wqe_nextctrl_t));
955 		ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
956 		if (ah == NULL) {
957 			return (IBT_AH_HDL_INVALID);
958 		}
959 
960 		/*
961 		 * Build the Unreliable Datagram Segment for the WQE, using
962 		 * the information from the address handle and the work
963 		 * request.
964 		 */
965 		mutex_enter(&ah->ah_lock);
966 		TAVOR_WQE_BUILD_UD(qp, ud, ah, wr);
967 		mutex_exit(&ah->ah_lock);
968 
969 		/* Update "ds" for filling in Data Segments (below) */
970 		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)ud +
971 		    sizeof (tavor_hw_snd_wqe_ud_t));
972 		break;
973 
974 	case IBT_RC_SRV:
975 		/* Ensure that work request transport type matches QP type */
976 		if (qp->qp_serv_type != TAVOR_QP_RC) {
977 			return (IBT_QP_SRV_TYPE_INVALID);
978 		}
979 
980 		/*
981 		 * Validate the operation type.  For RC requests, we allow
982 		 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
983 		 * operations, and memory window "Bind"
984 		 */
985 		if ((wr->wr_opcode != IBT_WRC_SEND) &&
986 		    (wr->wr_opcode != IBT_WRC_RDMAR) &&
987 		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
988 		    (wr->wr_opcode != IBT_WRC_CSWAP) &&
989 		    (wr->wr_opcode != IBT_WRC_FADD) &&
990 		    (wr->wr_opcode != IBT_WRC_BIND)) {
991 			return (IBT_QP_OP_TYPE_INVALID);
992 		}
993 
994 		/*
995 		 * If this is a Send request, then all we need to do is break
996 		 * out here and begin the Data Segment processing below
997 		 */
998 		if (wr->wr_opcode == IBT_WRC_SEND) {
999 			break;
1000 		}
1001 
1002 		/*
1003 		 * If this is an RDMA Read or RDMA Write request, then fill
1004 		 * in the "Remote Address" header fields.
1005 		 */
1006 		if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
1007 		    (wr->wr_opcode == IBT_WRC_RDMAW)) {
1008 			rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1009 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1010 
1011 			/*
1012 			 * Build the Remote Address Segment for the WQE, using
1013 			 * the information from the RC work request.
1014 			 */
1015 			TAVOR_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
1016 
1017 			/* Update "ds" for filling in Data Segments (below) */
1018 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
1019 			    sizeof (tavor_hw_snd_wqe_remaddr_t));
1020 			break;
1021 		}
1022 
1023 		/*
1024 		 * If this is one of the Atomic type operations (i.e
1025 		 * Compare-Swap or Fetch-Add), then fill in both the "Remote
1026 		 * Address" header fields and the "Atomic" header fields.
1027 		 */
1028 		if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
1029 		    (wr->wr_opcode == IBT_WRC_FADD)) {
1030 			rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1031 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1032 			at = (tavor_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
1033 			    sizeof (tavor_hw_snd_wqe_remaddr_t));
1034 
1035 			/*
1036 			 * Build the Remote Address and Atomic Segments for
1037 			 * the WQE, using the information from the RC Atomic
1038 			 * work request.
1039 			 */
1040 			TAVOR_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
1041 			TAVOR_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
1042 
1043 			/* Update "ds" for filling in Data Segments (below) */
1044 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)at +
1045 			    sizeof (tavor_hw_snd_wqe_atomic_t));
1046 
1047 			/*
1048 			 * Update "nds" and "sgl" because Atomic requests have
1049 			 * only a single Data Segment (and they are encoded
1050 			 * somewhat differently in the work request).
1051 			 */
1052 			nds = 1;
1053 			sgl = wr->wr_sgl;
1054 			break;
1055 		}
1056 
1057 		/*
1058 		 * If this is memory window Bind operation, then we call the
1059 		 * tavor_wr_bind_check() routine to validate the request and
1060 		 * to generate the updated RKey.  If this is successful, then
1061 		 * we fill in the WQE's "Bind" header fields.
1062 		 */
1063 		if (wr->wr_opcode == IBT_WRC_BIND) {
1064 			status = tavor_wr_bind_check(state, wr);
1065 			if (status != DDI_SUCCESS) {
1066 				return (status);
1067 			}
1068 
1069 			bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1070 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1071 
1072 			/*
1073 			 * Build the Bind Memory Window Segments for the WQE,
1074 			 * using the information from the RC Bind memory
1075 			 * window work request.
1076 			 */
1077 			TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
1078 
1079 			/*
1080 			 * Update the "ds" pointer.  Even though the "bind"
1081 			 * operation requires no SGLs, this is necessary to
1082 			 * facilitate the correct descriptor size calculations
1083 			 * (below).
1084 			 */
1085 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1086 			    sizeof (tavor_hw_snd_wqe_bind_t));
1087 			nds = 0;
1088 		}
1089 		break;
1090 
1091 	case IBT_UC_SRV:
1092 		/* Ensure that work request transport type matches QP type */
1093 		if (qp->qp_serv_type != TAVOR_QP_UC) {
1094 			return (IBT_QP_SRV_TYPE_INVALID);
1095 		}
1096 
1097 		/*
1098 		 * Validate the operation type.  For UC requests, we only
1099 		 * allow "Send", "RDMA Write", and memory window "Bind".
1100 		 * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
1101 		 * operations
1102 		 */
1103 		if ((wr->wr_opcode != IBT_WRC_SEND) &&
1104 		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
1105 		    (wr->wr_opcode != IBT_WRC_BIND)) {
1106 			return (IBT_QP_OP_TYPE_INVALID);
1107 		}
1108 
1109 		/*
1110 		 * If this is a Send request, then all we need to do is break
1111 		 * out here and begin the Data Segment processing below
1112 		 */
1113 		if (wr->wr_opcode == IBT_WRC_SEND) {
1114 			break;
1115 		}
1116 
1117 		/*
1118 		 * If this is an RDMA Write request, then fill in the "Remote
1119 		 * Address" header fields.
1120 		 */
1121 		if (wr->wr_opcode == IBT_WRC_RDMAW) {
1122 			uc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1123 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1124 
1125 			/*
1126 			 * Build the Remote Address Segment for the WQE, using
1127 			 * the information from the UC work request.
1128 			 */
1129 			TAVOR_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);
1130 
1131 			/* Update "ds" for filling in Data Segments (below) */
1132 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)uc +
1133 			    sizeof (tavor_hw_snd_wqe_remaddr_t));
1134 			break;
1135 		}
1136 
1137 		/*
1138 		 * If this is memory window Bind operation, then we call the
1139 		 * tavor_wr_bind_check() routine to validate the request and
1140 		 * to generate the updated RKey.  If this is successful, then
1141 		 * we fill in the WQE's "Bind" header fields.
1142 		 */
1143 		if (wr->wr_opcode == IBT_WRC_BIND) {
1144 			status = tavor_wr_bind_check(state, wr);
1145 			if (status != DDI_SUCCESS) {
1146 				return (status);
1147 			}
1148 
1149 			bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1150 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1151 
1152 			/*
1153 			 * Build the Bind Memory Window Segments for the WQE,
1154 			 * using the information from the UC Bind memory
1155 			 * window work request.
1156 			 */
1157 			TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);
1158 
1159 			/*
1160 			 * Update the "ds" pointer.  Even though the "bind"
1161 			 * operation requires no SGLs, this is necessary to
1162 			 * facilitate the correct descriptor size calculations
1163 			 * (below).
1164 			 */
1165 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1166 			    sizeof (tavor_hw_snd_wqe_bind_t));
1167 			nds = 0;
1168 		}
1169 		break;
1170 
1171 	default:
1172 		return (IBT_QP_SRV_TYPE_INVALID);
1173 	}
1174 
1175 	/*
1176 	 * Now fill in the Data Segments (SGL) for the Send WQE based on
1177 	 * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
1178 	 * Start by checking for a valid number of SGL entries.
1179 	 */
1180 	if (nds > qp->qp_sq_sgl) {
1181 		return (IBT_QP_SGL_LEN_INVALID);
1182 	}
1183 
1184 	/*
1185 	 * For each SGL in the Send Work Request, fill in the Send WQE's data
1186 	 * segments.  Note: We skip any SGL with zero size because Tavor
1187 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1188 	 * the encoding for zero means a 2GB transfer.  Because of this special
1189 	 * encoding in the hardware, we mask the requested length with
1190 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1191 	 * zero.)
1192 	 */
1193 	for (i = 0; i < nds; i++) {
1194 		if (sgl[i].ds_len == 0) {
1195 			continue;
1196 		}
1197 
1198 		/*
1199 		 * Fill in the Data Segment(s) for the current WQE, using the
1200 		 * information contained in the scatter-gather list of the
1201 		 * work request.
1202 		 */
1203 		TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1204 		num_ds++;
1205 	}
1206 
1207 	/* Return the size of descriptor (in 16-byte chunks) */
1208 	*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 4;
1209 
1210 	return (DDI_SUCCESS);
1211 }
1212 
1213 
1214 /*
1215  * tavor_wqe_send_linknext()
1216  *    Context: Can be called from interrupt or base context.
1217  */
1218 static void
1219 tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, ibt_send_wr_t *prev_wr,
1220     uint64_t *curr_desc, uint_t curr_descsz, uint64_t *prev_desc,
1221     tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp)
1222 {
1223 	uint64_t	next, ctrl;
1224 	uint32_t	nopcode, fence;
1225 
1226 	/*
1227 	 * Calculate the "next" field of the descriptor.  This amounts to
1228 	 * setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
1229 	 * fields (see tavor_hw.h for more).  Note:  If there is no next
1230 	 * descriptor (i.e. if the current descriptor is the last WQE on
1231 	 * the chain), then set "next" to zero.
1232 	 */
1233 	if (curr_desc != NULL) {
1234 		/*
1235 		 * Determine the value for the Tavor WQE "nopcode" field
1236 		 * by using the IBTF opcode from the work request
1237 		 */
1238 		switch (curr_wr->wr_opcode) {
1239 		case IBT_WRC_RDMAW:
1240 			if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1241 				nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAWI;
1242 			} else {
1243 				nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
1244 			}
1245 			break;
1246 
1247 		case IBT_WRC_SEND:
1248 			if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1249 				nopcode = TAVOR_WQE_SEND_NOPCODE_SENDI;
1250 			} else {
1251 				nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1252 			}
1253 			break;
1254 
1255 		case IBT_WRC_RDMAR:
1256 			nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
1257 			break;
1258 
1259 		case IBT_WRC_CSWAP:
1260 			nopcode = TAVOR_WQE_SEND_NOPCODE_ATMCS;
1261 			break;
1262 
1263 		case IBT_WRC_FADD:
1264 			nopcode = TAVOR_WQE_SEND_NOPCODE_ATMFA;
1265 			break;
1266 
1267 		case IBT_WRC_BIND:
1268 			nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
1269 			break;
1270 		}
1271 
1272 		curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc
1273 		    - qp->qp_desc_off);
1274 		next  = ((uint64_t)(uintptr_t)curr_desc &
1275 		    TAVOR_WQE_NDA_MASK) << 32;
1276 		next  = next | ((uint64_t)nopcode << 32);
1277 		fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
1278 		if (fence) {
1279 			next = next | TAVOR_WQE_SEND_FENCE_MASK;
1280 		}
1281 		next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1282 
1283 		/*
1284 		 * If a send queue doorbell will be rung for the next
1285 		 * WQE on the chain, then set the current WQE's "dbd" bit.
1286 		 * Note: We also update the "dbinfo" structure here to pass
1287 		 * back information about what should (later) be included
1288 		 * in the send queue doorbell.
1289 		 */
1290 		if (dbinfo) {
1291 			next = next | TAVOR_WQE_DBD_MASK;
1292 			dbinfo->db_nopcode = nopcode;
1293 			dbinfo->db_fence   = fence;
1294 		}
1295 	} else {
1296 		next = 0;
1297 	}
1298 
1299 	/*
1300 	 * If this WQE is supposed to be linked to the previous descriptor,
1301 	 * then we need to update not only the previous WQE's "next" fields
1302 	 * but we must also update this WQE's "ctrl" fields (i.e. the "c", "e",
1303 	 * "s", "i" and "immediate" fields - see tavor_hw.h for more).  Note:
1304 	 * the "e" bit is always hardcoded to zero.
1305 	 */
1306 	if (prev_desc != NULL) {
1307 		/*
1308 		 * If a send queue doorbell will be rung for the next WQE on
1309 		 * the chain, then update the current WQE's "next" field and
1310 		 * return.
1311 		 * Note: We don't want to modify the "ctrl" field here because
1312 		 * that portion of the previous WQE has already been set
1313 		 * correctly at some previous point in time.
1314 		 */
1315 		if (dbinfo) {
1316 			TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1317 			return;
1318 		}
1319 
1320 		ctrl = 0;
1321 
1322 		/* Set the "c" (i.e. "signaled") bit appropriately */
1323 		if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1324 			ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK;
1325 		}
1326 
1327 		/* Set the "s" (i.e. "solicited") bit appropriately */
1328 		if (prev_wr->wr_flags & IBT_WR_SEND_SOLICIT) {
1329 			ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK;
1330 		}
1331 
1332 		/* Set the "i" bit and the immediate data appropriately */
1333 		if (prev_wr->wr_flags & IBT_WR_SEND_IMMED) {
1334 			ctrl = ctrl | TAVOR_WQE_SEND_IMMEDIATE_MASK;
1335 			ctrl = ctrl | tavor_wr_get_immediate(prev_wr);
1336 		}
1337 
1338 		TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1339 	}
1340 }
1341 
1342 
1343 /*
1344  * tavor_wqe_mlx_build()
1345  *    Context: Can be called from interrupt or base context.
1346  */
1347 static int
1348 tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
1349     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1350 {
1351 	tavor_hw_udav_t		udav;
1352 	tavor_ahhdl_t		ah;
1353 	ib_lrh_hdr_t		*lrh;
1354 	ib_grh_t		*grh;
1355 	ib_bth_hdr_t		*bth;
1356 	ib_deth_hdr_t		*deth;
1357 	tavor_hw_wqe_sgl_t	*ds;
1358 	ibt_wr_ds_t		*sgl;
1359 	uint8_t			*mgmtclass, *hpoint, *hcount;
1360 	uint64_t		data;
1361 	uint32_t		nds, offset, pktlen;
1362 	uint32_t		desc_sz, udav_sz;
1363 	int			i, num_ds;
1364 
1365 	ASSERT(MUTEX_HELD(&qp->qp_lock));
1366 
1367 	/* Initialize the information for the Data Segments */
1368 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1369 	    sizeof (tavor_hw_mlx_wqe_nextctrl_t));
1370 
1371 	/*
1372 	 * Pull the address handle from the work request and read in
1373 	 * the contents of the UDAV.  This will be used to answer some
1374 	 * questions about the request.
1375 	 */
1376 	ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1377 	if (ah == NULL) {
1378 		return (IBT_AH_HDL_INVALID);
1379 	}
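	/*
	 * Note: the UDAV contents are copied out 64 bits at a time through
	 * the UDAV resource's DDI access handle, and the copy is done while
	 * holding the AH lock so that the address handle cannot change
	 * underneath us.
	 */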
1380 	mutex_enter(&ah->ah_lock);
1381 	udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1382 	for (i = 0; i < udav_sz; i++) {
1383 		data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1384 		    ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1385 		((uint64_t *)&udav)[i] = data;
1386 	}
1387 	mutex_exit(&ah->ah_lock);
1388 
1389 	/*
1390 	 * If the request is for QP1 and the destination LID is equal to
1391 	 * the Permissive LID, then return an error.  This combination is
1392 	 * not allowed
1393 	 */
1394 	if ((udav.rlid == IB_LID_PERMISSIVE) &&
1395 	    (qp->qp_is_special == TAVOR_QP_GSI)) {
1396 		return (IBT_AH_HDL_INVALID);
1397 	}
1398 
1399 	/*
1400 	 * Calculate the size of the packet headers, including the GRH
1401 	 * (if necessary)
1402 	 */
1403 	desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
1404 	    sizeof (ib_deth_hdr_t);
1405 	if (udav.grh) {
1406 		desc_sz += sizeof (ib_grh_t);
1407 	}
1408 
1409 	/*
1410 	 * Begin to build the first "inline" data segment for the packet
1411 	 * headers.  Note:  By specifying "inline" we can build the contents
1412 	 * of the MAD packet headers directly into the work queue (as part
1413 	 * of the MAD packet headers directly into the work queue (as part of
1414 	 * the descriptor).  This has the advantage of both speeding things up
1415 	 * memory for the packet headers.
1416 	 */
1417 	TAVOR_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
1418 	desc_sz += 4;
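	/*
	 * Note: the additional 4 bytes added to "desc_sz" here appear to
	 * account for the 4-byte ICRC appended to the packet; together with
	 * the fixed 256-byte (0x100) MAD payload added in the "pktlen"
	 * calculation below, this yields the LRH packet length in 4-byte
	 * words.
	 */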
1419 
1420 	/*
1421 	 * Build Local Route Header (LRH)
1422 	 *    We start here by building the LRH into a temporary location.
1423 	 *    When we have finished we copy the LRH data into the descriptor.
1424 	 *
1425 	 *    Notice that the VL values are hardcoded.  This is not a problem
1426 	 *    because VL15 is decided later based on the value in the MLX
1427 	 *    transport "next/ctrl" header (see the "vl15" bit below), and it
1428 	 *    is otherwise (meaning for QP1) chosen from the SL-to-VL table
1429 	 *    values.  This rule does not hold for loopback packets however
1430 	 *    (all of which bypass the SL-to-VL tables) and it is the reason
1431 	 *    that non-QP0 MADs are set up with VL hardcoded to zero below.
1432 	 *
1433 	 *    Notice also that Source LID is hardcoded to the Permissive LID
1434 	 *    (0xFFFF).  This is also not a problem because if the Destination
1435 	 *    LID is not the Permissive LID, then the "slr" value in the MLX
1436 	 *    transport "next/ctrl" header will be set to zero and the hardware
1437 	 *    will pull the LID from the value in the port.
1438 	 */
1439 	lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
1440 	pktlen = (desc_sz + 0x100) >> 2;
1441 	TAVOR_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
1442 
1443 	/*
1444 	 * Build Global Route Header (GRH)
1445 	 *    This is only built if necessary as defined by the "grh" bit in
1446 	 *    the address vector.  Note:  We also calculate the offset to the
1447 	 *    next header (BTH) based on whether or not the "grh" bit is set.
1448 	 */
1449 	if (udav.grh) {
1450 		/*
1451 		 * If the request is for QP0, then return an error.  The
1452 		 * combination of global routing (GRH) and QP0 is not allowed.
1453 		 */
1454 		if (qp->qp_is_special == TAVOR_QP_SMI) {
1455 			return (IBT_AH_HDL_INVALID);
1456 		}
1457 		grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1458 		TAVOR_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);
1459 
1460 		bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
1461 	} else {
1462 		bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1463 	}
1464 
1465 
1466 	/*
1467 	 * Build Base Transport Header (BTH)
1468 	 *    Notice that the M, PadCnt, and TVer fields are all set
1469 	 *    to zero implicitly.  This is true for all Management Datagrams
1470 	 *    (MADs), whether GSI or SMI.
1471 	 */
1472 	TAVOR_WQE_BUILD_MLX_BTH(state, bth, qp, wr);
1473 
1474 	/*
1475 	 * Build Datagram Extended Transport Header (DETH)
1476 	 */
1477 	deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
1478 	TAVOR_WQE_BUILD_MLX_DETH(deth, qp);
1479 
1480 	/* Ensure that the Data Segment is aligned on a 16-byte boundary */
1481 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
1482 	ds = (tavor_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
1483 	nds = wr->wr_nds;
1484 	sgl = wr->wr_sgl;
1485 	num_ds = 0;
1486 
1487 	/*
1488 	 * Now fill in the Data Segments (SGL) for the MLX WQE based on the
1489 	 * values set up above (i.e. "sgl", "nds", and the "ds" pointer).
1490 	 * Start by checking for a valid number of SGL entries.
1491 	 */
1492 	if (nds > qp->qp_sq_sgl) {
1493 		return (IBT_QP_SGL_LEN_INVALID);
1494 	}
1495 
1496 	/*
1497 	 * For each SGL in the Send Work Request, fill in the MLX WQE's data
1498 	 * segments.  Note: We skip any SGL with zero size because Tavor
1499 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1500 	 * the encoding for zero means a 2GB transfer.  Because of this special
1501 	 * encoding in the hardware, we mask the requested length with
1502 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1503 	 * zero.)
1504 	 */
1505 	mgmtclass = hpoint = hcount = NULL;
1506 	offset = 0;
1507 	for (i = 0; i < nds; i++) {
1508 		if (sgl[i].ds_len == 0) {
1509 			continue;
1510 		}
1511 
1512 		/*
1513 		 * Fill in the Data Segment(s) for the MLX send WQE, using
1514 		 * the information contained in the scatter-gather list of
1515 		 * the work request.
1516 		 */
1517 		TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1518 
1519 		/*
1520 		 * Search through the contents of all MADs posted to QP0 to
1521 		 * initialize pointers to the places where Directed Route "hop
1522 		 * pointer", "hop count", and "mgmtclass" would be.  Tavor
1523 		 * needs these updated (i.e. incremented or decremented, as
1524 		 * necessary) by software.
1525 		 */
1526 		if (qp->qp_is_special == TAVOR_QP_SMI) {
1527 
1528 			TAVOR_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
1529 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1530 
1531 			TAVOR_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
1532 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1533 
1534 			TAVOR_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
1535 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1536 
1537 			offset += sgl[i].ds_len;
1538 		}
1539 		num_ds++;
1540 	}
1541 
1542 	/*
1543 	 * Tavor's Directed Route MADs need to have the "hop pointer"
1544 	 * incremented/decremented (as necessary) depending on whether it is
1545 	 * currently less than or greater than the "hop count" (i.e. whether
1546 	 * the MAD is a request or a response.)
1547 	 */
1548 	if (qp->qp_is_special == TAVOR_QP_SMI) {
1549 		TAVOR_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
1550 		    *hpoint, *hcount);
1551 	}
1552 
1553 	/*
1554 	 * Now fill in the ICRC Data Segment.  This data segment is inlined
1555 	 * just like the packet headers above, but it is only four bytes and is
1556 	 * set to zero (to indicate that the hardware should generate the ICRC).
1557 	 */
1558 	TAVOR_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
1559 	num_ds++;
1560 
1561 	/* Return the size of descriptor (in 16-byte chunks) */
1562 	*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1563 
1564 	return (DDI_SUCCESS);
1565 }
1566 
1567 
1568 /*
1569  * tavor_wqe_mlx_linknext()
1570  *    Context: Can be called from interrupt or base context.
1571  */
1572 static void
1573 tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
1574     uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
1575     tavor_qphdl_t qp)
1576 {
1577 	tavor_hw_udav_t		udav;
1578 	tavor_ahhdl_t		ah;
1579 	uint64_t		next, ctrl, data;
1580 	uint_t			nopcode;
1581 	uint_t			udav_sz;
1582 	int			i;
1583 
1584 	/*
1585 	 * Calculate the "next" field of the descriptor.  This amounts to
1586 	 * setting up the "next_wqe_addr", "nopcode", and "nds" fields (see
1587 	 * tavor_hw.h for more).  Note:  If there is no next descriptor (i.e.
1588 	 * if the current descriptor is the last WQE on the chain), then set
1589 	 * "next" to zero.
1590 	 */
1591 	if (curr_desc != NULL) {
1592 		/*
1593 		 * The only valid Tavor WQE "nopcode" for MLX transport
1594 		 * requests is the "Send" code.
1595 		 */
1596 		nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1597 		curr_desc = (uint64_t *)(uintptr_t)((uint64_t)
1598 		    (uintptr_t)curr_desc - qp->qp_desc_off);
1599 		next = (uint64_t)((uintptr_t)curr_desc &
1600 		    TAVOR_WQE_NDA_MASK) << 32;
1601 		next = next | ((uint64_t)nopcode << 32);
1602 		next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1603 
1604 		/*
1605 		 * If a send queue doorbell will be rung for the next
1606 		 * WQE on the chain, then set the current WQE's "dbd" bit.
1607 		 * Note: We also update the "dbinfo" structure here to pass
1608 		 * back information about what should (later) be included
1609 		 * in the send queue doorbell.
1610 		 */
1611 		if (dbinfo) {
1612 			next = next | TAVOR_WQE_DBD_MASK;
1613 			dbinfo->db_nopcode = nopcode;
1614 			dbinfo->db_fence   = 0;
1615 		}
1616 	} else {
1617 		next = 0;
1618 	}
1619 
1620 	/*
1621 	 * If this WQE is supposed to be linked to the previous descriptor,
1622 	 * then we need to update not only the previous WQE's "next" fields
1623 	 * but we must also update this WQE's "ctrl" fields (i.e. the "vl15",
1624 	 * "slr", "max_srate", "sl", "c", "e", "rlid", and "vcrc" fields -
1625 	 * see tavor_hw.h for more).  Note: the "e" bit and "vcrc" fields are
1626 	 * always hardcoded to zero.
1627 	 */
1628 	if (prev_desc != NULL) {
1629 		/*
1630 		 * If a send queue doorbell will be rung for the next WQE on
1631 		 * the chain, then update the current WQE's "next" field and
1632 		 * return.
1633 		 * Note: We don't want to modify the "ctrl" field here because
1634 		 * that portion of the previous WQE has already been set
1635 		 * correctly at some previous point in time.
1636 		 */
1637 		if (dbinfo) {
1638 			TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1639 			return;
1640 		}
1641 
1642 		/*
1643 		 * Pull the address handle from the work request and read in
1644 		 * the contents of the UDAV.  This will be used to answer some
1645 		 * questions about the request.
1646 		 */
1647 		ah = (tavor_ahhdl_t)prev_wr->wr.ud.udwr_dest->ud_ah;
1648 		mutex_enter(&ah->ah_lock);
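		/*
		 * Copy the UDAV out in 64-bit chunks ("udav_sz" is the UDAV
		 * size in quadwords), reading through the DDI access handle
		 * for the UDAV resource.
		 */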
1649 		udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1650 		for (i = 0; i < udav_sz; i++) {
1651 			data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1652 			    ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1653 			((uint64_t *)&udav)[i] = data;
1654 		}
1655 		mutex_exit(&ah->ah_lock);
1656 
1657 		ctrl = 0;
1658 
1659 		/* Only QP0 uses VL15, otherwise use VL in the packet */
1660 		if (qp->qp_is_special == TAVOR_QP_SMI) {
1661 			ctrl = ctrl | TAVOR_WQE_MLXHDR_VL15_MASK;
1662 		}
1663 
1664 		/*
1665 		 * The SLR (Source LID Replace) bit determines whether the
1666 		 * source LID for an outgoing MLX packet should come from the
1667 		 * PortInfo (SLR = 0) or should be left as it is in the
1668 		 * descriptor (SLR = 1).  The latter is necessary for packets
1669 		 * to be sent with the Permissive LID.
1670 		 */
1671 		if (udav.rlid == IB_LID_PERMISSIVE) {
1672 			ctrl = ctrl | TAVOR_WQE_MLXHDR_SLR_MASK;
1673 		}
1674 
1675 		/* Fill in the max static rate from the address handle */
1676 		ctrl = ctrl | ((uint64_t)udav.max_stat_rate <<
1677 		    TAVOR_WQE_MLXHDR_SRATE_SHIFT);
1678 
1679 		/* All VL15 (i.e. SMI) traffic is required to use SL 0 */
1680 		if (qp->qp_is_special != TAVOR_QP_SMI) {
1681 			ctrl = ctrl | ((uint64_t)udav.sl <<
1682 			    TAVOR_WQE_MLXHDR_SL_SHIFT);
1683 		}
1684 
1685 		/* Set the "c" (i.e. "signaled") bit appropriately */
1686 		if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1687 			ctrl = ctrl | TAVOR_WQE_MLXHDR_SIGNALED_MASK;
1688 		}
1689 
1690 		/* Fill in the destination LID from the address handle */
1691 		ctrl = ctrl | ((uint64_t)udav.rlid <<
1692 		    TAVOR_WQE_MLXHDR_RLID_SHIFT);
1693 
1694 		TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1695 	}
1696 }
1697 
1698 
1699 /*
1700  * tavor_wqe_recv_build()
1701  *    Context: Can be called from interrupt or base context.
1702  */
1703 /* ARGSUSED */
1704 static int
1705 tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
1706     ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size)
1707 {
1708 	tavor_hw_wqe_sgl_t	*ds;
1709 	int			i, num_ds;
1710 
1711 	ASSERT(MUTEX_HELD(&qp->qp_lock));
1712 
1713 	/* Check that work request transport type is valid */
1714 	if ((qp->qp_serv_type != TAVOR_QP_UD) &&
1715 	    (qp->qp_serv_type != TAVOR_QP_RC) &&
1716 	    (qp->qp_serv_type != TAVOR_QP_UC)) {
1717 		return (IBT_QP_SRV_TYPE_INVALID);
1718 	}
1719 
1720 	/* Fill in the Data Segments (SGL) for the Recv WQE */
1721 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1722 	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1723 	num_ds = 0;
1724 
1725 	/* Check for valid number of SGL entries */
1726 	if (wr->wr_nds > qp->qp_rq_sgl) {
1727 		return (IBT_QP_SGL_LEN_INVALID);
1728 	}
1729 
1730 	/*
1731 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1732 	 * segments.  Note: We skip any SGL with zero size because Tavor
1733 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1734 	 * the encoding for zero means a 2GB transfer.  Because of this special
1735 	 * encoding in the hardware, we mask the requested length with
1736 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1737 	 * zero.)
1738 	 */
1739 	for (i = 0; i < wr->wr_nds; i++) {
1740 		if (wr->wr_sgl[i].ds_len == 0) {
1741 			continue;
1742 		}
1743 
1744 		/*
1745 		 * Fill in the Data Segment(s) for the receive WQE, using the
1746 		 * information contained in the scatter-gather list of the
1747 		 * work request.
1748 		 */
1749 		TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &wr->wr_sgl[i]);
1750 		num_ds++;
1751 	}
1752 
1753 	/* Return the size of descriptor (in 16-byte chunks) */
1754 	*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1755 
1756 	return (DDI_SUCCESS);
1757 }
1758 
1759 
1760 /*
1761  * tavor_wqe_recv_linknext()
1762  *    Context: Can be called from interrupt or base context.
1763  */
1764 static void
1765 tavor_wqe_recv_linknext(uint64_t *curr_desc, uint_t curr_descsz,
1766     uint64_t *prev_desc, tavor_qphdl_t qp)
1767 {
1768 	uint64_t	next;
1769 
1770 	/*
1771 	 * Calculate the "next" field of the descriptor.  This amounts to
1772 	 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1773 	 * tavor_hw.h for more).  Note:  If there is no next descriptor (i.e.
1774 	 * if the current descriptor is the last WQE on the chain), then set
1775 	 * "next" field to TAVOR_WQE_DBD_MASK.  This is because the Tavor
1776 	 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
1777 	 * In either case, we must add a single bit in the "reserved" field
1778 	 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
1779 	 * workaround for a known Tavor errata that can cause Recv WQEs with
1780 	 * zero in the NDA field to behave improperly.
1781 	 */
1782 	if (curr_desc != NULL) {
1783 		curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
1784 		    qp->qp_desc_off);
1785 		next = (uint64_t)((uintptr_t)curr_desc &
1786 		    TAVOR_WQE_NDA_MASK) << 32;
1787 		next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) |
1788 		    TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1789 	} else {
1790 		next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1791 	}
1792 
1793 	/*
1794 	 * If this WQE is supposed to be linked to the previous descriptor,
1795 	 * then we need to update not only the previous WQE's "next" fields
1796 	 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
1797 	 * "e" bits - see tavor_hw.h for more).  Note: both the "c" and "e"
1798 	 * bits are always hardcoded to zero.
1799 	 */
1800 	if (prev_desc != NULL) {
1801 		TAVOR_WQE_LINKNEXT(qp, prev_desc, 0, next);
1802 	}
1803 }
1804 
1805 
1806 /*
1807  * tavor_wqe_srq_build()
1808  *    Context: Can be called from interrupt or base context.
1809  */
1810 /* ARGSUSED */
1811 static int
1812 tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
1813     ibt_recv_wr_t *wr, uint64_t *desc)
1814 {
1815 	tavor_hw_wqe_sgl_t	*ds;
1816 	ibt_wr_ds_t		end_sgl;
1817 	int			i, num_ds;
1818 
1819 	ASSERT(MUTEX_HELD(&srq->srq_lock));
1820 
1821 	/* Fill in the Data Segments (SGL) for the Recv WQE */
1822 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1823 	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1824 	num_ds = 0;
1825 
1826 	/* Check for valid number of SGL entries */
1827 	if (wr->wr_nds > srq->srq_wq_sgl) {
1828 		return (IBT_QP_SGL_LEN_INVALID);
1829 	}
1830 
1831 	/*
1832 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1833 	 * segments.  Note: We skip any SGL with zero size because Tavor
1834 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1835 	 * the encoding for zero means a 2GB transfer.  Because of this special
1836 	 * encoding in the hardware, we mask the requested length with
1837 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1838 	 * zero.)
1839 	 */
1840 	for (i = 0; i < wr->wr_nds; i++) {
1841 		if (wr->wr_sgl[i].ds_len == 0) {
1842 			continue;
1843 		}
1844 
1845 		/*
1846 		 * Fill in the Data Segment(s) for the receive WQE, using the
1847 		 * information contained in the scatter-gather list of the
1848 		 * work request.
1849 		 */
1850 		TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &wr->wr_sgl[i]);
1851 		num_ds++;
1852 	}
1853 
1854 	/*
1855 	 * For SRQ, if the number of data segments is less than the maximum
1856 	 * specified at alloc, then we have to fill in a special "key" entry in
1857 	 * the sgl entry after the last valid one in this post request.  We do
1858 	 * that here.
1859 	 */
1860 	if (num_ds < srq->srq_wq_sgl) {
1861 		end_sgl.ds_va  = 0;
1862 		end_sgl.ds_len = 0;
1863 		end_sgl.ds_key = 0x1;
1864 		TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &end_sgl);
1865 	}
1866 
1867 	return (DDI_SUCCESS);
1868 }
1869 
1870 
1871 /*
1872  * tavor_wqe_srq_linknext()
1873  *    Context: Can be called from interrupt or base context.
1874  */
1875 static void
1876 tavor_wqe_srq_linknext(uint64_t *curr_desc, uint64_t *prev_desc,
1877     tavor_srqhdl_t srq)
1878 {
1879 	uint64_t	next;
1880 
1881 	/*
1882 	 * Calculate the "next" field of the descriptor.  This amounts to
1883 	 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1884 	 * tavor_hw.h for more).  Note:  If there is no next descriptor (i.e.
1885 	 * if the current descriptor is the last WQE on the chain), then set
1886 	 * "next" field to TAVOR_WQE_DBD_MASK.  This is because the Tavor
1887 	 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
1888 	 * In either case, we must add a single bit in the "reserved" field
1889 	 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
1890 	 * workaround for a known Tavor errata that can cause Recv WQEs with
1891 	 * zero in the NDA field to behave improperly.
1892 	 */
1893 	if (curr_desc != NULL) {
1894 		curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
1895 		    srq->srq_desc_off);
1896 		next = (uint64_t)((uintptr_t)curr_desc &
1897 		    TAVOR_WQE_NDA_MASK) << 32;
1898 		next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1899 	} else {
1900 		next = TAVOR_RCV_WQE_NDA0_WA_MASK;
1901 	}
1902 
1903 	/*
1904 	 * If this WQE is supposed to be linked to the previous descriptor,
1905 	 * then we need to update not only the previous WQE's "next" fields
1906 	 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
1907 	 * "e" bits - see tavor_hw.h for more).  Note: both the "c" and "e"
1908 	 * bits are always hardcoded to zero.
1909 	 */
1910 	if (prev_desc != NULL) {
1911 		TAVOR_WQE_LINKNEXT_SRQ(srq, prev_desc, 0, next);
1912 	}
1913 }
1914 
1915 
1916 /*
1917  * tavor_wr_get_immediate()
1918  *    Context: Can be called from interrupt or base context.
1919  */
1920 static uint32_t
1921 tavor_wr_get_immediate(ibt_send_wr_t *wr)
1922 {
1923 	/*
1924 	 * This routine extracts the "immediate data" from the appropriate
1925 	 * location in the IBTF work request.  Because of the way the
1926 	 * work request structure is defined, the location for this data
1927 	 * depends on the actual work request operation type.
1928 	 */
1929 
1930 	/* For RDMA Write, test if RC or UC */
1931 	if (wr->wr_opcode == IBT_WRC_RDMAW) {
1932 		if (wr->wr_trans == IBT_RC_SRV) {
1933 			return (wr->wr.rc.rcwr.rdma.rdma_immed);
1934 		} else {  /* IBT_UC_SRV */
1935 			return (wr->wr.uc.ucwr.rdma.rdma_immed);
1936 		}
1937 	}
1938 
1939 	/* For Send, test if RC, UD, or UC */
1940 	if (wr->wr_opcode == IBT_WRC_SEND) {
1941 		if (wr->wr_trans == IBT_RC_SRV) {
1942 			return (wr->wr.rc.rcwr.send_immed);
1943 		} else if (wr->wr_trans == IBT_UD_SRV) {
1944 			return (wr->wr.ud.udwr_immed);
1945 		} else {  /* IBT_UC_SRV */
1946 			return (wr->wr.uc.ucwr.send_immed);
1947 		}
1948 	}
1949 
1950 	/*
1951 	 * If any other type of request, then immediate is undefined
1952 	 */
1953 	return (0);
1954 }
1955 
1956 
1957 /*
1958  * tavor_wqe_sync()
1959  *    Context: Can be called from interrupt or base context.
1960  */
1961 static void
1962 tavor_wqe_sync(void *hdl, uint_t sync_from, uint_t sync_to,
1963     uint_t sync_type, uint_t flag)
1964 {
1965 	tavor_qphdl_t		qp;
1966 	tavor_srqhdl_t		srq;
1967 	uint_t			is_sync_req;
1968 	uint64_t		*wqe_from, *wqe_to, *wqe_base, *wqe_top;
1969 	ddi_dma_handle_t	dmahdl;
1970 	off_t			offset;
1971 	size_t			length;
1972 	uint32_t		qsize;
1973 	int			status;
1974 
1975 	if (sync_type == TAVOR_WR_SRQ) {
1976 		srq = (tavor_srqhdl_t)hdl;
1977 		is_sync_req = srq->srq_sync;
1978 		/* Get the DMA handle from SRQ context */
1979 		dmahdl = srq->srq_mrhdl->mr_bindinfo.bi_dmahdl;
1980 	} else {
1981 		qp = (tavor_qphdl_t)hdl;
1982 		is_sync_req = qp->qp_sync;
1983 		/* Get the DMA handle from QP context */
1984 		dmahdl = qp->qp_mrhdl->mr_bindinfo.bi_dmahdl;
1985 	}
1986 
1987 	/* Determine if the work queues need to be synced or not */
1988 	if (is_sync_req == 0) {
1989 		return;
1990 	}
1991 
1992 	/*
1993 	 * Depending on the type of the work queue, we grab information
1994 	 * about the address ranges we need to DMA sync.
1995 	 */
1996 	if (sync_type == TAVOR_WR_SEND) {
1997 		wqe_from = TAVOR_QP_SQ_ENTRY(qp, sync_from);
1998 		wqe_to   = TAVOR_QP_SQ_ENTRY(qp, sync_to);
1999 		qsize	 = qp->qp_sq_bufsz;
2000 
2001 		wqe_base = TAVOR_QP_SQ_ENTRY(qp, 0);
2002 		wqe_top	 = TAVOR_QP_SQ_ENTRY(qp, qsize);
2003 	} else if (sync_type == TAVOR_WR_RECV) {
2004 		wqe_from = TAVOR_QP_RQ_ENTRY(qp, sync_from);
2005 		wqe_to   = TAVOR_QP_RQ_ENTRY(qp, sync_to);
2006 		qsize	 = qp->qp_rq_bufsz;
2007 
2008 		wqe_base = TAVOR_QP_RQ_ENTRY(qp, 0);
2009 		wqe_top	 = TAVOR_QP_RQ_ENTRY(qp, qsize);
2010 	} else {
2011 		wqe_from = TAVOR_SRQ_WQ_ENTRY(srq, sync_from);
2012 		wqe_to   = TAVOR_SRQ_WQ_ENTRY(srq, sync_to);
2013 		qsize	 = srq->srq_wq_bufsz;
2014 
2015 		wqe_base = TAVOR_SRQ_WQ_ENTRY(srq, 0);
2016 		wqe_top	 = TAVOR_SRQ_WQ_ENTRY(srq, qsize);
2017 	}
2018 
2019 	/*
2020 	 * There are two possible cases for the beginning and end of the WQE
2021 	 * chain we are trying to sync.  Either this is the simple case, where
2022 	 * the end of the chain is below the beginning of the chain, or it is
2023 	 * the "wrap-around" case, where the end of the chain has wrapped over
2024 	 * the end of the queue.  In the former case, we simply need to
2025 	 * calculate the span from beginning to end and sync it.  In the latter
2026 	 * case, however, we need to calculate the span from the top of the
2027 	 * work queue to the end of the chain and sync that, and then we need
2028 	 * to find the other portion (from beginning of chain to end of queue)
2029 	 * and sync that as well.  Note: if the "top to end" span is actually
2030 	 * zero length, then we don't do a DMA sync because a zero length DMA
2031 	 * sync unnecessarily syncs the entire work queue.
2032 	 */
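	/*
	 * (A hypothetical example with a queue of 8 entries: syncing from
	 * entry 6 through entry 2 takes the "wrap-around" branch below,
	 * syncing entries 0-1 ("top to end") and then entries 6-7
	 * ("beginning to bottom").  Syncing from entry 2 through entry 6 is
	 * the simple case and covers entries 2-5 in a single call.)
	 */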
2033 	if (wqe_to > wqe_from) {
2034 		/* "From Beginning to End" */
2035 		offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2036 		length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_from);
2037 
2038 		status = ddi_dma_sync(dmahdl, offset, length, flag);
2039 		if (status != DDI_SUCCESS) {
2040 			return;
2041 		}
2042 	} else {
2043 		/* "From Top to End" */
2044 		offset = (off_t)0;
2045 		length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_base);
2046 		if (length) {
2047 			status = ddi_dma_sync(dmahdl, offset, length, flag);
2048 			if (status != DDI_SUCCESS) {
2049 				return;
2050 			}
2051 		}
2052 
2053 		/* "From Beginning to Bottom" */
2054 		offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2055 		length = (size_t)((uintptr_t)wqe_top - (uintptr_t)wqe_from);
2056 		status = ddi_dma_sync(dmahdl, offset, length, flag);
2057 		if (status != DDI_SUCCESS) {
2058 			return;
2059 		}
2060 	}
2061 }
2062 
2063 
2064 /*
2065  * tavor_wr_bind_check()
2066  *    Context: Can be called from interrupt or base context.
2067  */
2068 static int
2069 tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr)
2070 {
2071 	ibt_bind_flags_t	bind_flags;
2072 	uint64_t		vaddr, len;
2073 	uint64_t		reg_start_addr, reg_end_addr;
2074 	tavor_mwhdl_t		mw;
2075 	tavor_mrhdl_t		mr;
2076 	tavor_rsrc_t		*mpt;
2077 	uint32_t		new_rkey;
2078 
2079 	/* Check for a valid Memory Window handle in the WR */
2080 	mw = (tavor_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
2081 	if (mw == NULL) {
2082 		return (IBT_MW_HDL_INVALID);
2083 	}
2084 
2085 	/* Check for a valid Memory Region handle in the WR */
2086 	mr = (tavor_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
2087 	if (mr == NULL) {
2088 		return (IBT_MR_HDL_INVALID);
2089 	}
2090 
2091 	mutex_enter(&mr->mr_lock);
2092 	mutex_enter(&mw->mr_lock);
2093 
2094 	/*
2095 	 * Check here to see if the memory region has already been partially
2096 	 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
2097 	 * If so, this is an error, return failure.
2098 	 */
2099 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
2100 		mutex_exit(&mr->mr_lock);
2101 		mutex_exit(&mw->mr_lock);
2102 		return (IBT_MR_HDL_INVALID);
2103 	}
2104 
2105 	/* Check for a valid Memory Window RKey (i.e. a matching RKey) */
2106 	if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
2107 		mutex_exit(&mr->mr_lock);
2108 		mutex_exit(&mw->mr_lock);
2109 		return (IBT_MR_RKEY_INVALID);
2110 	}
2111 
2112 	/* Check for a valid Memory Region LKey (i.e. a matching LKey) */
2113 	if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
2114 		mutex_exit(&mr->mr_lock);
2115 		mutex_exit(&mw->mr_lock);
2116 		return (IBT_MR_LKEY_INVALID);
2117 	}
2118 
2119 	/*
2120 	 * Now check for valid "vaddr" and "len".  Note:  We don't check the
2121 	 * "vaddr" range when "len == 0" (i.e. on unbind operations)
2122 	 */
2123 	len = wr->wr.rc.rcwr.bind->bind_len;
2124 	if (len != 0) {
2125 		vaddr = wr->wr.rc.rcwr.bind->bind_va;
2126 		reg_start_addr = mr->mr_bindinfo.bi_addr;
2127 		reg_end_addr   = mr->mr_bindinfo.bi_addr +
2128 		    (mr->mr_bindinfo.bi_len - 1);
2129 		if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
2130 			mutex_exit(&mr->mr_lock);
2131 			mutex_exit(&mw->mr_lock);
2132 			return (IBT_MR_VA_INVALID);
2133 		}
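		/*
		 * Check that the last byte of the requested bind range also
		 * falls within the registered memory region.
		 */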
2134 		vaddr = (vaddr + len) - 1;
2135 		if (vaddr > reg_end_addr) {
2136 			mutex_exit(&mr->mr_lock);
2137 			mutex_exit(&mw->mr_lock);
2138 			return (IBT_MR_LEN_INVALID);
2139 		}
2140 	}
2141 
2142 	/*
2143 	 * Validate the bind access flags.  Remote Write and Atomic access for
2144 	 * the Memory Window require that Local Write access be set in the
2145 	 * corresponding Memory Region.
2146 	 */
2147 	bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
2148 	if (((bind_flags & IBT_WR_BIND_WRITE) ||
2149 	    (bind_flags & IBT_WR_BIND_ATOMIC)) &&
2150 	    !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
2151 		mutex_exit(&mr->mr_lock);
2152 		mutex_exit(&mw->mr_lock);
2153 		return (IBT_MR_ACCESS_REQ_INVALID);
2154 	}
2155 
2156 	/* Calculate the new RKey for the Memory Window */
2157 	mpt = mw->mr_mptrsrcp;
2158 	tavor_mr_keycalc(state, mpt->tr_indx, &new_rkey);
2159 
2160 	wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
2161 	mw->mr_rkey = new_rkey;
2162 
2163 	mutex_exit(&mr->mr_lock);
2164 	mutex_exit(&mw->mr_lock);
2165 	return (DDI_SUCCESS);
2166 }
2167 
2168 
2169 /*
2170  * tavor_wrid_from_reset_handling()
2171  *    Context: Can be called from interrupt or base context.
2172  */
2173 int
2174 tavor_wrid_from_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2175 {
2176 	tavor_workq_hdr_t	*swq, *rwq;
2177 	tavor_wrid_list_hdr_t	*s_wridlist, *r_wridlist;
2178 	uint_t			create_new_swq = 0, create_new_rwq = 0;
2179 	uint_t			create_wql = 0;
2180 	uint_t			qp_srq_en;
2181 
2182 	/*
2183 	 * For each of this QP's Work Queues, make sure we have a (properly
2184 	 * initialized) Work Request ID list attached to the relevant
2185 	 * completion queue.  Grab the CQ lock(s) before manipulating the
2186 	 * lists.
2187 	 */
2188 	tavor_wrid_wqhdr_lock_both(qp);
2189 	swq = tavor_wrid_wqhdr_find(qp->qp_sq_cqhdl, qp->qp_qpnum,
2190 	    TAVOR_WR_SEND);
2191 	if (swq == NULL) {
2192 		/* Couldn't find matching work queue header, create it */
2193 		create_new_swq = create_wql = 1;
2194 		swq = tavor_wrid_wqhdr_create(state, qp->qp_sq_cqhdl,
2195 		    qp->qp_qpnum, TAVOR_WR_SEND, create_wql);
2196 		if (swq == NULL) {
2197 			/*
2198 			 * If we couldn't find/allocate space for the workq
2199 			 * header, then drop the lock(s) and return failure.
2200 			 */
2201 			tavor_wrid_wqhdr_unlock_both(qp);
2202 			return (ibc_get_ci_failure(0));
2203 		}
2204 	}
2205 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swq))
2206 	qp->qp_sq_wqhdr = swq;
2207 	swq->wq_size = qp->qp_sq_bufsz;
2208 	swq->wq_head = 0;
2209 	swq->wq_tail = 0;
2210 	swq->wq_full = 0;
2211 
2212 	/*
2213 	 * Allocate space for the tavor_wrid_entry_t container
2214 	 */
2215 	s_wridlist = tavor_wrid_get_list(swq->wq_size);
2216 	if (s_wridlist == NULL) {
2217 		/*
2218 		 * If we couldn't allocate space for tracking the WRID
2219 		 * entries, then cleanup the workq header from above (if
2220 		 * necessary, i.e. if we created the workq header).  Then
2221 		 * drop the lock(s) and return failure.
2222 		 */
2223 		if (create_new_swq) {
2224 			tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2225 		}
2226 
2227 		tavor_wrid_wqhdr_unlock_both(qp);
2228 		return (ibc_get_ci_failure(0));
2229 	}
2230 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*s_wridlist))
2231 	s_wridlist->wl_wqhdr = swq;
2232 
2233 	/* Chain the new WRID list container to the workq hdr list */
2234 	mutex_enter(&swq->wq_wrid_wql->wql_lock);
2235 	tavor_wrid_wqhdr_add(swq, s_wridlist);
2236 	mutex_exit(&swq->wq_wrid_wql->wql_lock);
2237 
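	/*
	 * Cache the QP's SRQ-enabled flag; it determines the locking and the
	 * receive-side wridlist handling done below.
	 */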
2238 	qp_srq_en = qp->qp_srq_en;
2239 
2240 #ifdef __lock_lint
2241 	mutex_enter(&qp->qp_srqhdl->srq_lock);
2242 #else
2243 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2244 		mutex_enter(&qp->qp_srqhdl->srq_lock);
2245 	}
2246 #endif
2247 	/*
2248 	 * Now we repeat all the above operations for the receive work queue,
2249 	 * or shared receive work queue.
2250 	 *
2251 	 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
2252 	 */
2253 	rwq = tavor_wrid_wqhdr_find(qp->qp_rq_cqhdl, qp->qp_qpnum,
2254 	    TAVOR_WR_RECV);
2255 	if (rwq == NULL) {
2256 		create_new_rwq = create_wql = 1;
2257 
2258 		/*
2259 		 * If this QP is associated with an SRQ, and this isn't the
2260 		 * first QP on the SRQ, then the 'srq_wrid_wql' will already be
2261 		 * created.  Since the WQL is created at 'wqhdr_create' time we
2262 		 * pass in the flag 'create_wql' here to be 0 if we have
2263 		 * already created it.  And later on below we then next setup
2264 		 * the WQL and rwq information based off the existing SRQ info.
2265 		 */
2266 		if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2267 		    qp->qp_srqhdl->srq_wrid_wql != NULL) {
2268 			create_wql = 0;
2269 		}
2270 
2271 		rwq = tavor_wrid_wqhdr_create(state, qp->qp_rq_cqhdl,
2272 		    qp->qp_qpnum, TAVOR_WR_RECV, create_wql);
2273 		if (rwq == NULL) {
2274 			/*
2275 			 * If we couldn't find/allocate space for the workq
2276 			 * header, then free all the send queue resources we
2277 			 * just allocated and setup (above), drop the lock(s)
2278 			 * and return failure.
2279 			 */
2280 			mutex_enter(&swq->wq_wrid_wql->wql_lock);
2281 			tavor_wrid_wqhdr_remove(swq, s_wridlist);
2282 			mutex_exit(&swq->wq_wrid_wql->wql_lock);
2283 			if (create_new_swq) {
2284 				tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl,
2285 				    swq);
2286 			}
2287 
2288 #ifdef __lock_lint
2289 			mutex_exit(&qp->qp_srqhdl->srq_lock);
2290 #else
2291 			if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2292 				mutex_exit(&qp->qp_srqhdl->srq_lock);
2293 			}
2294 #endif
2295 
2296 			tavor_wrid_wqhdr_unlock_both(qp);
2297 			return (ibc_get_ci_failure(0));
2298 		}
2299 	}
2300 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*rwq))
2301 
2302 	/*
2303 	 * Setup receive workq hdr
2304 	 *
2305 	 * If the QP is on an SRQ, we set up the SRQ-specific fields: keeping
2306 	 * a copy of the rwq pointer, setting the rwq bufsize appropriately,
2307 	 * and initializing our part of the WQLock.
2308 	 *
2309 	 * In the normal QP case, the QP recv queue bufsize is used.
2310 	 */
2311 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2312 		rwq->wq_size = qp->qp_srqhdl->srq_wq_bufsz;
2313 		if (qp->qp_srqhdl->srq_wrid_wql == NULL) {
2314 			qp->qp_srqhdl->srq_wrid_wql = rwq->wq_wrid_wql;
2315 		} else {
2316 			rwq->wq_wrid_wql = qp->qp_srqhdl->srq_wrid_wql;
2317 		}
2318 		tavor_wql_refcnt_inc(qp->qp_srqhdl->srq_wrid_wql);
2319 
2320 	} else {
2321 		rwq->wq_size = qp->qp_rq_bufsz;
2322 	}
2323 
2324 	qp->qp_rq_wqhdr = rwq;
2325 	rwq->wq_head = 0;
2326 	rwq->wq_tail = 0;
2327 	rwq->wq_full = 0;
2328 
2329 	/*
2330 	 * Allocate space for the tavor_wrid_entry_t container.
2331 	 *
2332 	 * If the QP is on an SRQ and the srq_wridlist is NULL, then we must
2333 	 * allocate the wridlist normally.  However, if the srq_wridlist is !=
2334 	 * NULL, then we know this SRQ has already been initialized, thus the
2335 	 * wridlist has already been initialized.  So we re-use the
2336 	 * srq_wridlist as the r_wridlist for this QP in this case.
2337 	 */
2338 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2339 	    qp->qp_srqhdl->srq_wridlist != NULL) {
2340 		/* Use existing srq_wridlist pointer */
2341 		r_wridlist = qp->qp_srqhdl->srq_wridlist;
2342 		ASSERT(r_wridlist != NULL);
2343 	} else {
2344 		/* Allocate memory for the r_wridlist */
2345 		r_wridlist = tavor_wrid_get_list(rwq->wq_size);
2346 	}
2347 
2348 	/*
2349 	 * If the memory allocation failed for r_wridlist (or the SRQ pointer
2350 	 * is mistakenly NULL), we cleanup our previous swq allocation from
2351 	 * above
2352 	 */
2353 	if (r_wridlist == NULL) {
2354 		/*
2355 		 * If we couldn't allocate space for tracking the WRID
2356 		 * entries, then cleanup all the stuff from above.  Then
2357 		 * drop the lock(s) and return failure.
2358 		 */
2359 		mutex_enter(&swq->wq_wrid_wql->wql_lock);
2360 		tavor_wrid_wqhdr_remove(swq, s_wridlist);
2361 		mutex_exit(&swq->wq_wrid_wql->wql_lock);
2362 		if (create_new_swq) {
2363 			tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2364 		}
2365 		if (create_new_rwq) {
2366 			tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, rwq);
2367 		}
2368 
2369 #ifdef __lock_lint
2370 		mutex_exit(&qp->qp_srqhdl->srq_lock);
2371 #else
2372 		if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2373 			mutex_exit(&qp->qp_srqhdl->srq_lock);
2374 		}
2375 #endif
2376 
2377 		tavor_wrid_wqhdr_unlock_both(qp);
2378 		return (ibc_get_ci_failure(0));
2379 	}
2380 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*r_wridlist))
2381 
2382 	/*
2383 	 * Initialize the wridlist
2384 	 *
2385 	 * In the normal QP case, there is no special initialization needed.
2386 	 * We simply setup the wridlist backpointer to be the receive wqhdr
2387 	 * (rwq).
2388 	 *
2389 	 * But in the SRQ case, there is no backpointer to the wqhdr possible.
2390 	 * Instead we set 'wl_srq_en', specifying this wridlist is on an SRQ
2391 	 * and thus potentially shared across multiple QPs with the SRQ.  We
2392 	 * also setup the srq_wridlist pointer to be the r_wridlist, and
2393 	 * initialize the freelist to an invalid index.  This srq_wridlist
2394 	 * pointer is used above on future moves from_reset to let us know that
2395 	 * the srq_wridlist has been initialized already.
2396 	 *
2397 	 * And finally, if we are in a non-UMAP case, we setup the srq wrid
2398 	 * free list.
2399 	 */
2400 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2401 	    qp->qp_srqhdl->srq_wridlist == NULL) {
2402 		r_wridlist->wl_srq_en = 1;
2403 		r_wridlist->wl_free_list_indx = -1;
2404 		qp->qp_srqhdl->srq_wridlist = r_wridlist;
2405 
2406 		/* Initialize srq wrid free list */
2407 		if (qp->qp_srqhdl->srq_is_umap == 0) {
2408 			mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2409 			tavor_wrid_list_srq_init(r_wridlist, qp->qp_srqhdl, 0);
2410 			mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2411 		}
2412 	} else {
2413 		r_wridlist->wl_wqhdr = rwq;
2414 	}
2415 
2416 	/* Chain the WRID list "container" to the workq hdr list */
2417 	mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2418 	tavor_wrid_wqhdr_add(rwq, r_wridlist);
2419 	mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2420 
2421 #ifdef __lock_lint
2422 	mutex_exit(&qp->qp_srqhdl->srq_lock);
2423 #else
2424 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2425 		mutex_exit(&qp->qp_srqhdl->srq_lock);
2426 	}
2427 #endif
2428 
2429 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*r_wridlist))
2430 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*rwq))
2431 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*s_wridlist))
2432 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*swq))
2433 
2434 	tavor_wrid_wqhdr_unlock_both(qp);
2435 	return (DDI_SUCCESS);
2436 }
2437 
2438 
2439 /*
2440  * tavor_wrid_to_reset_handling()
2441  *    Context: Can be called from interrupt or base context.
2442  */
2443 void
2444 tavor_wrid_to_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2445 {
2446 	uint_t		free_wqhdr = 0;
2447 
2448 	/*
2449 	 * For each of this QP's Work Queues, move the WRID "container" to
2450 	 * the "reapable" list.  Although there may still be unpolled
2451 	 * entries in these containers, it is not a big deal.  We will not
2452 	 * reap the list until either the Poll CQ command detects an empty
2453 	 * condition or the CQ itself is freed.  Grab the CQ lock(s) before
2454 	 * manipulating the lists.
2455 	 */
2456 	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2457 	tavor_wrid_wqhdr_lock_both(qp);
2458 	tavor_wrid_reaplist_add(qp->qp_sq_cqhdl, qp->qp_sq_wqhdr);
2459 
2460 	/*
2461 	 * Add the receive work queue header on to the reaplist.  But if we are
2462 	 * on SRQ, then don't add anything to the reaplist.  Instead we flush
2463 	 * the SRQ entries on the CQ, remove wridlist from WQHDR, and free the
2464 	 * WQHDR (if needed).  We must hold the WQL for these operations, yet
2465 	 * the call to tavor_cq_wqhdr_remove grabs the WQL internally.  So we
2466 	 * drop WQL before that call.  Then release the CQ WQHDR locks and the
2467 	 * CQ lock and return.
2468 	 */
2469 	if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2470 
2471 		/*
2472 		 * Pull off all (if any) entries for this QP from CQ.  This
2473 		 * only includes entries that have not yet been polled
2474 		 */
2475 		mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2476 		tavor_cq_srq_entries_flush(state, qp);
2477 
2478 		/* Remove wridlist from WQHDR */
2479 		tavor_wrid_wqhdr_remove(qp->qp_rq_wqhdr,
2480 		    qp->qp_rq_wqhdr->wq_wrid_post);
2481 
2482 		/* If wridlist chain is now empty, remove the wqhdr as well */
2483 		if (qp->qp_rq_wqhdr->wq_wrid_post == NULL) {
2484 			free_wqhdr = 1;
2485 		} else {
2486 			free_wqhdr = 0;
2487 		}
2488 
2489 		mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2490 
2491 		/* Free the WQHDR */
2492 		if (free_wqhdr) {
2493 			tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2494 		}
2495 	} else {
2496 		tavor_wrid_reaplist_add(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2497 	}
2498 	tavor_wrid_wqhdr_unlock_both(qp);
2499 	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2500 }
2501 
2502 
2503 /*
2504  * tavor_wrid_add_entry()
2505  *    Context: Can be called from interrupt or base context.
2506  */
2507 void
2508 tavor_wrid_add_entry(tavor_workq_hdr_t *wq, uint64_t wrid, uint32_t wqeaddrsz,
2509     uint_t signaled_dbd)
2510 {
2511 	tavor_wrid_entry_t	*wre_tmp;
2512 	uint32_t		head, tail, size;
2513 
2514 	ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2515 
2516 	/*
2517 	 * Find the entry in the container pointed to by the "tail" index.
2518 	 * Add all of the relevant information to that entry, including WRID,
2519 	 * "wqeaddrsz" parameter, and whether it was signaled/unsignaled
2520 	 * and/or doorbelled.
2521 	 */
2522 	head = wq->wq_wrid_post->wl_head;
2523 	tail = wq->wq_wrid_post->wl_tail;
2524 	size = wq->wq_wrid_post->wl_size;
2525 	wre_tmp = &wq->wq_wrid_post->wl_wre[tail];
2526 	wre_tmp->wr_wrid	  = wrid;
2527 	wre_tmp->wr_wqeaddrsz	  = wqeaddrsz;
2528 	wre_tmp->wr_signaled_dbd  = signaled_dbd;
2529 
2530 	/*
2531 	 * Update the "wrid_old_tail" pointer to point to the entry we just
2532 	 * inserted into the queue.  By tracking this pointer (the pointer to
2533 	 * the most recently inserted entry) it will be possible later in the
2534 	 * PostSend() and PostRecv() code paths to find the entry that needs
2535 	 * its "doorbelled" flag set (see comment in tavor_post_recv() and/or
2536 	 * tavor_post_send()).
2537 	 */
2538 	wq->wq_wrid_post->wl_wre_old_tail = wre_tmp;
2539 
2540 	/* Update the tail index */
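	/* (wl_size is always a power of two, so the AND performs the wrap) */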
2541 	tail = ((tail + 1) & (size - 1));
2542 	wq->wq_wrid_post->wl_tail = tail;
2543 
2544 	/*
2545 	 * If the "tail" index has just wrapped over into the "head" index,
2546 	 * then we have filled the container.  We use the "full" flag to
2547 	 * indicate this condition and to distinguish it from the "empty"
2548 	 * condition (where head and tail are also equal).
2549 	 */
2550 	if (head == tail) {
2551 		wq->wq_wrid_post->wl_full = 1;
2552 	}
2553 }
2554 
2555 /*
2556  * tavor_wrid_add_entry_srq()
2557  * Context: Can be called from interrupt or base context
2558  */
2559 void
2560 tavor_wrid_add_entry_srq(tavor_srqhdl_t srq, uint64_t wrid, uint_t signaled_dbd)
2561 {
2562 	tavor_wrid_entry_t	*wre;
2563 	uint64_t		*wl_wqe;
2564 	uint32_t		wqe_index;
2565 
2566 	/*
2567 	 * Find the next available WQE from the SRQ free_list.  Then update the
2568 	 * free_list to point to the next entry
2569 	 */
2570 	wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wridlist->wl_free_list_indx);
2571 
2572 	wqe_index = srq->srq_wridlist->wl_free_list_indx;
2573 
2574 	/* ASSERT on impossible wqe_index values */
2575 	ASSERT(wqe_index < srq->srq_wq_bufsz);
2576 
2577 	/*
2578 	 * Setup the WRE.
2579 	 *
2580 	 * Given the 'wqe_index' value, we store the WRID at this WRE offset.
2581 	 * And we set the WRE to be signaled_dbd so that on poll CQ we can find
2582 	 * this information and associate the WRID to the WQE found on the CQE.
2583 	 */
2584 	wre = &srq->srq_wridlist->wl_wre[wqe_index];
2585 	wre->wr_wrid = wrid;
2586 	wre->wr_signaled_dbd  = signaled_dbd;
2587 
2588 	/* Update the free list index */
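	/*
	 * (The WQE we just consumed stores, in its first 32 bits, the index
	 * of the next free entry; read it back through the access handle to
	 * advance the free list.)
	 */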
2589 	srq->srq_wridlist->wl_free_list_indx = ddi_get32(
2590 	    srq->srq_wridlist->wl_acchdl, (uint32_t *)wl_wqe);
2591 }
2592 
2593 
2594 /*
2595  * tavor_wrid_get_entry()
2596  *    Context: Can be called from interrupt or base context.
2597  */
2598 uint64_t
2599 tavor_wrid_get_entry(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
2600     tavor_wrid_entry_t *wre)
2601 {
2602 	tavor_workq_hdr_t	*wq;
2603 	tavor_wrid_entry_t	*wre_tmp;
2604 	uint64_t		wrid;
2605 	uint_t			send_or_recv, qpnum, error, opcode;
2606 
2607 	/* Lock the list of work queues associated with this CQ */
2608 	mutex_enter(&cq->cq_wrid_wqhdr_lock);
2609 
2610 	/*
2611 	 * Determine whether this CQE is a send or receive completion (and
2612 	 * whether it was a "successful" completion or not)
2613 	 */
2614 	opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
2615 	if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
2616 	    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
2617 		error = 1;
2618 		send_or_recv = (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ?
2619 		    TAVOR_COMPLETION_SEND : TAVOR_COMPLETION_RECV;
2620 	} else {
2621 		error = 0;
2622 		send_or_recv = TAVOR_CQE_SENDRECV_GET(cq, cqe);
2623 	}
2624 
2625 	/* Find the work queue for this QP number (send or receive side) */
2626 	qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
2627 	wq = tavor_wrid_wqhdr_find(cq, qpnum, send_or_recv);
2628 	ASSERT(wq != NULL);
2629 
2630 	/*
2631 	 * Regardless of whether the completion is the result of a "success"
2632 	 * or a "failure", we lock the list of "containers" and attempt to
2633 	 * search for the first matching completion (i.e. the first WR
2634 	 * with a matching WQE addr and size).  Once we find it, we pull out
2635 	 * the "wrid" field and return it (see below).  Note: One possible
2636 	 * future enhancement would be to enable this routine to skip over
2637 	 * any "unsignaled" completions to go directly to the next "signaled"
2638 	 * entry on success. XXX
2639 	 */
2640 	mutex_enter(&wq->wq_wrid_wql->wql_lock);
2641 	wre_tmp = tavor_wrid_find_match(wq, cq, cqe);
2642 
2643 	/*
2644 	 * If this is a "successful" completion, then we assert that this
2645 	 * completion must be a "signaled" completion.
2646 	 */
2647 	ASSERT(error || (wre_tmp->wr_signaled_dbd & TAVOR_WRID_ENTRY_SIGNALED));
2648 
2649 	/*
2650 	 * If the completion is a "failed" completion, then we save away the
2651 	 * contents of the entry (into the "wre" field passed in) for use
2652 	 * in later CQE processing. Note: We use the tavor_wrid_get_wqeaddrsz()
2653 	 * function to grab "wqeaddrsz" from the next entry in the container.
2654 	 * This is required for error processing (where updating these fields
2655 	 * properly is necessary to correct handling of the "error" CQE)
2656 	 */
2657 	if (error && (wre != NULL)) {
2658 		*wre = *wre_tmp;
2659 		wre->wr_wqeaddrsz = tavor_wrid_get_wqeaddrsz(wq);
2660 	}
2661 
2662 	/* Pull out the WRID and return it */
2663 	wrid = wre_tmp->wr_wrid;
2664 
2665 	mutex_exit(&wq->wq_wrid_wql->wql_lock);
2666 	mutex_exit(&cq->cq_wrid_wqhdr_lock);
2667 
2668 	return (wrid);
2669 }
2670 
2671 
2672 /*
2673  * tavor_wrid_find_match()
2674  *    Context: Can be called from interrupt or base context.
2675  */
2676 static tavor_wrid_entry_t *
2677 tavor_wrid_find_match(tavor_workq_hdr_t *wq, tavor_cqhdl_t cq,
2678     tavor_hw_cqe_t *cqe)
2679 {
2680 	tavor_wrid_entry_t	*curr = NULL;
2681 	tavor_wrid_list_hdr_t	*container;
2682 	uint32_t		wqeaddr_size;
2683 	uint32_t		head, tail, size;
2684 	int			found = 0, last_container;
2685 
2686 	ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2687 
2688 	/* Pull the "wqeaddrsz" information from the CQE */
2689 	wqeaddr_size = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe);
2690 
2691 	/*
2692 	 * Walk the "containers" list(s), find first WR with a matching WQE
2693 	 * addr.  If the current "container" is not the last one on the list,
2694 	 * i.e. not the current one to which we are posting new WRID entries,
2695 	 * then we do not attempt to update the "q_head", "q_tail", and
2696 	 * "q_full" indicators on the main work queue header.  We do, however,
2697 	 * update the "head" and "full" indicators on the individual containers
2698 	 * as we go.  This is imperative because we need to be able to
2699 	 * determine when the current container has been emptied (so that we
2700 	 * can move on to the next container).
2701 	 */
2702 	container = wq->wq_wrid_poll;
2703 	while (container != NULL) {
2704 		/* Is this the last/only "container" on the list */
2705 		last_container = (container != wq->wq_wrid_post) ? 0 : 1;
2706 
2707 		/*
2708 		 * First check if we are on an SRQ.  If so, we grab the entry
2709 		 * and break out.  Since SRQ wridlist's are never added to
2710 		 * reaplist, they can only be the last container.
2711 		 */
2712 		if (container->wl_srq_en) {
2713 			ASSERT(last_container == 1);
2714 			curr = tavor_wrid_find_match_srq(container, cq, cqe);
2715 			break;
2716 		}
2717 
2718 		/*
2719 		 * Grab the current "head", "tail" and "size" fields before
2720 		 * walking the list in the current container. Note: the "size"
2721 		 * field here must always be a power-of-2.  The "full"
2722 		 * parameter is checked (and updated) here to distinguish the
2723 		 * "queue full" condition from "queue empty".
2724 		 */
2725 		head = container->wl_head;
2726 		tail = container->wl_tail;
2727 		size = container->wl_size;
2728 		while ((head != tail) || (container->wl_full)) {
2729 			container->wl_full = 0;
2730 			curr = &container->wl_wre[head];
2731 			head = ((head + 1) & (size - 1));
2732 
2733 			/*
2734 			 * If the current entry's "wqeaddrsz" matches the one
2735 			 * we're searching for, then this must correspond to
2736 			 * the work request that caused the completion.  Set
2737 			 * the "found" flag and bail out.
2738 			 */
2739 			if (curr->wr_wqeaddrsz == wqeaddr_size) {
2740 				found = 1;
2741 				break;
2742 			}
2743 		}
2744 
2745 		/*
2746 		 * If the current container is empty (having reached here the
2747 		 * "head == tail" condition can only mean that the container
2748 		 * is empty), then NULL out the "wrid_old_tail" field (see
2749 		 * tavor_post_send() and tavor_post_recv() for more details)
2750 		 * and (potentially) remove the current container from future
2751 		 * searches.
2752 		 */
2753 		if (head == tail) {
2754 
2755 			container->wl_wre_old_tail = NULL;
2756 			/*
2757 			 * If this wasn't the last "container" on the chain,
2758 			 * i.e. the one to which new WRID entries will be
2759 			 * added, then remove it from the list.
2760 			 * Note: we don't "lose" the memory pointed to by this
2761 			 * because we should have already put this container
2762 			 * on the "reapable" list (from where it will later be
2763 			 * pulled).
2764 			 */
2765 			if (!last_container) {
2766 				wq->wq_wrid_poll = container->wl_next;
2767 			}
2768 		}
2769 
2770 		/* Update the head index for the container */
2771 		container->wl_head = head;
2772 
2773 		/*
2774 		 * If the entry was found in this container, then continue to
2775 		 * bail out.  Else reset the "curr" pointer and move on to the
2776 		 * next container (if there is one).  Note: the only real
2777 		 * reason for setting "curr = NULL" here is so that the ASSERT
2778 		 * below can catch the case where no matching entry was found
2779 		 * on any of the lists.
2780 		 */
2781 		if (found) {
2782 			break;
2783 		} else {
2784 			curr = NULL;
2785 			container = container->wl_next;
2786 		}
2787 	}
2788 
2789 	/*
2790 	 * Update work queue header's "head" and "full" conditions to match
2791 	 * the last entry on the container list.  (Note: Only if we're pulling
2792 	 * entries from the last work queue portion of the list, i.e. not from
2793 	 * the previous portions that may be the "reapable" list.)
2794 	 */
2795 	if (last_container) {
2796 		wq->wq_head = wq->wq_wrid_post->wl_head;
2797 		wq->wq_full = wq->wq_wrid_post->wl_full;
2798 	}
2799 
2800 	/* Ensure that we've actually found what we were searching for */
2801 	ASSERT(curr != NULL);
2802 
2803 	return (curr);
2804 }
2805 
2806 
2807 /*
2808  * tavor_wrid_find_match_srq()
2809  *    Context: Can be called from interrupt or base context.
2810  */
2811 tavor_wrid_entry_t *
2812 tavor_wrid_find_match_srq(tavor_wrid_list_hdr_t *wl, tavor_cqhdl_t cq,
2813     tavor_hw_cqe_t *cqe)
2814 {
2815 	tavor_wrid_entry_t	*wre;
2816 	uint64_t		*wl_wqe;
2817 	uint32_t		wqe_index;
2818 	uint64_t		wqe_addr;
2819 	uint32_t		cqe_wqe_addr;
2820 
2821 	/* Grab the WQE addr out of the CQE */
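	/*
	 * (Masking with 0xFFFFFFC0 drops the low six bits of "wqeaddrsz",
	 * leaving just the 64-byte-aligned WQE address portion.)
	 */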
2822 	cqe_wqe_addr = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe) & 0xFFFFFFC0;
2823 
2824 	/*
2825 	 * Use the WQE addr as the lower 32 bits; we add back the
2826 	 * 'wl_srq_desc_off' because we have a zero-based queue.  Then the
2827 	 * upper 32 bits of 'wl_srq_wq_buf' OR'd on give us the WQE addr in
2828 	 * the SRQ Work Queue itself.  We use this address as the index to find
2829 	 * out which Work Queue Entry this CQE corresponds with.
2830 	 *
2831 	 * We also use this address below to add the WQE back on to the free
2832 	 * list.
2833 	 */
2834 	wqe_addr = ((uintptr_t)wl->wl_srq_wq_buf & 0xFFFFFFFF00000000ull) |
2835 	    (cqe_wqe_addr + wl->wl_srq_desc_off);
2836 
2837 	/*
2838 	 * Given the 'wqe_addr' just calculated and the srq buf address, we
2839 	 * find the 'wqe_index'.  The 'wre' returned below contains the WRID
2840 	 * that we are looking for.  This indexes into the wre_list for this
2841 	 * specific WQE.
2842 	 */
2843 	wqe_index = TAVOR_SRQ_WQE_INDEX(wl->wl_srq_wq_buf, wqe_addr,
2844 	    wl->wl_srq_log_wqesz);
2845 
2846 	/* ASSERT on impossible wqe_index values */
2847 	ASSERT(wqe_index < wl->wl_srq_wq_bufsz);
2848 
2849 	/* Get the pointer to this WQE */
2850 	wl_wqe = (uint64_t *)(uintptr_t)wqe_addr;
2851 
2852 	/* Put this WQE index back on the free list */
2853 	ddi_put32(wl->wl_acchdl, (uint32_t *)wl_wqe, wl->wl_free_list_indx);
2854 	wl->wl_free_list_indx = wqe_index;
2855 
2856 	/* Using the index, return the Work Request ID Entry (wre) */
2857 	wre = &wl->wl_wre[wqe_index];
2858 
2859 	return (wre);
2860 }
2861 
2862 
2863 /*
2864  * tavor_wrid_cq_reap()
2865  *    Context: Can be called from interrupt or base context.
2866  */
2867 void
2868 tavor_wrid_cq_reap(tavor_cqhdl_t cq)
2869 {
2870 	tavor_workq_hdr_t	*consume_wqhdr;
2871 	tavor_wrid_list_hdr_t	*container, *to_free;
2872 
2873 	ASSERT(MUTEX_HELD(&cq->cq_lock));
2874 
2875 	/* Lock the list of work queues associated with this CQ */
2876 	mutex_enter(&cq->cq_wrid_wqhdr_lock);
2877 
2878 	/* Walk the "reapable" list and free up containers */
2879 	container = cq->cq_wrid_reap_head;
2880 	while (container != NULL) {
2881 		to_free	  = container;
2882 		container = container->wl_reap_next;
2883 		/*
2884 		 * If reaping the WRID list containers pulls the last
2885 		 * container from the given work queue header, then we free
2886 		 * the work queue header as well.
2887 		 */
2888 		consume_wqhdr = tavor_wrid_list_reap(to_free);
2889 		if (consume_wqhdr != NULL) {
2890 			tavor_cq_wqhdr_remove(cq, consume_wqhdr);
2891 		}
2892 	}
2893 
2894 	/* Once finished reaping, we reset the CQ's reap list */
2895 	cq->cq_wrid_reap_head = cq->cq_wrid_reap_tail = NULL;
2896 
2897 	mutex_exit(&cq->cq_wrid_wqhdr_lock);
2898 }
2899 
2900 
2901 /*
2902  * tavor_wrid_cq_force_reap()
2903  *    Context: Can be called from interrupt or base context.
2904  */
2905 void
2906 tavor_wrid_cq_force_reap(tavor_cqhdl_t cq)
2907 {
2908 	tavor_workq_hdr_t	*curr;
2909 	tavor_wrid_list_hdr_t	*container, *to_free;
2910 	avl_tree_t		*treep;
2911 	void			*cookie = NULL;
2912 
2913 	ASSERT(MUTEX_HELD(&cq->cq_lock));
2914 
2915 	/*
2916 	 * The first step is to walk the "reapable" list and free up those
2917 	 * containers.  This is necessary because the containers on the
2918 	 * reapable list are not otherwise connected to the work queue headers
2919 	 * anymore.
2920 	 */
2921 	tavor_wrid_cq_reap(cq);
2922 
2923 	/* Now lock the list of work queues associated with this CQ */
2924 	mutex_enter(&cq->cq_wrid_wqhdr_lock);
2925 
2926 	/*
2927 	 * Walk the list of work queue headers and free up all the WRID list
2928 	 * containers chained to it.  Note: We don't need to grab the locks
2929 	 * for each of the individual WRID lists here because the only way
2930 	 * things can be added or removed from the list at this point would be
2931 	 * through post a work request to a QP.  But if we've come this far,
2932 	 * then we can be assured that there are no longer any QP associated
2933 	 * with the CQ that we are trying to free.
2934 	 */
2935 #ifdef __lock_lint
2936 	tavor_wrid_wqhdr_compare(NULL, NULL);
2937 #endif
2938 	treep = &cq->cq_wrid_wqhdr_avl_tree;
2939 	while ((curr = avl_destroy_nodes(treep, &cookie)) != NULL) {
2940 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*curr))
2941 		container = curr->wq_wrid_poll;
2942 		while (container != NULL) {
2943 			to_free	  = container;
2944 			container = container->wl_next;
2945 			/*
2946 			 * If reaping the WRID list containers pulls the last
2947 			 * container from the given work queue header, then
2948 			 * we free the work queue header as well.  Note: we
2949 			 * ignore the return value because we know that the
2950 			 * work queue header should always be freed once the
2951 			 * list of containers has come to an end.
2952 			 */
2953 			(void) tavor_wrid_list_reap(to_free);
2954 			if (container == NULL) {
2955 				tavor_cq_wqhdr_remove(cq, curr);
2956 			}
2957 		}
2958 	}
2959 	avl_destroy(treep);
2960 
2961 	mutex_exit(&cq->cq_wrid_wqhdr_lock);
2962 }
2963 
2964 
2965 /*
2966  * tavor_wrid_get_list()
2967  *    Context: Can be called from interrupt or base context.
2968  */
2969 tavor_wrid_list_hdr_t *
2970 tavor_wrid_get_list(uint32_t qsize)
2971 {
2972 	tavor_wrid_list_hdr_t	*wridlist;
2973 	uint32_t		size;
2974 
2975 	/*
2976 	 * The WRID list "container" consists of the tavor_wrid_list_hdr_t,
2977 	 * which holds the pointers necessary for maintaining the "reapable"
2978 	 * list, chaining together multiple "containers" old and new, and
2979 	 * tracking the head, tail, size, etc. for each container.
2980 	 *
2981 	 * The "container" also holds all the tavor_wrid_entry_t's, which is
2982 	 * allocated separately, one for each entry on the corresponding work
2983 	 * queue.
2984 	 */
2985 	size = sizeof (tavor_wrid_list_hdr_t);
2986 
2987 	/*
2988 	 * Note that this allocation has to be a NOSLEEP operation here
2989 	 * because we are holding the "wqhdr_list_lock" and, therefore,
2990 	 * could get raised to the interrupt level.
2991 	 */
2992 	wridlist = (tavor_wrid_list_hdr_t *)kmem_zalloc(size, KM_NOSLEEP);
2993 	if (wridlist == NULL) {
2994 		return (NULL);
2995 	}
2996 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wridlist))
2997 
2998 	/* Complete the "container" initialization */
2999 	wridlist->wl_size = qsize;
3000 	wridlist->wl_full = 0;
3001 	wridlist->wl_head = 0;
3002 	wridlist->wl_tail = 0;
3003 	wridlist->wl_wre = (tavor_wrid_entry_t *)kmem_zalloc(qsize *
3004 	    sizeof (tavor_wrid_entry_t), KM_NOSLEEP);
3005 	if (wridlist->wl_wre == NULL) {
3006 		kmem_free(wridlist, size);
3007 		return (NULL);
3008 	}
3009 	wridlist->wl_wre_old_tail  = NULL;
3010 	wridlist->wl_reap_next = NULL;
3011 	wridlist->wl_next  = NULL;
3012 	wridlist->wl_prev  = NULL;
3013 	wridlist->wl_srq_en = 0;
3014 
3015 	return (wridlist);
3016 }
3017 
3018 /*
3019  * tavor_wrid_list_srq_init()
3020  * Context: Can be called from interrupt or base context
3021  */
3022 void
3023 tavor_wrid_list_srq_init(tavor_wrid_list_hdr_t *wridlist, tavor_srqhdl_t srq,
3024     uint_t wq_start)
3025 {
3026 	uint64_t *wl_wqe;
3027 	int wqe_index;
3028 
3029 	ASSERT(MUTEX_HELD(&srq->srq_wrid_wql->wql_lock));
3030 
3031 	/* Setup pointers for use later when we are polling the CQ */
3032 	wridlist->wl_srq_wq_buf = srq->srq_wq_buf;
3033 	wridlist->wl_srq_wq_bufsz = srq->srq_wq_bufsz;
3034 	wridlist->wl_srq_log_wqesz = srq->srq_wq_log_wqesz;
3035 	wridlist->wl_srq_desc_off = srq->srq_desc_off;
3036 	wridlist->wl_acchdl = srq->srq_wqinfo.qa_acchdl;
3037 
3038 	/* Given wq_start to start initializing buf at, verify sanity */
3039 	ASSERT(wq_start >= 0 && wq_start < srq->srq_wq_bufsz);
3040 
3041 	/*
3042 	 * Initialize wridlist free list
3043 	 *
3044 	 * For each WQE up to the size of our queue, we store an index in the
3045 	 * WQE memory itself, representing the next available free entry.  The
3046 	 * 'wl_free_list_indx' always holds the index of the next available
3047 	 * free entry in the WQ.  If 'wl_free_list_indx' is -1, then we are
3048 	 * completely full.  This gives us the advantage of being able to have
3049 	 * entries complete or be polled off the WQ out-of-order.
3050 	 *
3051 	 * For now, we write the free_list entries inside the WQ itself.  It
3052 	 * may be useful in the future to store this information in a separate
3053 	 * structure for debugging purposes.
3054 	 */
3055 	for (wqe_index = wq_start; wqe_index < srq->srq_wq_bufsz; wqe_index++) {
3056 		wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, wqe_index);
3057 		ddi_put32(wridlist->wl_acchdl, (uint32_t *)wl_wqe,
3058 		    wridlist->wl_free_list_indx);
3059 		wridlist->wl_free_list_indx = wqe_index;
3060 	}
3061 }
3062 
3063 
3064 /*
3065  * tavor_wrid_reaplist_add()
3066  *    Context: Can be called from interrupt or base context.
3067  */
3068 static void
3069 tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq)
3070 {
3071 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3072 
3073 	mutex_enter(&wq->wq_wrid_wql->wql_lock);
3074 
3075 	/*
3076 	 * Add the "post" container (the last one on the current chain) to
3077 	 * the CQ's "reapable" list
3078 	 */
3079 	if ((cq->cq_wrid_reap_head == NULL) &&
3080 	    (cq->cq_wrid_reap_tail == NULL)) {
3081 		cq->cq_wrid_reap_head = wq->wq_wrid_post;
3082 		cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3083 	} else {
3084 		cq->cq_wrid_reap_tail->wl_reap_next = wq->wq_wrid_post;
3085 		cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3086 	}
3087 
3088 	mutex_exit(&wq->wq_wrid_wql->wql_lock);
3089 }
3090 
3091 
3092 int
3093 tavor_wrid_wqhdr_compare(const void *p1, const void *p2)
3094 {
3095 	tavor_workq_compare_t	*cmpp;
3096 	tavor_workq_hdr_t	*curr;
3097 
3098 	cmpp = (tavor_workq_compare_t *)p1;
3099 	curr = (tavor_workq_hdr_t *)p2;
3100 
3101 	if (cmpp->cmp_qpn < curr->wq_qpn)
3102 		return (-1);
3103 	else if (cmpp->cmp_qpn > curr->wq_qpn)
3104 		return (+1);
3105 	else if (cmpp->cmp_type < curr->wq_type)
3106 		return (-1);
3107 	else if (cmpp->cmp_type > curr->wq_type)
3108 		return (+1);
3109 	else
3110 		return (0);
3111 }
3112 
3113 
3114 /*
3115  * tavor_wrid_wqhdr_find()
3116  *    Context: Can be called from interrupt or base context.
3117  */
3118 static tavor_workq_hdr_t *
3119 tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type)
3120 {
3121 	tavor_workq_hdr_t	*curr;
3122 	tavor_workq_compare_t	cmp;
3123 
3124 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3125 
3126 	/*
3127 	 * Walk the CQ's work queue list, trying to find a send or recv queue
3128 	 * with the same QP number.  We do this even if we are going to later
3129 	 * create a new entry because it helps us easily find the end of the
3130 	 * list.
3131 	 */
3132 	cmp.cmp_qpn = qpn;
3133 	cmp.cmp_type = wq_type;
3134 #ifdef __lock_lint
3135 	tavor_wrid_wqhdr_compare(NULL, NULL);
3136 #endif
3137 	curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
3138 
3139 	return (curr);
3140 }
3141 
3142 
3143 /*
3144  * tavor_wrid_wqhdr_create()
3145  *    Context: Can be called from interrupt or base context.
3146  */
3147 static tavor_workq_hdr_t *
3148 tavor_wrid_wqhdr_create(tavor_state_t *state, tavor_cqhdl_t cq, uint_t qpn,
3149     uint_t wq_type, uint_t create_wql)
3150 {
3151 	tavor_workq_hdr_t	*wqhdr_tmp;
3152 
3153 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3154 
3155 	/*
3156 	 * Allocate space for a work queue header structure and initialize it.
3157 	 * Each work queue header structure includes a "wq_wrid_wql"
3158 	 * which needs to be initialized.  Note that this allocation has to be
3159 	 * a NOSLEEP operation because we are holding the "cq_wrid_wqhdr_lock"
3160 	 * and, therefore, could get raised to the interrupt level.
3161 	 */
3162 	wqhdr_tmp = (tavor_workq_hdr_t *)kmem_zalloc(
3163 	    sizeof (tavor_workq_hdr_t), KM_NOSLEEP);
3164 	if (wqhdr_tmp == NULL) {
3165 		return (NULL);
3166 	}
3167 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr_tmp))
3168 	wqhdr_tmp->wq_qpn	= qpn;
3169 	wqhdr_tmp->wq_type	= wq_type;
3170 
3171 	if (create_wql) {
3172 		wqhdr_tmp->wq_wrid_wql = tavor_wrid_wql_create(state);
3173 		if (wqhdr_tmp->wq_wrid_wql == NULL) {
3174 			kmem_free(wqhdr_tmp, sizeof (tavor_workq_hdr_t));
3175 			return (NULL);
3176 		}
3177 	}
3178 
3179 	wqhdr_tmp->wq_wrid_poll = NULL;
3180 	wqhdr_tmp->wq_wrid_post = NULL;
3181 
3182 	/* Chain the newly allocated work queue header to the CQ's list */
3183 	tavor_cq_wqhdr_add(cq, wqhdr_tmp);
3184 
3185 	return (wqhdr_tmp);
3186 }
3187 
3188 
3189 /*
3190  * tavor_wrid_wql_create()
3191  *    Context: Can be called from interrupt or base context.
3192  */
3193 tavor_wq_lock_t *
3194 tavor_wrid_wql_create(tavor_state_t *state)
3195 {
3196 	tavor_wq_lock_t *wql;
3197 
3198 	/*
3199 	 * Allocate the WQL and initialize it.
3200 	 */
3201 	wql = kmem_zalloc(sizeof (tavor_wq_lock_t), KM_NOSLEEP);
3202 	if (wql == NULL) {
3203 		return (NULL);
3204 	}
3205 
3206 	mutex_init(&wql->wql_lock, NULL, MUTEX_DRIVER,
3207 	    DDI_INTR_PRI(state->ts_intrmsi_pri));
3208 
3209 	/* Take the initial reference on the WQL */
3210 	tavor_wql_refcnt_inc(wql);
3211 
3212 	return (wql);
3213 }
3214 
3215 
3216 /*
3217  * tavor_wrid_get_wqeaddrsz()
3218  *    Context: Can be called from interrupt or base context.
3219  */
3220 static uint32_t
3221 tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq)
3222 {
3223 	tavor_wrid_entry_t	*wre;
3224 	uint32_t		wqeaddrsz;
3225 	uint32_t		head;
3226 
3227 	/*
3228 	 * If the container is empty, then there is no next entry. So just
3229 	 * return zero.  Note: the "head == tail" condition here can only
3230 	 * mean that the container is empty because we have previously pulled
3231 	 * something from the container.
3232 	 *
3233 	 * If the container is not empty, then find the next entry and return
3234 	 * the contents of its "wqeaddrsz" field.
3235 	 */
3236 	if (wq->wq_wrid_poll->wl_head == wq->wq_wrid_poll->wl_tail) {
3237 		wqeaddrsz = 0;
3238 	} else {
3239 		/*
3240 		 * We don't need to calculate the "next" head pointer here
3241 		 * because "head" should already point to the next entry on
3242 		 * the list (since we just pulled something off - in
3243 		 * tavor_wrid_find_match() - and moved the head index forward.)
3244 		 */
3245 		head = wq->wq_wrid_poll->wl_head;
3246 		wre = &wq->wq_wrid_poll->wl_wre[head];
3247 		wqeaddrsz = wre->wr_wqeaddrsz;
3248 	}
3249 	return (wqeaddrsz);
3250 }
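
/*
 * Illustrative sketch (guarded out, not compiled): a simplified form of the
 * head-index advance that tavor_wrid_find_match() performs when it pulls an
 * entry from the container, which is why the routine above can read
 * "wl_head" directly.  The modulo form assumes "wl_size" entries with
 * wrap-around indexing; the real code may use an equivalent masking scheme.
 */
#if 0
	wq->wq_wrid_poll->wl_head =
	    (wq->wq_wrid_poll->wl_head + 1) % wq->wq_wrid_poll->wl_size;
#endif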
3251 
3252 
3253 /*
3254  * tavor_wrid_wqhdr_add()
3255  *    Context: Can be called from interrupt or base context.
3256  */
3257 static void
3258 tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
3259     tavor_wrid_list_hdr_t *wridlist)
3260 {
3261 	ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3262 
3263 	/* Chain the new WRID list "container" to the work queue list */
3264 	if ((wqhdr->wq_wrid_post == NULL) &&
3265 	    (wqhdr->wq_wrid_poll == NULL)) {
3266 		wqhdr->wq_wrid_poll = wridlist;
3267 		wqhdr->wq_wrid_post = wridlist;
3268 	} else {
3269 		wqhdr->wq_wrid_post->wl_next = wridlist;
3270 		wridlist->wl_prev = wqhdr->wq_wrid_post;
3271 		wqhdr->wq_wrid_post = wridlist;
3272 	}
3273 }
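
/*
 * Illustrative sketch (guarded out, not compiled): the invariant maintained
 * by tavor_wrid_wqhdr_add() above.  The containers form a chain from the
 * oldest ("wq_wrid_poll") to the newest ("wq_wrid_post") through their
 * "wl_next" pointers, so a walk like the one below visits them in the order
 * in which they were added.
 */
#if 0
	tavor_wrid_list_hdr_t	*wl;

	ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));

	for (wl = wqhdr->wq_wrid_poll; wl != NULL; wl = wl->wl_next) {
		/* visit each WRID list container, oldest first */
	}
#endif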
3274 
3275 
3276 /*
3277  * tavor_wrid_wqhdr_remove()
3278  *    Context: Can be called from interrupt or base context.
3279  *
3280  *    Note: this is only called to remove the most recently added WRID list
3281  *    container (i.e. in tavor_from_reset() above)
3282  */
3283 static void
3284 tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
3285     tavor_wrid_list_hdr_t *wridlist)
3286 {
3287 	tavor_wrid_list_hdr_t	*prev, *next;
3288 
3289 	ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3290 
3291 	/* Unlink the WRID list "container" from the work queue list */
3292 	prev = wridlist->wl_prev;
3293 	next = wridlist->wl_next;
3294 	if (prev != NULL) {
3295 		prev->wl_next = next;
3296 	}
3297 	if (next != NULL) {
3298 		next->wl_prev = prev;
3299 	}
3300 
3301 	/*
3302 	 * Update any pointers in the work queue hdr that may point to this
3303 	 * WRID list container
3304 	 */
3305 	if (wqhdr->wq_wrid_post == wridlist) {
3306 		wqhdr->wq_wrid_post = prev;
3307 	}
3308 	if (wqhdr->wq_wrid_poll == wridlist) {
3309 		wqhdr->wq_wrid_poll = NULL;
3310 	}
3311 }
3312 
3313 
3314 /*
3315  * tavor_wrid_list_reap()
3316  *    Context: Can be called from interrupt or base context.
3317  *    Note: The "wqhdr_list_lock" must be held.
3318  */
3319 static tavor_workq_hdr_t *
3320 tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wridlist)
3321 {
3322 	tavor_workq_hdr_t	*wqhdr, *consume_wqhdr = NULL;
3323 	tavor_wrid_list_hdr_t	*prev, *next;
3324 	uint32_t		size;
3325 
3326 	/* Get the back pointer to the work queue header (see below) */
3327 	wqhdr = wridlist->wl_wqhdr;
3328 	mutex_enter(&wqhdr->wq_wrid_wql->wql_lock);
3329 
3330 	/* Unlink the WRID list "container" from the work queue list */
3331 	prev = wridlist->wl_prev;
3332 	next = wridlist->wl_next;
3333 	if (prev != NULL) {
3334 		prev->wl_next = next;
3335 	}
3336 	if (next != NULL) {
3337 		next->wl_prev = prev;
3338 	}
3339 
3340 	/*
3341 	 * If the back pointer to the work queue header shows that it
3342 	 * was pointing to the entry we are about to remove, then the work
3343 	 * queue header is reapable as well.
3344 	 */
3345 	if ((wqhdr->wq_wrid_poll == wridlist) &&
3346 	    (wqhdr->wq_wrid_post == wridlist)) {
3347 		consume_wqhdr = wqhdr;
3348 	}
3349 
3350 	/* Be sure to update the "poll" and "post" container pointers */
3351 	if (wqhdr->wq_wrid_poll == wridlist) {
3352 		wqhdr->wq_wrid_poll = next;
3353 	}
3354 	if (wqhdr->wq_wrid_post == wridlist) {
3355 		wqhdr->wq_wrid_post = NULL;
3356 	}
3357 
3358 	/* Calculate the size and free the container */
3359 	size = (wridlist->wl_size * sizeof (tavor_wrid_entry_t));
3360 	kmem_free(wridlist->wl_wre, size);
3361 	kmem_free(wridlist, sizeof (tavor_wrid_list_hdr_t));
3362 
3363 	mutex_exit(&wqhdr->wq_wrid_wql->wql_lock);
3364 
3365 	return (consume_wqhdr);
3366 }
3367 
3368 
3369 /*
3370  * tavor_wrid_wqhdr_lock_both()
3371  *    Context: Can be called from interrupt or base context.
3372  */
3373 static void
3374 tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp)
3375 {
3376 	tavor_cqhdl_t	sq_cq, rq_cq;
3377 
3378 	sq_cq = qp->qp_sq_cqhdl;
3379 	rq_cq = qp->qp_rq_cqhdl;
3380 
3381 _NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3382 _NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3383 
3384 	/*
3385 	 * If both work queues (send and recv) share a completion queue, then
3386 	 * grab the common lock.  If they use different CQs (and, hence,
3387 	 * different "cq_wrid_wqhdr_lock" mutexes), then grab the send CQ's
3388 	 * lock first and the receive CQ's lock second.  This order is kept
3389 	 * consistent (and reversed on release in tavor_wrid_wqhdr_unlock_both()
3390 	 * below) to avoid introducing any kind of deadlock condition.  Note:
3391 	 * We add the "__lock_lint" code here to fake out warlock into thinking
3392 	 * we've grabbed both locks (when, in fact, we only needed the one).
3393 	 */
3394 	if (sq_cq == rq_cq) {
3395 		mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3396 #ifdef	__lock_lint
3397 		mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3398 #endif
3399 	} else {
3400 		mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3401 		mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3402 	}
3403 }
3404 
3405 /*
3406  * tavor_wrid_wqhdr_unlock_both()
3407  *    Context: Can be called from interrupt or base context.
3408  */
3409 static void
3410 tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp)
3411 {
3412 	tavor_cqhdl_t	sq_cq, rq_cq;
3413 
3414 	sq_cq = qp->qp_sq_cqhdl;
3415 	rq_cq = qp->qp_rq_cqhdl;
3416 
3417 _NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3418 _NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3419 
3420 	/*
3421 	 * See tavor_wrid_wqhdr_lock_both() above for more detail
3422 	 */
3423 	if (sq_cq == rq_cq) {
3424 #ifdef	__lock_lint
3425 		mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3426 #endif
3427 		mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3428 	} else {
3429 		mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3430 		mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3431 	}
3432 }
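
/*
 * Illustrative sketch (guarded out, not compiled): the usual bracket that
 * callers place around manipulation of a QP's send and receive work queue
 * headers, relying on the two routines above to take and drop the CQ
 * "cq_wrid_wqhdr_lock" mutexes in a deadlock-safe order.
 */
#if 0
	tavor_wrid_wqhdr_lock_both(qp);

	/* ... find, create, or remove wqhdr entries on the QP's CQs ... */

	tavor_wrid_wqhdr_unlock_both(qp);
#endif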
3433 
3434 
3435 /*
3436  * tavor_cq_wqhdr_add()
3437  *    Context: Can be called from interrupt or base context.
3438  */
3439 static void
3440 tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3441 {
3442 	tavor_workq_compare_t	cmp;
3443 	avl_index_t		where;
3444 
3445 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3446 
3447 	cmp.cmp_qpn = wqhdr->wq_qpn;
3448 	cmp.cmp_type = wqhdr->wq_type;
3449 #ifdef __lock_lint
3450 	tavor_wrid_wqhdr_compare(NULL, NULL);
3451 #endif
3452 	(void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
3453 	/*
3454 	 * Insert the new work queue header into the CQ's AVL tree of work
3455 	 * queue headers at the position located by the avl_find() above.
3456 	 */
3457 	avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqhdr, where);
3458 }
3459 
3460 
3461 /*
3462  * tavor_cq_wqhdr_remove()
3463  *    Context: Can be called from interrupt or base context.
3464  */
3465 static void
3466 tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3467 {
3468 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3469 
3470 #ifdef __lock_lint
3471 	tavor_wrid_wqhdr_compare(NULL, NULL);
3472 #endif
3473 	/* Remove "wqhdr" from the work queue header list on "cq" */
3474 	avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqhdr);
3475 
3476 	/*
3477 	 * Release reference to WQL; If this is the last reference, this call
3478 	 * also has the side effect of freeing up the 'wq_wrid_wql' memory.
3479 	 */
3480 	tavor_wql_refcnt_dec(wqhdr->wq_wrid_wql);
3481 
3482 	/* Free the memory associated with "wqhdr" */
3483 	kmem_free(wqhdr, sizeof (tavor_workq_hdr_t));
3484 }
3485 
3486 
3487 /*
3488  * tavor_wql_refcnt_inc()
3489  * Context: Can be called from interrupt or base context
3490  */
3491 void
3492 tavor_wql_refcnt_inc(tavor_wq_lock_t *wql)
3493 {
3494 	ASSERT(wql != NULL);
3495 
3496 	mutex_enter(&wql->wql_lock);
3497 	wql->wql_refcnt++;
3498 	mutex_exit(&wql->wql_lock);
3499 }
3500 
3501 /*
3502  * tavor_wql_refcnt_dec()
3503  * Context: Can be called from interrupt or base context
3504  */
3505 void
3506 tavor_wql_refcnt_dec(tavor_wq_lock_t *wql)
3507 {
3508 	int	refcnt;
3509 
3510 	ASSERT(wql != NULL);
3511 
3512 	mutex_enter(&wql->wql_lock);
3513 	wql->wql_refcnt--;
3514 	refcnt = wql->wql_refcnt;
3515 	mutex_exit(&wql->wql_lock);
3516 
3517 	/*
3518 	 * Free up the WQL memory if we were holding the last reference to
3519 	 * this structure.  The reference count was sampled above while the
3520 	 * "wql_lock" was still held.
3521 	 */
3522 	if (refcnt == 0) {
3523 		mutex_destroy(&wql->wql_lock);
3524 		kmem_free(wql, sizeof (tavor_wq_lock_t));
3525 	}
3526 }
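
/*
 * Illustrative sketch (guarded out, not compiled): the WQL reference count
 * lifecycle implied by the two routines above.  tavor_wrid_wql_create()
 * returns with one reference already held; any additional work queue header
 * that shares the WQL takes its own reference; and each holder (for example,
 * via tavor_cq_wqhdr_remove() above) drops one reference when it is torn
 * down, with the final decrement destroying the mutex and freeing the
 * structure.  Error handling is omitted from this fragment.
 */
#if 0
	tavor_wq_lock_t	*wql;

	/* Creation returns with one reference held */
	wql = tavor_wrid_wql_create(state);

	/* A second work queue header sharing this WQL takes a reference */
	tavor_wql_refcnt_inc(wql);

	/* Each holder eventually drops its reference ... */
	tavor_wql_refcnt_dec(wql);

	/* ... and the last decrement frees the WQL */
	tavor_wql_refcnt_dec(wql);
#endif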
3527