1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * hermon_wr.c
29  *    Hermon Work Request Processing Routines
30  *
31  *    Implements all the routines necessary to provide the PostSend(),
32  *    PostRecv() and PostSRQ() verbs.  Also contains all the code
33  *    necessary to implement the Hermon WRID tracking mechanism.
34  */
35 
36 #include <sys/types.h>
37 #include <sys/conf.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/modctl.h>
41 #include <sys/avl.h>
42 
43 #include <sys/ib/adapters/hermon/hermon.h>
44 
45 static uint32_t hermon_wr_get_immediate(ibt_send_wr_t *wr);
46 static int hermon_wr_bind_check(hermon_state_t *state, ibt_send_wr_t *wr);
47 static int hermon_wqe_send_build(hermon_state_t *state, hermon_qphdl_t qp,
48     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
49 static int hermon_wqe_mlx_build(hermon_state_t *state, hermon_qphdl_t qp,
50     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
51 static void hermon_wqe_headroom(uint_t from, hermon_qphdl_t qp);
52 static int hermon_wqe_recv_build(hermon_state_t *state, hermon_qphdl_t qp,
53     ibt_recv_wr_t *wr, uint64_t *desc);
54 static int hermon_wqe_srq_build(hermon_state_t *state, hermon_srqhdl_t srq,
55     ibt_recv_wr_t *wr, uint64_t *desc);
56 static void hermon_wqe_sync(void *hdl, uint_t sync_from,
57     uint_t sync_to, uint_t sync_type, uint_t flag);
58 static hermon_workq_avl_t *hermon_wrid_wqavl_find(hermon_cqhdl_t cq, uint_t qpn,
59     uint_t send_or_recv);
60 static void hermon_cq_workq_add(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl);
61 static void hermon_cq_workq_remove(hermon_cqhdl_t cq,
62     hermon_workq_avl_t *wqavl);
63 
64 static	ibt_wr_ds_t	null_sgl = { 0, 0x00000100, 0 };
65 
66 static int
67 hermon_post_send_ud(hermon_state_t *state, hermon_qphdl_t qp,
68     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
69 {
70 	hermon_hw_snd_wqe_ud_t		*ud;
71 	hermon_workq_hdr_t		*wq;
72 	hermon_ahhdl_t			ah;
73 	ibt_ud_dest_t			*dest;
74 	uint64_t			*desc;
75 	uint32_t			desc_sz;
76 	uint32_t			signaled_dbd, solicited;
77 	uint32_t			head, tail, next_tail, qsize_msk;
78 	uint32_t			hdrmwqes;
79 	uint32_t			nopcode, fence, immed_data = 0;
80 	hermon_hw_wqe_sgl_t		*ds, *old_ds;
81 	ibt_wr_ds_t			*sgl;
82 	uint32_t			nds, dnds;
83 	int				i, j, last_ds, num_ds, status;
84 	uint32_t			*wqe_start;
85 	int				sectperwqe;
86 	uint_t				posted_cnt = 0;
87 
88 	/* initialize the FMA retry loop */
89 	hermon_pio_init(fm_loop_cnt, fm_status, fm_test_num);
90 
91 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
92 	_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&qp->qp_sq_lock))
93 
94 	/* Grab the lock for the WRID list */
95 	membar_consumer();
96 
97 	/* Save away some initial QP state */
98 	wq = qp->qp_sq_wqhdr;
99 	qsize_msk = wq->wq_mask;
100 	hdrmwqes  = qp->qp_sq_hdrmwqes;		/* in WQEs  */
101 	sectperwqe = 1 << (qp->qp_sq_log_wqesz - 2);
102 
103 	tail	  = wq->wq_tail;
104 	head	  = wq->wq_head;
105 	status	  = DDI_SUCCESS;
106 
107 post_next:
108 	/*
109 	 * Check for "queue full" condition.  If the queue
110 	 * is already full, then no more WQEs can be posted.
111 	 * So break out, ring a doorbell (if necessary) and
112 	 * return an error
113 	 */
114 	if (wq->wq_full != 0) {
115 		status = IBT_QP_FULL;
116 		goto done;
117 	}
118 
119 	next_tail = (tail + 1) & qsize_msk;
120 	if (((tail + hdrmwqes) & qsize_msk) == head) {
121 		wq->wq_full = 1;
122 	}
123 
124 	desc = HERMON_QP_SQ_ENTRY(qp, tail);
125 
126 	ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
127 	    sizeof (hermon_hw_snd_wqe_ctrl_t));
128 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
129 	    sizeof (hermon_hw_snd_wqe_ud_t));
130 	nds = wr->wr_nds;
131 	sgl = wr->wr_sgl;
132 	num_ds = 0;
133 
134 	/* need to know the count of destination nds for backward loop */
135 	for (dnds = 0, i = 0; i < nds; i++) {
136 		if (sgl[i].ds_len != 0)
137 			dnds++;
138 	}
139 
140 	/*
141 	 * Build a Send or Send_LSO WQE
142 	 */
143 	if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
144 		int total_len;
145 
146 		nopcode = HERMON_WQE_SEND_NOPCODE_LSO;
147 		if (wr->wr.ud_lso.lso_hdr_sz > 60) {
148 			nopcode |= (1 << 6);	/* ReRead bit must be set */
149 		}
150 		dest = wr->wr.ud_lso.lso_ud_dest;
151 		ah = (hermon_ahhdl_t)dest->ud_ah;
152 		if (ah == NULL) {
153 			status = IBT_AH_HDL_INVALID;
154 			goto done;
155 		}
156 		HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
157 
158 		total_len = (4 + 0xf + wr->wr.ud_lso.lso_hdr_sz) & ~0xf;
159 		if ((uintptr_t)ds + total_len + (nds * 16) >
160 		    (uintptr_t)desc + (1 << qp->qp_sq_log_wqesz)) {
161 			status = IBT_QP_SGL_LEN_INVALID;
162 			goto done;
163 		}
164 		old_ds = ds;
165 		bcopy(wr->wr.ud_lso.lso_hdr, (uint32_t *)old_ds + 1,
166 		    wr->wr.ud_lso.lso_hdr_sz);
167 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + total_len);
168 		i = 0;
169 	} else if (wr->wr_opcode == IBT_WRC_SEND) {
170 		if (wr->wr_flags & IBT_WR_SEND_IMMED) {
171 			nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
172 			immed_data = wr->wr.ud.udwr_immed;
173 		} else {
174 			nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
175 		}
176 		dest = wr->wr.ud.udwr_dest;
177 		ah = (hermon_ahhdl_t)dest->ud_ah;
178 		if (ah == NULL) {
179 			status = IBT_AH_HDL_INVALID;
180 			goto done;
181 		}
182 		HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
183 		i = 0;
184 	} else {
185 		status = IBT_QP_OP_TYPE_INVALID;
186 		goto done;
187 	}
188 
189 	if (nds > qp->qp_sq_sgl) {
190 		status = IBT_QP_SGL_LEN_INVALID;
191 		goto done;
192 	}
193 	for (last_ds = num_ds, j = i; j < nds; j++) {
194 		if (sgl[j].ds_len != 0)
195 			last_ds++;	/* real last ds of wqe to fill */
196 	}
197 	desc_sz = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc) >> 0x4;
198 	for (j = nds; --j >= i; ) {
199 		if (sgl[j].ds_len == 0) {
200 			continue;
201 		}
202 
203 		/*
204 		 * Fill in the Data Segment(s) for the current WQE, using the
205 		 * information contained in the scatter-gather list of the
206 		 * work request.
207 		 */
208 		last_ds--;
209 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[j]);
210 	}
211 
212 	membar_producer();
213 
214 	if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
215 		HERMON_WQE_BUILD_LSO(qp, old_ds, wr->wr.ud_lso.lso_mss,
216 		    wr->wr.ud_lso.lso_hdr_sz);
217 	}
218 
219 	fence = (wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
220 
221 	signaled_dbd = ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
222 	    (wr->wr_flags & IBT_WR_SEND_SIGNAL)) ? 1 : 0;
223 
224 	solicited = (wr->wr_flags & IBT_WR_SEND_SOLICIT) ? 1 : 0;
225 
226 	HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz, fence, immed_data,
227 	    solicited, signaled_dbd, wr->wr_flags & IBT_WR_SEND_CKSUM, qp);
228 
229 	wq->wq_wrid[tail] = wr->wr_id;
230 
231 	tail = next_tail;
232 
233 	/* Update some of the state in the QP */
234 	wq->wq_tail = tail;
235 
236 	membar_producer();
237 
238 	/* Now set the ownership bit and opcode (first dword). */
239 	HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)desc, nopcode);
240 
241 	posted_cnt++;
242 	if (--num_wr > 0) {
243 		/* do the invalidate of the headroom */
244 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
245 		    (tail + hdrmwqes) & qsize_msk);
246 		for (i = 16; i < sectperwqe; i += 16) {
247 			wqe_start[i] = 0xFFFFFFFF;
248 		}
249 
250 		wr++;
251 		goto post_next;
252 	}
253 done:
254 	if (posted_cnt != 0) {
255 		ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
256 
257 		membar_producer();
258 
259 		/* the FMA retry loop starts for Hermon doorbell register. */
260 		hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
261 		    fm_status, fm_test_num);
262 
263 		HERMON_UAR_DOORBELL(state, uarhdl,
264 		    (uint64_t *)(void *)&state->hs_uar->send,
265 		    (uint64_t)qp->qp_ring);
266 
267 		/* the FMA retry loop ends. */
268 		hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
269 		    fm_status, fm_test_num);
270 
271 		/* do the invalidate of the headroom */
272 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
273 		    (tail + hdrmwqes) & qsize_msk);
274 		for (i = 16; i < sectperwqe; i += 16) {
275 			wqe_start[i] = 0xFFFFFFFF;
276 		}
277 	}
278 	if (num_posted != NULL)
279 		*num_posted = posted_cnt;
280 
281 	mutex_exit(&qp->qp_sq_lock);
282 
283 	return (status);
284 
285 pio_error:
286 	mutex_exit(&qp->qp_sq_lock);
287 	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
288 	return (ibc_get_ci_failure(0));
289 }
290 
291 static int
292 hermon_post_send_rc(hermon_state_t *state, hermon_qphdl_t qp,
293     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
294 {
295 	uint64_t			*desc;
296 	hermon_workq_hdr_t		*wq;
297 	uint32_t			desc_sz;
298 	uint32_t			signaled_dbd, solicited;
299 	uint32_t			head, tail, next_tail, qsize_msk;
300 	uint32_t			hdrmwqes;
301 	int				status;
302 	uint32_t			nopcode, fence, immed_data = 0;
303 	hermon_hw_snd_wqe_remaddr_t	*rc;
304 	hermon_hw_snd_wqe_atomic_t	*at;
305 	hermon_hw_snd_wqe_bind_t	*bn;
306 	hermon_hw_wqe_sgl_t		*ds;
307 	ibt_wr_ds_t			*sgl;
308 	uint32_t			nds;
309 	int				i, last_ds, num_ds;
310 	uint32_t			*wqe_start;
311 	int				sectperwqe;
312 	uint_t				posted_cnt = 0;
313 
314 	/* initialize the FMA retry loop */
315 	hermon_pio_init(fm_loop_cnt, fm_status, fm_test_num);
316 
317 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
318 	_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&qp->qp_sq_lock))
319 
320 	/* make sure we see any update of wq_head */
321 	membar_consumer();
322 
323 	/* Save away some initial QP state */
324 	wq = qp->qp_sq_wqhdr;
325 	qsize_msk = wq->wq_mask;
326 	hdrmwqes  = qp->qp_sq_hdrmwqes;		/* in WQEs  */
327 	sectperwqe = 1 << (qp->qp_sq_log_wqesz - 2);
328 
329 	tail	  = wq->wq_tail;
330 	head	  = wq->wq_head;
331 	status	  = DDI_SUCCESS;
332 
333 post_next:
334 	/*
335 	 * Check for "queue full" condition.  If the queue
336 	 * is already full, then no more WQEs can be posted.
337 	 * So break out, ring a doorbell (if necessary) and
338 	 * return an error
339 	 */
340 	if (wq->wq_full != 0) {
341 		status = IBT_QP_FULL;
342 		goto done;
343 	}
344 	next_tail = (tail + 1) & qsize_msk;
345 	if (((tail + hdrmwqes) & qsize_msk) == head) {
346 		wq->wq_full = 1;
347 	}
348 
349 	desc = HERMON_QP_SQ_ENTRY(qp, tail);
350 
351 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
352 	    sizeof (hermon_hw_snd_wqe_ctrl_t));
353 	nds = wr->wr_nds;
354 	sgl = wr->wr_sgl;
355 	num_ds = 0;
356 
357 	/*
358 	 * Validate the operation type.  For RC requests, we allow
359 	 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
360 	 * operations, and memory window "Bind"
361 	 */
362 	switch (wr->wr_opcode) {
363 	default:
364 		status = IBT_QP_OP_TYPE_INVALID;
365 		goto done;
366 
367 	case IBT_WRC_SEND:
368 		if (wr->wr_flags & IBT_WR_SEND_IMMED) {
369 			nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
370 			immed_data = wr->wr.rc.rcwr.send_immed;
371 		} else {
372 			nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
373 		}
374 		break;
375 
376 	/*
377 	 * If this is an RDMA Read or RDMA Write request, then fill
378 	 * in the "Remote Address" header fields.
379 	 */
380 	case IBT_WRC_RDMAW:
381 		if (wr->wr_flags & IBT_WR_SEND_IMMED) {
382 			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAWI;
383 			immed_data = wr->wr.rc.rcwr.rdma.rdma_immed;
384 		} else {
385 			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAW;
386 		}
387 		/* FALLTHROUGH */
388 	case IBT_WRC_RDMAR:
389 		if (wr->wr_opcode == IBT_WRC_RDMAR)
390 			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAR;
391 		rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
392 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
393 
394 		/*
395 		 * Build the Remote Address Segment for the WQE, using
396 		 * the information from the RC work request.
397 		 */
398 		HERMON_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
399 
400 		/* Update "ds" for filling in Data Segments (below) */
401 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)rc +
402 		    sizeof (hermon_hw_snd_wqe_remaddr_t));
403 		break;
404 
405 	/*
406 	 * If this is one of the Atomic type operations (i.e
407 	 * Compare-Swap or Fetch-Add), then fill in both the "Remote
408 	 * Address" header fields and the "Atomic" header fields.
409 	 */
410 	case IBT_WRC_CSWAP:
411 		nopcode = HERMON_WQE_SEND_NOPCODE_ATMCS;
412 		/* FALLTHROUGH */
413 	case IBT_WRC_FADD:
414 		if (wr->wr_opcode == IBT_WRC_FADD)
415 			nopcode = HERMON_WQE_SEND_NOPCODE_ATMFA;
416 		rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
417 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
418 		at = (hermon_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
419 		    sizeof (hermon_hw_snd_wqe_remaddr_t));
420 
421 		/*
422 		 * Build the Remote Address and Atomic Segments for
423 		 * the WQE, using the information from the RC Atomic
424 		 * work request.
425 		 */
426 		HERMON_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
427 		HERMON_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
428 
429 		/* Update "ds" for filling in Data Segments (below) */
430 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)at +
431 		    sizeof (hermon_hw_snd_wqe_atomic_t));
432 
433 		/*
434 		 * Update "nds" and "sgl" because Atomic requests have
435 		 * only a single Data Segment.
436 		 */
437 		nds = 1;
438 		sgl = wr->wr_sgl;
439 		break;
440 
441 	/*
442 	 * If this is memory window Bind operation, then we call the
443 	 * hermon_wr_bind_check() routine to validate the request and
444 	 * to generate the updated RKey.  If this is successful, then
445 	 * we fill in the WQE's "Bind" header fields.
446 	 */
447 	case IBT_WRC_BIND:
448 		nopcode = HERMON_WQE_SEND_NOPCODE_BIND;
449 		status = hermon_wr_bind_check(state, wr);
450 		if (status != DDI_SUCCESS)
451 			goto done;
452 
453 		bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
454 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
455 
456 		/*
457 		 * Build the Bind Memory Window Segments for the WQE,
458 		 * using the information from the RC Bind memory
459 		 * window work request.
460 		 */
461 		HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
462 
463 		/*
464 		 * Update the "ds" pointer.  Even though the "bind"
465 		 * operation requires no SGLs, this is necessary to
466 		 * facilitate the correct descriptor size calculations
467 		 * (below).
468 		 */
469 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
470 		    sizeof (hermon_hw_snd_wqe_bind_t));
471 		nds = 0;
472 	}
473 
474 	/*
475 	 * Now fill in the Data Segments (SGL) for the Send WQE based
476 	 * on the values setup above (i.e. "sgl", "nds", and the "ds"
477 	 * pointer. Start by checking for a valid number of SGL entries
478 	 */
479 	if (nds > qp->qp_sq_sgl) {
480 		status = IBT_QP_SGL_LEN_INVALID;
481 		goto done;
482 	}
483 
484 	for (last_ds = num_ds, i = 0; i < nds; i++) {
485 		if (sgl[i].ds_len != 0)
486 			last_ds++;	/* real last ds of wqe to fill */
487 	}
488 	desc_sz = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc) >> 0x4;
489 	for (i = nds; --i >= 0; ) {
490 		if (sgl[i].ds_len == 0) {
491 			continue;
492 		}
493 
494 		/*
495 		 * Fill in the Data Segment(s) for the current WQE, using the
496 		 * information contained in the scatter-gather list of the
497 		 * work request.
498 		 */
499 		last_ds--;
500 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[i]);
501 	}
502 
503 	fence = (wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
504 
505 	signaled_dbd = ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
506 	    (wr->wr_flags & IBT_WR_SEND_SIGNAL)) ? 1 : 0;
507 
508 	solicited = (wr->wr_flags & IBT_WR_SEND_SOLICIT) ? 1 : 0;
509 
510 	HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz, fence, immed_data, solicited,
511 	    signaled_dbd, wr->wr_flags & IBT_WR_SEND_CKSUM, qp);
512 
513 	wq->wq_wrid[tail] = wr->wr_id;
514 
515 	tail = next_tail;
516 
517 	/* Update some of the state in the QP */
518 	wq->wq_tail = tail;
519 
520 	membar_producer();
521 
522 	/* Now set the ownership bit of the first one in the chain. */
523 	HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)desc, nopcode);
524 
525 	posted_cnt++;
526 	if (--num_wr > 0) {
527 		/* do the invalidate of the headroom */
528 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
529 		    (tail + hdrmwqes) & qsize_msk);
530 		for (i = 16; i < sectperwqe; i += 16) {
531 			wqe_start[i] = 0xFFFFFFFF;
532 		}
533 
534 		wr++;
535 		goto post_next;
536 	}
537 done:
538 
539 	if (posted_cnt != 0) {
540 		ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
541 
542 		membar_producer();
543 
544 		/* the FMA retry loop starts for Hermon doorbell register. */
545 		hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
546 		    fm_status, fm_test_num);
547 
548 		/* Ring the doorbell */
549 		HERMON_UAR_DOORBELL(state, uarhdl,
550 		    (uint64_t *)(void *)&state->hs_uar->send,
551 		    (uint64_t)qp->qp_ring);
552 
553 		/* the FMA retry loop ends. */
554 		hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
555 		    fm_status, fm_test_num);
556 
557 		/* do the invalidate of the headroom */
558 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
559 		    (tail + hdrmwqes) & qsize_msk);
560 		for (i = 16; i < sectperwqe; i += 16) {
561 			wqe_start[i] = 0xFFFFFFFF;
562 		}
563 	}
564 	/*
565 	 * Update the "num_posted" return value (if necessary).
566 	 * Then drop the locks and return success.
567 	 */
568 	if (num_posted != NULL) {
569 		*num_posted = posted_cnt;
570 	}
571 
572 	mutex_exit(&qp->qp_sq_lock);
573 	return (status);
574 
575 pio_error:
576 	mutex_exit(&qp->qp_sq_lock);
577 	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
578 	return (ibc_get_ci_failure(0));
579 }
580 
581 /*
582  * hermon_post_send()
583  *    Context: Can be called from interrupt or base context.
584  */
585 int
586 hermon_post_send(hermon_state_t *state, hermon_qphdl_t qp,
587     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
588 {
589 	ibt_send_wr_t 			*curr_wr;
590 	hermon_workq_hdr_t		*wq;
591 	hermon_ahhdl_t			ah;
592 	uint64_t			*desc, *prev;
593 	uint32_t			desc_sz;
594 	uint32_t			signaled_dbd, solicited;
595 	uint32_t			head, tail, next_tail, qsize_msk;
596 	uint32_t			sync_from, sync_to;
597 	uint32_t			hdrmwqes;
598 	uint_t				currindx, wrindx, numremain;
599 	uint_t				chainlen;
600 	uint_t				posted_cnt, maxstat;
601 	uint_t				total_posted;
602 	int				status;
603 	uint32_t			nopcode, fence, immed_data = 0;
604 	uint32_t			prev_nopcode;
605 
606 	/* initialize the FMA retry loop */
607 	hermon_pio_init(fm_loop_cnt, fm_status, fm_test);
608 
609 	/*
610 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
611 	 * clients to post to QP memory that is accessible directly by the
612 	 * user.  If the QP memory is user accessible, then return an error.
613 	 */
614 	if (qp->qp_is_umap) {
615 		return (IBT_QP_HDL_INVALID);
616 	}
617 
618 	mutex_enter(&qp->qp_lock);
619 
620 	/*
621 	 * Check QP state.  Can not post Send requests from the "Reset",
622 	 * "Init", or "RTR" states
623 	 */
624 	if ((qp->qp_state == HERMON_QP_RESET) ||
625 	    (qp->qp_state == HERMON_QP_INIT) ||
626 	    (qp->qp_state == HERMON_QP_RTR)) {
627 		mutex_exit(&qp->qp_lock);
628 		return (IBT_QP_STATE_INVALID);
629 	}
630 	mutex_exit(&qp->qp_lock);
631 	mutex_enter(&qp->qp_sq_lock);
632 
633 	if (qp->qp_is_special)
634 		goto post_many;
635 
636 	/* Use these optimized functions most of the time */
637 	if (qp->qp_serv_type == HERMON_QP_UD)
638 		return (hermon_post_send_ud(state, qp, wr, num_wr, num_posted));
639 
640 	if (qp->qp_serv_type == HERMON_QP_RC)
641 		return (hermon_post_send_rc(state, qp, wr, num_wr, num_posted));
642 
643 	if (qp->qp_serv_type == HERMON_QP_UC)
644 		goto post_many;
645 
646 	mutex_exit(&qp->qp_sq_lock);
647 	return (IBT_QP_SRV_TYPE_INVALID);
648 
649 post_many:
650 	/* general loop for non-optimized posting */
651 
652 	/* Grab the lock for the WRID list */
653 	membar_consumer();
654 
655 	/* Save away some initial QP state */
656 	wq = qp->qp_sq_wqhdr;
657 	qsize_msk = wq->wq_mask;
658 	tail	  = wq->wq_tail;
659 	head	  = wq->wq_head;
660 	hdrmwqes  = qp->qp_sq_hdrmwqes;		/* in WQEs  */
661 
662 	/* Initialize posted_cnt */
663 	posted_cnt = 0;
664 	total_posted = 0;
665 
666 	/*
667 	 * For each ibt_send_wr_t in the wr[] list passed in, parse the
668 	 * request and build a Send WQE.  NOTE:  Because we are potentially
669 	 * building a chain of WQEs to post, we want to build them all first,
670 	 * and set the valid (HW Ownership) bit on all but the first.
671 	 * However, we do not want to validate the first one until the
672 	 * entire chain of WQEs has been built.  Then in the final
673 	 * we set the valid bit in the first, flush if needed, and as a last
674 	 * step ring the appropriate doorbell.  NOTE: the doorbell ring may
675 	 * NOT be needed if the HCA is already processing, but the doorbell
676 	 * ring will be done regardless. NOTE ALSO:  It is possible for
677 	 * more Work Requests to be posted than the HW will support at one
678 	 * shot.  If this happens, we need to be able to post and ring
679 	 * several chains here until the the entire request is complete.
680 	 * NOTE ALSO:  the term "chain" is used to differentiate it from
681 	 * Work Request List passed in; and because that's the terminology
682 	 * from the previous generations of HCA - but the WQEs are not, in fact
683 	 * chained together for Hermon
684 	 */
685 
686 	wrindx = 0;
687 	numremain = num_wr;
688 	status	  = DDI_SUCCESS;
689 	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
690 		/*
691 		 * For the first WQE on a new chain we need "prev" to point
692 		 * to the current descriptor.
693 		 */
694 		prev = HERMON_QP_SQ_ENTRY(qp, tail);
695 
696 	/*
697 	 * unlike Tavor & Arbel, tail will maintain the number of the
698 	 * next (this) WQE to be posted.  Since there is no backward linking
699 	 * in Hermon, we can always just look ahead
700 	 */
701 		/*
702 		 * Before we begin, save the current "tail index" for later
703 		 * DMA sync
704 		 */
705 		/* NOTE: don't need to go back one like arbel/tavor */
706 		sync_from = tail;
707 
708 		/*
709 		 * Break the request up into lists that are less than or
710 		 * equal to the maximum number of WQEs that can be posted
711 		 * per doorbell ring - 256 currently
712 		 */
713 		chainlen = (numremain > HERMON_QP_MAXDESC_PER_DB) ?
714 		    HERMON_QP_MAXDESC_PER_DB : numremain;
715 		numremain -= chainlen;
716 
717 		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
718 			/*
719 			 * Check for "queue full" condition.  If the queue
720 			 * is already full, then no more WQEs can be posted.
721 			 * So break out, ring a doorbell (if necessary) and
722 			 * return an error
723 			 */
724 			if (wq->wq_full != 0) {
725 				status = IBT_QP_FULL;
726 				break;
727 			}
728 
729 			/*
730 			 * Increment the "tail index". Check for "queue
731 			 * full" condition incl. headroom.  If we detect that
732 			 * the current work request is going to fill the work
733 			 * queue, then we mark this condition and continue.
734 			 * Don't need >=, because going one-by-one we have to
735 			 * hit it exactly sooner or later
736 			 */
737 
738 			next_tail = (tail + 1) & qsize_msk;
739 			if (((tail + hdrmwqes) & qsize_msk) == head) {
740 				wq->wq_full = 1;
741 			}
742 
743 			/*
744 			 * Get the address of the location where the next
745 			 * Send WQE should be built
746 			 */
747 			desc = HERMON_QP_SQ_ENTRY(qp, tail);
748 			/*
749 			 * Call hermon_wqe_send_build() to build the WQE
750 			 * at the given address.  This routine uses the
751 			 * information in the ibt_send_wr_t list (wr[]) and
752 			 * returns the size of the WQE when it returns.
753 			 */
754 			status = hermon_wqe_send_build(state, qp,
755 			    &wr[wrindx], desc, &desc_sz);
756 			if (status != DDI_SUCCESS) {
757 				break;
758 			}
759 
760 			/*
761 			 * Now, build the Ctrl Segment based on
762 			 * what was just done
763 			 */
764 			curr_wr = &wr[wrindx];
765 
766 			switch (curr_wr->wr_opcode) {
767 			case IBT_WRC_RDMAW:
768 				if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
769 					nopcode =
770 					    HERMON_WQE_SEND_NOPCODE_RDMAWI;
771 					immed_data =
772 					    hermon_wr_get_immediate(curr_wr);
773 				} else {
774 					nopcode = HERMON_WQE_SEND_NOPCODE_RDMAW;
775 				}
776 				break;
777 
778 			case IBT_WRC_SEND:
779 				if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
780 					nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
781 					immed_data =
782 					    hermon_wr_get_immediate(curr_wr);
783 				} else {
784 					nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
785 				}
786 				break;
787 
788 			case IBT_WRC_SEND_LSO:
789 				nopcode = HERMON_WQE_SEND_NOPCODE_LSO;
790 				break;
791 
792 			case IBT_WRC_RDMAR:
793 				nopcode = HERMON_WQE_SEND_NOPCODE_RDMAR;
794 				break;
795 
796 			case IBT_WRC_CSWAP:
797 				nopcode = HERMON_WQE_SEND_NOPCODE_ATMCS;
798 				break;
799 
800 			case IBT_WRC_FADD:
801 				nopcode = HERMON_WQE_SEND_NOPCODE_ATMFA;
802 				break;
803 
804 			case IBT_WRC_BIND:
805 				nopcode = HERMON_WQE_SEND_NOPCODE_BIND;
806 				break;
807 			}
808 
809 			fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
810 
811 			/*
812 			 * now, build up the control segment, leaving the
813 			 * owner bit as it is
814 			 */
815 
816 			if ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
817 			    (curr_wr->wr_flags & IBT_WR_SEND_SIGNAL)) {
818 				signaled_dbd = 1;
819 			} else {
820 				signaled_dbd = 0;
821 			}
822 			if (curr_wr->wr_flags & IBT_WR_SEND_SOLICIT)
823 				solicited = 1;
824 			else
825 				solicited = 0;
826 
827 			if (qp->qp_is_special) {
828 				ah = (hermon_ahhdl_t)
829 				    curr_wr->wr.ud.udwr_dest->ud_ah;
830 				mutex_enter(&ah->ah_lock);
831 				maxstat = ah->ah_udav->max_stat_rate;
832 				HERMON_WQE_SET_MLX_CTRL_SEGMENT(desc, desc_sz,
833 				    signaled_dbd, maxstat, ah->ah_udav->rlid,
834 				    qp, ah->ah_udav->sl);
835 				mutex_exit(&ah->ah_lock);
836 			} else {
837 				HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz,
838 				    fence, immed_data, solicited,
839 				    signaled_dbd, curr_wr->wr_flags &
840 				    IBT_WR_SEND_CKSUM, qp);
841 			}
842 			wq->wq_wrid[tail] = curr_wr->wr_id;
843 
844 			/*
845 			 * If this is not the first descriptor on the current
846 			 * chain, then set the ownership bit.
847 			 */
848 			if (currindx != 0) {		/* not the first */
849 				membar_producer();
850 				HERMON_SET_SEND_WQE_OWNER(qp,
851 				    (uint32_t *)desc, nopcode);
852 			} else
853 				prev_nopcode = nopcode;
854 
855 			/*
856 			 * Update the current "tail index" and increment
857 			 * "posted_cnt"
858 			 */
859 			tail = next_tail;
860 			posted_cnt++;
861 		}
862 
863 		/*
864 		 * If we reach here and there are one or more WQEs which have
865 		 * been successfully built as a chain, we have to finish up
866 		 * and prepare them for writing to the HW
867 		 * The steps are:
868 		 * 	1. do the headroom fixup
869 		 *	2. add in the size of the headroom for the sync
870 		 *	3. write the owner bit for the first WQE
871 		 *	4. sync them
872 		 *	5. fix up the structures
873 		 *	6. hit the doorbell in UAR
874 		 */
875 		if (posted_cnt != 0) {
876 			ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
877 
878 			/*
879 			 * Save away updated "tail index" for the DMA sync
880 			 * including the headroom that will be needed
881 			 */
882 			sync_to = (tail + hdrmwqes) & qsize_msk;
883 
884 			/* do the invalidate of the headroom */
885 
886 			hermon_wqe_headroom(tail, qp);
887 
888 			/* Do a DMA sync for current send WQE(s) */
889 			hermon_wqe_sync(qp, sync_from, sync_to, HERMON_WR_SEND,
890 			    DDI_DMA_SYNC_FORDEV);
891 
892 			/* Update some of the state in the QP */
893 			wq->wq_tail = tail;
894 			total_posted += posted_cnt;
895 			posted_cnt = 0;
896 
897 			membar_producer();
898 
899 			/*
900 			 * Now set the ownership bit of the first
901 			 * one in the chain
902 			 */
903 			HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)prev,
904 			    prev_nopcode);
905 
906 			/* the FMA retry loop starts for Hermon doorbell. */
907 			hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
908 			    fm_status, fm_test);
909 
910 			HERMON_UAR_DOORBELL(state, uarhdl,
911 			    (uint64_t *)(void *)&state->hs_uar->send,
912 			    (uint64_t)qp->qp_ring);
913 
914 			/* the FMA retry loop ends. */
915 			hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
916 			    fm_status, fm_test);
917 		}
918 	}
919 
920 	/*
921 	 * Update the "num_posted" return value (if necessary).
922 	 * Then drop the locks and return success.
923 	 */
924 	if (num_posted != NULL) {
925 		*num_posted = total_posted;
926 	}
927 	mutex_exit(&qp->qp_sq_lock);
928 	return (status);
929 
930 pio_error:
931 	mutex_exit(&qp->qp_sq_lock);
932 	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
933 	return (ibc_get_ci_failure(0));
934 }
935 
936 
937 /*
938  * hermon_post_recv()
939  *    Context: Can be called from interrupt or base context.
940  */
941 int
942 hermon_post_recv(hermon_state_t *state, hermon_qphdl_t qp,
943     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
944 {
945 	uint64_t			*desc;
946 	hermon_workq_hdr_t		*wq;
947 	uint32_t			head, tail, next_tail, qsize_msk;
948 	uint32_t			sync_from, sync_to;
949 	uint_t				wrindx;
950 	uint_t				posted_cnt;
951 	int				status;
952 
953 	/*
954 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
955 	 * clients to post to QP memory that is accessible directly by the
956 	 * user.  If the QP memory is user accessible, then return an error.
957 	 */
958 	if (qp->qp_is_umap) {
959 		return (IBT_QP_HDL_INVALID);
960 	}
961 
962 	/* Initialize posted_cnt */
963 	posted_cnt = 0;
964 
965 	mutex_enter(&qp->qp_lock);
966 
967 	/*
968 	 * Check if QP is associated with an SRQ
969 	 */
970 	if (qp->qp_srq_en == HERMON_QP_SRQ_ENABLED) {
971 		mutex_exit(&qp->qp_lock);
972 		return (IBT_SRQ_IN_USE);
973 	}
974 
975 	/*
976 	 * Check QP state.  Can not post Recv requests from the "Reset" state
977 	 */
978 	if (qp->qp_state == HERMON_QP_RESET) {
979 		mutex_exit(&qp->qp_lock);
980 		return (IBT_QP_STATE_INVALID);
981 	}
982 
983 	/* Check that work request transport type is valid */
984 	if ((qp->qp_serv_type != HERMON_QP_UD) &&
985 	    (qp->qp_serv_type != HERMON_QP_RC) &&
986 	    (qp->qp_serv_type != HERMON_QP_UC)) {
987 		mutex_exit(&qp->qp_lock);
988 		return (IBT_QP_SRV_TYPE_INVALID);
989 	}
990 
991 	mutex_exit(&qp->qp_lock);
992 	mutex_enter(&qp->qp_rq_lock);
993 
994 	/*
995 	 * Grab the lock for the WRID list, i.e., membar_consumer().
996 	 * This is not needed because the mutex_enter() above has
997 	 * the same effect.
998 	 */
999 
1000 	/* Save away some initial QP state */
1001 	wq = qp->qp_rq_wqhdr;
1002 	qsize_msk = wq->wq_mask;
1003 	tail	  = wq->wq_tail;
1004 	head	  = wq->wq_head;
1005 
1006 	wrindx = 0;
1007 	status	  = DDI_SUCCESS;
1008 	/*
1009 	 * Before we begin, save the current "tail index" for later
1010 	 * DMA sync
1011 	 */
1012 	sync_from = tail;
1013 
1014 	for (wrindx = 0; wrindx < num_wr; wrindx++) {
1015 		if (wq->wq_full != 0) {
1016 			status = IBT_QP_FULL;
1017 			break;
1018 		}
1019 		next_tail = (tail + 1) & qsize_msk;
1020 		if (next_tail == head) {
1021 			wq->wq_full = 1;
1022 		}
1023 		desc = HERMON_QP_RQ_ENTRY(qp, tail);
1024 		status = hermon_wqe_recv_build(state, qp, &wr[wrindx], desc);
1025 		if (status != DDI_SUCCESS) {
1026 			break;
1027 		}
1028 
1029 		wq->wq_wrid[tail] = wr[wrindx].wr_id;
1030 		qp->qp_rq_wqecntr++;
1031 
1032 		tail = next_tail;
1033 		posted_cnt++;
1034 	}
1035 
1036 	if (posted_cnt != 0) {
1037 		/* Save away updated "tail index" for the DMA sync */
1038 		sync_to = tail;
1039 
1040 		hermon_wqe_sync(qp, sync_from, sync_to, HERMON_WR_RECV,
1041 		    DDI_DMA_SYNC_FORDEV);
1042 
1043 		wq->wq_tail = tail;
1044 
1045 		membar_producer();	/* ensure wrids are visible */
1046 
1047 		/* Update the doorbell record w/ wqecntr */
1048 		HERMON_UAR_DB_RECORD_WRITE(qp->qp_rq_vdbr,
1049 		    qp->qp_rq_wqecntr & 0xFFFF);
1050 	}
1051 
1052 	if (num_posted != NULL) {
1053 		*num_posted = posted_cnt;
1054 	}
1055 
1056 
1057 	mutex_exit(&qp->qp_rq_lock);
1058 	return (status);
1059 }
1060 
1061 /*
1062  * hermon_post_srq()
1063  *    Context: Can be called from interrupt or base context.
1064  */
1065 int
1066 hermon_post_srq(hermon_state_t *state, hermon_srqhdl_t srq,
1067     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
1068 {
1069 	uint64_t			*desc;
1070 	hermon_workq_hdr_t		*wq;
1071 	uint_t				indx, wrindx;
1072 	uint_t				posted_cnt;
1073 	int				status;
1074 
1075 	mutex_enter(&srq->srq_lock);
1076 
1077 	/*
1078 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
1079 	 * clients to post to QP memory that is accessible directly by the
1080 	 * user.  If the QP memory is user accessible, then return an error.
1081 	 */
1082 	if (srq->srq_is_umap) {
1083 		mutex_exit(&srq->srq_lock);
1084 		return (IBT_SRQ_HDL_INVALID);
1085 	}
1086 
1087 	/*
1088 	 * Check SRQ state.  Can not post Recv requests when SRQ is in error
1089 	 */
1090 	if (srq->srq_state == HERMON_SRQ_STATE_ERROR) {
1091 		mutex_exit(&srq->srq_lock);
1092 		return (IBT_QP_STATE_INVALID);
1093 	}
1094 
1095 	status = DDI_SUCCESS;
1096 	posted_cnt = 0;
1097 	wq = srq->srq_wq_wqhdr;
1098 	indx = wq->wq_head;
1099 
1100 	for (wrindx = 0; wrindx < num_wr; wrindx++) {
1101 
1102 		if (indx == wq->wq_tail) {
1103 			status = IBT_QP_FULL;
1104 			break;
1105 		}
1106 		desc = HERMON_SRQ_WQE_ADDR(srq, indx);
1107 
1108 		wq->wq_wrid[indx] = wr[wrindx].wr_id;
1109 
1110 		status = hermon_wqe_srq_build(state, srq, &wr[wrindx], desc);
1111 		if (status != DDI_SUCCESS) {
1112 			break;
1113 		}
1114 
1115 		hermon_wqe_sync(srq, indx, indx + 1,
1116 		    HERMON_WR_SRQ, DDI_DMA_SYNC_FORDEV);
1117 		posted_cnt++;
1118 		indx = htons(((uint16_t *)desc)[1]);
1119 		wq->wq_head = indx;
1120 	}
1121 
1122 	if (posted_cnt != 0) {
1123 
1124 		srq->srq_wq_wqecntr += posted_cnt;
1125 
1126 		membar_producer();	/* ensure wrids are visible */
1127 
1128 		/* Ring the doorbell w/ wqecntr */
1129 		HERMON_UAR_DB_RECORD_WRITE(srq->srq_wq_vdbr,
1130 		    srq->srq_wq_wqecntr & 0xFFFF);
1131 	}
1132 
1133 	if (num_posted != NULL) {
1134 		*num_posted = posted_cnt;
1135 	}
1136 
1137 	mutex_exit(&srq->srq_lock);
1138 	return (status);
1139 }
1140 
1141 
1142 /*
1143  * hermon_wqe_send_build()
1144  *    Context: Can be called from interrupt or base context.
1145  */
1146 static int
1147 hermon_wqe_send_build(hermon_state_t *state, hermon_qphdl_t qp,
1148     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1149 {
1150 	hermon_hw_snd_wqe_ud_t		*ud;
1151 	hermon_hw_snd_wqe_remaddr_t	*rc;
1152 	hermon_hw_snd_wqe_atomic_t	*at;
1153 	hermon_hw_snd_wqe_remaddr_t	*uc;
1154 	hermon_hw_snd_wqe_bind_t	*bn;
1155 	hermon_hw_wqe_sgl_t		*ds, *old_ds;
1156 	ibt_ud_dest_t			*dest;
1157 	ibt_wr_ds_t			*sgl;
1158 	hermon_ahhdl_t			ah;
1159 	uint32_t			nds;
1160 	int				i, j, last_ds, num_ds, status;
1161 	int				tmpsize;
1162 
1163 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
1164 
1165 	/* Initialize the information for the Data Segments */
1166 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1167 	    sizeof (hermon_hw_snd_wqe_ctrl_t));
1168 	nds = wr->wr_nds;
1169 	sgl = wr->wr_sgl;
1170 	num_ds = 0;
1171 	i = 0;
1172 
1173 	/*
1174 	 * Build a Send WQE depends first and foremost on the transport
1175 	 * type of Work Request (i.e. UD, RC, or UC)
1176 	 */
1177 	switch (wr->wr_trans) {
1178 	case IBT_UD_SRV:
1179 		/* Ensure that work request transport type matches QP type */
1180 		if (qp->qp_serv_type != HERMON_QP_UD) {
1181 			return (IBT_QP_SRV_TYPE_INVALID);
1182 		}
1183 
1184 		/*
1185 		 * Validate the operation type.  For UD requests, only the
1186 		 * "Send" and "Send LSO" operations are valid.
1187 		 */
1188 		if (wr->wr_opcode != IBT_WRC_SEND &&
1189 		    wr->wr_opcode != IBT_WRC_SEND_LSO) {
1190 			return (IBT_QP_OP_TYPE_INVALID);
1191 		}
1192 
1193 		/*
1194 		 * If this is a Special QP (QP0 or QP1), then we need to
1195 		 * build MLX WQEs instead.  So jump to hermon_wqe_mlx_build()
1196 		 * and return whatever status it returns
1197 		 */
1198 		if (qp->qp_is_special) {
1199 			if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
1200 				return (IBT_QP_OP_TYPE_INVALID);
1201 			}
1202 			status = hermon_wqe_mlx_build(state, qp,
1203 			    wr, desc, size);
1204 			return (status);
1205 		}
1206 
1207 		/*
1208 		 * Otherwise, if this is a normal UD Send request, then fill
1209 		 * all the fields in the Hermon UD header for the WQE.  Note:
1210 		 * to do this we'll need to extract some information from the
1211 		 * Address Handle passed with the work request.
1212 		 */
1213 		ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
1214 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
1215 		if (wr->wr_opcode == IBT_WRC_SEND) {
1216 			dest = wr->wr.ud.udwr_dest;
1217 		} else {
1218 			dest = wr->wr.ud_lso.lso_ud_dest;
1219 		}
1220 		ah = (hermon_ahhdl_t)dest->ud_ah;
1221 		if (ah == NULL) {
1222 			return (IBT_AH_HDL_INVALID);
1223 		}
1224 
1225 		/*
1226 		 * Build the Unreliable Datagram Segment for the WQE, using
1227 		 * the information from the address handle and the work
1228 		 * request.
1229 		 */
1230 		/* mutex_enter(&ah->ah_lock); */
1231 		if (wr->wr_opcode == IBT_WRC_SEND) {
1232 			HERMON_WQE_BUILD_UD(qp, ud, ah, wr->wr.ud.udwr_dest);
1233 		} else {	/* IBT_WRC_SEND_LSO */
1234 			HERMON_WQE_BUILD_UD(qp, ud, ah,
1235 			    wr->wr.ud_lso.lso_ud_dest);
1236 		}
1237 		/* mutex_exit(&ah->ah_lock); */
1238 
1239 		/* Update "ds" for filling in Data Segments (below) */
1240 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
1241 		    sizeof (hermon_hw_snd_wqe_ud_t));
1242 
1243 		if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
1244 			int total_len;
1245 
1246 			total_len = (4 + 0xf + wr->wr.ud_lso.lso_hdr_sz) & ~0xf;
1247 			if ((uintptr_t)ds + total_len + (nds * 16) >
1248 			    (uintptr_t)desc + (1 << qp->qp_sq_log_wqesz))
1249 				return (IBT_QP_SGL_LEN_INVALID);
1250 
1251 			bcopy(wr->wr.ud_lso.lso_hdr, (uint32_t *)ds + 1,
1252 			    wr->wr.ud_lso.lso_hdr_sz);
1253 			old_ds = ds;
1254 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + total_len);
1255 			for (; i < nds; i++) {
1256 				if (sgl[i].ds_len == 0)
1257 					continue;
1258 				HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds],
1259 				    &sgl[i]);
1260 				num_ds++;
1261 				i++;
1262 				break;
1263 			}
1264 			membar_producer();
1265 			HERMON_WQE_BUILD_LSO(qp, old_ds, wr->wr.ud_lso.lso_mss,
1266 			    wr->wr.ud_lso.lso_hdr_sz);
1267 		}
1268 
1269 		break;
1270 
1271 	case IBT_RC_SRV:
1272 		/* Ensure that work request transport type matches QP type */
1273 		if (qp->qp_serv_type != HERMON_QP_RC) {
1274 			return (IBT_QP_SRV_TYPE_INVALID);
1275 		}
1276 
1277 		/*
1278 		 * Validate the operation type.  For RC requests, we allow
1279 		 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
1280 		 * operations, and memory window "Bind"
1281 		 */
1282 		if ((wr->wr_opcode != IBT_WRC_SEND) &&
1283 		    (wr->wr_opcode != IBT_WRC_RDMAR) &&
1284 		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
1285 		    (wr->wr_opcode != IBT_WRC_CSWAP) &&
1286 		    (wr->wr_opcode != IBT_WRC_FADD) &&
1287 		    (wr->wr_opcode != IBT_WRC_BIND)) {
1288 			return (IBT_QP_OP_TYPE_INVALID);
1289 		}
1290 
1291 		/*
1292 		 * If this is a Send request, then all we need to do is break
1293 		 * out and here and begin the Data Segment processing below
1294 		 */
1295 		if (wr->wr_opcode == IBT_WRC_SEND) {
1296 			break;
1297 		}
1298 
1299 		/*
1300 		 * If this is an RDMA Read or RDMA Write request, then fill
1301 		 * in the "Remote Address" header fields.
1302 		 */
1303 		if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
1304 		    (wr->wr_opcode == IBT_WRC_RDMAW)) {
1305 			rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1306 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1307 
1308 			/*
1309 			 * Build the Remote Address Segment for the WQE, using
1310 			 * the information from the RC work request.
1311 			 */
1312 			HERMON_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
1313 
1314 			/* Update "ds" for filling in Data Segments (below) */
1315 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)rc +
1316 			    sizeof (hermon_hw_snd_wqe_remaddr_t));
1317 			break;
1318 		}
1319 
1320 		/*
1321 		 * If this is one of the Atomic type operations (i.e
1322 		 * Compare-Swap or Fetch-Add), then fill in both the "Remote
1323 		 * Address" header fields and the "Atomic" header fields.
1324 		 */
1325 		if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
1326 		    (wr->wr_opcode == IBT_WRC_FADD)) {
1327 			rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1328 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1329 			at = (hermon_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
1330 			    sizeof (hermon_hw_snd_wqe_remaddr_t));
1331 
1332 			/*
1333 			 * Build the Remote Address and Atomic Segments for
1334 			 * the WQE, using the information from the RC Atomic
1335 			 * work request.
1336 			 */
1337 			HERMON_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
1338 			HERMON_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
1339 
1340 			/* Update "ds" for filling in Data Segments (below) */
1341 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)at +
1342 			    sizeof (hermon_hw_snd_wqe_atomic_t));
1343 
1344 			/*
1345 			 * Update "nds" and "sgl" because Atomic requests have
1346 			 * only a single Data Segment (and they are encoded
1347 			 * somewhat differently in the work request.
1348 			 */
1349 			nds = 1;
1350 			sgl = wr->wr_sgl;
1351 			break;
1352 		}
1353 
1354 		/*
1355 		 * If this is memory window Bind operation, then we call the
1356 		 * hermon_wr_bind_check() routine to validate the request and
1357 		 * to generate the updated RKey.  If this is successful, then
1358 		 * we fill in the WQE's "Bind" header fields.
1359 		 */
1360 		if (wr->wr_opcode == IBT_WRC_BIND) {
1361 			status = hermon_wr_bind_check(state, wr);
1362 			if (status != DDI_SUCCESS) {
1363 				return (status);
1364 			}
1365 
1366 			bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1367 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1368 
1369 			/*
1370 			 * Build the Bind Memory Window Segments for the WQE,
1371 			 * using the information from the RC Bind memory
1372 			 * window work request.
1373 			 */
1374 			HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
1375 
1376 			/*
1377 			 * Update the "ds" pointer.  Even though the "bind"
1378 			 * operation requires no SGLs, this is necessary to
1379 			 * facilitate the correct descriptor size calculations
1380 			 * (below).
1381 			 */
1382 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
1383 			    sizeof (hermon_hw_snd_wqe_bind_t));
1384 			nds = 0;
1385 		}
1386 		break;
1387 
1388 	case IBT_UC_SRV:
1389 		/* Ensure that work request transport type matches QP type */
1390 		if (qp->qp_serv_type != HERMON_QP_UC) {
1391 			return (IBT_QP_SRV_TYPE_INVALID);
1392 		}
1393 
1394 		/*
1395 		 * Validate the operation type.  For UC requests, we only
1396 		 * allow "Send", "RDMA Write", and memory window "Bind".
1397 		 * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
1398 		 * operations
1399 		 */
1400 		if ((wr->wr_opcode != IBT_WRC_SEND) &&
1401 		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
1402 		    (wr->wr_opcode != IBT_WRC_BIND)) {
1403 			return (IBT_QP_OP_TYPE_INVALID);
1404 		}
1405 
1406 		/*
1407 		 * If this is a Send request, then all we need to do is break
1408 		 * out and here and begin the Data Segment processing below
1409 		 */
1410 		if (wr->wr_opcode == IBT_WRC_SEND) {
1411 			break;
1412 		}
1413 
1414 		/*
1415 		 * If this is an RDMA Write request, then fill in the "Remote
1416 		 * Address" header fields.
1417 		 */
1418 		if (wr->wr_opcode == IBT_WRC_RDMAW) {
1419 			uc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1420 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1421 
1422 			/*
1423 			 * Build the Remote Address Segment for the WQE, using
1424 			 * the information from the UC work request.
1425 			 */
1426 			HERMON_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);
1427 
1428 			/* Update "ds" for filling in Data Segments (below) */
1429 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)uc +
1430 			    sizeof (hermon_hw_snd_wqe_remaddr_t));
1431 			break;
1432 		}
1433 
1434 		/*
1435 		 * If this is memory window Bind operation, then we call the
1436 		 * hermon_wr_bind_check() routine to validate the request and
1437 		 * to generate the updated RKey.  If this is successful, then
1438 		 * we fill in the WQE's "Bind" header fields.
1439 		 */
1440 		if (wr->wr_opcode == IBT_WRC_BIND) {
1441 			status = hermon_wr_bind_check(state, wr);
1442 			if (status != DDI_SUCCESS) {
1443 				return (status);
1444 			}
1445 
1446 			bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1447 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1448 
1449 			/*
1450 			 * Build the Bind Memory Window Segments for the WQE,
1451 			 * using the information from the UC Bind memory
1452 			 * window work request.
1453 			 */
1454 			HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);
1455 
1456 			/*
1457 			 * Update the "ds" pointer.  Even though the "bind"
1458 			 * operation requires no SGLs, this is necessary to
1459 			 * facilitate the correct descriptor size calculations
1460 			 * (below).
1461 			 */
1462 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
1463 			    sizeof (hermon_hw_snd_wqe_bind_t));
1464 			nds = 0;
1465 		}
1466 		break;
1467 
1468 	default:
1469 		return (IBT_QP_SRV_TYPE_INVALID);
1470 	}
1471 
1472 	/*
1473 	 * Now fill in the Data Segments (SGL) for the Send WQE based on
1474 	 * the values setup above (i.e. "sgl", "nds", and the "ds" pointer
1475 	 * Start by checking for a valid number of SGL entries
1476 	 */
1477 	if (nds > qp->qp_sq_sgl) {
1478 		return (IBT_QP_SGL_LEN_INVALID);
1479 	}
1480 
1481 	/*
1482 	 * For each SGL in the Send Work Request, fill in the Send WQE's data
1483 	 * segments.  Note: We skip any SGL with zero size because Hermon
1484 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1485 	 * the encoding for zero means a 2GB transfer.
1486 	 */
1487 	for (last_ds = num_ds, j = i; j < nds; j++) {
1488 		if (sgl[j].ds_len != 0)
1489 			last_ds++;	/* real last ds of wqe to fill */
1490 	}
1491 
1492 	/*
1493 	 * Return the size of descriptor (in 16-byte chunks)
1494 	 * For Hermon, we want them (for now) to be on stride size
1495 	 * boundaries, which was implicit in Tavor/Arbel
1496 	 *
1497 	 */
1498 	tmpsize = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc);
1499 
1500 	*size = tmpsize >> 0x4;
1501 
1502 	for (j = nds; --j >= i; ) {
1503 		if (sgl[j].ds_len == 0) {
1504 			continue;
1505 		}
1506 
1507 		/*
1508 		 * Fill in the Data Segment(s) for the current WQE, using the
1509 		 * information contained in the scatter-gather list of the
1510 		 * work request.
1511 		 */
1512 		last_ds--;
1513 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[j]);
1514 	}
1515 
1516 	return (DDI_SUCCESS);
1517 }
1518 
1519 
1520 
1521 /*
1522  * hermon_wqe_mlx_build()
1523  *    Context: Can be called from interrupt or base context.
1524  */
1525 static int
1526 hermon_wqe_mlx_build(hermon_state_t *state, hermon_qphdl_t qp,
1527     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1528 {
1529 	hermon_ahhdl_t		ah;
1530 	hermon_hw_udav_t	*udav;
1531 	ib_lrh_hdr_t		*lrh;
1532 	ib_grh_t		*grh;
1533 	ib_bth_hdr_t		*bth;
1534 	ib_deth_hdr_t		*deth;
1535 	hermon_hw_wqe_sgl_t	*ds;
1536 	ibt_wr_ds_t		*sgl;
1537 	uint8_t			*mgmtclass, *hpoint, *hcount;
1538 	uint32_t		nds, offset, pktlen;
1539 	uint32_t		desc_sz;
1540 	int			i, num_ds;
1541 	int			tmpsize;
1542 
1543 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
1544 
1545 	/* Initialize the information for the Data Segments */
1546 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1547 	    sizeof (hermon_hw_mlx_wqe_nextctrl_t));
1548 
1549 	/*
1550 	 * Pull the address handle from the work request. The UDAV will
1551 	 * be used to answer some questions about the request.
1552 	 */
1553 	ah = (hermon_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1554 	if (ah == NULL) {
1555 		return (IBT_AH_HDL_INVALID);
1556 	}
1557 	mutex_enter(&ah->ah_lock);
1558 	udav = ah->ah_udav;
1559 
1560 	/*
1561 	 * If the request is for QP1 and the destination LID is equal to
1562 	 * the Permissive LID, then return an error.  This combination is
1563 	 * not allowed
1564 	 */
1565 	if ((udav->rlid == IB_LID_PERMISSIVE) &&
1566 	    (qp->qp_is_special == HERMON_QP_GSI)) {
1567 		mutex_exit(&ah->ah_lock);
1568 		return (IBT_AH_HDL_INVALID);
1569 	}
1570 
1571 	/*
1572 	 * Calculate the size of the packet headers, including the GRH
1573 	 * (if necessary)
1574 	 */
1575 	desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
1576 	    sizeof (ib_deth_hdr_t);
1577 	if (udav->grh) {
1578 		desc_sz += sizeof (ib_grh_t);
1579 	}
1580 
1581 	/*
1582 	 * Begin to build the first "inline" data segment for the packet
1583 	 * headers.  Note:  By specifying "inline" we can build the contents
1584 	 * of the MAD packet headers directly into the work queue (as part
1585 	 * descriptor).  This has the advantage of both speeding things up
1586 	 * and of not requiring the driver to allocate/register any additional
1587 	 * memory for the packet headers.
1588 	 */
1589 	HERMON_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
1590 	desc_sz += 4;
1591 
1592 	/*
1593 	 * Build Local Route Header (LRH)
1594 	 *    We start here by building the LRH into a temporary location.
1595 	 *    When we have finished we copy the LRH data into the descriptor.
1596 	 *
1597 	 *    Notice that the VL values are hardcoded.  This is not a problem
1598 	 *    because VL15 is decided later based on the value in the MLX
1599 	 *    transport "next/ctrl" header (see the "vl15" bit below), and it
1600 	 *    is otherwise (meaning for QP1) chosen from the SL-to-VL table
1601 	 *    values.  This rule does not hold for loopback packets however
1602 	 *    (all of which bypass the SL-to-VL tables) and it is the reason
1603 	 *    that non-QP0 MADs are setup with VL hardcoded to zero below.
1604 	 *
1605 	 *    Notice also that Source LID is hardcoded to the Permissive LID
1606 	 *    (0xFFFF).  This is also not a problem because if the Destination
1607 	 *    LID is not the Permissive LID, then the "slr" value in the MLX
1608 	 *    transport "next/ctrl" header will be set to zero and the hardware
1609 	 *    will pull the LID from value in the port.
1610 	 */
1611 	lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
1612 	pktlen = (desc_sz + 0x100) >> 2;
1613 	HERMON_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
1614 
1615 	/*
1616 	 * Build Global Route Header (GRH)
1617 	 *    This is only built if necessary as defined by the "grh" bit in
1618 	 *    the address vector.  Note:  We also calculate the offset to the
1619 	 *    next header (BTH) based on whether or not the "grh" bit is set.
1620 	 */
1621 	if (udav->grh) {
1622 		/*
1623 		 * If the request is for QP0, then return an error.  The
1624 		 * combination of global routine (GRH) and QP0 is not allowed.
1625 		 */
1626 		if (qp->qp_is_special == HERMON_QP_SMI) {
1627 			mutex_exit(&ah->ah_lock);
1628 			return (IBT_AH_HDL_INVALID);
1629 		}
1630 		grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1631 		HERMON_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);
1632 
1633 		bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
1634 	} else {
1635 		bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1636 	}
1637 	mutex_exit(&ah->ah_lock);
1638 
1639 
1640 	/*
1641 	 * Build Base Transport Header (BTH)
1642 	 *    Notice that the M, PadCnt, and TVer fields are all set
1643 	 *    to zero implicitly.  This is true for all Management Datagrams
1644 	 *    MADs whether GSI are SMI.
1645 	 */
1646 	HERMON_WQE_BUILD_MLX_BTH(state, bth, qp, wr);
1647 
1648 	/*
1649 	 * Build Datagram Extended Transport Header (DETH)
1650 	 */
1651 	deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
1652 	HERMON_WQE_BUILD_MLX_DETH(deth, qp);
1653 
1654 	/* Ensure that the Data Segment is aligned on a 16-byte boundary */
1655 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
1656 	ds = (hermon_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
1657 	nds = wr->wr_nds;
1658 	sgl = wr->wr_sgl;
1659 	num_ds = 0;
1660 
1661 	/*
1662 	 * Now fill in the Data Segments (SGL) for the MLX WQE based on the
1663 	 * values set up above (i.e. "sgl", "nds", and the "ds" pointer
1664 	 * Start by checking for a valid number of SGL entries
1665 	 */
1666 	if (nds > qp->qp_sq_sgl) {
1667 		return (IBT_QP_SGL_LEN_INVALID);
1668 	}
1669 
1670 	/*
1671 	 * For each SGL in the Send Work Request, fill in the MLX WQE's data
1672 	 * segments.  Note: We skip any SGL with zero size because Hermon
1673 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1674 	 * the encoding for zero means a 2GB transfer.  Because of this special
1675 	 * encoding in the hardware, we mask the requested length with
1676 	 * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1677 	 * zero.)
1678 	 */
1679 	mgmtclass = hpoint = hcount = NULL;
1680 	offset = 0;
1681 	for (i = 0; i < nds; i++) {
1682 		if (sgl[i].ds_len == 0) {
1683 			continue;
1684 		}
1685 
1686 		/*
1687 		 * Fill in the Data Segment(s) for the MLX send WQE, using
1688 		 * the information contained in the scatter-gather list of
1689 		 * the work request.
1690 		 */
1691 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds], &sgl[i]);
1692 
1693 		/*
1694 		 * Search through the contents of all MADs posted to QP0 to
1695 		 * initialize pointers to the places where Directed Route "hop
1696 		 * pointer", "hop count", and "mgmtclass" would be.  Hermon
1697 		 * needs these updated (i.e. incremented or decremented, as
1698 		 * necessary) by software.
1699 		 */
1700 		if (qp->qp_is_special == HERMON_QP_SMI) {
1701 
1702 			HERMON_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
1703 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1704 
1705 			HERMON_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
1706 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1707 
1708 			HERMON_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
1709 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1710 
1711 			offset += sgl[i].ds_len;
1712 		}
1713 		num_ds++;
1714 	}
1715 
1716 	/*
1717 	 * Hermon's Directed Route MADs need to have the "hop pointer"
1718 	 * incremented/decremented (as necessary) depending on whether it is
1719 	 * currently less than or greater than the "hop count" (i.e. whether
1720 	 * the MAD is a request or a response.)
1721 	 */
1722 	if (qp->qp_is_special == HERMON_QP_SMI) {
1723 		HERMON_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
1724 		    *hpoint, *hcount);
1725 	}
1726 
1727 	/*
1728 	 * Now fill in the ICRC Data Segment.  This data segment is inlined
1729 	 * just like the packets headers above, but it is only four bytes and
1730 	 * set to zero (to indicate that we wish the hardware to generate ICRC.
1731 	 */
1732 	HERMON_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
1733 	num_ds++;
1734 
1735 	/*
1736 	 * Return the size of descriptor (in 16-byte chunks)
1737 	 * For Hermon, we want them (for now) to be on stride size
1738 	 * boundaries, which was implicit in Tavor/Arbel
1739 	 */
1740 	tmpsize = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc);
1741 
1742 	*size = tmpsize >> 0x04;
1743 
1744 	return (DDI_SUCCESS);
1745 }
1746 
1747 
1748 
1749 /*
1750  * hermon_wqe_recv_build()
1751  *    Context: Can be called from interrupt or base context.
1752  */
1753 /* ARGSUSED */
1754 static int
1755 hermon_wqe_recv_build(hermon_state_t *state, hermon_qphdl_t qp,
1756     ibt_recv_wr_t *wr, uint64_t *desc)
1757 {
1758 	hermon_hw_wqe_sgl_t	*ds;
1759 	int			i, num_ds;
1760 
1761 	ASSERT(MUTEX_HELD(&qp->qp_rq_lock));
1762 
1763 	/*
1764 	 * Fill in the Data Segments (SGL) for the Recv WQE  - don't
1765 	 * need to have a reserved for the ctrl, there is none on the
1766 	 * recv queue for hermon, but will need to put an invalid
1767 	 * (null) scatter pointer per PRM
1768 	 */
1769 	ds = (hermon_hw_wqe_sgl_t *)(uintptr_t)desc;
1770 	num_ds = 0;
1771 
1772 	/* Check for valid number of SGL entries */
1773 	if (wr->wr_nds > qp->qp_rq_sgl) {
1774 		return (IBT_QP_SGL_LEN_INVALID);
1775 	}
1776 
1777 	/*
1778 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1779 	 * segments.  Note: We skip any SGL with zero size because Hermon
1780 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1781 	 * the encoding for zero means a 2GB transfer.  Because of this special
1782 	 * encoding in the hardware, we mask the requested length with
1783 	 * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1784 	 * zero.)
1785 	 */
1786 	for (i = 0; i < wr->wr_nds; i++) {
1787 		if (wr->wr_sgl[i].ds_len == 0) {
1788 			continue;
1789 		}
1790 
1791 		/*
1792 		 * Fill in the Data Segment(s) for the receive WQE, using the
1793 		 * information contained in the scatter-gather list of the
1794 		 * work request.
1795 		 */
1796 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &wr->wr_sgl[i]);
1797 		num_ds++;
1798 	}
1799 
1800 	/* put the null sgl pointer as well if needed */
1801 	if (num_ds < qp->qp_rq_sgl) {
1802 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &null_sgl);
1803 	}
1804 
1805 	return (DDI_SUCCESS);
1806 }
1807 
1808 
1809 
1810 /*
1811  * hermon_wqe_srq_build()
1812  *    Context: Can be called from interrupt or base context.
1813  */
1814 /* ARGSUSED */
1815 static int
1816 hermon_wqe_srq_build(hermon_state_t *state, hermon_srqhdl_t srq,
1817     ibt_recv_wr_t *wr, uint64_t *desc)
1818 {
1819 	hermon_hw_wqe_sgl_t	*ds;
1820 	int			i, num_ds;
1821 
1822 	ASSERT(MUTEX_HELD(&srq->srq_lock));
1823 
1824 	/* Fill in the Data Segments (SGL) for the Recv WQE */
1825 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1826 	    sizeof (hermon_hw_srq_wqe_next_t));
1827 	num_ds = 0;
1828 
1829 	/* Check for valid number of SGL entries */
1830 	if (wr->wr_nds > srq->srq_wq_sgl) {
1831 		return (IBT_QP_SGL_LEN_INVALID);
1832 	}
1833 
1834 	/*
1835 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1836 	 * segments.  Note: We skip any SGL with zero size because Hermon
1837 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1838 	 * the encoding for zero means a 2GB transfer.  Because of this special
1839 	 * encoding in the hardware, we mask the requested length with
1840 	 * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1841 	 * zero.)
1842 	 */
1843 	for (i = 0; i < wr->wr_nds; i++) {
1844 		if (wr->wr_sgl[i].ds_len == 0) {
1845 			continue;
1846 		}
1847 
1848 		/*
1849 		 * Fill in the Data Segment(s) for the receive WQE, using the
1850 		 * information contained in the scatter-gather list of the
1851 		 * work request.
1852 		 */
1853 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &wr->wr_sgl[i]);
1854 		num_ds++;
1855 	}
1856 
1857 	/*
1858 	 * put in the null sgl pointer as well, if needed
1859 	 */
1860 	if (num_ds < srq->srq_wq_sgl) {
1861 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &null_sgl);
1862 	}
1863 
1864 	return (DDI_SUCCESS);
1865 }
1866 
1867 
1868 /*
1869  * hermon_wr_get_immediate()
1870  *    Context: Can be called from interrupt or base context.
1871  */
1872 static uint32_t
1873 hermon_wr_get_immediate(ibt_send_wr_t *wr)
1874 {
1875 	/*
1876 	 * This routine extracts the "immediate data" from the appropriate
1877 	 * location in the IBTF work request.  Because of the way the
1878 	 * work request structure is defined, the location for this data
1879 	 * depends on the actual work request operation type.
1880 	 */
1881 
1882 	/* For RDMA Write, test if RC or UC */
1883 	if (wr->wr_opcode == IBT_WRC_RDMAW) {
1884 		if (wr->wr_trans == IBT_RC_SRV) {
1885 			return (wr->wr.rc.rcwr.rdma.rdma_immed);
1886 		} else {  /* IBT_UC_SRV */
1887 			return (wr->wr.uc.ucwr.rdma.rdma_immed);
1888 		}
1889 	}
1890 
1891 	/* For Send, test if RC, UD, or UC */
1892 	if (wr->wr_opcode == IBT_WRC_SEND) {
1893 		if (wr->wr_trans == IBT_RC_SRV) {
1894 			return (wr->wr.rc.rcwr.send_immed);
1895 		} else if (wr->wr_trans == IBT_UD_SRV) {
1896 			return (wr->wr.ud.udwr_immed);
1897 		} else {  /* IBT_UC_SRV */
1898 			return (wr->wr.uc.ucwr.send_immed);
1899 		}
1900 	}
1901 
1902 	/*
1903 	 * If any other type of request, then immediate is undefined
1904 	 */
1905 	return (0);
1906 }
1907 
1908 /*
1909  * hermon_wqe_headroom()
1910  *	Context: can be called from interrupt or base, currently only from
1911  *	base context.
1912  * Routine that fills in the headroom for the Send Queue
1913  */
1914 
1915 static void
1916 hermon_wqe_headroom(uint_t from, hermon_qphdl_t qp)
1917 {
1918 	uint32_t	*wqe_start, *wqe_top, *wqe_base, qsize;
1919 	int		hdrmwqes, wqesizebytes, sectperwqe;
1920 	uint32_t	invalue;
1921 	int		i, j;
1922 
1923 	qsize	 = qp->qp_sq_bufsz;
1924 	wqesizebytes = 1 << qp->qp_sq_log_wqesz;
1925 	sectperwqe = wqesizebytes >> 6; 	/* 64 bytes/section */
1926 	hdrmwqes = qp->qp_sq_hdrmwqes;
1927 	wqe_base  = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, 0);
1928 	wqe_top	  = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, qsize);
1929 	wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, from);
1930 
1931 	for (i = 0; i < hdrmwqes; i++)	{
1932 		for (j = 0; j < sectperwqe; j++) {
1933 			if (j == 0) {		/* 1st section of wqe */
1934 				/* perserve ownership bit */
1935 				invalue = ddi_get32(qp->qp_wqinfo.qa_acchdl,
1936 				    wqe_start) | 0x7FFFFFFF;
1937 			} else {
1938 				/* or just invalidate it */
1939 				invalue = 0xFFFFFFFF;
1940 			}
1941 			ddi_put32(qp->qp_wqinfo.qa_acchdl, wqe_start, invalue);
1942 			wqe_start += 16;	/* move 64 bytes */
1943 		}
1944 		if (wqe_start == wqe_top)	/* hit the end of the queue */
1945 			wqe_start = wqe_base;	/* wrap to start */
1946 	}
1947 }
1948 
1949 /*
1950  * hermon_wqe_sync()
1951  *    Context: Can be called from interrupt or base context.
1952  */
1953 static void
1954 hermon_wqe_sync(void *hdl, uint_t sync_from, uint_t sync_to,
1955     uint_t sync_type, uint_t flag)
1956 {
1957 	hermon_qphdl_t		qp;
1958 	hermon_srqhdl_t		srq;
1959 	uint64_t		*wqe_from, *wqe_to;
1960 	uint64_t		*wq_base, *wq_top, *qp_base;
1961 	ddi_dma_handle_t	dmahdl;
1962 	off_t			offset;
1963 	size_t			length;
1964 	uint32_t		qsize;
1965 	int			status;
1966 
1967 	if (sync_type == HERMON_WR_SRQ) {
1968 		srq = (hermon_srqhdl_t)hdl;
1969 		/* Get the DMA handle from SRQ context */
1970 		dmahdl = srq->srq_mrhdl->mr_bindinfo.bi_dmahdl;
1971 		/* get base addr of the buffer */
1972 		qp_base = (uint64_t *)(void *)srq->srq_wq_buf;
1973 	} else {
1974 		qp = (hermon_qphdl_t)hdl;
1975 		/* Get the DMA handle from QP context */
1976 		dmahdl = qp->qp_mrhdl->mr_bindinfo.bi_dmahdl;
1977 		/* Determine the base address of the QP buffer */
1978 		if (qp->qp_sq_baseaddr == 0) {
1979 			qp_base = (uint64_t *)(void *)(qp->qp_sq_buf);
1980 		} else {
1981 			qp_base = (uint64_t *)(void *)(qp->qp_rq_buf);
1982 		}
1983 	}
1984 
1985 	/*
1986 	 * Depending on the type of the work queue, we grab information
1987 	 * about the address ranges we need to DMA sync.
1988 	 */
1989 
1990 	if (sync_type == HERMON_WR_SEND) {
1991 		wqe_from = HERMON_QP_SQ_ENTRY(qp, sync_from);
1992 		wqe_to   = HERMON_QP_SQ_ENTRY(qp, sync_to);
1993 		qsize	 = qp->qp_sq_bufsz;
1994 
1995 		wq_base = HERMON_QP_SQ_ENTRY(qp, 0);
1996 		wq_top	 = HERMON_QP_SQ_ENTRY(qp, qsize);
1997 	} else if (sync_type == HERMON_WR_RECV) {
1998 		wqe_from = HERMON_QP_RQ_ENTRY(qp, sync_from);
1999 		wqe_to   = HERMON_QP_RQ_ENTRY(qp, sync_to);
2000 		qsize	 = qp->qp_rq_bufsz;
2001 
2002 		wq_base = HERMON_QP_RQ_ENTRY(qp, 0);
2003 		wq_top	 = HERMON_QP_RQ_ENTRY(qp, qsize);
2004 	} else {
2005 		wqe_from = HERMON_SRQ_WQ_ENTRY(srq, sync_from);
2006 		wqe_to   = HERMON_SRQ_WQ_ENTRY(srq, sync_to);
2007 		qsize	 = srq->srq_wq_bufsz;
2008 
2009 		wq_base = HERMON_SRQ_WQ_ENTRY(srq, 0);
2010 		wq_top	 = HERMON_SRQ_WQ_ENTRY(srq, qsize);
2011 	}
2012 
2013 	/*
2014 	 * There are two possible cases for the beginning and end of the WQE
2015 	 * chain we are trying to sync.  Either this is the simple case, where
2016 	 * the end of the chain is below the beginning of the chain, or it is
2017 	 * the "wrap-around" case, where the end of the chain has wrapped over
2018 	 * the end of the queue.  In the former case, we simply need to
2019 	 * calculate the span from beginning to end and sync it.  In the latter
2020 	 * case, however, we need to calculate the span from the top of the
2021 	 * work queue to the end of the chain and sync that, and then we need
2022 	 * to find the other portion (from beginning of chain to end of queue)
2023 	 * and sync that as well.  Note: if the "top to end" span is actually
2024 	 * zero length, then we don't do a DMA sync because a zero length DMA
2025 	 * sync unnecessarily syncs the entire work queue.
2026 	 */
2027 	if (wqe_to > wqe_from) {
2028 		/* "From Beginning to End" */
2029 
2030 		offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)qp_base);
2031 		length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_from);
2032 
2033 		status = ddi_dma_sync(dmahdl, offset, length, flag);
2034 		if (status != DDI_SUCCESS) {
2035 			return;
2036 		}
2037 	} else {
2038 		/* "From Top to End" */
2039 
2040 		offset = (off_t)((uintptr_t)wq_base - (uintptr_t)qp_base);
2041 		length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wq_base);
2042 		if (length) {
2043 			status = ddi_dma_sync(dmahdl, offset, length, flag);
2044 			if (status != DDI_SUCCESS) {
2045 				return;
2046 			}
2047 		}
2048 
2049 		/* "From Beginning to Bottom" */
2050 
2051 		offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)qp_base);
2052 		length = (size_t)((uintptr_t)wq_top - (uintptr_t)wqe_from);
2053 		status = ddi_dma_sync(dmahdl, offset, length, flag);
2054 		if (status != DDI_SUCCESS) {
2055 			return;
2056 		}
2057 	}
2058 }
2059 
2060 
2061 /*
2062  * hermon_wr_bind_check()
2063  *    Context: Can be called from interrupt or base context.
2064  */
2065 /* ARGSUSED */
2066 static int
2067 hermon_wr_bind_check(hermon_state_t *state, ibt_send_wr_t *wr)
2068 {
2069 	ibt_bind_flags_t	bind_flags;
2070 	uint64_t		vaddr, len;
2071 	uint64_t		reg_start_addr, reg_end_addr;
2072 	hermon_mwhdl_t		mw;
2073 	hermon_mrhdl_t		mr;
2074 	hermon_rsrc_t		*mpt;
2075 	uint32_t		new_rkey;
2076 
2077 	/* Check for a valid Memory Window handle in the WR */
2078 	mw = (hermon_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
2079 	if (mw == NULL) {
2080 		return (IBT_MW_HDL_INVALID);
2081 	}
2082 
2083 	/* Check for a valid Memory Region handle in the WR */
2084 	mr = (hermon_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
2085 	if (mr == NULL) {
2086 		return (IBT_MR_HDL_INVALID);
2087 	}
2088 
2089 	mutex_enter(&mr->mr_lock);
2090 	mutex_enter(&mw->mr_lock);
2091 
2092 	/*
2093 	 * Check here to see if the memory region has already been partially
2094 	 * deregistered as a result of a hermon_umap_umemlock_cb() callback.
2095 	 * If so, this is an error, return failure.
2096 	 */
2097 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
2098 		mutex_exit(&mr->mr_lock);
2099 		mutex_exit(&mw->mr_lock);
2100 		return (IBT_MR_HDL_INVALID);
2101 	}
2102 
2103 	/* Check for a valid Memory Window RKey (i.e. a matching RKey) */
2104 	if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
2105 		mutex_exit(&mr->mr_lock);
2106 		mutex_exit(&mw->mr_lock);
2107 		return (IBT_MR_RKEY_INVALID);
2108 	}
2109 
2110 	/* Check for a valid Memory Region LKey (i.e. a matching LKey) */
2111 	if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
2112 		mutex_exit(&mr->mr_lock);
2113 		mutex_exit(&mw->mr_lock);
2114 		return (IBT_MR_LKEY_INVALID);
2115 	}
2116 
2117 	/*
2118 	 * Now check for valid "vaddr" and "len".  Note:  We don't check the
2119 	 * "vaddr" range when "len == 0" (i.e. on unbind operations)
2120 	 */
2121 	len = wr->wr.rc.rcwr.bind->bind_len;
2122 	if (len != 0) {
2123 		vaddr = wr->wr.rc.rcwr.bind->bind_va;
2124 		reg_start_addr = mr->mr_bindinfo.bi_addr;
2125 		reg_end_addr   = mr->mr_bindinfo.bi_addr +
2126 		    (mr->mr_bindinfo.bi_len - 1);
2127 		if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
2128 			mutex_exit(&mr->mr_lock);
2129 			mutex_exit(&mw->mr_lock);
2130 			return (IBT_MR_VA_INVALID);
2131 		}
2132 		vaddr = (vaddr + len) - 1;
2133 		if (vaddr > reg_end_addr) {
2134 			mutex_exit(&mr->mr_lock);
2135 			mutex_exit(&mw->mr_lock);
2136 			return (IBT_MR_LEN_INVALID);
2137 		}
2138 	}
2139 
2140 	/*
2141 	 * Validate the bind access flags.  Remote Write and Atomic access for
2142 	 * the Memory Window require that Local Write access be set in the
2143 	 * corresponding Memory Region.
2144 	 */
2145 	bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
2146 	if (((bind_flags & IBT_WR_BIND_WRITE) ||
2147 	    (bind_flags & IBT_WR_BIND_ATOMIC)) &&
2148 	    !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
2149 		mutex_exit(&mr->mr_lock);
2150 		mutex_exit(&mw->mr_lock);
2151 		return (IBT_MR_ACCESS_REQ_INVALID);
2152 	}
2153 
2154 	/* Calculate the new RKey for the Memory Window */
2155 	mpt = mw->mr_mptrsrcp;
2156 	new_rkey = hermon_mr_keycalc(mpt->hr_indx);
2157 	new_rkey = hermon_mr_key_swap(new_rkey);
2158 
2159 	wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
2160 	mw->mr_rkey = new_rkey;
2161 
2162 	mutex_exit(&mr->mr_lock);
2163 	mutex_exit(&mw->mr_lock);
2164 	return (DDI_SUCCESS);
2165 }
2166 
2167 
2168 /*
2169  * hermon_wrid_from_reset_handling()
2170  *    Context: Can be called from interrupt or base context.
2171  */
2172 /* ARGSUSED */
2173 int
2174 hermon_wrid_from_reset_handling(hermon_state_t *state, hermon_qphdl_t qp)
2175 {
2176 	hermon_workq_hdr_t	*swq, *rwq;
2177 	uint_t			qp_srq_en;
2178 
2179 	if (qp->qp_is_umap)
2180 		return (DDI_SUCCESS);
2181 
2182 	/* grab the cq lock(s) to modify the wqavl tree */
2183 	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2184 #ifdef __lock_lint
2185 	mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2186 #else
2187 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
2188 		mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2189 #endif
2190 
2191 	/* Chain the newly allocated work queue header to the CQ's list */
2192 	hermon_cq_workq_add(qp->qp_sq_cqhdl, &qp->qp_sq_wqavl);
2193 
2194 	swq = qp->qp_sq_wqhdr;
2195 	swq->wq_head = 0;
2196 	swq->wq_tail = 0;
2197 	swq->wq_full = 0;
2198 
2199 	/*
2200 	 * Now we repeat all the above operations for the receive work queue,
2201 	 * or shared receive work queue.
2202 	 *
2203 	 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
2204 	 */
2205 	qp_srq_en = qp->qp_srq_en;
2206 
2207 #ifdef __lock_lint
2208 	mutex_enter(&qp->qp_srqhdl->srq_lock);
2209 #else
2210 	if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
2211 		mutex_enter(&qp->qp_srqhdl->srq_lock);
2212 	} else {
2213 		rwq = qp->qp_rq_wqhdr;
2214 		rwq->wq_head = 0;
2215 		rwq->wq_tail = 0;
2216 		rwq->wq_full = 0;
2217 		qp->qp_rq_wqecntr = 0;
2218 	}
2219 #endif
2220 	hermon_cq_workq_add(qp->qp_rq_cqhdl, &qp->qp_rq_wqavl);
2221 
2222 #ifdef __lock_lint
2223 	mutex_exit(&qp->qp_srqhdl->srq_lock);
2224 #else
2225 	if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
2226 		mutex_exit(&qp->qp_srqhdl->srq_lock);
2227 	}
2228 #endif
2229 
2230 #ifdef __lock_lint
2231 	mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2232 #else
2233 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
2234 		mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2235 #endif
2236 	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2237 	return (DDI_SUCCESS);
2238 }
2239 
2240 
2241 /*
2242  * hermon_wrid_to_reset_handling()
2243  *    Context: Can be called from interrupt or base context.
2244  */
2245 int
2246 hermon_wrid_to_reset_handling(hermon_state_t *state, hermon_qphdl_t qp)
2247 {
2248 	uint_t			qp_srq_en;
2249 
2250 	if (qp->qp_is_umap)
2251 		return (DDI_SUCCESS);
2252 
2253 	/*
2254 	 * If there are unpolled entries in these CQs, they are
2255 	 * polled/flushed.
2256 	 * Grab the CQ lock(s) before manipulating the lists.
2257 	 */
2258 	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2259 #ifdef __lock_lint
2260 	mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2261 #else
2262 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
2263 		mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2264 #endif
2265 
2266 	qp_srq_en = qp->qp_srq_en;
2267 #ifdef __lock_lint
2268 	mutex_enter(&qp->qp_srqhdl->srq_lock);
2269 #else
2270 	if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
2271 		mutex_enter(&qp->qp_srqhdl->srq_lock);
2272 	}
2273 #endif
2274 	/*
2275 	 * Flush the entries on the CQ for this QP's QPN.
2276 	 */
2277 	hermon_cq_entries_flush(state, qp);
2278 
2279 #ifdef __lock_lint
2280 	mutex_exit(&qp->qp_srqhdl->srq_lock);
2281 #else
2282 	if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
2283 		mutex_exit(&qp->qp_srqhdl->srq_lock);
2284 	}
2285 #endif
2286 
2287 	hermon_cq_workq_remove(qp->qp_rq_cqhdl, &qp->qp_rq_wqavl);
2288 	hermon_cq_workq_remove(qp->qp_sq_cqhdl, &qp->qp_sq_wqavl);
2289 
2290 #ifdef __lock_lint
2291 	mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2292 #else
2293 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
2294 		mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2295 #endif
2296 	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2297 
2298 	return (IBT_SUCCESS);
2299 }
2300 
2301 
2302 /*
2303  * hermon_wrid_get_entry()
2304  *    Context: Can be called from interrupt or base context.
2305  */
2306 uint64_t
2307 hermon_wrid_get_entry(hermon_cqhdl_t cq, hermon_hw_cqe_t *cqe)
2308 {
2309 	hermon_workq_avl_t	*wqa;
2310 	hermon_workq_hdr_t	*wq;
2311 	uint64_t		wrid;
2312 	uint_t			send_or_recv, qpnum;
2313 	uint32_t		indx;
2314 
2315 	/*
2316 	 * Determine whether this CQE is a send or receive completion.
2317 	 */
2318 	send_or_recv = HERMON_CQE_SENDRECV_GET(cq, cqe);
2319 
2320 	/* Find the work queue for this QP number (send or receive side) */
2321 	qpnum = HERMON_CQE_QPNUM_GET(cq, cqe);
2322 	wqa = hermon_wrid_wqavl_find(cq, qpnum, send_or_recv);
2323 	wq = wqa->wqa_wq;
2324 
2325 	/*
2326 	 * Regardless of whether the completion is the result of a "success"
2327 	 * or a "failure", we lock the list of "containers" and attempt to
2328 	 * search for the the first matching completion (i.e. the first WR
2329 	 * with a matching WQE addr and size).  Once we find it, we pull out
2330 	 * the "wrid" field and return it (see below).  XXX Note: One possible
2331 	 * future enhancement would be to enable this routine to skip over
2332 	 * any "unsignaled" completions to go directly to the next "signaled"
2333 	 * entry on success.
2334 	 */
2335 	indx = HERMON_CQE_WQEADDRSZ_GET(cq, cqe) & wq->wq_mask;
2336 	wrid = wq->wq_wrid[indx];
2337 	if (wqa->wqa_srq_en) {
2338 		struct hermon_sw_srq_s	*srq;
2339 		uint64_t		*desc;
2340 
2341 		/* put wqe back on the srq free list */
2342 		srq = wqa->wqa_srq;
2343 		mutex_enter(&srq->srq_lock);
2344 		desc = HERMON_SRQ_WQE_ADDR(srq, wq->wq_tail);
2345 		((uint16_t *)desc)[1] = htons(indx);
2346 		wq->wq_tail = indx;
2347 		mutex_exit(&srq->srq_lock);
2348 	} else {
2349 		wq->wq_head = (indx + 1) & wq->wq_mask;
2350 		wq->wq_full = 0;
2351 	}
2352 
2353 	return (wrid);
2354 }
2355 
2356 
2357 int
2358 hermon_wrid_workq_compare(const void *p1, const void *p2)
2359 {
2360 	hermon_workq_compare_t	*cmpp;
2361 	hermon_workq_avl_t	*curr;
2362 
2363 	cmpp = (hermon_workq_compare_t *)p1;
2364 	curr = (hermon_workq_avl_t *)p2;
2365 
2366 	if (cmpp->cmp_qpn < curr->wqa_qpn)
2367 		return (-1);
2368 	else if (cmpp->cmp_qpn > curr->wqa_qpn)
2369 		return (+1);
2370 	else if (cmpp->cmp_type < curr->wqa_type)
2371 		return (-1);
2372 	else if (cmpp->cmp_type > curr->wqa_type)
2373 		return (+1);
2374 	else
2375 		return (0);
2376 }
2377 
2378 
2379 /*
2380  * hermon_wrid_workq_find()
2381  *    Context: Can be called from interrupt or base context.
2382  */
2383 static hermon_workq_avl_t *
2384 hermon_wrid_wqavl_find(hermon_cqhdl_t cq, uint_t qpn, uint_t wq_type)
2385 {
2386 	hermon_workq_avl_t	*curr;
2387 	hermon_workq_compare_t	cmp;
2388 
2389 	/*
2390 	 * Walk the CQ's work queue list, trying to find a send or recv queue
2391 	 * with the same QP number.  We do this even if we are going to later
2392 	 * create a new entry because it helps us easily find the end of the
2393 	 * list.
2394 	 */
2395 	cmp.cmp_qpn = qpn;
2396 	cmp.cmp_type = wq_type;
2397 #ifdef __lock_lint
2398 	hermon_wrid_workq_compare(NULL, NULL);
2399 #endif
2400 	curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
2401 
2402 	return (curr);
2403 }
2404 
2405 
2406 /*
2407  * hermon_wrid_wqhdr_create()
2408  *    Context: Can be called from base context.
2409  */
2410 /* ARGSUSED */
2411 hermon_workq_hdr_t *
2412 hermon_wrid_wqhdr_create(int bufsz)
2413 {
2414 	hermon_workq_hdr_t	*wqhdr;
2415 
2416 	/*
2417 	 * Allocate space for the wqhdr, and an array to record all the wrids.
2418 	 */
2419 	wqhdr = (hermon_workq_hdr_t *)kmem_zalloc(sizeof (*wqhdr), KM_NOSLEEP);
2420 	if (wqhdr == NULL) {
2421 		return (NULL);
2422 	}
2423 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr))
2424 	wqhdr->wq_wrid = kmem_zalloc(bufsz * sizeof (uint64_t), KM_NOSLEEP);
2425 	if (wqhdr->wq_wrid == NULL) {
2426 		kmem_free(wqhdr, sizeof (*wqhdr));
2427 		return (NULL);
2428 	}
2429 	wqhdr->wq_size = bufsz;
2430 	wqhdr->wq_mask = bufsz - 1;
2431 
2432 	return (wqhdr);
2433 }
2434 
2435 void
2436 hermon_wrid_wqhdr_destroy(hermon_workq_hdr_t *wqhdr)
2437 {
2438 	kmem_free(wqhdr->wq_wrid, wqhdr->wq_size * sizeof (uint64_t));
2439 	kmem_free(wqhdr, sizeof (*wqhdr));
2440 }
2441 
2442 
2443 /*
2444  * hermon_cq_workq_add()
2445  *    Context: Can be called from interrupt or base context.
2446  */
2447 static void
2448 hermon_cq_workq_add(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl)
2449 {
2450 	hermon_workq_compare_t	cmp;
2451 	avl_index_t		where;
2452 
2453 	cmp.cmp_qpn = wqavl->wqa_qpn;
2454 	cmp.cmp_type = wqavl->wqa_type;
2455 #ifdef __lock_lint
2456 	hermon_wrid_workq_compare(NULL, NULL);
2457 #endif
2458 	(void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
2459 	avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqavl, where);
2460 }
2461 
2462 
2463 /*
2464  * hermon_cq_workq_remove()
2465  *    Context: Can be called from interrupt or base context.
2466  */
2467 static void
2468 hermon_cq_workq_remove(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl)
2469 {
2470 #ifdef __lock_lint
2471 	hermon_wrid_workq_compare(NULL, NULL);
2472 #endif
2473 	avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqavl);
2474 }
2475