xref: /illumos-gate/usr/src/uts/common/io/ib/adapters/hermon/hermon_wr.c (revision c7facc54c4abed9e554ff80225311e6b7048d3c9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * hermon_wr.c
29  *    Hermon Work Request Processing Routines
30  *
31  *    Implements all the routines necessary to provide the PostSend(),
32  *    PostRecv() and PostSRQ() verbs.  Also contains all the code
33  *    necessary to implement the Hermon WRID tracking mechanism.
34  */
35 
36 #include <sys/types.h>
37 #include <sys/conf.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/modctl.h>
41 #include <sys/avl.h>
42 
43 #include <sys/ib/adapters/hermon/hermon.h>
44 
45 static uint32_t hermon_wr_get_immediate(ibt_send_wr_t *wr);
46 static int hermon_wr_bind_check(hermon_state_t *state, ibt_send_wr_t *wr);
47 static int hermon_wqe_send_build(hermon_state_t *state, hermon_qphdl_t qp,
48     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
49 static int hermon_wqe_mlx_build(hermon_state_t *state, hermon_qphdl_t qp,
50     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
51 static void hermon_wqe_headroom(uint_t from, hermon_qphdl_t qp);
52 static int hermon_wqe_recv_build(hermon_state_t *state, hermon_qphdl_t qp,
53     ibt_recv_wr_t *wr, uint64_t *desc);
54 static int hermon_wqe_srq_build(hermon_state_t *state, hermon_srqhdl_t srq,
55     ibt_recv_wr_t *wr, uint64_t *desc);
56 static void hermon_wqe_sync(void *hdl, uint_t sync_from,
57     uint_t sync_to, uint_t sync_type, uint_t flag);
58 static hermon_workq_avl_t *hermon_wrid_wqavl_find(hermon_cqhdl_t cq, uint_t qpn,
59     uint_t send_or_recv);
60 static void hermon_cq_workq_add(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl);
61 static void hermon_cq_workq_remove(hermon_cqhdl_t cq,
62     hermon_workq_avl_t *wqavl);
63 
64 static	ibt_wr_ds_t	null_sgl = { 0, 0x00000100, 0 };
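/*
 * Note: 0x00000100 above is the "invalid lkey" sentinel; per the PRM, a
 * scatter entry carrying it (with zero address and length) terminates a
 * receive WQE scatter list.  hermon_wqe_recv_build() below appends
 * null_sgl whenever a work request supplies fewer SGEs than the queue
 * was configured with.
 */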
65 
66 /*
67  * Tunable to help debug RDMA_READ/RDMA_WRITE failures:
68  *
69  *      0x1 - print rkey used during post_send
70  *      0x2 - print sgls used during post_send
71  *      0x4 - print FMR comings and goings
72  */
73 int hermon_rdma_debug = 0x0;
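/*
 * A sketch of two common ways to flip these bits on a live system
 * (assuming a standard illumos install; adjust the value as needed):
 *
 *	In /etc/system, to set the flags at boot:
 *		set hermon:hermon_rdma_debug = 0x3
 *
 *	With mdb on a running kernel:
 *		echo 'hermon_rdma_debug/W 3' | mdb -kw
 */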
74 
75 static int
76 hermon_post_send_ud(hermon_state_t *state, hermon_qphdl_t qp,
77     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
78 {
79 	hermon_hw_snd_wqe_ud_t		*ud;
80 	hermon_workq_hdr_t		*wq;
81 	hermon_ahhdl_t			ah;
82 	ibt_ud_dest_t			*dest;
83 	uint64_t			*desc;
84 	uint32_t			desc_sz;
85 	uint32_t			signaled_dbd, solicited;
86 	uint32_t			head, tail, next_tail, qsize_msk;
87 	uint32_t			hdrmwqes;
88 	uint32_t			nopcode, fence, immed_data = 0;
89 	hermon_hw_wqe_sgl_t		*ds, *old_ds;
90 	ibt_wr_ds_t			*sgl;
91 	uint32_t			nds, dnds;
92 	int				i, j, last_ds, num_ds, status;
93 	uint32_t			*wqe_start;
94 	int				sectperwqe;
95 	uint_t				posted_cnt = 0;
96 
97 	/* initialize the FMA retry loop */
98 	hermon_pio_init(fm_loop_cnt, fm_status, fm_test_num);
99 
100 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
101 	_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&qp->qp_sq_lock))
102 
103 	/* Grab the lock for the WRID list, i.e., membar_consumer() */
104 	membar_consumer();
105 
106 	/* Save away some initial QP state */
107 	wq = qp->qp_sq_wqhdr;
108 	qsize_msk = wq->wq_mask;
109 	hdrmwqes  = qp->qp_sq_hdrmwqes;		/* in WQEs  */
110 	sectperwqe = 1 << (qp->qp_sq_log_wqesz - 2);
111 
112 	tail	  = wq->wq_tail;
113 	head	  = wq->wq_head;
114 	status	  = DDI_SUCCESS;
115 
116 post_next:
117 	/*
118 	 * Check for "queue full" condition.  If the queue
119 	 * is already full, then no more WQEs can be posted.
120 	 * So break out, ring a doorbell (if necessary) and
121 	 * return an error
122 	 */
123 	if (wq->wq_full != 0) {
124 		status = IBT_QP_FULL;
125 		goto done;
126 	}
127 
128 	next_tail = (tail + 1) & qsize_msk;
129 	if (((tail + hdrmwqes) & qsize_msk) == head) {
130 		wq->wq_full = 1;
131 	}
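	/*
	 * A worked example of the arithmetic above, assuming a queue of
	 * 256 WQEs (qsize_msk == 0xFF) and 8 headroom WQEs: with head == 10
	 * and tail == 2, (2 + 8) & 0xFF == 10 == head, so wq_full is set
	 * even though free slots remain.  The headroom WQEs are kept in
	 * reserve for the invalidation stamping done further below.
	 */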
132 
133 	desc = HERMON_QP_SQ_ENTRY(qp, tail);
134 
135 	ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
136 	    sizeof (hermon_hw_snd_wqe_ctrl_t));
137 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
138 	    sizeof (hermon_hw_snd_wqe_ud_t));
139 	nds = wr->wr_nds;
140 	sgl = wr->wr_sgl;
141 	num_ds = 0;
142 
143 	/* need to know the count of destination nds for backward loop */
144 	for (dnds = 0, i = 0; i < nds; i++) {
145 		if (sgl[i].ds_len != 0)
146 			dnds++;
147 	}
148 
149 	/*
150 	 * Build a Send or Send_LSO WQE
151 	 */
152 	if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
153 		int total_len;
154 
155 		nopcode = HERMON_WQE_SEND_NOPCODE_LSO;
156 		if (wr->wr.ud_lso.lso_hdr_sz > 60) {
157 			nopcode |= (1 << 6);	/* ReRead bit must be set */
158 		}
159 		dest = wr->wr.ud_lso.lso_ud_dest;
160 		ah = (hermon_ahhdl_t)dest->ud_ah;
161 		if (ah == NULL) {
162 			status = IBT_AH_HDL_INVALID;
163 			goto done;
164 		}
165 		HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
166 
167 		total_len = (4 + 0xf + wr->wr.ud_lso.lso_hdr_sz) & ~0xf;
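		/*
		 * An example of the rounding above: the first 4 bytes of
		 * the LSO segment hold the MSS/header-size word (note the
		 * bcopy to "old_ds + 1" below), so for a hypothetical
		 * 54-byte Ethernet+IP+TCP header this gives
		 * (4 + 0xf + 54) & ~0xf == 64, i.e. the header plus its
		 * control word padded up to a 16-byte multiple.
		 */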
168 		if ((uintptr_t)ds + total_len + (nds * 16) >
169 		    (uintptr_t)desc + (1 << qp->qp_sq_log_wqesz)) {
170 			status = IBT_QP_SGL_LEN_INVALID;
171 			goto done;
172 		}
173 		old_ds = ds;
174 		bcopy(wr->wr.ud_lso.lso_hdr, (uint32_t *)old_ds + 1,
175 		    wr->wr.ud_lso.lso_hdr_sz);
176 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + total_len);
177 		i = 0;
178 	} else if (wr->wr_opcode == IBT_WRC_SEND) {
179 		if (wr->wr_flags & IBT_WR_SEND_IMMED) {
180 			nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
181 			immed_data = wr->wr.ud.udwr_immed;
182 		} else {
183 			nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
184 		}
185 		dest = wr->wr.ud.udwr_dest;
186 		ah = (hermon_ahhdl_t)dest->ud_ah;
187 		if (ah == NULL) {
188 			status = IBT_AH_HDL_INVALID;
189 			goto done;
190 		}
191 		HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
192 		i = 0;
193 	} else {
194 		status = IBT_QP_OP_TYPE_INVALID;
195 		goto done;
196 	}
197 
198 	if (nds > qp->qp_sq_sgl) {
199 		status = IBT_QP_SGL_LEN_INVALID;
200 		goto done;
201 	}
202 	for (last_ds = num_ds, j = i; j < nds; j++) {
203 		if (sgl[j].ds_len != 0)
204 			last_ds++;	/* real last ds of wqe to fill */
205 	}
206 	desc_sz = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc) >> 0x4;
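	/*
	 * desc_sz above is the descriptor size in 16-byte units, which is
	 * what the control segment expects.  Note that the loop below fills
	 * the data segments back-to-front: last_ds starts just past the
	 * final slot and is pre-decremented as each nonzero SGE is written,
	 * so the segments land in sgl[] order with zero-length entries
	 * squeezed out (zero can't be used directly because the hardware
	 * reads a zero byte count as 2GB).  The same pattern is used in the
	 * RC path and in hermon_wqe_send_build() below.
	 */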
207 	for (j = nds; --j >= i; ) {
208 		if (sgl[j].ds_len == 0) {
209 			continue;
210 		}
211 
212 		/*
213 		 * Fill in the Data Segment(s) for the current WQE, using the
214 		 * information contained in the scatter-gather list of the
215 		 * work request.
216 		 */
217 		last_ds--;
218 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[j]);
219 	}
220 
221 	membar_producer();
222 
223 	if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
224 		HERMON_WQE_BUILD_LSO(qp, old_ds, wr->wr.ud_lso.lso_mss,
225 		    wr->wr.ud_lso.lso_hdr_sz);
226 	}
227 
228 	fence = (wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
229 
230 	signaled_dbd = ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
231 	    (wr->wr_flags & IBT_WR_SEND_SIGNAL)) ? 1 : 0;
232 
233 	solicited = (wr->wr_flags & IBT_WR_SEND_SOLICIT) ? 1 : 0;
234 
235 	HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz, fence, immed_data,
236 	    solicited, signaled_dbd, wr->wr_flags & IBT_WR_SEND_CKSUM, qp);
237 
238 	wq->wq_wrid[tail] = wr->wr_id;
239 
240 	tail = next_tail;
241 
242 	/* Update some of the state in the QP */
243 	wq->wq_tail = tail;
244 
245 	membar_producer();
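	/*
	 * The barrier above guarantees that the fully built WQE is visible
	 * in memory before the ownership/opcode dword is rewritten below;
	 * hardware must never find a valid owner bit in front of a
	 * half-written WQE.
	 */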
246 
247 	/* Now set the ownership bit and opcode (first dword). */
248 	HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)desc, nopcode);
249 
250 	posted_cnt++;
251 	if (--num_wr > 0) {
252 		/* do the invalidate of the headroom */
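		/*
		 * The stamping below writes 0xFFFFFFFF into the leading
		 * dword of each 64-byte section of the headroom WQE after
		 * the first (sectperwqe is the WQE size in 32-bit words),
		 * so hardware prefetching past the posted WQEs sees an
		 * invalid pattern rather than stale descriptor contents.
		 */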
253 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
254 		    (tail + hdrmwqes) & qsize_msk);
255 		for (i = 16; i < sectperwqe; i += 16) {
256 			wqe_start[i] = 0xFFFFFFFF;
257 		}
258 
259 		wr++;
260 		goto post_next;
261 	}
262 done:
263 	if (posted_cnt != 0) {
264 		ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
265 
266 		membar_producer();
267 
268 		/* the FMA retry loop starts for Hermon doorbell register. */
269 		hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
270 		    fm_status, fm_test_num);
271 
272 		HERMON_UAR_DOORBELL(state, uarhdl,
273 		    (uint64_t *)(void *)&state->hs_uar->send,
274 		    (uint64_t)qp->qp_ring);
275 
276 		/* the FMA retry loop ends. */
277 		hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
278 		    fm_status, fm_test_num);
279 
280 		/* do the invalidate of the headroom */
281 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
282 		    (tail + hdrmwqes) & qsize_msk);
283 		for (i = 16; i < sectperwqe; i += 16) {
284 			wqe_start[i] = 0xFFFFFFFF;
285 		}
286 	}
287 	if (num_posted != NULL)
288 		*num_posted = posted_cnt;
289 
290 	mutex_exit(&qp->qp_sq_lock);
291 
292 	return (status);
293 
294 pio_error:
295 	mutex_exit(&qp->qp_sq_lock);
296 	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
297 	return (ibc_get_ci_failure(0));
298 }
299 
300 static int
301 hermon_post_send_rc(hermon_state_t *state, hermon_qphdl_t qp,
302     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
303 {
304 	uint64_t			*desc;
305 	hermon_workq_hdr_t		*wq;
306 	uint32_t			desc_sz;
307 	uint32_t			signaled_dbd, solicited;
308 	uint32_t			head, tail, next_tail, qsize_msk;
309 	uint32_t			hdrmwqes;
310 	int				status;
311 	uint32_t			nopcode, fence, immed_data = 0;
312 	hermon_hw_snd_wqe_remaddr_t	*rc;
313 	hermon_hw_snd_wqe_atomic_t	*at;
314 	hermon_hw_snd_wqe_bind_t	*bn;
315 	hermon_hw_wqe_sgl_t		*ds;
316 	ibt_wr_ds_t			*sgl;
317 	uint32_t			nds;
318 	int				i, last_ds, num_ds;
319 	uint32_t			*wqe_start;
320 	int				sectperwqe;
321 	uint_t				posted_cnt = 0;
322 	int				print_rdma;
323 	int				rlen;
324 	uint32_t			rkey;
325 	uint64_t			raddr;
326 
327 	/* initialize the FMA retry loop */
328 	hermon_pio_init(fm_loop_cnt, fm_status, fm_test_num);
329 
330 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
331 	_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&qp->qp_sq_lock))
332 
333 	/* Save away some initial QP state */
334 	wq = qp->qp_sq_wqhdr;
335 	qsize_msk = wq->wq_mask;
336 	hdrmwqes  = qp->qp_sq_hdrmwqes;		/* in WQEs  */
337 	sectperwqe = 1 << (qp->qp_sq_log_wqesz - 2);
338 
339 	tail	  = wq->wq_tail;
340 	head	  = wq->wq_head;
341 	status	  = DDI_SUCCESS;
342 
343 post_next:
344 	print_rdma = 0;
345 	rlen = 0;
346 
347 	/*
348 	 * Check for "queue full" condition.  If the queue
349 	 * is already full, then no more WQEs can be posted.
350 	 * So break out, ring a doorbell (if necessary) and
351 	 * return an error
352 	 */
353 	if (wq->wq_full != 0) {
354 		status = IBT_QP_FULL;
355 		goto done;
356 	}
357 	next_tail = (tail + 1) & qsize_msk;
358 	if (((tail + hdrmwqes) & qsize_msk) == head) {
359 		wq->wq_full = 1;
360 	}
361 
362 	desc = HERMON_QP_SQ_ENTRY(qp, tail);
363 
364 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
365 	    sizeof (hermon_hw_snd_wqe_ctrl_t));
366 	nds = wr->wr_nds;
367 	sgl = wr->wr_sgl;
368 	num_ds = 0;
369 
370 	/*
371 	 * Validate the operation type.  For RC requests, we allow
372 	 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
373 	 * operations, and memory window "Bind"
374 	 */
375 	switch (wr->wr_opcode) {
376 	default:
377 		status = IBT_QP_OP_TYPE_INVALID;
378 		goto done;
379 
380 	case IBT_WRC_SEND:
381 		if (wr->wr_flags & IBT_WR_SEND_IMMED) {
382 			nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
383 			immed_data = wr->wr.rc.rcwr.send_immed;
384 		} else {
385 			nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
386 		}
387 		break;
388 
389 	/*
390 	 * If this is an RDMA Read or RDMA Write request, then fill
391 	 * in the "Remote Address" header fields.
392 	 */
393 	case IBT_WRC_RDMAW:
394 		if (wr->wr_flags & IBT_WR_SEND_IMMED) {
395 			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAWI;
396 			immed_data = wr->wr.rc.rcwr.rdma.rdma_immed;
397 		} else {
398 			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAW;
399 		}
400 		/* FALLTHROUGH */
401 	case IBT_WRC_RDMAR:
402 		if (wr->wr_opcode == IBT_WRC_RDMAR)
403 			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAR;
404 		rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
405 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
406 
407 		/*
408 		 * Build the Remote Address Segment for the WQE, using
409 		 * the information from the RC work request.
410 		 */
411 		HERMON_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
412 
413 		if (hermon_rdma_debug) {
414 			print_rdma = hermon_rdma_debug;
415 			rkey = wr->wr.rc.rcwr.rdma.rdma_rkey;
416 			raddr = wr->wr.rc.rcwr.rdma.rdma_raddr;
417 		}
418 
419 		/* Update "ds" for filling in Data Segments (below) */
420 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)rc +
421 		    sizeof (hermon_hw_snd_wqe_remaddr_t));
422 		break;
423 
424 	/*
425 	 * If this is one of the Atomic type operations (i.e.
426 	 * Compare-Swap or Fetch-Add), then fill in both the "Remote
427 	 * Address" header fields and the "Atomic" header fields.
428 	 */
429 	case IBT_WRC_CSWAP:
430 		nopcode = HERMON_WQE_SEND_NOPCODE_ATMCS;
431 		/* FALLTHROUGH */
432 	case IBT_WRC_FADD:
433 		if (wr->wr_opcode == IBT_WRC_FADD)
434 			nopcode = HERMON_WQE_SEND_NOPCODE_ATMFA;
435 		rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
436 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
437 		at = (hermon_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
438 		    sizeof (hermon_hw_snd_wqe_remaddr_t));
439 
440 		/*
441 		 * Build the Remote Address and Atomic Segments for
442 		 * the WQE, using the information from the RC Atomic
443 		 * work request.
444 		 */
445 		HERMON_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
446 		HERMON_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
447 
448 		/* Update "ds" for filling in Data Segments (below) */
449 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)at +
450 		    sizeof (hermon_hw_snd_wqe_atomic_t));
451 
452 		/*
453 		 * Update "nds" and "sgl" because Atomic requests have
454 		 * only a single Data Segment.
455 		 */
456 		nds = 1;
457 		sgl = wr->wr_sgl;
458 		break;
459 
460 	/*
461 	 * If this is a memory window Bind operation, then we call the
462 	 * hermon_wr_bind_check() routine to validate the request and
463 	 * to generate the updated RKey.  If this is successful, then
464 	 * we fill in the WQE's "Bind" header fields.
465 	 */
466 	case IBT_WRC_BIND:
467 		nopcode = HERMON_WQE_SEND_NOPCODE_BIND;
468 		status = hermon_wr_bind_check(state, wr);
469 		if (status != DDI_SUCCESS)
470 			goto done;
471 
472 		bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
473 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
474 
475 		/*
476 		 * Build the Bind Memory Window Segments for the WQE,
477 		 * using the information from the RC Bind memory
478 		 * window work request.
479 		 */
480 		HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
481 
482 		/*
483 		 * Update the "ds" pointer.  Even though the "bind"
484 		 * operation requires no SGLs, this is necessary to
485 		 * facilitate the correct descriptor size calculations
486 		 * (below).
487 		 */
488 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
489 		    sizeof (hermon_hw_snd_wqe_bind_t));
490 		nds = 0;
491 	}
492 
493 	/*
494 	 * Now fill in the Data Segments (SGL) for the Send WQE based
495 	 * on the values set up above (i.e. "sgl", "nds", and the "ds"
496 	 * pointer).  Start by checking for a valid number of SGL entries.
497 	 */
498 	if (nds > qp->qp_sq_sgl) {
499 		status = IBT_QP_SGL_LEN_INVALID;
500 		goto done;
501 	}
502 
503 	for (last_ds = num_ds, i = 0; i < nds; i++) {
504 		if (sgl[i].ds_len != 0)
505 			last_ds++;	/* real last ds of wqe to fill */
506 	}
507 	desc_sz = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc) >> 0x4;
508 	for (i = nds; --i >= 0; ) {
509 		if (sgl[i].ds_len == 0) {
510 			continue;
511 		}
512 		rlen += sgl[i].ds_len;
513 		if (print_rdma & 0x2)
514 			IBTF_DPRINTF_L2("rdma", "post: [%d]: laddr %llx  "
515 			    "llen %x", i, sgl[i].ds_va, sgl[i].ds_len);
516 
517 		/*
518 		 * Fill in the Data Segment(s) for the current WQE, using the
519 		 * information contained in the scatter-gather list of the
520 		 * work request.
521 		 */
522 		last_ds--;
523 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[i]);
524 	}
525 
526 	if (print_rdma & 0x1) {
527 		IBTF_DPRINTF_L2("rdma", "post: indx %x  rkey %x  raddr %llx  "
528 		    "total len %x", tail, rkey, raddr, rlen);
529 	}
530 
531 	fence = (wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
532 
533 	signaled_dbd = ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
534 	    (wr->wr_flags & IBT_WR_SEND_SIGNAL)) ? 1 : 0;
535 
536 	solicited = (wr->wr_flags & IBT_WR_SEND_SOLICIT) ? 1 : 0;
537 
538 	HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz, fence, immed_data, solicited,
539 	    signaled_dbd, wr->wr_flags & IBT_WR_SEND_CKSUM, qp);
540 
541 	wq->wq_wrid[tail] = wr->wr_id;
542 
543 	tail = next_tail;
544 
545 	/* Update some of the state in the QP */
546 	wq->wq_tail = tail;
547 
548 	membar_producer();
549 
550 	/* Now set the ownership bit and opcode (first dword). */
551 	HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)desc, nopcode);
552 
553 	posted_cnt++;
554 	if (--num_wr > 0) {
555 		/* do the invalidate of the headroom */
556 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
557 		    (tail + hdrmwqes) & qsize_msk);
558 		for (i = 16; i < sectperwqe; i += 16) {
559 			wqe_start[i] = 0xFFFFFFFF;
560 		}
561 
562 		wr++;
563 		goto post_next;
564 	}
565 done:
566 
567 	if (posted_cnt != 0) {
568 		ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
569 
570 		membar_producer();
571 
572 		/* the FMA retry loop starts for Hermon doorbell register. */
573 		hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
574 		    fm_status, fm_test_num);
575 
576 		/* Ring the doorbell */
577 		HERMON_UAR_DOORBELL(state, uarhdl,
578 		    (uint64_t *)(void *)&state->hs_uar->send,
579 		    (uint64_t)qp->qp_ring);
580 
581 		/* the FMA retry loop ends. */
582 		hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
583 		    fm_status, fm_test_num);
584 
585 		/* do the invalidate of the headroom */
586 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
587 		    (tail + hdrmwqes) & qsize_msk);
588 		for (i = 16; i < sectperwqe; i += 16) {
589 			wqe_start[i] = 0xFFFFFFFF;
590 		}
591 	}
592 	/*
593 	 * Update the "num_posted" return value (if necessary).
594 	 * Then drop the locks and return success.
595 	 */
596 	if (num_posted != NULL) {
597 		*num_posted = posted_cnt;
598 	}
599 
600 	mutex_exit(&qp->qp_sq_lock);
601 	return (status);
602 
603 pio_error:
604 	mutex_exit(&qp->qp_sq_lock);
605 	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
606 	return (ibc_get_ci_failure(0));
607 }
608 
609 /*
610  * hermon_post_send()
611  *    Context: Can be called from interrupt or base context.
612  */
613 int
614 hermon_post_send(hermon_state_t *state, hermon_qphdl_t qp,
615     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
616 {
617 	ibt_send_wr_t 			*curr_wr;
618 	hermon_workq_hdr_t		*wq;
619 	hermon_ahhdl_t			ah;
620 	uint64_t			*desc, *prev;
621 	uint32_t			desc_sz;
622 	uint32_t			signaled_dbd, solicited;
623 	uint32_t			head, tail, next_tail, qsize_msk;
624 	uint32_t			sync_from, sync_to;
625 	uint32_t			hdrmwqes;
626 	uint_t				currindx, wrindx, numremain;
627 	uint_t				chainlen;
628 	uint_t				posted_cnt, maxstat;
629 	uint_t				total_posted;
630 	int				status;
631 	uint32_t			nopcode, fence, immed_data = 0;
632 	uint32_t			prev_nopcode;
633 
634 	/* initialize the FMA retry loop */
635 	hermon_pio_init(fm_loop_cnt, fm_status, fm_test);
636 
637 	/*
638 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
639 	 * clients to post to QP memory that is accessible directly by the
640 	 * user.  If the QP memory is user accessible, then return an error.
641 	 */
642 	if (qp->qp_is_umap) {
643 		return (IBT_QP_HDL_INVALID);
644 	}
645 
646 	mutex_enter(&qp->qp_lock);
647 
648 	/*
649 	 * Check QP state.  Cannot post Send requests from the "Reset",
650 	 * "Init", or "RTR" states.
651 	 */
652 	if ((qp->qp_state == HERMON_QP_RESET) ||
653 	    (qp->qp_state == HERMON_QP_INIT) ||
654 	    (qp->qp_state == HERMON_QP_RTR)) {
655 		mutex_exit(&qp->qp_lock);
656 		return (IBT_QP_STATE_INVALID);
657 	}
658 	mutex_exit(&qp->qp_lock);
659 	mutex_enter(&qp->qp_sq_lock);
660 
661 	if (qp->qp_is_special)
662 		goto post_many;
663 
664 	/* Use these optimized functions most of the time */
665 	if (qp->qp_serv_type == HERMON_QP_UD) {
666 		if (wr->wr_trans != IBT_UD_SRV) {
667 			mutex_exit(&qp->qp_sq_lock);
668 			return (IBT_QP_SRV_TYPE_INVALID);
669 		}
670 		return (hermon_post_send_ud(state, qp, wr, num_wr, num_posted));
671 	}
672 
673 	if (qp->qp_serv_type == HERMON_QP_RC) {
674 		if (wr->wr_trans != IBT_RC_SRV) {
675 			mutex_exit(&qp->qp_sq_lock);
676 			return (IBT_QP_SRV_TYPE_INVALID);
677 		}
678 		return (hermon_post_send_rc(state, qp, wr, num_wr, num_posted));
679 	}
680 
681 	if (qp->qp_serv_type == HERMON_QP_UC)
682 		goto post_many;
683 
684 	mutex_exit(&qp->qp_sq_lock);
685 	return (IBT_QP_SRV_TYPE_INVALID);
686 
687 post_many:
688 	/* general loop for non-optimized posting */
689 
690 	/* Save away some initial QP state */
691 	wq = qp->qp_sq_wqhdr;
692 	qsize_msk = wq->wq_mask;
693 	tail	  = wq->wq_tail;
694 	head	  = wq->wq_head;
695 	hdrmwqes  = qp->qp_sq_hdrmwqes;		/* in WQEs  */
696 
697 	/* Initialize posted_cnt */
698 	posted_cnt = 0;
699 	total_posted = 0;
700 
701 	/*
702 	 * For each ibt_send_wr_t in the wr[] list passed in, parse the
703 	 * request and build a Send WQE.  NOTE:  Because we are potentially
704 	 * building a chain of WQEs to post, we want to build them all first,
705 	 * and set the valid (HW Ownership) bit on all but the first.
706 	 * However, we do not want to validate the first one until the
707 	 * entire chain of WQEs has been built.  Then, as the final step,
708 	 * we set the valid bit in the first, flush if needed, and ring
709 	 * the appropriate doorbell.  NOTE: the doorbell ring may NOT be
710 	 * needed if the HCA is already processing, but the doorbell ring
711 	 * will be done regardless.  NOTE ALSO:  It is possible for more
712 	 * Work Requests to be posted than the HW will support at one
713 	 * shot.  If this happens, we need to be able to post and ring
714 	 * several chains here until the entire request is complete.
715 	 * NOTE ALSO:  the term "chain" is used to differentiate it from
716 	 * the Work Request List passed in, and because that's the
717 	 * terminology from the previous generations of HCA - but the
718 	 * WQEs are not, in fact, chained together for Hermon.
719 	 */
720 
721 	wrindx = 0;
722 	numremain = num_wr;
723 	status	  = DDI_SUCCESS;
724 	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
725 		/*
726 		 * For the first WQE on a new chain we need "prev" to point
727 		 * to the current descriptor.
728 		 */
729 		prev = HERMON_QP_SQ_ENTRY(qp, tail);
730 
731 		/*
732 		 * Unlike Tavor & Arbel, tail will maintain the number of the
733 		 * next (this) WQE to be posted.  Since there is no backward
734 		 * linking in Hermon, we can always just look ahead.
735 		 */
736 		/*
737 		 * Before we begin, save the current "tail index" for later
738 		 * DMA sync
739 		 */
740 		/* NOTE: don't need to go back one like arbel/tavor */
741 		sync_from = tail;
742 
743 		/*
744 		 * Break the request up into lists that are less than or
745 		 * equal to the maximum number of WQEs that can be posted
746 		 * per doorbell ring - 256 currently
747 		 */
748 		chainlen = (numremain > HERMON_QP_MAXDESC_PER_DB) ?
749 		    HERMON_QP_MAXDESC_PER_DB : numremain;
750 		numremain -= chainlen;
751 
752 		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
753 			/*
754 			 * Check for "queue full" condition.  If the queue
755 			 * is already full, then no more WQEs can be posted.
756 			 * So break out, ring a doorbell (if necessary) and
757 			 * return an error
758 			 */
759 			if (wq->wq_full != 0) {
760 				status = IBT_QP_FULL;
761 				break;
762 			}
763 
764 			/*
765 			 * Increment the "tail index". Check for "queue
766 			 * full" condition incl. headroom.  If we detect that
767 			 * the current work request is going to fill the work
768 			 * queue, then we mark this condition and continue.
769 			 * Don't need >=, because going one-by-one we have to
770 			 * hit it exactly sooner or later
771 			 */
772 
773 			next_tail = (tail + 1) & qsize_msk;
774 			if (((tail + hdrmwqes) & qsize_msk) == head) {
775 				wq->wq_full = 1;
776 			}
777 
778 			/*
779 			 * Get the address of the location where the next
780 			 * Send WQE should be built
781 			 */
782 			desc = HERMON_QP_SQ_ENTRY(qp, tail);
783 			/*
784 			 * Call hermon_wqe_send_build() to build the WQE
785 			 * at the given address.  This routine uses the
786 			 * information in the ibt_send_wr_t list (wr[]) and
787 			 * returns the size of the WQE when it returns.
788 			 */
789 			status = hermon_wqe_send_build(state, qp,
790 			    &wr[wrindx], desc, &desc_sz);
791 			if (status != DDI_SUCCESS) {
792 				break;
793 			}
794 
795 			/*
796 			 * Now, build the Ctrl Segment based on
797 			 * what was just done
798 			 */
799 			curr_wr = &wr[wrindx];
800 
801 			switch (curr_wr->wr_opcode) {
802 			case IBT_WRC_RDMAW:
803 				if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
804 					nopcode =
805 					    HERMON_WQE_SEND_NOPCODE_RDMAWI;
806 					immed_data =
807 					    hermon_wr_get_immediate(curr_wr);
808 				} else {
809 					nopcode = HERMON_WQE_SEND_NOPCODE_RDMAW;
810 				}
811 				break;
812 
813 			case IBT_WRC_SEND:
814 				if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
815 					nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
816 					immed_data =
817 					    hermon_wr_get_immediate(curr_wr);
818 				} else {
819 					nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
820 				}
821 				break;
822 
823 			case IBT_WRC_SEND_LSO:
824 				nopcode = HERMON_WQE_SEND_NOPCODE_LSO;
825 				break;
826 
827 			case IBT_WRC_RDMAR:
828 				nopcode = HERMON_WQE_SEND_NOPCODE_RDMAR;
829 				break;
830 
831 			case IBT_WRC_CSWAP:
832 				nopcode = HERMON_WQE_SEND_NOPCODE_ATMCS;
833 				break;
834 
835 			case IBT_WRC_FADD:
836 				nopcode = HERMON_WQE_SEND_NOPCODE_ATMFA;
837 				break;
838 
839 			case IBT_WRC_BIND:
840 				nopcode = HERMON_WQE_SEND_NOPCODE_BIND;
841 				break;
842 			}
843 
844 			fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
845 
846 			/*
847 			 * now, build up the control segment, leaving the
848 			 * owner bit as it is
849 			 */
850 
851 			if ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
852 			    (curr_wr->wr_flags & IBT_WR_SEND_SIGNAL)) {
853 				signaled_dbd = 1;
854 			} else {
855 				signaled_dbd = 0;
856 			}
857 			if (curr_wr->wr_flags & IBT_WR_SEND_SOLICIT)
858 				solicited = 1;
859 			else
860 				solicited = 0;
861 
862 			if (qp->qp_is_special) {
863 				/* Ensure correctness, set the ReRead bit */
864 				nopcode |= (1 << 6);
865 				ah = (hermon_ahhdl_t)
866 				    curr_wr->wr.ud.udwr_dest->ud_ah;
867 				mutex_enter(&ah->ah_lock);
868 				maxstat = ah->ah_udav->max_stat_rate;
869 				HERMON_WQE_SET_MLX_CTRL_SEGMENT(desc, desc_sz,
870 				    signaled_dbd, maxstat, ah->ah_udav->rlid,
871 				    qp, ah->ah_udav->sl);
872 				mutex_exit(&ah->ah_lock);
873 			} else {
874 				HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz,
875 				    fence, immed_data, solicited,
876 				    signaled_dbd, curr_wr->wr_flags &
877 				    IBT_WR_SEND_CKSUM, qp);
878 			}
879 			wq->wq_wrid[tail] = curr_wr->wr_id;
880 
881 			/*
882 			 * If this is not the first descriptor on the current
883 			 * chain, then set the ownership bit.
884 			 */
885 			if (currindx != 0) {		/* not the first */
886 				membar_producer();
887 				HERMON_SET_SEND_WQE_OWNER(qp,
888 				    (uint32_t *)desc, nopcode);
889 			} else
890 				prev_nopcode = nopcode;
891 
892 			/*
893 			 * Update the current "tail index" and increment
894 			 * "posted_cnt"
895 			 */
896 			tail = next_tail;
897 			posted_cnt++;
898 		}
899 
900 		/*
901 		 * If we reach here and there are one or more WQEs which have
902 		 * been successfully built as a chain, we have to finish up
903 	 * and prepare them for writing to the HW.
904 		 * The steps are:
905 		 * 	1. do the headroom fixup
906 		 *	2. add in the size of the headroom for the sync
907 		 *	3. write the owner bit for the first WQE
908 		 *	4. sync them
909 		 *	5. fix up the structures
910 		 *	6. hit the doorbell in UAR
911 		 */
912 		if (posted_cnt != 0) {
913 			ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
914 
915 			/*
916 			 * Save away updated "tail index" for the DMA sync
917 			 * including the headroom that will be needed
918 			 */
919 			sync_to = (tail + hdrmwqes) & qsize_msk;
920 
921 			/* do the invalidate of the headroom */
922 
923 			hermon_wqe_headroom(tail, qp);
924 
925 			/* Do a DMA sync for current send WQE(s) */
926 			hermon_wqe_sync(qp, sync_from, sync_to, HERMON_WR_SEND,
927 			    DDI_DMA_SYNC_FORDEV);
928 
929 			/* Update some of the state in the QP */
930 			wq->wq_tail = tail;
931 			total_posted += posted_cnt;
932 			posted_cnt = 0;
933 
934 			membar_producer();
935 
936 			/*
937 			 * Now set the ownership bit of the first
938 			 * one in the chain
939 			 */
940 			HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)prev,
941 			    prev_nopcode);
942 
943 			/* the FMA retry loop starts for Hermon doorbell. */
944 			hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
945 			    fm_status, fm_test);
946 
947 			HERMON_UAR_DOORBELL(state, uarhdl,
948 			    (uint64_t *)(void *)&state->hs_uar->send,
949 			    (uint64_t)qp->qp_ring);
950 
951 			/* the FMA retry loop ends. */
952 			hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
953 			    fm_status, fm_test);
954 		}
955 	}
956 
957 	/*
958 	 * Update the "num_posted" return value (if necessary).
959 	 * Then drop the locks and return success.
960 	 */
961 	if (num_posted != NULL) {
962 		*num_posted = total_posted;
963 	}
964 	mutex_exit(&qp->qp_sq_lock);
965 	return (status);
966 
967 pio_error:
968 	mutex_exit(&qp->qp_sq_lock);
969 	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
970 	return (ibc_get_ci_failure(0));
971 }
972 
973 
974 /*
975  * hermon_post_recv()
976  *    Context: Can be called from interrupt or base context.
977  */
978 int
979 hermon_post_recv(hermon_state_t *state, hermon_qphdl_t qp,
980     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
981 {
982 	uint64_t			*desc;
983 	hermon_workq_hdr_t		*wq;
984 	uint32_t			head, tail, next_tail, qsize_msk;
985 	uint32_t			sync_from, sync_to;
986 	uint_t				wrindx;
987 	uint_t				posted_cnt;
988 	int				status;
989 
990 	/*
991 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
992 	 * clients to post to QP memory that is accessible directly by the
993 	 * user.  If the QP memory is user accessible, then return an error.
994 	 */
995 	if (qp->qp_is_umap) {
996 		return (IBT_QP_HDL_INVALID);
997 	}
998 
999 	/* Initialize posted_cnt */
1000 	posted_cnt = 0;
1001 
1002 	mutex_enter(&qp->qp_lock);
1003 
1004 	/*
1005 	 * Check if QP is associated with an SRQ
1006 	 */
1007 	if (qp->qp_srq_en == HERMON_QP_SRQ_ENABLED) {
1008 		mutex_exit(&qp->qp_lock);
1009 		return (IBT_SRQ_IN_USE);
1010 	}
1011 
1012 	/*
1013 	 * Check QP state.  Cannot post Recv requests from the "Reset" state.
1014 	 */
1015 	if (qp->qp_state == HERMON_QP_RESET) {
1016 		mutex_exit(&qp->qp_lock);
1017 		return (IBT_QP_STATE_INVALID);
1018 	}
1019 
1020 	/* Check that work request transport type is valid */
1021 	if ((qp->qp_serv_type != HERMON_QP_UD) &&
1022 	    (qp->qp_serv_type != HERMON_QP_RC) &&
1023 	    (qp->qp_serv_type != HERMON_QP_UC)) {
1024 		mutex_exit(&qp->qp_lock);
1025 		return (IBT_QP_SRV_TYPE_INVALID);
1026 	}
1027 
1028 	mutex_exit(&qp->qp_lock);
1029 	mutex_enter(&qp->qp_rq_lock);
1030 
1031 	/*
1032 	 * Grab the lock for the WRID list, i.e., membar_consumer().
1033 	 * This is not needed because the mutex_enter() above has
1034 	 * the same effect.
1035 	 */
1036 
1037 	/* Save away some initial QP state */
1038 	wq = qp->qp_rq_wqhdr;
1039 	qsize_msk = wq->wq_mask;
1040 	tail	  = wq->wq_tail;
1041 	head	  = wq->wq_head;
1042 
1043 	wrindx = 0;
1044 	status	  = DDI_SUCCESS;
1045 	/*
1046 	 * Before we begin, save the current "tail index" for later
1047 	 * DMA sync
1048 	 */
1049 	sync_from = tail;
1050 
1051 	for (wrindx = 0; wrindx < num_wr; wrindx++) {
1052 		if (wq->wq_full != 0) {
1053 			status = IBT_QP_FULL;
1054 			break;
1055 		}
1056 		next_tail = (tail + 1) & qsize_msk;
1057 		if (next_tail == head) {
1058 			wq->wq_full = 1;
1059 		}
1060 		desc = HERMON_QP_RQ_ENTRY(qp, tail);
1061 		status = hermon_wqe_recv_build(state, qp, &wr[wrindx], desc);
1062 		if (status != DDI_SUCCESS) {
1063 			break;
1064 		}
1065 
1066 		wq->wq_wrid[tail] = wr[wrindx].wr_id;
1067 		qp->qp_rq_wqecntr++;
1068 
1069 		tail = next_tail;
1070 		posted_cnt++;
1071 	}
1072 
1073 	if (posted_cnt != 0) {
1074 		/* Save away updated "tail index" for the DMA sync */
1075 		sync_to = tail;
1076 
1077 		hermon_wqe_sync(qp, sync_from, sync_to, HERMON_WR_RECV,
1078 		    DDI_DMA_SYNC_FORDEV);
1079 
1080 		wq->wq_tail = tail;
1081 
1082 		membar_producer();	/* ensure wrids are visible */
1083 
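		/*
		 * Unlike the send side, which writes the UAR doorbell
		 * register, the receive side advertises new WQEs by
		 * updating the in-memory doorbell record with the low
		 * 16 bits of the cumulative WQE counter; the HCA reads
		 * this record to learn how far it may consume.
		 */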
1084 		/* Update the doorbell record w/ wqecntr */
1085 		HERMON_UAR_DB_RECORD_WRITE(qp->qp_rq_vdbr,
1086 		    qp->qp_rq_wqecntr & 0xFFFF);
1087 	}
1088 
1089 	if (num_posted != NULL) {
1090 		*num_posted = posted_cnt;
1091 	}
1092 
1093 
1094 	mutex_exit(&qp->qp_rq_lock);
1095 	return (status);
1096 }
1097 
1098 /*
1099  * hermon_post_srq()
1100  *    Context: Can be called from interrupt or base context.
1101  */
1102 int
1103 hermon_post_srq(hermon_state_t *state, hermon_srqhdl_t srq,
1104     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
1105 {
1106 	uint64_t			*desc;
1107 	hermon_workq_hdr_t		*wq;
1108 	uint_t				indx, wrindx;
1109 	uint_t				posted_cnt;
1110 	int				status;
1111 
1112 	mutex_enter(&srq->srq_lock);
1113 
1114 	/*
1115 	 * Check for user-mappable SRQ memory.  Note:  We do not allow kernel
1116 	 * clients to post to SRQ memory that is accessible directly by the
1117 	 * user.  If the SRQ memory is user accessible, then return an error.
1118 	 */
1119 	if (srq->srq_is_umap) {
1120 		mutex_exit(&srq->srq_lock);
1121 		return (IBT_SRQ_HDL_INVALID);
1122 	}
1123 
1124 	/*
1125 	 * Check SRQ state.  Cannot post Recv requests when the SRQ is in error.
1126 	 */
1127 	if (srq->srq_state == HERMON_SRQ_STATE_ERROR) {
1128 		mutex_exit(&srq->srq_lock);
1129 		return (IBT_QP_STATE_INVALID);
1130 	}
1131 
1132 	status = DDI_SUCCESS;
1133 	posted_cnt = 0;
1134 	wq = srq->srq_wq_wqhdr;
1135 	indx = wq->wq_head;
1136 
1137 	for (wrindx = 0; wrindx < num_wr; wrindx++) {
1138 
1139 		if (indx == wq->wq_tail) {
1140 			status = IBT_QP_FULL;
1141 			break;
1142 		}
1143 		desc = HERMON_SRQ_WQE_ADDR(srq, indx);
1144 
1145 		wq->wq_wrid[indx] = wr[wrindx].wr_id;
1146 
1147 		status = hermon_wqe_srq_build(state, srq, &wr[wrindx], desc);
1148 		if (status != DDI_SUCCESS) {
1149 			break;
1150 		}
1151 
1152 		hermon_wqe_sync(srq, indx, indx + 1,
1153 		    HERMON_WR_SRQ, DDI_DMA_SYNC_FORDEV);
1154 		posted_cnt++;
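		/*
		 * SRQ WQEs live on a free list: the second 16-bit word of
		 * each descriptor holds the index of the next free WQE, so
		 * advancing the head means following that link rather than
		 * incrementing an index.
		 */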
1155 		indx = htons(((uint16_t *)desc)[1]);
1156 		wq->wq_head = indx;
1157 	}
1158 
1159 	if (posted_cnt != 0) {
1160 
1161 		srq->srq_wq_wqecntr += posted_cnt;
1162 
1163 		membar_producer();	/* ensure wrids are visible */
1164 
1165 		/* Ring the doorbell w/ wqecntr */
1166 		HERMON_UAR_DB_RECORD_WRITE(srq->srq_wq_vdbr,
1167 		    srq->srq_wq_wqecntr & 0xFFFF);
1168 	}
1169 
1170 	if (num_posted != NULL) {
1171 		*num_posted = posted_cnt;
1172 	}
1173 
1174 	mutex_exit(&srq->srq_lock);
1175 	return (status);
1176 }
1177 
1178 
1179 /*
1180  * hermon_wqe_send_build()
1181  *    Context: Can be called from interrupt or base context.
1182  */
1183 static int
1184 hermon_wqe_send_build(hermon_state_t *state, hermon_qphdl_t qp,
1185     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1186 {
1187 	hermon_hw_snd_wqe_ud_t		*ud;
1188 	hermon_hw_snd_wqe_remaddr_t	*rc;
1189 	hermon_hw_snd_wqe_atomic_t	*at;
1190 	hermon_hw_snd_wqe_remaddr_t	*uc;
1191 	hermon_hw_snd_wqe_bind_t	*bn;
1192 	hermon_hw_wqe_sgl_t		*ds, *old_ds;
1193 	ibt_ud_dest_t			*dest;
1194 	ibt_wr_ds_t			*sgl;
1195 	hermon_ahhdl_t			ah;
1196 	uint32_t			nds;
1197 	int				i, j, last_ds, num_ds, status;
1198 	int				tmpsize;
1199 
1200 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
1201 
1202 	/* Initialize the information for the Data Segments */
1203 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1204 	    sizeof (hermon_hw_snd_wqe_ctrl_t));
1205 	nds = wr->wr_nds;
1206 	sgl = wr->wr_sgl;
1207 	num_ds = 0;
1208 	i = 0;
1209 
1210 	/*
1211 	 * Building a Send WQE depends first and foremost on the transport
1212 	 * type of the Work Request (i.e. UD, RC, or UC).
1213 	 */
1214 	switch (wr->wr_trans) {
1215 	case IBT_UD_SRV:
1216 		/* Ensure that work request transport type matches QP type */
1217 		if (qp->qp_serv_type != HERMON_QP_UD) {
1218 			return (IBT_QP_SRV_TYPE_INVALID);
1219 		}
1220 
1221 		/*
1222 		 * Validate the operation type.  For UD requests, only the
1223 		 * "Send" and "Send LSO" operations are valid.
1224 		 */
1225 		if (wr->wr_opcode != IBT_WRC_SEND &&
1226 		    wr->wr_opcode != IBT_WRC_SEND_LSO) {
1227 			return (IBT_QP_OP_TYPE_INVALID);
1228 		}
1229 
1230 		/*
1231 		 * If this is a Special QP (QP0 or QP1), then we need to
1232 		 * build MLX WQEs instead.  So jump to hermon_wqe_mlx_build()
1233 		 * and return whatever status it returns
1234 		 */
1235 		if (qp->qp_is_special) {
1236 			if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
1237 				return (IBT_QP_OP_TYPE_INVALID);
1238 			}
1239 			status = hermon_wqe_mlx_build(state, qp,
1240 			    wr, desc, size);
1241 			return (status);
1242 		}
1243 
1244 		/*
1245 		 * Otherwise, if this is a normal UD Send request, then fill
1246 		 * all the fields in the Hermon UD header for the WQE.  Note:
1247 		 * to do this we'll need to extract some information from the
1248 		 * Address Handle passed with the work request.
1249 		 */
1250 		ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
1251 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
1252 		if (wr->wr_opcode == IBT_WRC_SEND) {
1253 			dest = wr->wr.ud.udwr_dest;
1254 		} else {
1255 			dest = wr->wr.ud_lso.lso_ud_dest;
1256 		}
1257 		ah = (hermon_ahhdl_t)dest->ud_ah;
1258 		if (ah == NULL) {
1259 			return (IBT_AH_HDL_INVALID);
1260 		}
1261 
1262 		/*
1263 		 * Build the Unreliable Datagram Segment for the WQE, using
1264 		 * the information from the address handle and the work
1265 		 * request.
1266 		 */
1267 		/* mutex_enter(&ah->ah_lock); */
1268 		if (wr->wr_opcode == IBT_WRC_SEND) {
1269 			HERMON_WQE_BUILD_UD(qp, ud, ah, wr->wr.ud.udwr_dest);
1270 		} else {	/* IBT_WRC_SEND_LSO */
1271 			HERMON_WQE_BUILD_UD(qp, ud, ah,
1272 			    wr->wr.ud_lso.lso_ud_dest);
1273 		}
1274 		/* mutex_exit(&ah->ah_lock); */
1275 
1276 		/* Update "ds" for filling in Data Segments (below) */
1277 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
1278 		    sizeof (hermon_hw_snd_wqe_ud_t));
1279 
1280 		if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
1281 			int total_len;
1282 
1283 			total_len = (4 + 0xf + wr->wr.ud_lso.lso_hdr_sz) & ~0xf;
1284 			if ((uintptr_t)ds + total_len + (nds * 16) >
1285 			    (uintptr_t)desc + (1 << qp->qp_sq_log_wqesz))
1286 				return (IBT_QP_SGL_LEN_INVALID);
1287 
1288 			bcopy(wr->wr.ud_lso.lso_hdr, (uint32_t *)ds + 1,
1289 			    wr->wr.ud_lso.lso_hdr_sz);
1290 			old_ds = ds;
1291 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + total_len);
1292 			for (; i < nds; i++) {
1293 				if (sgl[i].ds_len == 0)
1294 					continue;
1295 				HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds],
1296 				    &sgl[i]);
1297 				num_ds++;
1298 				i++;
1299 				break;
1300 			}
1301 			membar_producer();
1302 			HERMON_WQE_BUILD_LSO(qp, old_ds, wr->wr.ud_lso.lso_mss,
1303 			    wr->wr.ud_lso.lso_hdr_sz);
1304 		}
1305 
1306 		break;
1307 
1308 	case IBT_RC_SRV:
1309 		/* Ensure that work request transport type matches QP type */
1310 		if (qp->qp_serv_type != HERMON_QP_RC) {
1311 			return (IBT_QP_SRV_TYPE_INVALID);
1312 		}
1313 
1314 		/*
1315 		 * Validate the operation type.  For RC requests, we allow
1316 		 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
1317 		 * operations, and memory window "Bind"
1318 		 */
1319 		if ((wr->wr_opcode != IBT_WRC_SEND) &&
1320 		    (wr->wr_opcode != IBT_WRC_RDMAR) &&
1321 		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
1322 		    (wr->wr_opcode != IBT_WRC_CSWAP) &&
1323 		    (wr->wr_opcode != IBT_WRC_FADD) &&
1324 		    (wr->wr_opcode != IBT_WRC_BIND)) {
1325 			return (IBT_QP_OP_TYPE_INVALID);
1326 		}
1327 
1328 		/*
1329 		 * If this is a Send request, then all we need to do is break
1330 	 * out here and begin the Data Segment processing below.
1331 		 */
1332 		if (wr->wr_opcode == IBT_WRC_SEND) {
1333 			break;
1334 		}
1335 
1336 		/*
1337 		 * If this is an RDMA Read or RDMA Write request, then fill
1338 		 * in the "Remote Address" header fields.
1339 		 */
1340 		if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
1341 		    (wr->wr_opcode == IBT_WRC_RDMAW)) {
1342 			rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1343 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1344 
1345 			/*
1346 			 * Build the Remote Address Segment for the WQE, using
1347 			 * the information from the RC work request.
1348 			 */
1349 			HERMON_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
1350 
1351 			/* Update "ds" for filling in Data Segments (below) */
1352 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)rc +
1353 			    sizeof (hermon_hw_snd_wqe_remaddr_t));
1354 			break;
1355 		}
1356 
1357 		/*
1358 	 * If this is one of the Atomic type operations (i.e.
1359 		 * Compare-Swap or Fetch-Add), then fill in both the "Remote
1360 		 * Address" header fields and the "Atomic" header fields.
1361 		 */
1362 		if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
1363 		    (wr->wr_opcode == IBT_WRC_FADD)) {
1364 			rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1365 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1366 			at = (hermon_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
1367 			    sizeof (hermon_hw_snd_wqe_remaddr_t));
1368 
1369 			/*
1370 			 * Build the Remote Address and Atomic Segments for
1371 			 * the WQE, using the information from the RC Atomic
1372 			 * work request.
1373 			 */
1374 			HERMON_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
1375 			HERMON_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
1376 
1377 			/* Update "ds" for filling in Data Segments (below) */
1378 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)at +
1379 			    sizeof (hermon_hw_snd_wqe_atomic_t));
1380 
1381 			/*
1382 			 * Update "nds" and "sgl" because Atomic requests have
1383 			 * only a single Data Segment (and they are encoded
1384 	 * somewhat differently in the work request).
1385 			 */
1386 			nds = 1;
1387 			sgl = wr->wr_sgl;
1388 			break;
1389 		}
1390 
1391 		/*
1392 	 * If this is a memory window Bind operation, then we call the
1393 		 * hermon_wr_bind_check() routine to validate the request and
1394 		 * to generate the updated RKey.  If this is successful, then
1395 		 * we fill in the WQE's "Bind" header fields.
1396 		 */
1397 		if (wr->wr_opcode == IBT_WRC_BIND) {
1398 			status = hermon_wr_bind_check(state, wr);
1399 			if (status != DDI_SUCCESS) {
1400 				return (status);
1401 			}
1402 
1403 			bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1404 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1405 
1406 			/*
1407 			 * Build the Bind Memory Window Segments for the WQE,
1408 			 * using the information from the RC Bind memory
1409 			 * window work request.
1410 			 */
1411 			HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
1412 
1413 			/*
1414 			 * Update the "ds" pointer.  Even though the "bind"
1415 			 * operation requires no SGLs, this is necessary to
1416 			 * facilitate the correct descriptor size calculations
1417 			 * (below).
1418 			 */
1419 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
1420 			    sizeof (hermon_hw_snd_wqe_bind_t));
1421 			nds = 0;
1422 		}
1423 		break;
1424 
1425 	case IBT_UC_SRV:
1426 		/* Ensure that work request transport type matches QP type */
1427 		if (qp->qp_serv_type != HERMON_QP_UC) {
1428 			return (IBT_QP_SRV_TYPE_INVALID);
1429 		}
1430 
1431 		/*
1432 		 * Validate the operation type.  For UC requests, we only
1433 		 * allow "Send", "RDMA Write", and memory window "Bind".
1434 		 * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
1435 		 * operations
1436 		 */
1437 		if ((wr->wr_opcode != IBT_WRC_SEND) &&
1438 		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
1439 		    (wr->wr_opcode != IBT_WRC_BIND)) {
1440 			return (IBT_QP_OP_TYPE_INVALID);
1441 		}
1442 
1443 		/*
1444 		 * If this is a Send request, then all we need to do is break
1445 	 * out here and begin the Data Segment processing below.
1446 		 */
1447 		if (wr->wr_opcode == IBT_WRC_SEND) {
1448 			break;
1449 		}
1450 
1451 		/*
1452 		 * If this is an RDMA Write request, then fill in the "Remote
1453 		 * Address" header fields.
1454 		 */
1455 		if (wr->wr_opcode == IBT_WRC_RDMAW) {
1456 			uc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1457 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1458 
1459 			/*
1460 			 * Build the Remote Address Segment for the WQE, using
1461 			 * the information from the UC work request.
1462 			 */
1463 			HERMON_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);
1464 
1465 			/* Update "ds" for filling in Data Segments (below) */
1466 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)uc +
1467 			    sizeof (hermon_hw_snd_wqe_remaddr_t));
1468 			break;
1469 		}
1470 
1471 		/*
1472 	 * If this is a memory window Bind operation, then we call the
1473 		 * hermon_wr_bind_check() routine to validate the request and
1474 		 * to generate the updated RKey.  If this is successful, then
1475 		 * we fill in the WQE's "Bind" header fields.
1476 		 */
1477 		if (wr->wr_opcode == IBT_WRC_BIND) {
1478 			status = hermon_wr_bind_check(state, wr);
1479 			if (status != DDI_SUCCESS) {
1480 				return (status);
1481 			}
1482 
1483 			bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1484 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1485 
1486 			/*
1487 			 * Build the Bind Memory Window Segments for the WQE,
1488 			 * using the information from the UC Bind memory
1489 			 * window work request.
1490 			 */
1491 			HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);
1492 
1493 			/*
1494 			 * Update the "ds" pointer.  Even though the "bind"
1495 			 * operation requires no SGLs, this is necessary to
1496 			 * facilitate the correct descriptor size calculations
1497 			 * (below).
1498 			 */
1499 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
1500 			    sizeof (hermon_hw_snd_wqe_bind_t));
1501 			nds = 0;
1502 		}
1503 		break;
1504 
1505 	default:
1506 		return (IBT_QP_SRV_TYPE_INVALID);
1507 	}
1508 
1509 	/*
1510 	 * Now fill in the Data Segments (SGL) for the Send WQE based on
1511 	 * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
1512 	 * Start by checking for a valid number of SGL entries.
1513 	 */
1514 	if (nds > qp->qp_sq_sgl) {
1515 		return (IBT_QP_SGL_LEN_INVALID);
1516 	}
1517 
1518 	/*
1519 	 * For each SGL in the Send Work Request, fill in the Send WQE's data
1520 	 * segments.  Note: We skip any SGL with zero size because Hermon
1521 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1522 	 * the encoding for zero means a 2GB transfer.
1523 	 */
1524 	for (last_ds = num_ds, j = i; j < nds; j++) {
1525 		if (sgl[j].ds_len != 0)
1526 			last_ds++;	/* real last ds of wqe to fill */
1527 	}
1528 
1529 	/*
1530 	 * Return the size of the descriptor (in 16-byte chunks).
1531 	 * For Hermon, we want them (for now) to be on stride size
1532 	 * boundaries, which was implicit in Tavor/Arbel
1533 	 *
1534 	 */
1535 	tmpsize = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc);
1536 
1537 	*size = tmpsize >> 0x4;
1538 
1539 	for (j = nds; --j >= i; ) {
1540 		if (sgl[j].ds_len == 0) {
1541 			continue;
1542 		}
1543 
1544 		/*
1545 		 * Fill in the Data Segment(s) for the current WQE, using the
1546 		 * information contained in the scatter-gather list of the
1547 		 * work request.
1548 		 */
1549 		last_ds--;
1550 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[j]);
1551 	}
1552 
1553 	return (DDI_SUCCESS);
1554 }
1555 
1556 
1557 
1558 /*
1559  * hermon_wqe_mlx_build()
1560  *    Context: Can be called from interrupt or base context.
1561  */
1562 static int
1563 hermon_wqe_mlx_build(hermon_state_t *state, hermon_qphdl_t qp,
1564     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1565 {
1566 	hermon_ahhdl_t		ah;
1567 	hermon_hw_udav_t	*udav;
1568 	ib_lrh_hdr_t		*lrh;
1569 	ib_grh_t		*grh;
1570 	ib_bth_hdr_t		*bth;
1571 	ib_deth_hdr_t		*deth;
1572 	hermon_hw_wqe_sgl_t	*ds;
1573 	ibt_wr_ds_t		*sgl;
1574 	uint8_t			*mgmtclass, *hpoint, *hcount;
1575 	uint32_t		nds, offset, pktlen;
1576 	uint32_t		desc_sz;
1577 	int			i, num_ds;
1578 	int			tmpsize;
1579 
1580 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
1581 
1582 	/* Initialize the information for the Data Segments */
1583 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1584 	    sizeof (hermon_hw_mlx_wqe_nextctrl_t));
1585 
1586 	/*
1587 	 * Pull the address handle from the work request. The UDAV will
1588 	 * be used to answer some questions about the request.
1589 	 */
1590 	ah = (hermon_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1591 	if (ah == NULL) {
1592 		return (IBT_AH_HDL_INVALID);
1593 	}
1594 	mutex_enter(&ah->ah_lock);
1595 	udav = ah->ah_udav;
1596 
1597 	/*
1598 	 * If the request is for QP1 and the destination LID is equal to
1599 	 * the Permissive LID, then return an error.  This combination is
1600 	 * not allowed
1601 	 */
1602 	if ((udav->rlid == IB_LID_PERMISSIVE) &&
1603 	    (qp->qp_is_special == HERMON_QP_GSI)) {
1604 		mutex_exit(&ah->ah_lock);
1605 		return (IBT_AH_HDL_INVALID);
1606 	}
1607 
1608 	/*
1609 	 * Calculate the size of the packet headers, including the GRH
1610 	 * (if necessary)
1611 	 */
1612 	desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
1613 	    sizeof (ib_deth_hdr_t);
1614 	if (udav->grh) {
1615 		desc_sz += sizeof (ib_grh_t);
1616 	}
1617 
1618 	/*
1619 	 * Begin to build the first "inline" data segment for the packet
1620 	 * headers.  Note:  By specifying "inline" we can build the contents
1621 	 * of the MAD packet headers directly into the work queue (as part
1622 	 * of the descriptor).  This has the advantage of both speeding things up
1623 	 * and of not requiring the driver to allocate/register any additional
1624 	 * memory for the packet headers.
1625 	 */
1626 	HERMON_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
1627 	desc_sz += 4;
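	/*
	 * The "+= 4" accounts for the inline segment's control word: the
	 * first four bytes of ds[0] hold the inline byte count, and the
	 * packet headers proper begin at ds[0] + 4, which is where the LRH
	 * pointer below is aimed.
	 */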
1628 
1629 	/*
1630 	 * Build Local Route Header (LRH)
1631 	 *    We start here by building the LRH into a temporary location.
1632 	 *    When we have finished we copy the LRH data into the descriptor.
1633 	 *
1634 	 *    Notice that the VL values are hardcoded.  This is not a problem
1635 	 *    because VL15 is decided later based on the value in the MLX
1636 	 *    transport "next/ctrl" header (see the "vl15" bit below), and it
1637 	 *    is otherwise (meaning for QP1) chosen from the SL-to-VL table
1638 	 *    values.  This rule does not hold for loopback packets however
1639 	 *    (all of which bypass the SL-to-VL tables) and it is the reason
1640 	 *    that non-QP0 MADs are setup with VL hardcoded to zero below.
1641 	 *
1642 	 *    Notice also that Source LID is hardcoded to the Permissive LID
1643 	 *    (0xFFFF).  This is also not a problem because if the Destination
1644 	 *    LID is not the Permissive LID, then the "slr" value in the MLX
1645 	 *    transport "next/ctrl" header will be set to zero and the hardware
1646 	 *    will pull the LID from the value in the port.
1647 	 */
1648 	lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
1649 	pktlen = (desc_sz + 0x100) >> 2;
1650 	HERMON_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
1651 
1652 	/*
1653 	 * Build Global Route Header (GRH)
1654 	 *    This is only built if necessary as defined by the "grh" bit in
1655 	 *    the address vector.  Note:  We also calculate the offset to the
1656 	 *    next header (BTH) based on whether or not the "grh" bit is set.
1657 	 */
1658 	if (udav->grh) {
1659 		/*
1660 		 * If the request is for QP0, then return an error.  The
1661 		 * combination of global routing (GRH) and QP0 is not allowed.
1662 		 */
1663 		if (qp->qp_is_special == HERMON_QP_SMI) {
1664 			mutex_exit(&ah->ah_lock);
1665 			return (IBT_AH_HDL_INVALID);
1666 		}
1667 		grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1668 		HERMON_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);
1669 
1670 		bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
1671 	} else {
1672 		bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1673 	}
1674 	mutex_exit(&ah->ah_lock);
1675 
1676 
1677 	/*
1678 	 * Build Base Transport Header (BTH)
1679 	 *    Notice that the M, PadCnt, and TVer fields are all set
1680 	 *    to zero implicitly.  This is true for all Management Datagrams
1681 	 *    (MADs), whether GSI or SMI.
1682 	 */
1683 	HERMON_WQE_BUILD_MLX_BTH(state, bth, qp, wr);
1684 
1685 	/*
1686 	 * Build Datagram Extended Transport Header (DETH)
1687 	 */
1688 	deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
1689 	HERMON_WQE_BUILD_MLX_DETH(deth, qp);
1690 
1691 	/* Ensure that the Data Segment is aligned on a 16-byte boundary */
1692 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
1693 	ds = (hermon_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
1694 	nds = wr->wr_nds;
1695 	sgl = wr->wr_sgl;
1696 	num_ds = 0;
1697 
1698 	/*
1699 	 * Now fill in the Data Segments (SGL) for the MLX WQE based on the
1700 	 * values set up above (i.e. "sgl", "nds", and the "ds" pointer).
1701 	 * Start by checking for a valid number of SGL entries.
1702 	 */
1703 	if (nds > qp->qp_sq_sgl) {
1704 		return (IBT_QP_SGL_LEN_INVALID);
1705 	}
1706 
1707 	/*
1708 	 * For each SGL in the Send Work Request, fill in the MLX WQE's data
1709 	 * segments.  Note: We skip any SGL with zero size because Hermon
1710 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1711 	 * the encoding for zero means a 2GB transfer.  Because of this special
1712 	 * encoding in the hardware, we mask the requested length with
1713 	 * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1714 	 * zero.)
1715 	 */
1716 	mgmtclass = hpoint = hcount = NULL;
1717 	offset = 0;
1718 	for (i = 0; i < nds; i++) {
1719 		if (sgl[i].ds_len == 0) {
1720 			continue;
1721 		}
1722 
1723 		/*
1724 		 * Fill in the Data Segment(s) for the MLX send WQE, using
1725 		 * the information contained in the scatter-gather list of
1726 		 * the work request.
1727 		 */
1728 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds], &sgl[i]);
1729 
1730 		/*
1731 		 * Search through the contents of all MADs posted to QP0 to
1732 		 * initialize pointers to the places where Directed Route "hop
1733 		 * pointer", "hop count", and "mgmtclass" would be.  Hermon
1734 		 * needs these updated (i.e. incremented or decremented, as
1735 		 * necessary) by software.
1736 		 */
1737 		if (qp->qp_is_special == HERMON_QP_SMI) {
1738 
1739 			HERMON_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
1740 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1741 
1742 			HERMON_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
1743 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1744 
1745 			HERMON_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
1746 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1747 
1748 			offset += sgl[i].ds_len;
1749 		}
1750 		num_ds++;
1751 	}
1752 
1753 	/*
1754 	 * Hermon's Directed Route MADs need to have the "hop pointer"
1755 	 * incremented/decremented (as necessary) depending on whether it is
1756 	 * currently less than or greater than the "hop count" (i.e. whether
1757 	 * the MAD is a request or a response.)
1758 	 */
1759 	if (qp->qp_is_special == HERMON_QP_SMI) {
1760 		HERMON_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
1761 		    *hpoint, *hcount);
1762 	}
1763 
1764 	/*
1765 	 * Now fill in the ICRC Data Segment.  This data segment is inlined
1766 	 * just like the packet headers above, but it is only four bytes and
1767 	 * is set to zero (to indicate that the hardware should generate ICRC).
1768 	 */
1769 	HERMON_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
1770 	num_ds++;
1771 
1772 	/*
1773 	 * Return the size of the descriptor (in 16-byte chunks).
1774 	 * For Hermon, we want them (for now) to be on stride size
1775 	 * boundaries, which was implicit in Tavor/Arbel.
1776 	 */
1777 	tmpsize = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc);
1778 
1779 	*size = tmpsize >> 0x04;
1780 
1781 	return (DDI_SUCCESS);
1782 }
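
/*
 * Illustrative sketch (not part of the driver): the WQE builders above
 * round the data segment pointer up to a 16-byte boundary and report
 * the finished descriptor's size in 16-byte chunks.  Both reduce to
 * simple mask-and-shift arithmetic; the function name below is
 * hypothetical and exists only for illustration.
 */
#if 0	/* example only, never compiled */
static uint_t
example_desc_size16(uintptr_t desc_start, uintptr_t unaligned_end)
{
	/* Round the end pointer up to the next 16-byte boundary */
	uintptr_t end = (unaligned_end + 0xF) & ~(uintptr_t)0xF;

	/* Express the descriptor span in 16-byte chunks */
	return ((uint_t)((end - desc_start) >> 4));
}
#endif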
1783 
1784 
1785 
1786 /*
1787  * hermon_wqe_recv_build()
1788  *    Context: Can be called from interrupt or base context.
1789  */
1790 /* ARGSUSED */
1791 static int
1792 hermon_wqe_recv_build(hermon_state_t *state, hermon_qphdl_t qp,
1793     ibt_recv_wr_t *wr, uint64_t *desc)
1794 {
1795 	hermon_hw_wqe_sgl_t	*ds;
1796 	int			i, num_ds;
1797 
1798 	ASSERT(MUTEX_HELD(&qp->qp_rq_lock));
1799 
1800 	/*
1801 	 * Fill in the Data Segments (SGL) for the Recv WQE.  No space
1802 	 * needs to be reserved for a ctrl segment (there is none on the
1803 	 * recv queue for Hermon), but we will need to append an invalid
1804 	 * (null) scatter pointer, per the PRM.
1805 	 */
1806 	ds = (hermon_hw_wqe_sgl_t *)(uintptr_t)desc;
1807 	num_ds = 0;
1808 
1809 	/* Check for valid number of SGL entries */
1810 	if (wr->wr_nds > qp->qp_rq_sgl) {
1811 		return (IBT_QP_SGL_LEN_INVALID);
1812 	}
1813 
1814 	/*
1815 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1816 	 * segments.  Note: We skip any SGL with zero size because Hermon
1817 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1818 	 * the encoding for zero means a 2GB transfer.  Because of this special
1819 	 * encoding in the hardware, we mask the requested length with
1820 	 * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1821 	 * zero.)
1822 	 */
1823 	for (i = 0; i < wr->wr_nds; i++) {
1824 		if (wr->wr_sgl[i].ds_len == 0) {
1825 			continue;
1826 		}
1827 
1828 		/*
1829 		 * Fill in the Data Segment(s) for the receive WQE, using the
1830 		 * information contained in the scatter-gather list of the
1831 		 * work request.
1832 		 */
1833 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &wr->wr_sgl[i]);
1834 		num_ds++;
1835 	}
1836 
1837 	/* Put in the null sgl pointer as well, if needed */
1838 	if (num_ds < qp->qp_rq_sgl) {
1839 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &null_sgl);
1840 	}
1841 
1842 	return (DDI_SUCCESS);
1843 }
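
/*
 * Illustrative sketch (not part of the driver): all of the WQE builders
 * skip zero-length SGL entries because Hermon encodes a "byte_cnt" of
 * zero as a 2GB transfer.  Masking a requested length with the byte
 * count mask makes an explicit 2GB request wrap to that zero encoding.
 * The mask value below is an assumption for illustration only; the
 * driver uses HERMON_WQE_SGL_BYTE_CNT_MASK.
 */
#if 0	/* example only, never compiled */
#define	EXAMPLE_BYTE_CNT_MASK	0x7FFFFFFFU	/* assumed 31-bit count */

static uint32_t
example_encode_byte_cnt(uint32_t ds_len)
{
	/* Caller must skip ds_len == 0; hardware reads zero as 2GB */
	return (ds_len & EXAMPLE_BYTE_CNT_MASK);
}
#endif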
1844 
1845 
1846 
1847 /*
1848  * hermon_wqe_srq_build()
1849  *    Context: Can be called from interrupt or base context.
1850  */
1851 /* ARGSUSED */
1852 static int
1853 hermon_wqe_srq_build(hermon_state_t *state, hermon_srqhdl_t srq,
1854     ibt_recv_wr_t *wr, uint64_t *desc)
1855 {
1856 	hermon_hw_wqe_sgl_t	*ds;
1857 	int			i, num_ds;
1858 
1859 	ASSERT(MUTEX_HELD(&srq->srq_lock));
1860 
1861 	/* Fill in the Data Segments (SGL) for the Recv WQE */
1862 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1863 	    sizeof (hermon_hw_srq_wqe_next_t));
1864 	num_ds = 0;
1865 
1866 	/* Check for valid number of SGL entries */
1867 	if (wr->wr_nds > srq->srq_wq_sgl) {
1868 		return (IBT_QP_SGL_LEN_INVALID);
1869 	}
1870 
1871 	/*
1872 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1873 	 * segments.  Note: We skip any SGL with zero size because Hermon
1874 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1875 	 * the encoding for zero means a 2GB transfer.  Because of this special
1876 	 * encoding in the hardware, we mask the requested length with
1877 	 * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1878 	 * zero.)
1879 	 */
1880 	for (i = 0; i < wr->wr_nds; i++) {
1881 		if (wr->wr_sgl[i].ds_len == 0) {
1882 			continue;
1883 		}
1884 
1885 		/*
1886 		 * Fill in the Data Segment(s) for the receive WQE, using the
1887 		 * information contained in the scatter-gather list of the
1888 		 * work request.
1889 		 */
1890 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &wr->wr_sgl[i]);
1891 		num_ds++;
1892 	}
1893 
1894 	/*
1895 	 * put in the null sgl pointer as well, if needed
1896 	 */
1897 	if (num_ds < srq->srq_wq_sgl) {
1898 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &null_sgl);
1899 	}
1900 
1901 	return (DDI_SUCCESS);
1902 }
1903 
1904 
1905 /*
1906  * hermon_wr_get_immediate()
1907  *    Context: Can be called from interrupt or base context.
1908  */
1909 static uint32_t
1910 hermon_wr_get_immediate(ibt_send_wr_t *wr)
1911 {
1912 	/*
1913 	 * This routine extracts the "immediate data" from the appropriate
1914 	 * location in the IBTF work request.  Because of the way the
1915 	 * work request structure is defined, the location for this data
1916 	 * depends on the actual work request operation type.
1917 	 */
1918 
1919 	/* For RDMA Write, test if RC or UC */
1920 	if (wr->wr_opcode == IBT_WRC_RDMAW) {
1921 		if (wr->wr_trans == IBT_RC_SRV) {
1922 			return (wr->wr.rc.rcwr.rdma.rdma_immed);
1923 		} else {  /* IBT_UC_SRV */
1924 			return (wr->wr.uc.ucwr.rdma.rdma_immed);
1925 		}
1926 	}
1927 
1928 	/* For Send, test if RC, UD, or UC */
1929 	if (wr->wr_opcode == IBT_WRC_SEND) {
1930 		if (wr->wr_trans == IBT_RC_SRV) {
1931 			return (wr->wr.rc.rcwr.send_immed);
1932 		} else if (wr->wr_trans == IBT_UD_SRV) {
1933 			return (wr->wr.ud.udwr_immed);
1934 		} else {  /* IBT_UC_SRV */
1935 			return (wr->wr.uc.ucwr.send_immed);
1936 		}
1937 	}
1938 
1939 	/*
1940 	 * If any other type of request, then immediate is undefined
1941 	 */
1942 	return (0);
1943 }
1944 
1945 /*
1946  * hermon_wqe_headroom()
1947  *	Context: Can be called from interrupt or base context; currently
1948  *	only called from base context.
1949  *	Fills in the headroom (invalidated WQEs) for the Send Queue.
1950  */
1951 
1952 static void
1953 hermon_wqe_headroom(uint_t from, hermon_qphdl_t qp)
1954 {
1955 	uint32_t	*wqe_start, *wqe_top, *wqe_base, qsize;
1956 	int		hdrmwqes, wqesizebytes, sectperwqe;
1957 	uint32_t	invalue;
1958 	int		i, j;
1959 
1960 	qsize	 = qp->qp_sq_bufsz;
1961 	wqesizebytes = 1 << qp->qp_sq_log_wqesz;
1962 	sectperwqe = wqesizebytes >> 6; 	/* 64 bytes/section */
1963 	hdrmwqes = qp->qp_sq_hdrmwqes;
1964 	wqe_base  = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, 0);
1965 	wqe_top	  = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, qsize);
1966 	wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, from);
1967 
1968 	for (i = 0; i < hdrmwqes; i++)	{
1969 		for (j = 0; j < sectperwqe; j++) {
1970 			if (j == 0) {		/* 1st section of wqe */
1971 				/* perserve ownership bit */
1972 				/* preserve the ownership bit */
1973 				    wqe_start) | 0x7FFFFFFF;
1974 			} else {
1975 				/* or just invalidate it */
1976 				invalue = 0xFFFFFFFF;
1977 			}
1978 			ddi_put32(qp->qp_wqinfo.qa_acchdl, wqe_start, invalue);
1979 			wqe_start += 16;	/* move 64 bytes */
1980 		}
1981 		if (wqe_start == wqe_top)	/* hit the end of the queue */
1982 			wqe_start = wqe_base;	/* wrap to start */
1983 	}
1984 }
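
/*
 * Illustrative sketch (not part of the driver): the headroom loop above
 * invalidates WQEs in 64-byte sections.  Only the first 32-bit word of
 * a WQE carries the hardware ownership bit (bit 31), so that word keeps
 * its top bit while all lower bits are set; every other word is simply
 * overwritten with all-ones.
 */
#if 0	/* example only, never compiled */
static uint32_t
example_invalidate_word(uint32_t current, int first_word_of_wqe)
{
	if (first_word_of_wqe)
		return (current | 0x7FFFFFFF);	/* preserve bit 31, set rest */
	return (0xFFFFFFFF);			/* invalidate outright */
}
#endif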
1985 
1986 /*
1987  * hermon_wqe_sync()
1988  *    Context: Can be called from interrupt or base context.
1989  */
1990 static void
1991 hermon_wqe_sync(void *hdl, uint_t sync_from, uint_t sync_to,
1992     uint_t sync_type, uint_t flag)
1993 {
1994 	hermon_qphdl_t		qp;
1995 	hermon_srqhdl_t		srq;
1996 	uint64_t		*wqe_from, *wqe_to;
1997 	uint64_t		*wq_base, *wq_top, *qp_base;
1998 	ddi_dma_handle_t	dmahdl;
1999 	off_t			offset;
2000 	size_t			length;
2001 	uint32_t		qsize;
2002 	int			status;
2003 
2004 	if (sync_type == HERMON_WR_SRQ) {
2005 		srq = (hermon_srqhdl_t)hdl;
2006 		/* Get the DMA handle from SRQ context */
2007 		dmahdl = srq->srq_mrhdl->mr_bindinfo.bi_dmahdl;
2008 		/* get base addr of the buffer */
2009 		qp_base = (uint64_t *)(void *)srq->srq_wq_buf;
2010 	} else {
2011 		qp = (hermon_qphdl_t)hdl;
2012 		/* Get the DMA handle from QP context */
2013 		dmahdl = qp->qp_mrhdl->mr_bindinfo.bi_dmahdl;
2014 		/* Determine the base address of the QP buffer */
2015 		if (qp->qp_sq_baseaddr == 0) {
2016 			qp_base = (uint64_t *)(void *)(qp->qp_sq_buf);
2017 		} else {
2018 			qp_base = (uint64_t *)(void *)(qp->qp_rq_buf);
2019 		}
2020 	}
2021 
2022 	/*
2023 	 * Depending on the type of the work queue, we grab information
2024 	 * about the address ranges we need to DMA sync.
2025 	 */
2026 
2027 	if (sync_type == HERMON_WR_SEND) {
2028 		wqe_from = HERMON_QP_SQ_ENTRY(qp, sync_from);
2029 		wqe_to   = HERMON_QP_SQ_ENTRY(qp, sync_to);
2030 		qsize	 = qp->qp_sq_bufsz;
2031 
2032 		wq_base = HERMON_QP_SQ_ENTRY(qp, 0);
2033 		wq_top	 = HERMON_QP_SQ_ENTRY(qp, qsize);
2034 	} else if (sync_type == HERMON_WR_RECV) {
2035 		wqe_from = HERMON_QP_RQ_ENTRY(qp, sync_from);
2036 		wqe_to   = HERMON_QP_RQ_ENTRY(qp, sync_to);
2037 		qsize	 = qp->qp_rq_bufsz;
2038 
2039 		wq_base = HERMON_QP_RQ_ENTRY(qp, 0);
2040 		wq_top	 = HERMON_QP_RQ_ENTRY(qp, qsize);
2041 	} else {
2042 		wqe_from = HERMON_SRQ_WQ_ENTRY(srq, sync_from);
2043 		wqe_to   = HERMON_SRQ_WQ_ENTRY(srq, sync_to);
2044 		qsize	 = srq->srq_wq_bufsz;
2045 
2046 		wq_base = HERMON_SRQ_WQ_ENTRY(srq, 0);
2047 		wq_top	 = HERMON_SRQ_WQ_ENTRY(srq, qsize);
2048 	}
2049 
2050 	/*
2051 	 * There are two possible cases for the beginning and end of the WQE
2052 	 * chain we are trying to sync.  Either this is the simple case, where
2053 	 * the end of the chain is below the beginning of the chain, or it is
2054 	 * the "wrap-around" case, where the end of the chain has wrapped over
2055 	 * the end of the queue.  In the former case, we simply need to
2056 	 * calculate the span from beginning to end and sync it.  In the latter
2057 	 * case, however, we need to calculate the span from the top of the
2058 	 * work queue to the end of the chain and sync that, and then we need
2059 	 * to find the other portion (from beginning of chain to end of queue)
2060 	 * and sync that as well.  Note: if the "top to end" span is actually
2061 	 * zero length, then we don't do a DMA sync because a zero length DMA
2062 	 * sync unnecessarily syncs the entire work queue.
2063 	 */
2064 	if (wqe_to > wqe_from) {
2065 		/* "From Beginning to End" */
2066 
2067 		offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)qp_base);
2068 		length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_from);
2069 
2070 		status = ddi_dma_sync(dmahdl, offset, length, flag);
2071 		if (status != DDI_SUCCESS) {
2072 			return;
2073 		}
2074 	} else {
2075 		/* "From Top to End" */
2076 
2077 		offset = (off_t)((uintptr_t)wq_base - (uintptr_t)qp_base);
2078 		length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wq_base);
2079 		if (length) {
2080 			status = ddi_dma_sync(dmahdl, offset, length, flag);
2081 			if (status != DDI_SUCCESS) {
2082 				return;
2083 			}
2084 		}
2085 
2086 		/* "From Beginning to Bottom" */
2087 
2088 		offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)qp_base);
2089 		length = (size_t)((uintptr_t)wq_top - (uintptr_t)wqe_from);
2090 		status = ddi_dma_sync(dmahdl, offset, length, flag);
2091 		if (status != DDI_SUCCESS) {
2092 			return;
2093 		}
2094 	}
2095 }
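
/*
 * Illustrative sketch (not part of the driver): the wrap-around case in
 * hermon_wqe_sync() splits one logical WQE range on the ring into (at
 * most) two physical spans.  With byte offsets relative to the start of
 * the work queue, the arithmetic looks like this; the real routine also
 * skips the DMA sync when the leading span is zero length.
 */
#if 0	/* example only, never compiled */
static void
example_ring_spans(size_t from, size_t to, size_t top,
    size_t *lead_len, size_t *trail_len)
{
	if (to > from) {
		*lead_len  = to - from;		/* single span: [from, to) */
		*trail_len = 0;
	} else {
		*lead_len  = to;		/* wrapped span: [0, to) */
		*trail_len = top - from;	/* trailing span: [from, top) */
	}
}
#endif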
2096 
2097 
2098 /*
2099  * hermon_wr_bind_check()
2100  *    Context: Can be called from interrupt or base context.
2101  */
2102 /* ARGSUSED */
2103 static int
2104 hermon_wr_bind_check(hermon_state_t *state, ibt_send_wr_t *wr)
2105 {
2106 	ibt_bind_flags_t	bind_flags;
2107 	uint64_t		vaddr, len;
2108 	uint64_t		reg_start_addr, reg_end_addr;
2109 	hermon_mwhdl_t		mw;
2110 	hermon_mrhdl_t		mr;
2111 	hermon_rsrc_t		*mpt;
2112 	uint32_t		new_rkey;
2113 
2114 	/* Check for a valid Memory Window handle in the WR */
2115 	mw = (hermon_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
2116 	if (mw == NULL) {
2117 		return (IBT_MW_HDL_INVALID);
2118 	}
2119 
2120 	/* Check for a valid Memory Region handle in the WR */
2121 	mr = (hermon_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
2122 	if (mr == NULL) {
2123 		return (IBT_MR_HDL_INVALID);
2124 	}
2125 
2126 	mutex_enter(&mr->mr_lock);
2127 	mutex_enter(&mw->mr_lock);
2128 
2129 	/*
2130 	 * Check here to see if the memory region has already been partially
2131 	 * deregistered as a result of a hermon_umap_umemlock_cb() callback.
2132 	 * If so, this is an error, return failure.
2133 	 */
2134 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
2135 		mutex_exit(&mr->mr_lock);
2136 		mutex_exit(&mw->mr_lock);
2137 		return (IBT_MR_HDL_INVALID);
2138 	}
2139 
2140 	/* Check for a valid Memory Window RKey (i.e. a matching RKey) */
2141 	if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
2142 		mutex_exit(&mr->mr_lock);
2143 		mutex_exit(&mw->mr_lock);
2144 		return (IBT_MR_RKEY_INVALID);
2145 	}
2146 
2147 	/* Check for a valid Memory Region LKey (i.e. a matching LKey) */
2148 	if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
2149 		mutex_exit(&mr->mr_lock);
2150 		mutex_exit(&mw->mr_lock);
2151 		return (IBT_MR_LKEY_INVALID);
2152 	}
2153 
2154 	/*
2155 	 * Now check for valid "vaddr" and "len".  Note:  We don't check the
2156 	 * "vaddr" range when "len == 0" (i.e. on unbind operations).
2157 	 */
2158 	len = wr->wr.rc.rcwr.bind->bind_len;
2159 	if (len != 0) {
2160 		vaddr = wr->wr.rc.rcwr.bind->bind_va;
2161 		reg_start_addr = mr->mr_bindinfo.bi_addr;
2162 		reg_end_addr   = mr->mr_bindinfo.bi_addr +
2163 		    (mr->mr_bindinfo.bi_len - 1);
2164 		if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
2165 			mutex_exit(&mr->mr_lock);
2166 			mutex_exit(&mw->mr_lock);
2167 			return (IBT_MR_VA_INVALID);
2168 		}
2169 		vaddr = (vaddr + len) - 1;
2170 		if (vaddr > reg_end_addr) {
2171 			mutex_exit(&mr->mr_lock);
2172 			mutex_exit(&mw->mr_lock);
2173 			return (IBT_MR_LEN_INVALID);
2174 		}
2175 	}
2176 
2177 	/*
2178 	 * Validate the bind access flags.  Remote Write and Atomic access for
2179 	 * the Memory Window require that Local Write access be set in the
2180 	 * corresponding Memory Region.
2181 	 */
2182 	bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
2183 	if (((bind_flags & IBT_WR_BIND_WRITE) ||
2184 	    (bind_flags & IBT_WR_BIND_ATOMIC)) &&
2185 	    !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
2186 		mutex_exit(&mr->mr_lock);
2187 		mutex_exit(&mw->mr_lock);
2188 		return (IBT_MR_ACCESS_REQ_INVALID);
2189 	}
2190 
2191 	/* Calculate the new RKey for the Memory Window */
2192 	mpt = mw->mr_mptrsrcp;
2193 	new_rkey = hermon_mr_keycalc(mpt->hr_indx);
2194 	new_rkey = hermon_mr_key_swap(new_rkey);
2195 
2196 	wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
2197 	mw->mr_rkey = new_rkey;
2198 
2199 	mutex_exit(&mr->mr_lock);
2200 	mutex_exit(&mw->mr_lock);
2201 	return (DDI_SUCCESS);
2202 }
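
/*
 * Illustrative sketch (not part of the driver): the "vaddr"/"len" check
 * in hermon_wr_bind_check() computes the registration's inclusive end
 * address as (start + len - 1), so a region running to the very top of
 * the address space does not overflow the comparison the way an
 * exclusive (start + len) end would.  The function below is a
 * hypothetical standalone restatement of that logic.
 */
#if 0	/* example only, never compiled */
static int
example_bind_range_ok(uint64_t va, uint64_t len, uint64_t reg_start,
    uint64_t reg_len)
{
	uint64_t	reg_end = reg_start + (reg_len - 1);

	if (len == 0)
		return (1);		/* unbind: range is not checked */
	if ((va < reg_start) || (va > reg_end))
		return (0);		/* start outside registration */
	return (((va + len) - 1) <= reg_end);	/* end must fit as well */
}
#endif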
2203 
2204 
2205 /*
2206  * hermon_wrid_from_reset_handling()
2207  *    Context: Can be called from interrupt or base context.
2208  */
2209 /* ARGSUSED */
2210 int
2211 hermon_wrid_from_reset_handling(hermon_state_t *state, hermon_qphdl_t qp)
2212 {
2213 	hermon_workq_hdr_t	*swq, *rwq;
2214 	uint_t			qp_srq_en;
2215 
2216 	if (qp->qp_is_umap)
2217 		return (DDI_SUCCESS);
2218 
2219 	/* grab the cq lock(s) to modify the wqavl tree */
2220 	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2221 #ifdef __lock_lint
2222 	mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2223 #else
2224 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
2225 		mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2226 #endif
2227 
2228 	/* Chain the newly allocated work queue header to the CQ's list */
2229 	hermon_cq_workq_add(qp->qp_sq_cqhdl, &qp->qp_sq_wqavl);
2230 
2231 	swq = qp->qp_sq_wqhdr;
2232 	swq->wq_head = 0;
2233 	swq->wq_tail = 0;
2234 	swq->wq_full = 0;
2235 
2236 	/*
2237 	 * Now we repeat all the above operations for the receive work queue,
2238 	 * or shared receive work queue.
2239 	 *
2240 	 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
2241 	 */
2242 	qp_srq_en = qp->qp_srq_en;
2243 
2244 #ifdef __lock_lint
2245 	mutex_enter(&qp->qp_srqhdl->srq_lock);
2246 #else
2247 	if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
2248 		mutex_enter(&qp->qp_srqhdl->srq_lock);
2249 	} else {
2250 		rwq = qp->qp_rq_wqhdr;
2251 		rwq->wq_head = 0;
2252 		rwq->wq_tail = 0;
2253 		rwq->wq_full = 0;
2254 		qp->qp_rq_wqecntr = 0;
2255 	}
2256 #endif
2257 	hermon_cq_workq_add(qp->qp_rq_cqhdl, &qp->qp_rq_wqavl);
2258 
2259 #ifdef __lock_lint
2260 	mutex_exit(&qp->qp_srqhdl->srq_lock);
2261 #else
2262 	if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
2263 		mutex_exit(&qp->qp_srqhdl->srq_lock);
2264 	}
2265 #endif
2266 
2267 #ifdef __lock_lint
2268 	mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2269 #else
2270 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
2271 		mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2272 #endif
2273 	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2274 	return (DDI_SUCCESS);
2275 }
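
/*
 * Illustrative sketch (not part of the driver): the reset-handling
 * routines above must take both CQ locks, but a QP's send and receive
 * queues may share a single CQ, in which case that one lock is taken
 * only once (the __lock_lint variants exist purely to keep the lock
 * analysis tooling happy).
 */
#if 0	/* example only, never compiled */
static void
example_lock_cq_pair(kmutex_t *rq_cq_lock, kmutex_t *sq_cq_lock)
{
	mutex_enter(rq_cq_lock);
	if (rq_cq_lock != sq_cq_lock)	/* distinct CQs: lock both */
		mutex_enter(sq_cq_lock);
}
#endif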
2276 
2277 
2278 /*
2279  * hermon_wrid_to_reset_handling()
2280  *    Context: Can be called from interrupt or base context.
2281  */
2282 int
2283 hermon_wrid_to_reset_handling(hermon_state_t *state, hermon_qphdl_t qp)
2284 {
2285 	uint_t			qp_srq_en;
2286 
2287 	if (qp->qp_is_umap)
2288 		return (DDI_SUCCESS);
2289 
2290 	/*
2291 	 * If there are unpolled entries in these CQs, they are
2292 	 * polled/flushed.
2293 	 * Grab the CQ lock(s) before manipulating the lists.
2294 	 */
2295 	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2296 #ifdef __lock_lint
2297 	mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2298 #else
2299 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
2300 		mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2301 #endif
2302 
2303 	qp_srq_en = qp->qp_srq_en;
2304 #ifdef __lock_lint
2305 	mutex_enter(&qp->qp_srqhdl->srq_lock);
2306 #else
2307 	if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
2308 		mutex_enter(&qp->qp_srqhdl->srq_lock);
2309 	}
2310 #endif
2311 	/*
2312 	 * Flush the entries on the CQ for this QP's QPN.
2313 	 */
2314 	hermon_cq_entries_flush(state, qp);
2315 
2316 #ifdef __lock_lint
2317 	mutex_exit(&qp->qp_srqhdl->srq_lock);
2318 #else
2319 	if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
2320 		mutex_exit(&qp->qp_srqhdl->srq_lock);
2321 	}
2322 #endif
2323 
2324 	hermon_cq_workq_remove(qp->qp_rq_cqhdl, &qp->qp_rq_wqavl);
2325 	hermon_cq_workq_remove(qp->qp_sq_cqhdl, &qp->qp_sq_wqavl);
2326 
2327 #ifdef __lock_lint
2328 	mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2329 #else
2330 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
2331 		mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2332 #endif
2333 	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2334 
2335 	return (IBT_SUCCESS);
2336 }
2337 
2338 
2339 /*
2340  * hermon_wrid_get_entry()
2341  *    Context: Can be called from interrupt or base context.
2342  */
2343 uint64_t
2344 hermon_wrid_get_entry(hermon_cqhdl_t cq, hermon_hw_cqe_t *cqe)
2345 {
2346 	hermon_workq_avl_t	*wqa;
2347 	hermon_workq_hdr_t	*wq;
2348 	uint64_t		wrid;
2349 	uint_t			send_or_recv, qpnum;
2350 	uint32_t		indx;
2351 
2352 	/*
2353 	 * Determine whether this CQE is a send or receive completion.
2354 	 */
2355 	send_or_recv = HERMON_CQE_SENDRECV_GET(cq, cqe);
2356 
2357 	/* Find the work queue for this QP number (send or receive side) */
2358 	qpnum = HERMON_CQE_QPNUM_GET(cq, cqe);
2359 	wqa = hermon_wrid_wqavl_find(cq, qpnum, send_or_recv);
2360 	wq = wqa->wqa_wq;
2361 
2362 	/*
2363 	 * Regardless of whether the completion is the result of a "success"
2364 	 * or a "failure", we lock the list of "containers" and attempt to
2365 	 * search for the first matching completion (i.e. the first WR
2366 	 * with a matching WQE addr and size).  Once we find it, we pull out
2367 	 * the "wrid" field and return it (see below).  XXX Note: One possible
2368 	 * future enhancement would be to enable this routine to skip over
2369 	 * any "unsignaled" completions to go directly to the next "signaled"
2370 	 * entry on success.
2371 	 */
2372 	indx = HERMON_CQE_WQEADDRSZ_GET(cq, cqe) & wq->wq_mask;
2373 	wrid = wq->wq_wrid[indx];
2374 	if (wqa->wqa_srq_en) {
2375 		struct hermon_sw_srq_s	*srq;
2376 		uint64_t		*desc;
2377 
2378 		/* put wqe back on the srq free list */
2379 		srq = wqa->wqa_srq;
2380 		mutex_enter(&srq->srq_lock);
2381 		desc = HERMON_SRQ_WQE_ADDR(srq, wq->wq_tail);
2382 		((uint16_t *)desc)[1] = htons(indx);
2383 		wq->wq_tail = indx;
2384 		mutex_exit(&srq->srq_lock);
2385 	} else {
2386 		wq->wq_head = (indx + 1) & wq->wq_mask;
2387 		wq->wq_full = 0;
2388 	}
2389 
2390 	return (wrid);
2391 }
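
/*
 * Illustrative sketch (not part of the driver): work queues are sized
 * to powers of two, so the WQE address/size value pulled from the CQE
 * becomes a WRID array index with a single mask, exactly as done above
 * with wq->wq_mask.
 */
#if 0	/* example only, never compiled */
static uint64_t
example_wrid_lookup(const uint64_t *wrid_arr, uint32_t qsize_pow2,
    uint32_t wqeaddrsz)
{
	return (wrid_arr[wqeaddrsz & (qsize_pow2 - 1)]);
}
#endif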
2392 
2393 
2394 int
2395 hermon_wrid_workq_compare(const void *p1, const void *p2)
2396 {
2397 	hermon_workq_compare_t	*cmpp;
2398 	hermon_workq_avl_t	*curr;
2399 
2400 	cmpp = (hermon_workq_compare_t *)p1;
2401 	curr = (hermon_workq_avl_t *)p2;
2402 
2403 	if (cmpp->cmp_qpn < curr->wqa_qpn)
2404 		return (-1);
2405 	else if (cmpp->cmp_qpn > curr->wqa_qpn)
2406 		return (+1);
2407 	else if (cmpp->cmp_type < curr->wqa_type)
2408 		return (-1);
2409 	else if (cmpp->cmp_type > curr->wqa_type)
2410 		return (+1);
2411 	else
2412 		return (0);
2413 }
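
/*
 * Illustrative sketch (not part of the driver): the comparator above
 * defines a total order on (QPN, type) pairs, which is what allows a
 * single AVL tree per CQ to hold both the send-side and receive-side
 * work queue entries for the same QP number.
 */
#if 0	/* example only, never compiled */
static int
example_compare(uint_t qpn1, uint_t type1, uint_t qpn2, uint_t type2)
{
	if (qpn1 != qpn2)
		return ((qpn1 < qpn2) ? -1 : +1);
	if (type1 != type2)
		return ((type1 < type2) ? -1 : +1);
	return (0);
}
#endif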
2414 
2415 
2416 /*
2417  * hermon_wrid_wqavl_find()
2418  *    Context: Can be called from interrupt or base context.
2419  */
2420 static hermon_workq_avl_t *
2421 hermon_wrid_wqavl_find(hermon_cqhdl_t cq, uint_t qpn, uint_t wq_type)
2422 {
2423 	hermon_workq_avl_t	*curr;
2424 	hermon_workq_compare_t	cmp;
2425 
2426 	/*
2427 	 * Look up the CQ's AVL tree of work queue headers, searching for
2428 	 * an entry with a matching QP number and work queue type (i.e.
2429 	 * send or receive).  Return the matching entry, or NULL if no
2430 	 * match is found.
2431 	 */
2432 	cmp.cmp_qpn = qpn;
2433 	cmp.cmp_type = wq_type;
2434 #ifdef __lock_lint
2435 	hermon_wrid_workq_compare(NULL, NULL);
2436 #endif
2437 	curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
2438 
2439 	return (curr);
2440 }
2441 
2442 
2443 /*
2444  * hermon_wrid_wqhdr_create()
2445  *    Context: Can be called from base context.
2446  */
2447 /* ARGSUSED */
2448 hermon_workq_hdr_t *
2449 hermon_wrid_wqhdr_create(int bufsz)
2450 {
2451 	hermon_workq_hdr_t	*wqhdr;
2452 
2453 	/*
2454 	 * Allocate space for the wqhdr, and an array to record all the wrids.
2455 	 */
2456 	wqhdr = (hermon_workq_hdr_t *)kmem_zalloc(sizeof (*wqhdr), KM_NOSLEEP);
2457 	if (wqhdr == NULL) {
2458 		return (NULL);
2459 	}
2460 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr))
2461 	wqhdr->wq_wrid = kmem_zalloc(bufsz * sizeof (uint64_t), KM_NOSLEEP);
2462 	if (wqhdr->wq_wrid == NULL) {
2463 		kmem_free(wqhdr, sizeof (*wqhdr));
2464 		return (NULL);
2465 	}
2466 	wqhdr->wq_size = bufsz;
2467 	wqhdr->wq_mask = bufsz - 1;
2468 
2469 	return (wqhdr);
2470 }
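
/*
 * Illustrative sketch (not part of the driver): the wq_mask computed in
 * hermon_wrid_wqhdr_create() is only a valid index mask because work
 * queue buffer sizes are powers of two; the function below is the
 * standard check for that property.
 */
#if 0	/* example only, never compiled */
static int
example_is_pow2(uint32_t bufsz)
{
	return ((bufsz != 0) && ((bufsz & (bufsz - 1)) == 0));
}
#endif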
2471 
2472 void
2473 hermon_wrid_wqhdr_destroy(hermon_workq_hdr_t *wqhdr)
2474 {
2475 	kmem_free(wqhdr->wq_wrid, wqhdr->wq_size * sizeof (uint64_t));
2476 	kmem_free(wqhdr, sizeof (*wqhdr));
2477 }
2478 
2479 
2480 /*
2481  * hermon_cq_workq_add()
2482  *    Context: Can be called from interrupt or base context.
2483  */
2484 static void
2485 hermon_cq_workq_add(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl)
2486 {
2487 	hermon_workq_compare_t	cmp;
2488 	avl_index_t		where;
2489 
2490 	cmp.cmp_qpn = wqavl->wqa_qpn;
2491 	cmp.cmp_type = wqavl->wqa_type;
2492 #ifdef __lock_lint
2493 	hermon_wrid_workq_compare(NULL, NULL);
2494 #endif
2495 	(void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
2496 	avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqavl, where);
2497 }
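
/*
 * Illustrative sketch (not part of the driver): the avl_find()/
 * avl_insert() pair above is the usual libavl idiom; a failed lookup
 * fills in the avl_index_t "where" cookie, which tells avl_insert()
 * where the new node belongs without walking the tree a second time.
 * The node and search key below are hypothetical void pointers.
 */
#if 0	/* example only, never compiled */
static void
example_avl_add(avl_tree_t *tree, void *node, void *search_key)
{
	avl_index_t	where;

	if (avl_find(tree, search_key, &where) == NULL)
		avl_insert(tree, node, where);
}
#endif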
2498 
2499 
2500 /*
2501  * hermon_cq_workq_remove()
2502  *    Context: Can be called from interrupt or base context.
2503  */
2504 static void
2505 hermon_cq_workq_remove(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl)
2506 {
2507 #ifdef __lock_lint
2508 	hermon_wrid_workq_compare(NULL, NULL);
2509 #endif
2510 	avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqavl);
2511 }
2512