xref: /illumos-gate/usr/src/uts/common/io/ib/adapters/hermon/hermon_wr.c (revision 949b58c70cf907006b9f724dfad665d44eca5881)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * hermon_wr.c
29  *    Hermon Work Request Processing Routines
30  *
31  *    Implements all the routines necessary to provide the PostSend(),
32  *    PostRecv() and PostSRQ() verbs.  Also contains all the code
33  *    necessary to implement the Hermon WRID tracking mechanism.
34  */
35 
36 #include <sys/types.h>
37 #include <sys/conf.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/modctl.h>
41 #include <sys/avl.h>
42 
43 #include <sys/ib/adapters/hermon/hermon.h>
44 
45 static uint32_t hermon_wr_get_immediate(ibt_send_wr_t *wr);
46 static int hermon_wr_bind_check(hermon_state_t *state, ibt_send_wr_t *wr);
47 static int hermon_wqe_send_build(hermon_state_t *state, hermon_qphdl_t qp,
48     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
49 static int hermon_wqe_mlx_build(hermon_state_t *state, hermon_qphdl_t qp,
50     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
51 static void hermon_wqe_headroom(uint_t from, hermon_qphdl_t qp);
52 static int hermon_wqe_recv_build(hermon_state_t *state, hermon_qphdl_t qp,
53     ibt_recv_wr_t *wr, uint64_t *desc);
54 static int hermon_wqe_srq_build(hermon_state_t *state, hermon_srqhdl_t srq,
55     ibt_recv_wr_t *wr, uint64_t *desc);
56 static void hermon_wqe_sync(void *hdl, uint_t sync_from,
57     uint_t sync_to, uint_t sync_type, uint_t flag);
58 static hermon_workq_avl_t *hermon_wrid_wqavl_find(hermon_cqhdl_t cq, uint_t qpn,
59     uint_t send_or_recv);
60 static void hermon_cq_workq_add(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl);
61 static void hermon_cq_workq_remove(hermon_cqhdl_t cq,
62     hermon_workq_avl_t *wqavl);
63 
64 static	ibt_wr_ds_t	null_sgl = { 0, 0x00000100, 0 };
65 
66 static int
67 hermon_post_send_ud(hermon_state_t *state, hermon_qphdl_t qp,
68     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
69 {
70 	hermon_hw_snd_wqe_ud_t		*ud;
71 	hermon_workq_hdr_t		*wq;
72 	hermon_ahhdl_t			ah;
73 	ibt_ud_dest_t			*dest;
74 	uint64_t			*desc;
75 	uint32_t			desc_sz;
76 	uint32_t			signaled_dbd, solicited;
77 	uint32_t			head, tail, next_tail, qsize_msk;
78 	uint32_t			hdrmwqes;
79 	uint32_t			nopcode, fence, immed_data = 0;
80 	hermon_hw_wqe_sgl_t		*ds, *old_ds;
81 	ibt_wr_ds_t			*sgl;
82 	uint32_t			nds, dnds;
83 	int				i, j, last_ds, num_ds, status;
84 	uint32_t			*wqe_start;
85 	int				sectperwqe;
86 	uint_t				posted_cnt = 0;
87 
88 	/* initialize the FMA retry loop */
89 	hermon_pio_init(fm_loop_cnt, fm_status, fm_test_num);
90 
91 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
92 	_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&qp->qp_sq_lock))
93 
94 	/* Grab the lock for the WRID list */
95 	membar_consumer();
96 
97 	/* Save away some initial QP state */
98 	wq = qp->qp_sq_wqhdr;
99 	qsize_msk = wq->wq_mask;
100 	hdrmwqes  = qp->qp_sq_hdrmwqes;		/* in WQEs  */
101 	sectperwqe = 1 << (qp->qp_sq_log_wqesz - 2);
102 
103 	tail	  = wq->wq_tail;
104 	head	  = wq->wq_head;
105 	status	  = DDI_SUCCESS;
106 
107 post_next:
108 	/*
109 	 * Check for "queue full" condition.  If the queue
110 	 * is already full, then no more WQEs can be posted.
111 	 * So break out, ring a doorbell (if necessary) and
112 	 * return an error
113 	 */
114 	if (wq->wq_full != 0) {
115 		status = IBT_QP_FULL;
116 		goto done;
117 	}
118 
119 	next_tail = (tail + 1) & qsize_msk;
120 	if (((tail + hdrmwqes) & qsize_msk) == head) {
121 		wq->wq_full = 1;
122 	}
123 
124 	desc = HERMON_QP_SQ_ENTRY(qp, tail);
125 
126 	ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
127 	    sizeof (hermon_hw_snd_wqe_ctrl_t));
128 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
129 	    sizeof (hermon_hw_snd_wqe_ud_t));
130 	nds = wr->wr_nds;
131 	sgl = wr->wr_sgl;
132 	num_ds = 0;
133 
134 	/* need to know the count of destination nds for backward loop */
135 	for (dnds = 0, i = 0; i < nds; i++) {
136 		if (sgl[i].ds_len != 0)
137 			dnds++;
138 	}
139 
140 	/*
141 	 * Build a Send or Send_LSO WQE
142 	 */
143 	if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
144 		int total_len;
145 
146 		nopcode = HERMON_WQE_SEND_NOPCODE_LSO;
147 		if (wr->wr.ud_lso.lso_hdr_sz > 60) {
148 			nopcode |= (1 << 6);	/* ReRead bit must be set */
149 		}
150 		dest = wr->wr.ud_lso.lso_ud_dest;
151 		ah = (hermon_ahhdl_t)dest->ud_ah;
152 		if (ah == NULL) {
153 			status = IBT_AH_HDL_INVALID;
154 			goto done;
155 		}
156 		HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
157 
158 		total_len = (4 + 0xf + wr->wr.ud_lso.lso_hdr_sz) & ~0xf;
159 		if ((uintptr_t)ds + total_len + (nds * 16) >
160 		    (uintptr_t)desc + (1 << qp->qp_sq_log_wqesz)) {
161 			status = IBT_QP_SGL_LEN_INVALID;
162 			goto done;
163 		}
164 		old_ds = ds;
165 		bcopy(wr->wr.ud_lso.lso_hdr, (uint32_t *)old_ds + 1,
166 		    wr->wr.ud_lso.lso_hdr_sz);
167 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + total_len);
168 		i = 0;
169 	} else if (wr->wr_opcode == IBT_WRC_SEND) {
170 		if (wr->wr_flags & IBT_WR_SEND_IMMED) {
171 			nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
172 			immed_data = wr->wr.ud.udwr_immed;
173 		} else {
174 			nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
175 		}
176 		dest = wr->wr.ud.udwr_dest;
177 		ah = (hermon_ahhdl_t)dest->ud_ah;
178 		if (ah == NULL) {
179 			status = IBT_AH_HDL_INVALID;
180 			goto done;
181 		}
182 		HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
183 		i = 0;
184 	} else {
185 		status = IBT_QP_OP_TYPE_INVALID;
186 		goto done;
187 	}
188 
189 	if (nds > qp->qp_sq_sgl) {
190 		status = IBT_QP_SGL_LEN_INVALID;
191 		goto done;
192 	}
193 	for (last_ds = num_ds, j = i; j < nds; j++) {
194 		if (sgl[j].ds_len != 0)
195 			last_ds++;	/* real last ds of wqe to fill */
196 	}
197 	desc_sz = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc) >> 0x4;
198 	for (j = nds; --j >= i; ) {
199 		if (sgl[j].ds_len == 0) {
200 			continue;
201 		}
202 
203 		/*
204 		 * Fill in the Data Segment(s) for the current WQE, using the
205 		 * information contained in the scatter-gather list of the
206 		 * work request.
207 		 */
208 		last_ds--;
209 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[j]);
210 	}
211 
212 	membar_producer();
213 
214 	if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
215 		HERMON_WQE_BUILD_LSO(qp, old_ds, wr->wr.ud_lso.lso_mss,
216 		    wr->wr.ud_lso.lso_hdr_sz);
217 	}
218 
219 	fence = (wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
220 
221 	signaled_dbd = ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
222 	    (wr->wr_flags & IBT_WR_SEND_SIGNAL)) ? 1 : 0;
223 
224 	solicited = (wr->wr_flags & IBT_WR_SEND_SOLICIT) ? 1 : 0;
225 
226 	HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz, fence, immed_data,
227 	    solicited, signaled_dbd, wr->wr_flags & IBT_WR_SEND_CKSUM, qp);
228 
229 	wq->wq_wrid[tail] = wr->wr_id;
230 
231 	tail = next_tail;
232 
233 	/* Update some of the state in the QP */
234 	wq->wq_tail = tail;
235 
236 	membar_producer();
237 
238 	/* Now set the ownership bit and opcode (first dword). */
239 	HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)desc, nopcode);
240 
241 	posted_cnt++;
242 	if (--num_wr > 0) {
243 		/* do the invalidate of the headroom */
244 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
245 		    (tail + hdrmwqes) & qsize_msk);
246 		for (i = 16; i < sectperwqe; i += 16) {
247 			wqe_start[i] = 0xFFFFFFFF;
248 		}
249 
250 		wr++;
251 		goto post_next;
252 	}
253 done:
254 	if (posted_cnt != 0) {
255 		ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
256 
257 		membar_producer();
258 
259 		/* the FMA retry loop starts for Hermon doorbell register. */
260 		hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
261 		    fm_status, fm_test_num);
262 
263 		HERMON_UAR_DOORBELL(state, uarhdl,
264 		    (uint64_t *)(void *)&state->hs_uar->send,
265 		    (uint64_t)qp->qp_ring);
266 
267 		/* the FMA retry loop ends. */
268 		hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
269 		    fm_status, fm_test_num);
270 
271 		/* do the invalidate of the headroom */
272 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
273 		    (tail + hdrmwqes) & qsize_msk);
274 		for (i = 16; i < sectperwqe; i += 16) {
275 			wqe_start[i] = 0xFFFFFFFF;
276 		}
277 	}
278 	if (num_posted != NULL)
279 		*num_posted = posted_cnt;
280 
281 	mutex_exit(&qp->qp_sq_lock);
282 
283 	return (status);
284 
285 pio_error:
286 	mutex_exit(&qp->qp_sq_lock);
287 	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
288 	return (ibc_get_ci_failure(0));
289 }
290 
291 static int
292 hermon_post_send_rc(hermon_state_t *state, hermon_qphdl_t qp,
293     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
294 {
295 	uint64_t			*desc;
296 	hermon_workq_hdr_t		*wq;
297 	uint32_t			desc_sz;
298 	uint32_t			signaled_dbd, solicited;
299 	uint32_t			head, tail, next_tail, qsize_msk;
300 	uint32_t			hdrmwqes;
301 	int				status;
302 	uint32_t			nopcode, fence, immed_data = 0;
303 	hermon_hw_snd_wqe_remaddr_t	*rc;
304 	hermon_hw_snd_wqe_atomic_t	*at;
305 	hermon_hw_snd_wqe_bind_t	*bn;
306 	hermon_hw_wqe_sgl_t		*ds;
307 	ibt_wr_ds_t			*sgl;
308 	uint32_t			nds;
309 	int				i, last_ds, num_ds;
310 	uint32_t			*wqe_start;
311 	int				sectperwqe;
312 	uint_t				posted_cnt = 0;
313 
314 	/* initialize the FMA retry loop */
315 	hermon_pio_init(fm_loop_cnt, fm_status, fm_test_num);
316 
317 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
318 	_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&qp->qp_sq_lock))
319 
320 	/* make sure we see any update of wq_head */
321 	membar_consumer();
322 
323 	/* Save away some initial QP state */
324 	wq = qp->qp_sq_wqhdr;
325 	qsize_msk = wq->wq_mask;
326 	hdrmwqes  = qp->qp_sq_hdrmwqes;		/* in WQEs  */
327 	sectperwqe = 1 << (qp->qp_sq_log_wqesz - 2);
328 
329 	tail	  = wq->wq_tail;
330 	head	  = wq->wq_head;
331 	status	  = DDI_SUCCESS;
332 
333 post_next:
334 	/*
335 	 * Check for "queue full" condition.  If the queue
336 	 * is already full, then no more WQEs can be posted.
337 	 * So break out, ring a doorbell (if necessary) and
338 	 * return an error
339 	 */
340 	if (wq->wq_full != 0) {
341 		status = IBT_QP_FULL;
342 		goto done;
343 	}
344 	next_tail = (tail + 1) & qsize_msk;
345 	if (((tail + hdrmwqes) & qsize_msk) == head) {
346 		wq->wq_full = 1;
347 	}
348 
349 	desc = HERMON_QP_SQ_ENTRY(qp, tail);
350 
351 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
352 	    sizeof (hermon_hw_snd_wqe_ctrl_t));
353 	nds = wr->wr_nds;
354 	sgl = wr->wr_sgl;
355 	num_ds = 0;
356 
357 	/*
358 	 * Validate the operation type.  For RC requests, we allow
359 	 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
360 	 * operations, and memory window "Bind"
361 	 */
362 	switch (wr->wr_opcode) {
363 	default:
364 		status = IBT_QP_OP_TYPE_INVALID;
365 		goto done;
366 
367 	case IBT_WRC_SEND:
368 		if (wr->wr_flags & IBT_WR_SEND_IMMED) {
369 			nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
370 			immed_data = wr->wr.rc.rcwr.send_immed;
371 		} else {
372 			nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
373 		}
374 		break;
375 
376 	/*
377 	 * If this is an RDMA Read or RDMA Write request, then fill
378 	 * in the "Remote Address" header fields.
379 	 */
380 	case IBT_WRC_RDMAW:
381 		if (wr->wr_flags & IBT_WR_SEND_IMMED) {
382 			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAWI;
383 			immed_data = wr->wr.rc.rcwr.rdma.rdma_immed;
384 		} else {
385 			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAW;
386 		}
387 		/* FALLTHROUGH */
388 	case IBT_WRC_RDMAR:
389 		if (wr->wr_opcode == IBT_WRC_RDMAR)
390 			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAR;
391 		rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
392 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
393 
394 		/*
395 		 * Build the Remote Address Segment for the WQE, using
396 		 * the information from the RC work request.
397 		 */
398 		HERMON_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
399 
400 		/* Update "ds" for filling in Data Segments (below) */
401 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)rc +
402 		    sizeof (hermon_hw_snd_wqe_remaddr_t));
403 		break;
404 
405 	/*
406 	 * If this is one of the Atomic type operations (i.e
407 	 * Compare-Swap or Fetch-Add), then fill in both the "Remote
408 	 * Address" header fields and the "Atomic" header fields.
409 	 */
410 	case IBT_WRC_CSWAP:
411 		nopcode = HERMON_WQE_SEND_NOPCODE_ATMCS;
412 		/* FALLTHROUGH */
413 	case IBT_WRC_FADD:
414 		if (wr->wr_opcode == IBT_WRC_FADD)
415 			nopcode = HERMON_WQE_SEND_NOPCODE_ATMFA;
416 		rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
417 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
418 		at = (hermon_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
419 		    sizeof (hermon_hw_snd_wqe_remaddr_t));
420 
421 		/*
422 		 * Build the Remote Address and Atomic Segments for
423 		 * the WQE, using the information from the RC Atomic
424 		 * work request.
425 		 */
426 		HERMON_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
427 		HERMON_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
428 
429 		/* Update "ds" for filling in Data Segments (below) */
430 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)at +
431 		    sizeof (hermon_hw_snd_wqe_atomic_t));
432 
433 		/*
434 		 * Update "nds" and "sgl" because Atomic requests have
435 		 * only a single Data Segment.
436 		 */
437 		nds = 1;
438 		sgl = wr->wr_sgl;
439 		break;
440 
441 	/*
442 	 * If this is memory window Bind operation, then we call the
443 	 * hermon_wr_bind_check() routine to validate the request and
444 	 * to generate the updated RKey.  If this is successful, then
445 	 * we fill in the WQE's "Bind" header fields.
446 	 */
447 	case IBT_WRC_BIND:
448 		nopcode = HERMON_WQE_SEND_NOPCODE_BIND;
449 		status = hermon_wr_bind_check(state, wr);
450 		if (status != DDI_SUCCESS)
451 			goto done;
452 
453 		bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
454 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
455 
456 		/*
457 		 * Build the Bind Memory Window Segments for the WQE,
458 		 * using the information from the RC Bind memory
459 		 * window work request.
460 		 */
461 		HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
462 
463 		/*
464 		 * Update the "ds" pointer.  Even though the "bind"
465 		 * operation requires no SGLs, this is necessary to
466 		 * facilitate the correct descriptor size calculations
467 		 * (below).
468 		 */
469 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
470 		    sizeof (hermon_hw_snd_wqe_bind_t));
471 		nds = 0;
472 	}
473 
474 	/*
475 	 * Now fill in the Data Segments (SGL) for the Send WQE based
476 	 * on the values setup above (i.e. "sgl", "nds", and the "ds"
477 	 * pointer. Start by checking for a valid number of SGL entries
478 	 */
479 	if (nds > qp->qp_sq_sgl) {
480 		status = IBT_QP_SGL_LEN_INVALID;
481 		goto done;
482 	}
483 
484 	for (last_ds = num_ds, i = 0; i < nds; i++) {
485 		if (sgl[i].ds_len != 0)
486 			last_ds++;	/* real last ds of wqe to fill */
487 	}
488 	desc_sz = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc) >> 0x4;
489 	for (i = nds; --i >= 0; ) {
490 		if (sgl[i].ds_len == 0) {
491 			continue;
492 		}
493 
494 		/*
495 		 * Fill in the Data Segment(s) for the current WQE, using the
496 		 * information contained in the scatter-gather list of the
497 		 * work request.
498 		 */
499 		last_ds--;
500 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[i]);
501 	}
502 
503 	fence = (wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
504 
505 	signaled_dbd = ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
506 	    (wr->wr_flags & IBT_WR_SEND_SIGNAL)) ? 1 : 0;
507 
508 	solicited = (wr->wr_flags & IBT_WR_SEND_SOLICIT) ? 1 : 0;
509 
510 	HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz, fence, immed_data, solicited,
511 	    signaled_dbd, wr->wr_flags & IBT_WR_SEND_CKSUM, qp);
512 
513 	wq->wq_wrid[tail] = wr->wr_id;
514 
515 	tail = next_tail;
516 
517 	/* Update some of the state in the QP */
518 	wq->wq_tail = tail;
519 
520 	membar_producer();
521 
522 	/* Now set the ownership bit of the first one in the chain. */
523 	HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)desc, nopcode);
524 
525 	posted_cnt++;
526 	if (--num_wr > 0) {
527 		/* do the invalidate of the headroom */
528 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
529 		    (tail + hdrmwqes) & qsize_msk);
530 		for (i = 16; i < sectperwqe; i += 16) {
531 			wqe_start[i] = 0xFFFFFFFF;
532 		}
533 
534 		wr++;
535 		goto post_next;
536 	}
537 done:
538 
539 	if (posted_cnt != 0) {
540 		ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
541 
542 		membar_producer();
543 
544 		/* the FMA retry loop starts for Hermon doorbell register. */
545 		hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
546 		    fm_status, fm_test_num);
547 
548 		/* Ring the doorbell */
549 		HERMON_UAR_DOORBELL(state, uarhdl,
550 		    (uint64_t *)(void *)&state->hs_uar->send,
551 		    (uint64_t)qp->qp_ring);
552 
553 		/* the FMA retry loop ends. */
554 		hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
555 		    fm_status, fm_test_num);
556 
557 		/* do the invalidate of the headroom */
558 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
559 		    (tail + hdrmwqes) & qsize_msk);
560 		for (i = 16; i < sectperwqe; i += 16) {
561 			wqe_start[i] = 0xFFFFFFFF;
562 		}
563 	}
564 	/*
565 	 * Update the "num_posted" return value (if necessary).
566 	 * Then drop the locks and return success.
567 	 */
568 	if (num_posted != NULL) {
569 		*num_posted = posted_cnt;
570 	}
571 
572 	mutex_exit(&qp->qp_sq_lock);
573 	return (status);
574 
575 pio_error:
576 	mutex_exit(&qp->qp_sq_lock);
577 	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
578 	return (ibc_get_ci_failure(0));
579 }
580 
581 /*
582  * hermon_post_send()
583  *    Context: Can be called from interrupt or base context.
584  */
585 int
586 hermon_post_send(hermon_state_t *state, hermon_qphdl_t qp,
587     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
588 {
589 	ibt_send_wr_t 			*curr_wr;
590 	hermon_workq_hdr_t		*wq;
591 	hermon_ahhdl_t			ah;
592 	uint64_t			*desc, *prev;
593 	uint32_t			desc_sz;
594 	uint32_t			signaled_dbd, solicited;
595 	uint32_t			head, tail, next_tail, qsize_msk;
596 	uint32_t			sync_from, sync_to;
597 	uint32_t			hdrmwqes;
598 	uint_t				currindx, wrindx, numremain;
599 	uint_t				chainlen;
600 	uint_t				posted_cnt, maxstat;
601 	uint_t				total_posted;
602 	int				status;
603 	uint32_t			nopcode, fence, immed_data = 0;
604 	uint32_t			prev_nopcode;
605 
606 	/* initialize the FMA retry loop */
607 	hermon_pio_init(fm_loop_cnt, fm_status, fm_test);
608 
609 	/*
610 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
611 	 * clients to post to QP memory that is accessible directly by the
612 	 * user.  If the QP memory is user accessible, then return an error.
613 	 */
614 	if (qp->qp_is_umap) {
615 		return (IBT_QP_HDL_INVALID);
616 	}
617 
618 	mutex_enter(&qp->qp_lock);
619 
620 	/*
621 	 * Check QP state.  Can not post Send requests from the "Reset",
622 	 * "Init", or "RTR" states
623 	 */
624 	if ((qp->qp_state == HERMON_QP_RESET) ||
625 	    (qp->qp_state == HERMON_QP_INIT) ||
626 	    (qp->qp_state == HERMON_QP_RTR)) {
627 		mutex_exit(&qp->qp_lock);
628 		return (IBT_QP_STATE_INVALID);
629 	}
630 	mutex_exit(&qp->qp_lock);
631 	mutex_enter(&qp->qp_sq_lock);
632 
633 	if (qp->qp_is_special)
634 		goto post_many;
635 
636 	/* Use these optimized functions most of the time */
637 	if (qp->qp_serv_type == HERMON_QP_UD)
638 		return (hermon_post_send_ud(state, qp, wr, num_wr, num_posted));
639 
640 	if (qp->qp_serv_type == HERMON_QP_RC)
641 		return (hermon_post_send_rc(state, qp, wr, num_wr, num_posted));
642 
643 	if (qp->qp_serv_type == HERMON_QP_UC)
644 		goto post_many;
645 
646 	mutex_exit(&qp->qp_sq_lock);
647 	return (IBT_QP_SRV_TYPE_INVALID);
648 
649 post_many:
650 	/* general loop for non-optimized posting */
651 
652 	/* Grab the lock for the WRID list */
653 	membar_consumer();
654 
655 	/* Save away some initial QP state */
656 	wq = qp->qp_sq_wqhdr;
657 	qsize_msk = wq->wq_mask;
658 	tail	  = wq->wq_tail;
659 	head	  = wq->wq_head;
660 	hdrmwqes  = qp->qp_sq_hdrmwqes;		/* in WQEs  */
661 
662 	/* Initialize posted_cnt */
663 	posted_cnt = 0;
664 	total_posted = 0;
665 
666 	/*
667 	 * For each ibt_send_wr_t in the wr[] list passed in, parse the
668 	 * request and build a Send WQE.  NOTE:  Because we are potentially
669 	 * building a chain of WQEs to post, we want to build them all first,
670 	 * and set the valid (HW Ownership) bit on all but the first.
671 	 * However, we do not want to validate the first one until the
672 	 * entire chain of WQEs has been built.  Then in the final
673 	 * we set the valid bit in the first, flush if needed, and as a last
674 	 * step ring the appropriate doorbell.  NOTE: the doorbell ring may
675 	 * NOT be needed if the HCA is already processing, but the doorbell
676 	 * ring will be done regardless. NOTE ALSO:  It is possible for
677 	 * more Work Requests to be posted than the HW will support at one
678 	 * shot.  If this happens, we need to be able to post and ring
679 	 * several chains here until the the entire request is complete.
680 	 * NOTE ALSO:  the term "chain" is used to differentiate it from
681 	 * Work Request List passed in; and because that's the terminology
682 	 * from the previous generations of HCA - but the WQEs are not, in fact
683 	 * chained together for Hermon
684 	 */
685 
686 	wrindx = 0;
687 	numremain = num_wr;
688 	status	  = DDI_SUCCESS;
689 	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
690 		/*
691 		 * For the first WQE on a new chain we need "prev" to point
692 		 * to the current descriptor.
693 		 */
694 		prev = HERMON_QP_SQ_ENTRY(qp, tail);
695 
696 	/*
697 	 * unlike Tavor & Arbel, tail will maintain the number of the
698 	 * next (this) WQE to be posted.  Since there is no backward linking
699 	 * in Hermon, we can always just look ahead
700 	 */
701 		/*
702 		 * Before we begin, save the current "tail index" for later
703 		 * DMA sync
704 		 */
705 		/* NOTE: don't need to go back one like arbel/tavor */
706 		sync_from = tail;
707 
708 		/*
709 		 * Break the request up into lists that are less than or
710 		 * equal to the maximum number of WQEs that can be posted
711 		 * per doorbell ring - 256 currently
712 		 */
713 		chainlen = (numremain > HERMON_QP_MAXDESC_PER_DB) ?
714 		    HERMON_QP_MAXDESC_PER_DB : numremain;
715 		numremain -= chainlen;
716 
717 		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
718 			/*
719 			 * Check for "queue full" condition.  If the queue
720 			 * is already full, then no more WQEs can be posted.
721 			 * So break out, ring a doorbell (if necessary) and
722 			 * return an error
723 			 */
724 			if (wq->wq_full != 0) {
725 				status = IBT_QP_FULL;
726 				break;
727 			}
728 
729 			/*
730 			 * Increment the "tail index". Check for "queue
731 			 * full" condition incl. headroom.  If we detect that
732 			 * the current work request is going to fill the work
733 			 * queue, then we mark this condition and continue.
734 			 * Don't need >=, because going one-by-one we have to
735 			 * hit it exactly sooner or later
736 			 */
737 
738 			next_tail = (tail + 1) & qsize_msk;
739 			if (((tail + hdrmwqes) & qsize_msk) == head) {
740 				wq->wq_full = 1;
741 			}
742 
743 			/*
744 			 * Get the address of the location where the next
745 			 * Send WQE should be built
746 			 */
747 			desc = HERMON_QP_SQ_ENTRY(qp, tail);
748 			/*
749 			 * Call hermon_wqe_send_build() to build the WQE
750 			 * at the given address.  This routine uses the
751 			 * information in the ibt_send_wr_t list (wr[]) and
752 			 * returns the size of the WQE when it returns.
753 			 */
754 			status = hermon_wqe_send_build(state, qp,
755 			    &wr[wrindx], desc, &desc_sz);
756 			if (status != DDI_SUCCESS) {
757 				break;
758 			}
759 
760 			/*
761 			 * Now, build the Ctrl Segment based on
762 			 * what was just done
763 			 */
764 			curr_wr = &wr[wrindx];
765 
766 			switch (curr_wr->wr_opcode) {
767 			case IBT_WRC_RDMAW:
768 				if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
769 					nopcode =
770 					    HERMON_WQE_SEND_NOPCODE_RDMAWI;
771 					immed_data =
772 					    hermon_wr_get_immediate(curr_wr);
773 				} else {
774 					nopcode = HERMON_WQE_SEND_NOPCODE_RDMAW;
775 				}
776 				break;
777 
778 			case IBT_WRC_SEND:
779 				if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
780 					nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
781 					immed_data =
782 					    hermon_wr_get_immediate(curr_wr);
783 				} else {
784 					nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
785 				}
786 				break;
787 
788 			case IBT_WRC_SEND_LSO:
789 				nopcode = HERMON_WQE_SEND_NOPCODE_LSO;
790 				break;
791 
792 			case IBT_WRC_RDMAR:
793 				nopcode = HERMON_WQE_SEND_NOPCODE_RDMAR;
794 				break;
795 
796 			case IBT_WRC_CSWAP:
797 				nopcode = HERMON_WQE_SEND_NOPCODE_ATMCS;
798 				break;
799 
800 			case IBT_WRC_FADD:
801 				nopcode = HERMON_WQE_SEND_NOPCODE_ATMFA;
802 				break;
803 
804 			case IBT_WRC_BIND:
805 				nopcode = HERMON_WQE_SEND_NOPCODE_BIND;
806 				break;
807 			}
808 
809 			fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
810 
811 			/*
812 			 * now, build up the control segment, leaving the
813 			 * owner bit as it is
814 			 */
815 
816 			if ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
817 			    (curr_wr->wr_flags & IBT_WR_SEND_SIGNAL)) {
818 				signaled_dbd = 1;
819 			} else {
820 				signaled_dbd = 0;
821 			}
822 			if (curr_wr->wr_flags & IBT_WR_SEND_SOLICIT)
823 				solicited = 1;
824 			else
825 				solicited = 0;
826 
827 			if (qp->qp_is_special) {
828 				/* Ensure correctness, set the ReRead bit */
829 				nopcode |= (1 << 6);
830 				ah = (hermon_ahhdl_t)
831 				    curr_wr->wr.ud.udwr_dest->ud_ah;
832 				mutex_enter(&ah->ah_lock);
833 				maxstat = ah->ah_udav->max_stat_rate;
834 				HERMON_WQE_SET_MLX_CTRL_SEGMENT(desc, desc_sz,
835 				    signaled_dbd, maxstat, ah->ah_udav->rlid,
836 				    qp, ah->ah_udav->sl);
837 				mutex_exit(&ah->ah_lock);
838 			} else {
839 				HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz,
840 				    fence, immed_data, solicited,
841 				    signaled_dbd, curr_wr->wr_flags &
842 				    IBT_WR_SEND_CKSUM, qp);
843 			}
844 			wq->wq_wrid[tail] = curr_wr->wr_id;
845 
846 			/*
847 			 * If this is not the first descriptor on the current
848 			 * chain, then set the ownership bit.
849 			 */
850 			if (currindx != 0) {		/* not the first */
851 				membar_producer();
852 				HERMON_SET_SEND_WQE_OWNER(qp,
853 				    (uint32_t *)desc, nopcode);
854 			} else
855 				prev_nopcode = nopcode;
856 
857 			/*
858 			 * Update the current "tail index" and increment
859 			 * "posted_cnt"
860 			 */
861 			tail = next_tail;
862 			posted_cnt++;
863 		}
864 
865 		/*
866 		 * If we reach here and there are one or more WQEs which have
867 		 * been successfully built as a chain, we have to finish up
868 		 * and prepare them for writing to the HW
869 		 * The steps are:
870 		 * 	1. do the headroom fixup
871 		 *	2. add in the size of the headroom for the sync
872 		 *	3. write the owner bit for the first WQE
873 		 *	4. sync them
874 		 *	5. fix up the structures
875 		 *	6. hit the doorbell in UAR
876 		 */
877 		if (posted_cnt != 0) {
878 			ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
879 
880 			/*
881 			 * Save away updated "tail index" for the DMA sync
882 			 * including the headroom that will be needed
883 			 */
884 			sync_to = (tail + hdrmwqes) & qsize_msk;
885 
886 			/* do the invalidate of the headroom */
887 
888 			hermon_wqe_headroom(tail, qp);
889 
890 			/* Do a DMA sync for current send WQE(s) */
891 			hermon_wqe_sync(qp, sync_from, sync_to, HERMON_WR_SEND,
892 			    DDI_DMA_SYNC_FORDEV);
893 
894 			/* Update some of the state in the QP */
895 			wq->wq_tail = tail;
896 			total_posted += posted_cnt;
897 			posted_cnt = 0;
898 
899 			membar_producer();
900 
901 			/*
902 			 * Now set the ownership bit of the first
903 			 * one in the chain
904 			 */
905 			HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)prev,
906 			    prev_nopcode);
907 
908 			/* the FMA retry loop starts for Hermon doorbell. */
909 			hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
910 			    fm_status, fm_test);
911 
912 			HERMON_UAR_DOORBELL(state, uarhdl,
913 			    (uint64_t *)(void *)&state->hs_uar->send,
914 			    (uint64_t)qp->qp_ring);
915 
916 			/* the FMA retry loop ends. */
917 			hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
918 			    fm_status, fm_test);
919 		}
920 	}
921 
922 	/*
923 	 * Update the "num_posted" return value (if necessary).
924 	 * Then drop the locks and return success.
925 	 */
926 	if (num_posted != NULL) {
927 		*num_posted = total_posted;
928 	}
929 	mutex_exit(&qp->qp_sq_lock);
930 	return (status);
931 
932 pio_error:
933 	mutex_exit(&qp->qp_sq_lock);
934 	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
935 	return (ibc_get_ci_failure(0));
936 }
937 
938 
939 /*
940  * hermon_post_recv()
941  *    Context: Can be called from interrupt or base context.
942  */
943 int
944 hermon_post_recv(hermon_state_t *state, hermon_qphdl_t qp,
945     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
946 {
947 	uint64_t			*desc;
948 	hermon_workq_hdr_t		*wq;
949 	uint32_t			head, tail, next_tail, qsize_msk;
950 	uint32_t			sync_from, sync_to;
951 	uint_t				wrindx;
952 	uint_t				posted_cnt;
953 	int				status;
954 
955 	/*
956 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
957 	 * clients to post to QP memory that is accessible directly by the
958 	 * user.  If the QP memory is user accessible, then return an error.
959 	 */
960 	if (qp->qp_is_umap) {
961 		return (IBT_QP_HDL_INVALID);
962 	}
963 
964 	/* Initialize posted_cnt */
965 	posted_cnt = 0;
966 
967 	mutex_enter(&qp->qp_lock);
968 
969 	/*
970 	 * Check if QP is associated with an SRQ
971 	 */
972 	if (qp->qp_srq_en == HERMON_QP_SRQ_ENABLED) {
973 		mutex_exit(&qp->qp_lock);
974 		return (IBT_SRQ_IN_USE);
975 	}
976 
977 	/*
978 	 * Check QP state.  Can not post Recv requests from the "Reset" state
979 	 */
980 	if (qp->qp_state == HERMON_QP_RESET) {
981 		mutex_exit(&qp->qp_lock);
982 		return (IBT_QP_STATE_INVALID);
983 	}
984 
985 	/* Check that work request transport type is valid */
986 	if ((qp->qp_serv_type != HERMON_QP_UD) &&
987 	    (qp->qp_serv_type != HERMON_QP_RC) &&
988 	    (qp->qp_serv_type != HERMON_QP_UC)) {
989 		mutex_exit(&qp->qp_lock);
990 		return (IBT_QP_SRV_TYPE_INVALID);
991 	}
992 
993 	mutex_exit(&qp->qp_lock);
994 	mutex_enter(&qp->qp_rq_lock);
995 
996 	/*
997 	 * Grab the lock for the WRID list, i.e., membar_consumer().
998 	 * This is not needed because the mutex_enter() above has
999 	 * the same effect.
1000 	 */
1001 
1002 	/* Save away some initial QP state */
1003 	wq = qp->qp_rq_wqhdr;
1004 	qsize_msk = wq->wq_mask;
1005 	tail	  = wq->wq_tail;
1006 	head	  = wq->wq_head;
1007 
1008 	wrindx = 0;
1009 	status	  = DDI_SUCCESS;
1010 	/*
1011 	 * Before we begin, save the current "tail index" for later
1012 	 * DMA sync
1013 	 */
1014 	sync_from = tail;
1015 
1016 	for (wrindx = 0; wrindx < num_wr; wrindx++) {
1017 		if (wq->wq_full != 0) {
1018 			status = IBT_QP_FULL;
1019 			break;
1020 		}
1021 		next_tail = (tail + 1) & qsize_msk;
1022 		if (next_tail == head) {
1023 			wq->wq_full = 1;
1024 		}
1025 		desc = HERMON_QP_RQ_ENTRY(qp, tail);
1026 		status = hermon_wqe_recv_build(state, qp, &wr[wrindx], desc);
1027 		if (status != DDI_SUCCESS) {
1028 			break;
1029 		}
1030 
1031 		wq->wq_wrid[tail] = wr[wrindx].wr_id;
1032 		qp->qp_rq_wqecntr++;
1033 
1034 		tail = next_tail;
1035 		posted_cnt++;
1036 	}
1037 
1038 	if (posted_cnt != 0) {
1039 		/* Save away updated "tail index" for the DMA sync */
1040 		sync_to = tail;
1041 
1042 		hermon_wqe_sync(qp, sync_from, sync_to, HERMON_WR_RECV,
1043 		    DDI_DMA_SYNC_FORDEV);
1044 
1045 		wq->wq_tail = tail;
1046 
1047 		membar_producer();	/* ensure wrids are visible */
1048 
1049 		/* Update the doorbell record w/ wqecntr */
1050 		HERMON_UAR_DB_RECORD_WRITE(qp->qp_rq_vdbr,
1051 		    qp->qp_rq_wqecntr & 0xFFFF);
1052 	}
1053 
1054 	if (num_posted != NULL) {
1055 		*num_posted = posted_cnt;
1056 	}
1057 
1058 
1059 	mutex_exit(&qp->qp_rq_lock);
1060 	return (status);
1061 }
1062 
1063 /*
1064  * hermon_post_srq()
1065  *    Context: Can be called from interrupt or base context.
1066  */
1067 int
1068 hermon_post_srq(hermon_state_t *state, hermon_srqhdl_t srq,
1069     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
1070 {
1071 	uint64_t			*desc;
1072 	hermon_workq_hdr_t		*wq;
1073 	uint_t				indx, wrindx;
1074 	uint_t				posted_cnt;
1075 	int				status;
1076 
1077 	mutex_enter(&srq->srq_lock);
1078 
1079 	/*
1080 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
1081 	 * clients to post to QP memory that is accessible directly by the
1082 	 * user.  If the QP memory is user accessible, then return an error.
1083 	 */
1084 	if (srq->srq_is_umap) {
1085 		mutex_exit(&srq->srq_lock);
1086 		return (IBT_SRQ_HDL_INVALID);
1087 	}
1088 
1089 	/*
1090 	 * Check SRQ state.  Can not post Recv requests when SRQ is in error
1091 	 */
1092 	if (srq->srq_state == HERMON_SRQ_STATE_ERROR) {
1093 		mutex_exit(&srq->srq_lock);
1094 		return (IBT_QP_STATE_INVALID);
1095 	}
1096 
1097 	status = DDI_SUCCESS;
1098 	posted_cnt = 0;
1099 	wq = srq->srq_wq_wqhdr;
1100 	indx = wq->wq_head;
1101 
1102 	for (wrindx = 0; wrindx < num_wr; wrindx++) {
1103 
1104 		if (indx == wq->wq_tail) {
1105 			status = IBT_QP_FULL;
1106 			break;
1107 		}
1108 		desc = HERMON_SRQ_WQE_ADDR(srq, indx);
1109 
1110 		wq->wq_wrid[indx] = wr[wrindx].wr_id;
1111 
1112 		status = hermon_wqe_srq_build(state, srq, &wr[wrindx], desc);
1113 		if (status != DDI_SUCCESS) {
1114 			break;
1115 		}
1116 
1117 		hermon_wqe_sync(srq, indx, indx + 1,
1118 		    HERMON_WR_SRQ, DDI_DMA_SYNC_FORDEV);
1119 		posted_cnt++;
1120 		indx = htons(((uint16_t *)desc)[1]);
1121 		wq->wq_head = indx;
1122 	}
1123 
1124 	if (posted_cnt != 0) {
1125 
1126 		srq->srq_wq_wqecntr += posted_cnt;
1127 
1128 		membar_producer();	/* ensure wrids are visible */
1129 
1130 		/* Ring the doorbell w/ wqecntr */
1131 		HERMON_UAR_DB_RECORD_WRITE(srq->srq_wq_vdbr,
1132 		    srq->srq_wq_wqecntr & 0xFFFF);
1133 	}
1134 
1135 	if (num_posted != NULL) {
1136 		*num_posted = posted_cnt;
1137 	}
1138 
1139 	mutex_exit(&srq->srq_lock);
1140 	return (status);
1141 }
1142 
1143 
1144 /*
1145  * hermon_wqe_send_build()
1146  *    Context: Can be called from interrupt or base context.
1147  */
1148 static int
1149 hermon_wqe_send_build(hermon_state_t *state, hermon_qphdl_t qp,
1150     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1151 {
1152 	hermon_hw_snd_wqe_ud_t		*ud;
1153 	hermon_hw_snd_wqe_remaddr_t	*rc;
1154 	hermon_hw_snd_wqe_atomic_t	*at;
1155 	hermon_hw_snd_wqe_remaddr_t	*uc;
1156 	hermon_hw_snd_wqe_bind_t	*bn;
1157 	hermon_hw_wqe_sgl_t		*ds, *old_ds;
1158 	ibt_ud_dest_t			*dest;
1159 	ibt_wr_ds_t			*sgl;
1160 	hermon_ahhdl_t			ah;
1161 	uint32_t			nds;
1162 	int				i, j, last_ds, num_ds, status;
1163 	int				tmpsize;
1164 
1165 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
1166 
1167 	/* Initialize the information for the Data Segments */
1168 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1169 	    sizeof (hermon_hw_snd_wqe_ctrl_t));
1170 	nds = wr->wr_nds;
1171 	sgl = wr->wr_sgl;
1172 	num_ds = 0;
1173 	i = 0;
1174 
1175 	/*
1176 	 * Build a Send WQE depends first and foremost on the transport
1177 	 * type of Work Request (i.e. UD, RC, or UC)
1178 	 */
1179 	switch (wr->wr_trans) {
1180 	case IBT_UD_SRV:
1181 		/* Ensure that work request transport type matches QP type */
1182 		if (qp->qp_serv_type != HERMON_QP_UD) {
1183 			return (IBT_QP_SRV_TYPE_INVALID);
1184 		}
1185 
1186 		/*
1187 		 * Validate the operation type.  For UD requests, only the
1188 		 * "Send" and "Send LSO" operations are valid.
1189 		 */
1190 		if (wr->wr_opcode != IBT_WRC_SEND &&
1191 		    wr->wr_opcode != IBT_WRC_SEND_LSO) {
1192 			return (IBT_QP_OP_TYPE_INVALID);
1193 		}
1194 
1195 		/*
1196 		 * If this is a Special QP (QP0 or QP1), then we need to
1197 		 * build MLX WQEs instead.  So jump to hermon_wqe_mlx_build()
1198 		 * and return whatever status it returns
1199 		 */
1200 		if (qp->qp_is_special) {
1201 			if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
1202 				return (IBT_QP_OP_TYPE_INVALID);
1203 			}
1204 			status = hermon_wqe_mlx_build(state, qp,
1205 			    wr, desc, size);
1206 			return (status);
1207 		}
1208 
1209 		/*
1210 		 * Otherwise, if this is a normal UD Send request, then fill
1211 		 * all the fields in the Hermon UD header for the WQE.  Note:
1212 		 * to do this we'll need to extract some information from the
1213 		 * Address Handle passed with the work request.
1214 		 */
1215 		ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
1216 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
1217 		if (wr->wr_opcode == IBT_WRC_SEND) {
1218 			dest = wr->wr.ud.udwr_dest;
1219 		} else {
1220 			dest = wr->wr.ud_lso.lso_ud_dest;
1221 		}
1222 		ah = (hermon_ahhdl_t)dest->ud_ah;
1223 		if (ah == NULL) {
1224 			return (IBT_AH_HDL_INVALID);
1225 		}
1226 
1227 		/*
1228 		 * Build the Unreliable Datagram Segment for the WQE, using
1229 		 * the information from the address handle and the work
1230 		 * request.
1231 		 */
1232 		/* mutex_enter(&ah->ah_lock); */
1233 		if (wr->wr_opcode == IBT_WRC_SEND) {
1234 			HERMON_WQE_BUILD_UD(qp, ud, ah, wr->wr.ud.udwr_dest);
1235 		} else {	/* IBT_WRC_SEND_LSO */
1236 			HERMON_WQE_BUILD_UD(qp, ud, ah,
1237 			    wr->wr.ud_lso.lso_ud_dest);
1238 		}
1239 		/* mutex_exit(&ah->ah_lock); */
1240 
1241 		/* Update "ds" for filling in Data Segments (below) */
1242 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
1243 		    sizeof (hermon_hw_snd_wqe_ud_t));
1244 
1245 		if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
1246 			int total_len;
1247 
1248 			total_len = (4 + 0xf + wr->wr.ud_lso.lso_hdr_sz) & ~0xf;
1249 			if ((uintptr_t)ds + total_len + (nds * 16) >
1250 			    (uintptr_t)desc + (1 << qp->qp_sq_log_wqesz))
1251 				return (IBT_QP_SGL_LEN_INVALID);
1252 
1253 			bcopy(wr->wr.ud_lso.lso_hdr, (uint32_t *)ds + 1,
1254 			    wr->wr.ud_lso.lso_hdr_sz);
1255 			old_ds = ds;
1256 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + total_len);
1257 			for (; i < nds; i++) {
1258 				if (sgl[i].ds_len == 0)
1259 					continue;
1260 				HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds],
1261 				    &sgl[i]);
1262 				num_ds++;
1263 				i++;
1264 				break;
1265 			}
1266 			membar_producer();
1267 			HERMON_WQE_BUILD_LSO(qp, old_ds, wr->wr.ud_lso.lso_mss,
1268 			    wr->wr.ud_lso.lso_hdr_sz);
1269 		}
1270 
1271 		break;
1272 
1273 	case IBT_RC_SRV:
1274 		/* Ensure that work request transport type matches QP type */
1275 		if (qp->qp_serv_type != HERMON_QP_RC) {
1276 			return (IBT_QP_SRV_TYPE_INVALID);
1277 		}
1278 
1279 		/*
1280 		 * Validate the operation type.  For RC requests, we allow
1281 		 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
1282 		 * operations, and memory window "Bind"
1283 		 */
1284 		if ((wr->wr_opcode != IBT_WRC_SEND) &&
1285 		    (wr->wr_opcode != IBT_WRC_RDMAR) &&
1286 		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
1287 		    (wr->wr_opcode != IBT_WRC_CSWAP) &&
1288 		    (wr->wr_opcode != IBT_WRC_FADD) &&
1289 		    (wr->wr_opcode != IBT_WRC_BIND)) {
1290 			return (IBT_QP_OP_TYPE_INVALID);
1291 		}
1292 
1293 		/*
1294 		 * If this is a Send request, then all we need to do is break
1295 		 * out and here and begin the Data Segment processing below
1296 		 */
1297 		if (wr->wr_opcode == IBT_WRC_SEND) {
1298 			break;
1299 		}
1300 
1301 		/*
1302 		 * If this is an RDMA Read or RDMA Write request, then fill
1303 		 * in the "Remote Address" header fields.
1304 		 */
1305 		if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
1306 		    (wr->wr_opcode == IBT_WRC_RDMAW)) {
1307 			rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1308 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1309 
1310 			/*
1311 			 * Build the Remote Address Segment for the WQE, using
1312 			 * the information from the RC work request.
1313 			 */
1314 			HERMON_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
1315 
1316 			/* Update "ds" for filling in Data Segments (below) */
1317 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)rc +
1318 			    sizeof (hermon_hw_snd_wqe_remaddr_t));
1319 			break;
1320 		}
1321 
1322 		/*
1323 		 * If this is one of the Atomic type operations (i.e
1324 		 * Compare-Swap or Fetch-Add), then fill in both the "Remote
1325 		 * Address" header fields and the "Atomic" header fields.
1326 		 */
1327 		if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
1328 		    (wr->wr_opcode == IBT_WRC_FADD)) {
1329 			rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1330 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1331 			at = (hermon_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
1332 			    sizeof (hermon_hw_snd_wqe_remaddr_t));
1333 
1334 			/*
1335 			 * Build the Remote Address and Atomic Segments for
1336 			 * the WQE, using the information from the RC Atomic
1337 			 * work request.
1338 			 */
1339 			HERMON_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
1340 			HERMON_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
1341 
1342 			/* Update "ds" for filling in Data Segments (below) */
1343 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)at +
1344 			    sizeof (hermon_hw_snd_wqe_atomic_t));
1345 
1346 			/*
1347 			 * Update "nds" and "sgl" because Atomic requests have
1348 			 * only a single Data Segment (and they are encoded
1349 			 * somewhat differently in the work request.
1350 			 */
1351 			nds = 1;
1352 			sgl = wr->wr_sgl;
1353 			break;
1354 		}
1355 
1356 		/*
1357 		 * If this is memory window Bind operation, then we call the
1358 		 * hermon_wr_bind_check() routine to validate the request and
1359 		 * to generate the updated RKey.  If this is successful, then
1360 		 * we fill in the WQE's "Bind" header fields.
1361 		 */
1362 		if (wr->wr_opcode == IBT_WRC_BIND) {
1363 			status = hermon_wr_bind_check(state, wr);
1364 			if (status != DDI_SUCCESS) {
1365 				return (status);
1366 			}
1367 
1368 			bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1369 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1370 
1371 			/*
1372 			 * Build the Bind Memory Window Segments for the WQE,
1373 			 * using the information from the RC Bind memory
1374 			 * window work request.
1375 			 */
1376 			HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
1377 
1378 			/*
1379 			 * Update the "ds" pointer.  Even though the "bind"
1380 			 * operation requires no SGLs, this is necessary to
1381 			 * facilitate the correct descriptor size calculations
1382 			 * (below).
1383 			 */
1384 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
1385 			    sizeof (hermon_hw_snd_wqe_bind_t));
1386 			nds = 0;
1387 		}
1388 		break;
1389 
1390 	case IBT_UC_SRV:
1391 		/* Ensure that work request transport type matches QP type */
1392 		if (qp->qp_serv_type != HERMON_QP_UC) {
1393 			return (IBT_QP_SRV_TYPE_INVALID);
1394 		}
1395 
1396 		/*
1397 		 * Validate the operation type.  For UC requests, we only
1398 		 * allow "Send", "RDMA Write", and memory window "Bind".
1399 		 * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
1400 		 * operations
1401 		 */
1402 		if ((wr->wr_opcode != IBT_WRC_SEND) &&
1403 		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
1404 		    (wr->wr_opcode != IBT_WRC_BIND)) {
1405 			return (IBT_QP_OP_TYPE_INVALID);
1406 		}
1407 
1408 		/*
1409 		 * If this is a Send request, then all we need to do is break
1410 		 * out and here and begin the Data Segment processing below
1411 		 */
1412 		if (wr->wr_opcode == IBT_WRC_SEND) {
1413 			break;
1414 		}
1415 
1416 		/*
1417 		 * If this is an RDMA Write request, then fill in the "Remote
1418 		 * Address" header fields.
1419 		 */
1420 		if (wr->wr_opcode == IBT_WRC_RDMAW) {
1421 			uc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1422 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1423 
1424 			/*
1425 			 * Build the Remote Address Segment for the WQE, using
1426 			 * the information from the UC work request.
1427 			 */
1428 			HERMON_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);
1429 
1430 			/* Update "ds" for filling in Data Segments (below) */
1431 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)uc +
1432 			    sizeof (hermon_hw_snd_wqe_remaddr_t));
1433 			break;
1434 		}
1435 
1436 		/*
1437 		 * If this is memory window Bind operation, then we call the
1438 		 * hermon_wr_bind_check() routine to validate the request and
1439 		 * to generate the updated RKey.  If this is successful, then
1440 		 * we fill in the WQE's "Bind" header fields.
1441 		 */
1442 		if (wr->wr_opcode == IBT_WRC_BIND) {
1443 			status = hermon_wr_bind_check(state, wr);
1444 			if (status != DDI_SUCCESS) {
1445 				return (status);
1446 			}
1447 
1448 			bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1449 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1450 
1451 			/*
1452 			 * Build the Bind Memory Window Segments for the WQE,
1453 			 * using the information from the UC Bind memory
1454 			 * window work request.
1455 			 */
1456 			HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);
1457 
1458 			/*
1459 			 * Update the "ds" pointer.  Even though the "bind"
1460 			 * operation requires no SGLs, this is necessary to
1461 			 * facilitate the correct descriptor size calculations
1462 			 * (below).
1463 			 */
1464 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
1465 			    sizeof (hermon_hw_snd_wqe_bind_t));
1466 			nds = 0;
1467 		}
1468 		break;
1469 
1470 	default:
1471 		return (IBT_QP_SRV_TYPE_INVALID);
1472 	}
1473 
1474 	/*
1475 	 * Now fill in the Data Segments (SGL) for the Send WQE based on
1476 	 * the values setup above (i.e. "sgl", "nds", and the "ds" pointer
1477 	 * Start by checking for a valid number of SGL entries
1478 	 */
1479 	if (nds > qp->qp_sq_sgl) {
1480 		return (IBT_QP_SGL_LEN_INVALID);
1481 	}
1482 
1483 	/*
1484 	 * For each SGL in the Send Work Request, fill in the Send WQE's data
1485 	 * segments.  Note: We skip any SGL with zero size because Hermon
1486 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1487 	 * the encoding for zero means a 2GB transfer.
1488 	 */
1489 	for (last_ds = num_ds, j = i; j < nds; j++) {
1490 		if (sgl[j].ds_len != 0)
1491 			last_ds++;	/* real last ds of wqe to fill */
1492 	}
1493 
1494 	/*
1495 	 * Return the size of descriptor (in 16-byte chunks)
1496 	 * For Hermon, we want them (for now) to be on stride size
1497 	 * boundaries, which was implicit in Tavor/Arbel
1498 	 *
1499 	 */
1500 	tmpsize = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc);
1501 
1502 	*size = tmpsize >> 0x4;
1503 
1504 	for (j = nds; --j >= i; ) {
1505 		if (sgl[j].ds_len == 0) {
1506 			continue;
1507 		}
1508 
1509 		/*
1510 		 * Fill in the Data Segment(s) for the current WQE, using the
1511 		 * information contained in the scatter-gather list of the
1512 		 * work request.
1513 		 */
1514 		last_ds--;
1515 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[j]);
1516 	}
1517 
1518 	return (DDI_SUCCESS);
1519 }
1520 
1521 
1522 
1523 /*
1524  * hermon_wqe_mlx_build()
1525  *    Context: Can be called from interrupt or base context.
1526  */
1527 static int
1528 hermon_wqe_mlx_build(hermon_state_t *state, hermon_qphdl_t qp,
1529     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1530 {
1531 	hermon_ahhdl_t		ah;
1532 	hermon_hw_udav_t	*udav;
1533 	ib_lrh_hdr_t		*lrh;
1534 	ib_grh_t		*grh;
1535 	ib_bth_hdr_t		*bth;
1536 	ib_deth_hdr_t		*deth;
1537 	hermon_hw_wqe_sgl_t	*ds;
1538 	ibt_wr_ds_t		*sgl;
1539 	uint8_t			*mgmtclass, *hpoint, *hcount;
1540 	uint32_t		nds, offset, pktlen;
1541 	uint32_t		desc_sz;
1542 	int			i, num_ds;
1543 	int			tmpsize;
1544 
1545 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
1546 
1547 	/* Initialize the information for the Data Segments */
1548 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1549 	    sizeof (hermon_hw_mlx_wqe_nextctrl_t));
1550 
1551 	/*
1552 	 * Pull the address handle from the work request. The UDAV will
1553 	 * be used to answer some questions about the request.
1554 	 */
1555 	ah = (hermon_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1556 	if (ah == NULL) {
1557 		return (IBT_AH_HDL_INVALID);
1558 	}
1559 	mutex_enter(&ah->ah_lock);
1560 	udav = ah->ah_udav;
1561 
1562 	/*
1563 	 * If the request is for QP1 and the destination LID is equal to
1564 	 * the Permissive LID, then return an error.  This combination is
1565 	 * not allowed
1566 	 */
1567 	if ((udav->rlid == IB_LID_PERMISSIVE) &&
1568 	    (qp->qp_is_special == HERMON_QP_GSI)) {
1569 		mutex_exit(&ah->ah_lock);
1570 		return (IBT_AH_HDL_INVALID);
1571 	}
1572 
1573 	/*
1574 	 * Calculate the size of the packet headers, including the GRH
1575 	 * (if necessary)
1576 	 */
1577 	desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
1578 	    sizeof (ib_deth_hdr_t);
1579 	if (udav->grh) {
1580 		desc_sz += sizeof (ib_grh_t);
1581 	}
1582 
1583 	/*
1584 	 * Begin to build the first "inline" data segment for the packet
1585 	 * headers.  Note:  By specifying "inline" we can build the contents
1586 	 * of the MAD packet headers directly into the work queue (as part
1587 	 * descriptor).  This has the advantage of both speeding things up
1588 	 * and of not requiring the driver to allocate/register any additional
1589 	 * memory for the packet headers.
1590 	 */
1591 	HERMON_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
1592 	desc_sz += 4;
1593 
1594 	/*
1595 	 * Build Local Route Header (LRH)
1596 	 *    We start here by building the LRH into a temporary location.
1597 	 *    When we have finished we copy the LRH data into the descriptor.
1598 	 *
1599 	 *    Notice that the VL values are hardcoded.  This is not a problem
1600 	 *    because VL15 is decided later based on the value in the MLX
1601 	 *    transport "next/ctrl" header (see the "vl15" bit below), and it
1602 	 *    is otherwise (meaning for QP1) chosen from the SL-to-VL table
1603 	 *    values.  This rule does not hold for loopback packets however
1604 	 *    (all of which bypass the SL-to-VL tables) and it is the reason
1605 	 *    that non-QP0 MADs are setup with VL hardcoded to zero below.
1606 	 *
1607 	 *    Notice also that Source LID is hardcoded to the Permissive LID
1608 	 *    (0xFFFF).  This is also not a problem because if the Destination
1609 	 *    LID is not the Permissive LID, then the "slr" value in the MLX
1610 	 *    transport "next/ctrl" header will be set to zero and the hardware
1611 	 *    will pull the LID from value in the port.
1612 	 */
1613 	lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
1614 	pktlen = (desc_sz + 0x100) >> 2;
1615 	HERMON_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
1616 
1617 	/*
1618 	 * Build Global Route Header (GRH)
1619 	 *    This is only built if necessary as defined by the "grh" bit in
1620 	 *    the address vector.  Note:  We also calculate the offset to the
1621 	 *    next header (BTH) based on whether or not the "grh" bit is set.
1622 	 */
1623 	if (udav->grh) {
1624 		/*
1625 		 * If the request is for QP0, then return an error.  The
1626 		 * combination of global routine (GRH) and QP0 is not allowed.
1627 		 */
1628 		if (qp->qp_is_special == HERMON_QP_SMI) {
1629 			mutex_exit(&ah->ah_lock);
1630 			return (IBT_AH_HDL_INVALID);
1631 		}
1632 		grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1633 		HERMON_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);
1634 
1635 		bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
1636 	} else {
1637 		bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1638 	}
1639 	mutex_exit(&ah->ah_lock);
1640 
1641 
1642 	/*
1643 	 * Build Base Transport Header (BTH)
1644 	 *    Notice that the M, PadCnt, and TVer fields are all set
1645 	 *    to zero implicitly.  This is true for all Management Datagrams
1646 	 *    MADs whether GSI are SMI.
1647 	 */
1648 	HERMON_WQE_BUILD_MLX_BTH(state, bth, qp, wr);
1649 
1650 	/*
1651 	 * Build Datagram Extended Transport Header (DETH)
1652 	 */
1653 	deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
1654 	HERMON_WQE_BUILD_MLX_DETH(deth, qp);
1655 
1656 	/* Ensure that the Data Segment is aligned on a 16-byte boundary */
1657 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
1658 	ds = (hermon_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
1659 	nds = wr->wr_nds;
1660 	sgl = wr->wr_sgl;
1661 	num_ds = 0;
1662 
1663 	/*
1664 	 * Now fill in the Data Segments (SGL) for the MLX WQE based on the
1665 	 * values set up above (i.e. "sgl", "nds", and the "ds" pointer
1666 	 * Start by checking for a valid number of SGL entries
1667 	 */
1668 	if (nds > qp->qp_sq_sgl) {
1669 		return (IBT_QP_SGL_LEN_INVALID);
1670 	}
1671 
1672 	/*
1673 	 * For each SGL in the Send Work Request, fill in the MLX WQE's data
1674 	 * segments.  Note: We skip any SGL with zero size because Hermon
1675 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1676 	 * the encoding for zero means a 2GB transfer.  Because of this special
1677 	 * encoding in the hardware, we mask the requested length with
1678 	 * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1679 	 * zero.)
1680 	 */
1681 	mgmtclass = hpoint = hcount = NULL;
1682 	offset = 0;
1683 	for (i = 0; i < nds; i++) {
1684 		if (sgl[i].ds_len == 0) {
1685 			continue;
1686 		}
1687 
1688 		/*
1689 		 * Fill in the Data Segment(s) for the MLX send WQE, using
1690 		 * the information contained in the scatter-gather list of
1691 		 * the work request.
1692 		 */
1693 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds], &sgl[i]);
1694 
1695 		/*
1696 		 * Search through the contents of all MADs posted to QP0 to
1697 		 * initialize pointers to the places where Directed Route "hop
1698 		 * pointer", "hop count", and "mgmtclass" would be.  Hermon
1699 		 * needs these updated (i.e. incremented or decremented, as
1700 		 * necessary) by software.
1701 		 */
1702 		if (qp->qp_is_special == HERMON_QP_SMI) {
1703 
1704 			HERMON_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
1705 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1706 
1707 			HERMON_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
1708 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1709 
1710 			HERMON_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
1711 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1712 
1713 			offset += sgl[i].ds_len;
1714 		}
1715 		num_ds++;
1716 	}
1717 
1718 	/*
1719 	 * Hermon's Directed Route MADs need to have the "hop pointer"
1720 	 * incremented/decremented (as necessary) depending on whether it is
1721 	 * currently less than or greater than the "hop count" (i.e. whether
1722 	 * the MAD is a request or a response.)
1723 	 */
1724 	if (qp->qp_is_special == HERMON_QP_SMI) {
1725 		HERMON_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
1726 		    *hpoint, *hcount);
1727 	}
1728 
1729 	/*
1730 	 * Now fill in the ICRC Data Segment.  This data segment is inlined
1731 	 * just like the packets headers above, but it is only four bytes and
1732 	 * set to zero (to indicate that we wish the hardware to generate ICRC.
1733 	 */
1734 	HERMON_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
1735 	num_ds++;
1736 
1737 	/*
1738 	 * Return the size of descriptor (in 16-byte chunks)
1739 	 * For Hermon, we want them (for now) to be on stride size
1740 	 * boundaries, which was implicit in Tavor/Arbel
1741 	 */
1742 	tmpsize = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc);
1743 
1744 	*size = tmpsize >> 0x04;
1745 
1746 	return (DDI_SUCCESS);
1747 }
1748 
1749 
1750 
1751 /*
1752  * hermon_wqe_recv_build()
1753  *    Context: Can be called from interrupt or base context.
1754  */
1755 /* ARGSUSED */
1756 static int
1757 hermon_wqe_recv_build(hermon_state_t *state, hermon_qphdl_t qp,
1758     ibt_recv_wr_t *wr, uint64_t *desc)
1759 {
1760 	hermon_hw_wqe_sgl_t	*ds;
1761 	int			i, num_ds;
1762 
1763 	ASSERT(MUTEX_HELD(&qp->qp_rq_lock));
1764 
1765 	/*
1766 	 * Fill in the Data Segments (SGL) for the Recv WQE  - don't
1767 	 * need to have a reserved for the ctrl, there is none on the
1768 	 * recv queue for hermon, but will need to put an invalid
1769 	 * (null) scatter pointer per PRM
1770 	 */
1771 	ds = (hermon_hw_wqe_sgl_t *)(uintptr_t)desc;
1772 	num_ds = 0;
1773 
1774 	/* Check for valid number of SGL entries */
1775 	if (wr->wr_nds > qp->qp_rq_sgl) {
1776 		return (IBT_QP_SGL_LEN_INVALID);
1777 	}
1778 
1779 	/*
1780 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1781 	 * segments.  Note: We skip any SGL with zero size because Hermon
1782 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1783 	 * the encoding for zero means a 2GB transfer.  Because of this special
1784 	 * encoding in the hardware, we mask the requested length with
1785 	 * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1786 	 * zero.)
1787 	 */
1788 	for (i = 0; i < wr->wr_nds; i++) {
1789 		if (wr->wr_sgl[i].ds_len == 0) {
1790 			continue;
1791 		}
1792 
1793 		/*
1794 		 * Fill in the Data Segment(s) for the receive WQE, using the
1795 		 * information contained in the scatter-gather list of the
1796 		 * work request.
1797 		 */
1798 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &wr->wr_sgl[i]);
1799 		num_ds++;
1800 	}
1801 
1802 	/* put the null sgl pointer as well if needed */
1803 	if (num_ds < qp->qp_rq_sgl) {
1804 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &null_sgl);
1805 	}
1806 
1807 	return (DDI_SUCCESS);
1808 }
1809 
1810 
1811 
1812 /*
1813  * hermon_wqe_srq_build()
1814  *    Context: Can be called from interrupt or base context.
1815  */
1816 /* ARGSUSED */
1817 static int
1818 hermon_wqe_srq_build(hermon_state_t *state, hermon_srqhdl_t srq,
1819     ibt_recv_wr_t *wr, uint64_t *desc)
1820 {
1821 	hermon_hw_wqe_sgl_t	*ds;
1822 	int			i, num_ds;
1823 
1824 	ASSERT(MUTEX_HELD(&srq->srq_lock));
1825 
1826 	/* Fill in the Data Segments (SGL) for the Recv WQE */
1827 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1828 	    sizeof (hermon_hw_srq_wqe_next_t));
1829 	num_ds = 0;
1830 
1831 	/* Check for valid number of SGL entries */
1832 	if (wr->wr_nds > srq->srq_wq_sgl) {
1833 		return (IBT_QP_SGL_LEN_INVALID);
1834 	}
1835 
1836 	/*
1837 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1838 	 * segments.  Note: We skip any SGL with zero size because Hermon
1839 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1840 	 * the encoding for zero means a 2GB transfer.  Because of this special
1841 	 * encoding in the hardware, we mask the requested length with
1842 	 * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1843 	 * zero.)
1844 	 */
1845 	for (i = 0; i < wr->wr_nds; i++) {
1846 		if (wr->wr_sgl[i].ds_len == 0) {
1847 			continue;
1848 		}
1849 
1850 		/*
1851 		 * Fill in the Data Segment(s) for the receive WQE, using the
1852 		 * information contained in the scatter-gather list of the
1853 		 * work request.
1854 		 */
1855 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &wr->wr_sgl[i]);
1856 		num_ds++;
1857 	}
1858 
1859 	/*
1860 	 * put in the null sgl pointer as well, if needed
1861 	 */
1862 	if (num_ds < srq->srq_wq_sgl) {
1863 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &null_sgl);
1864 	}
1865 
1866 	return (DDI_SUCCESS);
1867 }
1868 
1869 
1870 /*
1871  * hermon_wr_get_immediate()
1872  *    Context: Can be called from interrupt or base context.
1873  */
1874 static uint32_t
1875 hermon_wr_get_immediate(ibt_send_wr_t *wr)
1876 {
1877 	/*
1878 	 * This routine extracts the "immediate data" from the appropriate
1879 	 * location in the IBTF work request.  Because of the way the
1880 	 * work request structure is defined, the location for this data
1881 	 * depends on the actual work request operation type.
1882 	 */
1883 
1884 	/* For RDMA Write, test if RC or UC */
1885 	if (wr->wr_opcode == IBT_WRC_RDMAW) {
1886 		if (wr->wr_trans == IBT_RC_SRV) {
1887 			return (wr->wr.rc.rcwr.rdma.rdma_immed);
1888 		} else {  /* IBT_UC_SRV */
1889 			return (wr->wr.uc.ucwr.rdma.rdma_immed);
1890 		}
1891 	}
1892 
1893 	/* For Send, test if RC, UD, or UC */
1894 	if (wr->wr_opcode == IBT_WRC_SEND) {
1895 		if (wr->wr_trans == IBT_RC_SRV) {
1896 			return (wr->wr.rc.rcwr.send_immed);
1897 		} else if (wr->wr_trans == IBT_UD_SRV) {
1898 			return (wr->wr.ud.udwr_immed);
1899 		} else {  /* IBT_UC_SRV */
1900 			return (wr->wr.uc.ucwr.send_immed);
1901 		}
1902 	}
1903 
1904 	/*
1905 	 * If any other type of request, then immediate is undefined
1906 	 */
1907 	return (0);
1908 }
1909 
1910 /*
1911  * hermon_wqe_headroom()
1912  *	Context: can be called from interrupt or base, currently only from
1913  *	base context.
1914  * Routine that fills in the headroom for the Send Queue
1915  */
1916 
1917 static void
1918 hermon_wqe_headroom(uint_t from, hermon_qphdl_t qp)
1919 {
1920 	uint32_t	*wqe_start, *wqe_top, *wqe_base, qsize;
1921 	int		hdrmwqes, wqesizebytes, sectperwqe;
1922 	uint32_t	invalue;
1923 	int		i, j;
1924 
1925 	qsize	 = qp->qp_sq_bufsz;
1926 	wqesizebytes = 1 << qp->qp_sq_log_wqesz;
1927 	sectperwqe = wqesizebytes >> 6; 	/* 64 bytes/section */
1928 	hdrmwqes = qp->qp_sq_hdrmwqes;
1929 	wqe_base  = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, 0);
1930 	wqe_top	  = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, qsize);
1931 	wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, from);
1932 
1933 	for (i = 0; i < hdrmwqes; i++)	{
1934 		for (j = 0; j < sectperwqe; j++) {
1935 			if (j == 0) {		/* 1st section of wqe */
1936 				/* perserve ownership bit */
1937 				invalue = ddi_get32(qp->qp_wqinfo.qa_acchdl,
1938 				    wqe_start) | 0x7FFFFFFF;
1939 			} else {
1940 				/* or just invalidate it */
1941 				invalue = 0xFFFFFFFF;
1942 			}
1943 			ddi_put32(qp->qp_wqinfo.qa_acchdl, wqe_start, invalue);
1944 			wqe_start += 16;	/* move 64 bytes */
1945 		}
1946 		if (wqe_start == wqe_top)	/* hit the end of the queue */
1947 			wqe_start = wqe_base;	/* wrap to start */
1948 	}
1949 }
1950 
1951 /*
1952  * hermon_wqe_sync()
1953  *    Context: Can be called from interrupt or base context.
1954  */
1955 static void
1956 hermon_wqe_sync(void *hdl, uint_t sync_from, uint_t sync_to,
1957     uint_t sync_type, uint_t flag)
1958 {
1959 	hermon_qphdl_t		qp;
1960 	hermon_srqhdl_t		srq;
1961 	uint64_t		*wqe_from, *wqe_to;
1962 	uint64_t		*wq_base, *wq_top, *qp_base;
1963 	ddi_dma_handle_t	dmahdl;
1964 	off_t			offset;
1965 	size_t			length;
1966 	uint32_t		qsize;
1967 	int			status;
1968 
1969 	if (sync_type == HERMON_WR_SRQ) {
1970 		srq = (hermon_srqhdl_t)hdl;
1971 		/* Get the DMA handle from SRQ context */
1972 		dmahdl = srq->srq_mrhdl->mr_bindinfo.bi_dmahdl;
1973 		/* get base addr of the buffer */
1974 		qp_base = (uint64_t *)(void *)srq->srq_wq_buf;
1975 	} else {
1976 		qp = (hermon_qphdl_t)hdl;
1977 		/* Get the DMA handle from QP context */
1978 		dmahdl = qp->qp_mrhdl->mr_bindinfo.bi_dmahdl;
1979 		/* Determine the base address of the QP buffer */
1980 		if (qp->qp_sq_baseaddr == 0) {
1981 			qp_base = (uint64_t *)(void *)(qp->qp_sq_buf);
1982 		} else {
1983 			qp_base = (uint64_t *)(void *)(qp->qp_rq_buf);
1984 		}
1985 	}
1986 
1987 	/*
1988 	 * Depending on the type of the work queue, we grab information
1989 	 * about the address ranges we need to DMA sync.
1990 	 */
1991 
1992 	if (sync_type == HERMON_WR_SEND) {
1993 		wqe_from = HERMON_QP_SQ_ENTRY(qp, sync_from);
1994 		wqe_to   = HERMON_QP_SQ_ENTRY(qp, sync_to);
1995 		qsize	 = qp->qp_sq_bufsz;
1996 
1997 		wq_base = HERMON_QP_SQ_ENTRY(qp, 0);
1998 		wq_top	 = HERMON_QP_SQ_ENTRY(qp, qsize);
1999 	} else if (sync_type == HERMON_WR_RECV) {
2000 		wqe_from = HERMON_QP_RQ_ENTRY(qp, sync_from);
2001 		wqe_to   = HERMON_QP_RQ_ENTRY(qp, sync_to);
2002 		qsize	 = qp->qp_rq_bufsz;
2003 
2004 		wq_base = HERMON_QP_RQ_ENTRY(qp, 0);
2005 		wq_top	 = HERMON_QP_RQ_ENTRY(qp, qsize);
2006 	} else {
2007 		wqe_from = HERMON_SRQ_WQ_ENTRY(srq, sync_from);
2008 		wqe_to   = HERMON_SRQ_WQ_ENTRY(srq, sync_to);
2009 		qsize	 = srq->srq_wq_bufsz;
2010 
2011 		wq_base = HERMON_SRQ_WQ_ENTRY(srq, 0);
2012 		wq_top	 = HERMON_SRQ_WQ_ENTRY(srq, qsize);
2013 	}
2014 
2015 	/*
2016 	 * There are two possible cases for the beginning and end of the WQE
2017 	 * chain we are trying to sync.  Either this is the simple case, where
2018 	 * the end of the chain is below the beginning of the chain, or it is
2019 	 * the "wrap-around" case, where the end of the chain has wrapped over
2020 	 * the end of the queue.  In the former case, we simply need to
2021 	 * calculate the span from beginning to end and sync it.  In the latter
2022 	 * case, however, we need to calculate the span from the top of the
2023 	 * work queue to the end of the chain and sync that, and then we need
2024 	 * to find the other portion (from beginning of chain to end of queue)
2025 	 * and sync that as well.  Note: if the "top to end" span is actually
2026 	 * zero length, then we don't do a DMA sync because a zero length DMA
2027 	 * sync unnecessarily syncs the entire work queue.
2028 	 */
2029 	if (wqe_to > wqe_from) {
2030 		/* "From Beginning to End" */
2031 
2032 		offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)qp_base);
2033 		length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_from);
2034 
2035 		status = ddi_dma_sync(dmahdl, offset, length, flag);
2036 		if (status != DDI_SUCCESS) {
2037 			return;
2038 		}
2039 	} else {
2040 		/* "From Top to End" */
2041 
2042 		offset = (off_t)((uintptr_t)wq_base - (uintptr_t)qp_base);
2043 		length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wq_base);
2044 		if (length) {
2045 			status = ddi_dma_sync(dmahdl, offset, length, flag);
2046 			if (status != DDI_SUCCESS) {
2047 				return;
2048 			}
2049 		}
2050 
2051 		/* "From Beginning to Bottom" */
2052 
2053 		offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)qp_base);
2054 		length = (size_t)((uintptr_t)wq_top - (uintptr_t)wqe_from);
2055 		status = ddi_dma_sync(dmahdl, offset, length, flag);
2056 		if (status != DDI_SUCCESS) {
2057 			return;
2058 		}
2059 	}
2060 }
2061 
2062 
2063 /*
2064  * hermon_wr_bind_check()
2065  *    Context: Can be called from interrupt or base context.
2066  */
2067 /* ARGSUSED */
2068 static int
2069 hermon_wr_bind_check(hermon_state_t *state, ibt_send_wr_t *wr)
2070 {
2071 	ibt_bind_flags_t	bind_flags;
2072 	uint64_t		vaddr, len;
2073 	uint64_t		reg_start_addr, reg_end_addr;
2074 	hermon_mwhdl_t		mw;
2075 	hermon_mrhdl_t		mr;
2076 	hermon_rsrc_t		*mpt;
2077 	uint32_t		new_rkey;
2078 
2079 	/* Check for a valid Memory Window handle in the WR */
2080 	mw = (hermon_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
2081 	if (mw == NULL) {
2082 		return (IBT_MW_HDL_INVALID);
2083 	}
2084 
2085 	/* Check for a valid Memory Region handle in the WR */
2086 	mr = (hermon_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
2087 	if (mr == NULL) {
2088 		return (IBT_MR_HDL_INVALID);
2089 	}
2090 
2091 	mutex_enter(&mr->mr_lock);
2092 	mutex_enter(&mw->mr_lock);
2093 
2094 	/*
2095 	 * Check here to see if the memory region has already been partially
2096 	 * deregistered as a result of a hermon_umap_umemlock_cb() callback.
2097 	 * If so, this is an error, return failure.
2098 	 */
2099 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
2100 		mutex_exit(&mr->mr_lock);
2101 		mutex_exit(&mw->mr_lock);
2102 		return (IBT_MR_HDL_INVALID);
2103 	}
2104 
2105 	/* Check for a valid Memory Window RKey (i.e. a matching RKey) */
2106 	if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
2107 		mutex_exit(&mr->mr_lock);
2108 		mutex_exit(&mw->mr_lock);
2109 		return (IBT_MR_RKEY_INVALID);
2110 	}
2111 
2112 	/* Check for a valid Memory Region LKey (i.e. a matching LKey) */
2113 	if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
2114 		mutex_exit(&mr->mr_lock);
2115 		mutex_exit(&mw->mr_lock);
2116 		return (IBT_MR_LKEY_INVALID);
2117 	}
2118 
2119 	/*
2120 	 * Now check for valid "vaddr" and "len".  Note:  We don't check the
2121 	 * "vaddr" range when "len == 0" (i.e. on unbind operations)
2122 	 */
2123 	len = wr->wr.rc.rcwr.bind->bind_len;
2124 	if (len != 0) {
2125 		vaddr = wr->wr.rc.rcwr.bind->bind_va;
2126 		reg_start_addr = mr->mr_bindinfo.bi_addr;
2127 		reg_end_addr   = mr->mr_bindinfo.bi_addr +
2128 		    (mr->mr_bindinfo.bi_len - 1);
2129 		if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
2130 			mutex_exit(&mr->mr_lock);
2131 			mutex_exit(&mw->mr_lock);
2132 			return (IBT_MR_VA_INVALID);
2133 		}
2134 		vaddr = (vaddr + len) - 1;
2135 		if (vaddr > reg_end_addr) {
2136 			mutex_exit(&mr->mr_lock);
2137 			mutex_exit(&mw->mr_lock);
2138 			return (IBT_MR_LEN_INVALID);
2139 		}
2140 	}
2141 
2142 	/*
2143 	 * Validate the bind access flags.  Remote Write and Atomic access for
2144 	 * the Memory Window require that Local Write access be set in the
2145 	 * corresponding Memory Region.
2146 	 */
2147 	bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
2148 	if (((bind_flags & IBT_WR_BIND_WRITE) ||
2149 	    (bind_flags & IBT_WR_BIND_ATOMIC)) &&
2150 	    !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
2151 		mutex_exit(&mr->mr_lock);
2152 		mutex_exit(&mw->mr_lock);
2153 		return (IBT_MR_ACCESS_REQ_INVALID);
2154 	}
2155 
2156 	/* Calculate the new RKey for the Memory Window */
2157 	mpt = mw->mr_mptrsrcp;
2158 	new_rkey = hermon_mr_keycalc(mpt->hr_indx);
2159 	new_rkey = hermon_mr_key_swap(new_rkey);
2160 
2161 	wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
2162 	mw->mr_rkey = new_rkey;
2163 
2164 	mutex_exit(&mr->mr_lock);
2165 	mutex_exit(&mw->mr_lock);
2166 	return (DDI_SUCCESS);
2167 }
2168 
2169 
2170 /*
2171  * hermon_wrid_from_reset_handling()
2172  *    Context: Can be called from interrupt or base context.
2173  */
2174 /* ARGSUSED */
2175 int
2176 hermon_wrid_from_reset_handling(hermon_state_t *state, hermon_qphdl_t qp)
2177 {
2178 	hermon_workq_hdr_t	*swq, *rwq;
2179 	uint_t			qp_srq_en;
2180 
2181 	if (qp->qp_is_umap)
2182 		return (DDI_SUCCESS);
2183 
2184 	/* grab the cq lock(s) to modify the wqavl tree */
2185 	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2186 #ifdef __lock_lint
2187 	mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2188 #else
2189 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
2190 		mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2191 #endif
2192 
2193 	/* Chain the newly allocated work queue header to the CQ's list */
2194 	hermon_cq_workq_add(qp->qp_sq_cqhdl, &qp->qp_sq_wqavl);
2195 
2196 	swq = qp->qp_sq_wqhdr;
2197 	swq->wq_head = 0;
2198 	swq->wq_tail = 0;
2199 	swq->wq_full = 0;
2200 
2201 	/*
2202 	 * Now we repeat all the above operations for the receive work queue,
2203 	 * or shared receive work queue.
2204 	 *
2205 	 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
2206 	 */
2207 	qp_srq_en = qp->qp_srq_en;
2208 
2209 #ifdef __lock_lint
2210 	mutex_enter(&qp->qp_srqhdl->srq_lock);
2211 #else
2212 	if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
2213 		mutex_enter(&qp->qp_srqhdl->srq_lock);
2214 	} else {
2215 		rwq = qp->qp_rq_wqhdr;
2216 		rwq->wq_head = 0;
2217 		rwq->wq_tail = 0;
2218 		rwq->wq_full = 0;
2219 		qp->qp_rq_wqecntr = 0;
2220 	}
2221 #endif
2222 	hermon_cq_workq_add(qp->qp_rq_cqhdl, &qp->qp_rq_wqavl);
2223 
2224 #ifdef __lock_lint
2225 	mutex_exit(&qp->qp_srqhdl->srq_lock);
2226 #else
2227 	if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
2228 		mutex_exit(&qp->qp_srqhdl->srq_lock);
2229 	}
2230 #endif
2231 
2232 #ifdef __lock_lint
2233 	mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2234 #else
2235 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
2236 		mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2237 #endif
2238 	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2239 	return (DDI_SUCCESS);
2240 }
2241 
2242 
2243 /*
2244  * hermon_wrid_to_reset_handling()
2245  *    Context: Can be called from interrupt or base context.
2246  */
2247 int
2248 hermon_wrid_to_reset_handling(hermon_state_t *state, hermon_qphdl_t qp)
2249 {
2250 	uint_t			qp_srq_en;
2251 
2252 	if (qp->qp_is_umap)
2253 		return (DDI_SUCCESS);
2254 
2255 	/*
2256 	 * If there are unpolled entries in these CQs, they are
2257 	 * polled/flushed.
2258 	 * Grab the CQ lock(s) before manipulating the lists.
2259 	 */
2260 	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2261 #ifdef __lock_lint
2262 	mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2263 #else
2264 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
2265 		mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2266 #endif
2267 
2268 	qp_srq_en = qp->qp_srq_en;
2269 #ifdef __lock_lint
2270 	mutex_enter(&qp->qp_srqhdl->srq_lock);
2271 #else
2272 	if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
2273 		mutex_enter(&qp->qp_srqhdl->srq_lock);
2274 	}
2275 #endif
2276 	/*
2277 	 * Flush the entries on the CQ for this QP's QPN.
2278 	 */
2279 	hermon_cq_entries_flush(state, qp);
2280 
2281 #ifdef __lock_lint
2282 	mutex_exit(&qp->qp_srqhdl->srq_lock);
2283 #else
2284 	if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
2285 		mutex_exit(&qp->qp_srqhdl->srq_lock);
2286 	}
2287 #endif
2288 
2289 	hermon_cq_workq_remove(qp->qp_rq_cqhdl, &qp->qp_rq_wqavl);
2290 	hermon_cq_workq_remove(qp->qp_sq_cqhdl, &qp->qp_sq_wqavl);
2291 
2292 #ifdef __lock_lint
2293 	mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2294 #else
2295 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
2296 		mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2297 #endif
2298 	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2299 
2300 	return (IBT_SUCCESS);
2301 }
2302 
2303 
2304 /*
2305  * hermon_wrid_get_entry()
2306  *    Context: Can be called from interrupt or base context.
2307  */
2308 uint64_t
2309 hermon_wrid_get_entry(hermon_cqhdl_t cq, hermon_hw_cqe_t *cqe)
2310 {
2311 	hermon_workq_avl_t	*wqa;
2312 	hermon_workq_hdr_t	*wq;
2313 	uint64_t		wrid;
2314 	uint_t			send_or_recv, qpnum;
2315 	uint32_t		indx;
2316 
2317 	/*
2318 	 * Determine whether this CQE is a send or receive completion.
2319 	 */
2320 	send_or_recv = HERMON_CQE_SENDRECV_GET(cq, cqe);
2321 
2322 	/* Find the work queue for this QP number (send or receive side) */
2323 	qpnum = HERMON_CQE_QPNUM_GET(cq, cqe);
2324 	wqa = hermon_wrid_wqavl_find(cq, qpnum, send_or_recv);
2325 	wq = wqa->wqa_wq;
2326 
2327 	/*
2328 	 * Regardless of whether the completion is the result of a "success"
2329 	 * or a "failure", we lock the list of "containers" and attempt to
2330 	 * search for the the first matching completion (i.e. the first WR
2331 	 * with a matching WQE addr and size).  Once we find it, we pull out
2332 	 * the "wrid" field and return it (see below).  XXX Note: One possible
2333 	 * future enhancement would be to enable this routine to skip over
2334 	 * any "unsignaled" completions to go directly to the next "signaled"
2335 	 * entry on success.
2336 	 */
2337 	indx = HERMON_CQE_WQEADDRSZ_GET(cq, cqe) & wq->wq_mask;
2338 	wrid = wq->wq_wrid[indx];
2339 	if (wqa->wqa_srq_en) {
2340 		struct hermon_sw_srq_s	*srq;
2341 		uint64_t		*desc;
2342 
2343 		/* put wqe back on the srq free list */
2344 		srq = wqa->wqa_srq;
2345 		mutex_enter(&srq->srq_lock);
2346 		desc = HERMON_SRQ_WQE_ADDR(srq, wq->wq_tail);
2347 		((uint16_t *)desc)[1] = htons(indx);
2348 		wq->wq_tail = indx;
2349 		mutex_exit(&srq->srq_lock);
2350 	} else {
2351 		wq->wq_head = (indx + 1) & wq->wq_mask;
2352 		wq->wq_full = 0;
2353 	}
2354 
2355 	return (wrid);
2356 }
2357 
2358 
2359 int
2360 hermon_wrid_workq_compare(const void *p1, const void *p2)
2361 {
2362 	hermon_workq_compare_t	*cmpp;
2363 	hermon_workq_avl_t	*curr;
2364 
2365 	cmpp = (hermon_workq_compare_t *)p1;
2366 	curr = (hermon_workq_avl_t *)p2;
2367 
2368 	if (cmpp->cmp_qpn < curr->wqa_qpn)
2369 		return (-1);
2370 	else if (cmpp->cmp_qpn > curr->wqa_qpn)
2371 		return (+1);
2372 	else if (cmpp->cmp_type < curr->wqa_type)
2373 		return (-1);
2374 	else if (cmpp->cmp_type > curr->wqa_type)
2375 		return (+1);
2376 	else
2377 		return (0);
2378 }
2379 
2380 
2381 /*
2382  * hermon_wrid_workq_find()
2383  *    Context: Can be called from interrupt or base context.
2384  */
2385 static hermon_workq_avl_t *
2386 hermon_wrid_wqavl_find(hermon_cqhdl_t cq, uint_t qpn, uint_t wq_type)
2387 {
2388 	hermon_workq_avl_t	*curr;
2389 	hermon_workq_compare_t	cmp;
2390 
2391 	/*
2392 	 * Walk the CQ's work queue list, trying to find a send or recv queue
2393 	 * with the same QP number.  We do this even if we are going to later
2394 	 * create a new entry because it helps us easily find the end of the
2395 	 * list.
2396 	 */
2397 	cmp.cmp_qpn = qpn;
2398 	cmp.cmp_type = wq_type;
2399 #ifdef __lock_lint
2400 	hermon_wrid_workq_compare(NULL, NULL);
2401 #endif
2402 	curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
2403 
2404 	return (curr);
2405 }
2406 
2407 
2408 /*
2409  * hermon_wrid_wqhdr_create()
2410  *    Context: Can be called from base context.
2411  */
2412 /* ARGSUSED */
2413 hermon_workq_hdr_t *
2414 hermon_wrid_wqhdr_create(int bufsz)
2415 {
2416 	hermon_workq_hdr_t	*wqhdr;
2417 
2418 	/*
2419 	 * Allocate space for the wqhdr, and an array to record all the wrids.
2420 	 */
2421 	wqhdr = (hermon_workq_hdr_t *)kmem_zalloc(sizeof (*wqhdr), KM_NOSLEEP);
2422 	if (wqhdr == NULL) {
2423 		return (NULL);
2424 	}
2425 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr))
2426 	wqhdr->wq_wrid = kmem_zalloc(bufsz * sizeof (uint64_t), KM_NOSLEEP);
2427 	if (wqhdr->wq_wrid == NULL) {
2428 		kmem_free(wqhdr, sizeof (*wqhdr));
2429 		return (NULL);
2430 	}
2431 	wqhdr->wq_size = bufsz;
2432 	wqhdr->wq_mask = bufsz - 1;
2433 
2434 	return (wqhdr);
2435 }
2436 
2437 void
2438 hermon_wrid_wqhdr_destroy(hermon_workq_hdr_t *wqhdr)
2439 {
2440 	kmem_free(wqhdr->wq_wrid, wqhdr->wq_size * sizeof (uint64_t));
2441 	kmem_free(wqhdr, sizeof (*wqhdr));
2442 }
2443 
2444 
2445 /*
2446  * hermon_cq_workq_add()
2447  *    Context: Can be called from interrupt or base context.
2448  */
2449 static void
2450 hermon_cq_workq_add(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl)
2451 {
2452 	hermon_workq_compare_t	cmp;
2453 	avl_index_t		where;
2454 
2455 	cmp.cmp_qpn = wqavl->wqa_qpn;
2456 	cmp.cmp_type = wqavl->wqa_type;
2457 #ifdef __lock_lint
2458 	hermon_wrid_workq_compare(NULL, NULL);
2459 #endif
2460 	(void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
2461 	avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqavl, where);
2462 }
2463 
2464 
2465 /*
2466  * hermon_cq_workq_remove()
2467  *    Context: Can be called from interrupt or base context.
2468  */
2469 static void
2470 hermon_cq_workq_remove(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl)
2471 {
2472 #ifdef __lock_lint
2473 	hermon_wrid_workq_compare(NULL, NULL);
2474 #endif
2475 	avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqavl);
2476 }
2477