xref: /illumos-gate/usr/src/uts/common/io/ib/adapters/hermon/hermon_qp.c (revision 17a2b317610f531d565bf4e940433aab2d9e6985)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * hermon_qp.c
28  *    Hermon Queue Pair Processing Routines
29  *
30  *    Implements all the routines necessary for allocating, freeing, and
31  *    querying the Hermon queue pairs.
32  */
33 
34 #include <sys/types.h>
35 #include <sys/conf.h>
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/modctl.h>
39 #include <sys/bitmap.h>
40 #include <sys/sysmacros.h>
41 
42 #include <sys/ib/adapters/hermon/hermon.h>
43 #include <sys/ib/ib_pkt_hdrs.h>
44 
/* Allocate/track a QP number (AVL-managed) for the given QPC entry */
45 static int hermon_qp_create_qpn(hermon_state_t *state, hermon_qphdl_t qp,
46     hermon_rsrc_t *qpc);
/* AVL comparison routine for entries in the QP number tree */
47 static int hermon_qpn_avl_compare(const void *q, const void *e);
/* Acquire the per-port aliased QPC resource for a special (SMI/GSI) QP */
48 static int hermon_special_qp_rsrc_alloc(hermon_state_t *state,
49     ibt_sqp_type_t type, uint_t port, hermon_rsrc_t **qp_rsrc);
/* Release the per-port special QP resource acquired above */
50 static int hermon_special_qp_rsrc_free(hermon_state_t *state,
51     ibt_sqp_type_t type, uint_t port);
/* Convert a requested SGL count into a log2 WQE stride and real SGL count */
52 static void hermon_qp_sgl_to_logwqesz(hermon_state_t *state, uint_t num_sgl,
53     uint_t real_max_sgl, hermon_qp_wq_type_t wq_type,
54     uint_t *logwqesz, uint_t *max_sgl);
55 
56 /*
57  * hermon_qp_alloc()
58  *    Context: Can be called only from user or kernel context.
 *
 *    Allocates one ordinary (UD/RC/UC) queue pair.  Extracts the request
 *    from "qpinfo", validates flags/handles, then acquires (in order): a
 *    QPC entry, the software QP handle, an optional umap DB entry, the RQ
 *    doorbell record (unless on an SRQ), the work queue memory, and the
 *    cMPT memory registration for that memory.  On success the QP handle
 *    is returned through qpinfo->qpi_qphdl and DDI_SUCCESS is returned;
 *    on failure every prior step is unwound via the qpalloc_fail* labels
 *    below and an ibt_status_t error is returned.  "sleepflag" selects
 *    HERMON_SLEEP vs. HERMON_NOSLEEP behavior for the allocations.
59  */
60 int
61 hermon_qp_alloc(hermon_state_t *state, hermon_qp_info_t *qpinfo,
62     uint_t sleepflag)
63 {
64 	hermon_rsrc_t			*qpc, *rsrc;
65 	hermon_rsrc_type_t		rsrc_type;
66 	hermon_umap_db_entry_t		*umapdb;
67 	hermon_qphdl_t			qp;
68 	ibt_qp_alloc_attr_t		*attr_p;
69 	ibt_qp_alloc_flags_t		alloc_flags;
70 	ibt_qp_type_t			type;
71 	hermon_qp_wq_type_t		swq_type;
72 	ibtl_qp_hdl_t			ibt_qphdl;
73 	ibt_chan_sizes_t		*queuesz_p;
74 	ib_qpn_t			*qpn;
75 	hermon_qphdl_t			*qphdl;
76 	ibt_mr_attr_t			mr_attr;
77 	hermon_mr_options_t		mr_op;
78 	hermon_srqhdl_t			srq;
79 	hermon_pdhdl_t			pd;
80 	hermon_cqhdl_t			sq_cq, rq_cq;
81 	hermon_mrhdl_t			mr;
82 	uint64_t			value, qp_desc_off;
83 	uint64_t			*thewqe, thewqesz;
84 	uint32_t			*sq_buf, *rq_buf;
85 	uint32_t			log_qp_sq_size, log_qp_rq_size;
86 	uint32_t			sq_size, rq_size;
87 	uint32_t			sq_depth, rq_depth;
88 	uint32_t			sq_wqe_size, rq_wqe_size, wqesz_shift;
89 	uint32_t			max_sgl, max_recv_sgl, uarpg;
90 	uint_t				qp_is_umap;
91 	uint_t				qp_srq_en, i, j;
92 	int				status, flag;
93 
94 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr_p, *queuesz_p))
95 
96 	/*
97 	 * Extract the necessary info from the hermon_qp_info_t structure
98 	 */
99 	attr_p	  = qpinfo->qpi_attrp;
100 	type	  = qpinfo->qpi_type;
101 	ibt_qphdl = qpinfo->qpi_ibt_qphdl;
102 	queuesz_p = qpinfo->qpi_queueszp;
103 	qpn	  = qpinfo->qpi_qpn;
104 	qphdl	  = &qpinfo->qpi_qphdl;
105 	alloc_flags = attr_p->qp_alloc_flags;
106 
107 	/*
108 	 * Verify correctness of alloc_flags.
109 	 *
110 	 * 1. FEXCH and RSS are only allocated via qp_range.
111 	 */
112 	if (alloc_flags & (IBT_QP_USES_FEXCH | IBT_QP_USES_RSS)) {
113 		return (IBT_INVALID_PARAM);
114 	}
115 	rsrc_type = HERMON_QPC;
116 	qp_is_umap = 0;
117 
118 	/* 2. Make sure only one of these flags is set. */
119 	switch (alloc_flags &
120 	    (IBT_QP_USER_MAP | IBT_QP_USES_RFCI | IBT_QP_USES_FCMD)) {
121 	case IBT_QP_USER_MAP:
122 		qp_is_umap = 1;
123 		break;
124 	case IBT_QP_USES_RFCI:
125 		if (type != IBT_UD_RQP)
126 			return (IBT_INVALID_PARAM);
127 
		/* RFCI QPCs come from a per-port resource pool */
128 		switch (attr_p->qp_fc.fc_hca_port) {
129 		case 1:
130 			rsrc_type = HERMON_QPC_RFCI_PORT1;
131 			break;
132 		case 2:
133 			rsrc_type = HERMON_QPC_RFCI_PORT2;
134 			break;
135 		default:
136 			return (IBT_INVALID_PARAM);
137 		}
138 		break;
139 	case IBT_QP_USES_FCMD:
140 		if (type != IBT_UD_RQP)
141 			return (IBT_INVALID_PARAM);
142 		break;
143 	case 0:
144 		break;
145 	default:
146 		return (IBT_INVALID_PARAM);	/* conflicting flags set */
147 	}
148 
149 	/*
150 	 * Determine whether QP is being allocated for userland access or
151 	 * whether it is being allocated for kernel access.  If the QP is
152 	 * being allocated for userland access, then lookup the UAR
153 	 * page number for the current process.  Note:  If this is not found
154 	 * (e.g. if the process has not previously open()'d the Hermon driver),
155 	 * then an error is returned.
156 	 */
157 	if (qp_is_umap) {
158 		status = hermon_umap_db_find(state->hs_instance, ddi_get_pid(),
159 		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
160 		if (status != DDI_SUCCESS) {
161 			return (IBT_INVALID_PARAM);
162 		}
163 		uarpg = ((hermon_rsrc_t *)(uintptr_t)value)->hr_indx;
164 	} else {
165 		uarpg = state->hs_kernel_uar_index;
166 	}
167 
168 	/*
169 	 * Determine whether QP is being associated with an SRQ
170 	 */
171 	qp_srq_en = (alloc_flags & IBT_QP_USES_SRQ) ? 1 : 0;
172 	if (qp_srq_en) {
173 		/*
174 		 * Check for valid SRQ handle pointers
175 		 */
176 		if (attr_p->qp_ibc_srq_hdl == NULL) {
177 			status = IBT_SRQ_HDL_INVALID;
178 			goto qpalloc_fail;
179 		}
180 		srq = (hermon_srqhdl_t)attr_p->qp_ibc_srq_hdl; /* valid only when qp_srq_en */
181 	}
182 
183 	/*
184 	 * Check for valid QP service type (only UD/RC/UC supported)
185 	 */
186 	if (((type != IBT_UD_RQP) && (type != IBT_RC_RQP) &&
187 	    (type != IBT_UC_RQP))) {
188 		status = IBT_QP_SRV_TYPE_INVALID;
189 		goto qpalloc_fail;
190 	}
191 
192 
193 	/*
194 	 * Check for valid PD handle pointer
195 	 */
196 	if (attr_p->qp_pd_hdl == NULL) {
197 		status = IBT_PD_HDL_INVALID;
198 		goto qpalloc_fail;
199 	}
200 	pd = (hermon_pdhdl_t)attr_p->qp_pd_hdl;
201 
202 	/*
203 	 * If on an SRQ, check to make sure the PD is the same
204 	 */
205 	if (qp_srq_en && (pd->pd_pdnum != srq->srq_pdhdl->pd_pdnum)) {
206 		status = IBT_PD_HDL_INVALID;
207 		goto qpalloc_fail;
208 	}
209 
210 	/* Increment the reference count on the protection domain (PD) */
211 	hermon_pd_refcnt_inc(pd);
212 
213 	/*
214 	 * Check for valid CQ handle pointers
215 	 *
216 	 * FCMD QPs do not require a receive cq handle.
217 	 */
218 	if (attr_p->qp_ibc_scq_hdl == NULL) {
219 		status = IBT_CQ_HDL_INVALID;
220 		goto qpalloc_fail1;
221 	}
222 	sq_cq = (hermon_cqhdl_t)attr_p->qp_ibc_scq_hdl;
223 	if ((attr_p->qp_ibc_rcq_hdl == NULL)) {
224 		if ((alloc_flags & IBT_QP_USES_FCMD) == 0) {
225 			status = IBT_CQ_HDL_INVALID;
226 			goto qpalloc_fail1;
227 		}
228 		rq_cq = sq_cq;	/* just use the send cq */
229 	} else
230 		rq_cq = (hermon_cqhdl_t)attr_p->qp_ibc_rcq_hdl;
231 
232 	/*
233 	 * Increment the reference count on the CQs.  One or both of these
234 	 * could return error if we determine that the given CQ is already
235 	 * being used with a special (SMI/GSI) QP.
236 	 */
237 	status = hermon_cq_refcnt_inc(sq_cq, HERMON_CQ_IS_NORMAL);
238 	if (status != DDI_SUCCESS) {
239 		status = IBT_CQ_HDL_INVALID;
240 		goto qpalloc_fail1;
241 	}
242 	status = hermon_cq_refcnt_inc(rq_cq, HERMON_CQ_IS_NORMAL);
243 	if (status != DDI_SUCCESS) {
244 		status = IBT_CQ_HDL_INVALID;
245 		goto qpalloc_fail2;
246 	}
247 
248 	/*
249 	 * Allocate an QP context entry.  This will be filled in with all
250 	 * the necessary parameters to define the Queue Pair.  Unlike
251 	 * other Hermon hardware resources, ownership is not immediately
252 	 * given to hardware in the final step here.  Instead, we must
253 	 * wait until the QP is later transitioned to the "Init" state before
254 	 * passing the QP to hardware.  If we fail here, we must undo all
255 	 * the reference count (CQ and PD).
256 	 */
257 	status = hermon_rsrc_alloc(state, rsrc_type, 1, sleepflag, &qpc);
258 	if (status != DDI_SUCCESS) {
259 		status = IBT_INSUFF_RESOURCE;
260 		goto qpalloc_fail3;
261 	}
262 
263 	/*
264 	 * Allocate the software structure for tracking the queue pair
265 	 * (i.e. the Hermon Queue Pair handle).  If we fail here, we must
266 	 * undo the reference counts and the previous resource allocation.
267 	 */
268 	status = hermon_rsrc_alloc(state, HERMON_QPHDL, 1, sleepflag, &rsrc);
269 	if (status != DDI_SUCCESS) {
270 		status = IBT_INSUFF_RESOURCE;
271 		goto qpalloc_fail4;
272 	}
273 	qp = (hermon_qphdl_t)rsrc->hr_addr;
274 	bzero(qp, sizeof (struct hermon_sw_qp_s));
275 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp))
276 
277 	qp->qp_alloc_flags = alloc_flags;
278 
279 	/*
280 	 * Calculate the QP number from QPC index.  This routine handles
281 	 * all of the operations necessary to keep track of used, unused,
282 	 * and released QP numbers.
283 	 */
284 	if (type == IBT_UD_RQP) {
		/* UD QPs use the QPC index directly; no AVL tracking needed */
285 		qp->qp_qpnum = qpc->hr_indx;
286 		qp->qp_ring = qp->qp_qpnum << 8;
287 		qp->qp_qpn_hdl = NULL;
288 	} else {
289 		status = hermon_qp_create_qpn(state, qp, qpc);
290 		if (status != DDI_SUCCESS) {
291 			status = IBT_INSUFF_RESOURCE;
292 			goto qpalloc_fail5;
293 		}
294 	}
295 
296 	/*
297 	 * If this will be a user-mappable QP, then allocate an entry for
298 	 * the "userland resources database".  This will later be added to
299 	 * the database (after all further QP operations are successful).
300 	 * If we fail here, we must undo the reference counts and the
301 	 * previous resource allocation.
302 	 */
303 	if (qp_is_umap) {
304 		umapdb = hermon_umap_db_alloc(state->hs_instance, qp->qp_qpnum,
305 		    MLNX_UMAP_QPMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
306 		if (umapdb == NULL) {
307 			status = IBT_INSUFF_RESOURCE;
308 			goto qpalloc_fail6;
309 		}
310 	}
311 
312 	/*
313 	 * Allocate the doorbell record.  Hermon just needs one for the RQ,
314 	 * if the QP is not associated with an SRQ, and use uarpg (above) as
315 	 * the uar index
316 	 */
317 
318 	if (!qp_srq_en) {
319 		status = hermon_dbr_alloc(state, uarpg, &qp->qp_rq_dbr_acchdl,
320 		    &qp->qp_rq_vdbr, &qp->qp_rq_pdbr, &qp->qp_rdbr_mapoffset);
321 		if (status != DDI_SUCCESS) {
322 			status = IBT_INSUFF_RESOURCE;
323 			goto qpalloc_fail6;
324 		}
325 	}
326 
327 	qp->qp_uses_lso = (attr_p->qp_flags & IBT_USES_LSO);
328 
329 	/*
330 	 * We verify that the requested number of SGL is valid (i.e.
331 	 * consistent with the device limits and/or software-configured
332 	 * limits).  If not, then obviously the same cleanup needs to be done.
333 	 */
334 	if (type == IBT_UD_RQP) {
335 		max_sgl = state->hs_ibtfinfo.hca_attr->hca_ud_send_sgl_sz;
336 		swq_type = HERMON_QP_WQ_TYPE_SENDQ_UD;
337 	} else {
338 		max_sgl = state->hs_ibtfinfo.hca_attr->hca_conn_send_sgl_sz;
339 		swq_type = HERMON_QP_WQ_TYPE_SENDQ_CONN;
340 	}
341 	max_recv_sgl = state->hs_ibtfinfo.hca_attr->hca_recv_sgl_sz;
342 	if ((attr_p->qp_sizes.cs_sq_sgl > max_sgl) ||
343 	    (!qp_srq_en && (attr_p->qp_sizes.cs_rq_sgl > max_recv_sgl))) {
344 		status = IBT_HCA_SGL_EXCEEDED;
345 		goto qpalloc_fail7;
346 	}
347 
348 	/*
349 	 * Determine this QP's WQE stride (for both the Send and Recv WQEs).
350 	 * This will depend on the requested number of SGLs.  Note: this
351 	 * has the side-effect of also calculating the real number of SGLs
352 	 * (for the calculated WQE size).
353 	 *
354 	 * For QP's on an SRQ, we set these to 0.
355 	 */
356 	if (qp_srq_en) {
357 		qp->qp_rq_log_wqesz = 0;
358 		qp->qp_rq_sgl = 0;
359 	} else {
360 		hermon_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_rq_sgl,
361 		    max_recv_sgl, HERMON_QP_WQ_TYPE_RECVQ,
362 		    &qp->qp_rq_log_wqesz, &qp->qp_rq_sgl);
363 	}
364 	hermon_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl,
365 	    max_sgl, swq_type, &qp->qp_sq_log_wqesz, &qp->qp_sq_sgl);
366 
367 	sq_wqe_size = 1 << qp->qp_sq_log_wqesz;
368 
369 	/* NOTE: currently policy in driver, later maybe IBTF interface */
370 	qp->qp_no_prefetch = 0;
371 
372 	/*
373 	 * for prefetching, we need to add the number of wqes in
374 	 * the 2k area plus one to the number requested, but
375 	 * ONLY for send queue.  If no_prefetch == 1 (prefetch off)
376 	 * it's exactly TWO wqes for the headroom
377 	 */
378 	if (qp->qp_no_prefetch)
379 		qp->qp_sq_headroom = 2 * sq_wqe_size;
380 	else
381 		qp->qp_sq_headroom = sq_wqe_size + HERMON_QP_OH_SIZE;
382 	/*
383 	 * hdrm wqes must be integral since both sq_wqe_size &
384 	 * HERMON_QP_OH_SIZE are power of 2
385 	 */
386 	qp->qp_sq_hdrmwqes = (qp->qp_sq_headroom / sq_wqe_size);
387 
388 
389 	/*
390 	 * Calculate the appropriate size for the work queues.
391 	 * For send queue, add in the headroom wqes to the calculation.
392 	 * Note:  All Hermon QP work queues must be a power-of-2 in size.  Also
393 	 * they may not be any smaller than HERMON_QP_MIN_SIZE.  This step is
394 	 * to round the requested size up to the next highest power-of-2
395 	 */
396 	/* first, adjust to a minimum and tell the caller the change */
397 	attr_p->qp_sizes.cs_sq = max(attr_p->qp_sizes.cs_sq,
398 	    HERMON_QP_MIN_SIZE);
399 	attr_p->qp_sizes.cs_rq = max(attr_p->qp_sizes.cs_rq,
400 	    HERMON_QP_MIN_SIZE);
401 	/*
402 	 * now, calculate the alloc size, taking into account
403 	 * the headroom for the sq
404 	 */
405 	log_qp_sq_size = highbit(attr_p->qp_sizes.cs_sq + qp->qp_sq_hdrmwqes);
406 	/* if the total is a power of two, reduce it */
407 	if (((attr_p->qp_sizes.cs_sq + qp->qp_sq_hdrmwqes)  &
408 	    (attr_p->qp_sizes.cs_sq + qp->qp_sq_hdrmwqes - 1)) == 0)	{
409 		log_qp_sq_size = log_qp_sq_size - 1;
410 	}
411 
412 	log_qp_rq_size = highbit(attr_p->qp_sizes.cs_rq);
413 	if ((attr_p->qp_sizes.cs_rq & (attr_p->qp_sizes.cs_rq - 1)) == 0) {
414 		log_qp_rq_size = log_qp_rq_size - 1;
415 	}
416 
417 	/*
418 	 * Next we verify that the rounded-up size is valid (i.e. consistent
419 	 * with the device limits and/or software-configured limits).  If not,
420 	 * then obviously we have a lot of cleanup to do before returning.
421 	 *
422 	 * NOTE: the first condition deals with the (test) case of cs_sq
423 	 * being just less than 2^32.  In this case, the headroom addition
424 	 * to the requested cs_sq will pass the test when it should not.
425 	 * This test no longer lets that case slip through the check.
426 	 */
427 	if ((attr_p->qp_sizes.cs_sq >
428 	    (1 << state->hs_cfg_profile->cp_log_max_qp_sz)) ||
429 	    (log_qp_sq_size > state->hs_cfg_profile->cp_log_max_qp_sz) ||
430 	    (!qp_srq_en && (log_qp_rq_size >
431 	    state->hs_cfg_profile->cp_log_max_qp_sz))) {
432 		status = IBT_HCA_WR_EXCEEDED;
433 		goto qpalloc_fail7;
434 	}
435 
436 	/*
437 	 * Allocate the memory for QP work queues. Since Hermon work queues
438 	 * are not allowed to cross a 32-bit (4GB) boundary, the alignment of
439 	 * the work queue memory is very important.  We used to allocate
440 	 * work queues (the combined receive and send queues) so that they
441 	 * would be aligned on their combined size.  That alignment guaranteed
442 	 * that they would never cross the 4GB boundary (Hermon work queues
443 	 * are on the order of MBs at maximum).  Now we are able to relax
444 	 * this alignment constraint by ensuring that the IB address assigned
445 	 * to the queue memory (as a result of the hermon_mr_register() call)
446 	 * is offset from zero.
447 	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
448 	 * guarantee the alignment, but when attempting to use IOMMU bypass
449 	 * mode we found that we were not allowed to specify any alignment
450 	 * that was more restrictive than the system page size.
451 	 * So we avoided this constraint by passing two alignment values,
452 	 * one for the memory allocation itself and the other for the DMA
453 	 * handle (for later bind).  This used to cause more memory than
454 	 * necessary to be allocated (in order to guarantee the more
455 	 * restrictive alignment contraint).  But by guaranteeing the
456 	 * zero-based IB virtual address for the queue, we are able to
457 	 * conserve this memory.
458 	 */
459 	sq_wqe_size = 1 << qp->qp_sq_log_wqesz;
460 	sq_depth    = 1 << log_qp_sq_size;
461 	sq_size	    = sq_depth * sq_wqe_size;
462 
463 	/* QP on SRQ sets these to 0 */
464 	if (qp_srq_en) {
465 		rq_wqe_size = 0;
466 		rq_size	    = 0;
467 	} else {
468 		rq_wqe_size = 1 << qp->qp_rq_log_wqesz;
469 		rq_depth    = 1 << log_qp_rq_size;
470 		rq_size	    = rq_depth * rq_wqe_size;
471 	}
472 
473 	qp->qp_wqinfo.qa_size = sq_size + rq_size;
474 
475 	qp->qp_wqinfo.qa_alloc_align = PAGESIZE;
476 	qp->qp_wqinfo.qa_bind_align  = PAGESIZE;
477 
478 	if (qp_is_umap) {
479 		qp->qp_wqinfo.qa_location = HERMON_QUEUE_LOCATION_USERLAND;
480 	} else {
481 		qp->qp_wqinfo.qa_location = HERMON_QUEUE_LOCATION_NORMAL;
482 	}
483 	status = hermon_queue_alloc(state, &qp->qp_wqinfo, sleepflag);
484 	if (status != DDI_SUCCESS) {
485 		status = IBT_INSUFF_RESOURCE;
486 		goto qpalloc_fail7;
487 	}
488 
489 	/*
490 	 * Sort WQs in memory according to stride (*q_wqe_size), largest first
491 	 * If they are equal, still put the SQ first
492 	 */
493 	qp->qp_sq_baseaddr = 0;
494 	qp->qp_rq_baseaddr = 0;
495 	if ((sq_wqe_size > rq_wqe_size) || (sq_wqe_size == rq_wqe_size)) {
496 		sq_buf = qp->qp_wqinfo.qa_buf_aligned;
497 
498 		/* if this QP is on an SRQ, set the rq_buf to NULL */
499 		if (qp_srq_en) {
500 			rq_buf = NULL;
501 		} else {
502 			rq_buf = (uint32_t *)((uintptr_t)sq_buf + sq_size);
503 			qp->qp_rq_baseaddr = sq_size;
504 		}
505 	} else {
506 		rq_buf = qp->qp_wqinfo.qa_buf_aligned;
507 		sq_buf = (uint32_t *)((uintptr_t)rq_buf + rq_size);
508 		qp->qp_sq_baseaddr = rq_size;
509 	}
510 
	/* WRID tracking structures are only needed for kernel-owned QPs */
511 	if (qp_is_umap == 0) {
512 		qp->qp_sq_wqhdr = hermon_wrid_wqhdr_create(sq_depth);
513 		if (qp->qp_sq_wqhdr == NULL) {
514 			status = IBT_INSUFF_RESOURCE;
515 			goto qpalloc_fail8;
516 		}
517 		if (qp_srq_en) {
518 			qp->qp_rq_wqavl.wqa_wq = srq->srq_wq_wqhdr;
519 			qp->qp_rq_wqavl.wqa_srq_en = 1;
520 			qp->qp_rq_wqavl.wqa_srq = srq;
521 		} else {
522 			qp->qp_rq_wqhdr = hermon_wrid_wqhdr_create(rq_depth);
523 			if (qp->qp_rq_wqhdr == NULL) {
524 				status = IBT_INSUFF_RESOURCE;
525 				goto qpalloc_fail8;
526 			}
527 			qp->qp_rq_wqavl.wqa_wq = qp->qp_rq_wqhdr;
528 		}
529 		qp->qp_sq_wqavl.wqa_qpn = qp->qp_qpnum;
530 		qp->qp_sq_wqavl.wqa_type = HERMON_WR_SEND;
531 		qp->qp_sq_wqavl.wqa_wq = qp->qp_sq_wqhdr;
532 		qp->qp_rq_wqavl.wqa_qpn = qp->qp_qpnum;
533 		qp->qp_rq_wqavl.wqa_type = HERMON_WR_RECV;
534 	}
535 
536 	/*
537 	 * Register the memory for the QP work queues.  The memory for the
538 	 * QP must be registered in the Hermon cMPT tables.  This gives us the
539 	 * LKey to specify in the QP context later.  Note: The memory for
540 	 * Hermon work queues (both Send and Recv) must be contiguous and
541 	 * registered as a single memory region.  Note: If the QP memory is
542 	 * user-mappable, force DDI_DMA_CONSISTENT mapping. Also, in order to
543 	 * meet the alignment restriction, we pass the "mro_bind_override_addr"
544 	 * flag in the call to hermon_mr_register(). This guarantees that the
545 	 * resulting IB vaddr will be zero-based (modulo the offset into the
546 	 * first page). If we fail here, we still have the bunch of resource
547 	 * and reference count cleanup to do.
548 	 */
549 	flag = (sleepflag == HERMON_SLEEP) ? IBT_MR_SLEEP :
550 	    IBT_MR_NOSLEEP;
551 	mr_attr.mr_vaddr    = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned;
552 	mr_attr.mr_len	    = qp->qp_wqinfo.qa_size;
553 	mr_attr.mr_as	    = NULL;
554 	mr_attr.mr_flags    = flag;
555 	if (qp_is_umap) {
		/* NOTE(review): both branches currently choose the same
		 * bind type; the split is kept for the umap/normal cases */
556 		mr_op.mro_bind_type = state->hs_cfg_profile->cp_iommu_bypass;
557 	} else {
558 		/* HERMON_QUEUE_LOCATION_NORMAL */
559 		mr_op.mro_bind_type =
560 		    state->hs_cfg_profile->cp_iommu_bypass;
561 	}
562 	mr_op.mro_bind_dmahdl = qp->qp_wqinfo.qa_dmahdl;
563 	mr_op.mro_bind_override_addr = 1;
564 	status = hermon_mr_register(state, pd, &mr_attr, &mr,
565 	    &mr_op, HERMON_QP_CMPT);
566 	if (status != DDI_SUCCESS) {
567 		status = IBT_INSUFF_RESOURCE;
568 		goto qpalloc_fail9;
569 	}
570 
571 	/*
572 	 * Calculate the offset between the kernel virtual address space
573 	 * and the IB virtual address space.  This will be used when
574 	 * posting work requests to properly initialize each WQE.
575 	 */
576 	qp_desc_off = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned -
577 	    (uint64_t)mr->mr_bindinfo.bi_addr;
578 
579 	/*
580 	 * Fill in all the return arguments (if necessary).  This includes
581 	 * real work queue sizes (in wqes), real SGLs, and QP number
582 	 */
583 	if (queuesz_p != NULL) {
584 		queuesz_p->cs_sq 	=
585 		    (1 << log_qp_sq_size) - qp->qp_sq_hdrmwqes;
586 		queuesz_p->cs_sq_sgl	= qp->qp_sq_sgl;
587 
588 		/* if this QP is on an SRQ, set these to 0 */
589 		if (qp_srq_en) {
590 			queuesz_p->cs_rq	= 0;
591 			queuesz_p->cs_rq_sgl	= 0;
592 		} else {
593 			queuesz_p->cs_rq	= (1 << log_qp_rq_size);
594 			queuesz_p->cs_rq_sgl	= qp->qp_rq_sgl;
595 		}
596 	}
597 	if (qpn != NULL) {
598 		*qpn = (ib_qpn_t)qp->qp_qpnum;
599 	}
600 
601 	/*
602 	 * Fill in the rest of the Hermon Queue Pair handle.
603 	 */
604 	qp->qp_qpcrsrcp		= qpc;
605 	qp->qp_rsrcp		= rsrc;
606 	qp->qp_state		= HERMON_QP_RESET;
607 	HERMON_SET_QP_POST_SEND_STATE(qp, HERMON_QP_RESET);
608 	qp->qp_pdhdl		= pd;
609 	qp->qp_mrhdl		= mr;
610 	qp->qp_sq_sigtype	= (attr_p->qp_flags & IBT_WR_SIGNALED) ?
611 	    HERMON_QP_SQ_WR_SIGNALED : HERMON_QP_SQ_ALL_SIGNALED;
612 	qp->qp_is_special	= 0;
613 	qp->qp_uarpg		= uarpg;
614 	qp->qp_umap_dhp		= (devmap_cookie_t)NULL;
615 	qp->qp_sq_cqhdl		= sq_cq;
616 	qp->qp_sq_bufsz		= (1 << log_qp_sq_size);
617 	qp->qp_sq_logqsz	= log_qp_sq_size;
618 	qp->qp_sq_buf		= sq_buf;
619 	qp->qp_desc_off		= qp_desc_off;
620 	qp->qp_rq_cqhdl		= rq_cq;
621 	qp->qp_rq_buf		= rq_buf;
622 	qp->qp_rlky		= (attr_p->qp_flags & IBT_FAST_REG_RES_LKEY) !=
623 	    0;
624 
625 	/* if this QP is on an SRQ, set rq_bufsz to 0 */
626 	if (qp_srq_en) {
627 		qp->qp_rq_bufsz		= 0;
628 		qp->qp_rq_logqsz	= 0;
629 	} else {
630 		qp->qp_rq_bufsz		= (1 << log_qp_rq_size);
631 		qp->qp_rq_logqsz	= log_qp_rq_size;
632 	}
633 
634 	qp->qp_forward_sqd_event  = 0;
635 	qp->qp_sqd_still_draining = 0;
636 	qp->qp_hdlrarg		= (void *)ibt_qphdl;
637 	qp->qp_mcg_refcnt	= 0;
638 
639 	/*
640 	 * If this QP is to be associated with an SRQ, set the SRQ handle
641 	 */
642 	if (qp_srq_en) {
643 		qp->qp_srqhdl = srq;
644 		hermon_srq_refcnt_inc(qp->qp_srqhdl);
645 	} else {
646 		qp->qp_srqhdl = NULL;
647 	}
648 
649 	/* Determine the QP service type */
650 	qp->qp_type = type;
651 	if (type == IBT_RC_RQP) {
652 		qp->qp_serv_type = HERMON_QP_RC;
653 	} else if (type == IBT_UD_RQP) {
654 		if (alloc_flags & IBT_QP_USES_RFCI)
655 			qp->qp_serv_type = HERMON_QP_RFCI;
656 		else if (alloc_flags & IBT_QP_USES_FCMD)
657 			qp->qp_serv_type = HERMON_QP_FCMND;
658 		else
659 			qp->qp_serv_type = HERMON_QP_UD;
660 	} else {
661 		qp->qp_serv_type = HERMON_QP_UC;
662 	}
663 
664 	/*
665 	 * Initialize the RQ WQEs - unlike Arbel, no Rcv init is needed
666 	 */
667 
668 	/*
669 	 * Initialize the SQ WQEs - all that needs to be done is every 64 bytes
670 	 * set the quadword to all F's - high-order bit is owner (init to one)
671 	 * and the rest for the headroom definition of prefetching
672 	 *
673 	 */
674 	wqesz_shift = qp->qp_sq_log_wqesz;
675 	thewqesz    = 1 << wqesz_shift;
676 	thewqe = (uint64_t *)(void *)(qp->qp_sq_buf);
677 	if (qp_is_umap == 0) {
678 		for (i = 0; i < sq_depth; i++) {
679 			/*
680 			 * for each stride, go through and every 64 bytes
681 			 * write the init value - having set the address
682 			 * once, just keep incrementing it
683 			 */
684 			for (j = 0; j < thewqesz; j += 64, thewqe += 8) {
685 				*(uint32_t *)thewqe = 0xFFFFFFFF;
686 			}
687 		}
688 	}
689 
690 	/* Zero out the QP context */
691 	bzero(&qp->qpc, sizeof (hermon_hw_qpc_t));
692 
693 	/*
694 	 * Put QP handle in Hermon QPNum-to-QPHdl list.  Then fill in the
695 	 * "qphdl" and return success
696 	 */
697 	hermon_icm_set_num_to_hdl(state, HERMON_QPC, qpc->hr_indx, qp);
698 
699 	/*
700 	 * If this is a user-mappable QP, then we need to insert the previously
701 	 * allocated entry into the "userland resources database".  This will
702 	 * allow for later lookup during devmap() (i.e. mmap()) calls.
703 	 */
704 	if (qp_is_umap) {
705 		hermon_umap_db_add(umapdb);
706 	}
707 	mutex_init(&qp->qp_sq_lock, NULL, MUTEX_DRIVER,
708 	    DDI_INTR_PRI(state->hs_intrmsi_pri));
709 
710 	*qphdl = qp;
711 
712 	return (DDI_SUCCESS);
713 
714 /*
715  * The following is cleanup for all possible failure cases in this routine
716  * (each label unwinds everything acquired before the failing step; fall
717  * through is intentional so later labels run for earlier failures too)
718  */
717 qpalloc_fail9:	/* MR registration failed: free the WQ memory */
718 	hermon_queue_free(&qp->qp_wqinfo);
719 qpalloc_fail8:	/* undo WRID work queue headers (either may be unset) */
720 	if (qp->qp_sq_wqhdr)
721 		hermon_wrid_wqhdr_destroy(qp->qp_sq_wqhdr);
722 	if (qp->qp_rq_wqhdr)
723 		hermon_wrid_wqhdr_destroy(qp->qp_rq_wqhdr);
724 qpalloc_fail7:	/* undo umap DB entry and RQ doorbell record */
725 	if (qp_is_umap) {
726 		hermon_umap_db_free(umapdb);
727 	}
728 	if (!qp_srq_en) {
729 		hermon_dbr_free(state, uarpg, qp->qp_rq_vdbr);
730 	}
731 
732 qpalloc_fail6:	/* undo QPN/QPC allocation */
733 	/*
734 	 * Releasing the QPN will also free up the QPC context.  Update
735 	 * the QPC context pointer to indicate this.
736 	 */
737 	if (qp->qp_qpn_hdl) {
738 		hermon_qp_release_qpn(state, qp->qp_qpn_hdl,
739 		    HERMON_QPN_RELEASE);
740 	} else {
741 		hermon_rsrc_free(state, &qpc);
742 	}
743 	qpc = NULL;
744 qpalloc_fail5:	/* undo the software QP handle */
745 	hermon_rsrc_free(state, &rsrc);
746 qpalloc_fail4:	/* undo the QPC (unless already released via the QPN) */
747 	if (qpc) {
748 		hermon_rsrc_free(state, &qpc);
749 	}
750 qpalloc_fail3:	/* drop the CQ reference counts */
751 	hermon_cq_refcnt_dec(rq_cq);
752 qpalloc_fail2:
753 	hermon_cq_refcnt_dec(sq_cq);
754 qpalloc_fail1:	/* drop the PD reference count */
755 	hermon_pd_refcnt_dec(pd);
756 qpalloc_fail:
757 	return (status);
758 }
759 
760 
761 
762 /*
763  * hermon_special_qp_alloc()
764  *    Context: Can be called only from user or kernel context.
765  */
766 int
767 hermon_special_qp_alloc(hermon_state_t *state, hermon_qp_info_t *qpinfo,
768     uint_t sleepflag)
769 {
770 	hermon_rsrc_t		*qpc, *rsrc;
771 	hermon_qphdl_t		qp;
772 	ibt_qp_alloc_attr_t	*attr_p;
773 	ibt_sqp_type_t		type;
774 	uint8_t			port;
775 	ibtl_qp_hdl_t		ibt_qphdl;
776 	ibt_chan_sizes_t	*queuesz_p;
777 	hermon_qphdl_t		*qphdl;
778 	ibt_mr_attr_t		mr_attr;
779 	hermon_mr_options_t	mr_op;
780 	hermon_pdhdl_t		pd;
781 	hermon_cqhdl_t		sq_cq, rq_cq;
782 	hermon_mrhdl_t		mr;
783 	uint64_t		qp_desc_off;
784 	uint64_t		*thewqe, thewqesz;
785 	uint32_t		*sq_buf, *rq_buf;
786 	uint32_t		log_qp_sq_size, log_qp_rq_size;
787 	uint32_t		sq_size, rq_size, max_sgl;
788 	uint32_t		uarpg;
789 	uint32_t		sq_depth;
790 	uint32_t		sq_wqe_size, rq_wqe_size, wqesz_shift;
791 	int			status, flag, i, j;
792 
793 	/*
794 	 * Extract the necessary info from the hermon_qp_info_t structure
795 	 */
796 	attr_p	  = qpinfo->qpi_attrp;
797 	type	  = qpinfo->qpi_type;
798 	port	  = qpinfo->qpi_port;
799 	ibt_qphdl = qpinfo->qpi_ibt_qphdl;
800 	queuesz_p = qpinfo->qpi_queueszp;
801 	qphdl	  = &qpinfo->qpi_qphdl;
802 
803 	/*
804 	 * Check for valid special QP type (only SMI & GSI supported)
805 	 */
806 	if ((type != IBT_SMI_SQP) && (type != IBT_GSI_SQP)) {
807 		status = IBT_QP_SPECIAL_TYPE_INVALID;
808 		goto spec_qpalloc_fail;
809 	}
810 
811 	/*
812 	 * Check for valid port number
813 	 */
814 	if (!hermon_portnum_is_valid(state, port)) {
815 		status = IBT_HCA_PORT_INVALID;
816 		goto spec_qpalloc_fail;
817 	}
818 	port = port - 1;
819 
820 	/*
821 	 * Check for valid PD handle pointer
822 	 */
823 	if (attr_p->qp_pd_hdl == NULL) {
824 		status = IBT_PD_HDL_INVALID;
825 		goto spec_qpalloc_fail;
826 	}
827 	pd = (hermon_pdhdl_t)attr_p->qp_pd_hdl;
828 
829 	/* Increment the reference count on the PD */
830 	hermon_pd_refcnt_inc(pd);
831 
832 	/*
833 	 * Check for valid CQ handle pointers
834 	 */
835 	if ((attr_p->qp_ibc_scq_hdl == NULL) ||
836 	    (attr_p->qp_ibc_rcq_hdl == NULL)) {
837 		status = IBT_CQ_HDL_INVALID;
838 		goto spec_qpalloc_fail1;
839 	}
840 	sq_cq = (hermon_cqhdl_t)attr_p->qp_ibc_scq_hdl;
841 	rq_cq = (hermon_cqhdl_t)attr_p->qp_ibc_rcq_hdl;
842 
843 	/*
844 	 * Increment the reference count on the CQs.  One or both of these
845 	 * could return error if we determine that the given CQ is already
846 	 * being used with a non-special QP (i.e. a normal QP).
847 	 */
848 	status = hermon_cq_refcnt_inc(sq_cq, HERMON_CQ_IS_SPECIAL);
849 	if (status != DDI_SUCCESS) {
850 		status = IBT_CQ_HDL_INVALID;
851 		goto spec_qpalloc_fail1;
852 	}
853 	status = hermon_cq_refcnt_inc(rq_cq, HERMON_CQ_IS_SPECIAL);
854 	if (status != DDI_SUCCESS) {
855 		status = IBT_CQ_HDL_INVALID;
856 		goto spec_qpalloc_fail2;
857 	}
858 
859 	/*
860 	 * Allocate the special QP resources.  Essentially, this allocation
861 	 * amounts to checking if the request special QP has already been
862 	 * allocated.  If successful, the QP context return is an actual
863 	 * QP context that has been "aliased" to act as a special QP of the
864 	 * appropriate type (and for the appropriate port).  Just as in
865 	 * hermon_qp_alloc() above, ownership for this QP context is not
866 	 * immediately given to hardware in the final step here.  Instead, we
867 	 * wait until the QP is later transitioned to the "Init" state before
868 	 * passing the QP to hardware.  If we fail here, we must undo all
869 	 * the reference count (CQ and PD).
870 	 */
871 	status = hermon_special_qp_rsrc_alloc(state, type, port, &qpc);
872 	if (status != DDI_SUCCESS) {
873 		goto spec_qpalloc_fail3;
874 	}
875 
876 	/*
877 	 * Allocate the software structure for tracking the special queue
878 	 * pair (i.e. the Hermon Queue Pair handle).  If we fail here, we
879 	 * must undo the reference counts and the previous resource allocation.
880 	 */
881 	status = hermon_rsrc_alloc(state, HERMON_QPHDL, 1, sleepflag, &rsrc);
882 	if (status != DDI_SUCCESS) {
883 		status = IBT_INSUFF_RESOURCE;
884 		goto spec_qpalloc_fail4;
885 	}
886 	qp = (hermon_qphdl_t)rsrc->hr_addr;
887 
888 	bzero(qp, sizeof (struct hermon_sw_qp_s));
889 
890 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp))
891 	qp->qp_alloc_flags = attr_p->qp_alloc_flags;
892 
893 	/*
894 	 * Actual QP number is a combination of the index of the QPC and
895 	 * the port number.  This is because the special QP contexts must
896 	 * be allocated two-at-a-time.
897 	 */
898 	qp->qp_qpnum = qpc->hr_indx + port;
899 	qp->qp_ring = qp->qp_qpnum << 8;
900 
901 	uarpg = state->hs_kernel_uar_index; /* must be for spec qp */
902 	/*
903 	 * Allocate the doorbell record.  Hermon uses only one for the RQ so
904 	 * alloc a qp doorbell, using uarpg (above) as the uar index
905 	 */
906 
907 	status = hermon_dbr_alloc(state, uarpg, &qp->qp_rq_dbr_acchdl,
908 	    &qp->qp_rq_vdbr, &qp->qp_rq_pdbr, &qp->qp_rdbr_mapoffset);
909 	if (status != DDI_SUCCESS) {
910 		status = IBT_INSUFF_RESOURCE;
911 		goto spec_qpalloc_fail5;
912 	}
913 	/*
914 	 * Calculate the appropriate size for the work queues.
915 	 * Note:  All Hermon QP work queues must be a power-of-2 in size.  Also
916 	 * they may not be any smaller than HERMON_QP_MIN_SIZE.  This step is
917 	 * to round the requested size up to the next highest power-of-2
918 	 */
919 	attr_p->qp_sizes.cs_sq =
920 	    max(attr_p->qp_sizes.cs_sq, HERMON_QP_MIN_SIZE);
921 	attr_p->qp_sizes.cs_rq =
922 	    max(attr_p->qp_sizes.cs_rq, HERMON_QP_MIN_SIZE);
923 	log_qp_sq_size = highbit(attr_p->qp_sizes.cs_sq);
924 	if ((attr_p->qp_sizes.cs_sq & (attr_p->qp_sizes.cs_sq - 1)) == 0) {
925 		log_qp_sq_size = log_qp_sq_size - 1;
926 	}
927 	log_qp_rq_size = highbit(attr_p->qp_sizes.cs_rq);
928 	if ((attr_p->qp_sizes.cs_rq & (attr_p->qp_sizes.cs_rq - 1)) == 0) {
929 		log_qp_rq_size = log_qp_rq_size - 1;
930 	}
931 
932 	/*
933 	 * Next we verify that the rounded-up size is valid (i.e. consistent
934 	 * with the device limits and/or software-configured limits).  If not,
935 	 * then obviously we have a bit of cleanup to do before returning.
936 	 */
937 	if ((log_qp_sq_size > state->hs_cfg_profile->cp_log_max_qp_sz) ||
938 	    (log_qp_rq_size > state->hs_cfg_profile->cp_log_max_qp_sz)) {
939 		status = IBT_HCA_WR_EXCEEDED;
940 		goto spec_qpalloc_fail5a;
941 	}
942 
943 	/*
944 	 * Next we verify that the requested number of SGL is valid (i.e.
945 	 * consistent with the device limits and/or software-configured
946 	 * limits).  If not, then obviously the same cleanup needs to be done.
947 	 */
948 	max_sgl = state->hs_cfg_profile->cp_wqe_real_max_sgl;
949 	if ((attr_p->qp_sizes.cs_sq_sgl > max_sgl) ||
950 	    (attr_p->qp_sizes.cs_rq_sgl > max_sgl)) {
951 		status = IBT_HCA_SGL_EXCEEDED;
952 		goto spec_qpalloc_fail5a;
953 	}
954 
955 	/*
956 	 * Determine this QP's WQE stride (for both the Send and Recv WQEs).
957 	 * This will depend on the requested number of SGLs.  Note: this
958 	 * has the side-effect of also calculating the real number of SGLs
959 	 * (for the calculated WQE size).
960 	 */
961 	hermon_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_rq_sgl,
962 	    max_sgl, HERMON_QP_WQ_TYPE_RECVQ,
963 	    &qp->qp_rq_log_wqesz, &qp->qp_rq_sgl);
964 	if (type == IBT_SMI_SQP) {
965 		hermon_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl,
966 		    max_sgl, HERMON_QP_WQ_TYPE_SENDMLX_QP0,
967 		    &qp->qp_sq_log_wqesz, &qp->qp_sq_sgl);
968 	} else {
969 		hermon_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl,
970 		    max_sgl, HERMON_QP_WQ_TYPE_SENDMLX_QP1,
971 		    &qp->qp_sq_log_wqesz, &qp->qp_sq_sgl);
972 	}
973 
974 	/*
975 	 * Allocate the memory for QP work queues. Since Hermon work queues
976 	 * are not allowed to cross a 32-bit (4GB) boundary, the alignment of
977 	 * the work queue memory is very important.  We used to allocate
978 	 * work queues (the combined receive and send queues) so that they
979 	 * would be aligned on their combined size.  That alignment guaranteed
980 	 * that they would never cross the 4GB boundary (Hermon work queues
981 	 * are on the order of MBs at maximum).  Now we are able to relax
982 	 * this alignment constraint by ensuring that the IB address assigned
983 	 * to the queue memory (as a result of the hermon_mr_register() call)
984 	 * is offset from zero.
985 	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
986 	 * guarantee the alignment, but when attempting to use IOMMU bypass
987 	 * mode we found that we were not allowed to specify any alignment
988 	 * that was more restrictive than the system page size.
989 	 * So we avoided this constraint by passing two alignment values,
990 	 * one for the memory allocation itself and the other for the DMA
991 	 * handle (for later bind).  This used to cause more memory than
992 	 * necessary to be allocated (in order to guarantee the more
 * restrictive alignment constraint).  But by guaranteeing the
994 	 * zero-based IB virtual address for the queue, we are able to
995 	 * conserve this memory.
996 	 */
997 	sq_wqe_size = 1 << qp->qp_sq_log_wqesz;
998 	sq_depth    = 1 << log_qp_sq_size;
999 	sq_size	    = (1 << log_qp_sq_size) * sq_wqe_size;
1000 
1001 	rq_wqe_size = 1 << qp->qp_rq_log_wqesz;
1002 	rq_size	    = (1 << log_qp_rq_size) * rq_wqe_size;
1003 
1004 	qp->qp_wqinfo.qa_size	  = sq_size + rq_size;
1005 
1006 	qp->qp_wqinfo.qa_alloc_align = PAGESIZE;
1007 	qp->qp_wqinfo.qa_bind_align  = PAGESIZE;
1008 	qp->qp_wqinfo.qa_location = HERMON_QUEUE_LOCATION_NORMAL;
1009 
1010 	status = hermon_queue_alloc(state, &qp->qp_wqinfo, sleepflag);
1011 	if (status != NULL) {
1012 		status = IBT_INSUFF_RESOURCE;
1013 		goto spec_qpalloc_fail5a;
1014 	}
1015 
1016 	/*
1017 	 * Sort WQs in memory according to depth, stride (*q_wqe_size),
1018 	 * biggest first. If equal, the Send Queue still goes first
1019 	 */
1020 	qp->qp_sq_baseaddr = 0;
1021 	qp->qp_rq_baseaddr = 0;
1022 	if ((sq_wqe_size > rq_wqe_size) || (sq_wqe_size == rq_wqe_size)) {
1023 		sq_buf = qp->qp_wqinfo.qa_buf_aligned;
1024 		rq_buf = (uint32_t *)((uintptr_t)sq_buf + sq_size);
1025 		qp->qp_rq_baseaddr = sq_size;
1026 	} else {
1027 		rq_buf = qp->qp_wqinfo.qa_buf_aligned;
1028 		sq_buf = (uint32_t *)((uintptr_t)rq_buf + rq_size);
1029 		qp->qp_sq_baseaddr = rq_size;
1030 	}
1031 
1032 	qp->qp_sq_wqhdr = hermon_wrid_wqhdr_create(sq_depth);
1033 	if (qp->qp_sq_wqhdr == NULL) {
1034 		status = IBT_INSUFF_RESOURCE;
1035 		goto spec_qpalloc_fail6;
1036 	}
1037 	qp->qp_rq_wqhdr = hermon_wrid_wqhdr_create(1 << log_qp_rq_size);
1038 	if (qp->qp_rq_wqhdr == NULL) {
1039 		status = IBT_INSUFF_RESOURCE;
1040 		goto spec_qpalloc_fail6;
1041 	}
1042 	qp->qp_sq_wqavl.wqa_qpn = qp->qp_qpnum;
1043 	qp->qp_sq_wqavl.wqa_type = HERMON_WR_SEND;
1044 	qp->qp_sq_wqavl.wqa_wq = qp->qp_sq_wqhdr;
1045 	qp->qp_rq_wqavl.wqa_qpn = qp->qp_qpnum;
1046 	qp->qp_rq_wqavl.wqa_type = HERMON_WR_RECV;
1047 	qp->qp_rq_wqavl.wqa_wq = qp->qp_rq_wqhdr;
1048 
1049 	/*
1050 	 * Register the memory for the special QP work queues.  The memory for
1051 	 * the special QP must be registered in the Hermon cMPT tables.  This
1052 	 * gives us the LKey to specify in the QP context later.  Note: The
1053 	 * memory for Hermon work queues (both Send and Recv) must be contiguous
1054 	 * and registered as a single memory region. Also, in order to meet the
1055 	 * alignment restriction, we pass the "mro_bind_override_addr" flag in
1056 	 * the call to hermon_mr_register(). This guarantees that the resulting
1057 	 * IB vaddr will be zero-based (modulo the offset into the first page).
1058 	 * If we fail here, we have a bunch of resource and reference count
1059 	 * cleanup to do.
1060 	 */
1061 	flag = (sleepflag == HERMON_SLEEP) ? IBT_MR_SLEEP :
1062 	    IBT_MR_NOSLEEP;
1063 	mr_attr.mr_vaddr    = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned;
1064 	mr_attr.mr_len	    = qp->qp_wqinfo.qa_size;
1065 	mr_attr.mr_as	    = NULL;
1066 	mr_attr.mr_flags    = flag;
1067 
1068 	mr_op.mro_bind_type = state->hs_cfg_profile->cp_iommu_bypass;
1069 	mr_op.mro_bind_dmahdl = qp->qp_wqinfo.qa_dmahdl;
1070 	mr_op.mro_bind_override_addr = 1;
1071 
1072 	status = hermon_mr_register(state, pd, &mr_attr, &mr, &mr_op,
1073 	    HERMON_QP_CMPT);
1074 	if (status != DDI_SUCCESS) {
1075 		status = IBT_INSUFF_RESOURCE;
1076 		goto spec_qpalloc_fail6;
1077 	}
1078 
1079 	/*
1080 	 * Calculate the offset between the kernel virtual address space
1081 	 * and the IB virtual address space.  This will be used when
1082 	 * posting work requests to properly initialize each WQE.
1083 	 */
1084 	qp_desc_off = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned -
1085 	    (uint64_t)mr->mr_bindinfo.bi_addr;
1086 
1087 	/* set the prefetch - initially, not prefetching */
1088 	qp->qp_no_prefetch = 1;
1089 
1090 	if (qp->qp_no_prefetch)
1091 		qp->qp_sq_headroom = 2 * sq_wqe_size;
1092 	else
1093 		qp->qp_sq_headroom = sq_wqe_size + HERMON_QP_OH_SIZE;
1094 	/*
1095 	 * hdrm wqes must be integral since both sq_wqe_size &
1096 	 * HERMON_QP_OH_SIZE are power of 2
1097 	 */
1098 	qp->qp_sq_hdrmwqes = (qp->qp_sq_headroom / sq_wqe_size);
1099 	/*
1100 	 * Fill in all the return arguments (if necessary).  This includes
1101 	 * real work queue sizes, real SGLs, and QP number (which will be
1102 	 * either zero or one, depending on the special QP type)
1103 	 */
1104 	if (queuesz_p != NULL) {
1105 		queuesz_p->cs_sq	=
1106 		    (1 << log_qp_sq_size) - qp->qp_sq_hdrmwqes;
1107 		queuesz_p->cs_sq_sgl	= qp->qp_sq_sgl;
1108 		queuesz_p->cs_rq	= (1 << log_qp_rq_size);
1109 		queuesz_p->cs_rq_sgl	= qp->qp_rq_sgl;
1110 	}
1111 
1112 	/*
1113 	 * Fill in the rest of the Hermon Queue Pair handle.  We can update
1114 	 * the following fields for use in further operations on the QP.
1115 	 */
1116 	qp->qp_qpcrsrcp		= qpc;
1117 	qp->qp_rsrcp		= rsrc;
1118 	qp->qp_state		= HERMON_QP_RESET;
1119 	HERMON_SET_QP_POST_SEND_STATE(qp, HERMON_QP_RESET);
1120 	qp->qp_pdhdl		= pd;
1121 	qp->qp_mrhdl		= mr;
1122 	qp->qp_sq_sigtype	= (attr_p->qp_flags & IBT_WR_SIGNALED) ?
1123 	    HERMON_QP_SQ_WR_SIGNALED : HERMON_QP_SQ_ALL_SIGNALED;
1124 	qp->qp_is_special	= (type == IBT_SMI_SQP) ?
1125 	    HERMON_QP_SMI : HERMON_QP_GSI;
1126 	qp->qp_uarpg		= uarpg;
1127 	qp->qp_umap_dhp		= (devmap_cookie_t)NULL;
1128 	qp->qp_sq_cqhdl		= sq_cq;
1129 	qp->qp_sq_bufsz		= (1 << log_qp_sq_size);
1130 	qp->qp_sq_buf		= sq_buf;
1131 	qp->qp_sq_logqsz	= log_qp_sq_size;
1132 	qp->qp_desc_off		= qp_desc_off;
1133 	qp->qp_rq_cqhdl		= rq_cq;
1134 	qp->qp_rq_bufsz		= (1 << log_qp_rq_size);
1135 	qp->qp_rq_buf		= rq_buf;
1136 	qp->qp_rq_logqsz	= log_qp_rq_size;
1137 	qp->qp_portnum		= port;
1138 	qp->qp_pkeyindx		= 0;
1139 	qp->qp_forward_sqd_event  = 0;
1140 	qp->qp_sqd_still_draining = 0;
1141 	qp->qp_hdlrarg		= (void *)ibt_qphdl;
1142 	qp->qp_mcg_refcnt	= 0;
1143 	qp->qp_srqhdl		= NULL;
1144 
1145 	/* All special QPs are UD QP service type */
1146 	qp->qp_type = IBT_UD_RQP;
1147 	qp->qp_serv_type = HERMON_QP_UD;
1148 
1149 	/*
1150 	 * Initialize the RQ WQEs - unlike Arbel, no Rcv init is needed
1151 	 */
1152 
1153 	/*
1154 	 * Initialize the SQ WQEs - all that needs to be done is every 64 bytes
1155 	 * set the quadword to all F's - high-order bit is owner (init to one)
1156 	 * and the rest for the headroom definition of prefetching
1157 	 *
1158 	 */
1159 
1160 	wqesz_shift = qp->qp_sq_log_wqesz;
1161 	thewqesz    = 1 << wqesz_shift;
1162 	thewqe = (uint64_t *)(void *)(qp->qp_sq_buf);
1163 	for (i = 0; i < sq_depth; i++) {
1164 		/*
1165 		 * for each stride, go through and every 64 bytes write the
1166 		 * init value - having set the address once, just keep
1167 		 * incrementing it
1168 		 */
1169 		for (j = 0; j < thewqesz; j += 64, thewqe += 8) {
1170 			*(uint32_t *)thewqe = 0xFFFFFFFF;
1171 		}
1172 	}
1173 
1174 
1175 	/* Zero out the QP context */
1176 	bzero(&qp->qpc, sizeof (hermon_hw_qpc_t));
1177 
1178 	/*
1179 	 * Put QP handle in Hermon QPNum-to-QPHdl list.  Then fill in the
1180 	 * "qphdl" and return success
1181 	 */
1182 	hermon_icm_set_num_to_hdl(state, HERMON_QPC, qpc->hr_indx + port, qp);
1183 
1184 	mutex_init(&qp->qp_sq_lock, NULL, MUTEX_DRIVER,
1185 	    DDI_INTR_PRI(state->hs_intrmsi_pri));
1186 
1187 	*qphdl = qp;
1188 
1189 	return (DDI_SUCCESS);
1190 
1191 /*
1192  * The following is cleanup for all possible failure cases in this routine
1193  */
1194 spec_qpalloc_fail6:
1195 	hermon_queue_free(&qp->qp_wqinfo);
1196 	if (qp->qp_sq_wqhdr)
1197 		hermon_wrid_wqhdr_destroy(qp->qp_sq_wqhdr);
1198 	if (qp->qp_rq_wqhdr)
1199 		hermon_wrid_wqhdr_destroy(qp->qp_rq_wqhdr);
1200 spec_qpalloc_fail5a:
1201 	hermon_dbr_free(state, uarpg, qp->qp_rq_vdbr);
1202 spec_qpalloc_fail5:
1203 	hermon_rsrc_free(state, &rsrc);
1204 spec_qpalloc_fail4:
1205 	if (hermon_special_qp_rsrc_free(state, type, port) != DDI_SUCCESS) {
1206 		HERMON_WARNING(state, "failed to free special QP rsrc");
1207 	}
1208 spec_qpalloc_fail3:
1209 	hermon_cq_refcnt_dec(rq_cq);
1210 spec_qpalloc_fail2:
1211 	hermon_cq_refcnt_dec(sq_cq);
1212 spec_qpalloc_fail1:
1213 	hermon_pd_refcnt_dec(pd);
1214 spec_qpalloc_fail:
1215 	return (status);
1216 }
1217 
1218 
1219 /*
1220  * hermon_qp_alloc_range()
1221  *    Context: Can be called only from user or kernel context.
1222  */
1223 int
1224 hermon_qp_alloc_range(hermon_state_t *state, uint_t log2,
1225     hermon_qp_info_t *qpinfo, ibtl_qp_hdl_t *ibt_qphdl,
1226     ibc_cq_hdl_t *send_cq, ibc_cq_hdl_t *recv_cq,
1227     hermon_qphdl_t *qphdl, uint_t sleepflag)
1228 {
1229 	hermon_rsrc_t			*qpc, *rsrc;
1230 	hermon_rsrc_type_t		rsrc_type;
1231 	hermon_qphdl_t			qp;
1232 	hermon_qp_range_t		*qp_range_p;
1233 	ibt_qp_alloc_attr_t		*attr_p;
1234 	ibt_qp_type_t			type;
1235 	hermon_qp_wq_type_t		swq_type;
1236 	ibt_chan_sizes_t		*queuesz_p;
1237 	ibt_mr_attr_t			mr_attr;
1238 	hermon_mr_options_t		mr_op;
1239 	hermon_srqhdl_t			srq;
1240 	hermon_pdhdl_t			pd;
1241 	hermon_cqhdl_t			sq_cq, rq_cq;
1242 	hermon_mrhdl_t			mr;
1243 	uint64_t			qp_desc_off;
1244 	uint64_t			*thewqe, thewqesz;
1245 	uint32_t			*sq_buf, *rq_buf;
1246 	uint32_t			log_qp_sq_size, log_qp_rq_size;
1247 	uint32_t			sq_size, rq_size;
1248 	uint32_t			sq_depth, rq_depth;
1249 	uint32_t			sq_wqe_size, rq_wqe_size, wqesz_shift;
1250 	uint32_t			max_sgl, max_recv_sgl, uarpg;
1251 	uint_t				qp_srq_en, i, j;
1252 	int				ii;	/* loop counter for range */
1253 	int				status, flag;
1254 	uint_t				serv_type;
1255 
1256 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr_p, *queuesz_p))
1257 
1258 	/*
1259 	 * Extract the necessary info from the hermon_qp_info_t structure
1260 	 */
1261 	attr_p	  = qpinfo->qpi_attrp;
1262 	type	  = qpinfo->qpi_type;
1263 	queuesz_p = qpinfo->qpi_queueszp;
1264 
1265 	if (attr_p->qp_alloc_flags & IBT_QP_USES_RSS) {
1266 		if (log2 > state->hs_ibtfinfo.hca_attr->hca_rss_max_log2_table)
1267 			return (IBT_INSUFF_RESOURCE);
1268 		rsrc_type = HERMON_QPC;
1269 		serv_type = HERMON_QP_UD;
1270 	} else if (attr_p->qp_alloc_flags & IBT_QP_USES_FEXCH) {
1271 		if (log2 > state->hs_ibtfinfo.hca_attr->hca_fexch_max_log2_qp)
1272 			return (IBT_INSUFF_RESOURCE);
1273 		switch (attr_p->qp_fc.fc_hca_port) {
1274 		case 1:
1275 			rsrc_type = HERMON_QPC_FEXCH_PORT1;
1276 			break;
1277 		case 2:
1278 			rsrc_type = HERMON_QPC_FEXCH_PORT2;
1279 			break;
1280 		default:
1281 			return (IBT_INVALID_PARAM);
1282 		}
1283 		serv_type = HERMON_QP_FEXCH;
1284 	} else
1285 		return (IBT_INVALID_PARAM);
1286 
1287 	/*
1288 	 * Determine whether QP is being allocated for userland access or
1289 	 * whether it is being allocated for kernel access.  If the QP is
1290 	 * being allocated for userland access, fail (too complex for now).
1291 	 */
1292 	if (attr_p->qp_alloc_flags & IBT_QP_USER_MAP) {
1293 		return (IBT_NOT_SUPPORTED);
1294 	} else {
1295 		uarpg = state->hs_kernel_uar_index;
1296 	}
1297 
1298 	/*
1299 	 * Determine whether QP is being associated with an SRQ
1300 	 */
1301 	qp_srq_en = (attr_p->qp_alloc_flags & IBT_QP_USES_SRQ) ? 1 : 0;
1302 	if (qp_srq_en) {
1303 		/*
1304 		 * Check for valid SRQ handle pointers
1305 		 */
1306 		if (attr_p->qp_ibc_srq_hdl == NULL) {
1307 			return (IBT_SRQ_HDL_INVALID);
1308 		}
1309 		srq = (hermon_srqhdl_t)attr_p->qp_ibc_srq_hdl;
1310 	}
1311 
1312 	/*
1313 	 * Check for valid QP service type (only UD supported)
1314 	 */
1315 	if (type != IBT_UD_RQP) {
1316 		return (IBT_QP_SRV_TYPE_INVALID);
1317 	}
1318 
1319 	/*
1320 	 * Check for valid PD handle pointer
1321 	 */
1322 	if (attr_p->qp_pd_hdl == NULL) {
1323 		return (IBT_PD_HDL_INVALID);
1324 	}
1325 	pd = (hermon_pdhdl_t)attr_p->qp_pd_hdl;
1326 
1327 	/*
1328 	 * If on an SRQ, check to make sure the PD is the same
1329 	 */
1330 	if (qp_srq_en && (pd->pd_pdnum != srq->srq_pdhdl->pd_pdnum)) {
1331 		return (IBT_PD_HDL_INVALID);
1332 	}
1333 
1334 	/* set loop variable here, for freeing resources on error */
1335 	ii = 0;
1336 
1337 	/*
1338 	 * Allocate 2^log2 contiguous/aligned QP context entries.  This will
1339 	 * be filled in with all the necessary parameters to define the
1340 	 * Queue Pairs.  Unlike other Hermon hardware resources, ownership
1341 	 * is not immediately given to hardware in the final step here.
1342 	 * Instead, we must wait until the QP is later transitioned to the
1343 	 * "Init" state before passing the QP to hardware.  If we fail here,
1344 	 * we must undo all the reference count (CQ and PD).
1345 	 */
1346 	status = hermon_rsrc_alloc(state, rsrc_type, 1 << log2, sleepflag,
1347 	    &qpc);
1348 	if (status != DDI_SUCCESS) {
1349 		return (IBT_INSUFF_RESOURCE);
1350 	}
1351 
1352 	if (attr_p->qp_alloc_flags & IBT_QP_USES_FEXCH)
1353 		/*
1354 		 * Need to init the MKEYs for the FEXCH QPs.
1355 		 *
1356 		 * For FEXCH QP subranges, we return the QPN base as
1357 		 * "relative" to the full FEXCH QP range for the port.
1358 		 */
1359 		*(qpinfo->qpi_qpn) = hermon_fcoib_fexch_relative_qpn(state,
1360 		    attr_p->qp_fc.fc_hca_port, qpc->hr_indx);
1361 	else
1362 		*(qpinfo->qpi_qpn) = (ib_qpn_t)qpc->hr_indx;
1363 
1364 	qp_range_p = kmem_alloc(sizeof (*qp_range_p),
1365 	    (sleepflag == HERMON_SLEEP) ? KM_SLEEP : KM_NOSLEEP);
1366 	if (qp_range_p == NULL) {
1367 		status = IBT_INSUFF_RESOURCE;
1368 		goto qpalloc_fail0;
1369 	}
1370 	mutex_init(&qp_range_p->hqpr_lock, NULL, MUTEX_DRIVER,
1371 	    DDI_INTR_PRI(state->hs_intrmsi_pri));
1372 	mutex_enter(&qp_range_p->hqpr_lock);
1373 	qp_range_p->hqpr_refcnt = 1 << log2;
1374 	qp_range_p->hqpr_qpcrsrc = qpc;
1375 	mutex_exit(&qp_range_p->hqpr_lock);
1376 
1377 for_each_qp:
1378 
1379 	/* Increment the reference count on the protection domain (PD) */
1380 	hermon_pd_refcnt_inc(pd);
1381 
1382 	rq_cq = (hermon_cqhdl_t)recv_cq[ii];
1383 	sq_cq = (hermon_cqhdl_t)send_cq[ii];
1384 	if (sq_cq == NULL) {
1385 		if (attr_p->qp_alloc_flags & IBT_QP_USES_FEXCH) {
1386 			/* if no send completions, just use rq_cq */
1387 			sq_cq = rq_cq;
1388 		} else {
1389 			status = IBT_CQ_HDL_INVALID;
1390 			goto qpalloc_fail1;
1391 		}
1392 	}
1393 
1394 	/*
1395 	 * Increment the reference count on the CQs.  One or both of these
1396 	 * could return error if we determine that the given CQ is already
1397 	 * being used with a special (SMI/GSI) QP.
1398 	 */
1399 	status = hermon_cq_refcnt_inc(sq_cq, HERMON_CQ_IS_NORMAL);
1400 	if (status != DDI_SUCCESS) {
1401 		status = IBT_CQ_HDL_INVALID;
1402 		goto qpalloc_fail1;
1403 	}
1404 	status = hermon_cq_refcnt_inc(rq_cq, HERMON_CQ_IS_NORMAL);
1405 	if (status != DDI_SUCCESS) {
1406 		status = IBT_CQ_HDL_INVALID;
1407 		goto qpalloc_fail2;
1408 	}
1409 
1410 	/*
1411 	 * Allocate the software structure for tracking the queue pair
1412 	 * (i.e. the Hermon Queue Pair handle).  If we fail here, we must
1413 	 * undo the reference counts and the previous resource allocation.
1414 	 */
1415 	status = hermon_rsrc_alloc(state, HERMON_QPHDL, 1, sleepflag, &rsrc);
1416 	if (status != DDI_SUCCESS) {
1417 		status = IBT_INSUFF_RESOURCE;
1418 		goto qpalloc_fail4;
1419 	}
1420 	qp = (hermon_qphdl_t)rsrc->hr_addr;
1421 	bzero(qp, sizeof (struct hermon_sw_qp_s));
1422 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp))
1423 	qp->qp_alloc_flags = attr_p->qp_alloc_flags;
1424 
1425 	/*
1426 	 * Calculate the QP number from QPC index.  This routine handles
1427 	 * all of the operations necessary to keep track of used, unused,
1428 	 * and released QP numbers.
1429 	 */
1430 	qp->qp_qpnum = qpc->hr_indx + ii;
1431 	qp->qp_ring = qp->qp_qpnum << 8;
1432 	qp->qp_qpn_hdl = NULL;
1433 
1434 	/*
1435 	 * Allocate the doorbell record.  Hermon just needs one for the RQ,
1436 	 * if the QP is not associated with an SRQ, and use uarpg (above) as
1437 	 * the uar index
1438 	 */
1439 
1440 	if (!qp_srq_en) {
1441 		status = hermon_dbr_alloc(state, uarpg, &qp->qp_rq_dbr_acchdl,
1442 		    &qp->qp_rq_vdbr, &qp->qp_rq_pdbr, &qp->qp_rdbr_mapoffset);
1443 		if (status != DDI_SUCCESS) {
1444 			status = IBT_INSUFF_RESOURCE;
1445 			goto qpalloc_fail6;
1446 		}
1447 	}
1448 
1449 	qp->qp_uses_lso = (attr_p->qp_flags & IBT_USES_LSO);
1450 
1451 	/*
1452 	 * We verify that the requested number of SGL is valid (i.e.
1453 	 * consistent with the device limits and/or software-configured
1454 	 * limits).  If not, then obviously the same cleanup needs to be done.
1455 	 */
1456 	max_sgl = state->hs_ibtfinfo.hca_attr->hca_ud_send_sgl_sz;
1457 	swq_type = HERMON_QP_WQ_TYPE_SENDQ_UD;
1458 	max_recv_sgl = state->hs_ibtfinfo.hca_attr->hca_recv_sgl_sz;
1459 	if ((attr_p->qp_sizes.cs_sq_sgl > max_sgl) ||
1460 	    (!qp_srq_en && (attr_p->qp_sizes.cs_rq_sgl > max_recv_sgl))) {
1461 		status = IBT_HCA_SGL_EXCEEDED;
1462 		goto qpalloc_fail7;
1463 	}
1464 
1465 	/*
1466 	 * Determine this QP's WQE stride (for both the Send and Recv WQEs).
1467 	 * This will depend on the requested number of SGLs.  Note: this
1468 	 * has the side-effect of also calculating the real number of SGLs
1469 	 * (for the calculated WQE size).
1470 	 *
1471 	 * For QP's on an SRQ, we set these to 0.
1472 	 */
1473 	if (qp_srq_en) {
1474 		qp->qp_rq_log_wqesz = 0;
1475 		qp->qp_rq_sgl = 0;
1476 	} else {
1477 		hermon_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_rq_sgl,
1478 		    max_recv_sgl, HERMON_QP_WQ_TYPE_RECVQ,
1479 		    &qp->qp_rq_log_wqesz, &qp->qp_rq_sgl);
1480 	}
1481 	hermon_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl,
1482 	    max_sgl, swq_type, &qp->qp_sq_log_wqesz, &qp->qp_sq_sgl);
1483 
1484 	sq_wqe_size = 1 << qp->qp_sq_log_wqesz;
1485 
1486 	/* NOTE: currently policy in driver, later maybe IBTF interface */
1487 	qp->qp_no_prefetch = 0;
1488 
1489 	/*
1490 	 * for prefetching, we need to add the number of wqes in
1491 	 * the 2k area plus one to the number requested, but
1492 	 * ONLY for send queue.  If no_prefetch == 1 (prefetch off)
1493 	 * it's exactly TWO wqes for the headroom
1494 	 */
1495 	if (qp->qp_no_prefetch)
1496 		qp->qp_sq_headroom = 2 * sq_wqe_size;
1497 	else
1498 		qp->qp_sq_headroom = sq_wqe_size + HERMON_QP_OH_SIZE;
1499 	/*
1500 	 * hdrm wqes must be integral since both sq_wqe_size &
1501 	 * HERMON_QP_OH_SIZE are power of 2
1502 	 */
1503 	qp->qp_sq_hdrmwqes = (qp->qp_sq_headroom / sq_wqe_size);
1504 
1505 
1506 	/*
1507 	 * Calculate the appropriate size for the work queues.
1508 	 * For send queue, add in the headroom wqes to the calculation.
1509 	 * Note:  All Hermon QP work queues must be a power-of-2 in size.  Also
1510 	 * they may not be any smaller than HERMON_QP_MIN_SIZE.  This step is
1511 	 * to round the requested size up to the next highest power-of-2
1512 	 */
1513 	/* first, adjust to a minimum and tell the caller the change */
1514 	attr_p->qp_sizes.cs_sq = max(attr_p->qp_sizes.cs_sq,
1515 	    HERMON_QP_MIN_SIZE);
1516 	attr_p->qp_sizes.cs_rq = max(attr_p->qp_sizes.cs_rq,
1517 	    HERMON_QP_MIN_SIZE);
1518 	/*
1519 	 * now, calculate the alloc size, taking into account
1520 	 * the headroom for the sq
1521 	 */
1522 	log_qp_sq_size = highbit(attr_p->qp_sizes.cs_sq + qp->qp_sq_hdrmwqes);
1523 	/* if the total is a power of two, reduce it */
1524 	if (((attr_p->qp_sizes.cs_sq + qp->qp_sq_hdrmwqes)  &
1525 	    (attr_p->qp_sizes.cs_sq + qp->qp_sq_hdrmwqes - 1)) == 0)	{
1526 		log_qp_sq_size = log_qp_sq_size - 1;
1527 	}
1528 
1529 	log_qp_rq_size = highbit(attr_p->qp_sizes.cs_rq);
1530 	if ((attr_p->qp_sizes.cs_rq & (attr_p->qp_sizes.cs_rq - 1)) == 0) {
1531 		log_qp_rq_size = log_qp_rq_size - 1;
1532 	}
1533 
1534 	/*
1535 	 * Next we verify that the rounded-up size is valid (i.e. consistent
1536 	 * with the device limits and/or software-configured limits).  If not,
1537 	 * then obviously we have a lot of cleanup to do before returning.
1538 	 *
1539 	 * NOTE: the first condition deals with the (test) case of cs_sq
1540 	 * being just less than 2^32.  In this case, the headroom addition
1541 	 * to the requested cs_sq will pass the test when it should not.
1542 	 * This test no longer lets that case slip through the check.
1543 	 */
1544 	if ((attr_p->qp_sizes.cs_sq >
1545 	    (1 << state->hs_cfg_profile->cp_log_max_qp_sz)) ||
1546 	    (log_qp_sq_size > state->hs_cfg_profile->cp_log_max_qp_sz) ||
1547 	    (!qp_srq_en && (log_qp_rq_size >
1548 	    state->hs_cfg_profile->cp_log_max_qp_sz))) {
1549 		status = IBT_HCA_WR_EXCEEDED;
1550 		goto qpalloc_fail7;
1551 	}
1552 
1553 	/*
1554 	 * Allocate the memory for QP work queues. Since Hermon work queues
1555 	 * are not allowed to cross a 32-bit (4GB) boundary, the alignment of
1556 	 * the work queue memory is very important.  We used to allocate
1557 	 * work queues (the combined receive and send queues) so that they
1558 	 * would be aligned on their combined size.  That alignment guaranteed
1559 	 * that they would never cross the 4GB boundary (Hermon work queues
1560 	 * are on the order of MBs at maximum).  Now we are able to relax
1561 	 * this alignment constraint by ensuring that the IB address assigned
1562 	 * to the queue memory (as a result of the hermon_mr_register() call)
1563 	 * is offset from zero.
1564 	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
1565 	 * guarantee the alignment, but when attempting to use IOMMU bypass
1566 	 * mode we found that we were not allowed to specify any alignment
1567 	 * that was more restrictive than the system page size.
1568 	 * So we avoided this constraint by passing two alignment values,
1569 	 * one for the memory allocation itself and the other for the DMA
1570 	 * handle (for later bind).  This used to cause more memory than
1571 	 * necessary to be allocated (in order to guarantee the more
 * restrictive alignment constraint).  But by guaranteeing the
1573 	 * zero-based IB virtual address for the queue, we are able to
1574 	 * conserve this memory.
1575 	 */
1576 	sq_wqe_size = 1 << qp->qp_sq_log_wqesz;
1577 	sq_depth    = 1 << log_qp_sq_size;
1578 	sq_size	    = sq_depth * sq_wqe_size;
1579 
1580 	/* QP on SRQ sets these to 0 */
1581 	if (qp_srq_en) {
1582 		rq_wqe_size = 0;
1583 		rq_size	    = 0;
1584 	} else {
1585 		rq_wqe_size = 1 << qp->qp_rq_log_wqesz;
1586 		rq_depth    = 1 << log_qp_rq_size;
1587 		rq_size	    = rq_depth * rq_wqe_size;
1588 	}
1589 
1590 	qp->qp_wqinfo.qa_size = sq_size + rq_size;
1591 	qp->qp_wqinfo.qa_alloc_align = PAGESIZE;
1592 	qp->qp_wqinfo.qa_bind_align  = PAGESIZE;
1593 	qp->qp_wqinfo.qa_location = HERMON_QUEUE_LOCATION_NORMAL;
1594 	status = hermon_queue_alloc(state, &qp->qp_wqinfo, sleepflag);
1595 	if (status != DDI_SUCCESS) {
1596 		status = IBT_INSUFF_RESOURCE;
1597 		goto qpalloc_fail7;
1598 	}
1599 
1600 	/*
1601 	 * Sort WQs in memory according to stride (*q_wqe_size), largest first
1602 	 * If they are equal, still put the SQ first
1603 	 */
1604 	qp->qp_sq_baseaddr = 0;
1605 	qp->qp_rq_baseaddr = 0;
1606 	if ((sq_wqe_size > rq_wqe_size) || (sq_wqe_size == rq_wqe_size)) {
1607 		sq_buf = qp->qp_wqinfo.qa_buf_aligned;
1608 
1609 		/* if this QP is on an SRQ, set the rq_buf to NULL */
1610 		if (qp_srq_en) {
1611 			rq_buf = NULL;
1612 		} else {
1613 			rq_buf = (uint32_t *)((uintptr_t)sq_buf + sq_size);
1614 			qp->qp_rq_baseaddr = sq_size;
1615 		}
1616 	} else {
1617 		rq_buf = qp->qp_wqinfo.qa_buf_aligned;
1618 		sq_buf = (uint32_t *)((uintptr_t)rq_buf + rq_size);
1619 		qp->qp_sq_baseaddr = rq_size;
1620 	}
1621 
1622 	qp->qp_sq_wqhdr = hermon_wrid_wqhdr_create(sq_depth);
1623 	if (qp->qp_sq_wqhdr == NULL) {
1624 		status = IBT_INSUFF_RESOURCE;
1625 		goto qpalloc_fail8;
1626 	}
1627 	if (qp_srq_en) {
1628 		qp->qp_rq_wqavl.wqa_wq = srq->srq_wq_wqhdr;
1629 		qp->qp_rq_wqavl.wqa_srq_en = 1;
1630 		qp->qp_rq_wqavl.wqa_srq = srq;
1631 	} else {
1632 		qp->qp_rq_wqhdr = hermon_wrid_wqhdr_create(rq_depth);
1633 		if (qp->qp_rq_wqhdr == NULL) {
1634 			status = IBT_INSUFF_RESOURCE;
1635 			goto qpalloc_fail8;
1636 		}
1637 		qp->qp_rq_wqavl.wqa_wq = qp->qp_rq_wqhdr;
1638 	}
1639 	qp->qp_sq_wqavl.wqa_qpn = qp->qp_qpnum;
1640 	qp->qp_sq_wqavl.wqa_type = HERMON_WR_SEND;
1641 	qp->qp_sq_wqavl.wqa_wq = qp->qp_sq_wqhdr;
1642 	qp->qp_rq_wqavl.wqa_qpn = qp->qp_qpnum;
1643 	qp->qp_rq_wqavl.wqa_type = HERMON_WR_RECV;
1644 
1645 	/*
1646 	 * Register the memory for the QP work queues.  The memory for the
1647 	 * QP must be registered in the Hermon cMPT tables.  This gives us the
1648 	 * LKey to specify in the QP context later.  Note: The memory for
1649 	 * Hermon work queues (both Send and Recv) must be contiguous and
1650 	 * registered as a single memory region.  Note: If the QP memory is
1651 	 * user-mappable, force DDI_DMA_CONSISTENT mapping. Also, in order to
1652 	 * meet the alignment restriction, we pass the "mro_bind_override_addr"
1653 	 * flag in the call to hermon_mr_register(). This guarantees that the
1654 	 * resulting IB vaddr will be zero-based (modulo the offset into the
1655 	 * first page). If we fail here, we still have the bunch of resource
1656 	 * and reference count cleanup to do.
1657 	 */
1658 	flag = (sleepflag == HERMON_SLEEP) ? IBT_MR_SLEEP :
1659 	    IBT_MR_NOSLEEP;
1660 	mr_attr.mr_vaddr    = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned;
1661 	mr_attr.mr_len	    = qp->qp_wqinfo.qa_size;
1662 	mr_attr.mr_as	    = NULL;
1663 	mr_attr.mr_flags    = flag;
1664 	/* HERMON_QUEUE_LOCATION_NORMAL */
1665 	mr_op.mro_bind_type =
1666 	    state->hs_cfg_profile->cp_iommu_bypass;
1667 	mr_op.mro_bind_dmahdl = qp->qp_wqinfo.qa_dmahdl;
1668 	mr_op.mro_bind_override_addr = 1;
1669 	status = hermon_mr_register(state, pd, &mr_attr, &mr,
1670 	    &mr_op, HERMON_QP_CMPT);
1671 	if (status != DDI_SUCCESS) {
1672 		status = IBT_INSUFF_RESOURCE;
1673 		goto qpalloc_fail9;
1674 	}
1675 
1676 	/*
1677 	 * Calculate the offset between the kernel virtual address space
1678 	 * and the IB virtual address space.  This will be used when
1679 	 * posting work requests to properly initialize each WQE.
1680 	 */
1681 	qp_desc_off = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned -
1682 	    (uint64_t)mr->mr_bindinfo.bi_addr;
1683 
1684 	/*
1685 	 * Fill in all the return arguments (if necessary).  This includes
1686 	 * real work queue sizes (in wqes), real SGLs, and QP number
1687 	 */
1688 	if (queuesz_p != NULL) {
1689 		queuesz_p->cs_sq 	=
1690 		    (1 << log_qp_sq_size) - qp->qp_sq_hdrmwqes;
1691 		queuesz_p->cs_sq_sgl	= qp->qp_sq_sgl;
1692 
1693 		/* if this QP is on an SRQ, set these to 0 */
1694 		if (qp_srq_en) {
1695 			queuesz_p->cs_rq	= 0;
1696 			queuesz_p->cs_rq_sgl	= 0;
1697 		} else {
1698 			queuesz_p->cs_rq	= (1 << log_qp_rq_size);
1699 			queuesz_p->cs_rq_sgl	= qp->qp_rq_sgl;
1700 		}
1701 	}
1702 
1703 	/*
1704 	 * Fill in the rest of the Hermon Queue Pair handle.
1705 	 */
1706 	qp->qp_qpcrsrcp		= NULL;
1707 	qp->qp_rsrcp		= rsrc;
1708 	qp->qp_state		= HERMON_QP_RESET;
1709 	HERMON_SET_QP_POST_SEND_STATE(qp, HERMON_QP_RESET);
1710 	qp->qp_pdhdl		= pd;
1711 	qp->qp_mrhdl		= mr;
1712 	qp->qp_sq_sigtype	= (attr_p->qp_flags & IBT_WR_SIGNALED) ?
1713 	    HERMON_QP_SQ_WR_SIGNALED : HERMON_QP_SQ_ALL_SIGNALED;
1714 	qp->qp_is_special	= 0;
1715 	qp->qp_uarpg		= uarpg;
1716 	qp->qp_umap_dhp		= (devmap_cookie_t)NULL;
1717 	qp->qp_sq_cqhdl		= sq_cq;
1718 	qp->qp_sq_bufsz		= (1 << log_qp_sq_size);
1719 	qp->qp_sq_logqsz	= log_qp_sq_size;
1720 	qp->qp_sq_buf		= sq_buf;
1721 	qp->qp_desc_off		= qp_desc_off;
1722 	qp->qp_rq_cqhdl		= rq_cq;
1723 	qp->qp_rq_buf		= rq_buf;
1724 	qp->qp_rlky		= (attr_p->qp_flags & IBT_FAST_REG_RES_LKEY) !=
1725 	    0;
1726 
1727 	/* if this QP is on an SRQ, set rq_bufsz to 0 */
1728 	if (qp_srq_en) {
1729 		qp->qp_rq_bufsz		= 0;
1730 		qp->qp_rq_logqsz	= 0;
1731 	} else {
1732 		qp->qp_rq_bufsz		= (1 << log_qp_rq_size);
1733 		qp->qp_rq_logqsz	= log_qp_rq_size;
1734 	}
1735 
1736 	qp->qp_forward_sqd_event  = 0;
1737 	qp->qp_sqd_still_draining = 0;
1738 	qp->qp_hdlrarg		= (void *)ibt_qphdl[ii];
1739 	qp->qp_mcg_refcnt	= 0;
1740 
1741 	/*
1742 	 * If this QP is to be associated with an SRQ, set the SRQ handle
1743 	 */
1744 	if (qp_srq_en) {
1745 		qp->qp_srqhdl = srq;
1746 		hermon_srq_refcnt_inc(qp->qp_srqhdl);
1747 	} else {
1748 		qp->qp_srqhdl = NULL;
1749 	}
1750 
1751 	qp->qp_type = IBT_UD_RQP;
1752 	qp->qp_serv_type = serv_type;
1753 
1754 	/*
1755 	 * Initialize the RQ WQEs - unlike Arbel, no Rcv init is needed
1756 	 */
1757 
1758 	/*
1759 	 * Initialize the SQ WQEs - all that needs to be done is every 64 bytes
1760 	 * set the quadword to all F's - high-order bit is owner (init to one)
1761 	 * and the rest for the headroom definition of prefetching.
1762 	 */
1763 	if ((attr_p->qp_alloc_flags & IBT_QP_USES_FEXCH) == 0) {
1764 		wqesz_shift = qp->qp_sq_log_wqesz;
1765 		thewqesz    = 1 << wqesz_shift;
1766 		thewqe = (uint64_t *)(void *)(qp->qp_sq_buf);
1767 		for (i = 0; i < sq_depth; i++) {
1768 			/*
1769 			 * for each stride, go through and every 64 bytes
1770 			 * write the init value - having set the address
1771 			 * once, just keep incrementing it
1772 			 */
1773 			for (j = 0; j < thewqesz; j += 64, thewqe += 8) {
1774 				*(uint32_t *)thewqe = 0xFFFFFFFF;
1775 			}
1776 		}
1777 	}
1778 
1779 	/* Zero out the QP context */
1780 	bzero(&qp->qpc, sizeof (hermon_hw_qpc_t));
1781 
1782 	/*
1783 	 * Put QP handle in Hermon QPNum-to-QPHdl list.  Then fill in the
1784 	 * "qphdl" and return success
1785 	 */
1786 	hermon_icm_set_num_to_hdl(state, HERMON_QPC, qpc->hr_indx + ii, qp);
1787 
1788 	mutex_init(&qp->qp_sq_lock, NULL, MUTEX_DRIVER,
1789 	    DDI_INTR_PRI(state->hs_intrmsi_pri));
1790 
1791 	qp->qp_rangep = qp_range_p;
1792 
1793 	qphdl[ii] = qp;
1794 
1795 	if (++ii < (1 << log2))
1796 		goto for_each_qp;
1797 
1798 	return (DDI_SUCCESS);
1799 
1800 /*
1801  * The following is cleanup for all possible failure cases in this routine
1802  */
1803 qpalloc_fail9:
1804 	hermon_queue_free(&qp->qp_wqinfo);
1805 qpalloc_fail8:
1806 	if (qp->qp_sq_wqhdr)
1807 		hermon_wrid_wqhdr_destroy(qp->qp_sq_wqhdr);
1808 	if (qp->qp_rq_wqhdr)
1809 		hermon_wrid_wqhdr_destroy(qp->qp_rq_wqhdr);
1810 qpalloc_fail7:
1811 	if (!qp_srq_en) {
1812 		hermon_dbr_free(state, uarpg, qp->qp_rq_vdbr);
1813 	}
1814 
1815 qpalloc_fail6:
1816 	hermon_rsrc_free(state, &rsrc);
1817 qpalloc_fail4:
1818 	hermon_cq_refcnt_dec(rq_cq);
1819 qpalloc_fail2:
1820 	hermon_cq_refcnt_dec(sq_cq);
1821 qpalloc_fail1:
1822 	hermon_pd_refcnt_dec(pd);
1823 qpalloc_fail0:
1824 	if (ii == 0) {
1825 		if (qp_range_p)
1826 			kmem_free(qp_range_p, sizeof (*qp_range_p));
1827 		hermon_rsrc_free(state, &qpc);
1828 	} else {
1829 		/* qp_range_p and qpc rsrc will be freed in hermon_qp_free */
1830 
1831 		mutex_enter(&qp->qp_rangep->hqpr_lock);
1832 		qp_range_p->hqpr_refcnt = ii;
1833 		mutex_exit(&qp->qp_rangep->hqpr_lock);
1834 		while (--ii >= 0) {
1835 			ibc_qpn_hdl_t qpn_hdl;
1836 			int free_status;
1837 
1838 			free_status = hermon_qp_free(state, &qphdl[ii],
1839 			    IBC_FREE_QP_AND_QPN, &qpn_hdl, sleepflag);
1840 			if (free_status != DDI_SUCCESS)
1841 				cmn_err(CE_CONT, "!qp_range: status 0x%x: "
1842 				    "error status %x during free",
1843 				    status, free_status);
1844 		}
1845 	}
1846 
1847 	return (status);
1848 }
1849 
1850 
1851 /*
1852  * hermon_qp_free()
1853  *    This function frees up the QP resources.  Depending on the value
1854  *    of the "free_qp_flags", the QP number may not be released until
1855  *    a subsequent call to hermon_qp_release_qpn().
1856  *
1857  *    Context: Can be called only from user or kernel context.
1858  */
1859 /* ARGSUSED */
1860 int
1861 hermon_qp_free(hermon_state_t *state, hermon_qphdl_t *qphdl,
1862     ibc_free_qp_flags_t free_qp_flags, ibc_qpn_hdl_t *qpnh,
1863     uint_t sleepflag)
1864 {
1865 	hermon_rsrc_t		*qpc, *rsrc;
1866 	hermon_umap_db_entry_t	*umapdb;
1867 	hermon_qpn_entry_t	*entry;
1868 	hermon_pdhdl_t		pd;
1869 	hermon_mrhdl_t		mr;
1870 	hermon_cqhdl_t		sq_cq, rq_cq;
1871 	hermon_srqhdl_t		srq;
1872 	hermon_qphdl_t		qp;
1873 	uint64_t		value;
1874 	uint_t			type, port;
1875 	uint_t			maxprot;
1876 	uint_t			qp_srq_en;
1877 	int			status;
1878 
1879 	/*
1880 	 * Pull all the necessary information from the Hermon Queue Pair
1881 	 * handle.  This is necessary here because the resource for the
1882 	 * QP handle is going to be freed up as part of this operation.
1883 	 */
1884 	qp	= *qphdl;
1885 	mutex_enter(&qp->qp_lock);
1886 	qpc	= qp->qp_qpcrsrcp;	/* NULL if part of a "range" */
1887 	rsrc	= qp->qp_rsrcp;
1888 	pd	= qp->qp_pdhdl;
1889 	srq	= qp->qp_srqhdl;
1890 	mr	= qp->qp_mrhdl;
1891 	rq_cq	= qp->qp_rq_cqhdl;
1892 	sq_cq	= qp->qp_sq_cqhdl;
1893 	port	= qp->qp_portnum;
1894 	qp_srq_en = qp->qp_alloc_flags & IBT_QP_USES_SRQ;
1895 
1896 	/*
1897 	 * If the QP is part of an MCG, then we fail the qp_free
1898 	 */
1899 	if (qp->qp_mcg_refcnt != 0) {
1900 		mutex_exit(&qp->qp_lock);
1901 		status = ibc_get_ci_failure(0);
1902 		goto qpfree_fail;
1903 	}
1904 
1905 	/*
1906 	 * If the QP is not already in "Reset" state, then transition to
1907 	 * "Reset".  This is necessary because software does not reclaim
1908 	 * ownership of the QP context until the QP is in the "Reset" state.
1909 	 * If the ownership transfer fails for any reason, then it is an
1910 	 * indication that something (either in HW or SW) has gone seriously
1911 	 * wrong.  So we print a warning message and return.
1912 	 */
1913 	if (qp->qp_state != HERMON_QP_RESET) {
1914 		if (hermon_qp_to_reset(state, qp) != DDI_SUCCESS) {
1915 			mutex_exit(&qp->qp_lock);
1916 			HERMON_WARNING(state, "failed to reset QP context");
1917 			status = ibc_get_ci_failure(0);
1918 			goto qpfree_fail;
1919 		}
1920 		qp->qp_state = HERMON_QP_RESET;
1921 		HERMON_SET_QP_POST_SEND_STATE(qp, HERMON_QP_RESET);
1922 
1923 		/*
1924 		 * Do any additional handling necessary for the transition
1925 		 * to the "Reset" state (e.g. update the WRID lists)
1926 		 */
1927 		if (hermon_wrid_to_reset_handling(state, qp) != DDI_SUCCESS) {
1928 			mutex_exit(&qp->qp_lock);
1929 			HERMON_WARNING(state, "failed to reset QP WRID list");
1930 			status = ibc_get_ci_failure(0);
1931 			goto qpfree_fail;
1932 		}
1933 	}
1934 
1935 	/*
1936 	 * If this was a user-mappable QP, then we need to remove its entry
1937 	 * from the "userland resources database".  If it is also currently
1938 	 * mmap()'d out to a user process, then we need to call
1939 	 * devmap_devmem_remap() to remap the QP memory to an invalid mapping.
1940 	 * We also need to invalidate the QP tracking information for the
1941 	 * user mapping.
1942 	 */
1943 	if (qp->qp_alloc_flags & IBT_QP_USER_MAP) {
1944 		status = hermon_umap_db_find(state->hs_instance, qp->qp_qpnum,
1945 		    MLNX_UMAP_QPMEM_RSRC, &value, HERMON_UMAP_DB_REMOVE,
1946 		    &umapdb);
1947 		if (status != DDI_SUCCESS) {
1948 			mutex_exit(&qp->qp_lock);
1949 			HERMON_WARNING(state, "failed to find in database");
1950 			return (ibc_get_ci_failure(0));
1951 		}
1952 		hermon_umap_db_free(umapdb);
1953 		if (qp->qp_umap_dhp != NULL) {
1954 			maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
1955 			status = devmap_devmem_remap(qp->qp_umap_dhp,
1956 			    state->hs_dip, 0, 0, qp->qp_wqinfo.qa_size,
1957 			    maxprot, DEVMAP_MAPPING_INVALID, NULL);
1958 			if (status != DDI_SUCCESS) {
1959 				mutex_exit(&qp->qp_lock);
1960 				HERMON_WARNING(state, "failed in QP memory "
1961 				    "devmap_devmem_remap()");
1962 				return (ibc_get_ci_failure(0));
1963 			}
1964 			qp->qp_umap_dhp = (devmap_cookie_t)NULL;
1965 		}
1966 	}
1967 
1968 
1969 	/*
1970 	 * Put NULL into the Hermon QPNum-to-QPHdl list.  This will allow any
1971 	 * in-progress events to detect that the QP corresponding to this
1972 	 * number has been freed.  Note: it does depend in whether we are
1973 	 * freeing a special QP or not.
1974 	 */
1975 	if (qpc == NULL) {
1976 		hermon_icm_set_num_to_hdl(state, HERMON_QPC,
1977 		    qp->qp_qpnum, NULL);
1978 	} else if (qp->qp_is_special) {
1979 		hermon_icm_set_num_to_hdl(state, HERMON_QPC,
1980 		    qpc->hr_indx + port, NULL);
1981 	} else {
1982 		hermon_icm_set_num_to_hdl(state, HERMON_QPC,
1983 		    qpc->hr_indx, NULL);
1984 	}
1985 
1986 	/*
1987 	 * Drop the QP lock
1988 	 *    At this point the lock is no longer necessary.  We cannot
1989 	 *    protect from multiple simultaneous calls to free the same QP.
1990 	 *    In addition, since the QP lock is contained in the QP "software
1991 	 *    handle" resource, which we will free (see below), it is
1992 	 *    important that we have no further references to that memory.
1993 	 */
1994 	mutex_exit(&qp->qp_lock);
1995 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp))
1996 
1997 	/*
1998 	 * Free the QP resources
1999 	 *    Start by deregistering and freeing the memory for work queues.
2000 	 *    Next free any previously allocated context information
2001 	 *    (depending on QP type)
2002 	 *    Finally, decrement the necessary reference counts.
2003 	 * If this fails for any reason, then it is an indication that
2004 	 * something (either in HW or SW) has gone seriously wrong.  So we
2005 	 * print a warning message and return.
2006 	 */
2007 	status = hermon_mr_deregister(state, &mr, HERMON_MR_DEREG_ALL,
2008 	    sleepflag);
2009 	if (status != DDI_SUCCESS) {
2010 		HERMON_WARNING(state, "failed to deregister QP memory");
2011 		status = ibc_get_ci_failure(0);
2012 		goto qpfree_fail;
2013 	}
2014 
2015 	/* Free the memory for the QP */
2016 	hermon_queue_free(&qp->qp_wqinfo);
2017 
2018 	if (qp->qp_sq_wqhdr)
2019 		hermon_wrid_wqhdr_destroy(qp->qp_sq_wqhdr);
2020 	if (qp->qp_rq_wqhdr)
2021 		hermon_wrid_wqhdr_destroy(qp->qp_rq_wqhdr);
2022 
2023 	/* Free the dbr */
2024 	if (!qp_srq_en) {
2025 		hermon_dbr_free(state, qp->qp_uarpg, qp->qp_rq_vdbr);
2026 	}
2027 
2028 	/*
2029 	 * Free up the remainder of the QP resources.  Note: we have a few
2030 	 * different resources to free up depending on whether the QP is a
2031 	 * special QP or not.  As described above, if any of these fail for
2032 	 * any reason it is an indication that something (either in HW or SW)
2033 	 * has gone seriously wrong.  So we print a warning message and
2034 	 * return.
2035 	 */
2036 	if (qp->qp_is_special) {
2037 		type = (qp->qp_is_special == HERMON_QP_SMI) ?
2038 		    IBT_SMI_SQP : IBT_GSI_SQP;
2039 
2040 		/* Free up resources for the special QP */
2041 		status = hermon_special_qp_rsrc_free(state, type, port);
2042 		if (status != DDI_SUCCESS) {
2043 			HERMON_WARNING(state, "failed to free special QP rsrc");
2044 			status = ibc_get_ci_failure(0);
2045 			goto qpfree_fail;
2046 		}
2047 
2048 	} else if (qp->qp_rangep) {
2049 		int refcnt;
2050 		mutex_enter(&qp->qp_rangep->hqpr_lock);
2051 		refcnt = --qp->qp_rangep->hqpr_refcnt;
2052 		mutex_exit(&qp->qp_rangep->hqpr_lock);
2053 		if (refcnt == 0) {
2054 			mutex_destroy(&qp->qp_rangep->hqpr_lock);
2055 			hermon_rsrc_free(state, &qp->qp_rangep->hqpr_qpcrsrc);
2056 			kmem_free(qp->qp_rangep, sizeof (*qp->qp_rangep));
2057 		}
2058 		qp->qp_rangep = NULL;
2059 	} else if (qp->qp_qpn_hdl == NULL) {
2060 		hermon_rsrc_free(state, &qpc);
2061 	} else {
2062 		/*
2063 		 * Check the flags and determine whether to release the
2064 		 * QPN or not, based on their value.
2065 		 */
2066 		if (free_qp_flags == IBC_FREE_QP_ONLY) {
2067 			entry = qp->qp_qpn_hdl;
2068 			hermon_qp_release_qpn(state, qp->qp_qpn_hdl,
2069 			    HERMON_QPN_FREE_ONLY);
2070 			*qpnh = (ibc_qpn_hdl_t)entry;
2071 		} else {
2072 			hermon_qp_release_qpn(state, qp->qp_qpn_hdl,
2073 			    HERMON_QPN_RELEASE);
2074 		}
2075 	}
2076 
2077 	mutex_destroy(&qp->qp_sq_lock);
2078 
2079 	/* Free the Hermon Queue Pair handle */
2080 	hermon_rsrc_free(state, &rsrc);
2081 
2082 	/* Decrement the reference counts on CQs, PD and SRQ (if needed) */
2083 	hermon_cq_refcnt_dec(rq_cq);
2084 	hermon_cq_refcnt_dec(sq_cq);
2085 	hermon_pd_refcnt_dec(pd);
2086 	if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
2087 		hermon_srq_refcnt_dec(srq);
2088 	}
2089 
2090 	/* Set the qphdl pointer to NULL and return success */
2091 	*qphdl = NULL;
2092 
2093 	return (DDI_SUCCESS);
2094 
2095 qpfree_fail:
2096 	return (status);
2097 }
2098 
2099 
/*
 * hermon_qp_query()
 *    Returns the current attributes of the given QP in "attr_p".  The
 *    basic queue geometry (CQ handles, QP number, SGL and queue sizes)
 *    comes from software state; for any QP past the "Reset" state a
 *    QUERY_QP firmware command is posted to pull the live QP context
 *    from the hardware, and the transport-specific attributes (UD, RC,
 *    or UC) are decoded from that context.
 *
 *    Context: Can be called from interrupt or base context.
 */
int
hermon_qp_query(hermon_state_t *state, hermon_qphdl_t qp,
    ibt_qp_query_attr_t *attr_p)
{
	ibt_cep_state_t		qp_state;
	ibt_qp_ud_attr_t	*ud;
	ibt_qp_rc_attr_t	*rc;
	ibt_qp_uc_attr_t	*uc;
	ibt_cep_flags_t		enable_flags;
	hermon_hw_addr_path_t	*qpc_path, *qpc_alt_path;
	ibt_cep_path_t		*path_ptr, *alt_path_ptr;
	hermon_hw_qpc_t		*qpc;
	int			status;
	uint_t			tmp_sched_q, tmp_alt_sched_q;

	mutex_enter(&qp->qp_lock);

	/*
	 * Grab the temporary QPC entry from QP software state
	 */
	qpc = &qp->qpc;

	/* Convert the current Hermon QP state to IBTF QP state */
	switch (qp->qp_state) {
	case HERMON_QP_RESET:
		qp_state = IBT_STATE_RESET;		/* "Reset" */
		break;
	case HERMON_QP_INIT:
		qp_state = IBT_STATE_INIT;		/* Initialized */
		break;
	case HERMON_QP_RTR:
		qp_state = IBT_STATE_RTR;		/* Ready to Receive */
		break;
	case HERMON_QP_RTS:
		qp_state = IBT_STATE_RTS;		/* Ready to Send */
		break;
	case HERMON_QP_SQERR:
		qp_state = IBT_STATE_SQE;		/* Send Queue Error */
		break;
	case HERMON_QP_SQD:
		/* SQD is reported as "draining" until the drain completes */
		if (qp->qp_sqd_still_draining) {
			qp_state = IBT_STATE_SQDRAIN;	/* SQ Draining */
		} else {
			qp_state = IBT_STATE_SQD;	/* SQ Drained */
		}
		break;
	case HERMON_QP_ERR:
		qp_state = IBT_STATE_ERROR;		/* Error */
		break;
	default:
		/* Unrecognized software state: fail the query */
		mutex_exit(&qp->qp_lock);
		return (ibc_get_ci_failure(0));
	}
	attr_p->qp_info.qp_state = qp_state;

	/* SRQ Hook. */
	attr_p->qp_srq = NULL;

	/*
	 * The following QP information is always returned, regardless of
	 * the current QP state.  Note: Some special handling is necessary
	 * for calculating the QP number on special QP (QP0 and QP1).
	 */
	attr_p->qp_sq_cq    =
	    (qp->qp_sq_cqhdl == NULL) ? NULL : qp->qp_sq_cqhdl->cq_hdlrarg;
	attr_p->qp_rq_cq    =
	    (qp->qp_rq_cqhdl == NULL) ? NULL : qp->qp_rq_cqhdl->cq_hdlrarg;
	if (qp->qp_is_special) {
		/* Special QPs report the well-known QPNs 0 (SMI) and 1 (GSI) */
		attr_p->qp_qpn = (qp->qp_is_special == HERMON_QP_SMI) ? 0 : 1;
	} else {
		attr_p->qp_qpn = (ib_qpn_t)qp->qp_qpnum;
	}
	attr_p->qp_sq_sgl   = qp->qp_sq_sgl;
	attr_p->qp_rq_sgl   = qp->qp_rq_sgl;
	/* SQ size excludes the headroom WQEs reserved by the driver */
	attr_p->qp_info.qp_sq_sz = qp->qp_sq_bufsz - qp->qp_sq_hdrmwqes;
	attr_p->qp_info.qp_rq_sz = qp->qp_rq_bufsz;

	/*
	 * If QP is currently in the "Reset" state, then only the above are
	 * returned
	 */
	if (qp_state == IBT_STATE_RESET) {
		mutex_exit(&qp->qp_lock);
		return (DDI_SUCCESS);
	}

	/*
	 * Post QUERY_QP command to firmware
	 *
	 * We do a HERMON_NOSLEEP here because we are holding the "qp_lock".
	 * Since we may be in the interrupt context (or subsequently raised
	 * to interrupt level by priority inversion), we do not want to block
	 * in this routine waiting for success.
	 */
	/*
	 * Save the cached sched_q fields (used below to derive the port
	 * number) and restore them afterwards, because QUERY_QP overwrites
	 * the entire "qpc" buffer with the firmware's copy of the context.
	 */
	tmp_sched_q = qpc->pri_addr_path.sched_q;
	tmp_alt_sched_q = qpc->alt_addr_path.sched_q;
	status = hermon_cmn_query_cmd_post(state, QUERY_QP, 0, qp->qp_qpnum,
	    qpc, sizeof (hermon_hw_qpc_t), HERMON_CMD_NOSLEEP_SPIN);
	if (status != HERMON_CMD_SUCCESS) {
		mutex_exit(&qp->qp_lock);
		cmn_err(CE_WARN, "hermon%d: hermon_qp_query: QUERY_QP "
		    "command failed: %08x\n", state->hs_instance, status);
		if (status == HERMON_CMD_INVALID_STATUS) {
			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
		}
		return (ibc_get_ci_failure(0));
	}
	qpc->pri_addr_path.sched_q = tmp_sched_q;
	qpc->alt_addr_path.sched_q = tmp_alt_sched_q;

	/*
	 * Fill in the additional QP info based on the QP's transport type.
	 */
	if (qp->qp_type == IBT_UD_RQP) {

		/* Fill in the UD-specific info */
		ud = &attr_p->qp_info.qp_transport.ud;
		ud->ud_qkey	= (ib_qkey_t)qpc->qkey;
		ud->ud_sq_psn	= qpc->next_snd_psn;
		ud->ud_pkey_ix	= qpc->pri_addr_path.pkey_indx;
		/* port+1 for port 1/2 (sched_q bit 6 encodes the port) */
		ud->ud_port	=
		    (uint8_t)(((qpc->pri_addr_path.sched_q >> 6) & 0x01) + 1);

		attr_p->qp_info.qp_trans = IBT_UD_SRV;

		if (qp->qp_serv_type == HERMON_QP_FEXCH) {
			ibt_pmr_desc_t *pmr;
			uint64_t heart_beat;

			/*
			 * FEXCH QPs also report their unidirectional and
			 * bidirectional memory descriptors and a heartbeat
			 * derived FC flag.
			 */
			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pmr))
			pmr = &attr_p->qp_query_fexch.fq_uni_mem_desc;
			pmr->pmd_iova = 0;
			/* The MKey for an FEXCH QP is derived from its QPN */
			pmr->pmd_lkey = pmr->pmd_rkey =
			    hermon_fcoib_qpn_to_mkey(state, qp->qp_qpnum);
			pmr->pmd_phys_buf_list_sz =
			    state->hs_fcoib.hfc_mtts_per_mpt;
			pmr->pmd_sync_required = 0;

			/* The bidirectional descriptor is reported empty */
			pmr = &attr_p->qp_query_fexch.fq_bi_mem_desc;
			pmr->pmd_iova = 0;
			pmr->pmd_lkey = 0;
			pmr->pmd_rkey = 0;
			pmr->pmd_phys_buf_list_sz = 0;
			pmr->pmd_sync_required = 0;

			/* Heartbeat of zero (on success) means "OK" */
			attr_p->qp_query_fexch.fq_flags =
			    ((hermon_get_heart_beat_rq_cmd_post(state,
			    qp->qp_qpnum, &heart_beat) == HERMON_CMD_SUCCESS) &&
			    (heart_beat == 0)) ? IBT_FEXCH_HEART_BEAT_OK :
			    IBT_FEXCH_NO_FLAGS;

			ud->ud_fc = qp->qp_fc_attr;
		} else if (qp->qp_serv_type == HERMON_QP_FCMND ||
		    qp->qp_serv_type == HERMON_QP_RFCI) {
			ud->ud_fc = qp->qp_fc_attr;
		}

	} else if (qp->qp_serv_type == HERMON_QP_RC) {

		/* Fill in the RC-specific info */
		rc = &attr_p->qp_info.qp_transport.rc;
		rc->rc_sq_psn	= qpc->next_snd_psn;
		rc->rc_rq_psn	= qpc->next_rcv_psn;
		rc->rc_dst_qpn	= qpc->rem_qpn;

		/* Grab the path migration state information */
		if (qpc->pm_state == HERMON_QP_PMSTATE_MIGRATED) {
			rc->rc_mig_state = IBT_STATE_MIGRATED;
		} else if (qpc->pm_state == HERMON_QP_PMSTATE_REARM) {
			rc->rc_mig_state = IBT_STATE_REARMED;
		} else {
			rc->rc_mig_state = IBT_STATE_ARMED;
		}
		/* sra_max/rra_max are log2 values in the QP context */
		rc->rc_rdma_ra_out = (1 << qpc->sra_max);
		rc->rc_rdma_ra_in  = (1 << qpc->rra_max);
		rc->rc_min_rnr_nak = qpc->min_rnr_nak;
		rc->rc_path_mtu	   = qpc->mtu;
		rc->rc_retry_cnt   = qpc->retry_cnt;

		/* Get the common primary address path fields */
		qpc_path = &qpc->pri_addr_path;
		path_ptr = &rc->rc_path;
		hermon_get_addr_path(state, qpc_path, &path_ptr->cep_adds_vect,
		    HERMON_ADDRPATH_QP);

		/* Fill in the additional primary address path fields */
		path_ptr->cep_pkey_ix	   = qpc_path->pkey_indx;
		path_ptr->cep_hca_port_num =
		    path_ptr->cep_adds_vect.av_port_num =
		    (uint8_t)(((qpc_path->sched_q >> 6) & 0x01) + 1);
		path_ptr->cep_timeout	   = qpc_path->ack_timeout;

		/* Get the common alternate address path fields */
		qpc_alt_path = &qpc->alt_addr_path;
		alt_path_ptr = &rc->rc_alt_path;
		hermon_get_addr_path(state, qpc_alt_path,
		    &alt_path_ptr->cep_adds_vect, HERMON_ADDRPATH_QP);

		/* Fill in the additional alternate address path fields */
		alt_path_ptr->cep_pkey_ix	= qpc_alt_path->pkey_indx;
		alt_path_ptr->cep_hca_port_num	=
		    alt_path_ptr->cep_adds_vect.av_port_num =
		    (uint8_t)(((qpc_alt_path->sched_q >> 6) & 0x01) + 1);
		alt_path_ptr->cep_timeout	= qpc_alt_path->ack_timeout;

		/* Get the RNR retry time from primary path */
		rc->rc_rnr_retry_cnt = qpc->rnr_retry;

		/* Set the enable flags based on RDMA/Atomic enable bits */
		enable_flags = IBT_CEP_NO_FLAGS;
		enable_flags |= ((qpc->rre == 0) ? 0 : IBT_CEP_RDMA_RD);
		enable_flags |= ((qpc->rwe == 0) ? 0 : IBT_CEP_RDMA_WR);
		enable_flags |= ((qpc->rae == 0) ? 0 : IBT_CEP_ATOMIC);
		attr_p->qp_info.qp_flags = enable_flags;

		attr_p->qp_info.qp_trans = IBT_RC_SRV;

	} else if (qp->qp_serv_type == HERMON_QP_UC) {

		/* Fill in the UC-specific info */
		uc = &attr_p->qp_info.qp_transport.uc;
		uc->uc_sq_psn	= qpc->next_snd_psn;
		uc->uc_rq_psn	= qpc->next_rcv_psn;
		uc->uc_dst_qpn	= qpc->rem_qpn;

		/* Grab the path migration state information */
		if (qpc->pm_state == HERMON_QP_PMSTATE_MIGRATED) {
			uc->uc_mig_state = IBT_STATE_MIGRATED;
		} else if (qpc->pm_state == HERMON_QP_PMSTATE_REARM) {
			uc->uc_mig_state = IBT_STATE_REARMED;
		} else {
			uc->uc_mig_state = IBT_STATE_ARMED;
		}
		uc->uc_path_mtu = qpc->mtu;

		/* Get the common primary address path fields */
		qpc_path = &qpc->pri_addr_path;
		path_ptr = &uc->uc_path;
		hermon_get_addr_path(state, qpc_path, &path_ptr->cep_adds_vect,
		    HERMON_ADDRPATH_QP);

		/* Fill in the additional primary address path fields */
		path_ptr->cep_pkey_ix	   = qpc_path->pkey_indx;
		path_ptr->cep_hca_port_num =
		    path_ptr->cep_adds_vect.av_port_num =
		    (uint8_t)(((qpc_path->sched_q >> 6) & 0x01) + 1);

		/* Get the common alternate address path fields */
		qpc_alt_path = &qpc->alt_addr_path;
		alt_path_ptr = &uc->uc_alt_path;
		hermon_get_addr_path(state, qpc_alt_path,
		    &alt_path_ptr->cep_adds_vect, HERMON_ADDRPATH_QP);

		/* Fill in the additional alternate address path fields */
		alt_path_ptr->cep_pkey_ix	= qpc_alt_path->pkey_indx;
		alt_path_ptr->cep_hca_port_num	=
		    alt_path_ptr->cep_adds_vect.av_port_num =
		    (uint8_t)(((qpc_alt_path->sched_q >> 6) & 0x01) + 1);

		/*
		 * Set the enable flags based on RDMA enable bits (by
		 * definition UC doesn't support Atomic or RDMA Read)
		 */
		enable_flags = ((qpc->rwe == 0) ? 0 : IBT_CEP_RDMA_WR);
		attr_p->qp_info.qp_flags = enable_flags;

		attr_p->qp_info.qp_trans = IBT_UC_SRV;

	} else {
		HERMON_WARNING(state, "unexpected QP transport type");
		mutex_exit(&qp->qp_lock);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Under certain circumstances it is possible for the Hermon hardware
	 * to transition to one of the error states without software directly
	 * knowing about it.  The QueryQP() call is the one place where we
	 * have an opportunity to sample and update our view of the QP state.
	 */
	if (qpc->state == HERMON_QP_SQERR) {
		attr_p->qp_info.qp_state = IBT_STATE_SQE;
		qp->qp_state = HERMON_QP_SQERR;
		HERMON_SET_QP_POST_SEND_STATE(qp, HERMON_QP_SQERR);
	}
	if (qpc->state == HERMON_QP_ERR) {
		attr_p->qp_info.qp_state = IBT_STATE_ERROR;
		qp->qp_state = HERMON_QP_ERR;
		HERMON_SET_QP_POST_SEND_STATE(qp, HERMON_QP_ERR);
	}
	mutex_exit(&qp->qp_lock);

	return (DDI_SUCCESS);
}
2399 
2400 
2401 /*
2402  * hermon_qp_create_qpn()
2403  *    Context: Can be called from interrupt or base context.
2404  */
2405 static int
2406 hermon_qp_create_qpn(hermon_state_t *state, hermon_qphdl_t qp,
2407     hermon_rsrc_t *qpc)
2408 {
2409 	hermon_qpn_entry_t	query;
2410 	hermon_qpn_entry_t	*entry;
2411 	avl_index_t		where;
2412 
2413 	/*
2414 	 * Build a query (for the AVL tree lookup) and attempt to find
2415 	 * a previously added entry that has a matching QPC index.  If
2416 	 * no matching entry is found, then allocate, initialize, and
2417 	 * add an entry to the AVL tree.
2418 	 * If a matching entry is found, then increment its QPN counter
2419 	 * and reference counter.
2420 	 */
2421 	query.qpn_indx = qpc->hr_indx;
2422 	mutex_enter(&state->hs_qpn_avl_lock);
2423 	entry = (hermon_qpn_entry_t *)avl_find(&state->hs_qpn_avl,
2424 	    &query, &where);
2425 	if (entry == NULL) {
2426 		/*
2427 		 * Allocate and initialize a QPN entry, then insert
2428 		 * it into the AVL tree.
2429 		 */
2430 		entry = (hermon_qpn_entry_t *)kmem_zalloc(
2431 		    sizeof (hermon_qpn_entry_t), KM_NOSLEEP);
2432 		if (entry == NULL) {
2433 			mutex_exit(&state->hs_qpn_avl_lock);
2434 			return (DDI_FAILURE);
2435 		}
2436 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*entry))
2437 
2438 		entry->qpn_indx	   = qpc->hr_indx;
2439 		entry->qpn_refcnt  = 0;
2440 		entry->qpn_counter = 0;
2441 
2442 		avl_insert(&state->hs_qpn_avl, entry, where);
2443 	}
2444 
2445 	/*
2446 	 * Make the AVL tree entry point to the QP context resource that
2447 	 * it will be responsible for tracking
2448 	 */
2449 	entry->qpn_qpc = qpc;
2450 
2451 	/*
2452 	 * Setup the QP handle to point to the AVL tree entry.  Then
2453 	 * generate the new QP number from the entry's QPN counter value
2454 	 * and the hardware's QP context table index.
2455 	 */
2456 	qp->qp_qpn_hdl	= entry;
2457 	qp->qp_qpnum	= ((entry->qpn_counter <<
2458 	    state->hs_cfg_profile->cp_log_num_qp) | qpc->hr_indx) &
2459 	    HERMON_QP_MAXNUMBER_MSK;
2460 	qp->qp_ring = qp->qp_qpnum << 8;
2461 
2462 	/*
2463 	 * Increment the reference counter and QPN counter.  The QPN
2464 	 * counter always indicates the next available number for use.
2465 	 */
2466 	entry->qpn_counter++;
2467 	entry->qpn_refcnt++;
2468 
2469 	mutex_exit(&state->hs_qpn_avl_lock);
2470 
2471 	return (DDI_SUCCESS);
2472 }
2473 
2474 
/*
 * hermon_qp_release_qpn()
 *    Releases a QP number tracking entry (created by
 *    hermon_qp_create_qpn()).  "flags" selects either a full release
 *    (HERMON_QPN_RELEASE) or a QPC-only free that keeps the QP number
 *    reserved (HERMON_QPN_FREE_ONLY).
 *
 *    Context: Can be called only from user or kernel context.
 */
void
hermon_qp_release_qpn(hermon_state_t *state, hermon_qpn_entry_t *entry,
    int flags)
{
	ASSERT(entry != NULL);

	mutex_enter(&state->hs_qpn_avl_lock);

	/*
	 * If we are releasing the QP number here, then we decrement the
	 * reference count and check for zero references.  If there are
	 * zero references, then we free the QPC context (if it hadn't
	 * already been freed during a HERMON_QPN_FREE_ONLY free, i.e. for
	 * reuse with another similar QP number) and remove the tracking
	 * structure from the QP number AVL tree and free the structure.
	 * If we are not releasing the QP number here, then, as long as we
	 * have not exhausted the usefulness of the QPC context (that is,
	 * re-used it too many times without the reference count having
	 * gone to zero), we free up the QPC context for use by another
	 * thread (which will use it to construct a different QP number
	 * from the same QPC table index).
	 */
	if (flags == HERMON_QPN_RELEASE) {
		entry->qpn_refcnt--;

		/*
		 * If the reference count is zero, then we free the QPC
		 * context (if it hadn't already been freed in an early
		 * step, e.g. HERMON_QPN_FREE_ONLY) and remove/free the
		 * tracking structure from the QP number AVL tree.
		 */
		if (entry->qpn_refcnt == 0) {
			if (entry->qpn_qpc != NULL) {
				hermon_rsrc_free(state, &entry->qpn_qpc);
			}

			/*
			 * If the current entry has served it's useful
			 * purpose (i.e. been reused the maximum allowable
			 * number of times), then remove it from QP number
			 * AVL tree and free it up.
			 * Note: the limit is the number of distinct values
			 * the counter can take in the bits above the QPC
			 * table index (QP numbers are 24 bits wide --
			 * presumably per the IB spec; see the construction
			 * in hermon_qp_create_qpn()).
			 */
			if (entry->qpn_counter >= (1 <<
			    (24 - state->hs_cfg_profile->cp_log_num_qp))) {
				avl_remove(&state->hs_qpn_avl, entry);
				kmem_free(entry, sizeof (hermon_qpn_entry_t));
			}
		}

	} else if (flags == HERMON_QPN_FREE_ONLY) {
		/*
		 * Even if we are not freeing the QP number, that will not
		 * always prevent us from releasing the QPC context.  In fact,
		 * since the QPC context only forms part of the whole QPN,
		 * we want to free it up for use by other consumers.  But
		 * if the reference count is non-zero (which it will always
		 * be when we are doing HERMON_QPN_FREE_ONLY) and the counter
		 * has reached its maximum value, then we cannot reuse the
		 * QPC context until the reference count eventually reaches
		 * zero (in HERMON_QPN_RELEASE, above).
		 */
		if (entry->qpn_counter < (1 <<
		    (24 - state->hs_cfg_profile->cp_log_num_qp))) {
			hermon_rsrc_free(state, &entry->qpn_qpc);
		}
	}
	mutex_exit(&state->hs_qpn_avl_lock);
}
2547 
2548 
2549 /*
2550  * hermon_qpn_avl_compare()
2551  *    Context: Can be called from user or kernel context.
2552  */
2553 static int
2554 hermon_qpn_avl_compare(const void *q, const void *e)
2555 {
2556 	hermon_qpn_entry_t	*entry, *query;
2557 
2558 	entry = (hermon_qpn_entry_t *)e;
2559 	query = (hermon_qpn_entry_t *)q;
2560 
2561 	if (query->qpn_indx < entry->qpn_indx) {
2562 		return (-1);
2563 	} else if (query->qpn_indx > entry->qpn_indx) {
2564 		return (+1);
2565 	} else {
2566 		return (0);
2567 	}
2568 }
2569 
2570 
2571 /*
2572  * hermon_qpn_avl_init()
2573  *    Context: Only called from attach() path context
2574  */
2575 void
2576 hermon_qpn_avl_init(hermon_state_t *state)
2577 {
2578 	/* Initialize the lock used for QP number (QPN) AVL tree access */
2579 	mutex_init(&state->hs_qpn_avl_lock, NULL, MUTEX_DRIVER,
2580 	    DDI_INTR_PRI(state->hs_intrmsi_pri));
2581 
2582 	/* Initialize the AVL tree for the QP number (QPN) storage */
2583 	avl_create(&state->hs_qpn_avl, hermon_qpn_avl_compare,
2584 	    sizeof (hermon_qpn_entry_t),
2585 	    offsetof(hermon_qpn_entry_t, qpn_avlnode));
2586 }
2587 
2588 
2589 /*
2590  * hermon_qpn_avl_fini()
2591  *    Context: Only called from attach() and/or detach() path contexts
2592  */
2593 void
2594 hermon_qpn_avl_fini(hermon_state_t *state)
2595 {
2596 	hermon_qpn_entry_t	*entry;
2597 	void			*cookie;
2598 
2599 	/*
2600 	 * Empty all entries (if necessary) and destroy the AVL tree
2601 	 * that was used for QP number (QPN) tracking.
2602 	 */
2603 	cookie = NULL;
2604 	while ((entry = (hermon_qpn_entry_t *)avl_destroy_nodes(
2605 	    &state->hs_qpn_avl, &cookie)) != NULL) {
2606 		kmem_free(entry, sizeof (hermon_qpn_entry_t));
2607 	}
2608 	avl_destroy(&state->hs_qpn_avl);
2609 
2610 	/* Destroy the lock used for QP number (QPN) AVL tree access */
2611 	mutex_destroy(&state->hs_qpn_avl_lock);
2612 }
2613 
2614 
2615 /*
2616  * hermon_qphdl_from_qpnum()
2617  *    Context: Can be called from interrupt or base context.
2618  *
2619  *    This routine is important because changing the unconstrained
2620  *    portion of the QP number is critical to the detection of a
2621  *    potential race condition in the QP event handler code (i.e. the case
2622  *    where a QP is freed and alloc'd again before an event for the
2623  *    "old" QP can be handled).
2624  *
2625  *    While this is not a perfect solution (not sure that one exists)
2626  *    it does help to mitigate the chance that this race condition will
2627  *    cause us to deliver a "stale" event to the new QP owner.  Note:
2628  *    this solution does not scale well because the number of constrained
2629  *    bits increases (and, hence, the number of unconstrained bits
2630  *    decreases) as the number of supported QPs grows.  For small and
2631  *    intermediate values, it should hopefully provide sufficient
2632  *    protection.
2633  */
2634 hermon_qphdl_t
2635 hermon_qphdl_from_qpnum(hermon_state_t *state, uint_t qpnum)
2636 {
2637 	uint_t	qpindx, qpmask;
2638 
2639 	/* Calculate the QP table index from the qpnum */
2640 	qpmask = (1 << state->hs_cfg_profile->cp_log_num_qp) - 1;
2641 	qpindx = qpnum & qpmask;
2642 	return (hermon_icm_num_to_hdl(state, HERMON_QPC, qpindx));
2643 }
2644 
2645 
2646 /*
2647  * hermon_special_qp_rsrc_alloc
2648  *    Context: Can be called from interrupt or base context.
2649  */
2650 static int
2651 hermon_special_qp_rsrc_alloc(hermon_state_t *state, ibt_sqp_type_t type,
2652     uint_t port, hermon_rsrc_t **qp_rsrc)
2653 {
2654 	uint_t		mask, flags;
2655 	int		status;
2656 
2657 	mutex_enter(&state->hs_spec_qplock);
2658 	flags = state->hs_spec_qpflags;
2659 	if (type == IBT_SMI_SQP) {
2660 		/*
2661 		 * Check here to see if the driver has been configured
2662 		 * to instruct the Hermon firmware to handle all incoming
2663 		 * SMP messages (i.e. messages sent to SMA).  If so,
2664 		 * then we will treat QP0 as if it has already been
2665 		 * allocated (for internal use).  Otherwise, if we allow
2666 		 * the allocation to happen, it will cause unexpected
2667 		 * behaviors (e.g. Hermon SMA becomes unresponsive).
2668 		 */
2669 		if (state->hs_cfg_profile->cp_qp0_agents_in_fw != 0) {
2670 			mutex_exit(&state->hs_spec_qplock);
2671 			return (IBT_QP_IN_USE);
2672 		}
2673 
2674 		/*
2675 		 * If this is the first QP0 allocation, then post
2676 		 * a CONF_SPECIAL_QP firmware command
2677 		 */
2678 		if ((flags & HERMON_SPECIAL_QP0_RSRC_MASK) == 0) {
2679 			status = hermon_conf_special_qp_cmd_post(state,
2680 			    state->hs_spec_qp0->hr_indx, HERMON_CMD_QP_SMI,
2681 			    HERMON_CMD_NOSLEEP_SPIN,
2682 			    HERMON_CMD_SPEC_QP_OPMOD(
2683 			    state->hs_cfg_profile->cp_qp0_agents_in_fw,
2684 			    state->hs_cfg_profile->cp_qp1_agents_in_fw));
2685 			if (status != HERMON_CMD_SUCCESS) {
2686 				mutex_exit(&state->hs_spec_qplock);
2687 				cmn_err(CE_NOTE, "hermon%d: CONF_SPECIAL_QP "
2688 				    "command failed: %08x\n",
2689 				    state->hs_instance, status);
2690 				return (IBT_INSUFF_RESOURCE);
2691 			}
2692 		}
2693 
2694 		/*
2695 		 * Now check (and, if necessary, modify) the flags to indicate
2696 		 * whether the allocation was successful
2697 		 */
2698 		mask = (1 << (HERMON_SPECIAL_QP0_RSRC + port));
2699 		if (flags & mask) {
2700 			mutex_exit(&state->hs_spec_qplock);
2701 			return (IBT_QP_IN_USE);
2702 		}
2703 		state->hs_spec_qpflags |= mask;
2704 		*qp_rsrc = state->hs_spec_qp0;
2705 
2706 	} else {
2707 		/*
2708 		 * If this is the first QP1 allocation, then post
2709 		 * a CONF_SPECIAL_QP firmware command
2710 		 */
2711 		if ((flags & HERMON_SPECIAL_QP1_RSRC_MASK) == 0) {
2712 			status = hermon_conf_special_qp_cmd_post(state,
2713 			    state->hs_spec_qp1->hr_indx, HERMON_CMD_QP_GSI,
2714 			    HERMON_CMD_NOSLEEP_SPIN,
2715 			    HERMON_CMD_SPEC_QP_OPMOD(
2716 			    state->hs_cfg_profile->cp_qp0_agents_in_fw,
2717 			    state->hs_cfg_profile->cp_qp1_agents_in_fw));
2718 			if (status != HERMON_CMD_SUCCESS) {
2719 				mutex_exit(&state->hs_spec_qplock);
2720 				cmn_err(CE_NOTE, "hermon%d: CONF_SPECIAL_QP "
2721 				    "command failed: %08x\n",
2722 				    state->hs_instance, status);
2723 				return (IBT_INSUFF_RESOURCE);
2724 			}
2725 		}
2726 
2727 		/*
2728 		 * Now check (and, if necessary, modify) the flags to indicate
2729 		 * whether the allocation was successful
2730 		 */
2731 		mask = (1 << (HERMON_SPECIAL_QP1_RSRC + port));
2732 		if (flags & mask) {
2733 			mutex_exit(&state->hs_spec_qplock);
2734 			return (IBT_QP_IN_USE);
2735 		}
2736 		state->hs_spec_qpflags |= mask;
2737 		*qp_rsrc = state->hs_spec_qp1;
2738 	}
2739 
2740 	mutex_exit(&state->hs_spec_qplock);
2741 	return (DDI_SUCCESS);
2742 }
2743 
2744 
2745 /*
2746  * hermon_special_qp_rsrc_free
2747  *    Context: Can be called from interrupt or base context.
2748  */
2749 static int
2750 hermon_special_qp_rsrc_free(hermon_state_t *state, ibt_sqp_type_t type,
2751     uint_t port)
2752 {
2753 	uint_t		mask, flags;
2754 	int		status;
2755 
2756 	mutex_enter(&state->hs_spec_qplock);
2757 	if (type == IBT_SMI_SQP) {
2758 		mask = (1 << (HERMON_SPECIAL_QP0_RSRC + port));
2759 		state->hs_spec_qpflags &= ~mask;
2760 		flags = state->hs_spec_qpflags;
2761 
2762 		/*
2763 		 * If this is the last QP0 free, then post a CONF_SPECIAL_QP
2764 		 * NOW, If this is the last Special QP free, then post a
2765 		 * CONF_SPECIAL_QP firmware command - it'll stop them all
2766 		 */
2767 		if (flags) {
2768 			status = hermon_conf_special_qp_cmd_post(state, 0,
2769 			    HERMON_CMD_QP_SMI, HERMON_CMD_NOSLEEP_SPIN, 0);
2770 			if (status != HERMON_CMD_SUCCESS) {
2771 				mutex_exit(&state->hs_spec_qplock);
2772 				cmn_err(CE_NOTE, "hermon%d: CONF_SPECIAL_QP "
2773 				    "command failed: %08x\n",
2774 				    state->hs_instance, status);
2775 				if (status == HERMON_CMD_INVALID_STATUS) {
2776 					hermon_fm_ereport(state, HCA_SYS_ERR,
2777 					    HCA_ERR_SRV_LOST);
2778 				}
2779 				return (ibc_get_ci_failure(0));
2780 			}
2781 		}
2782 	} else {
2783 		mask = (1 << (HERMON_SPECIAL_QP1_RSRC + port));
2784 		state->hs_spec_qpflags &= ~mask;
2785 		flags = state->hs_spec_qpflags;
2786 
2787 		/*
2788 		 * If this is the last QP1 free, then post a CONF_SPECIAL_QP
2789 		 * NOW, if this is the last special QP free, then post a
2790 		 * CONF_SPECIAL_QP firmware command - it'll stop them all
2791 		 */
2792 		if (flags) {
2793 			status = hermon_conf_special_qp_cmd_post(state, 0,
2794 			    HERMON_CMD_QP_GSI, HERMON_CMD_NOSLEEP_SPIN, 0);
2795 			if (status != HERMON_CMD_SUCCESS) {
2796 				mutex_exit(&state->hs_spec_qplock);
2797 				cmn_err(CE_NOTE, "hermon%d: CONF_SPECIAL_QP "
2798 				    "command failed: %08x\n",
2799 				    state->hs_instance, status);
2800 				if (status == HERMON_CMD_INVALID_STATUS) {
2801 					hermon_fm_ereport(state, HCA_SYS_ERR,
2802 					    HCA_ERR_SRV_LOST);
2803 				}
2804 				return (ibc_get_ci_failure(0));
2805 			}
2806 		}
2807 	}
2808 
2809 	mutex_exit(&state->hs_spec_qplock);
2810 	return (DDI_SUCCESS);
2811 }
2812 
2813 
2814 /*
2815  * hermon_qp_sgl_to_logwqesz()
2816  *    Context: Can be called from interrupt or base context.
2817  */
2818 static void
2819 hermon_qp_sgl_to_logwqesz(hermon_state_t *state, uint_t num_sgl,
2820     uint_t real_max_sgl, hermon_qp_wq_type_t wq_type,
2821     uint_t *logwqesz, uint_t *max_sgl)
2822 {
2823 	uint_t	max_size, log2, actual_sgl;
2824 
2825 	switch (wq_type) {
2826 	case HERMON_QP_WQ_TYPE_SENDQ_UD:
2827 		/*
2828 		 * Use requested maximum SGL to calculate max descriptor size
2829 		 * (while guaranteeing that the descriptor size is a
2830 		 * power-of-2 cachelines).
2831 		 */
2832 		max_size = (HERMON_QP_WQE_MLX_SND_HDRS + (num_sgl << 4));
2833 		log2 = highbit(max_size);
2834 		if ((max_size & (max_size - 1)) == 0) {
2835 			log2 = log2 - 1;
2836 		}
2837 
2838 		/* Make sure descriptor is at least the minimum size */
2839 		log2 = max(log2, HERMON_QP_WQE_LOG_MINIMUM);
2840 
2841 		/* Calculate actual number of SGL (given WQE size) */
2842 		actual_sgl = ((1 << log2) -
2843 		    sizeof (hermon_hw_snd_wqe_ctrl_t)) >> 4;
2844 		break;
2845 
2846 	case HERMON_QP_WQ_TYPE_SENDQ_CONN:
2847 		/*
2848 		 * Use requested maximum SGL to calculate max descriptor size
2849 		 * (while guaranteeing that the descriptor size is a
2850 		 * power-of-2 cachelines).
2851 		 */
2852 		max_size = (HERMON_QP_WQE_MLX_SND_HDRS + (num_sgl << 4));
2853 		log2 = highbit(max_size);
2854 		if ((max_size & (max_size - 1)) == 0) {
2855 			log2 = log2 - 1;
2856 		}
2857 
2858 		/* Make sure descriptor is at least the minimum size */
2859 		log2 = max(log2, HERMON_QP_WQE_LOG_MINIMUM);
2860 
2861 		/* Calculate actual number of SGL (given WQE size) */
2862 		actual_sgl = ((1 << log2) - HERMON_QP_WQE_MLX_SND_HDRS) >> 4;
2863 		break;
2864 
2865 	case HERMON_QP_WQ_TYPE_RECVQ:
2866 		/*
2867 		 * Same as above (except for Recv WQEs)
2868 		 */
2869 		max_size = (HERMON_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4));
2870 		log2 = highbit(max_size);
2871 		if ((max_size & (max_size - 1)) == 0) {
2872 			log2 = log2 - 1;
2873 		}
2874 
2875 		/* Make sure descriptor is at least the minimum size */
2876 		log2 = max(log2, HERMON_QP_WQE_LOG_MINIMUM);
2877 
2878 		/* Calculate actual number of SGL (given WQE size) */
2879 		actual_sgl = ((1 << log2) - HERMON_QP_WQE_MLX_RCV_HDRS) >> 4;
2880 		break;
2881 
2882 	case HERMON_QP_WQ_TYPE_SENDMLX_QP0:
2883 		/*
2884 		 * Same as above (except for MLX transport WQEs).  For these
2885 		 * WQEs we have to account for the space consumed by the
2886 		 * "inline" packet headers.  (This is smaller than for QP1
2887 		 * below because QP0 is not allowed to send packets with a GRH.
2888 		 */
2889 		max_size = (HERMON_QP_WQE_MLX_QP0_HDRS + (num_sgl << 4));
2890 		log2 = highbit(max_size);
2891 		if ((max_size & (max_size - 1)) == 0) {
2892 			log2 = log2 - 1;
2893 		}
2894 
2895 		/* Make sure descriptor is at least the minimum size */
2896 		log2 = max(log2, HERMON_QP_WQE_LOG_MINIMUM);
2897 
2898 		/* Calculate actual number of SGL (given WQE size) */
2899 		actual_sgl = ((1 << log2) - HERMON_QP_WQE_MLX_QP0_HDRS) >> 4;
2900 		break;
2901 
2902 	case HERMON_QP_WQ_TYPE_SENDMLX_QP1:
2903 		/*
2904 		 * Same as above.  For these WQEs we again have to account for
2905 		 * the space consumed by the "inline" packet headers.  (This
2906 		 * is larger than for QP0 above because we have to account for
2907 		 * the possibility of a GRH in each packet - and this
2908 		 * introduces an alignment issue that causes us to consume
2909 		 * an additional 8 bytes).
2910 		 */
2911 		max_size = (HERMON_QP_WQE_MLX_QP1_HDRS + (num_sgl << 4));
2912 		log2 = highbit(max_size);
2913 		if ((max_size & (max_size - 1)) == 0) {
2914 			log2 = log2 - 1;
2915 		}
2916 
2917 		/* Make sure descriptor is at least the minimum size */
2918 		log2 = max(log2, HERMON_QP_WQE_LOG_MINIMUM);
2919 
2920 		/* Calculate actual number of SGL (given WQE size) */
2921 		actual_sgl = ((1 << log2) - HERMON_QP_WQE_MLX_QP1_HDRS) >> 4;
2922 		break;
2923 
2924 	default:
2925 		HERMON_WARNING(state, "unexpected work queue type");
2926 		break;
2927 	}
2928 
2929 	/* Fill in the return values */
2930 	*logwqesz = log2;
2931 	*max_sgl  = min(real_max_sgl, actual_sgl);
2932 }
2933