/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_qp.c
 *    Tavor Queue Pair Processing Routines
 *
 *    Implements all the routines necessary for allocating, freeing, and
 *    querying the Tavor queue pairs.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>

#include <sys/ib/adapters/tavor/tavor.h>
#include <sys/ib/ib_pkt_hdrs.h>

static int tavor_qp_create_qpn(tavor_state_t *state, tavor_qphdl_t qp,
    tavor_rsrc_t *qpc);
static int tavor_qpn_avl_compare(const void *q, const void *e);
static int tavor_special_qp_rsrc_alloc(tavor_state_t *state,
    ibt_sqp_type_t type, uint_t port, tavor_rsrc_t **qp_rsrc);
static int tavor_special_qp_rsrc_free(tavor_state_t *state, ibt_sqp_type_t type,
    uint_t port);
static void tavor_qp_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl);

/*
 * tavor_qp_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_qp_alloc(tavor_state_t *state, tavor_qp_info_t *qpinfo,
    uint_t sleepflag, tavor_qp_options_t *op)
{
	tavor_rsrc_pool_info_t	*rsrc_pool;
	tavor_rsrc_t		*qpc, *rsrc, *rdb;
	tavor_umap_db_entry_t	*umapdb;
	tavor_qphdl_t		qp;
	ibt_qp_alloc_attr_t	*attr_p;
	ibt_qp_type_t		type;
	ibtl_qp_hdl_t		ibt_qphdl;
	ibt_chan_sizes_t	*queuesz_p;
	ib_qpn_t		*qpn;
	tavor_qphdl_t		*qphdl;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	mr_op;
	tavor_srqhdl_t		srq;
	tavor_pdhdl_t		pd;
	tavor_cqhdl_t		sq_cq, rq_cq;
	tavor_mrhdl_t		mr;
	uint64_t		value, qp_desc_off;
	uint32_t		*sq_buf, *rq_buf;
	uint32_t		log_qp_sq_size, log_qp_rq_size;
	uint32_t		sq_size, rq_size;
	uint32_t		sq_wqe_size, rq_wqe_size;
	uint32_t		max_rdb, max_sgl, uarpg;
	uint_t			wq_location, dma_xfer_mode, qp_is_umap;
	uint_t			qp_srq_en;
	int			status, flag;

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr_p, *queuesz_p))

	/*
	 * Check the "options" flag.  Currently this flag tells the driver
	 * whether or not the QP's work queues should come from normal
	 * system memory or whether they should be allocated from DDR memory.
	 */
	if (op == NULL) {
		wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
	} else {
		wq_location = op->qpo_wq_loc;
	}

	/*
	 * Extract the necessary info from the tavor_qp_info_t structure
	 */
	attr_p	  = qpinfo->qpi_attrp;
	type	  = qpinfo->qpi_type;
	ibt_qphdl = qpinfo->qpi_ibt_qphdl;
	queuesz_p = qpinfo->qpi_queueszp;
	qpn	  = qpinfo->qpi_qpn;
	qphdl	  = &qpinfo->qpi_qphdl;

	/*
	 * Determine whether QP is being allocated for userland access or
	 * whether it is being allocated for kernel access.  If the QP is
	 * being allocated for userland access, then lookup the UAR doorbell
	 * page number for the current process.  Note:  If this is not found
	 * (e.g. if the process has not previously open()'d the Tavor driver),
	 * then an error is returned.
	 */
	qp_is_umap = (attr_p->qp_alloc_flags & IBT_QP_USER_MAP) ? 1 : 0;
	if (qp_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
		if (status != DDI_SUCCESS) {
			goto qpalloc_fail;
		}
		uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
	}

	/*
	 * Determine whether QP is being associated with an SRQ
	 */
	qp_srq_en = (attr_p->qp_alloc_flags & IBT_QP_USES_SRQ) ? 1 : 0;
	if (qp_srq_en) {
		/*
		 * Check for valid SRQ handle pointers
		 */
		if (attr_p->qp_ibc_srq_hdl == NULL) {
			goto qpalloc_fail;
		}
		srq = (tavor_srqhdl_t)attr_p->qp_ibc_srq_hdl;
	}

	/*
	 * Check for valid QP service type (only UD/RC/UC supported)
	 */
	if (((type != IBT_UD_RQP) && (type != IBT_RC_RQP) &&
	    (type != IBT_UC_RQP))) {
		goto qpalloc_fail;
	}

	/*
	 * Only RC is supported on an SRQ -- This is a Tavor hardware
	 * limitation.  Arbel native mode will not have this shortcoming.
	 */
	if (qp_srq_en && type != IBT_RC_RQP) {
		goto qpalloc_fail;
	}

	/*
	 * Check for valid PD handle pointer
	 */
	if (attr_p->qp_pd_hdl == NULL) {
		goto qpalloc_fail;
	}
	pd = (tavor_pdhdl_t)attr_p->qp_pd_hdl;

	/*
	 * If on an SRQ, check to make sure the PD is the same
	 */
	if (qp_srq_en && (pd->pd_pdnum != srq->srq_pdhdl->pd_pdnum)) {
		goto qpalloc_fail;
	}

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Check for valid CQ handle pointers
	 */
	if ((attr_p->qp_ibc_scq_hdl == NULL) ||
	    (attr_p->qp_ibc_rcq_hdl == NULL)) {
		goto qpalloc_fail1;
	}
	sq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_scq_hdl;
	rq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_rcq_hdl;

	/*
	 * Increment the reference count on the CQs.  One or both of these
	 * could return error if we determine that the given CQ is already
	 * being used with a special (SMI/GSI) QP.
	 */
	status = tavor_cq_refcnt_inc(sq_cq, TAVOR_CQ_IS_NORMAL);
	if (status != DDI_SUCCESS) {
		goto qpalloc_fail1;
	}
	status = tavor_cq_refcnt_inc(rq_cq, TAVOR_CQ_IS_NORMAL);
	if (status != DDI_SUCCESS) {
		goto qpalloc_fail2;
	}

	/*
	 * Allocate a QP context entry.  This will be filled in with all
	 * the necessary parameters to define the Queue Pair.  Unlike
	 * other Tavor hardware resources, ownership is not immediately
	 * given to hardware in the final step here.  Instead, we must
	 * wait until the QP is later transitioned to the "Init" state before
	 * passing the QP to hardware.  If we fail here, we must undo all
	 * the reference counts (CQ and PD).
	 */
	status = tavor_rsrc_alloc(state, TAVOR_QPC, 1, sleepflag, &qpc);
	if (status != DDI_SUCCESS) {
		goto qpalloc_fail3;
	}

	/*
	 * Allocate the software structure for tracking the queue pair
	 * (i.e. the Tavor Queue Pair handle).  If we fail here, we must
	 * undo the reference counts and the previous resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_QPHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		goto qpalloc_fail4;
	}
	qp = (tavor_qphdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp))

	/*
	 * Calculate the QP number from QPC index.  This routine handles
	 * all of the operations necessary to keep track of used, unused,
	 * and released QP numbers.
	 */
	status = tavor_qp_create_qpn(state, qp, qpc);
	if (status != DDI_SUCCESS) {
		goto qpalloc_fail5;
	}

	/*
	 * If this will be a user-mappable QP, then allocate an entry for
	 * the "userland resources database".  This will later be added to
	 * the database (after all further QP operations are successful).
	 * If we fail here, we must undo the reference counts and the
	 * previous resource allocation.
	 */
	if (qp_is_umap) {
		umapdb = tavor_umap_db_alloc(state->ts_instance, qp->qp_qpnum,
		    MLNX_UMAP_QPMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			goto qpalloc_fail6;
		}
	}

	/*
	 * If this is an RC QP, then pre-allocate the maximum number of RDB
	 * entries.  This allows us to ensure that we can later cover all
	 * the resources needed by hardware for handling multiple incoming
	 * RDMA Reads.  Note: These resources are obviously not always
	 * necessary.  They are allocated here anyway.  Someday maybe this
	 * can be modified to allocate these on-the-fly (i.e. only if RDMA
	 * Read or Atomic operations are enabled) XXX
	 * If we fail here, we have a bunch of resource and reference count
	 * cleanup to do.
	 */
	if (type == IBT_RC_RQP) {
		max_rdb = state->ts_cfg_profile->cp_hca_max_rdma_in_qp;
		status = tavor_rsrc_alloc(state, TAVOR_RDB, max_rdb,
		    sleepflag, &rdb);
		if (status != DDI_SUCCESS) {
			goto qpalloc_fail7;
		}
		qp->qp_rdbrsrcp = rdb;
		/* Calculate offset (into DDR memory) of RDB entries */
		rsrc_pool = &state->ts_rsrc_hdl[TAVOR_RDB];
		qp->qp_rdb_ddraddr = (uintptr_t)rsrc_pool->rsrc_ddr_offset +
		    (rdb->tr_indx << TAVOR_RDB_SIZE_SHIFT);
	}

	/*
	 * Calculate the appropriate size for the work queues.
	 * Note:  All Tavor QP work queues must be a power-of-2 in size.  Also
	 * they may not be any smaller than TAVOR_QP_MIN_SIZE.  This step is
	 * to round the requested size up to the next highest power-of-2
	 */
	attr_p->qp_sizes.cs_sq = max(attr_p->qp_sizes.cs_sq, TAVOR_QP_MIN_SIZE);
	attr_p->qp_sizes.cs_rq = max(attr_p->qp_sizes.cs_rq, TAVOR_QP_MIN_SIZE);
	log_qp_sq_size = highbit(attr_p->qp_sizes.cs_sq);
	if (ISP2(attr_p->qp_sizes.cs_sq)) {
		log_qp_sq_size = log_qp_sq_size - 1;
	}
	log_qp_rq_size = highbit(attr_p->qp_sizes.cs_rq);
	if (ISP2(attr_p->qp_sizes.cs_rq)) {
		log_qp_rq_size = log_qp_rq_size - 1;
	}
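	/*
	 * A worked example of the rounding above (illustrative only):
	 * highbit() returns the 1-indexed position of the highest set bit,
	 * so a requested cs_sq of e.g. 100 (not a power-of-2) gives
	 * highbit(100) = 7, and the send queue becomes 1 << 7 = 128
	 * entries.  For an exact power-of-2 request like 128, highbit()
	 * returns 8 and is decremented back to 7, leaving the size at 128.
	 */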

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits).  If not,
	 * then obviously we have a lot of cleanup to do before returning.
	 */
	if ((log_qp_sq_size > state->ts_cfg_profile->cp_log_max_qp_sz) ||
	    (!qp_srq_en && (log_qp_rq_size >
	    state->ts_cfg_profile->cp_log_max_qp_sz))) {
		goto qpalloc_fail8;
	}

	/*
	 * Next we verify that the requested number of SGL is valid (i.e.
	 * consistent with the device limits and/or software-configured
	 * limits).  If not, then obviously the same cleanup needs to be done.
	 */
	max_sgl = state->ts_cfg_profile->cp_wqe_real_max_sgl;
	if ((attr_p->qp_sizes.cs_sq_sgl > max_sgl) ||
	    (!qp_srq_en && (attr_p->qp_sizes.cs_rq_sgl > max_sgl))) {
		goto qpalloc_fail8;
	}

	/*
	 * Determine this QP's WQE sizes (for both the Send and Recv WQEs).
	 * This will depend on the requested number of SGLs.  Note: this
	 * has the side-effect of also calculating the real number of SGLs
	 * (for the calculated WQE size).
	 *
	 * For QP's on an SRQ, we set these to 0.
	 */
	if (qp_srq_en) {
		qp->qp_rq_log_wqesz = 0;
		qp->qp_rq_sgl = 0;
	} else {
		tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_rq_sgl,
		    TAVOR_QP_WQ_TYPE_RECVQ, &qp->qp_rq_log_wqesz,
		    &qp->qp_rq_sgl);
	}
	tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl,
	    TAVOR_QP_WQ_TYPE_SENDQ, &qp->qp_sq_log_wqesz, &qp->qp_sq_sgl);

	/*
	 * Allocate the memory for QP work queues.  Note:  The location from
	 * which we will allocate these work queues has been passed in
	 * through the tavor_qp_options_t structure.  Since Tavor work queues
	 * are not allowed to cross a 32-bit (4GB) boundary, the alignment of
	 * the work queue memory is very important.  We used to allocate
	 * work queues (the combined receive and send queues) so that they
	 * would be aligned on their combined size.  That alignment guaranteed
	 * that they would never cross the 4GB boundary (Tavor work queues
	 * are on the order of MBs at maximum).  Now we are able to relax
	 * this alignment constraint by ensuring that the IB address assigned
	 * to the queue memory (as a result of the tavor_mr_register() call)
	 * is offset from zero.
	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
	 * guarantee the alignment, but when attempting to use IOMMU bypass
	 * mode we found that we were not allowed to specify any alignment
	 * that was more restrictive than the system page size.
	 * So we avoided this constraint by passing two alignment values,
	 * one for the memory allocation itself and the other for the DMA
	 * handle (for later bind).  This used to cause more memory than
	 * necessary to be allocated (in order to guarantee the more
	 * restrictive alignment constraint).  But by guaranteeing the
	 * zero-based IB virtual address for the queue, we are able to
	 * conserve this memory.
	 * Note: If QP is not user-mappable, then it may come from either
	 * kernel system memory or from HCA-attached local DDR memory.
	 */
	sq_wqe_size = 1 << qp->qp_sq_log_wqesz;
	sq_size	    = (1 << log_qp_sq_size) * sq_wqe_size;

	/* QP on SRQ sets these to 0 */
	if (qp_srq_en) {
		rq_wqe_size = 0;
		rq_size	    = 0;
	} else {
		rq_wqe_size = 1 << qp->qp_rq_log_wqesz;
		rq_size	    = (1 << log_qp_rq_size) * rq_wqe_size;
	}

	qp->qp_wqinfo.qa_size = sq_size + rq_size;
	qp->qp_wqinfo.qa_alloc_align = max(sq_wqe_size, rq_wqe_size);
	qp->qp_wqinfo.qa_bind_align  = max(sq_wqe_size, rq_wqe_size);
	if (qp_is_umap) {
		qp->qp_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		qp->qp_wqinfo.qa_location = wq_location;
	}
	status = tavor_queue_alloc(state, &qp->qp_wqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		goto qpalloc_fail8;
	}
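	/*
	 * Lay out the two queues within the single allocation.  Since the
	 * buffer is aligned to max(sq_wqe_size, rq_wqe_size) and each queue
	 * size is a power-of-2 multiple of its WQE size, placing the queue
	 * with the larger WQE size first keeps both queues naturally
	 * aligned on their own WQE size.
	 */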
	if (sq_wqe_size > rq_wqe_size) {
		sq_buf = qp->qp_wqinfo.qa_buf_aligned;

		/*
		 * If QP's on an SRQ, we set the rq_buf to NULL
		 */
		if (qp_srq_en)
			rq_buf = NULL;
		else
			rq_buf = (uint32_t *)((uintptr_t)sq_buf + sq_size);
	} else {
		rq_buf = qp->qp_wqinfo.qa_buf_aligned;
		sq_buf = (uint32_t *)((uintptr_t)rq_buf + rq_size);
	}

	/*
	 * Register the memory for the QP work queues.  The memory for the
	 * QP must be registered in the Tavor TPT tables.  This gives us the
	 * LKey to specify in the QP context later.  Note: The memory for
	 * Tavor work queues (both Send and Recv) must be contiguous and
	 * registered as a single memory region.  Note also: If the work
	 * queue is to be allocated from DDR memory, then only a "bypass"
	 * mapping is appropriate.  And if the QP memory is user-mappable,
	 * then we force DDI_DMA_CONSISTENT mapping.
	 * Also, in order to meet the alignment restriction, we pass the
	 * "mro_bind_override_addr" flag in the call to tavor_mr_register().
	 * This guarantees that the resulting IB vaddr will be zero-based
	 * (modulo the offset into the first page).
	 * If we fail here, we still have a bunch of resource and reference
	 * count cleanup to do.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
	    IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr    = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned;
	mr_attr.mr_len	    = qp->qp_wqinfo.qa_size;
	mr_attr.mr_as	    = NULL;
	mr_attr.mr_flags    = flag;
	if (qp_is_umap) {
		mr_op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
	} else {
		if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
			mr_op.mro_bind_type =
			    state->ts_cfg_profile->cp_iommu_bypass;
			dma_xfer_mode =
			    state->ts_cfg_profile->cp_streaming_consistent;
			if (dma_xfer_mode == DDI_DMA_STREAMING) {
				mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
			}
		} else {
			mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
		}
	}
	mr_op.mro_bind_dmahdl = qp->qp_wqinfo.qa_dmahdl;
	mr_op.mro_bind_override_addr = 1;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
	if (status != DDI_SUCCESS) {
		goto qpalloc_fail9;
	}

	/*
	 * Calculate the offset between the kernel virtual address space
	 * and the IB virtual address space.  This will be used when
	 * posting work requests to properly initialize each WQE.
	 */
	qp_desc_off = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned -
	    (uint64_t)mr->mr_bindinfo.bi_addr;
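	/*
	 * For example (illustrative only): when a WQE is built at kernel
	 * virtual address "wqe_addr", the corresponding IB virtual address
	 * programmed into descriptors would be (wqe_addr - qp_desc_off),
	 * since qp_desc_off = (kernel vaddr of queue base) - (IB vaddr of
	 * queue base).
	 */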

	/*
	 * Fill in all the return arguments (if necessary).  This includes
	 * real work queue sizes, real SGLs, and QP number
	 */
	if (queuesz_p != NULL) {
		queuesz_p->cs_sq	= (1 << log_qp_sq_size);
		queuesz_p->cs_sq_sgl	= qp->qp_sq_sgl;

		/* QP on an SRQ sets these to 0 */
		if (qp_srq_en) {
			queuesz_p->cs_rq	= 0;
			queuesz_p->cs_rq_sgl	= 0;
		} else {
			queuesz_p->cs_rq	= (1 << log_qp_rq_size);
			queuesz_p->cs_rq_sgl	= qp->qp_rq_sgl;
		}
	}
	if (qpn != NULL) {
		*qpn = (ib_qpn_t)qp->qp_qpnum;
	}

	/*
	 * Fill in the rest of the Tavor Queue Pair handle.  We can update
	 * the following fields for use in further operations on the QP.
	 */
	qp->qp_qpcrsrcp		= qpc;
	qp->qp_rsrcp		= rsrc;
	qp->qp_state		= TAVOR_QP_RESET;
	qp->qp_pdhdl		= pd;
	qp->qp_mrhdl		= mr;
	qp->qp_sq_sigtype	= (attr_p->qp_flags & IBT_WR_SIGNALED) ?
	    TAVOR_QP_SQ_WR_SIGNALED : TAVOR_QP_SQ_ALL_SIGNALED;
	qp->qp_is_special	= 0;
	qp->qp_is_umap		= qp_is_umap;
	qp->qp_uarpg		= (qp->qp_is_umap) ? uarpg : 0;
	qp->qp_umap_dhp		= (devmap_cookie_t)NULL;
	qp->qp_sq_cqhdl		= sq_cq;
	qp->qp_sq_lastwqeaddr	= NULL;
	qp->qp_sq_bufsz		= (1 << log_qp_sq_size);
	qp->qp_sq_buf		= sq_buf;
	qp->qp_desc_off		= qp_desc_off;
	qp->qp_rq_cqhdl		= rq_cq;
	qp->qp_rq_lastwqeaddr	= NULL;
	qp->qp_rq_buf		= rq_buf;

	/* QP on an SRQ sets this to 0 */
	if (qp_srq_en) {
		qp->qp_rq_bufsz		= 0;
	} else {
		qp->qp_rq_bufsz		= (1 << log_qp_rq_size);
	}

	qp->qp_forward_sqd_event  = 0;
	qp->qp_sqd_still_draining = 0;
	qp->qp_hdlrarg		= (void *)ibt_qphdl;
	qp->qp_mcg_refcnt	= 0;

	/*
	 * If this QP is to be associated with an SRQ, then set the SRQ handle
	 * appropriately.
	 */
	if (qp_srq_en) {
		qp->qp_srqhdl = srq;
		qp->qp_srq_en = TAVOR_QP_SRQ_ENABLED;
		tavor_srq_refcnt_inc(qp->qp_srqhdl);
	} else {
		qp->qp_srqhdl = NULL;
		qp->qp_srq_en = TAVOR_QP_SRQ_DISABLED;
	}

	/* Determine if later ddi_dma_sync will be necessary */
	qp->qp_sync = TAVOR_QP_IS_SYNC_REQ(state, qp->qp_wqinfo);

	/* Determine the QP service type */
	if (type == IBT_RC_RQP) {
		qp->qp_serv_type = TAVOR_QP_RC;
	} else if (type == IBT_UD_RQP) {
		qp->qp_serv_type = TAVOR_QP_UD;
	} else {
		qp->qp_serv_type = TAVOR_QP_UC;
	}

	/* Zero out the QP context */
	bzero(&qp->qpc, sizeof (tavor_hw_qpc_t));

	/*
	 * Put QP handle in Tavor QPNum-to-QPHdl list.  Then fill in the
	 * "qphdl" and return success
	 */
	ASSERT(state->ts_qphdl[qpc->tr_indx] == NULL);
	state->ts_qphdl[qpc->tr_indx] = qp;

	/*
	 * If this is a user-mappable QP, then we need to insert the previously
	 * allocated entry into the "userland resources database".  This will
	 * allow for later lookup during devmap() (i.e. mmap()) calls.
	 */
	if (qp_is_umap) {
		tavor_umap_db_add(umapdb);
	}

	*qphdl = qp;

	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
qpalloc_fail9:
	tavor_queue_free(state, &qp->qp_wqinfo);
qpalloc_fail8:
	if (type == IBT_RC_RQP) {
		tavor_rsrc_free(state, &rdb);
	}
qpalloc_fail7:
	if (qp_is_umap) {
		tavor_umap_db_free(umapdb);
	}
qpalloc_fail6:
	/*
	 * Releasing the QPN will also free up the QPC context.  Update
	 * the QPC context pointer to indicate this.
	 */
	tavor_qp_release_qpn(state, qp->qp_qpn_hdl, TAVOR_QPN_RELEASE);
	qpc = NULL;
qpalloc_fail5:
	tavor_rsrc_free(state, &rsrc);
qpalloc_fail4:
	if (qpc) {
		tavor_rsrc_free(state, &qpc);
	}
qpalloc_fail3:
	tavor_cq_refcnt_dec(rq_cq);
qpalloc_fail2:
	tavor_cq_refcnt_dec(sq_cq);
qpalloc_fail1:
	tavor_pd_refcnt_dec(pd);
qpalloc_fail:
	return (status);
}



/*
 * tavor_special_qp_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_special_qp_alloc(tavor_state_t *state, tavor_qp_info_t *qpinfo,
    uint_t sleepflag, tavor_qp_options_t *op)
{
	tavor_rsrc_t		*qpc, *rsrc;
	tavor_qphdl_t		qp;
	ibt_qp_alloc_attr_t	*attr_p;
	ibt_sqp_type_t		type;
	uint8_t			port;
	ibtl_qp_hdl_t		ibt_qphdl;
	ibt_chan_sizes_t	*queuesz_p;
	tavor_qphdl_t		*qphdl;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	mr_op;
	tavor_pdhdl_t		pd;
	tavor_cqhdl_t		sq_cq, rq_cq;
	tavor_mrhdl_t		mr;
	uint64_t		qp_desc_off;
	uint32_t		*sq_buf, *rq_buf;
	uint32_t		log_qp_sq_size, log_qp_rq_size;
	uint32_t		sq_size, rq_size, max_sgl;
	uint32_t		sq_wqe_size, rq_wqe_size;
	uint_t			wq_location, dma_xfer_mode;
	int			status, flag;

	/*
	 * Check the "options" flag.  Currently this flag tells the driver
	 * whether or not the QP's work queues should come from normal
	 * system memory or whether they should be allocated from DDR memory.
	 */
	if (op == NULL) {
		wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
	} else {
		wq_location = op->qpo_wq_loc;
	}

	/*
	 * Extract the necessary info from the tavor_qp_info_t structure
	 */
	attr_p	  = qpinfo->qpi_attrp;
	type	  = qpinfo->qpi_type;
	port	  = qpinfo->qpi_port;
	ibt_qphdl = qpinfo->qpi_ibt_qphdl;
	queuesz_p = qpinfo->qpi_queueszp;
	qphdl	  = &qpinfo->qpi_qphdl;

	/*
	 * Check for valid special QP type (only SMI & GSI supported)
	 */
	if ((type != IBT_SMI_SQP) && (type != IBT_GSI_SQP)) {
		goto spec_qpalloc_fail;
	}

	/*
	 * Check for valid port number
	 */
	if (!tavor_portnum_is_valid(state, port)) {
		goto spec_qpalloc_fail;
	}
	port = port - 1;

	/*
	 * Check for valid PD handle pointer
	 */
	if (attr_p->qp_pd_hdl == NULL) {
		goto spec_qpalloc_fail;
	}
	pd = (tavor_pdhdl_t)attr_p->qp_pd_hdl;

	/* Increment the reference count on the PD */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Check for valid CQ handle pointers
	 */
	if ((attr_p->qp_ibc_scq_hdl == NULL) ||
	    (attr_p->qp_ibc_rcq_hdl == NULL)) {
		goto spec_qpalloc_fail1;
	}
	sq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_scq_hdl;
	rq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_rcq_hdl;

	/*
	 * Increment the reference count on the CQs.  One or both of these
	 * could return error if we determine that the given CQ is already
	 * being used with a non-special QP (i.e. a normal QP).
	 */
	status = tavor_cq_refcnt_inc(sq_cq, TAVOR_CQ_IS_SPECIAL);
	if (status != DDI_SUCCESS) {
		goto spec_qpalloc_fail1;
	}
	status = tavor_cq_refcnt_inc(rq_cq, TAVOR_CQ_IS_SPECIAL);
	if (status != DDI_SUCCESS) {
		goto spec_qpalloc_fail2;
	}

	/*
	 * Allocate the special QP resources.  Essentially, this allocation
	 * amounts to checking whether the requested special QP has already
	 * been allocated.  If successful, the QP context returned is an
	 * actual QP context that has been "aliased" to act as a special QP
	 * of the appropriate type (and for the appropriate port).  Just as
	 * in tavor_qp_alloc() above, ownership for this QP context is not
	 * immediately given to hardware in the final step here.  Instead, we
	 * wait until the QP is later transitioned to the "Init" state before
	 * passing the QP to hardware.  If we fail here, we must undo all
	 * the reference counts (CQ and PD).
	 */
	status = tavor_special_qp_rsrc_alloc(state, type, port, &qpc);
	if (status != DDI_SUCCESS) {
		goto spec_qpalloc_fail3;
	}

	/*
	 * Allocate the software structure for tracking the special queue
	 * pair (i.e. the Tavor Queue Pair handle).  If we fail here, we
	 * must undo the reference counts and the previous resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_QPHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		goto spec_qpalloc_fail4;
	}
	qp = (tavor_qphdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp))

	/*
	 * Actual QP number is a combination of the index of the QPC and
	 * the port number.  This is because the special QP contexts must
	 * be allocated two-at-a-time.
	 */
	qp->qp_qpnum = qpc->tr_indx + port;

	/*
	 * Calculate the appropriate size for the work queues.
	 * Note:  All Tavor QP work queues must be a power-of-2 in size.  Also
	 * they may not be any smaller than TAVOR_QP_MIN_SIZE.  This step is
	 * to round the requested size up to the next highest power-of-2
	 */
	attr_p->qp_sizes.cs_sq = max(attr_p->qp_sizes.cs_sq, TAVOR_QP_MIN_SIZE);
	attr_p->qp_sizes.cs_rq = max(attr_p->qp_sizes.cs_rq, TAVOR_QP_MIN_SIZE);
	log_qp_sq_size = highbit(attr_p->qp_sizes.cs_sq);
	if (ISP2(attr_p->qp_sizes.cs_sq)) {
		log_qp_sq_size = log_qp_sq_size - 1;
	}
	log_qp_rq_size = highbit(attr_p->qp_sizes.cs_rq);
	if (ISP2(attr_p->qp_sizes.cs_rq)) {
		log_qp_rq_size = log_qp_rq_size - 1;
	}

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits).  If not,
	 * then obviously we have a bit of cleanup to do before returning.
	 */
	if ((log_qp_sq_size > state->ts_cfg_profile->cp_log_max_qp_sz) ||
	    (log_qp_rq_size > state->ts_cfg_profile->cp_log_max_qp_sz)) {
		goto spec_qpalloc_fail5;
	}

	/*
	 * Next we verify that the requested number of SGL is valid (i.e.
	 * consistent with the device limits and/or software-configured
	 * limits).  If not, then obviously the same cleanup needs to be done.
	 */
	max_sgl = state->ts_cfg_profile->cp_wqe_real_max_sgl;
	if ((attr_p->qp_sizes.cs_sq_sgl > max_sgl) ||
	    (attr_p->qp_sizes.cs_rq_sgl > max_sgl)) {
		goto spec_qpalloc_fail5;
	}

	/*
	 * Determine this QP's WQE sizes (for both the Send and Recv WQEs).
	 * This will depend on the requested number of SGLs.  Note: this
	 * has the side-effect of also calculating the real number of SGLs
	 * (for the calculated WQE size).
	 */
	tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_rq_sgl,
	    TAVOR_QP_WQ_TYPE_RECVQ, &qp->qp_rq_log_wqesz, &qp->qp_rq_sgl);
	if (type == IBT_SMI_SQP) {
		tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl,
		    TAVOR_QP_WQ_TYPE_SENDMLX_QP0, &qp->qp_sq_log_wqesz,
		    &qp->qp_sq_sgl);
	} else {
		tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl,
		    TAVOR_QP_WQ_TYPE_SENDMLX_QP1, &qp->qp_sq_log_wqesz,
		    &qp->qp_sq_sgl);
	}

	/*
	 * Allocate the memory for QP work queues.  Note:  The location from
	 * which we will allocate these work queues has been passed in
	 * through the tavor_qp_options_t structure.  Since Tavor work queues
	 * are not allowed to cross a 32-bit (4GB) boundary, the alignment of
	 * the work queue memory is very important.  We used to allocate
	 * work queues (the combined receive and send queues) so that they
	 * would be aligned on their combined size.  That alignment guaranteed
	 * that they would never cross the 4GB boundary (Tavor work queues
	 * are on the order of MBs at maximum).  Now we are able to relax
	 * this alignment constraint by ensuring that the IB address assigned
	 * to the queue memory (as a result of the tavor_mr_register() call)
	 * is offset from zero.
	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
	 * guarantee the alignment, but when attempting to use IOMMU bypass
	 * mode we found that we were not allowed to specify any alignment
	 * that was more restrictive than the system page size.
	 * So we avoided this constraint by passing two alignment values,
	 * one for the memory allocation itself and the other for the DMA
	 * handle (for later bind).  This used to cause more memory than
	 * necessary to be allocated (in order to guarantee the more
	 * restrictive alignment constraint).  But by guaranteeing the
	 * zero-based IB virtual address for the queue, we are able to
	 * conserve this memory.
	 */
	sq_wqe_size = 1 << qp->qp_sq_log_wqesz;
	rq_wqe_size = 1 << qp->qp_rq_log_wqesz;
	sq_size	    = (1 << log_qp_sq_size) * sq_wqe_size;
	rq_size	    = (1 << log_qp_rq_size) * rq_wqe_size;
	qp->qp_wqinfo.qa_size	  = sq_size + rq_size;
	qp->qp_wqinfo.qa_alloc_align = max(sq_wqe_size, rq_wqe_size);
	qp->qp_wqinfo.qa_bind_align  = max(sq_wqe_size, rq_wqe_size);
	qp->qp_wqinfo.qa_location = wq_location;
	status = tavor_queue_alloc(state, &qp->qp_wqinfo, sleepflag);
	if (status != 0) {
		goto spec_qpalloc_fail5;
	}
	if (sq_wqe_size > rq_wqe_size) {
		sq_buf = qp->qp_wqinfo.qa_buf_aligned;
		rq_buf = (uint32_t *)((uintptr_t)sq_buf + sq_size);
	} else {
		rq_buf = qp->qp_wqinfo.qa_buf_aligned;
		sq_buf = (uint32_t *)((uintptr_t)rq_buf + rq_size);
	}

	/*
	 * Register the memory for the special QP work queues.  The memory for
	 * the special QP must be registered in the Tavor TPT tables.  This
	 * gives us the LKey to specify in the QP context later.  Note: The
	 * memory for Tavor work queues (both Send and Recv) must be contiguous
	 * and registered as a single memory region.  Note also: If the work
	 * queue is to be allocated from DDR memory, then only a "bypass"
	 * mapping is appropriate.
	 * Also, in order to meet the alignment restriction, we pass the
	 * "mro_bind_override_addr" flag in the call to tavor_mr_register().
	 * This guarantees that the resulting IB vaddr will be zero-based
	 * (modulo the offset into the first page).
	 * If we fail here, we have a bunch of resource and reference count
	 * cleanup to do.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
	    IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr    = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned;
	mr_attr.mr_len	    = qp->qp_wqinfo.qa_size;
	mr_attr.mr_as	    = NULL;
	mr_attr.mr_flags    = flag;
	if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
		mr_op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;

		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
		if (dma_xfer_mode == DDI_DMA_STREAMING) {
			mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
		}
	} else {
		mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
	}
	mr_op.mro_bind_dmahdl = qp->qp_wqinfo.qa_dmahdl;
	mr_op.mro_bind_override_addr = 1;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
	if (status != DDI_SUCCESS) {
		goto spec_qpalloc_fail6;
	}

	/*
	 * Calculate the offset between the kernel virtual address space
	 * and the IB virtual address space.  This will be used when
	 * posting work requests to properly initialize each WQE.
	 */
	qp_desc_off = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned -
	    (uint64_t)mr->mr_bindinfo.bi_addr;

	/*
	 * Fill in all the return arguments (if necessary).  This includes
	 * real work queue sizes, real SGLs, and QP number (which will be
	 * either zero or one, depending on the special QP type)
	 */
	if (queuesz_p != NULL) {
		queuesz_p->cs_sq	= (1 << log_qp_sq_size);
		queuesz_p->cs_sq_sgl	= qp->qp_sq_sgl;
		queuesz_p->cs_rq	= (1 << log_qp_rq_size);
		queuesz_p->cs_rq_sgl	= qp->qp_rq_sgl;
	}

	/*
	 * Fill in the rest of the Tavor Queue Pair handle.  We can update
	 * the following fields for use in further operations on the QP.
	 */
	qp->qp_qpcrsrcp		= qpc;
	qp->qp_rsrcp		= rsrc;
	qp->qp_state		= TAVOR_QP_RESET;
	qp->qp_pdhdl		= pd;
	qp->qp_mrhdl		= mr;
	qp->qp_sq_sigtype	= (attr_p->qp_flags & IBT_WR_SIGNALED) ?
	    TAVOR_QP_SQ_WR_SIGNALED : TAVOR_QP_SQ_ALL_SIGNALED;
	qp->qp_is_special	= (type == IBT_SMI_SQP) ?
	    TAVOR_QP_SMI : TAVOR_QP_GSI;
	qp->qp_is_umap		= 0;
	qp->qp_uarpg		= 0;
	qp->qp_sq_cqhdl		= sq_cq;
	qp->qp_sq_lastwqeaddr	= NULL;
	qp->qp_sq_bufsz		= (1 << log_qp_sq_size);
	qp->qp_sq_buf		= sq_buf;
	qp->qp_desc_off		= qp_desc_off;
	qp->qp_rq_cqhdl		= rq_cq;
	qp->qp_rq_lastwqeaddr	= NULL;
	qp->qp_rq_bufsz		= (1 << log_qp_rq_size);
	qp->qp_rq_buf		= rq_buf;
	qp->qp_portnum		= port;
	qp->qp_pkeyindx		= 0;
	qp->qp_hdlrarg		= (void *)ibt_qphdl;
	qp->qp_mcg_refcnt	= 0;
	qp->qp_srq_en		= 0;
	qp->qp_srqhdl		= NULL;

	/* Determine if later ddi_dma_sync will be necessary */
	qp->qp_sync = TAVOR_QP_IS_SYNC_REQ(state, qp->qp_wqinfo);

	/* All special QPs are UD QP service type */
	qp->qp_serv_type = TAVOR_QP_UD;

	/* Zero out the QP context */
	bzero(&qp->qpc, sizeof (tavor_hw_qpc_t));

	/*
	 * Put QP handle in Tavor QPNum-to-QPHdl list.  Then fill in the
	 * "qphdl" and return success
	 */
	ASSERT(state->ts_qphdl[qpc->tr_indx + port] == NULL);
	state->ts_qphdl[qpc->tr_indx + port] = qp;

	*qphdl = qp;

	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
spec_qpalloc_fail6:
	tavor_queue_free(state, &qp->qp_wqinfo);
spec_qpalloc_fail5:
	tavor_rsrc_free(state, &rsrc);
spec_qpalloc_fail4:
	if (tavor_special_qp_rsrc_free(state, type, port) != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to free special QP rsrc");
	}
spec_qpalloc_fail3:
	tavor_cq_refcnt_dec(rq_cq);
spec_qpalloc_fail2:
	tavor_cq_refcnt_dec(sq_cq);
spec_qpalloc_fail1:
	tavor_pd_refcnt_dec(pd);
spec_qpalloc_fail:
	return (status);
}


/*
 * tavor_qp_free()
 *    This function frees up the QP resources.  Depending on the value
 *    of the "free_qp_flags", the QP number may not be released until
 *    a subsequent call to tavor_qp_release_qpn().
 *
 *    Context: Can be called only from user or kernel context.
 */
/* ARGSUSED */
int
tavor_qp_free(tavor_state_t *state, tavor_qphdl_t *qphdl,
    ibc_free_qp_flags_t free_qp_flags, ibc_qpn_hdl_t *qpnh,
    uint_t sleepflag)
{
	tavor_rsrc_t		*qpc, *rdb, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_qpn_entry_t	*entry;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_cqhdl_t		sq_cq, rq_cq;
	tavor_srqhdl_t		srq;
	tavor_qphdl_t		qp;
	uint64_t		value;
	uint_t			type, port;
	uint_t			maxprot;
	uint_t			qp_srq_en;
	int			status;

	/*
	 * Pull all the necessary information from the Tavor Queue Pair
	 * handle.  This is necessary here because the resource for the
	 * QP handle is going to be freed up as part of this operation.
	 */
	qp	= *qphdl;
	mutex_enter(&qp->qp_lock);
	qpc	= qp->qp_qpcrsrcp;
	rsrc	= qp->qp_rsrcp;
	pd	= qp->qp_pdhdl;
	srq	= qp->qp_srqhdl;
	mr	= qp->qp_mrhdl;
	rq_cq	= qp->qp_rq_cqhdl;
	sq_cq	= qp->qp_sq_cqhdl;
	rdb	= qp->qp_rdbrsrcp;
	port	= qp->qp_portnum;
	qp_srq_en = qp->qp_srq_en;

	/*
	 * If the QP is part of an MCG, then we fail the qp_free
	 */
	if (qp->qp_mcg_refcnt != 0) {
		mutex_exit(&qp->qp_lock);
		goto qpfree_fail;
	}

	/*
	 * If the QP is not already in "Reset" state, then transition to
	 * "Reset".  This is necessary because software does not reclaim
	 * ownership of the QP context until the QP is in the "Reset" state.
	 * If the ownership transfer fails for any reason, then it is an
	 * indication that something (either in HW or SW) has gone seriously
	 * wrong.  So we print a warning message and return.
	 */
	if (qp->qp_state != TAVOR_QP_RESET) {
		if (tavor_qp_to_reset(state, qp) != DDI_SUCCESS) {
			mutex_exit(&qp->qp_lock);
			TAVOR_WARNING(state, "failed to reset QP context");
			goto qpfree_fail;
		}
		qp->qp_state = TAVOR_QP_RESET;

		/*
		 * Do any additional handling necessary for the transition
		 * to the "Reset" state (e.g. update the WRID lists)
		 */
		tavor_wrid_to_reset_handling(state, qp);
	}

	/*
	 * If this was a user-mappable QP, then we need to remove its entry
	 * from the "userland resources database".  If it is also currently
	 * mmap()'d out to a user process, then we need to call
	 * devmap_devmem_remap() to remap the QP memory to an invalid mapping.
	 * We also need to invalidate the QP tracking information for the
	 * user mapping.
	 */
	if (qp->qp_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, qp->qp_qpnum,
		    MLNX_UMAP_QPMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
		    &umapdb);
		if (status != DDI_SUCCESS) {
			mutex_exit(&qp->qp_lock);
			TAVOR_WARNING(state, "failed to find in database");
			return (ibc_get_ci_failure(0));
		}
		tavor_umap_db_free(umapdb);
		if (qp->qp_umap_dhp != NULL) {
			maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
			status = devmap_devmem_remap(qp->qp_umap_dhp,
			    state->ts_dip, 0, 0, qp->qp_wqinfo.qa_size,
			    maxprot, DEVMAP_MAPPING_INVALID, NULL);
			if (status != DDI_SUCCESS) {
				mutex_exit(&qp->qp_lock);
				TAVOR_WARNING(state, "failed in QP memory "
				    "devmap_devmem_remap()");
				return (ibc_get_ci_failure(0));
			}
			qp->qp_umap_dhp = (devmap_cookie_t)NULL;
		}
	}

	/*
	 * Put NULL into the Tavor QPNum-to-QPHdl list.  This will allow any
	 * in-progress events to detect that the QP corresponding to this
	 * number has been freed.  Note: it does depend on whether we are
	 * freeing a special QP or not.
	 */
	if (qp->qp_is_special) {
		state->ts_qphdl[qpc->tr_indx + port] = NULL;
	} else {
		state->ts_qphdl[qpc->tr_indx] = NULL;
	}

	/*
	 * Drop the QP lock
	 *    At this point the lock is no longer necessary.  We cannot
	 *    protect from multiple simultaneous calls to free the same QP.
	 *    In addition, since the QP lock is contained in the QP "software
	 *    handle" resource, which we will free (see below), it is
	 *    important that we have no further references to that memory.
	 */
	mutex_exit(&qp->qp_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp))

	/*
	 * Free the QP resources
	 *    Start by deregistering and freeing the memory for work queues.
	 *    Next free any previously allocated context information
	 *    (depending on QP type)
	 *    Finally, decrement the necessary reference counts.
	 * If this fails for any reason, then it is an indication that
	 * something (either in HW or SW) has gone seriously wrong.  So we
	 * print a warning message and return.
	 */
	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister QP memory");
		goto qpfree_fail;
	}

	/* Free the memory for the QP */
	tavor_queue_free(state, &qp->qp_wqinfo);

	/*
	 * Free up the remainder of the QP resources.  Note: we have a few
	 * different resources to free up depending on whether the QP is a
	 * special QP or not.  As described above, if any of these fail for
	 * any reason it is an indication that something (either in HW or SW)
	 * has gone seriously wrong.  So we print a warning message and
	 * return.
	 */
	if (qp->qp_is_special) {
		type = (qp->qp_is_special == TAVOR_QP_SMI) ?
		    IBT_SMI_SQP : IBT_GSI_SQP;

		/* Free up resources for the special QP */
		status = tavor_special_qp_rsrc_free(state, type, port);
		if (status != DDI_SUCCESS) {
			TAVOR_WARNING(state, "failed to free special QP rsrc");
			goto qpfree_fail;
		}

	} else {
		type = qp->qp_serv_type;

		/* Free up the RDB entries resource */
		if (type == TAVOR_QP_RC) {
			tavor_rsrc_free(state, &rdb);
		}

		/*
		 * Check the flags and determine whether to release the
		 * QPN or not, based on their value.
		 */
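		/*
		 * (Illustrative note) IBC_FREE_QP_ONLY keeps the QPN
		 * tracking entry alive and hands it back through "qpnh",
		 * so the same QP number can be re-associated with a new
		 * QP later; TAVOR_QPN_RELEASE drops this QP's reference
		 * on the entry entirely.
		 */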
		if (free_qp_flags == IBC_FREE_QP_ONLY) {
			entry = qp->qp_qpn_hdl;
			tavor_qp_release_qpn(state, qp->qp_qpn_hdl,
			    TAVOR_QPN_FREE_ONLY);
			*qpnh = (ibc_qpn_hdl_t)entry;
		} else {
			tavor_qp_release_qpn(state, qp->qp_qpn_hdl,
			    TAVOR_QPN_RELEASE);
		}
	}

	/* Free the Tavor Queue Pair handle */
	tavor_rsrc_free(state, &rsrc);

	/* Decrement the reference counts on CQs, PD and SRQ (if needed) */
	tavor_cq_refcnt_dec(rq_cq);
	tavor_cq_refcnt_dec(sq_cq);
	tavor_pd_refcnt_dec(pd);
	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
		tavor_srq_refcnt_dec(srq);
	}

	/* Set the qphdl pointer to NULL and return success */
	*qphdl = NULL;

	return (DDI_SUCCESS);

qpfree_fail:
	return (status);
}


/*
 * tavor_qp_query()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_qp_query(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_qp_query_attr_t *attr_p)
{
	ibt_cep_state_t		qp_state;
	ibt_qp_ud_attr_t	*ud;
	ibt_qp_rc_attr_t	*rc;
	ibt_qp_uc_attr_t	*uc;
	ibt_cep_flags_t		enable_flags;
	tavor_hw_addr_path_t	*qpc_path, *qpc_alt_path;
	ibt_cep_path_t		*path_ptr, *alt_path_ptr;
	tavor_hw_qpc_t		*qpc;
	int			status;

	mutex_enter(&qp->qp_lock);

	/*
	 * Grab the temporary QPC entry from QP software state
	 */
	qpc = &qp->qpc;

	/* Convert the current Tavor QP state to IBTF QP state */
	switch (qp->qp_state) {
	case TAVOR_QP_RESET:
		qp_state = IBT_STATE_RESET;		/* "Reset" */
		break;
	case TAVOR_QP_INIT:
		qp_state = IBT_STATE_INIT;		/* Initialized */
		break;
	case TAVOR_QP_RTR:
		qp_state = IBT_STATE_RTR;		/* Ready to Receive */
		break;
	case TAVOR_QP_RTS:
		qp_state = IBT_STATE_RTS;		/* Ready to Send */
		break;
	case TAVOR_QP_SQERR:
		qp_state = IBT_STATE_SQE;		/* Send Queue Error */
		break;
	case TAVOR_QP_SQD:
		if (qp->qp_sqd_still_draining) {
			qp_state = IBT_STATE_SQDRAIN;	/* SQ Draining */
		} else {
			qp_state = IBT_STATE_SQD;	/* SQ Drained */
		}
		break;
	case TAVOR_QP_ERR:
		qp_state = IBT_STATE_ERROR;		/* Error */
		break;
	default:
		mutex_exit(&qp->qp_lock);
		return (ibc_get_ci_failure(0));
	}
	attr_p->qp_info.qp_state = qp_state;

	/* SRQ Hook. */
	attr_p->qp_srq = NULL;

	/*
	 * The following QP information is always returned, regardless of
	 * the current QP state.  Note: Some special handling is necessary
	 * for calculating the QP number on the special QPs (QP0 and QP1).
	 */
	attr_p->qp_sq_cq    = qp->qp_sq_cqhdl->cq_hdlrarg;
	attr_p->qp_rq_cq    = qp->qp_rq_cqhdl->cq_hdlrarg;
	if (qp->qp_is_special) {
		attr_p->qp_qpn = (qp->qp_is_special == TAVOR_QP_SMI) ? 0 : 1;
	} else {
		attr_p->qp_qpn = (ib_qpn_t)qp->qp_qpnum;
	}
	attr_p->qp_sq_sgl   = qp->qp_sq_sgl;
	attr_p->qp_rq_sgl   = qp->qp_rq_sgl;
	attr_p->qp_info.qp_sq_sz = qp->qp_sq_bufsz;
	attr_p->qp_info.qp_rq_sz = qp->qp_rq_bufsz;

	/*
	 * If QP is currently in the "Reset" state, then only the above are
	 * returned
	 */
	if (qp_state == IBT_STATE_RESET) {
		mutex_exit(&qp->qp_lock);
		return (DDI_SUCCESS);
	}

	/*
	 * Post QUERY_QP command to firmware
	 *
	 * We do a TAVOR_NOSLEEP here because we are holding the "qp_lock".
	 * Since we may be in the interrupt context (or subsequently raised
	 * to interrupt level by priority inversion), we do not want to block
	 * in this routine waiting for success.
	 */
	status = tavor_cmn_query_cmd_post(state, QUERY_QP, qp->qp_qpnum,
	    qpc, sizeof (tavor_hw_qpc_t), TAVOR_CMD_NOSLEEP_SPIN);
	if (status != TAVOR_CMD_SUCCESS) {
		mutex_exit(&qp->qp_lock);
		cmn_err(CE_CONT, "Tavor: QUERY_QP command failed: %08x\n",
		    status);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Fill in the additional QP info based on the QP's transport type.
	 */
	if (qp->qp_serv_type == TAVOR_QP_UD) {

		/* Fill in the UD-specific info */
		ud = &attr_p->qp_info.qp_transport.ud;
		ud->ud_qkey	= (ib_qkey_t)qpc->qkey;
		ud->ud_sq_psn	= qpc->next_snd_psn;
		ud->ud_pkey_ix	= qpc->pri_addr_path.pkey_indx;
		ud->ud_port	= qpc->pri_addr_path.portnum;

		attr_p->qp_info.qp_trans = IBT_UD_SRV;

	} else if (qp->qp_serv_type == TAVOR_QP_RC) {

		/* Fill in the RC-specific info */
		rc = &attr_p->qp_info.qp_transport.rc;
		rc->rc_sq_psn	= qpc->next_snd_psn;
		rc->rc_rq_psn	= qpc->next_rcv_psn;
		rc->rc_dst_qpn	= qpc->rem_qpn;

		/* Grab the path migration state information */
		if (qpc->pm_state == TAVOR_QP_PMSTATE_MIGRATED) {
			rc->rc_mig_state = IBT_STATE_MIGRATED;
		} else if (qpc->pm_state == TAVOR_QP_PMSTATE_REARM) {
			rc->rc_mig_state = IBT_STATE_REARMED;
		} else {
			rc->rc_mig_state = IBT_STATE_ARMED;
		}
		rc->rc_rdma_ra_out = (1 << qpc->sra_max);
		rc->rc_rdma_ra_in  = (1 << qpc->rra_max);
		rc->rc_min_rnr_nak = qpc->min_rnr_nak;
		rc->rc_path_mtu	   = qpc->mtu;
		rc->rc_retry_cnt   = qpc->retry_cnt;

		/* Get the common primary address path fields */
		qpc_path = &qpc->pri_addr_path;
		path_ptr = &rc->rc_path;
		tavor_get_addr_path(state, qpc_path, &path_ptr->cep_adds_vect,
		    TAVOR_ADDRPATH_QP, qp);

		/* Fill in the additional primary address path fields */
		path_ptr->cep_pkey_ix	   = qpc_path->pkey_indx;
		path_ptr->cep_hca_port_num = qpc_path->portnum;
		path_ptr->cep_timeout	   = qpc_path->ack_timeout;

		/* Get the common alternate address path fields */
		qpc_alt_path = &qpc->alt_addr_path;
		alt_path_ptr = &rc->rc_alt_path;
		tavor_get_addr_path(state, qpc_alt_path,
		    &alt_path_ptr->cep_adds_vect, TAVOR_ADDRPATH_QP, qp);

		/* Fill in the additional alternate address path fields */
		alt_path_ptr->cep_pkey_ix	= qpc_alt_path->pkey_indx;
		alt_path_ptr->cep_hca_port_num	= qpc_alt_path->portnum;
		alt_path_ptr->cep_timeout	= qpc_alt_path->ack_timeout;

		/* Get the RNR retry time from primary path */
		rc->rc_rnr_retry_cnt = qpc_path->rnr_retry;

		/* Set the enable flags based on RDMA/Atomic enable bits */
		enable_flags = IBT_CEP_NO_FLAGS;
		enable_flags |= ((qpc->rre == 0) ? 0 : IBT_CEP_RDMA_RD);
		enable_flags |= ((qpc->rwe == 0) ? 0 : IBT_CEP_RDMA_WR);
		enable_flags |= ((qpc->rae == 0) ? 0 : IBT_CEP_ATOMIC);
		attr_p->qp_info.qp_flags = enable_flags;

		attr_p->qp_info.qp_trans = IBT_RC_SRV;

	} else if (qp->qp_serv_type == TAVOR_QP_UC) {

		/* Fill in the UC-specific info */
		uc = &attr_p->qp_info.qp_transport.uc;
		uc->uc_sq_psn	= qpc->next_snd_psn;
		uc->uc_rq_psn	= qpc->next_rcv_psn;
		uc->uc_dst_qpn	= qpc->rem_qpn;

		/* Grab the path migration state information */
		if (qpc->pm_state == TAVOR_QP_PMSTATE_MIGRATED) {
			uc->uc_mig_state = IBT_STATE_MIGRATED;
		} else if (qpc->pm_state == TAVOR_QP_PMSTATE_REARM) {
			uc->uc_mig_state = IBT_STATE_REARMED;
		} else {
			uc->uc_mig_state = IBT_STATE_ARMED;
		}
		uc->uc_path_mtu = qpc->mtu;

		/* Get the common primary address path fields */
		qpc_path = &qpc->pri_addr_path;
		path_ptr = &uc->uc_path;
		tavor_get_addr_path(state, qpc_path, &path_ptr->cep_adds_vect,
		    TAVOR_ADDRPATH_QP, qp);

		/* Fill in the additional primary address path fields */
		path_ptr->cep_pkey_ix	   = qpc_path->pkey_indx;
		path_ptr->cep_hca_port_num = qpc_path->portnum;

		/* Get the common alternate address path fields */
		qpc_alt_path = &qpc->alt_addr_path;
		alt_path_ptr = &uc->uc_alt_path;
		tavor_get_addr_path(state, qpc_alt_path,
		    &alt_path_ptr->cep_adds_vect, TAVOR_ADDRPATH_QP, qp);

		/* Fill in the additional alternate address path fields */
		alt_path_ptr->cep_pkey_ix	= qpc_alt_path->pkey_indx;
		alt_path_ptr->cep_hca_port_num	= qpc_alt_path->portnum;

		/*
		 * Set the enable flags based on RDMA enable bits (by
		 * definition UC doesn't support Atomic or RDMA Read)
		 */
		enable_flags = ((qpc->rwe == 0) ? 0 : IBT_CEP_RDMA_WR);
		attr_p->qp_info.qp_flags = enable_flags;

		attr_p->qp_info.qp_trans = IBT_UC_SRV;

	} else {
		TAVOR_WARNING(state, "unexpected QP transport type");
		mutex_exit(&qp->qp_lock);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Under certain circumstances it is possible for the Tavor hardware
	 * to transition to one of the error states without software directly
	 * knowing about it.  The QueryQP() call is the one place where we
	 * have an opportunity to sample and update our view of the QP state.
	 */
	if (qpc->state == TAVOR_QP_SQERR) {
		attr_p->qp_info.qp_state = IBT_STATE_SQE;
		qp->qp_state = TAVOR_QP_SQERR;
	}
	if (qpc->state == TAVOR_QP_ERR) {
		attr_p->qp_info.qp_state = IBT_STATE_ERROR;
		qp->qp_state = TAVOR_QP_ERR;
	}
	mutex_exit(&qp->qp_lock);

	return (DDI_SUCCESS);
}


/*
 * tavor_qp_create_qpn()
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_qp_create_qpn(tavor_state_t *state, tavor_qphdl_t qp, tavor_rsrc_t *qpc)
{
	tavor_qpn_entry_t	query;
	tavor_qpn_entry_t	*entry;
	avl_index_t		where;

	/*
	 * Build a query (for the AVL tree lookup) and attempt to find
	 * a previously added entry that has a matching QPC index.  If
	 * no matching entry is found, then allocate, initialize, and
	 * add an entry to the AVL tree.
	 * If a matching entry is found, then increment its QPN counter
	 * and reference counter.
	 */
	query.qpn_indx = qpc->tr_indx;
	mutex_enter(&state->ts_qpn_avl_lock);
	entry = (tavor_qpn_entry_t *)avl_find(&state->ts_qpn_avl,
	    &query, &where);
	if (entry == NULL) {
		/*
		 * Allocate and initialize a QPN entry, then insert
		 * it into the AVL tree.
		 */
		entry = (tavor_qpn_entry_t *)kmem_zalloc(
		    sizeof (tavor_qpn_entry_t), KM_NOSLEEP);
		if (entry == NULL) {
			mutex_exit(&state->ts_qpn_avl_lock);
			return (DDI_FAILURE);
		}
		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*entry))

		entry->qpn_indx	   = qpc->tr_indx;
		entry->qpn_refcnt  = 0;
		entry->qpn_counter = 0;

		avl_insert(&state->ts_qpn_avl, entry, where);
	}

	/*
	 * Make the AVL tree entry point to the QP context resource that
	 * it will be responsible for tracking
	 */
	entry->qpn_qpc = qpc;

	/*
	 * Setup the QP handle to point to the AVL tree entry.  Then
	 * generate the new QP number from the entry's QPN counter value
	 * and the hardware's QP context table index.
	 */
	qp->qp_qpn_hdl	= entry;
	qp->qp_qpnum	= ((entry->qpn_counter <<
	    state->ts_cfg_profile->cp_log_num_qp) | qpc->tr_indx) &
	    TAVOR_QP_MAXNUMBER_MSK;
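	/*
	 * Illustrative example (values depend on the configuration
	 * profile): with cp_log_num_qp = 16, the low 16 bits of the QP
	 * number are the QPC table index and the remaining bits (up to
	 * the 24-bit QPN limit) come from qpn_counter.  So QPC index
	 * 0x12 yields QP numbers 0x000012, 0x010012, 0x020012, etc. on
	 * successive reuses of the same context.
	 */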

	/*
	 * Increment the reference counter and QPN counter.  The QPN
	 * counter always indicates the next available number for use.
	 */
	entry->qpn_counter++;
	entry->qpn_refcnt++;

	mutex_exit(&state->ts_qpn_avl_lock);
	return (DDI_SUCCESS);
}


/*
 * tavor_qp_release_qpn()
 *    Context: Can be called only from user or kernel context.
 */
void
tavor_qp_release_qpn(tavor_state_t *state, tavor_qpn_entry_t *entry, int flags)
{
	ASSERT(entry != NULL);

	mutex_enter(&state->ts_qpn_avl_lock);

	/*
	 * If we are releasing the QP number here, then we decrement the
	 * reference count and check for zero references.  If there are
	 * zero references, then we free the QPC context (if it hadn't
	 * already been freed during a TAVOR_QPN_FREE_ONLY free, i.e. for
	 * reuse with another similar QP number) and remove the tracking
	 * structure from the QP number AVL tree and free the structure.
	 * If we are not releasing the QP number here, then, as long as we
	 * have not exhausted the usefulness of the QPC context (that is,
	 * re-used it too many times without the reference count having
	 * gone to zero), we free up the QPC context for use by another
	 * thread (which will use it to construct a different QP number
	 * from the same QPC table index).
	 */
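	/*
	 * Illustrative note: the reuse limit below is
	 * (1 << (24 - cp_log_num_qp)), i.e. the number of distinct QPN
	 * "prefixes" that fit in the 24-bit QP number space above the
	 * QPC index bits.  With cp_log_num_qp = 16, for example, each
	 * QPC index can back at most 256 distinct QP numbers before the
	 * entry must be retired.
	 */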
	if (flags == TAVOR_QPN_RELEASE) {
		entry->qpn_refcnt--;

		/*
		 * If the reference count is zero, then we free the QPC
		 * context (if it hadn't already been freed in an earlier
		 * step, e.g. TAVOR_QPN_FREE_ONLY) and remove/free the
		 * tracking structure from the QP number AVL tree.
		 */
		if (entry->qpn_refcnt == 0) {
			if (entry->qpn_qpc != NULL) {
				tavor_rsrc_free(state, &entry->qpn_qpc);
			}

			/*
			 * If the current entry has served its useful
1541 			 * purpose (i.e. been reused the maximum allowable
1542 			 * number of times), then remove it from QP number
1543 			 * AVL tree and free it up.
1544 			 */
			if (entry->qpn_counter >= (1 <<
			    (24 - state->ts_cfg_profile->cp_log_num_qp))) {
				avl_remove(&state->ts_qpn_avl, entry);
				kmem_free(entry, sizeof (tavor_qpn_entry_t));
			}
		}

	} else if (flags == TAVOR_QPN_FREE_ONLY) {
		/*
		 * Even if we are not freeing the QP number, that will not
		 * always prevent us from releasing the QPC context.  In fact,
		 * since the QPC context only forms part of the whole QPN,
		 * we want to free it up for use by other consumers.  But
		 * if the reference count is non-zero (which it will always
		 * be when we are doing TAVOR_QPN_FREE_ONLY) and the counter
		 * has reached its maximum value, then we cannot reuse the
		 * QPC context until the reference count eventually reaches
		 * zero (in TAVOR_QPN_RELEASE, above).
		 */
		if (entry->qpn_counter < (1 <<
		    (24 - state->ts_cfg_profile->cp_log_num_qp))) {
			tavor_rsrc_free(state, &entry->qpn_qpc);
		}
	}
	mutex_exit(&state->ts_qpn_avl_lock);
}
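
/*
 * Illustrative note (not part of the driver): the reuse limit tested
 * above is (1 << (24 - cp_log_num_qp)), i.e. the number of distinct
 * values the QPN counter can take within the 24-bit QP number space.
 * Assuming cp_log_num_qp = 16, for example, each QPC table index can
 * be reused 256 times; after that, its tracking entry must remain in
 * the AVL tree until all outstanding references are released.
 */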


/*
 * tavor_qpn_avl_compare()
 *    Context: Can be called from user or kernel context.
 */
static int
tavor_qpn_avl_compare(const void *q, const void *e)
{
	tavor_qpn_entry_t	*entry, *query;

	entry = (tavor_qpn_entry_t *)e;
	query = (tavor_qpn_entry_t *)q;

	if (query->qpn_indx < entry->qpn_indx) {
		return (-1);
	} else if (query->qpn_indx > entry->qpn_indx) {
		return (+1);
	} else {
		return (0);
	}
}
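
/*
 * Illustrative sketch (not part of the driver): how a lookup in the QPN
 * AVL tree would use the comparator above.  The query carries only the
 * qpn_indx field that tavor_qpn_avl_compare() examines, and the "where"
 * index can then be handed to avl_insert() when no entry is found (as
 * is done in tavor_qp_create_qpn() above).  The function name and the
 * uint32_t index type are hypothetical.
 */
#if 0	/* example only */
static tavor_qpn_entry_t *
tavor_qpn_avl_lookup(tavor_state_t *state, uint32_t indx, avl_index_t *where)
{
	tavor_qpn_entry_t	query;

	ASSERT(MUTEX_HELD(&state->ts_qpn_avl_lock));
	query.qpn_indx = indx;
	return ((tavor_qpn_entry_t *)avl_find(&state->ts_qpn_avl, &query,
	    where));
}
#endif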


/*
 * tavor_qpn_avl_init()
 *    Context: Only called from attach() path context
 */
void
tavor_qpn_avl_init(tavor_state_t *state)
{
	/* Initialize the lock used for QP number (QPN) AVL tree access */
	mutex_init(&state->ts_qpn_avl_lock, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(state->ts_intrmsi_pri));

	/* Initialize the AVL tree for the QP number (QPN) storage */
	avl_create(&state->ts_qpn_avl, tavor_qpn_avl_compare,
	    sizeof (tavor_qpn_entry_t),
	    offsetof(tavor_qpn_entry_t, qpn_avlnode));
}


/*
 * tavor_qpn_avl_fini()
 *    Context: Only called from attach() and/or detach() path contexts
 */
void
tavor_qpn_avl_fini(tavor_state_t *state)
{
	tavor_qpn_entry_t	*entry;
	void			*cookie;

	/*
	 * Empty all entries (if necessary) and destroy the AVL tree
	 * that was used for QP number (QPN) tracking.
	 */
	cookie = NULL;
	while ((entry = (tavor_qpn_entry_t *)avl_destroy_nodes(
	    &state->ts_qpn_avl, &cookie)) != NULL) {
		kmem_free(entry, sizeof (tavor_qpn_entry_t));
	}
	avl_destroy(&state->ts_qpn_avl);

	/* Destroy the lock used for QP number (QPN) AVL tree access */
	mutex_destroy(&state->ts_qpn_avl_lock);
}


/*
 * tavor_qphdl_from_qpnum()
 *    Context: Can be called from interrupt or base context.
 *
 *    This routine is important because changing the unconstrained
 *    portion of the QP number is critical to the detection of a
 *    potential race condition in the QP event handler code (i.e. the
 *    case where a QP is freed and alloc'd again before an event for
 *    the "old" QP can be handled).
 *
 *    While this is not a perfect solution (not sure that one exists)
 *    it does help to mitigate the chance that this race condition will
 *    cause us to deliver a "stale" event to the new QP owner.  Note:
 *    this solution does not scale well because the number of constrained
 *    bits increases (and, hence, the number of unconstrained bits
 *    decreases) as the number of supported QPs grows.  For small and
 *    intermediate values, it should hopefully provide sufficient
 *    protection.
 */
tavor_qphdl_t
tavor_qphdl_from_qpnum(tavor_state_t *state, uint_t qpnum)
{
	uint_t	qpindx, qpmask;

	/* Calculate the QP table index from the qpnum */
	qpmask = (1 << state->ts_cfg_profile->cp_log_num_qp) - 1;
	qpindx = qpnum & qpmask;
	return (state->ts_qphdl[qpindx]);
}
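
/*
 * Illustrative sketch (not part of the driver): how an event handler
 * might use the routine above to catch the stale-event race described
 * in the block comment.  The low (constrained) bits locate the handle;
 * comparing the full QP number, whose high (unconstrained) bits come
 * from the per-entry QPN counter, detects a QPC index that has since
 * been reused by a newer QP.  The variable names here are hypothetical.
 */
#if 0	/* example only */
	tavor_qphdl_t	qp;

	qp = tavor_qphdl_from_qpnum(state, event_qpnum);
	if ((qp == NULL) || (qp->qp_qpnum != event_qpnum)) {
		/* Stale or unknown QP number; drop the event */
		return;
	}
#endif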


/*
 * tavor_special_qp_rsrc_alloc
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_special_qp_rsrc_alloc(tavor_state_t *state, ibt_sqp_type_t type,
    uint_t port, tavor_rsrc_t **qp_rsrc)
{
	uint_t		mask, flags;
	int		status;

	mutex_enter(&state->ts_spec_qplock);
	flags = state->ts_spec_qpflags;
	if (type == IBT_SMI_SQP) {
		/*
		 * Check here to see if the driver has been configured
		 * to instruct the Tavor firmware to handle all incoming
		 * SMP messages (i.e. messages sent to the SMA).  If so,
		 * we treat QP0 as if it has already been allocated (for
		 * internal use); allowing the allocation to proceed here
		 * would cause unexpected behavior (e.g. the Tavor SMA
		 * becoming unresponsive).
		 */
		if (state->ts_cfg_profile->cp_qp0_agents_in_fw != 0) {
			mutex_exit(&state->ts_spec_qplock);
			return (IBT_QP_IN_USE);
		}

		/*
		 * If this is the first QP0 allocation, then post
		 * a CONF_SPECIAL_QP firmware command
		 */
		if ((flags & TAVOR_SPECIAL_QP0_RSRC_MASK) == 0) {
			status = tavor_conf_special_qp_cmd_post(state,
			    state->ts_spec_qp0->tr_indx, TAVOR_CMD_QP_SMI,
			    TAVOR_CMD_NOSLEEP_SPIN);
			if (status != TAVOR_CMD_SUCCESS) {
				mutex_exit(&state->ts_spec_qplock);
				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
				    "command failed: %08x\n", status);
				return (IBT_INSUFF_RESOURCE);
			}
		}

		/*
		 * Now check (and, if necessary, modify) the flags to indicate
		 * whether the allocation was successful
		 */
		mask = (1 << (TAVOR_SPECIAL_QP0_RSRC + port));
		if (flags & mask) {
			mutex_exit(&state->ts_spec_qplock);
			return (IBT_QP_IN_USE);
		}
		state->ts_spec_qpflags |= mask;
		*qp_rsrc = state->ts_spec_qp0;

	} else {
		/*
		 * If this is the first QP1 allocation, then post
		 * a CONF_SPECIAL_QP firmware command
		 */
		if ((flags & TAVOR_SPECIAL_QP1_RSRC_MASK) == 0) {
			status = tavor_conf_special_qp_cmd_post(state,
			    state->ts_spec_qp1->tr_indx, TAVOR_CMD_QP_GSI,
			    TAVOR_CMD_NOSLEEP_SPIN);
			if (status != TAVOR_CMD_SUCCESS) {
				mutex_exit(&state->ts_spec_qplock);
				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
				    "command failed: %08x\n", status);
				return (IBT_INSUFF_RESOURCE);
			}
		}

		/*
		 * Now check (and, if necessary, modify) the flags to indicate
		 * whether the allocation was successful
		 */
		mask = (1 << (TAVOR_SPECIAL_QP1_RSRC + port));
		if (flags & mask) {
			mutex_exit(&state->ts_spec_qplock);
			return (IBT_QP_IN_USE);
		}
		state->ts_spec_qpflags |= mask;
		*qp_rsrc = state->ts_spec_qp1;
	}

	mutex_exit(&state->ts_spec_qplock);
	return (DDI_SUCCESS);
}
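
/*
 * Illustrative note (not part of the driver): ts_spec_qpflags keeps one
 * "in use" bit per special QP type per port.  The bit for a given port
 * is (1 << (TAVOR_SPECIAL_QP0_RSRC + port)) or
 * (1 << (TAVOR_SPECIAL_QP1_RSRC + port)), while the
 * TAVOR_SPECIAL_QP0/QP1_RSRC_MASK values cover all ports of one type.
 * The CONF_SPECIAL_QP firmware command therefore needs to be posted only
 * on the first allocation of a type (the type's mask bits were all
 * clear) and the matching teardown only on the last free (the type's
 * mask bits become all clear again, in tavor_special_qp_rsrc_free()
 * below).
 */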


/*
 * tavor_special_qp_rsrc_free
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_special_qp_rsrc_free(tavor_state_t *state, ibt_sqp_type_t type,
    uint_t port)
{
	uint_t		mask, flags;
	int		status;

	mutex_enter(&state->ts_spec_qplock);
	if (type == IBT_SMI_SQP) {
		mask = (1 << (TAVOR_SPECIAL_QP0_RSRC + port));
		state->ts_spec_qpflags &= ~mask;
		flags = state->ts_spec_qpflags;

		/*
		 * If this is the last QP0 free, then post a CONF_SPECIAL_QP
		 * firmware command
		 */
		if ((flags & TAVOR_SPECIAL_QP0_RSRC_MASK) == 0) {
			status = tavor_conf_special_qp_cmd_post(state, 0,
			    TAVOR_CMD_QP_SMI, TAVOR_CMD_NOSLEEP_SPIN);
			if (status != TAVOR_CMD_SUCCESS) {
				mutex_exit(&state->ts_spec_qplock);
				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
				    "command failed: %08x\n", status);
				return (ibc_get_ci_failure(0));
			}
		}
	} else {
		mask = (1 << (TAVOR_SPECIAL_QP1_RSRC + port));
		state->ts_spec_qpflags &= ~mask;
		flags = state->ts_spec_qpflags;

		/*
		 * If this is the last QP1 free, then post a CONF_SPECIAL_QP
		 * firmware command
		 */
		if ((flags & TAVOR_SPECIAL_QP1_RSRC_MASK) == 0) {
			status = tavor_conf_special_qp_cmd_post(state, 0,
			    TAVOR_CMD_QP_GSI, TAVOR_CMD_NOSLEEP_SPIN);
			if (status != TAVOR_CMD_SUCCESS) {
				mutex_exit(&state->ts_spec_qplock);
				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
				    "command failed: %08x\n", status);
				return (ibc_get_ci_failure(0));
			}
		}
	}

	mutex_exit(&state->ts_spec_qplock);
	return (DDI_SUCCESS);
}


/*
 * tavor_qp_sgl_to_logwqesz()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_qp_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl)
{
	uint_t	max_size, log2, actual_sgl;

	switch (wq_type) {
	case TAVOR_QP_WQ_TYPE_SENDQ:
		/*
		 * Use the requested maximum SGL to calculate the max
		 * descriptor size (while guaranteeing that the descriptor
		 * size is a power-of-2 number of cachelines).
		 */
		max_size = (TAVOR_QP_WQE_MLX_SND_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if (ISP2(max_size)) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_SND_HDRS) >> 4;
		break;

	case TAVOR_QP_WQ_TYPE_RECVQ:
		/*
		 * Same as above (except for Recv WQEs)
		 */
		max_size = (TAVOR_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if (ISP2(max_size)) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_RCV_HDRS) >> 4;
		break;

	case TAVOR_QP_WQ_TYPE_SENDMLX_QP0:
		/*
		 * Same as above (except for MLX transport WQEs).  For these
		 * WQEs we have to account for the space consumed by the
		 * "inline" packet headers.  (This is smaller than for QP1
		 * below because QP0 is not allowed to send packets with a
		 * GRH.)
		 */
		max_size = (TAVOR_QP_WQE_MLX_QP0_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if (ISP2(max_size)) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_QP0_HDRS) >> 4;
		break;

	case TAVOR_QP_WQ_TYPE_SENDMLX_QP1:
		/*
		 * Same as above.  For these WQEs we again have to account
		 * for the space consumed by the "inline" packet headers.
		 * (This is larger than for QP0 above because we have to
		 * account for the possibility of a GRH in each packet,
		 * which introduces an alignment issue that consumes an
		 * additional 8 bytes.)
		 */
		max_size = (TAVOR_QP_WQE_MLX_QP1_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if (ISP2(max_size)) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_QP1_HDRS) >> 4;
		break;

	default:
		TAVOR_WARNING(state, "unexpected work queue type");
		/*
		 * Unknown work queue type: fall back to the minimum WQE
		 * size (and zero SGL entries) so the values returned below
		 * are never read from uninitialized storage.
		 */
		log2	   = TAVOR_QP_WQE_LOG_MINIMUM;
		actual_sgl = 0;
		break;
	}

	/* Fill in the return values */
	*logwqesz = log2;
	*max_sgl  = min(state->ts_cfg_profile->cp_wqe_real_max_sgl, actual_sgl);
}
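
/*
 * Illustrative note (not part of the driver): the rounding above computes
 * ceil(log2(max_size)).  highbit() returns the (1-based) position of the
 * most significant set bit, i.e. floor(log2(max_size)) + 1, so the result
 * is decremented only when max_size is already an exact power of two
 * (ISP2()).  For example, assuming a hypothetical 64-byte send header and
 * the 16-byte SGL entries implied by (num_sgl << 4): num_sgl = 5 gives
 * max_size = 64 + 80 = 144, highbit(144) = 8, and 144 is not a power of
 * two, so log2 = 8 and the WQE is 256 bytes; the rounded-up descriptor
 * then has room for (256 - 64) >> 4 = 12 actual SGL entries.
 */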