/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_srq.c
 *    Tavor Shared Receive Queue Processing Routines
 *
 *    Implements all the routines necessary for allocating, freeing, querying,
 *    modifying and posting shared receive queues.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>

#include <sys/ib/adapters/tavor/tavor.h>

/*
 * Used by tavor_srq_numcalc() below to fill in the "unconstrained" portion
 * of the Tavor shared receive queue number
 */
static uint_t tavor_debug_srqnum_cnt = 0x00000000;
static void tavor_srq_numcalc(tavor_state_t *state, uint32_t indx,
    uint32_t *key);
static void tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl);

/*
 * tavor_srq_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_srq_alloc(tavor_state_t *state, tavor_srq_info_t *srqinfo,
    uint_t sleepflag, tavor_srq_options_t *op)
{
	ibt_srq_hdl_t		ibt_srqhdl;
	tavor_pdhdl_t		pd;
	ibt_srq_sizes_t		*sizes;
	ibt_srq_sizes_t		*real_sizes;
	tavor_srqhdl_t		*srqhdl;
	ibt_srq_flags_t		flags;
	tavor_rsrc_t		*srqc, *rsrc;
	tavor_hw_srqc_t		srqc_entry;
	uint32_t		*buf;
	tavor_srqhdl_t		srq;
	tavor_umap_db_entry_t	*umapdb;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	mr_op;
	tavor_mrhdl_t		mr;
	uint64_t		addr;
	uint64_t		value, srq_desc_off;
	uint32_t		lkey;
	uint32_t		log_srq_size;
	uint32_t		uarpg;
	uint_t			wq_location, dma_xfer_mode, srq_is_umap;
	int			flag, status;
	char			*errormsg;
	uint_t			max_sgl;
	uint_t			wqesz;

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sizes))

	TAVOR_TNF_ENTER(tavor_srq_alloc);

	/*
	 * Check the "options" flag.  Currently this flag tells the driver
	 * whether the SRQ's work queues should come from normal system
	 * memory or be allocated from DDR memory.
	 */
	if (op == NULL) {
		wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
	} else {
		wq_location = op->srqo_wq_loc;
	}

	/*
	 * Extract the necessary info from the tavor_srq_info_t structure
	 */
	real_sizes = srqinfo->srqi_real_sizes;
	sizes	   = srqinfo->srqi_sizes;
	pd	   = srqinfo->srqi_pd;
	ibt_srqhdl = srqinfo->srqi_ibt_srqhdl;
	flags	   = srqinfo->srqi_flags;
	srqhdl	   = srqinfo->srqi_srqhdl;

	/*
	 * Determine whether SRQ is being allocated for userland access or
	 * whether it is being allocated for kernel access.  If the SRQ is
	 * being allocated for userland access, then lookup the UAR doorbell
	 * page number for the current process.  Note:  If this is not found
	 * (e.g. if the process has not previously open()'d the Tavor driver),
	 * then an error is returned.
	 */
	srq_is_umap = (flags & IBT_SRQ_USER_MAP) ? 1 : 0;
	if (srq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
		if (status != DDI_SUCCESS) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
			goto srqalloc_fail;
		}
		uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
	}

	/* Increase PD refcnt */
	tavor_pd_refcnt_inc(pd);

	/* Allocate an SRQ context entry */
	status = tavor_rsrc_alloc(state, TAVOR_SRQC, 1, sleepflag, &srqc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ context");
		goto srqalloc_fail1;
	}

	/* Allocate the SRQ Handle entry */
	status = tavor_rsrc_alloc(state, TAVOR_SRQHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ handle");
		goto srqalloc_fail2;
	}

	srq = (tavor_srqhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq))

	/* Calculate the SRQ number */
	tavor_srq_numcalc(state, srqc->tr_indx, &srq->srq_srqnum);

	/*
	 * If this will be a user-mappable SRQ, then allocate an entry for
	 * the "userland resources database".  This will later be added to
	 * the database (after all further SRQ operations are successful).
	 * If we fail here, we must undo the reference counts and the
	 * previous resource allocation.
	 */
	if (srq_is_umap) {
		umapdb = tavor_umap_db_alloc(state->ts_instance,
		    srq->srq_srqnum, MLNX_UMAP_SRQMEM_RSRC,
		    (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
			goto srqalloc_fail3;
		}
	}

	/*
	 * Calculate the appropriate size for the SRQ.
	 * Note:  All Tavor SRQs must be a power-of-2 in size.  Also
	 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE.  This step
	 * is to round the requested size up to the next highest power-of-2
	 */
	sizes->srq_wr_sz = max(sizes->srq_wr_sz, TAVOR_SRQ_MIN_SIZE);
	log_srq_size = highbit(sizes->srq_wr_sz);
	if ((sizes->srq_wr_sz & (sizes->srq_wr_sz - 1)) == 0) {
		log_srq_size = log_srq_size - 1;
	}
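	/*
	 * For illustration: a request for 1000 WRs gives highbit(1000) == 10;
	 * since 1000 is not a power-of-2, log_srq_size stays 10 (i.e. 1024
	 * entries).  A request for exactly 1024 WRs gives highbit(1024) == 11,
	 * and because 1024 is a power-of-2 we subtract one, again yielding
	 * log_srq_size == 10.
	 */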
	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits).  If not,
	 * then obviously we have a lot of cleanup to do before returning.
	 */
	if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
		goto srqalloc_fail4;
	}

	/*
	 * Next we verify that the requested number of SGLs is valid (i.e.
	 * consistent with the device limits and/or software-configured
	 * limits).  If not, then obviously the same cleanup needs to be done.
	 */
	max_sgl = state->ts_cfg_profile->cp_srq_max_sgl;
	if (sizes->srq_sgl_sz > max_sgl) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED, "max SRQ SGL");
		goto srqalloc_fail4;
	}

	/*
	 * Determine the SRQ's WQE sizes.  This depends on the requested
	 * number of SGLs.  Note: This also has the side-effect of
	 * calculating the real number of SGLs (for the calculated WQE size)
	 */
	tavor_srq_sgl_to_logwqesz(state, sizes->srq_sgl_sz,
	    TAVOR_QP_WQ_TYPE_RECVQ, &srq->srq_wq_log_wqesz,
	    &srq->srq_wq_sgl);

	/*
	 * Allocate the memory for SRQ work queues.  Note:  The location from
	 * which we will allocate these work queues has been passed in through
	 * the tavor_srq_options_t structure.  Since Tavor work queues are not
	 * allowed to cross a 32-bit (4GB) boundary, the alignment of the work
	 * queue memory is very important.  We used to allocate work queues
	 * (the combined receive and send queues) so that they would be aligned
	 * on their combined size.  That alignment guaranteed that they would
	 * never cross the 4GB boundary (Tavor work queues are on the order of
	 * MBs at maximum).  Now we are able to relax this alignment constraint
	 * by ensuring that the IB address assigned to the queue memory (as a
	 * result of the tavor_mr_register() call) is offset from zero.
	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
	 * guarantee the alignment, but when attempting to use IOMMU bypass
	 * mode we found that we were not allowed to specify any alignment that
	 * was more restrictive than the system page size.  So we avoided this
	 * constraint by passing two alignment values, one for the memory
	 * allocation itself and the other for the DMA handle (for later bind).
	 * This used to cause more memory than necessary to be allocated (in
	 * order to guarantee the more restrictive alignment constraint).  But
	 * by guaranteeing the zero-based IB virtual address for the queue, we
	 * are able to conserve this memory.
	 *
	 * Note: If SRQ is not user-mappable, then it may come from either
	 * kernel system memory or from HCA-attached local DDR memory.
	 *
	 * Note2: We align this queue on a pagesize boundary.  This is required
	 * to make sure that all the resulting IB addresses will start at 0, for
	 * a zero-based queue.  By making sure we are aligned on at least a
	 * page, any offset we use into our queue will be the same as when we
	 * perform tavor_srq_modify() operations later.
	 */
	wqesz = (1 << srq->srq_wq_log_wqesz);
	srq->srq_wqinfo.qa_size = (1 << log_srq_size) * wqesz;
	srq->srq_wqinfo.qa_alloc_align = PAGESIZE;
	srq->srq_wqinfo.qa_bind_align = PAGESIZE;
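	/*
	 * For example (numbers for illustration only): a 64-byte WQE
	 * (srq_wq_log_wqesz == 6) combined with log_srq_size == 10 yields a
	 * 1024 * 64 = 64KB queue buffer, allocated and DMA-bound at page
	 * alignment.
	 */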
	if (srq_is_umap) {
		srq->srq_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		srq->srq_wqinfo.qa_location = wq_location;
	}
	status = tavor_queue_alloc(state, &srq->srq_wqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
		goto srqalloc_fail4;
	}
	buf = (uint32_t *)srq->srq_wqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Register the memory for the SRQ work queues.  The memory for the SRQ
	 * must be registered in the Tavor TPT tables.  This gives us the LKey
	 * to specify in the SRQ context later.  Note: If the work queue is to
	 * be allocated from DDR memory, then only a "bypass" mapping is
	 * appropriate.  And if the SRQ memory is user-mappable, then we force
	 * DDI_DMA_CONSISTENT mapping.  Also, in order to meet the alignment
	 * restriction, we pass the "mro_bind_override_addr" flag in the call
	 * to tavor_mr_register().  This guarantees that the resulting IB vaddr
	 * will be zero-based (modulo the offset into the first page).  If we
	 * fail here, we still have a bunch of resource and reference count
	 * cleanup to do.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
	    IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
	mr_attr.mr_len   = srq->srq_wqinfo.qa_size;
	mr_attr.mr_as    = NULL;
	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
	if (srq_is_umap) {
		mr_op.mro_bind_type   = state->ts_cfg_profile->cp_iommu_bypass;
	} else {
		if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
			mr_op.mro_bind_type =
			    state->ts_cfg_profile->cp_iommu_bypass;
			dma_xfer_mode =
			    state->ts_cfg_profile->cp_streaming_consistent;
			if (dma_xfer_mode == DDI_DMA_STREAMING) {
				mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
			}
		} else {
			mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
		}
	}
	mr_op.mro_bind_dmahdl = srq->srq_wqinfo.qa_dmahdl;
	mr_op.mro_bind_override_addr = 1;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
		goto srqalloc_fail5;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
	addr = mr->mr_bindinfo.bi_addr;
	lkey = mr->mr_lkey;

	/*
	 * Calculate the offset between the kernel virtual address space
	 * and the IB virtual address space.  This will be used when
	 * posting work requests to properly initialize each WQE.
	 */
	srq_desc_off = (uint64_t)(uintptr_t)srq->srq_wqinfo.qa_buf_aligned -
	    (uint64_t)mr->mr_bindinfo.bi_addr;
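	/*
	 * A sketch of the arithmetic: if the queue buffer sits at kernel
	 * virtual address K and was registered at IB virtual address V
	 * (zero-based modulo the first-page offset, per
	 * "mro_bind_override_addr" above), then srq_desc_off == K - V, and a
	 * WQE at kernel address K + n*wqesz is later advertised to the
	 * hardware at IB address (K + n*wqesz) - srq_desc_off == V + n*wqesz.
	 */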

	/*
	 * Create WQL and Wridlist for use by this SRQ
	 */
	srq->srq_wrid_wql = tavor_wrid_wql_create(state);
	if (srq->srq_wrid_wql == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wql create");
		goto srqalloc_fail6;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wrid_wql)))

	srq->srq_wridlist = tavor_wrid_get_list(1 << log_srq_size);
	if (srq->srq_wridlist == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wridlist create");
		goto srqalloc_fail7;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wridlist)))

	srq->srq_wridlist->wl_srq_en = 1;
	srq->srq_wridlist->wl_free_list_indx = -1;

	/*
	 * Fill in all the return arguments (if necessary).  This includes
	 * real queue size and real SGLs.
	 */
	if (real_sizes != NULL) {
		real_sizes->srq_wr_sz = (1 << log_srq_size);
		real_sizes->srq_sgl_sz = srq->srq_wq_sgl;
	}

	/*
	 * Fill in the SRQC entry.  This is the final step before passing
	 * ownership of the SRQC entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the SRQC.  Note: If this SRQ is going to be
	 * used for userland access, then we need to set the UAR page number
	 * appropriately (otherwise it's a "don't care")
	 */
	bzero(&srqc_entry, sizeof (tavor_hw_srqc_t));
	srqc_entry.wqe_addr_h	   = (addr >> 32);
	srqc_entry.next_wqe_addr_l = 0;
	srqc_entry.ds		   = (wqesz >> 4);
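	/*
	 * For illustration: the "ds" (descriptor size) field is expressed in
	 * 16-byte chunks (hence the shift by 4 above); a 64-byte WQE, for
	 * example, is programmed as ds == 4.
	 */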
	srqc_entry.state	   = TAVOR_SRQ_STATE_HW_OWNER;
	srqc_entry.pd		   = pd->pd_pdnum;
	srqc_entry.lkey		   = lkey;
	srqc_entry.wqe_cnt	   = 0;
	if (srq_is_umap) {
		srqc_entry.uar	   = uarpg;
	} else {
		srqc_entry.uar	   = 0;
	}

	/*
	 * Write the SRQC entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware (using the Tavor SW2HW_SRQ firmware
	 * command).  Note: In general, this operation shouldn't fail.  But
	 * if it does, we have to undo everything we've done above before
	 * returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_SRQ, &srqc_entry,
	    sizeof (tavor_hw_srqc_t), srq->srq_srqnum,
	    sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_SRQ command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_srq_alloc_sw2hw_srq_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_FAILURE, "tavor SW2HW_SRQ command");
		goto srqalloc_fail8;
	}

	/*
	 * Fill in the rest of the Tavor SRQ handle.  We can update
	 * the following fields for use in further operations on the SRQ.
	 */
	srq->srq_srqcrsrcp = srqc;
	srq->srq_rsrcp	   = rsrc;
	srq->srq_mrhdl	   = mr;
	srq->srq_refcnt	   = 0;
	srq->srq_is_umap   = srq_is_umap;
	srq->srq_uarpg	   = (srq->srq_is_umap) ? uarpg : 0;
	srq->srq_umap_dhp  = (devmap_cookie_t)NULL;
	srq->srq_pdhdl	   = pd;
	srq->srq_wq_lastwqeindx = -1;
	srq->srq_wq_bufsz  = (1 << log_srq_size);
	srq->srq_wq_buf	   = buf;
	srq->srq_desc_off  = srq_desc_off;
	srq->srq_hdlrarg   = (void *)ibt_srqhdl;
	srq->srq_state	   = 0;
	srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
	srq->srq_real_sizes.srq_sgl_sz = srq->srq_wq_sgl;

	/* Determine if later ddi_dma_sync will be necessary */
	srq->srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);

	/*
	 * Put SRQ handle in Tavor SRQNum-to-SRQhdl list.  Then fill in the
	 * "srqhdl" and return success
	 */
	ASSERT(state->ts_srqhdl[srqc->tr_indx] == NULL);
	state->ts_srqhdl[srqc->tr_indx] = srq;

	/*
	 * If this is a user-mappable SRQ, then we need to insert the
	 * previously allocated entry into the "userland resources database".
	 * This will allow for later lookup during devmap() (i.e. mmap())
	 * calls.
	 */
	if (srq->srq_is_umap) {
		tavor_umap_db_add(umapdb);
	} else {
		mutex_enter(&srq->srq_wrid_wql->wql_lock);
		tavor_wrid_list_srq_init(srq->srq_wridlist, srq, 0);
		mutex_exit(&srq->srq_wrid_wql->wql_lock);
	}

	*srqhdl = srq;

	TAVOR_TNF_EXIT(tavor_srq_alloc);
	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
srqalloc_fail8:
	kmem_free(srq->srq_wridlist->wl_wre, srq->srq_wridlist->wl_size *
	    sizeof (tavor_wrid_entry_t));
	kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));
srqalloc_fail7:
	tavor_wql_refcnt_dec(srq->srq_wrid_wql);
srqalloc_fail6:
	if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    TAVOR_SLEEPFLAG_FOR_CONTEXT()) != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister SRQ memory");
	}
srqalloc_fail5:
	tavor_queue_free(state, &srq->srq_wqinfo);
srqalloc_fail4:
	if (srq_is_umap) {
		tavor_umap_db_free(umapdb);
	}
srqalloc_fail3:
	tavor_rsrc_free(state, &rsrc);
srqalloc_fail2:
	tavor_rsrc_free(state, &srqc);
srqalloc_fail1:
	tavor_pd_refcnt_dec(pd);
srqalloc_fail:
	TNF_PROBE_1(tavor_srq_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_srq_alloc);
	return (status);
}


/*
 * tavor_srq_free()
 *    Context: Can be called only from user or kernel context.
 */
/* ARGSUSED */
int
tavor_srq_free(tavor_state_t *state, tavor_srqhdl_t *srqhdl, uint_t sleepflag)
{
	tavor_rsrc_t		*srqc, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	uint64_t		value;
	tavor_srqhdl_t		srq;
	tavor_mrhdl_t		mr;
	tavor_pdhdl_t		pd;
	tavor_hw_srqc_t		srqc_entry;
	uint32_t		srqnum;
	uint32_t		size;
	uint_t			maxprot;
	int			status;

	TAVOR_TNF_ENTER(tavor_srq_free);

	/*
	 * Pull all the necessary information from the Tavor Shared Receive
	 * Queue handle.  This is necessary here because the resource for the
	 * SRQ handle is going to be freed up as part of this operation.
	 */
	srq	= *srqhdl;
	mutex_enter(&srq->srq_lock);
	srqc	= srq->srq_srqcrsrcp;
	rsrc	= srq->srq_rsrcp;
	pd	= srq->srq_pdhdl;
	mr	= srq->srq_mrhdl;
	srqnum	= srq->srq_srqnum;

	/*
	 * If there are work queues still associated with the SRQ, then return
	 * an error.  Otherwise, we will be holding the SRQ lock.
	 */
	if (srq->srq_refcnt != 0) {
		mutex_exit(&srq->srq_lock);
		TNF_PROBE_1(tavor_srq_free_refcnt_fail, TAVOR_TNF_ERROR, "",
		    tnf_int, refcnt, srq->srq_refcnt);
		TAVOR_TNF_EXIT(tavor_srq_free);
		return (IBT_SRQ_IN_USE);
	}

	/*
	 * If this was a user-mappable SRQ, then we need to remove its entry
	 * from the "userland resources database".  If it is also currently
	 * mmap()'d out to a user process, then we need to call
	 * devmap_devmem_remap() to remap the SRQ memory to an invalid mapping.
	 * We also need to invalidate the SRQ tracking information for the
	 * user mapping.
	 */
	if (srq->srq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, srq->srq_srqnum,
		    MLNX_UMAP_SRQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
		    &umapdb);
		if (status != DDI_SUCCESS) {
			mutex_exit(&srq->srq_lock);
			TAVOR_WARNING(state, "failed to find in database");
			TAVOR_TNF_EXIT(tavor_srq_free);
			return (ibc_get_ci_failure(0));
		}
		tavor_umap_db_free(umapdb);
		if (srq->srq_umap_dhp != NULL) {
			maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
			status = devmap_devmem_remap(srq->srq_umap_dhp,
			    state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size,
			    maxprot, DEVMAP_MAPPING_INVALID, NULL);
			if (status != DDI_SUCCESS) {
				mutex_exit(&srq->srq_lock);
				TAVOR_WARNING(state, "failed in SRQ memory "
				    "devmap_devmem_remap()");
				TAVOR_TNF_EXIT(tavor_srq_free);
				return (ibc_get_ci_failure(0));
			}
			srq->srq_umap_dhp = (devmap_cookie_t)NULL;
		}
	}

	/*
	 * Put NULL into the Tavor SRQNum-to-SRQHdl list.  This will allow any
	 * in-progress events to detect that the SRQ corresponding to this
	 * number has been freed.
	 */
	state->ts_srqhdl[srqc->tr_indx] = NULL;

	mutex_exit(&srq->srq_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq));
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq->srq_wridlist));

	/*
	 * Reclaim SRQC entry from hardware (using the Tavor HW2SW_SRQ
	 * firmware command).  If the ownership transfer fails for any reason,
	 * then it is an indication that something (either in HW or SW) has
	 * gone seriously wrong.
	 */
	status = tavor_cmn_ownership_cmd_post(state, HW2SW_SRQ, &srqc_entry,
	    sizeof (tavor_hw_srqc_t), srqnum, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		TAVOR_WARNING(state, "failed to reclaim SRQC ownership");
		cmn_err(CE_CONT, "Tavor: HW2SW_SRQ command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_srq_free_hw2sw_srq_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		TAVOR_TNF_EXIT(tavor_srq_free);
		return (IBT_FAILURE);
	}

	/*
	 * Deregister the memory for the Shared Receive Queue.  If this fails
	 * for any reason, then it is an indication that something (either
	 * in HW or SW) has gone seriously wrong.  So we print a warning
	 * message and return.
	 */
	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister SRQ memory");
		TNF_PROBE_0(tavor_srq_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_srq_free);
		return (IBT_FAILURE);
	}

	/* Calculate the size and free the wridlist container */
	if (srq->srq_wridlist != NULL) {
		size = (srq->srq_wridlist->wl_size *
		    sizeof (tavor_wrid_entry_t));
		kmem_free(srq->srq_wridlist->wl_wre, size);
		kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));

		/*
		 * Release reference to WQL; If this is the last reference,
		 * this call also has the side effect of freeing up the
		 * 'srq_wrid_wql' memory.
		 */
		tavor_wql_refcnt_dec(srq->srq_wrid_wql);
	}

	/* Free the memory for the SRQ */
	tavor_queue_free(state, &srq->srq_wqinfo);

	/* Free the Tavor SRQ Handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free the SRQC entry resource */
	tavor_rsrc_free(state, &srqc);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the srqhdl pointer to NULL and return success */
	*srqhdl = NULL;

	TAVOR_TNF_EXIT(tavor_srq_free);
	return (DDI_SUCCESS);
}


/*
 * tavor_srq_modify()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_srq_modify(tavor_state_t *state, tavor_srqhdl_t srq, uint_t size,
    uint_t *real_size, uint_t sleepflag)
{
	tavor_qalloc_info_t	new_srqinfo, old_srqinfo;
	tavor_rsrc_t		*mtt, *mpt, *old_mtt;
	tavor_bind_info_t	bind;
	tavor_bind_info_t	old_bind;
	tavor_rsrc_pool_info_t	*rsrc_pool;
	tavor_mrhdl_t		mr;
	tavor_hw_mpt_t		mpt_entry;
	tavor_wrid_entry_t	*wre_new, *wre_old;
	uint64_t		mtt_ddrbaseaddr, mtt_addr;
	uint64_t		srq_desc_off;
	uint32_t		*buf, srq_old_bufsz;
	uint32_t		wqesz;
	uint_t			max_srq_size;
	uint_t			dma_xfer_mode, mtt_pgsize_bits;
	uint_t			srq_sync, log_srq_size, maxprot;
	uint_t			wq_location;
	int			status;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_srq_modify);

	/*
	 * Check the "inddr" flag.  This flag tells the driver whether the
	 * SRQ's work queues should come from normal system memory or be
	 * allocated from DDR memory.
	 */
	wq_location = state->ts_cfg_profile->cp_srq_wq_inddr;

	/*
	 * If size requested is larger than device capability, return
	 * Insufficient Resources
	 */
	max_srq_size = (1 << state->ts_cfg_profile->cp_log_max_srq_sz);
	if (size > max_srq_size) {
		TNF_PROBE_0(tavor_srq_modify_size_larger_than_maxsize,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_srq_modify);
		return (IBT_HCA_WR_EXCEEDED);
	}

	/*
	 * Calculate the appropriate size for the SRQ.
	 * Note:  All Tavor SRQs must be a power-of-2 in size.  Also
	 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE.  This step
	 * is to round the requested size up to the next highest power-of-2
	 */
	size = max(size, TAVOR_SRQ_MIN_SIZE);
	log_srq_size = highbit(size);
	if ((size & (size - 1)) == 0) {
		log_srq_size = log_srq_size - 1;
	}

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits).
	 */
	if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
		goto srqmodify_fail;
	}

	/*
	 * Allocate the memory for newly resized Shared Receive Queue.
	 *
	 * Note: If SRQ is not user-mappable, then it may come from either
	 * kernel system memory or from HCA-attached local DDR memory.
	 *
	 * Note2: We align this queue on a pagesize boundary.  This is required
	 * to make sure that all the resulting IB addresses will start at 0,
	 * for a zero-based queue.  By making sure we are aligned on at least a
	 * page, any offset we use into our queue will be the same as it was
	 * when we allocated it at tavor_srq_alloc() time.
	 */
	wqesz = (1 << srq->srq_wq_log_wqesz);
	new_srqinfo.qa_size = (1 << log_srq_size) * wqesz;
	new_srqinfo.qa_alloc_align = PAGESIZE;
	new_srqinfo.qa_bind_align  = PAGESIZE;
	if (srq->srq_is_umap) {
		new_srqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		new_srqinfo.qa_location = wq_location;
	}
	status = tavor_queue_alloc(state, &new_srqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
		goto srqmodify_fail;
	}
	buf = (uint32_t *)new_srqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Allocate the memory for the new WRE list.  This will be used later
	 * when we resize the wridlist based on the new SRQ size.
	 */
	wre_new = (tavor_wrid_entry_t *)kmem_zalloc((1 << log_srq_size) *
	    sizeof (tavor_wrid_entry_t), sleepflag);
	if (wre_new == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE,
		    "failed wre_new alloc");
		goto srqmodify_fail;
	}

	/*
	 * Fill in the "bind" struct.  This struct provides the majority
	 * of the information that will be used to distinguish between an
	 * "addr" binding (as is the case here) and a "buf" binding (see
	 * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
	 * which does most of the "heavy lifting" for the Tavor memory
	 * registration routines.
	 */
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(bind))
	bzero(&bind, sizeof (tavor_bind_info_t));
	bind.bi_type  = TAVOR_BINDHDL_VADDR;
	bind.bi_addr  = (uint64_t)(uintptr_t)buf;
	bind.bi_len   = new_srqinfo.qa_size;
	bind.bi_as    = NULL;
	bind.bi_flags = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
	    IBT_MR_NOSLEEP;
	bind.bi_flags |= IBT_MR_ENABLE_LOCAL_WRITE;
	if (srq->srq_is_umap) {
		bind.bi_bypass = state->ts_cfg_profile->cp_iommu_bypass;
	} else {
		if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
			bind.bi_bypass =
			    state->ts_cfg_profile->cp_iommu_bypass;
			dma_xfer_mode =
			    state->ts_cfg_profile->cp_streaming_consistent;
			if (dma_xfer_mode == DDI_DMA_STREAMING) {
				bind.bi_flags |= IBT_MR_NONCOHERENT;
			}
		} else {
			bind.bi_bypass = TAVOR_BINDMEM_BYPASS;
		}
	}
	status = tavor_mr_mtt_bind(state, &bind, new_srqinfo.qa_dmahdl, &mtt,
	    &mtt_pgsize_bits);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(status, "failed mtt bind");
		kmem_free(wre_new, (1 << log_srq_size) *
		    sizeof (tavor_wrid_entry_t));
		tavor_queue_free(state, &new_srqinfo);
		goto srqmodify_fail;
	}

	/*
	 * Calculate the offset between the kernel virtual address space
	 * and the IB virtual address space.  This will be used when
	 * posting work requests to properly initialize each WQE.
	 *
	 * Note: bind addr is zero-based (from alloc) so we calculate the
	 * correct new offset here.
	 */
	bind.bi_addr = bind.bi_addr & ((1 << mtt_pgsize_bits) - 1);
	srq_desc_off = (uint64_t)(uintptr_t)new_srqinfo.qa_buf_aligned -
	    (uint64_t)bind.bi_addr;

	/*
	 * Get the base address for the MTT table.  This will be necessary
	 * below when we are modifying the MPT entry.
	 */
	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.reg_win_len	= bind.bi_len;
	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
	mpt_entry.mttseg_addr_l = mtt_addr >> 6;
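	/*
	 * For illustration: "mtt_addr" is the DDR byte address of this SRQ's
	 * first MTT entry (each entry being (1 << TAVOR_MTT_SIZE_SHIFT)
	 * bytes).  The hardware takes the upper 32 bits in "mttseg_addr_h"
	 * and bits 31:6 in "mttseg_addr_l"; the low 6 bits are dropped on
	 * the assumption that MTT segments are 64-byte aligned.
	 */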

	/*
	 * Now we grab the SRQ lock.  Since we will be updating the actual
	 * SRQ location and the producer/consumer indexes, we should hold
	 * the lock.
	 *
	 * We do a TAVOR_NOSLEEP here (and below), though, because we are
	 * holding the "srq_lock" and if we got raised to interrupt level
	 * by priority inversion, we would not want to block in this routine
	 * waiting for success.
	 */
	mutex_enter(&srq->srq_lock);

	/*
	 * Copy old entries to new buffer
	 */
	srq_old_bufsz = srq->srq_wq_bufsz;
	bcopy(srq->srq_wq_buf, buf, srq_old_bufsz * wqesz);

	/* Determine if later ddi_dma_sync will be necessary */
	srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);

	/* Sync entire "new" SRQ for use by hardware (if necessary) */
	if (srq_sync) {
		(void) ddi_dma_sync(bind.bi_dmahdl, 0,
		    new_srqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Setup MPT information for use in the MODIFY_MPT command
	 */
	mr = srq->srq_mrhdl;
	mutex_enter(&mr->mr_lock);
	mpt = srq->srq_mrhdl->mr_mptrsrcp;

	/*
	 * MODIFY_MPT
	 *
	 * If this fails for any reason, then it is an indication that
	 * something (either in HW or SW) has gone seriously wrong.  So we
	 * print a warning message and return.
	 */
	status = tavor_modify_mpt_cmd_post(state, &mpt_entry, mpt->tr_indx,
	    TAVOR_CMD_MODIFY_MPT_RESIZESRQ, TAVOR_CMD_NOSLEEP_SPIN);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: MODIFY_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_mr_common_reg_sw2hw_mpt_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		TAVOR_TNF_FAIL(status, "MODIFY_MPT command failed");
		(void) tavor_mr_mtt_unbind(state, &bind, mtt);
		kmem_free(wre_new, (1 << log_srq_size) *
		    sizeof (tavor_wrid_entry_t));
		tavor_queue_free(state, &new_srqinfo);
		mutex_exit(&mr->mr_lock);
		mutex_exit(&srq->srq_lock);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Update the Tavor Shared Receive Queue handle with all the new
	 * information.  At the same time, save away all the necessary
	 * information for freeing up the old resources
	 */
	old_srqinfo	   = srq->srq_wqinfo;
	old_mtt		   = srq->srq_mrhdl->mr_mttrsrcp;
	bcopy(&srq->srq_mrhdl->mr_bindinfo, &old_bind,
	    sizeof (tavor_bind_info_t));

	/* Now set the new info */
	srq->srq_wqinfo	   = new_srqinfo;
	srq->srq_wq_buf	   = buf;
	srq->srq_wq_bufsz  = (1 << log_srq_size);
	bcopy(&bind, &srq->srq_mrhdl->mr_bindinfo, sizeof (tavor_bind_info_t));
	srq->srq_mrhdl->mr_mttrsrcp = mtt;
	srq->srq_desc_off  = srq_desc_off;
	srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);

	/* Update MR mtt pagesize */
	mr->mr_logmttpgsz = mtt_pgsize_bits;
	mutex_exit(&mr->mr_lock);

#ifdef __lock_lint
	mutex_enter(&srq->srq_wrid_wql->wql_lock);
#else
	if (srq->srq_wrid_wql != NULL) {
		mutex_enter(&srq->srq_wrid_wql->wql_lock);
	}
#endif

	/*
	 * Initialize new wridlist, if needed.
	 *
	 * If a wridlist already is setup on an SRQ (the QP associated with an
	 * SRQ has moved "from_reset") then we must update this wridlist based
	 * on the new SRQ size.  We allocate the new size of Work Request ID
	 * Entries, copy over the old entries to the new list, and
	 * re-initialize the srq wridlist in non-umap case
	 */
	wre_old = NULL;
	if (srq->srq_wridlist != NULL) {
		wre_old = srq->srq_wridlist->wl_wre;

		bcopy(wre_old, wre_new, srq_old_bufsz *
		    sizeof (tavor_wrid_entry_t));

		/* Setup new sizes in wre */
		srq->srq_wridlist->wl_wre = wre_new;
		srq->srq_wridlist->wl_size = srq->srq_wq_bufsz;

		if (!srq->srq_is_umap) {
			tavor_wrid_list_srq_init(srq->srq_wridlist, srq,
			    srq_old_bufsz);
		}
	}

#ifdef __lock_lint
	mutex_exit(&srq->srq_wrid_wql->wql_lock);
#else
	if (srq->srq_wrid_wql != NULL) {
		mutex_exit(&srq->srq_wrid_wql->wql_lock);
	}
#endif

	/*
	 * If "old" SRQ was a user-mappable SRQ that is currently mmap()'d out
	 * to a user process, then we need to call devmap_devmem_remap() to
	 * invalidate the mapping to the SRQ memory.  We also need to
	 * invalidate the SRQ tracking information for the user mapping.
	 *
	 * Note: On failure, the remap really shouldn't ever happen.  So, if it
	 * does, it is an indication that something has gone seriously wrong.
	 * So we print a warning message and return error (knowing, of course,
	 * that the "old" SRQ memory will be leaked)
	 */
	if ((srq->srq_is_umap) && (srq->srq_umap_dhp != NULL)) {
		maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
		status = devmap_devmem_remap(srq->srq_umap_dhp,
		    state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size, maxprot,
		    DEVMAP_MAPPING_INVALID, NULL);
		if (status != DDI_SUCCESS) {
			mutex_exit(&srq->srq_lock);
			TAVOR_WARNING(state, "failed in SRQ memory "
			    "devmap_devmem_remap()");
			/* We can, however, free the memory for old wre */
			if (wre_old != NULL) {
				kmem_free(wre_old, srq_old_bufsz *
				    sizeof (tavor_wrid_entry_t));
			}
			TAVOR_TNF_EXIT(tavor_srq_modify);
			return (ibc_get_ci_failure(0));
		}
		srq->srq_umap_dhp = (devmap_cookie_t)NULL;
	}

	/*
	 * Drop the SRQ lock now.  The only thing left to do is to free up
	 * the old resources.
	 */
	mutex_exit(&srq->srq_lock);

	/*
	 * Unbind the MTT entries.
	 */
	status = tavor_mr_mtt_unbind(state, &old_bind, old_mtt);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to unbind old SRQ memory");
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "failed to unbind (old)");
		goto srqmodify_fail;
	}

	/* Free the memory for old wre */
	if (wre_old != NULL) {
		kmem_free(wre_old, srq_old_bufsz *
		    sizeof (tavor_wrid_entry_t));
	}

	/* Free the memory for the old SRQ */
	tavor_queue_free(state, &old_srqinfo);

	/*
	 * Fill in the return arguments (if necessary).  This includes the
	 * real new shared receive queue size.
	 */
	if (real_size != NULL) {
		*real_size = (1 << log_srq_size);
	}

	TAVOR_TNF_EXIT(tavor_srq_modify);
	return (DDI_SUCCESS);

srqmodify_fail:
	TNF_PROBE_1(tavor_srq_modify_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_srq_modify);
	return (status);
}


/*
 * tavor_srq_numcalc()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_srq_numcalc(tavor_state_t *state, uint32_t indx, uint32_t *key)
{
	uint32_t	tmp, log_num_srq;

	/*
	 * Generate a simple key from counter.  Note:  We increment this
	 * static variable _intentionally_ without any kind of mutex around
	 * it.  First, single-threading all operations through a single lock
	 * would be a bad idea (from a performance point-of-view).  Second,
	 * the upper "unconstrained" bits don't really have to be unique
	 * because the lower bits are guaranteed to be (although we do make a
	 * best effort to ensure that they are).  Third, the window for the
	 * race (where both threads read and update the counter at the same
	 * time) is incredibly small.
	 */
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(tavor_debug_srqnum_cnt))
	log_num_srq = state->ts_cfg_profile->cp_log_num_srq;
	tmp = (tavor_debug_srqnum_cnt++) << log_num_srq;
	*key = (tmp | indx) & TAVOR_SRQ_MAXNUMBER_MSK;
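	/*
	 * For illustration: with cp_log_num_srq == 10, the low 10 bits of
	 * the returned key are just the SRQC table index ("indx"), while the
	 * upper bits carry the incrementing counter.  Two back-to-back
	 * allocations that reuse the same table entry thus still yield
	 * different SRQ numbers.
	 */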
}


/*
 * tavor_srq_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_srq_refcnt_inc(tavor_srqhdl_t srq)
{
	mutex_enter(&srq->srq_lock);
	TNF_PROBE_1_DEBUG(tavor_srq_refcnt_inc, TAVOR_TNF_TRACE, "",
	    tnf_uint, refcnt, srq->srq_refcnt);
	srq->srq_refcnt++;
	mutex_exit(&srq->srq_lock);
}


/*
 * tavor_srq_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_srq_refcnt_dec(tavor_srqhdl_t srq)
{
	mutex_enter(&srq->srq_lock);
	srq->srq_refcnt--;
	TNF_PROBE_1_DEBUG(tavor_srq_refcnt_dec, TAVOR_TNF_TRACE, "",
	    tnf_uint, refcnt, srq->srq_refcnt);
	mutex_exit(&srq->srq_lock);
}


/*
 * tavor_srqhdl_from_srqnum()
 *    Context: Can be called from interrupt or base context.
 *
 *    This routine is important because changing the unconstrained
 *    portion of the SRQ number is critical to the detection of a
 *    potential race condition in the SRQ handler code (i.e. the case
 *    where a SRQ is freed and alloc'd again before an event for the
 *    "old" SRQ can be handled).
 *
 *    While this is not a perfect solution (not sure that one exists)
 *    it does help to mitigate the chance that this race condition will
 *    cause us to deliver a "stale" event to the new SRQ owner.  Note:
 *    this solution does not scale well because the number of constrained
 *    bits increases (and, hence, the number of unconstrained bits
 *    decreases) as the number of supported SRQs grows.  For small and
 *    intermediate values, it should hopefully provide sufficient
 *    protection.
 */
tavor_srqhdl_t
tavor_srqhdl_from_srqnum(tavor_state_t *state, uint_t srqnum)
{
	uint_t	srqindx, srqmask;

	/* Calculate the SRQ table index from the srqnum */
	srqmask = (1 << state->ts_cfg_profile->cp_log_num_srq) - 1;
	srqindx = srqnum & srqmask;
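	/*
	 * For illustration: with cp_log_num_srq == 10, srqmask == 0x3FF, so
	 * the masking above strips the "unconstrained" counter bits added by
	 * tavor_srq_numcalc() and recovers the SRQC table index.
	 */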
	return (state->ts_srqhdl[srqindx]);
}


/*
 * tavor_srq_sgl_to_logwqesz()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl)
{
	uint_t	max_size, log2, actual_sgl;

	TAVOR_TNF_ENTER(tavor_srq_sgl_to_logwqesz);

	switch (wq_type) {
	case TAVOR_QP_WQ_TYPE_RECVQ:
		/*
		 * Use requested maximum SGL to calculate max descriptor size
		 * (while guaranteeing that the descriptor size is a
		 * power-of-2 cachelines).
		 */
		max_size = (TAVOR_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if ((max_size & (max_size - 1)) == 0) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_RCV_HDRS) >> 4;
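		/*
		 * For illustration (writing H for TAVOR_QP_WQE_MLX_RCV_HDRS):
		 * a request for 4 SGEs gives max_size = H + 64.  If that
		 * rounds up to a 128-byte WQE, then actual_sgl becomes
		 * (128 - H) >> 4, which may exceed the 4 entries requested.
		 */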
		break;

	default:
		TAVOR_WARNING(state, "unexpected work queue type");
		TNF_PROBE_0(tavor_srq_sgl_to_logwqesz_inv_wqtype_fail,
		    TAVOR_TNF_ERROR, "");
		break;
	}

	/* Fill in the return values */
	*logwqesz = log2;
	*max_sgl  = min(state->ts_cfg_profile->cp_srq_max_sgl, actual_sgl);

	TAVOR_TNF_EXIT(tavor_srq_sgl_to_logwqesz);
}