xref: /illumos-gate/usr/src/uts/common/io/ib/adapters/hermon/hermon_srq.c (revision 9e39c5ba00a55fa05777cc94b148296af305e135)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * hermon_srq.c
29  *    Hermon Shared Receive Queue Processing Routines
30  *
31  *    Implements all the routines necessary for allocating, freeing, querying,
32  *    modifying and posting shared receive queues.
33  */
34 
35 #include <sys/types.h>
36 #include <sys/conf.h>
37 #include <sys/ddi.h>
38 #include <sys/sunddi.h>
39 #include <sys/modctl.h>
40 #include <sys/bitmap.h>
41 
42 #include <sys/ib/adapters/hermon/hermon.h>
43 
44 static void hermon_srq_sgl_to_logwqesz(hermon_state_t *state, uint_t num_sgl,
45     hermon_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl);
46 
47 /*
48  * hermon_srq_alloc()
49  *    Context: Can be called only from user or kernel context.
50  */
51 int
52 hermon_srq_alloc(hermon_state_t *state, hermon_srq_info_t *srqinfo,
53     uint_t sleepflag)
54 {
55 	ibt_srq_hdl_t		ibt_srqhdl;
56 	hermon_pdhdl_t		pd;
57 	ibt_srq_sizes_t		*sizes;
58 	ibt_srq_sizes_t		*real_sizes;
59 	hermon_srqhdl_t		*srqhdl;
60 	ibt_srq_flags_t		flags;
61 	hermon_rsrc_t		*srqc, *rsrc;
62 	hermon_hw_srqc_t	srqc_entry;
63 	uint32_t		*buf;
64 	hermon_srqhdl_t		srq;
65 	hermon_umap_db_entry_t	*umapdb;
66 	ibt_mr_attr_t		mr_attr;
67 	hermon_mr_options_t	mr_op;
68 	hermon_mrhdl_t		mr;
69 	uint64_t		value, srq_desc_off;
70 	uint32_t		log_srq_size;
71 	uint32_t		uarpg;
72 	uint_t			srq_is_umap;
73 	int			flag, status;
74 	uint_t			max_sgl;
75 	uint_t			wqesz;
76 	uint_t			srq_wr_sz;
77 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sizes))
78 
79 	/*
80 	 * options-->wq_location used to be for location, now explicitly
81 	 * LOCATION_NORMAL
82 	 */
83 
84 	/*
85 	 * Extract the necessary info from the hermon_srq_info_t structure
86 	 */
87 	real_sizes = srqinfo->srqi_real_sizes;
88 	sizes	   = srqinfo->srqi_sizes;
89 	pd	   = srqinfo->srqi_pd;
90 	ibt_srqhdl = srqinfo->srqi_ibt_srqhdl;
91 	flags	   = srqinfo->srqi_flags;
92 	srqhdl	   = srqinfo->srqi_srqhdl;
93 
94 	/*
95 	 * Determine whether SRQ is being allocated for userland access or
96 	 * whether it is being allocated for kernel access.  If the SRQ is
97 	 * being allocated for userland access, then lookup the UAR doorbell
98 	 * page number for the current process.  Note:  If this is not found
99 	 * (e.g. if the process has not previously open()'d the Hermon driver),
100 	 * then an error is returned.
101 	 */
102 	srq_is_umap = (flags & IBT_SRQ_USER_MAP) ? 1 : 0;
103 	if (srq_is_umap) {
104 		status = hermon_umap_db_find(state->hs_instance, ddi_get_pid(),
105 		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
106 		if (status != DDI_SUCCESS) {
107 			status = IBT_INVALID_PARAM;
108 			goto srqalloc_fail3;
109 		}
110 		uarpg = ((hermon_rsrc_t *)(uintptr_t)value)->hr_indx;
111 	} else {
112 		uarpg = state->hs_kernel_uar_index;
113 	}
114 
115 	/* Increase PD refcnt */
116 	hermon_pd_refcnt_inc(pd);
117 
118 	/* Allocate an SRQ context entry */
119 	status = hermon_rsrc_alloc(state, HERMON_SRQC, 1, sleepflag, &srqc);
120 	if (status != DDI_SUCCESS) {
121 		status = IBT_INSUFF_RESOURCE;
122 		goto srqalloc_fail1;
123 	}
124 
125 	/* Allocate the SRQ Handle entry */
126 	status = hermon_rsrc_alloc(state, HERMON_SRQHDL, 1, sleepflag, &rsrc);
127 	if (status != DDI_SUCCESS) {
128 		status = IBT_INSUFF_RESOURCE;
129 		goto srqalloc_fail2;
130 	}
131 
132 	srq = (hermon_srqhdl_t)rsrc->hr_addr;
133 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq))
134 
135 	bzero(srq, sizeof (struct hermon_sw_srq_s));
136 	/* Calculate the SRQ number */
137 
138 	/* just use the index, implicit in Hermon */
139 	srq->srq_srqnum = srqc->hr_indx;
140 
141 	/*
142 	 * If this will be a user-mappable SRQ, then allocate an entry for
143 	 * the "userland resources database".  This will later be added to
144 	 * the database (after all further SRQ operations are successful).
145 	 * If we fail here, we must undo the reference counts and the
146 	 * previous resource allocation.
147 	 */
148 	if (srq_is_umap) {
149 		umapdb = hermon_umap_db_alloc(state->hs_instance,
150 		    srq->srq_srqnum, MLNX_UMAP_SRQMEM_RSRC,
151 		    (uint64_t)(uintptr_t)rsrc);
152 		if (umapdb == NULL) {
153 			status = IBT_INSUFF_RESOURCE;
154 			goto srqalloc_fail3;
155 		}
156 	}
157 
158 	/*
159 	 * Allocate the doorbell record.  Hermon just needs one for the
160 	 * SRQ, and use uarpg (above) as the uar index
161 	 */
162 
163 	status = hermon_dbr_alloc(state, uarpg, &srq->srq_wq_dbr_acchdl,
164 	    &srq->srq_wq_vdbr, &srq->srq_wq_pdbr, &srq->srq_rdbr_mapoffset);
165 	if (status != DDI_SUCCESS) {
166 		status = IBT_INSUFF_RESOURCE;
167 		goto srqalloc_fail4;
168 	}
169 
170 	/*
171 	 * Calculate the appropriate size for the SRQ.
172 	 * Note:  All Hermon SRQs must be a power-of-2 in size.  Also
173 	 * they may not be any smaller than HERMON_SRQ_MIN_SIZE.  This step
174 	 * is to round the requested size up to the next highest power-of-2
175 	 */
176 	srq_wr_sz = max(sizes->srq_wr_sz + 1, HERMON_SRQ_MIN_SIZE);
177 	log_srq_size = highbit(srq_wr_sz);
178 	if ((srq_wr_sz & (srq_wr_sz - 1)) == 0) {
179 		log_srq_size = log_srq_size - 1;
180 	}
181 
182 	/*
183 	 * Next we verify that the rounded-up size is valid (i.e. consistent
184 	 * with the device limits and/or software-configured limits).  If not,
185 	 * then obviously we have a lot of cleanup to do before returning.
186 	 */
187 	if (log_srq_size > state->hs_cfg_profile->cp_log_max_srq_sz) {
188 		status = IBT_HCA_WR_EXCEEDED;
189 		goto srqalloc_fail4a;
190 	}
191 
192 	/*
193 	 * Next we verify that the requested number of SGL is valid (i.e.
194 	 * consistent with the device limits and/or software-configured
195 	 * limits).  If not, then obviously the same cleanup needs to be done.
196 	 */
197 	max_sgl = state->hs_ibtfinfo.hca_attr->hca_max_srq_sgl;
198 	if (sizes->srq_sgl_sz > max_sgl) {
199 		status = IBT_HCA_SGL_EXCEEDED;
200 		goto srqalloc_fail4a;
201 	}
202 
203 	/*
204 	 * Determine the SRQ's WQE sizes.  This depends on the requested
205 	 * number of SGLs.  Note: This also has the side-effect of
206 	 * calculating the real number of SGLs (for the calculated WQE size)
207 	 */
208 	hermon_srq_sgl_to_logwqesz(state, sizes->srq_sgl_sz,
209 	    HERMON_QP_WQ_TYPE_RECVQ, &srq->srq_wq_log_wqesz,
210 	    &srq->srq_wq_sgl);
211 
212 	/*
213 	 * Allocate the memory for SRQ work queues.  Note:  The location from
214 	 * which we will allocate these work queues is always
215 	 * QUEUE_LOCATION_NORMAL.  Since Hermon work queues are not
216 	 * allowed to cross a 32-bit (4GB) boundary, the alignment of the work
217 	 * queue memory is very important.  We used to allocate work queues
218 	 * (the combined receive and send queues) so that they would be aligned
219 	 * on their combined size.  That alignment guaranteed that they would
220 	 * never cross the 4GB boundary (Hermon work queues are on the order of
221 	 * MBs at maximum).  Now we are able to relax this alignment constraint
222 	 * by ensuring that the IB address assigned to the queue memory (as a
223 	 * result of the hermon_mr_register() call) is offset from zero.
224 	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
225 	 * guarantee the alignment, but when attempting to use IOMMU bypass
226 	 * mode we found that we were not allowed to specify any alignment that
227 	 * was more restrictive than the system page size.  So we avoided this
228 	 * constraint by passing two alignment values, one for the memory
229 	 * allocation itself and the other for the DMA handle (for later bind).
230 	 * This used to cause more memory than necessary to be allocated (in
231 	 * order to guarantee the more restrictive alignment contraint).  But
232 	 * be guaranteeing the zero-based IB virtual address for the queue, we
233 	 * are able to conserve this memory.
234 	 *
235 	 * Note: If SRQ is not user-mappable, then it may come from either
236 	 * kernel system memory or from HCA-attached local DDR memory.
237 	 *
238 	 * Note2: We align this queue on a pagesize boundary.  This is required
239 	 * to make sure that all the resulting IB addresses will start at 0, for
240 	 * a zero-based queue.  By making sure we are aligned on at least a
241 	 * page, any offset we use into our queue will be the same as when we
242 	 * perform hermon_srq_modify() operations later.
243 	 */
244 	wqesz = (1 << srq->srq_wq_log_wqesz);
245 	srq->srq_wqinfo.qa_size = (1 << log_srq_size) * wqesz;
246 	srq->srq_wqinfo.qa_alloc_align = PAGESIZE;
247 	srq->srq_wqinfo.qa_bind_align = PAGESIZE;
248 	if (srq_is_umap) {
249 		srq->srq_wqinfo.qa_location = HERMON_QUEUE_LOCATION_USERLAND;
250 	} else {
251 		srq->srq_wqinfo.qa_location = HERMON_QUEUE_LOCATION_NORMAL;
252 	}
253 	status = hermon_queue_alloc(state, &srq->srq_wqinfo, sleepflag);
254 	if (status != DDI_SUCCESS) {
255 		status = IBT_INSUFF_RESOURCE;
256 		goto srqalloc_fail4a;
257 	}
258 	buf = (uint32_t *)srq->srq_wqinfo.qa_buf_aligned;
259 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
260 
261 	/*
262 	 * Register the memory for the SRQ work queues.  The memory for the SRQ
263 	 * must be registered in the Hermon cMPT tables.  This gives us the LKey
264 	 * to specify in the SRQ context later.  Note: If the work queue is to
265 	 * be allocated from DDR memory, then only a "bypass" mapping is
266 	 * appropriate.  And if the SRQ memory is user-mappable, then we force
267 	 * DDI_DMA_CONSISTENT mapping.  Also, in order to meet the alignment
268 	 * restriction, we pass the "mro_bind_override_addr" flag in the call
269 	 * to hermon_mr_register().  This guarantees that the resulting IB vaddr
270 	 * will be zero-based (modulo the offset into the first page).  If we
271 	 * fail here, we still have the bunch of resource and reference count
272 	 * cleanup to do.
273 	 */
274 	flag = (sleepflag == HERMON_SLEEP) ? IBT_MR_SLEEP :
275 	    IBT_MR_NOSLEEP;
276 	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
277 	mr_attr.mr_len   = srq->srq_wqinfo.qa_size;
278 	mr_attr.mr_as    = NULL;
279 	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
280 	mr_op.mro_bind_type   = state->hs_cfg_profile->cp_iommu_bypass;
281 	mr_op.mro_bind_dmahdl = srq->srq_wqinfo.qa_dmahdl;
282 	mr_op.mro_bind_override_addr = 1;
283 	status = hermon_mr_register(state, pd, &mr_attr, &mr,
284 	    &mr_op, HERMON_SRQ_CMPT);
285 	if (status != DDI_SUCCESS) {
286 		status = IBT_INSUFF_RESOURCE;
287 		goto srqalloc_fail5;
288 	}
289 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
290 
291 	/*
292 	 * Calculate the offset between the kernel virtual address space
293 	 * and the IB virtual address space.  This will be used when
294 	 * posting work requests to properly initialize each WQE.
295 	 */
296 	srq_desc_off = (uint64_t)(uintptr_t)srq->srq_wqinfo.qa_buf_aligned -
297 	    (uint64_t)mr->mr_bindinfo.bi_addr;
298 
299 	srq->srq_wq_wqhdr = hermon_wrid_wqhdr_create(1 << log_srq_size);
300 
301 	/*
302 	 * Fill in all the return arguments (if necessary).  This includes
303 	 * real queue size and real SGLs.
304 	 */
305 	if (real_sizes != NULL) {
306 		real_sizes->srq_wr_sz = (1 << log_srq_size) - 1;
307 		real_sizes->srq_sgl_sz = srq->srq_wq_sgl;
308 	}
309 
310 	/*
311 	 * Fill in the SRQC entry.  This is the final step before passing
312 	 * ownership of the SRQC entry to the Hermon hardware.  We use all of
313 	 * the information collected/calculated above to fill in the
314 	 * requisite portions of the SRQC.  Note: If this SRQ is going to be
315 	 * used for userland access, then we need to set the UAR page number
316 	 * appropriately (otherwise it's a "don't care")
317 	 */
318 	bzero(&srqc_entry, sizeof (hermon_hw_srqc_t));
319 	srqc_entry.state	   = HERMON_SRQ_STATE_HW_OWNER;
320 	srqc_entry.log_srq_size	   = log_srq_size;
321 	srqc_entry.srqn		   = srq->srq_srqnum;
322 	srqc_entry.log_rq_stride   = srq->srq_wq_log_wqesz - 4;
323 					/* 16-byte chunks */
324 
325 	srqc_entry.page_offs	   = srq->srq_wqinfo.qa_pgoffs >> 6;
326 	srqc_entry.log2_pgsz	   = mr->mr_log2_pgsz;
327 	srqc_entry.mtt_base_addrh  = (uint32_t)((mr->mr_mttaddr >> 32) & 0xFF);
328 	srqc_entry.mtt_base_addrl  = mr->mr_mttaddr >> 3;
329 	srqc_entry.pd		   = pd->pd_pdnum;
330 	srqc_entry.dbr_addrh = (uint32_t)((uint64_t)srq->srq_wq_pdbr >> 32);
331 	srqc_entry.dbr_addrl = (uint32_t)((uint64_t)srq->srq_wq_pdbr >> 2);
332 
333 	/*
334 	 * all others - specifically, xrcd, cqn_xrc, lwm, wqe_cnt, and wqe_cntr
335 	 * are zero thanks to the bzero of the structure
336 	 */
337 
338 	/*
339 	 * Write the SRQC entry to hardware.  Lastly, we pass ownership of
340 	 * the entry to the hardware (using the Hermon SW2HW_SRQ firmware
341 	 * command).  Note: In general, this operation shouldn't fail.  But
342 	 * if it does, we have to undo everything we've done above before
343 	 * returning error.
344 	 */
345 	status = hermon_cmn_ownership_cmd_post(state, SW2HW_SRQ, &srqc_entry,
346 	    sizeof (hermon_hw_srqc_t), srq->srq_srqnum,
347 	    sleepflag);
348 	if (status != HERMON_CMD_SUCCESS) {
349 		cmn_err(CE_CONT, "Hermon: SW2HW_SRQ command failed: %08x\n",
350 		    status);
351 		if (status == HERMON_CMD_INVALID_STATUS) {
352 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
353 		}
354 		status = ibc_get_ci_failure(0);
355 		goto srqalloc_fail8;
356 	}
357 
358 	/*
359 	 * Fill in the rest of the Hermon SRQ handle.  We can update
360 	 * the following fields for use in further operations on the SRQ.
361 	 */
362 	srq->srq_srqcrsrcp = srqc;
363 	srq->srq_rsrcp	   = rsrc;
364 	srq->srq_mrhdl	   = mr;
365 	srq->srq_refcnt	   = 0;
366 	srq->srq_is_umap   = srq_is_umap;
367 	srq->srq_uarpg	   = uarpg;
368 	srq->srq_umap_dhp  = (devmap_cookie_t)NULL;
369 	srq->srq_pdhdl	   = pd;
370 	srq->srq_wq_bufsz  = (1 << log_srq_size);
371 	srq->srq_wq_buf	   = buf;
372 	srq->srq_desc_off  = srq_desc_off;
373 	srq->srq_hdlrarg   = (void *)ibt_srqhdl;
374 	srq->srq_state	   = 0;
375 	srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
376 	srq->srq_real_sizes.srq_sgl_sz = srq->srq_wq_sgl;
377 
378 	/*
379 	 * Put SRQ handle in Hermon SRQNum-to-SRQhdl list.  Then fill in the
380 	 * "srqhdl" and return success
381 	 */
382 	ASSERT(state->hs_srqhdl[srqc->hr_indx] == NULL);
383 	state->hs_srqhdl[srqc->hr_indx] = srq;
384 
385 	/*
386 	 * If this is a user-mappable SRQ, then we need to insert the
387 	 * previously allocated entry into the "userland resources database".
388 	 * This will allow for later lookup during devmap() (i.e. mmap())
389 	 * calls.
390 	 */
391 	if (srq->srq_is_umap) {
392 		hermon_umap_db_add(umapdb);
393 	} else {	/* initialize work queue for kernel SRQs */
394 		int i, len, last;
395 		uint16_t *desc;
396 
397 		desc = (uint16_t *)buf;
398 		len = wqesz / sizeof (*desc);
399 		last = srq->srq_wq_bufsz - 1;
400 		for (i = 0; i < last; i++) {
401 			desc[1] = htons(i + 1);
402 			desc += len;
403 		}
404 		srq->srq_wq_wqhdr->wq_tail = last;
405 		srq->srq_wq_wqhdr->wq_head = 0;
406 	}
407 
408 	*srqhdl = srq;
409 
410 	return (status);
411 
412 /*
413  * The following is cleanup for all possible failure cases in this routine
414  */
415 srqalloc_fail8:
416 	hermon_wrid_wqhdr_destroy(srq->srq_wq_wqhdr);
417 srqalloc_fail7:
418 	if (hermon_mr_deregister(state, &mr, HERMON_MR_DEREG_ALL,
419 	    HERMON_SLEEPFLAG_FOR_CONTEXT()) != DDI_SUCCESS) {
420 		HERMON_WARNING(state, "failed to deregister SRQ memory");
421 	}
422 srqalloc_fail5:
423 	hermon_queue_free(&srq->srq_wqinfo);
424 srqalloc_fail4a:
425 	hermon_dbr_free(state, uarpg, srq->srq_wq_vdbr);
426 srqalloc_fail4:
427 	if (srq_is_umap) {
428 		hermon_umap_db_free(umapdb);
429 	}
430 srqalloc_fail3:
431 	hermon_rsrc_free(state, &rsrc);
432 srqalloc_fail2:
433 	hermon_rsrc_free(state, &srqc);
434 srqalloc_fail1:
435 	hermon_pd_refcnt_dec(pd);
436 srqalloc_fail:
437 	return (status);
438 }
439 
440 
/*
 * hermon_srq_free()
 *    Context: Can be called only from user or kernel context.
 *
 *    Tears down an SRQ in the reverse order of hermon_srq_alloc():
 *    removes any userland mapping, reclaims the SRQ context from the
 *    hardware (HW2SW_SRQ), deregisters and frees the work queue memory,
 *    frees the doorbell record and resources, and drops the PD refcnt.
 *    Returns IBT_SRQ_IN_USE if any QP still references this SRQ.
 */
/* ARGSUSED */
int
hermon_srq_free(hermon_state_t *state, hermon_srqhdl_t *srqhdl,
    uint_t sleepflag)
{
	hermon_rsrc_t		*srqc, *rsrc;
	hermon_umap_db_entry_t	*umapdb;
	uint64_t		value;
	hermon_srqhdl_t		srq;
	hermon_mrhdl_t		mr;
	hermon_pdhdl_t		pd;
	hermon_hw_srqc_t	srqc_entry;
	uint32_t		srqnum;
	uint_t			maxprot;
	int			status;

	/*
	 * Pull all the necessary information from the Hermon Shared Receive
	 * Queue handle.  This is necessary here because the resource for the
	 * SRQ handle is going to be freed up as part of this operation.
	 */
	srq	= *srqhdl;
	mutex_enter(&srq->srq_lock);
	srqc	= srq->srq_srqcrsrcp;
	rsrc	= srq->srq_rsrcp;
	pd	= srq->srq_pdhdl;
	mr	= srq->srq_mrhdl;
	srqnum	= srq->srq_srqnum;

	/*
	 * If there are work queues still associated with the SRQ, then return
	 * an error.  Otherwise, we will be holding the SRQ lock.
	 */
	if (srq->srq_refcnt != 0) {
		mutex_exit(&srq->srq_lock);
		return (IBT_SRQ_IN_USE);
	}

	/*
	 * If this was a user-mappable SRQ, then we need to remove its entry
	 * from the "userland resources database".  If it is also currently
	 * mmap()'d out to a user process, then we need to call
	 * devmap_devmem_remap() to remap the SRQ memory to an invalid mapping.
	 * We also need to invalidate the SRQ tracking information for the
	 * user mapping.
	 */
	if (srq->srq_is_umap) {
		status = hermon_umap_db_find(state->hs_instance,
		    srq->srq_srqnum, MLNX_UMAP_SRQMEM_RSRC, &value,
		    HERMON_UMAP_DB_REMOVE, &umapdb);
		if (status != DDI_SUCCESS) {
			mutex_exit(&srq->srq_lock);
			HERMON_WARNING(state, "failed to find in database");
			return (ibc_get_ci_failure(0));
		}
		hermon_umap_db_free(umapdb);
		if (srq->srq_umap_dhp != NULL) {
			/* Invalidate the user mmap() of the SRQ memory */
			maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
			status = devmap_devmem_remap(srq->srq_umap_dhp,
			    state->hs_dip, 0, 0, srq->srq_wqinfo.qa_size,
			    maxprot, DEVMAP_MAPPING_INVALID, NULL);
			if (status != DDI_SUCCESS) {
				mutex_exit(&srq->srq_lock);
				HERMON_WARNING(state, "failed in SRQ memory "
				    "devmap_devmem_remap()");
				return (ibc_get_ci_failure(0));
			}
			srq->srq_umap_dhp = (devmap_cookie_t)NULL;
		}
	}

	/*
	 * Put NULL into the Hermon SRQNum-to-SRQHdl list.  This will allow any
	 * in-progress events to detect that the SRQ corresponding to this
	 * number has been freed.
	 */
	state->hs_srqhdl[srqc->hr_indx] = NULL;

	mutex_exit(&srq->srq_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq));

	/*
	 * Reclaim SRQC entry from hardware (using the Hermon HW2SW_SRQ
	 * firmware command).  If the ownership transfer fails for any reason,
	 * then it is an indication that something (either in HW or SW) has
	 * gone seriously wrong.  Note: the "srqc_entry" buffer receives the
	 * final hardware state of the context; its contents are not used here.
	 */
	status = hermon_cmn_ownership_cmd_post(state, HW2SW_SRQ, &srqc_entry,
	    sizeof (hermon_hw_srqc_t), srqnum, sleepflag);
	if (status != HERMON_CMD_SUCCESS) {
		HERMON_WARNING(state, "failed to reclaim SRQC ownership");
		cmn_err(CE_CONT, "Hermon: HW2SW_SRQ command failed: %08x\n",
		    status);
		if (status == HERMON_CMD_INVALID_STATUS) {
			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
		}
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Deregister the memory for the Shared Receive Queue.  If this fails
	 * for any reason, then it is an indication that something (either
	 * in HW or SW) has gone seriously wrong.  So we print a warning
	 * message and return.
	 */
	status = hermon_mr_deregister(state, &mr, HERMON_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		HERMON_WARNING(state, "failed to deregister SRQ memory");
		return (IBT_FAILURE);
	}

	/* Tear down the WRID tracking header */
	hermon_wrid_wqhdr_destroy(srq->srq_wq_wqhdr);

	/* Free the memory for the SRQ */
	hermon_queue_free(&srq->srq_wqinfo);

	/* Free the dbr */
	hermon_dbr_free(state, srq->srq_uarpg, srq->srq_wq_vdbr);

	/* Free the Hermon SRQ Handle */
	hermon_rsrc_free(state, &rsrc);

	/* Free the SRQC entry resource */
	hermon_rsrc_free(state, &srqc);

	/* Decrement the reference count on the protection domain (PD) */
	hermon_pd_refcnt_dec(pd);

	/* Set the srqhdl pointer to NULL and return success */
	*srqhdl = NULL;

	return (DDI_SUCCESS);
}
579 
580 
581 /*
582  * hermon_srq_modify()
583  *    Context: Can be called only from user or kernel context.
584  */
585 int
586 hermon_srq_modify(hermon_state_t *state, hermon_srqhdl_t srq, uint_t size,
587     uint_t *real_size, uint_t sleepflag)
588 {
589 	hermon_qalloc_info_t	new_srqinfo, old_srqinfo;
590 	hermon_rsrc_t		*mtt, *old_mtt;
591 	hermon_bind_info_t	bind;
592 	hermon_bind_info_t	old_bind;
593 	hermon_mrhdl_t		mr;
594 	hermon_hw_srqc_t	srqc_entry;
595 	hermon_hw_dmpt_t	mpt_entry;
596 	uint64_t		*wre_new, *wre_old;
597 	uint64_t		mtt_addr;
598 	uint64_t		srq_pgoffs;
599 	uint64_t		srq_desc_off;
600 	uint32_t		*buf, srq_old_bufsz;
601 	uint32_t		wqesz;
602 	uint_t			max_srq_size;
603 	uint_t			mtt_pgsize_bits;
604 	uint_t			log_srq_size, maxprot;
605 	int			status;
606 
607 	if ((state->hs_devlim.mod_wr_srq == 0) ||
608 	    (state->hs_cfg_profile->cp_srq_resize_enabled == 0))
609 		return (IBT_NOT_SUPPORTED);
610 
611 	/*
612 	 * If size requested is larger than device capability, return
613 	 * Insufficient Resources
614 	 */
615 	max_srq_size = (1 << state->hs_cfg_profile->cp_log_max_srq_sz);
616 	if (size > max_srq_size) {
617 		return (IBT_HCA_WR_EXCEEDED);
618 	}
619 
620 	/*
621 	 * Calculate the appropriate size for the SRQ.
622 	 * Note:  All Hermon SRQs must be a power-of-2 in size.  Also
623 	 * they may not be any smaller than HERMON_SRQ_MIN_SIZE.  This step
624 	 * is to round the requested size up to the next highest power-of-2
625 	 */
626 	size = max(size, HERMON_SRQ_MIN_SIZE);
627 	log_srq_size = highbit(size);
628 	if ((size & (size - 1)) == 0) {
629 		log_srq_size = log_srq_size - 1;
630 	}
631 
632 	/*
633 	 * Next we verify that the rounded-up size is valid (i.e. consistent
634 	 * with the device limits and/or software-configured limits).
635 	 */
636 	if (log_srq_size > state->hs_cfg_profile->cp_log_max_srq_sz) {
637 		status = IBT_HCA_WR_EXCEEDED;
638 		goto srqmodify_fail;
639 	}
640 
641 	/*
642 	 * Allocate the memory for newly resized Shared Receive Queue.
643 	 *
644 	 * Note: If SRQ is not user-mappable, then it may come from either
645 	 * kernel system memory or from HCA-attached local DDR memory.
646 	 *
647 	 * Note2: We align this queue on a pagesize boundary.  This is required
648 	 * to make sure that all the resulting IB addresses will start at 0,
649 	 * for a zero-based queue.  By making sure we are aligned on at least a
650 	 * page, any offset we use into our queue will be the same as it was
651 	 * when we allocated it at hermon_srq_alloc() time.
652 	 */
653 	wqesz = (1 << srq->srq_wq_log_wqesz);
654 	new_srqinfo.qa_size = (1 << log_srq_size) * wqesz;
655 	new_srqinfo.qa_alloc_align = PAGESIZE;
656 	new_srqinfo.qa_bind_align  = PAGESIZE;
657 	if (srq->srq_is_umap) {
658 		new_srqinfo.qa_location = HERMON_QUEUE_LOCATION_USERLAND;
659 	} else {
660 		new_srqinfo.qa_location = HERMON_QUEUE_LOCATION_NORMAL;
661 	}
662 	status = hermon_queue_alloc(state, &new_srqinfo, sleepflag);
663 	if (status != DDI_SUCCESS) {
664 		status = IBT_INSUFF_RESOURCE;
665 		goto srqmodify_fail;
666 	}
667 	buf = (uint32_t *)new_srqinfo.qa_buf_aligned;
668 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
669 
670 	/*
671 	 * Allocate the memory for the new WRE list.  This will be used later
672 	 * when we resize the wridlist based on the new SRQ size.
673 	 */
674 	wre_new = kmem_zalloc((1 << log_srq_size) * sizeof (uint64_t),
675 	    sleepflag);
676 	if (wre_new == NULL) {
677 		status = IBT_INSUFF_RESOURCE;
678 		goto srqmodify_fail;
679 	}
680 
681 	/*
682 	 * Fill in the "bind" struct.  This struct provides the majority
683 	 * of the information that will be used to distinguish between an
684 	 * "addr" binding (as is the case here) and a "buf" binding (see
685 	 * below).  The "bind" struct is later passed to hermon_mr_mem_bind()
686 	 * which does most of the "heavy lifting" for the Hermon memory
687 	 * registration routines.
688 	 */
689 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(bind))
690 	bzero(&bind, sizeof (hermon_bind_info_t));
691 	bind.bi_type  = HERMON_BINDHDL_VADDR;
692 	bind.bi_addr  = (uint64_t)(uintptr_t)buf;
693 	bind.bi_len   = new_srqinfo.qa_size;
694 	bind.bi_as    = NULL;
695 	bind.bi_flags = sleepflag == HERMON_SLEEP ? IBT_MR_SLEEP :
696 	    IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
697 	bind.bi_bypass = state->hs_cfg_profile->cp_iommu_bypass;
698 
699 	status = hermon_mr_mtt_bind(state, &bind, new_srqinfo.qa_dmahdl, &mtt,
700 	    &mtt_pgsize_bits, 0); /* no relaxed ordering */
701 	if (status != DDI_SUCCESS) {
702 		status = status;
703 		kmem_free(wre_new, (1 << log_srq_size) *
704 		    sizeof (uint64_t));
705 		hermon_queue_free(&new_srqinfo);
706 		goto srqmodify_fail;
707 	}
708 
709 	/*
710 	 * Calculate the offset between the kernel virtual address space
711 	 * and the IB virtual address space.  This will be used when
712 	 * posting work requests to properly initialize each WQE.
713 	 *
714 	 * Note: bind addr is zero-based (from alloc) so we calculate the
715 	 * correct new offset here.
716 	 */
717 	bind.bi_addr = bind.bi_addr & ((1 << mtt_pgsize_bits) - 1);
718 	srq_desc_off = (uint64_t)(uintptr_t)new_srqinfo.qa_buf_aligned -
719 	    (uint64_t)bind.bi_addr;
720 	srq_pgoffs   = (uint_t)
721 	    ((uintptr_t)new_srqinfo.qa_buf_aligned & HERMON_PAGEMASK);
722 
723 	/*
724 	 * Fill in the MPT entry.  This is the final step before passing
725 	 * ownership of the MPT entry to the Hermon hardware.  We use all of
726 	 * the information collected/calculated above to fill in the
727 	 * requisite portions of the MPT.
728 	 */
729 	bzero(&mpt_entry, sizeof (hermon_hw_dmpt_t));
730 	mpt_entry.reg_win_len	= bind.bi_len;
731 	mtt_addr = (mtt->hr_indx << HERMON_MTT_SIZE_SHIFT);
732 	mpt_entry.mtt_addr_h = mtt_addr >> 32;
733 	mpt_entry.mtt_addr_l = mtt_addr >> 3;
734 
735 	/*
736 	 * for hermon we build up a new srqc and pass that (partially filled
737 	 * to resize SRQ instead of modifying the (d)mpt directly
738 	 */
739 
740 
741 
742 	/*
743 	 * Now we grab the SRQ lock.  Since we will be updating the actual
744 	 * SRQ location and the producer/consumer indexes, we should hold
745 	 * the lock.
746 	 *
747 	 * We do a HERMON_NOSLEEP here (and below), though, because we are
748 	 * holding the "srq_lock" and if we got raised to interrupt level
749 	 * by priority inversion, we would not want to block in this routine
750 	 * waiting for success.
751 	 */
752 	mutex_enter(&srq->srq_lock);
753 
754 	/*
755 	 * Copy old entries to new buffer
756 	 */
757 	srq_old_bufsz = srq->srq_wq_bufsz;
758 	bcopy(srq->srq_wq_buf, buf, srq_old_bufsz * wqesz);
759 
760 	/* Sync entire "new" SRQ for use by hardware (if necessary) */
761 	(void) ddi_dma_sync(bind.bi_dmahdl, 0, new_srqinfo.qa_size,
762 	    DDI_DMA_SYNC_FORDEV);
763 
764 	/*
765 	 * Setup MPT information for use in the MODIFY_MPT command
766 	 */
767 	mr = srq->srq_mrhdl;
768 	mutex_enter(&mr->mr_lock);
769 
770 	/*
771 	 * now, setup the srqc information needed for resize - limit the
772 	 * values, but use the same structure as the srqc
773 	 */
774 
775 	srqc_entry.log_srq_size	  = log_srq_size;
776 	srqc_entry.page_offs	  = srq_pgoffs >> 6;
777 	srqc_entry.log2_pgsz	  = mr->mr_log2_pgsz;
778 	srqc_entry.mtt_base_addrl = (uint64_t)mtt_addr >> 32;
779 	srqc_entry.mtt_base_addrh = mtt_addr >> 3;
780 
781 	/*
782 	 * RESIZE_SRQ
783 	 *
784 	 * If this fails for any reason, then it is an indication that
785 	 * something (either in HW or SW) has gone seriously wrong.  So we
786 	 * print a warning message and return.
787 	 */
788 	status = hermon_resize_srq_cmd_post(state, &srqc_entry,
789 	    srq->srq_srqnum, sleepflag);
790 	if (status != HERMON_CMD_SUCCESS) {
791 		cmn_err(CE_CONT, "Hermon: RESIZE_SRQ command failed: %08x\n",
792 		    status);
793 		if (status == HERMON_CMD_INVALID_STATUS) {
794 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
795 		}
796 		(void) hermon_mr_mtt_unbind(state, &bind, mtt);
797 		kmem_free(wre_new, (1 << log_srq_size) *
798 		    sizeof (uint64_t));
799 		hermon_queue_free(&new_srqinfo);
800 		mutex_exit(&mr->mr_lock);
801 		mutex_exit(&srq->srq_lock);
802 		return (ibc_get_ci_failure(0));
803 	}
804 	/*
805 	 * Update the Hermon Shared Receive Queue handle with all the new
806 	 * information.  At the same time, save away all the necessary
807 	 * information for freeing up the old resources
808 	 */
809 	old_srqinfo	   = srq->srq_wqinfo;
810 	old_mtt		   = srq->srq_mrhdl->mr_mttrsrcp;
811 	bcopy(&srq->srq_mrhdl->mr_bindinfo, &old_bind,
812 	    sizeof (hermon_bind_info_t));
813 
814 	/* Now set the new info */
815 	srq->srq_wqinfo	   = new_srqinfo;
816 	srq->srq_wq_buf	   = buf;
817 	srq->srq_wq_bufsz  = (1 << log_srq_size);
818 	bcopy(&bind, &srq->srq_mrhdl->mr_bindinfo, sizeof (hermon_bind_info_t));
819 	srq->srq_mrhdl->mr_mttrsrcp = mtt;
820 	srq->srq_desc_off  = srq_desc_off;
821 	srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
822 
823 	/* Update MR mtt pagesize */
824 	mr->mr_logmttpgsz = mtt_pgsize_bits;
825 	mutex_exit(&mr->mr_lock);
826 
827 	/*
828 	 * Initialize new wridlist, if needed.
829 	 *
830 	 * If a wridlist already is setup on an SRQ (the QP associated with an
831 	 * SRQ has moved "from_reset") then we must update this wridlist based
832 	 * on the new SRQ size.  We allocate the new size of Work Request ID
833 	 * Entries, copy over the old entries to the new list, and
834 	 * re-initialize the srq wridlist in non-umap case
835 	 */
836 	wre_old = srq->srq_wq_wqhdr->wq_wrid;
837 
838 	bcopy(wre_old, wre_new, srq_old_bufsz * sizeof (uint64_t));
839 
840 	/* Setup new sizes in wre */
841 	srq->srq_wq_wqhdr->wq_wrid = wre_new;
842 
843 	/*
844 	 * If "old" SRQ was a user-mappable SRQ that is currently mmap()'d out
845 	 * to a user process, then we need to call devmap_devmem_remap() to
846 	 * invalidate the mapping to the SRQ memory.  We also need to
847 	 * invalidate the SRQ tracking information for the user mapping.
848 	 *
849 	 * Note: On failure, the remap really shouldn't ever happen.  So, if it
850 	 * does, it is an indication that something has gone seriously wrong.
851 	 * So we print a warning message and return error (knowing, of course,
852 	 * that the "old" SRQ memory will be leaked)
853 	 */
854 	if ((srq->srq_is_umap) && (srq->srq_umap_dhp != NULL)) {
855 		maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
856 		status = devmap_devmem_remap(srq->srq_umap_dhp,
857 		    state->hs_dip, 0, 0, srq->srq_wqinfo.qa_size, maxprot,
858 		    DEVMAP_MAPPING_INVALID, NULL);
859 		if (status != DDI_SUCCESS) {
860 			mutex_exit(&srq->srq_lock);
861 			HERMON_WARNING(state, "failed in SRQ memory "
862 			    "devmap_devmem_remap()");
863 			/* We can, however, free the memory for old wre */
864 			kmem_free(wre_old, srq_old_bufsz * sizeof (uint64_t));
865 			return (ibc_get_ci_failure(0));
866 		}
867 		srq->srq_umap_dhp = (devmap_cookie_t)NULL;
868 	}
869 
870 	/*
871 	 * Drop the SRQ lock now.  The only thing left to do is to free up
872 	 * the old resources.
873 	 */
874 	mutex_exit(&srq->srq_lock);
875 
876 	/*
877 	 * Unbind the MTT entries.
878 	 */
879 	status = hermon_mr_mtt_unbind(state, &old_bind, old_mtt);
880 	if (status != DDI_SUCCESS) {
881 		HERMON_WARNING(state, "failed to unbind old SRQ memory");
882 		status = ibc_get_ci_failure(0);
883 		goto srqmodify_fail;
884 	}
885 
886 	/* Free the memory for old wre */
887 	kmem_free(wre_old, srq_old_bufsz * sizeof (uint64_t));
888 
889 	/* Free the memory for the old SRQ */
890 	hermon_queue_free(&old_srqinfo);
891 
892 	/*
893 	 * Fill in the return arguments (if necessary).  This includes the
894 	 * real new completion queue size.
895 	 */
896 	if (real_size != NULL) {
897 		*real_size = (1 << log_srq_size);
898 	}
899 
900 	return (DDI_SUCCESS);
901 
902 srqmodify_fail:
903 	return (status);
904 }
905 
906 
907 /*
908  * hermon_srq_refcnt_inc()
909  *    Context: Can be called from interrupt or base context.
910  */
911 void
912 hermon_srq_refcnt_inc(hermon_srqhdl_t srq)
913 {
914 	mutex_enter(&srq->srq_lock);
915 	srq->srq_refcnt++;
916 	mutex_exit(&srq->srq_lock);
917 }
918 
919 
920 /*
921  * hermon_srq_refcnt_dec()
922  *    Context: Can be called from interrupt or base context.
923  */
924 void
925 hermon_srq_refcnt_dec(hermon_srqhdl_t srq)
926 {
927 	mutex_enter(&srq->srq_lock);
928 	srq->srq_refcnt--;
929 	mutex_exit(&srq->srq_lock);
930 }
931 
932 
933 /*
934  * hermon_srqhdl_from_srqnum()
935  *    Context: Can be called from interrupt or base context.
936  *
937  *    This routine is important because changing the unconstrained
938  *    portion of the SRQ number is critical to the detection of a
939  *    potential race condition in the SRQ handler code (i.e. the case
940  *    where a SRQ is freed and alloc'd again before an event for the
941  *    "old" SRQ can be handled).
942  *
943  *    While this is not a perfect solution (not sure that one exists)
944  *    it does help to mitigate the chance that this race condition will
945  *    cause us to deliver a "stale" event to the new SRQ owner.  Note:
946  *    this solution does not scale well because the number of constrained
947  *    bits increases (and, hence, the number of unconstrained bits
948  *    decreases) as the number of supported SRQ grows.  For small and
949  *    intermediate values, it should hopefully provide sufficient
950  *    protection.
951  */
952 hermon_srqhdl_t
953 hermon_srqhdl_from_srqnum(hermon_state_t *state, uint_t srqnum)
954 {
955 	uint_t	srqindx, srqmask;
956 
957 	/* Calculate the SRQ table index from the srqnum */
958 	srqmask = (1 << state->hs_cfg_profile->cp_log_num_srq) - 1;
959 	srqindx = srqnum & srqmask;
960 	return (state->hs_srqhdl[srqindx]);
961 }
962 
963 
964 /*
965  * hermon_srq_sgl_to_logwqesz()
966  *    Context: Can be called from interrupt or base context.
967  */
968 static void
969 hermon_srq_sgl_to_logwqesz(hermon_state_t *state, uint_t num_sgl,
970     hermon_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl)
971 {
972 	uint_t	max_size, log2, actual_sgl;
973 
974 	switch (wq_type) {
975 	case HERMON_QP_WQ_TYPE_RECVQ:
976 		/*
977 		 * Use requested maximum SGL to calculate max descriptor size
978 		 * (while guaranteeing that the descriptor size is a
979 		 * power-of-2 cachelines).
980 		 */
981 		max_size = (HERMON_QP_WQE_MLX_SRQ_HDRS + (num_sgl << 4));
982 		log2 = highbit(max_size);
983 		if ((max_size & (max_size - 1)) == 0) {
984 			log2 = log2 - 1;
985 		}
986 
987 		/* Make sure descriptor is at least the minimum size */
988 		log2 = max(log2, HERMON_QP_WQE_LOG_MINIMUM);
989 
990 		/* Calculate actual number of SGL (given WQE size) */
991 		actual_sgl = ((1 << log2) - HERMON_QP_WQE_MLX_SRQ_HDRS) >> 4;
992 		break;
993 
994 	default:
995 		HERMON_WARNING(state, "unexpected work queue type");
996 		break;
997 	}
998 
999 	/* Fill in the return values */
1000 	*logwqesz = log2;
1001 	*max_sgl  = min(state->hs_cfg_profile->cp_srq_max_sgl, actual_sgl);
1002 }
1003