xref: /illumos-gate/usr/src/uts/common/io/ib/adapters/hermon/hermon_misc.c (revision d9c882fa1001c77987b156290d6733010c824ec4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * hermon_misc.c
29  *    Hermon Miscellaneous routines - Address Handle, Multicast, Protection
30  *    Domain, and port-related operations
31  *
32  *    Implements all the routines necessary for allocating, freeing, querying
33  *    and modifying Address Handles and Protection Domains.  Also implements
34  *    all the routines necessary for adding and removing Queue Pairs to/from
35  *    Multicast Groups.  Lastly, it implements the routines necessary for
36  *    port-related query and modify operations.
37  */
38 
39 #include <sys/types.h>
40 #include <sys/conf.h>
41 #include <sys/ddi.h>
42 #include <sys/sunddi.h>
43 #include <sys/modctl.h>
44 #include <sys/bitmap.h>
45 #include <sys/sysmacros.h>
46 
47 #include <sys/ib/adapters/hermon/hermon.h>
48 
49 extern uint32_t hermon_kernel_data_ro;
50 
51 /* used for helping uniquify fmr pool taskq name */
52 static uint_t hermon_debug_fmrpool_cnt = 0x00000000;
53 
54 static int hermon_mcg_qplist_add(hermon_state_t *state, hermon_mcghdl_t mcg,
55     hermon_hw_mcg_qp_list_t *mcg_qplist, hermon_qphdl_t qp, uint_t *qp_found);
56 static int hermon_mcg_qplist_remove(hermon_mcghdl_t mcg,
57     hermon_hw_mcg_qp_list_t *mcg_qplist, hermon_qphdl_t qp);
58 static void hermon_qp_mcg_refcnt_inc(hermon_qphdl_t qp);
59 static void hermon_qp_mcg_refcnt_dec(hermon_qphdl_t qp);
60 static uint_t hermon_mcg_walk_mgid_hash(hermon_state_t *state,
61     uint64_t start_indx, ib_gid_t mgid, uint_t *prev_indx);
62 static void hermon_mcg_setup_new_hdr(hermon_mcghdl_t mcg,
63     hermon_hw_mcg_t *mcg_hdr, ib_gid_t mgid, hermon_rsrc_t *mcg_rsrc);
64 static int hermon_mcg_hash_list_remove(hermon_state_t *state, uint_t curr_indx,
65     uint_t prev_indx, hermon_hw_mcg_t *mcg_entry);
66 static int hermon_mcg_entry_invalidate(hermon_state_t *state,
67     hermon_hw_mcg_t *mcg_entry, uint_t indx);
68 static int hermon_mgid_is_valid(ib_gid_t gid);
69 static int hermon_mlid_is_valid(ib_lid_t lid);
70 static void hermon_fmr_processing(void *fmr_args);
71 static int hermon_fmr_cleanup(hermon_state_t *state, hermon_fmrhdl_t pool);
72 static void hermon_fmr_cache_init(hermon_fmrhdl_t fmr);
73 static void hermon_fmr_cache_fini(hermon_fmrhdl_t fmr);
74 static int hermon_fmr_avl_compare(const void *q, const void *e);
75 
76 
77 #define	HERMON_MAX_DBR_PAGES_PER_USER	64
78 #define	HERMON_DBR_KEY(index, page) \
79 	(((uint64_t)index) * HERMON_MAX_DBR_PAGES_PER_USER + (page))
80 
81 static hermon_udbr_page_t *
82 hermon_dbr_new_user_page(hermon_state_t *state, uint_t index,
83     uint_t page)
84 {
85 	hermon_udbr_page_t *pagep;
86 	ddi_dma_attr_t dma_attr;
87 	uint_t cookiecnt;
88 	int i, status;
89 	uint64_t *p;
90 	hermon_umap_db_entry_t *umapdb;
91 
92 	pagep = kmem_alloc(sizeof (*pagep), KM_SLEEP);
93 	pagep->upg_index = page;
94 	pagep->upg_nfree = PAGESIZE / sizeof (hermon_dbr_t);
95 	pagep->upg_firstfree = 0;
96 	pagep->upg_kvaddr = ddi_umem_alloc(PAGESIZE, DDI_UMEM_SLEEP,
97 	    &pagep->upg_umemcookie); /* not HERMON_PAGESIZE here */
98 
99 	/* link free entries */
100 	p = (uint64_t *)(void *)pagep->upg_kvaddr;
101 	for (i = pagep->upg_firstfree; i < pagep->upg_nfree; i++)
102 		p[i] = i + 1;
103 	pagep->upg_buf = ddi_umem_iosetup(pagep->upg_umemcookie, 0,
104 	    PAGESIZE, B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP);
105 
106 	hermon_dma_attr_init(state, &dma_attr);
107 	status = ddi_dma_alloc_handle(state->hs_dip, &dma_attr,
108 	    DDI_DMA_SLEEP, NULL, &pagep->upg_dmahdl);
109 	if (status != DDI_SUCCESS) {
110 		IBTF_DPRINTF_L2("hermon", "hermon_new_user_page: "
111 		    "ddi_dma_buf_bind_handle failed: %d", status);
112 		return (NULL);
113 	}
114 	status = ddi_dma_buf_bind_handle(pagep->upg_dmahdl,
115 	    pagep->upg_buf, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
116 	    DDI_DMA_SLEEP, NULL, &pagep->upg_dmacookie, &cookiecnt);
117 	if (status != DDI_SUCCESS) {
118 		IBTF_DPRINTF_L2("hermon", "hermon_dbr_new_user_page: "
119 		    "ddi_dma_buf_bind_handle failed: %d", status);
120 		ddi_dma_free_handle(&pagep->upg_dmahdl);
121 		return (NULL);
122 	}
123 	ASSERT(cookiecnt == 1);
124 
125 	/* create db entry for mmap */
126 	umapdb = hermon_umap_db_alloc(state->hs_instance,
127 	    HERMON_DBR_KEY(index, page), MLNX_UMAP_DBRMEM_RSRC,
128 	    (uint64_t)(uintptr_t)pagep);
129 	hermon_umap_db_add(umapdb);
130 	return (pagep);
131 }
132 
133 
134 /*ARGSUSED*/
135 static int
136 hermon_user_dbr_alloc(hermon_state_t *state, uint_t index,
137     ddi_acc_handle_t *acchdl, hermon_dbr_t **vdbr, uint64_t *pdbr,
138     uint64_t *mapoffset)
139 {
140 	hermon_user_dbr_t *udbr;
141 	hermon_udbr_page_t *pagep;
142 	uint_t next_page;
143 	int j;
144 
145 	mutex_enter(&state->hs_dbr_lock);
146 	for (udbr = state->hs_user_dbr; udbr != NULL; udbr = udbr->udbr_link)
147 		if (udbr->udbr_index == index)
148 			break;
149 	if (udbr == NULL) {
150 		udbr = kmem_alloc(sizeof (*udbr), KM_SLEEP);
151 		udbr->udbr_link = state->hs_user_dbr;
152 		state->hs_user_dbr = udbr;
153 		udbr->udbr_index = index;
154 		udbr->udbr_pagep = NULL;
155 	}
156 	pagep = udbr->udbr_pagep;
157 	next_page = (pagep == NULL) ? 0 : (pagep->upg_index + 1);
158 	while (pagep != NULL)
159 		if (pagep->upg_nfree > 0)
160 			break;
161 		else
162 			pagep = pagep->upg_link;
163 	if (pagep == NULL) {
164 		pagep = hermon_dbr_new_user_page(state, index, next_page);
165 		if (pagep == NULL) {
166 			mutex_exit(&state->hs_dbr_lock);
167 			return (DDI_FAILURE);
168 		}
169 		pagep->upg_link = udbr->udbr_pagep;
170 		udbr->udbr_pagep = pagep;
171 	}
172 	j = pagep->upg_firstfree;	/* index within page */
173 	pagep->upg_firstfree = ((uint64_t *)(void *)pagep->upg_kvaddr)[j];
174 	pagep->upg_nfree--;
175 	((uint64_t *)(void *)pagep->upg_kvaddr)[j] = 0;	/* clear dbr */
176 	*mapoffset = ((HERMON_DBR_KEY(index, pagep->upg_index) <<
177 	    MLNX_UMAP_RSRC_TYPE_SHIFT) | MLNX_UMAP_DBRMEM_RSRC) << PAGESHIFT;
178 	*vdbr = (hermon_dbr_t *)((uint64_t *)(void *)pagep->upg_kvaddr + j);
179 	*pdbr = pagep->upg_dmacookie.dmac_laddress + j * sizeof (uint64_t);
180 
181 	mutex_exit(&state->hs_dbr_lock);
182 	return (DDI_SUCCESS);
183 }
184 
185 static void
186 hermon_user_dbr_free(hermon_state_t *state, uint_t index, hermon_dbr_t *record)
187 {
188 	hermon_user_dbr_t	*udbr;
189 	hermon_udbr_page_t	*pagep;
190 	caddr_t			kvaddr;
191 	uint_t			dbr_index;
192 	uint_t			max_free = PAGESIZE / sizeof (hermon_dbr_t);
193 
194 	dbr_index = (uintptr_t)record & PAGEOFFSET; /* offset (not yet index) */
195 	kvaddr = (caddr_t)record - dbr_index;
196 	dbr_index /= sizeof (hermon_dbr_t); /* now it's the index */
197 
198 	mutex_enter(&state->hs_dbr_lock);
199 	for (udbr = state->hs_user_dbr; udbr != NULL; udbr = udbr->udbr_link)
200 		if (udbr->udbr_index == index)
201 			break;
202 	if (udbr == NULL) {
203 		IBTF_DPRINTF_L2("hermon", "free user dbr: udbr struct not "
204 		    "found for index %x", index);
205 		mutex_exit(&state->hs_dbr_lock);
206 		return;
207 	}
208 	for (pagep = udbr->udbr_pagep; pagep != NULL; pagep = pagep->upg_link)
209 		if (pagep->upg_kvaddr == kvaddr)
210 			break;
211 	if (pagep == NULL) {
212 		IBTF_DPRINTF_L2("hermon", "free user dbr: pagep struct not"
213 		    " found for index %x, kvaddr %p, DBR index %x",
214 		    index, kvaddr, dbr_index);
215 		mutex_exit(&state->hs_dbr_lock);
216 		return;
217 	}
218 	if (pagep->upg_nfree >= max_free) {
219 		IBTF_DPRINTF_L2("hermon", "free user dbr: overflow: "
220 		    "UCE index %x, DBR index %x", index, dbr_index);
221 		mutex_exit(&state->hs_dbr_lock);
222 		return;
223 	}
224 	ASSERT(dbr_index < max_free);
225 	((uint64_t *)(void *)kvaddr)[dbr_index] = pagep->upg_firstfree;
226 	pagep->upg_firstfree = dbr_index;
227 	pagep->upg_nfree++;
228 	mutex_exit(&state->hs_dbr_lock);
229 
230 	/* XXX still need to unlink and free struct */
231 	/* XXX munmap needs to be managed */
232 }
233 
234 /*
235  * hermon_dbr_page_alloc()
236  *	first page allocation - called from attach or open
237  *	in this case, we want exactly one page per call, and aligned on a
238  *	page - and may need to be mapped to the user for access
239  */
240 
241 int
242 hermon_dbr_page_alloc(hermon_state_t *state, hermon_dbr_info_t **dinfo)
243 {
244 	int			status;
245 	ddi_dma_handle_t	dma_hdl;
246 	ddi_acc_handle_t	acc_hdl;
247 	ddi_dma_attr_t		dma_attr;
248 	ddi_dma_cookie_t	cookie;
249 	uint_t			cookie_cnt;
250 	hermon_dbr_header_t	*pagehdr;
251 	int			i;
252 	hermon_dbr_info_t 	*info;
253 	uint64_t		dmaaddr;
254 	uint64_t		dmalen;
255 
256 	info = kmem_zalloc(sizeof (hermon_dbr_info_t), KM_SLEEP);
257 
258 	/*
259 	 * Initialize many of the default DMA attributes.  Then set additional
260 	 * alignment restrictions if necessary for the dbr memory, meaning
261 	 * page aligned.  Also use the configured value for IOMMU bypass
262 	 */
263 	hermon_dma_attr_init(state, &dma_attr);
264 	dma_attr.dma_attr_align = PAGESIZE;
265 	dma_attr.dma_attr_sgllen = 1;	/* make sure only one cookie */
266 
267 	status = ddi_dma_alloc_handle(state->hs_dip, &dma_attr,
268 	    DDI_DMA_SLEEP, NULL, &dma_hdl);
269 	if (status != DDI_SUCCESS) {
270 		kmem_free((void *)info, sizeof (hermon_dbr_info_t));
271 		cmn_err(CE_NOTE, "dbr DMA handle alloc failed\n");
272 		return (DDI_FAILURE);
273 	}
274 
275 	status = ddi_dma_mem_alloc(dma_hdl, PAGESIZE,
276 	    &state->hs_reg_accattr, DDI_DMA_CONSISTENT, DDI_DMA_SLEEP,
277 	    NULL, (caddr_t *)&dmaaddr, (size_t *)&dmalen, &acc_hdl);
278 	if (status != DDI_SUCCESS)	{
279 		ddi_dma_free_handle(&dma_hdl);
280 		cmn_err(CE_CONT, "dbr DMA mem alloc failed(status %d)", status);
281 		kmem_free((void *)info, sizeof (hermon_dbr_info_t));
282 		return (DDI_FAILURE);
283 	}
284 
285 	/* this memory won't be IB registered, so do the bind here */
286 	status = ddi_dma_addr_bind_handle(dma_hdl, NULL,
287 	    (caddr_t)(uintptr_t)dmaaddr, (size_t)dmalen, DDI_DMA_RDWR |
288 	    DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &cookie, &cookie_cnt);
289 	if (status != DDI_SUCCESS) {
290 		ddi_dma_mem_free(&acc_hdl);
291 		ddi_dma_free_handle(&dma_hdl);
292 		kmem_free((void *)info, sizeof (hermon_dbr_info_t));
293 		cmn_err(CE_CONT, "dbr DMA bind handle failed (status %d)",
294 		    status);
295 		return (DDI_FAILURE);
296 	}
297 	*dinfo = info;		/* Pass back the pointer */
298 
299 	/* init the info structure with returned info */
300 	info->dbr_dmahdl = dma_hdl;
301 	info->dbr_acchdl = acc_hdl;
302 	info->dbr_page   = (caddr_t)(uintptr_t)dmaaddr;
303 	/* extract the phys addr from the cookie */
304 	info->dbr_paddr = cookie.dmac_laddress;
305 	/* should have everything now, so do the init of the header */
306 	pagehdr = (hermon_dbr_header_t *)(void *)info->dbr_page;
307 	pagehdr->next = 0;
308 	pagehdr->firstfree = 0;
309 	pagehdr->nfree = HERMON_NUM_DBR_PER_PAGE;
310 	pagehdr->dbr_info = info;
311 	/* link all DBrs onto the free list */
312 	for (i = 0; i < HERMON_NUM_DBR_PER_PAGE; i++) {
313 		pagehdr->dbr[i] = i + 1;
314 	}
315 
316 	return (DDI_SUCCESS);
317 }
318 
319 
320 /*
321  * hermon_dbr_alloc()
322  *	DBr record allocation - called from alloc cq/qp/srq
323  *	will check for available dbrs in current
324  *	page - if needed it will allocate another and link them
325  */
326 
327 int
328 hermon_dbr_alloc(hermon_state_t *state, uint_t index, ddi_acc_handle_t *acchdl,
329     hermon_dbr_t **vdbr, uint64_t *pdbr, uint64_t *mapoffset)
330 {
331 	hermon_dbr_header_t	*pagehdr, *lastpage;
332 	hermon_dbr_t		*record = NULL;
333 	hermon_dbr_info_t	*dinfo = NULL;
334 	int			status;
335 
336 	if (index != state->hs_kernel_uar_index)
337 		return (hermon_user_dbr_alloc(state, index, acchdl, vdbr, pdbr,
338 		    mapoffset));
339 
340 	mutex_enter(&state->hs_dbr_lock);
341 	/* 'pagehdr' holds pointer to first page */
342 	pagehdr = (hermon_dbr_header_t *)(void *)state->hs_kern_dbr;
343 	do {
344 		lastpage = pagehdr; /* save pagehdr for later linking */
345 		if (pagehdr->nfree == 0) {
346 			pagehdr = (hermon_dbr_header_t *)(void *)pagehdr->next;
347 			continue; /* page is full, go to next if there is one */
348 		}
349 		dinfo = pagehdr->dbr_info;
350 		break;			/* found a page w/ one available */
351 	} while (pagehdr != 0);
352 
353 	if (dinfo == NULL) {	/* did NOT find a page with one available */
354 		status = hermon_dbr_page_alloc(state, &dinfo);
355 		if (status != DDI_SUCCESS) {
356 			/* do error handling */
357 			mutex_exit(&state->hs_dbr_lock);
358 			return (DDI_FAILURE);
359 		}
360 		/* got a new page, so link it in. */
361 		pagehdr = (hermon_dbr_header_t *)(void *)dinfo->dbr_page;
362 		lastpage->next = pagehdr;
363 	}
364 	record = pagehdr->dbr + pagehdr->firstfree;
365 	pagehdr->firstfree = *record;
366 	pagehdr->nfree--;
367 	*record = 0;
368 
369 	*acchdl = dinfo->dbr_acchdl;
370 	*vdbr = record;
371 	*pdbr = ((uintptr_t)record - (uintptr_t)pagehdr + dinfo->dbr_paddr);
372 	mutex_exit(&state->hs_dbr_lock);
373 	return (DDI_SUCCESS);
374 }
375 
376 /*
377  * hermon_dbr_free()
378  *	DBr record deallocation - called from free cq/qp
379  *	will update the counter in the header, and invalidate
380  *	the dbr, but will NEVER free pages of dbrs - small
381  *	price to pay, but userland access never will anyway
382  */
383 
384 void
385 hermon_dbr_free(hermon_state_t *state, uint_t indx, hermon_dbr_t *record)
386 {
387 	hermon_dbr_header_t	*pagehdr;
388 
389 	if (indx != state->hs_kernel_uar_index) {
390 		hermon_user_dbr_free(state, indx, record);
391 		return;
392 	}
393 	mutex_enter(&state->hs_dbr_lock);
394 	pagehdr = (hermon_dbr_header_t *)((uintptr_t)record &
395 	    (uintptr_t)PAGEMASK);
396 	*record = pagehdr->firstfree;
397 	pagehdr->firstfree = record - pagehdr->dbr;
398 	pagehdr->nfree++;		/* decr the count for this one */
399 	mutex_exit(&state->hs_dbr_lock);
400 }
401 
402 /*
403  * hermon_dbr_kern_free()
404  *    Context: Can be called only from detach context.
405  *
406  *	Free all kernel dbr pages.  This includes the freeing of all the dma
407  *	resources acquired during the allocation of the pages.
408  *
409  *	Also, free all the user dbr pages.
410  */
411 void
412 hermon_dbr_kern_free(hermon_state_t *state)
413 {
414 	hermon_dbr_header_t	*pagehdr, *lastpage;
415 	hermon_dbr_info_t	*dinfo;
416 	hermon_user_dbr_t	*udbr, *next;
417 	hermon_udbr_page_t	*pagep, *nextp;
418 	hermon_umap_db_entry_t	*umapdb;
419 	int			instance, status;
420 	uint64_t		value;
421 	extern			hermon_umap_db_t hermon_userland_rsrc_db;
422 
423 	mutex_enter(&state->hs_dbr_lock);
424 	pagehdr = (hermon_dbr_header_t *)(void *)state->hs_kern_dbr;
425 	while (pagehdr != NULL) {
426 		lastpage = (hermon_dbr_header_t *)(void *)pagehdr->next;
427 		dinfo = pagehdr->dbr_info;
428 		(void) ddi_dma_unbind_handle(dinfo->dbr_dmahdl);
429 		ddi_dma_mem_free(&dinfo->dbr_acchdl);	/* free page */
430 		ddi_dma_free_handle(&dinfo->dbr_dmahdl);
431 		kmem_free(dinfo, sizeof (hermon_dbr_info_t));
432 		pagehdr = lastpage;
433 	}
434 
435 	udbr = state->hs_user_dbr;
436 	instance = state->hs_instance;
437 	mutex_enter(&hermon_userland_rsrc_db.hdl_umapdb_lock);
438 	while (udbr != NULL) {
439 		pagep = udbr->udbr_pagep;
440 		while (pagep != NULL) {
441 			/* probably need to remove "db" */
442 			(void) ddi_dma_unbind_handle(pagep->upg_dmahdl);
443 			ddi_dma_free_handle(&pagep->upg_dmahdl);
444 			freerbuf(pagep->upg_buf);
445 			ddi_umem_free(pagep->upg_umemcookie);
446 			status = hermon_umap_db_find_nolock(instance,
447 			    HERMON_DBR_KEY(udbr->udbr_index,
448 			    pagep->upg_index), MLNX_UMAP_DBRMEM_RSRC,
449 			    &value, HERMON_UMAP_DB_REMOVE, &umapdb);
450 			if (status == DDI_SUCCESS)
451 				hermon_umap_db_free(umapdb);
452 			nextp = pagep->upg_link;
453 			kmem_free(pagep, sizeof (*pagep));
454 			pagep = nextp;
455 		}
456 		next = udbr->udbr_link;
457 		kmem_free(udbr, sizeof (*udbr));
458 		udbr = next;
459 	}
460 	mutex_exit(&hermon_userland_rsrc_db.hdl_umapdb_lock);
461 	mutex_exit(&state->hs_dbr_lock);
462 }
463 
464 /*
465  * hermon_ah_alloc()
466  *    Context: Can be called only from user or kernel context.
467  */
468 int
469 hermon_ah_alloc(hermon_state_t *state, hermon_pdhdl_t pd,
470     ibt_adds_vect_t *attr_p, hermon_ahhdl_t *ahhdl, uint_t sleepflag)
471 {
472 	hermon_rsrc_t		*rsrc;
473 	hermon_hw_udav_t	*udav;
474 	hermon_ahhdl_t		ah;
475 	int			status;
476 
477 	/*
478 	 * Someday maybe the "ibt_adds_vect_t *attr_p" will be NULL to
479 	 * indicate that we wish to allocate an "invalid" (i.e. empty)
480 	 * address handle XXX
481 	 */
482 
483 	/* Validate that specified port number is legal */
484 	if (!hermon_portnum_is_valid(state, attr_p->av_port_num)) {
485 		return (IBT_HCA_PORT_INVALID);
486 	}
487 
488 	/*
489 	 * Allocate the software structure for tracking the address handle
490 	 * (i.e. the Hermon Address Handle struct).
491 	 */
492 	status = hermon_rsrc_alloc(state, HERMON_AHHDL, 1, sleepflag, &rsrc);
493 	if (status != DDI_SUCCESS) {
494 		return (IBT_INSUFF_RESOURCE);
495 	}
496 	ah = (hermon_ahhdl_t)rsrc->hr_addr;
497 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))
498 
499 	/* Increment the reference count on the protection domain (PD) */
500 	hermon_pd_refcnt_inc(pd);
501 
502 	udav = (hermon_hw_udav_t *)kmem_zalloc(sizeof (hermon_hw_udav_t),
503 	    KM_SLEEP);
504 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*udav))
505 
506 	/*
507 	 * Fill in the UDAV data. We first zero out the UDAV, then populate
508 	 * it by then calling hermon_set_addr_path() to fill in the common
509 	 * portions that can be pulled from the "ibt_adds_vect_t" passed in
510 	 */
511 	status = hermon_set_addr_path(state, attr_p,
512 	    (hermon_hw_addr_path_t *)udav, HERMON_ADDRPATH_UDAV);
513 	if (status != DDI_SUCCESS) {
514 		hermon_pd_refcnt_dec(pd);
515 		hermon_rsrc_free(state, &rsrc);
516 		return (status);
517 	}
518 	udav->pd	= pd->pd_pdnum;
519 	udav->sl	= attr_p->av_srvl;
520 
521 	/*
522 	 * Fill in the rest of the Hermon Address Handle struct.
523 	 *
524 	 * NOTE: We are saving away a copy of the "av_dgid.gid_guid" field
525 	 * here because we may need to return it later to the IBTF (as a
526 	 * result of a subsequent query operation).  Unlike the other UDAV
527 	 * parameters, the value of "av_dgid.gid_guid" is not always preserved.
528 	 * The reason for this is described in hermon_set_addr_path().
529 	 */
530 	ah->ah_rsrcp	 = rsrc;
531 	ah->ah_pdhdl	 = pd;
532 	ah->ah_udav	 = udav;
533 	ah->ah_save_guid = attr_p->av_dgid.gid_guid;
534 	*ahhdl = ah;
535 
536 	return (DDI_SUCCESS);
537 }
538 
539 
540 /*
541  * hermon_ah_free()
542  *    Context: Can be called only from user or kernel context.
543  */
544 /* ARGSUSED */
545 int
546 hermon_ah_free(hermon_state_t *state, hermon_ahhdl_t *ahhdl, uint_t sleepflag)
547 {
548 	hermon_rsrc_t		*rsrc;
549 	hermon_pdhdl_t		pd;
550 	hermon_ahhdl_t		ah;
551 
552 	/*
553 	 * Pull all the necessary information from the Hermon Address Handle
554 	 * struct.  This is necessary here because the resource for the
555 	 * AH is going to be freed up as part of this operation.
556 	 */
557 	ah    = *ahhdl;
558 	mutex_enter(&ah->ah_lock);
559 	rsrc  = ah->ah_rsrcp;
560 	pd    = ah->ah_pdhdl;
561 	mutex_exit(&ah->ah_lock);
562 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))
563 
564 	/* Free the UDAV memory */
565 	kmem_free(ah->ah_udav, sizeof (hermon_hw_udav_t));
566 
567 	/* Decrement the reference count on the protection domain (PD) */
568 	hermon_pd_refcnt_dec(pd);
569 
570 	/* Free the Hermon Address Handle structure */
571 	hermon_rsrc_free(state, &rsrc);
572 
573 	/* Set the ahhdl pointer to NULL and return success */
574 	*ahhdl = NULL;
575 
576 	return (DDI_SUCCESS);
577 }
578 
579 
580 /*
581  * hermon_ah_query()
582  *    Context: Can be called from interrupt or base context.
583  */
584 /* ARGSUSED */
585 int
586 hermon_ah_query(hermon_state_t *state, hermon_ahhdl_t ah, hermon_pdhdl_t *pd,
587     ibt_adds_vect_t *attr_p)
588 {
589 	mutex_enter(&ah->ah_lock);
590 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr_p))
591 
592 	/*
593 	 * Pull the PD and UDAV from the Hermon Address Handle structure
594 	 */
595 	*pd = ah->ah_pdhdl;
596 
597 	/*
598 	 * Fill in "ibt_adds_vect_t".  We call hermon_get_addr_path() to fill
599 	 * the common portions that can be pulled from the UDAV we pass in.
600 	 *
601 	 * NOTE: We will also fill the "av_dgid.gid_guid" field from the
602 	 * "ah_save_guid" field we have previously saved away.  The reason
603 	 * for this is described in hermon_ah_alloc() and hermon_ah_modify().
604 	 */
605 	hermon_get_addr_path(state, (hermon_hw_addr_path_t *)ah->ah_udav,
606 	    attr_p, HERMON_ADDRPATH_UDAV);
607 
608 	attr_p->av_dgid.gid_guid = ah->ah_save_guid;
609 
610 	mutex_exit(&ah->ah_lock);
611 	return (DDI_SUCCESS);
612 }
613 
614 
615 /*
616  * hermon_ah_modify()
617  *    Context: Can be called from interrupt or base context.
618  */
619 /* ARGSUSED */
620 int
621 hermon_ah_modify(hermon_state_t *state, hermon_ahhdl_t ah,
622     ibt_adds_vect_t *attr_p)
623 {
624 	hermon_hw_udav_t	old_udav;
625 	uint64_t		data_old;
626 	int			status, size, i;
627 
628 	/* Validate that specified port number is legal */
629 	if (!hermon_portnum_is_valid(state, attr_p->av_port_num)) {
630 		return (IBT_HCA_PORT_INVALID);
631 	}
632 
633 	mutex_enter(&ah->ah_lock);
634 
635 	/* Save a copy of the current UDAV data in old_udav. */
636 	bcopy(ah->ah_udav, &old_udav, sizeof (hermon_hw_udav_t));
637 
638 	/*
639 	 * Fill in the new UDAV with the caller's data, passed in via the
640 	 * "ibt_adds_vect_t" structure.
641 	 *
642 	 * NOTE: We also need to save away a copy of the "av_dgid.gid_guid"
643 	 * field here (just as we did during hermon_ah_alloc()) because we
644 	 * may need to return it later to the IBTF (as a result of a
645 	 * subsequent query operation).  As explained in hermon_ah_alloc(),
646 	 * unlike the other UDAV parameters, the value of "av_dgid.gid_guid"
647 	 * is not always preserved. The reason for this is described in
648 	 * hermon_set_addr_path().
649 	 */
650 	status = hermon_set_addr_path(state, attr_p,
651 	    (hermon_hw_addr_path_t *)ah->ah_udav, HERMON_ADDRPATH_UDAV);
652 	if (status != DDI_SUCCESS) {
653 		mutex_exit(&ah->ah_lock);
654 		return (status);
655 	}
656 	ah->ah_save_guid = attr_p->av_dgid.gid_guid;
657 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(ah->ah_udav)))
658 	ah->ah_udav->sl  = attr_p->av_srvl;
659 
660 	/*
661 	 * Copy changes into the new UDAV.
662 	 *    Note:  We copy in 64-bit chunks.  For the first two of these
663 	 *    chunks it is necessary to read the current contents of the
664 	 *    UDAV, mask off the modifiable portions (maintaining any
665 	 *    of the "reserved" portions), and then mask on the new data.
666 	 */
667 	size = sizeof (hermon_hw_udav_t) >> 3;
668 	for (i = 0; i < size; i++) {
669 		data_old = ((uint64_t *)&old_udav)[i];
670 
671 		/*
672 		 * Apply mask to change only the relevant values.
673 		 */
674 		if (i == 0) {
675 			data_old = data_old & HERMON_UDAV_MODIFY_MASK0;
676 		} else if (i == 1) {
677 			data_old = data_old & HERMON_UDAV_MODIFY_MASK1;
678 		} else {
679 			data_old = 0;
680 		}
681 
682 		/* Store the updated values to the UDAV */
683 		((uint64_t *)ah->ah_udav)[i] |= data_old;
684 	}
685 
686 	/*
687 	 * Put the valid PD number back into the UDAV entry, as it
688 	 * might have been clobbered above.
689 	 */
690 	ah->ah_udav->pd = old_udav.pd;
691 
692 
693 	mutex_exit(&ah->ah_lock);
694 	return (DDI_SUCCESS);
695 }
696 
697 /*
698  * hermon_mcg_attach()
699  *    Context: Can be called only from user or kernel context.
700  */
701 int
702 hermon_mcg_attach(hermon_state_t *state, hermon_qphdl_t qp, ib_gid_t gid,
703     ib_lid_t lid)
704 {
705 	hermon_rsrc_t		*rsrc;
706 	hermon_hw_mcg_t		*mcg_entry;
707 	hermon_hw_mcg_qp_list_t	*mcg_entry_qplist;
708 	hermon_mcghdl_t		mcg, newmcg;
709 	uint64_t		mgid_hash;
710 	uint32_t		end_indx;
711 	int			status;
712 	uint_t			qp_found;
713 
714 	/*
715 	 * It is only allowed to attach MCG to UD queue pairs.  Verify
716 	 * that the intended QP is of the appropriate transport type
717 	 */
718 	if (qp->qp_serv_type != HERMON_QP_UD) {
719 		return (IBT_QP_SRV_TYPE_INVALID);
720 	}
721 
722 	/*
723 	 * Check for invalid Multicast DLID.  Specifically, all Multicast
724 	 * LIDs should be within a well defined range.  If the specified LID
725 	 * is outside of that range, then return an error.
726 	 */
727 	if (hermon_mlid_is_valid(lid) == 0) {
728 		return (IBT_MC_MLID_INVALID);
729 	}
730 	/*
731 	 * Check for invalid Multicast GID.  All Multicast GIDs should have
732 	 * a well-defined pattern of bits and flags that are allowable.  If
733 	 * the specified GID does not meet the criteria, then return an error.
734 	 */
735 	if (hermon_mgid_is_valid(gid) == 0) {
736 		return (IBT_MC_MGID_INVALID);
737 	}
738 
739 	/*
740 	 * Compute the MGID hash value.  Since the MCG table is arranged as
741 	 * a number of separate hash chains, this operation converts the
742 	 * specified MGID into the starting index of an entry in the hash
743 	 * table (i.e. the index for the start of the appropriate hash chain).
744 	 * Subsequent operations below will walk the chain searching for the
745 	 * right place to add this new QP.
746 	 */
747 	status = hermon_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid,
748 	    &mgid_hash, HERMON_SLEEPFLAG_FOR_CONTEXT());
749 	if (status != HERMON_CMD_SUCCESS) {
750 		cmn_err(CE_CONT, "Hermon: MGID_HASH command failed: %08x\n",
751 		    status);
752 		if (status == HERMON_CMD_INVALID_STATUS) {
753 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
754 		}
755 		return (ibc_get_ci_failure(0));
756 	}
757 
758 	/*
759 	 * Grab the multicast group mutex.  Then grab the pre-allocated
760 	 * temporary buffer used for holding and/or modifying MCG entries.
761 	 * Zero out the temporary MCG entry before we begin.
762 	 */
763 	mutex_enter(&state->hs_mcglock);
764 	mcg_entry = state->hs_mcgtmp;
765 	mcg_entry_qplist = HERMON_MCG_GET_QPLIST_PTR(mcg_entry);
766 	bzero(mcg_entry, HERMON_MCGMEM_SZ(state));
767 
768 	/*
769 	 * Walk through the array of MCG entries starting at "mgid_hash".
770 	 * Try to find the appropriate place for this new QP to be added.
771 	 * This could happen when the first entry of the chain has MGID == 0
772 	 * (which means that the hash chain is empty), or because we find
773 	 * an entry with the same MGID (in which case we'll add the QP to
774 	 * that MCG), or because we come to the end of the chain (in which
775 	 * case this is the first QP being added to the multicast group that
776 	 * corresponds to the MGID.  The hermon_mcg_walk_mgid_hash() routine
777 	 * walks the list and returns an index into the MCG table.  The entry
778 	 * at this index is then checked to determine which case we have
779 	 * fallen into (see below).  Note:  We are using the "shadow" MCG
780 	 * list (of hermon_mcg_t structs) for this lookup because the real
781 	 * MCG entries are in hardware (and the lookup process would be much
782 	 * more time consuming).
783 	 */
784 	end_indx = hermon_mcg_walk_mgid_hash(state, mgid_hash, gid, NULL);
785 	mcg	 = &state->hs_mcghdl[end_indx];
786 
787 	/*
788 	 * If MGID == 0, then the hash chain is empty.  Just fill in the
789 	 * current entry.  Note:  No need to allocate an MCG table entry
790 	 * as all the hash chain "heads" are already preallocated.
791 	 */
792 	if ((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) {
793 
794 		/* Fill in the current entry in the "shadow" MCG list */
795 		hermon_mcg_setup_new_hdr(mcg, mcg_entry, gid, NULL);
796 
797 		/*
798 		 * Try to add the new QP number to the list.  This (and the
799 		 * above) routine fills in a temporary MCG.  The "mcg_entry"
800 		 * and "mcg_entry_qplist" pointers simply point to different
801 		 * offsets within the same temporary copy of the MCG (for
802 		 * convenience).  Note:  If this fails, we need to invalidate
803 		 * the entries we've already put into the "shadow" list entry
804 		 * above.
805 		 */
806 		status = hermon_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp,
807 		    &qp_found);
808 		if (status != DDI_SUCCESS) {
809 			bzero(mcg, sizeof (struct hermon_sw_mcg_list_s));
810 			mutex_exit(&state->hs_mcglock);
811 			return (status);
812 		}
813 		if (!qp_found)
814 			mcg_entry->member_cnt = (mcg->mcg_num_qps + 1);
815 			    /* set the member count */
816 
817 		/*
818 		 * Once the temporary MCG has been filled in, write the entry
819 		 * into the appropriate location in the Hermon MCG entry table.
820 		 * If it's successful, then drop the lock and return success.
821 		 * Note: In general, this operation shouldn't fail.  If it
822 		 * does, then it is an indication that something (probably in
823 		 * HW, but maybe in SW) has gone seriously wrong.  We still
824 		 * want to zero out the entries that we've filled in above
825 		 * (in the hermon_mcg_setup_new_hdr() routine).
826 		 */
827 		status = hermon_write_mgm_cmd_post(state, mcg_entry, end_indx,
828 		    HERMON_CMD_NOSLEEP_SPIN);
829 		if (status != HERMON_CMD_SUCCESS) {
830 			bzero(mcg, sizeof (struct hermon_sw_mcg_list_s));
831 			mutex_exit(&state->hs_mcglock);
832 			HERMON_WARNING(state, "failed to write MCG entry");
833 			cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: "
834 			    "%08x\n", status);
835 			if (status == HERMON_CMD_INVALID_STATUS) {
836 				hermon_fm_ereport(state, HCA_SYS_ERR,
837 				    HCA_ERR_SRV_LOST);
838 			}
839 			return (ibc_get_ci_failure(0));
840 		}
841 
842 		/*
843 		 * Now that we know all the Hermon firmware accesses have been
844 		 * successful, we update the "shadow" MCG entry by incrementing
845 		 * the "number of attached QPs" count.
846 		 *
847 		 * We increment only if the QP is not already part of the
848 		 * MCG by checking the 'qp_found' flag returned from the
849 		 * qplist_add above.
850 		 */
851 		if (!qp_found) {
852 			mcg->mcg_num_qps++;
853 
854 			/*
855 			 * Increment the refcnt for this QP.  Because the QP
856 			 * was added to this MCG, the refcnt must be
857 			 * incremented.
858 			 */
859 			hermon_qp_mcg_refcnt_inc(qp);
860 		}
861 
862 		/*
863 		 * We drop the lock and return success.
864 		 */
865 		mutex_exit(&state->hs_mcglock);
866 		return (DDI_SUCCESS);
867 	}
868 
869 	/*
870 	 * If the specified MGID matches the MGID in the current entry, then
871 	 * we need to try to add the QP to the current MCG entry.  In this
872 	 * case, it means that we need to read the existing MCG entry (into
873 	 * the temporary MCG), add the new QP number to the temporary entry
874 	 * (using the same method we used above), and write the entry back
875 	 * to the hardware (same as above).
876 	 */
877 	if ((mcg->mcg_mgid_h == gid.gid_prefix) &&
878 	    (mcg->mcg_mgid_l == gid.gid_guid)) {
879 
880 		/*
881 		 * Read the current MCG entry into the temporary MCG.  Note:
882 		 * In general, this operation shouldn't fail.  If it does,
883 		 * then it is an indication that something (probably in HW,
884 		 * but maybe in SW) has gone seriously wrong.
885 		 */
886 		status = hermon_read_mgm_cmd_post(state, mcg_entry, end_indx,
887 		    HERMON_CMD_NOSLEEP_SPIN);
888 		if (status != HERMON_CMD_SUCCESS) {
889 			mutex_exit(&state->hs_mcglock);
890 			HERMON_WARNING(state, "failed to read MCG entry");
891 			cmn_err(CE_CONT, "Hermon: READ_MGM command failed: "
892 			    "%08x\n", status);
893 			if (status == HERMON_CMD_INVALID_STATUS) {
894 				hermon_fm_ereport(state, HCA_SYS_ERR,
895 				    HCA_ERR_SRV_LOST);
896 			}
897 			return (ibc_get_ci_failure(0));
898 		}
899 
900 		/*
901 		 * Try to add the new QP number to the list.  This routine
902 		 * fills in the necessary pieces of the temporary MCG.  The
903 		 * "mcg_entry_qplist" pointer is used to point to the portion
904 		 * of the temporary MCG that holds the QP numbers.
905 		 *
906 		 * Note: hermon_mcg_qplist_add() returns SUCCESS if it
907 		 * already found the QP in the list.  In this case, the QP is
908 		 * not added on to the list again.  Check the flag 'qp_found'
909 		 * if this value is needed to be known.
910 		 *
911 		 */
912 		status = hermon_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp,
913 		    &qp_found);
914 		if (status != DDI_SUCCESS) {
915 			mutex_exit(&state->hs_mcglock);
916 			return (status);
917 		}
918 		if (!qp_found)
919 			mcg_entry->member_cnt = (mcg->mcg_num_qps + 1);
920 			    /* set the member count */
921 
922 		/*
923 		 * Once the temporary MCG has been updated, write the entry
924 		 * into the appropriate location in the Hermon MCG entry table.
925 		 * If it's successful, then drop the lock and return success.
926 		 * Note: In general, this operation shouldn't fail.  If it
927 		 * does, then it is an indication that something (probably in
928 		 * HW, but maybe in SW) has gone seriously wrong.
929 		 */
930 		status = hermon_write_mgm_cmd_post(state, mcg_entry, end_indx,
931 		    HERMON_CMD_NOSLEEP_SPIN);
932 		if (status != HERMON_CMD_SUCCESS) {
933 			mutex_exit(&state->hs_mcglock);
934 			HERMON_WARNING(state, "failed to write MCG entry");
935 			cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: "
936 			    "%08x\n", status);
937 			if (status == HERMON_CMD_INVALID_STATUS) {
938 				hermon_fm_ereport(state, HCA_SYS_ERR,
939 				    HCA_ERR_SRV_LOST);
940 			}
941 			return (ibc_get_ci_failure(0));
942 		}
943 
944 		/*
945 		 * Now that we know all the Hermon firmware accesses have been
946 		 * successful, we update the current "shadow" MCG entry by
947 		 * incrementing the "number of attached QPs" count.
948 		 *
949 		 * We increment only if the QP is not already part of the
950 		 * MCG by checking the 'qp_found' flag returned
951 		 * hermon_mcg_walk_mgid_hashfrom the qplist_add above.
952 		 */
953 		if (!qp_found) {
954 			mcg->mcg_num_qps++;
955 
956 			/*
957 			 * Increment the refcnt for this QP.  Because the QP
958 			 * was added to this MCG, the refcnt must be
959 			 * incremented.
960 			 */
961 			hermon_qp_mcg_refcnt_inc(qp);
962 		}
963 
964 		/*
965 		 * We drop the lock and return success.
966 		 */
967 		mutex_exit(&state->hs_mcglock);
968 		return (DDI_SUCCESS);
969 	}
970 
971 	/*
972 	 * If we've reached here, then we're at the end of the hash chain.
973 	 * We need to allocate a new MCG entry, fill it in, write it to Hermon,
974 	 * and update the previous entry to link the new one to the end of the
975 	 * chain.
976 	 */
977 
978 	/*
979 	 * Allocate an MCG table entry.  This will be filled in with all
980 	 * the necessary parameters to define the multicast group.  Then it
981 	 * will be written to the hardware in the next-to-last step below.
982 	 */
983 	status = hermon_rsrc_alloc(state, HERMON_MCG, 1, HERMON_NOSLEEP, &rsrc);
984 	if (status != DDI_SUCCESS) {
985 		mutex_exit(&state->hs_mcglock);
986 		return (IBT_INSUFF_RESOURCE);
987 	}
988 
989 	/*
990 	 * Fill in the new entry in the "shadow" MCG list.  Note:  Just as
991 	 * it does above, hermon_mcg_setup_new_hdr() also fills in a portion
992 	 * of the temporary MCG entry (the rest of which will be filled in by
993 	 * hermon_mcg_qplist_add() below)
994 	 */
995 	newmcg = &state->hs_mcghdl[rsrc->hr_indx];
996 	hermon_mcg_setup_new_hdr(newmcg, mcg_entry, gid, rsrc);
997 
998 	/*
999 	 * Try to add the new QP number to the list.  This routine fills in
1000 	 * the final necessary pieces of the temporary MCG.  The
1001 	 * "mcg_entry_qplist" pointer is used to point to the portion of the
1002 	 * temporary MCG that holds the QP numbers.  If we fail here, we
1003 	 * must undo the previous resource allocation.
1004 	 *
1005 	 * Note: hermon_mcg_qplist_add() can we return SUCCESS if it already
1006 	 * found the QP in the list.  In this case, the QP is not added on to
1007 	 * the list again.  Check the flag 'qp_found' if this value is needed
1008 	 * to be known.
1009 	 */
1010 	status = hermon_mcg_qplist_add(state, newmcg, mcg_entry_qplist, qp,
1011 	    &qp_found);
1012 	if (status != DDI_SUCCESS) {
1013 		bzero(newmcg, sizeof (struct hermon_sw_mcg_list_s));
1014 		hermon_rsrc_free(state, &rsrc);
1015 		mutex_exit(&state->hs_mcglock);
1016 		return (status);
1017 	}
1018 	mcg_entry->member_cnt = (newmcg->mcg_num_qps + 1);
1019 	    /* set the member count */
1020 
1021 	/*
1022 	 * Once the temporary MCG has been updated, write the entry into the
1023 	 * appropriate location in the Hermon MCG entry table.  If this is
1024 	 * successful, then we need to chain the previous entry to this one.
1025 	 * Note: In general, this operation shouldn't fail.  If it does, then
1026 	 * it is an indication that something (probably in HW, but maybe in
1027 	 * SW) has gone seriously wrong.
1028 	 */
1029 	status = hermon_write_mgm_cmd_post(state, mcg_entry, rsrc->hr_indx,
1030 	    HERMON_CMD_NOSLEEP_SPIN);
1031 	if (status != HERMON_CMD_SUCCESS) {
1032 		bzero(newmcg, sizeof (struct hermon_sw_mcg_list_s));
1033 		hermon_rsrc_free(state, &rsrc);
1034 		mutex_exit(&state->hs_mcglock);
1035 		HERMON_WARNING(state, "failed to write MCG entry");
1036 		cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: %08x\n",
1037 		    status);
1038 		if (status == HERMON_CMD_INVALID_STATUS) {
1039 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1040 		}
1041 		return (ibc_get_ci_failure(0));
1042 	}
1043 
1044 	/*
1045 	 * Now read the current MCG entry (the one previously at the end of
1046 	 * hash chain) into the temporary MCG.  We are going to update its
1047 	 * "next_gid_indx" now and write the entry back to the MCG table.
1048 	 * Note:  In general, this operation shouldn't fail.  If it does, then
1049 	 * it is an indication that something (probably in HW, but maybe in SW)
1050 	 * has gone seriously wrong.  We will free up the MCG entry resource,
1051 	 * but we will not undo the previously written MCG entry in the HW.
1052 	 * This is OK, though, because the MCG entry is not currently attached
1053 	 * to any hash chain.
1054 	 */
1055 	status = hermon_read_mgm_cmd_post(state, mcg_entry, end_indx,
1056 	    HERMON_CMD_NOSLEEP_SPIN);
1057 	if (status != HERMON_CMD_SUCCESS) {
1058 		bzero(newmcg, sizeof (struct hermon_sw_mcg_list_s));
1059 		hermon_rsrc_free(state, &rsrc);
1060 		mutex_exit(&state->hs_mcglock);
1061 		HERMON_WARNING(state, "failed to read MCG entry");
1062 		cmn_err(CE_CONT, "Hermon: READ_MGM command failed: %08x\n",
1063 		    status);
1064 		if (status == HERMON_CMD_INVALID_STATUS) {
1065 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1066 		}
1067 		return (ibc_get_ci_failure(0));
1068 	}
1069 
1070 	/*
1071 	 * Finally, we update the "next_gid_indx" field in the temporary MCG
1072 	 * and attempt to write the entry back into the Hermon MCG table.  If
1073 	 * this succeeds, then we update the "shadow" list to reflect the
1074 	 * change, drop the lock, and return success.  Note:  In general, this
1075 	 * operation shouldn't fail.  If it does, then it is an indication
1076 	 * that something (probably in HW, but maybe in SW) has gone seriously
1077 	 * wrong.  Just as we do above, we will free up the MCG entry resource,
1078 	 * but we will not try to undo the previously written MCG entry.  This
1079 	 * is OK, though, because (since we failed here to update the end of
1080 	 * the chain) that other entry is not currently attached to any chain.
1081 	 */
1082 	mcg_entry->next_gid_indx = rsrc->hr_indx;
1083 	status = hermon_write_mgm_cmd_post(state, mcg_entry, end_indx,
1084 	    HERMON_CMD_NOSLEEP_SPIN);
1085 	if (status != HERMON_CMD_SUCCESS) {
1086 		bzero(newmcg, sizeof (struct hermon_sw_mcg_list_s));
1087 		hermon_rsrc_free(state, &rsrc);
1088 		mutex_exit(&state->hs_mcglock);
1089 		HERMON_WARNING(state, "failed to write MCG entry");
1090 		cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: %08x\n",
1091 		    status);
1092 		if (status == HERMON_CMD_INVALID_STATUS) {
1093 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1094 		}
1095 		return (ibc_get_ci_failure(0));
1096 	}
1097 	mcg = &state->hs_mcghdl[end_indx];
1098 	mcg->mcg_next_indx = rsrc->hr_indx;
1099 
1100 	/*
1101 	 * Now that we know all the Hermon firmware accesses have been
1102 	 * successful, we update the new "shadow" MCG entry by incrementing
1103 	 * the "number of attached QPs" count.  Then we drop the lock and
1104 	 * return success.
1105 	 */
1106 	newmcg->mcg_num_qps++;
1107 
1108 	/*
1109 	 * Increment the refcnt for this QP.  Because the QP
1110 	 * was added to this MCG, the refcnt must be
1111 	 * incremented.
1112 	 */
1113 	hermon_qp_mcg_refcnt_inc(qp);
1114 
1115 	mutex_exit(&state->hs_mcglock);
1116 	return (DDI_SUCCESS);
1117 }
1118 
1119 
1120 /*
1121  * hermon_mcg_detach()
1122  *    Context: Can be called only from user or kernel context.
1123  */
1124 int
1125 hermon_mcg_detach(hermon_state_t *state, hermon_qphdl_t qp, ib_gid_t gid,
1126     ib_lid_t lid)
1127 {
1128 	hermon_hw_mcg_t		*mcg_entry;
1129 	hermon_hw_mcg_qp_list_t	*mcg_entry_qplist;
1130 	hermon_mcghdl_t		mcg;
1131 	uint64_t		mgid_hash;
1132 	uint32_t		end_indx, prev_indx;
1133 	int			status;
1134 
1135 	/*
1136 	 * Check for invalid Multicast DLID.  Specifically, all Multicast
1137 	 * LIDs should be within a well defined range.  If the specified LID
1138 	 * is outside of that range, then return an error.
1139 	 */
1140 	if (hermon_mlid_is_valid(lid) == 0) {
1141 		return (IBT_MC_MLID_INVALID);
1142 	}
1143 
1144 	/*
1145 	 * Compute the MGID hash value.  As described above, the MCG table is
1146 	 * arranged as a number of separate hash chains.  This operation
1147 	 * converts the specified MGID into the starting index of an entry in
1148 	 * the hash table (i.e. the index for the start of the appropriate
1149 	 * hash chain).  Subsequent operations below will walk the chain
1150 	 * searching for a matching entry from which to attempt to remove
1151 	 * the specified QP.
1152 	 */
1153 	status = hermon_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid,
1154 	    &mgid_hash, HERMON_SLEEPFLAG_FOR_CONTEXT());
1155 	if (status != HERMON_CMD_SUCCESS) {
1156 		cmn_err(CE_CONT, "Hermon: MGID_HASH command failed: %08x\n",
1157 		    status);
1158 		if (status == HERMON_CMD_INVALID_STATUS) {
1159 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1160 		}
1161 		return (ibc_get_ci_failure(0));
1162 	}
1163 
1164 	/*
1165 	 * Grab the multicast group mutex.  Then grab the pre-allocated
1166 	 * temporary buffer used for holding and/or modifying MCG entries.
1167 	 */
1168 	mutex_enter(&state->hs_mcglock);
1169 	mcg_entry = state->hs_mcgtmp;
1170 	mcg_entry_qplist = HERMON_MCG_GET_QPLIST_PTR(mcg_entry);
1171 
1172 	/*
1173 	 * Walk through the array of MCG entries starting at "mgid_hash".
1174 	 * Try to find an MCG entry with a matching MGID.  The
1175 	 * hermon_mcg_walk_mgid_hash() routine walks the list and returns an
1176 	 * index into the MCG table.  The entry at this index is checked to
1177 	 * determine whether it is a match or not.  If it is a match, then
1178 	 * we continue on to attempt to remove the QP from the MCG.  If it
1179 	 * is not a match (or not a valid MCG entry), then we return an error.
1180 	 */
1181 	end_indx = hermon_mcg_walk_mgid_hash(state, mgid_hash, gid, &prev_indx);
1182 	mcg	 = &state->hs_mcghdl[end_indx];
1183 
1184 	/*
1185 	 * If MGID == 0 (the hash chain is empty) or if the specified MGID
1186 	 * does not match the MGID in the current entry, then return
1187 	 * IBT_MC_MGID_INVALID (to indicate that the specified MGID is not
1188 	 * valid).
1189 	 */
1190 	if (((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) ||
1191 	    ((mcg->mcg_mgid_h != gid.gid_prefix) ||
1192 	    (mcg->mcg_mgid_l != gid.gid_guid))) {
1193 		mutex_exit(&state->hs_mcglock);
1194 		return (IBT_MC_MGID_INVALID);
1195 	}
1196 
1197 	/*
1198 	 * Read the current MCG entry into the temporary MCG.  Note: In
1199 	 * general, this operation shouldn't fail.  If it does, then it is
1200 	 * an indication that something (probably in HW, but maybe in SW)
1201 	 * has gone seriously wrong.
1202 	 */
1203 	status = hermon_read_mgm_cmd_post(state, mcg_entry, end_indx,
1204 	    HERMON_CMD_NOSLEEP_SPIN);
1205 	if (status != HERMON_CMD_SUCCESS) {
1206 		mutex_exit(&state->hs_mcglock);
1207 		HERMON_WARNING(state, "failed to read MCG entry");
1208 		cmn_err(CE_CONT, "Hermon: READ_MGM command failed: %08x\n",
1209 		    status);
1210 		if (status == HERMON_CMD_INVALID_STATUS) {
1211 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1212 		}
1213 		return (ibc_get_ci_failure(0));
1214 	}
1215 
1216 	/*
1217 	 * Search the QP number list for a match.  If a match is found, then
1218 	 * remove the entry from the QP list.  Otherwise, if no match is found,
1219 	 * return an error.
1220 	 */
1221 	status = hermon_mcg_qplist_remove(mcg, mcg_entry_qplist, qp);
1222 	if (status != DDI_SUCCESS) {
1223 		mutex_exit(&state->hs_mcglock);
1224 		return (status);
1225 	}
1226 
1227 	/*
1228 	 * Decrement the MCG count for this QP.  When the 'qp_mcg'
1229 	 * field becomes 0, then this QP is no longer a member of any
1230 	 * MCG.
1231 	 */
1232 	hermon_qp_mcg_refcnt_dec(qp);
1233 
1234 	/*
1235 	 * If the current MCG's QP number list is about to be made empty
1236 	 * ("mcg_num_qps" == 1), then remove the entry itself from the hash
1237 	 * chain.  Otherwise, just write the updated MCG entry back to the
1238 	 * hardware.  In either case, once we successfully update the hardware
1239 	 * chain, then we decrement the "shadow" list entry's "mcg_num_qps"
1240 	 * count (or zero out the entire "shadow" list entry) before returning
1241 	 * success.  Note:  Zeroing out the "shadow" list entry is done
1242 	 * inside of hermon_mcg_hash_list_remove().
1243 	 */
1244 	if (mcg->mcg_num_qps == 1) {
1245 
1246 		/* Remove an MCG entry from the hash chain */
1247 		status = hermon_mcg_hash_list_remove(state, end_indx, prev_indx,
1248 		    mcg_entry);
1249 		if (status != DDI_SUCCESS) {
1250 			mutex_exit(&state->hs_mcglock);
1251 			return (status);
1252 		}
1253 
1254 	} else {
1255 		/*
1256 		 * Write the updated MCG entry back to the Hermon MCG table.
1257 		 * If this succeeds, then we update the "shadow" list to
1258 		 * reflect the change (i.e. decrement the "mcg_num_qps"),
1259 		 * drop the lock, and return success.  Note:  In general,
1260 		 * this operation shouldn't fail.  If it does, then it is an
1261 		 * indication that something (probably in HW, but maybe in SW)
1262 		 * has gone seriously wrong.
1263 		 */
1264 		mcg_entry->member_cnt = (mcg->mcg_num_qps - 1);
1265 		status = hermon_write_mgm_cmd_post(state, mcg_entry, end_indx,
1266 		    HERMON_CMD_NOSLEEP_SPIN);
1267 		if (status != HERMON_CMD_SUCCESS) {
1268 			mutex_exit(&state->hs_mcglock);
1269 			HERMON_WARNING(state, "failed to write MCG entry");
1270 			cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: "
1271 			    "%08x\n", status);
1272 			if (status == HERMON_CMD_INVALID_STATUS) {
1273 				hermon_fm_ereport(state, HCA_SYS_ERR,
1274 				    HCA_ERR_SRV_LOST);
1275 			}
1276 			return (ibc_get_ci_failure(0));
1277 		}
1278 		mcg->mcg_num_qps--;
1279 	}
1280 
1281 	mutex_exit(&state->hs_mcglock);
1282 	return (DDI_SUCCESS);
1283 }
1284 
1285 /*
1286  * hermon_qp_mcg_refcnt_inc()
1287  *    Context: Can be called from interrupt or base context.
1288  */
1289 static void
1290 hermon_qp_mcg_refcnt_inc(hermon_qphdl_t qp)
1291 {
1292 	/* Increment the QP's MCG reference count */
1293 	mutex_enter(&qp->qp_lock);
1294 	qp->qp_mcg_refcnt++;
1295 	mutex_exit(&qp->qp_lock);
1296 }
1297 
1298 
1299 /*
1300  * hermon_qp_mcg_refcnt_dec()
1301  *    Context: Can be called from interrupt or base context.
1302  */
1303 static void
1304 hermon_qp_mcg_refcnt_dec(hermon_qphdl_t qp)
1305 {
1306 	/* Decrement the QP's MCG reference count */
1307 	mutex_enter(&qp->qp_lock);
1308 	qp->qp_mcg_refcnt--;
1309 	mutex_exit(&qp->qp_lock);
1310 }
1311 
1312 
1313 /*
1314  * hermon_mcg_qplist_add()
1315  *    Context: Can be called from interrupt or base context.
1316  */
1317 static int
1318 hermon_mcg_qplist_add(hermon_state_t *state, hermon_mcghdl_t mcg,
1319     hermon_hw_mcg_qp_list_t *mcg_qplist, hermon_qphdl_t qp,
1320     uint_t *qp_found)
1321 {
1322 	uint_t		qplist_indx;
1323 
1324 	ASSERT(MUTEX_HELD(&state->hs_mcglock));
1325 
1326 	qplist_indx = mcg->mcg_num_qps;
1327 
1328 	/*
1329 	 * Determine if we have exceeded the maximum number of QP per
1330 	 * multicast group.  If we have, then return an error
1331 	 */
1332 	if (qplist_indx >= state->hs_cfg_profile->cp_num_qp_per_mcg) {
1333 		return (IBT_HCA_MCG_QP_EXCEEDED);
1334 	}
1335 
1336 	/*
1337 	 * Determine if the QP is already attached to this MCG table.  If it
1338 	 * is, then we break out and treat this operation as a NO-OP
1339 	 */
1340 	for (qplist_indx = 0; qplist_indx < mcg->mcg_num_qps;
1341 	    qplist_indx++) {
1342 		if (mcg_qplist[qplist_indx].qpn == qp->qp_qpnum) {
1343 			break;
1344 		}
1345 	}
1346 
1347 	/*
1348 	 * If the QP was already on the list, set 'qp_found' to TRUE.  We still
1349 	 * return SUCCESS in this case, but the qplist will not have been
1350 	 * updated because the QP was already on the list.
1351 	 */
1352 	if (qplist_indx < mcg->mcg_num_qps) {
1353 		*qp_found = 1;
1354 	} else {
1355 		/*
1356 		 * Otherwise, append the new QP number to the end of the
1357 		 * current QP list.  Note: We will increment the "mcg_num_qps"
1358 		 * field on the "shadow" MCG list entry later (after we know
1359 		 * that all necessary Hermon firmware accesses have been
1360 		 * successful).
1361 		 *
1362 		 * Set 'qp_found' to 0 so we know the QP was added on to the
1363 		 * list for sure.
1364 		 */
1365 		mcg_qplist[qplist_indx].qpn =
1366 		    (qp->qp_qpnum | HERMON_MCG_QPN_BLOCK_LB);
1367 		*qp_found = 0;
1368 	}
1369 
1370 	return (DDI_SUCCESS);
1371 }
1372 
1373 
1374 
1375 /*
1376  * hermon_mcg_qplist_remove()
1377  *    Context: Can be called from interrupt or base context.
1378  */
1379 static int
1380 hermon_mcg_qplist_remove(hermon_mcghdl_t mcg,
1381     hermon_hw_mcg_qp_list_t *mcg_qplist, hermon_qphdl_t qp)
1382 {
1383 	uint_t		i, qplist_indx;
1384 
1385 	/*
1386 	 * Search the MCG QP list for a matching QPN.  When
1387 	 * it's found, we swap the last entry with the current
1388 	 * one, set the last entry to zero, decrement the last
1389 	 * entry, and return.  If it's not found, then it's
1390 	 * and error.
1391 	 */
1392 	qplist_indx = mcg->mcg_num_qps;
1393 	for (i = 0; i < qplist_indx; i++) {
1394 		if (mcg_qplist[i].qpn == qp->qp_qpnum) {
1395 			mcg_qplist[i] = mcg_qplist[qplist_indx - 1];
1396 			mcg_qplist[qplist_indx - 1].qpn = 0;
1397 
1398 			return (DDI_SUCCESS);
1399 		}
1400 	}
1401 
1402 	return (IBT_QP_HDL_INVALID);
1403 }
1404 
1405 
1406 /*
1407  * hermon_mcg_walk_mgid_hash()
1408  *    Context: Can be called from interrupt or base context.
1409  */
1410 static uint_t
1411 hermon_mcg_walk_mgid_hash(hermon_state_t *state, uint64_t start_indx,
1412     ib_gid_t mgid, uint_t *p_indx)
1413 {
1414 	hermon_mcghdl_t	curr_mcghdl;
1415 	uint_t		curr_indx, prev_indx;
1416 
1417 	ASSERT(MUTEX_HELD(&state->hs_mcglock));
1418 
1419 	/* Start at the head of the hash chain */
1420 	curr_indx   = (uint_t)start_indx;
1421 	prev_indx   = curr_indx;
1422 	curr_mcghdl = &state->hs_mcghdl[curr_indx];
1423 
1424 	/* If the first entry in the chain has MGID == 0, then stop */
1425 	if ((curr_mcghdl->mcg_mgid_h == 0) &&
1426 	    (curr_mcghdl->mcg_mgid_l == 0)) {
1427 		goto end_mgid_hash_walk;
1428 	}
1429 
1430 	/* If the first entry in the chain matches the MGID, then stop */
1431 	if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) &&
1432 	    (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) {
1433 		goto end_mgid_hash_walk;
1434 	}
1435 
1436 	/* Otherwise, walk the hash chain looking for a match */
1437 	while (curr_mcghdl->mcg_next_indx != 0) {
1438 		prev_indx = curr_indx;
1439 		curr_indx = curr_mcghdl->mcg_next_indx;
1440 		curr_mcghdl = &state->hs_mcghdl[curr_indx];
1441 
1442 		if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) &&
1443 		    (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) {
1444 			break;
1445 		}
1446 	}
1447 
1448 end_mgid_hash_walk:
1449 	/*
1450 	 * If necessary, return the index of the previous entry too.  This
1451 	 * is primarily used for detaching a QP from a multicast group.  It
1452 	 * may be necessary, in that case, to delete an MCG entry from the
1453 	 * hash chain and having the index of the previous entry is helpful.
1454 	 */
1455 	if (p_indx != NULL) {
1456 		*p_indx = prev_indx;
1457 	}
1458 	return (curr_indx);
1459 }
1460 
1461 
1462 /*
1463  * hermon_mcg_setup_new_hdr()
1464  *    Context: Can be called from interrupt or base context.
1465  */
1466 static void
1467 hermon_mcg_setup_new_hdr(hermon_mcghdl_t mcg, hermon_hw_mcg_t *mcg_hdr,
1468     ib_gid_t mgid, hermon_rsrc_t *mcg_rsrc)
1469 {
1470 	/*
1471 	 * Fill in the fields of the "shadow" entry used by software
1472 	 * to track MCG hardware entry
1473 	 */
1474 	mcg->mcg_mgid_h	   = mgid.gid_prefix;
1475 	mcg->mcg_mgid_l	   = mgid.gid_guid;
1476 	mcg->mcg_rsrcp	   = mcg_rsrc;
1477 	mcg->mcg_next_indx = 0;
1478 	mcg->mcg_num_qps   = 0;
1479 
1480 	/*
1481 	 * Fill the header fields of the MCG entry (in the temporary copy)
1482 	 */
1483 	mcg_hdr->mgid_h		= mgid.gid_prefix;
1484 	mcg_hdr->mgid_l		= mgid.gid_guid;
1485 	mcg_hdr->next_gid_indx	= 0;
1486 }
1487 
1488 
1489 /*
1490  * hermon_mcg_hash_list_remove()
1491  *    Context: Can be called only from user or kernel context.
1492  */
1493 static int
1494 hermon_mcg_hash_list_remove(hermon_state_t *state, uint_t curr_indx,
1495     uint_t prev_indx, hermon_hw_mcg_t *mcg_entry)
1496 {
1497 	hermon_mcghdl_t		curr_mcg, prev_mcg, next_mcg;
1498 	uint_t			next_indx;
1499 	int			status;
1500 
1501 	/* Get the pointer to "shadow" list for current entry */
1502 	curr_mcg = &state->hs_mcghdl[curr_indx];
1503 
1504 	/*
1505 	 * If this is the first entry on a hash chain, then attempt to replace
1506 	 * the entry with the next entry on the chain.  If there are no
1507 	 * subsequent entries on the chain, then this is the only entry and
1508 	 * should be invalidated.
1509 	 */
1510 	if (curr_indx == prev_indx) {
1511 
1512 		/*
1513 		 * If this is the only entry on the chain, then invalidate it.
1514 		 * Note:  Invalidating an MCG entry means writing all zeros
1515 		 * to the entry.  This is only necessary for those MCG
1516 		 * entries that are the "head" entries of the individual hash
1517 		 * chains.  Regardless of whether this operation returns
1518 		 * success or failure, return that result to the caller.
1519 		 */
1520 		next_indx = curr_mcg->mcg_next_indx;
1521 		if (next_indx == 0) {
1522 			status = hermon_mcg_entry_invalidate(state, mcg_entry,
1523 			    curr_indx);
1524 			bzero(curr_mcg, sizeof (struct hermon_sw_mcg_list_s));
1525 			return (status);
1526 		}
1527 
1528 		/*
1529 		 * Otherwise, this is just the first entry on the chain, so
1530 		 * grab the next one
1531 		 */
1532 		next_mcg = &state->hs_mcghdl[next_indx];
1533 
1534 		/*
1535 		 * Read the next MCG entry into the temporary MCG.  Note:
1536 		 * In general, this operation shouldn't fail.  If it does,
1537 		 * then it is an indication that something (probably in HW,
1538 		 * but maybe in SW) has gone seriously wrong.
1539 		 */
1540 		status = hermon_read_mgm_cmd_post(state, mcg_entry, next_indx,
1541 		    HERMON_CMD_NOSLEEP_SPIN);
1542 		if (status != HERMON_CMD_SUCCESS) {
1543 			HERMON_WARNING(state, "failed to read MCG entry");
1544 			cmn_err(CE_CONT, "Hermon: READ_MGM command failed: "
1545 			    "%08x\n", status);
1546 			if (status == HERMON_CMD_INVALID_STATUS) {
1547 				hermon_fm_ereport(state, HCA_SYS_ERR,
1548 				    HCA_ERR_SRV_LOST);
1549 			}
1550 			return (ibc_get_ci_failure(0));
1551 		}
1552 
1553 		/*
1554 		 * Copy/Write the temporary MCG back to the hardware MCG list
1555 		 * using the current index.  This essentially removes the
1556 		 * current MCG entry from the list by writing over it with
1557 		 * the next one.  If this is successful, then we can do the
1558 		 * same operation for the "shadow" list.  And we can also
1559 		 * free up the Hermon MCG entry resource that was associated
1560 		 * with the (old) next entry.  Note:  In general, this
1561 		 * operation shouldn't fail.  If it does, then it is an
1562 		 * indication that something (probably in HW, but maybe in SW)
1563 		 * has gone seriously wrong.
1564 		 */
1565 		status = hermon_write_mgm_cmd_post(state, mcg_entry, curr_indx,
1566 		    HERMON_CMD_NOSLEEP_SPIN);
1567 		if (status != HERMON_CMD_SUCCESS) {
1568 			HERMON_WARNING(state, "failed to write MCG entry");
1569 			cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: "
1570 			    "%08x\n", status);
1571 			if (status == HERMON_CMD_INVALID_STATUS) {
1572 				hermon_fm_ereport(state, HCA_SYS_ERR,
1573 				    HCA_ERR_SRV_LOST);
1574 			}
1575 			return (ibc_get_ci_failure(0));
1576 		}
1577 
1578 		/*
1579 		 * Copy all the software tracking information from the next
1580 		 * entry on the "shadow" MCG list into the current entry on
1581 		 * the list.  Then invalidate (zero out) the other "shadow"
1582 		 * list entry.
1583 		 */
1584 		bcopy(next_mcg, curr_mcg, sizeof (struct hermon_sw_mcg_list_s));
1585 		bzero(next_mcg, sizeof (struct hermon_sw_mcg_list_s));
1586 
1587 		/*
1588 		 * Free up the Hermon MCG entry resource used by the "next"
1589 		 * MCG entry.  That resource is no longer needed by any
1590 		 * MCG entry which is first on a hash chain (like the "next"
1591 		 * entry has just become).
1592 		 */
1593 		hermon_rsrc_free(state, &curr_mcg->mcg_rsrcp);
1594 
1595 		return (DDI_SUCCESS);
1596 	}
1597 
1598 	/*
1599 	 * Else if this is the last entry on the hash chain (or a middle
1600 	 * entry, then we update the previous entry's "next_gid_index" field
1601 	 * to make it point instead to the next entry on the chain.  By
1602 	 * skipping over the removed entry in this way, we can then free up
1603 	 * any resources associated with the current entry.  Note:  We don't
1604 	 * need to invalidate the "skipped over" hardware entry because it
1605 	 * will no be longer connected to any hash chains, and if/when it is
1606 	 * finally re-used, it will be written with entirely new values.
1607 	 */
1608 
1609 	/*
1610 	 * Read the next MCG entry into the temporary MCG.  Note:  In general,
1611 	 * this operation shouldn't fail.  If it does, then it is an
1612 	 * indication that something (probably in HW, but maybe in SW) has
1613 	 * gone seriously wrong.
1614 	 */
1615 	status = hermon_read_mgm_cmd_post(state, mcg_entry, prev_indx,
1616 	    HERMON_CMD_NOSLEEP_SPIN);
1617 	if (status != HERMON_CMD_SUCCESS) {
1618 		HERMON_WARNING(state, "failed to read MCG entry");
1619 		cmn_err(CE_CONT, "Hermon: READ_MGM command failed: %08x\n",
1620 		    status);
1621 		if (status == HERMON_CMD_INVALID_STATUS) {
1622 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1623 		}
1624 		return (ibc_get_ci_failure(0));
1625 	}
1626 
1627 	/*
1628 	 * Finally, we update the "next_gid_indx" field in the temporary MCG
1629 	 * and attempt to write the entry back into the Hermon MCG table.  If
1630 	 * this succeeds, then we update the "shadow" list to reflect the
1631 	 * change, free up the Hermon MCG entry resource that was associated
1632 	 * with the current entry, and return success.  Note:  In general,
1633 	 * this operation shouldn't fail.  If it does, then it is an indication
1634 	 * that something (probably in HW, but maybe in SW) has gone seriously
1635 	 * wrong.
1636 	 */
1637 	mcg_entry->next_gid_indx = curr_mcg->mcg_next_indx;
1638 	status = hermon_write_mgm_cmd_post(state, mcg_entry, prev_indx,
1639 	    HERMON_CMD_NOSLEEP_SPIN);
1640 	if (status != HERMON_CMD_SUCCESS) {
1641 		HERMON_WARNING(state, "failed to write MCG entry");
1642 		cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: %08x\n",
1643 		    status);
1644 		if (status == HERMON_CMD_INVALID_STATUS) {
1645 			hermon_fm_ereport(state, HCA_SYS_ERR,
1646 			    HCA_ERR_SRV_LOST);
1647 		}
1648 		return (ibc_get_ci_failure(0));
1649 	}
1650 
1651 	/*
1652 	 * Get the pointer to the "shadow" MCG list entry for the previous
1653 	 * MCG.  Update its "mcg_next_indx" to point to the next entry
1654 	 * the one after the current entry. Note:  This next index may be
1655 	 * zero, indicating the end of the list.
1656 	 */
1657 	prev_mcg = &state->hs_mcghdl[prev_indx];
1658 	prev_mcg->mcg_next_indx = curr_mcg->mcg_next_indx;
1659 
1660 	/*
1661 	 * Free up the Hermon MCG entry resource used by the current entry.
1662 	 * This resource is no longer needed because the chain now skips over
1663 	 * the current entry.  Then invalidate (zero out) the current "shadow"
1664 	 * list entry.
1665 	 */
1666 	hermon_rsrc_free(state, &curr_mcg->mcg_rsrcp);
1667 	bzero(curr_mcg, sizeof (struct hermon_sw_mcg_list_s));
1668 
1669 	return (DDI_SUCCESS);
1670 }
1671 
1672 
1673 /*
1674  * hermon_mcg_entry_invalidate()
1675  *    Context: Can be called only from user or kernel context.
1676  */
1677 static int
1678 hermon_mcg_entry_invalidate(hermon_state_t *state, hermon_hw_mcg_t *mcg_entry,
1679     uint_t indx)
1680 {
1681 	int		status;
1682 
1683 	/*
1684 	 * Invalidate the hardware MCG entry by zeroing out this temporary
1685 	 * MCG and writing it the the hardware.  Note: In general, this
1686 	 * operation shouldn't fail.  If it does, then it is an indication
1687 	 * that something (probably in HW, but maybe in SW) has gone seriously
1688 	 * wrong.
1689 	 */
1690 	bzero(mcg_entry, HERMON_MCGMEM_SZ(state));
1691 	status = hermon_write_mgm_cmd_post(state, mcg_entry, indx,
1692 	    HERMON_CMD_NOSLEEP_SPIN);
1693 	if (status != HERMON_CMD_SUCCESS) {
1694 		HERMON_WARNING(state, "failed to write MCG entry");
1695 		cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: %08x\n",
1696 		    status);
1697 		if (status == HERMON_CMD_INVALID_STATUS) {
1698 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1699 		}
1700 		return (ibc_get_ci_failure(0));
1701 	}
1702 
1703 	return (DDI_SUCCESS);
1704 }
1705 
1706 
1707 /*
1708  * hermon_mgid_is_valid()
1709  *    Context: Can be called from interrupt or base context.
1710  */
1711 static int
1712 hermon_mgid_is_valid(ib_gid_t gid)
1713 {
1714 	uint_t		topbits, flags, scope;
1715 
1716 	/*
1717 	 * According to IBA 1.1 specification (section 4.1.1) a valid
1718 	 * "multicast GID" must have its top eight bits set to all ones
1719 	 */
1720 	topbits = (gid.gid_prefix >> HERMON_MCG_TOPBITS_SHIFT) &
1721 	    HERMON_MCG_TOPBITS_MASK;
1722 	if (topbits != HERMON_MCG_TOPBITS) {
1723 		return (0);
1724 	}
1725 
1726 	/*
1727 	 * The next 4 bits are the "flag" bits.  These are valid only
1728 	 * if they are "0" (which correspond to permanently assigned/
1729 	 * "well-known" multicast GIDs) or "1" (for so-called "transient"
1730 	 * multicast GIDs).  All other values are reserved.
1731 	 */
1732 	flags = (gid.gid_prefix >> HERMON_MCG_FLAGS_SHIFT) &
1733 	    HERMON_MCG_FLAGS_MASK;
1734 	if (!((flags == HERMON_MCG_FLAGS_PERM) ||
1735 	    (flags == HERMON_MCG_FLAGS_NONPERM))) {
1736 		return (0);
1737 	}
1738 
1739 	/*
1740 	 * The next 4 bits are the "scope" bits.  These are valid only
1741 	 * if they are "2" (Link-local), "5" (Site-local), "8"
1742 	 * (Organization-local) or "E" (Global).  All other values
1743 	 * are reserved (or currently unassigned).
1744 	 */
1745 	scope = (gid.gid_prefix >> HERMON_MCG_SCOPE_SHIFT) &
1746 	    HERMON_MCG_SCOPE_MASK;
1747 	if (!((scope == HERMON_MCG_SCOPE_LINKLOC) ||
1748 	    (scope == HERMON_MCG_SCOPE_SITELOC)	 ||
1749 	    (scope == HERMON_MCG_SCOPE_ORGLOC)	 ||
1750 	    (scope == HERMON_MCG_SCOPE_GLOBAL))) {
1751 		return (0);
1752 	}
1753 
1754 	/*
1755 	 * If it passes all of the above checks, then we will consider it
1756 	 * a valid multicast GID.
1757 	 */
1758 	return (1);
1759 }
1760 
1761 
1762 /*
1763  * hermon_mlid_is_valid()
1764  *    Context: Can be called from interrupt or base context.
1765  */
1766 static int
1767 hermon_mlid_is_valid(ib_lid_t lid)
1768 {
1769 	/*
1770 	 * According to IBA 1.1 specification (section 4.1.1) a valid
1771 	 * "multicast DLID" must be between 0xC000 and 0xFFFE.
1772 	 */
1773 	if ((lid < IB_LID_MC_FIRST) || (lid > IB_LID_MC_LAST)) {
1774 		return (0);
1775 	}
1776 
1777 	return (1);
1778 }
1779 
1780 
1781 /*
1782  * hermon_pd_alloc()
1783  *    Context: Can be called only from user or kernel context.
1784  */
1785 int
1786 hermon_pd_alloc(hermon_state_t *state, hermon_pdhdl_t *pdhdl, uint_t sleepflag)
1787 {
1788 	hermon_rsrc_t	*rsrc;
1789 	hermon_pdhdl_t	pd;
1790 	int		status;
1791 
1792 	/*
1793 	 * Allocate the software structure for tracking the protection domain
1794 	 * (i.e. the Hermon Protection Domain handle).  By default each PD
1795 	 * structure will have a unique PD number assigned to it.  All that
1796 	 * is necessary is for software to initialize the PD reference count
1797 	 * (to zero) and return success.
1798 	 */
1799 	status = hermon_rsrc_alloc(state, HERMON_PDHDL, 1, sleepflag, &rsrc);
1800 	if (status != DDI_SUCCESS) {
1801 		return (IBT_INSUFF_RESOURCE);
1802 	}
1803 	pd = (hermon_pdhdl_t)rsrc->hr_addr;
1804 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pd))
1805 
1806 	pd->pd_refcnt = 0;
1807 	*pdhdl = pd;
1808 
1809 	return (DDI_SUCCESS);
1810 }
1811 
1812 
1813 /*
1814  * hermon_pd_free()
1815  *    Context: Can be called only from user or kernel context.
1816  */
1817 int
1818 hermon_pd_free(hermon_state_t *state, hermon_pdhdl_t *pdhdl)
1819 {
1820 	hermon_rsrc_t	*rsrc;
1821 	hermon_pdhdl_t	pd;
1822 
1823 	/*
1824 	 * Pull all the necessary information from the Hermon Protection Domain
1825 	 * handle.  This is necessary here because the resource for the
1826 	 * PD is going to be freed up as part of this operation.
1827 	 */
1828 	pd   = *pdhdl;
1829 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pd))
1830 	rsrc = pd->pd_rsrcp;
1831 
1832 	/*
1833 	 * Check the PD reference count.  If the reference count is non-zero,
1834 	 * then it means that this protection domain is still referenced by
1835 	 * some memory region, queue pair, address handle, or other IB object
1836 	 * If it is non-zero, then return an error.  Otherwise, free the
1837 	 * Hermon resource and return success.
1838 	 */
1839 	if (pd->pd_refcnt != 0) {
1840 		return (IBT_PD_IN_USE);
1841 	}
1842 
1843 	/* Free the Hermon Protection Domain handle */
1844 	hermon_rsrc_free(state, &rsrc);
1845 
1846 	/* Set the pdhdl pointer to NULL and return success */
1847 	*pdhdl = (hermon_pdhdl_t)NULL;
1848 
1849 	return (DDI_SUCCESS);
1850 }
1851 
1852 
1853 /*
1854  * hermon_pd_refcnt_inc()
1855  *    Context: Can be called from interrupt or base context.
1856  */
1857 void
1858 hermon_pd_refcnt_inc(hermon_pdhdl_t pd)
1859 {
1860 	/* Increment the protection domain's reference count */
1861 	atomic_inc_32(&pd->pd_refcnt);
1862 }
1863 
1864 
1865 /*
1866  * hermon_pd_refcnt_dec()
1867  *    Context: Can be called from interrupt or base context.
1868  */
1869 void
1870 hermon_pd_refcnt_dec(hermon_pdhdl_t pd)
1871 {
1872 	/* Decrement the protection domain's reference count */
1873 	atomic_dec_32(&pd->pd_refcnt);
1874 }
1875 
1876 
1877 /*
1878  * hermon_port_query()
1879  *    Context: Can be called only from user or kernel context.
1880  */
1881 int
1882 hermon_port_query(hermon_state_t *state, uint_t port, ibt_hca_portinfo_t *pi)
1883 {
1884 	sm_portinfo_t		portinfo;
1885 	sm_guidinfo_t		guidinfo;
1886 	sm_pkey_table_t		pkeytable;
1887 	ib_gid_t		*sgid;
1888 	uint_t			sgid_max, pkey_max, tbl_size;
1889 	int			i, j, indx, status;
1890 	ib_pkey_t		*pkeyp;
1891 	ib_guid_t		*guidp;
1892 
1893 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pi))
1894 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1895 
1896 	/* Validate that specified port number is legal */
1897 	if (!hermon_portnum_is_valid(state, port)) {
1898 		return (IBT_HCA_PORT_INVALID);
1899 	}
1900 	pkeyp = state->hs_pkey[port - 1];
1901 	guidp = state->hs_guid[port - 1];
1902 
1903 	/*
1904 	 * We use the Hermon MAD_IFC command to post a GetPortInfo MAD
1905 	 * to the firmware (for the specified port number).  This returns
1906 	 * a full PortInfo MAD (in "portinfo") which we subsequently
1907 	 * parse to fill in the "ibt_hca_portinfo_t" structure returned
1908 	 * to the IBTF.
1909 	 */
1910 	status = hermon_getportinfo_cmd_post(state, port,
1911 	    HERMON_SLEEPFLAG_FOR_CONTEXT(), &portinfo);
1912 	if (status != HERMON_CMD_SUCCESS) {
1913 		cmn_err(CE_CONT, "Hermon: GetPortInfo (port %02d) command "
1914 		    "failed: %08x\n", port, status);
1915 		if (status == HERMON_CMD_INVALID_STATUS) {
1916 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1917 		}
1918 		return (ibc_get_ci_failure(0));
1919 	}
1920 
1921 	/*
1922 	 * Parse the PortInfo MAD and fill in the IBTF structure
1923 	 */
1924 	pi->p_base_lid		= portinfo.LID;
1925 	pi->p_qkey_violations	= portinfo.Q_KeyViolations;
1926 	pi->p_pkey_violations	= portinfo.P_KeyViolations;
1927 	pi->p_sm_sl		= portinfo.MasterSMSL;
1928 	pi->p_sm_lid		= portinfo.MasterSMLID;
1929 	pi->p_linkstate		= portinfo.PortState;
1930 	pi->p_port_num		= portinfo.LocalPortNum;
1931 	pi->p_phys_state	= portinfo.PortPhysicalState;
1932 	pi->p_width_supported	= portinfo.LinkWidthSupported;
1933 	pi->p_width_enabled	= portinfo.LinkWidthEnabled;
1934 	pi->p_width_active	= portinfo.LinkWidthActive;
1935 	pi->p_speed_supported	= portinfo.LinkSpeedSupported;
1936 	pi->p_speed_enabled	= portinfo.LinkSpeedEnabled;
1937 	pi->p_speed_active	= portinfo.LinkSpeedActive;
1938 	pi->p_mtu		= portinfo.MTUCap;
1939 	pi->p_lmc		= portinfo.LMC;
1940 	pi->p_max_vl		= portinfo.VLCap;
1941 	pi->p_subnet_timeout	= portinfo.SubnetTimeOut;
1942 	pi->p_msg_sz		= ((uint32_t)1 << HERMON_QP_LOG_MAX_MSGSZ);
1943 	tbl_size = state->hs_cfg_profile->cp_log_max_gidtbl;
1944 	pi->p_sgid_tbl_sz	= (1 << tbl_size);
1945 	tbl_size = state->hs_cfg_profile->cp_log_max_pkeytbl;
1946 	pi->p_pkey_tbl_sz	= (1 << tbl_size);
1947 	state->hs_sn_prefix[port - 1] = portinfo.GidPrefix;
1948 
1949 	/*
1950 	 * Convert InfiniBand-defined port capability flags to the format
1951 	 * specified by the IBTF
1952 	 */
1953 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM)
1954 		pi->p_capabilities |= IBT_PORT_CAP_SM;
1955 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM_DISABLED)
1956 		pi->p_capabilities |= IBT_PORT_CAP_SM_DISABLED;
1957 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SNMP_SUPPD)
1958 		pi->p_capabilities |= IBT_PORT_CAP_SNMP_TUNNEL;
1959 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_DM_SUPPD)
1960 		pi->p_capabilities |= IBT_PORT_CAP_DM;
1961 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_VM_SUPPD)
1962 		pi->p_capabilities |= IBT_PORT_CAP_VENDOR;
1963 
1964 	/*
1965 	 * Fill in the SGID table.  Since the only access to the Hermon
1966 	 * GID tables is through the firmware's MAD_IFC interface, we
1967 	 * post as many GetGUIDInfo MADs as necessary to read in the entire
1968 	 * contents of the SGID table (for the specified port).  Note:  The
1969 	 * GetGUIDInfo command only gets eight GUIDs per operation.  These
1970 	 * GUIDs are then appended to the GID prefix for the port (from the
1971 	 * GetPortInfo above) to form the entire SGID table.
1972 	 */
1973 	for (i = 0; i < pi->p_sgid_tbl_sz; i += 8) {
1974 		status = hermon_getguidinfo_cmd_post(state, port, i >> 3,
1975 		    HERMON_SLEEPFLAG_FOR_CONTEXT(), &guidinfo);
1976 		if (status != HERMON_CMD_SUCCESS) {
1977 			cmn_err(CE_CONT, "Hermon: GetGUIDInfo (port %02d) "
1978 			    "command failed: %08x\n", port, status);
1979 			if (status == HERMON_CMD_INVALID_STATUS) {
1980 				hermon_fm_ereport(state, HCA_SYS_ERR,
1981 				    HCA_ERR_SRV_LOST);
1982 			}
1983 			return (ibc_get_ci_failure(0));
1984 		}
1985 
1986 		/* Figure out how many of the entries are valid */
1987 		sgid_max = min((pi->p_sgid_tbl_sz - i), 8);
1988 		for (j = 0; j < sgid_max; j++) {
1989 			indx = (i + j);
1990 			sgid = &pi->p_sgid_tbl[indx];
1991 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sgid))
1992 			sgid->gid_prefix = portinfo.GidPrefix;
1993 			guidp[indx] = sgid->gid_guid =
1994 			    guidinfo.GUIDBlocks[j];
1995 		}
1996 	}
1997 
1998 	/*
1999 	 * Fill in the PKey table.  Just as for the GID tables above, the
2000 	 * only access to the Hermon PKey tables is through the firmware's
2001 	 * MAD_IFC interface.  We post as many GetPKeyTable MADs as necessary
2002 	 * to read in the entire contents of the PKey table (for the specified
2003 	 * port).  Note:  The GetPKeyTable command only gets 32 PKeys per
2004 	 * operation.
2005 	 */
2006 	for (i = 0; i < pi->p_pkey_tbl_sz; i += 32) {
2007 		status = hermon_getpkeytable_cmd_post(state, port, i,
2008 		    HERMON_SLEEPFLAG_FOR_CONTEXT(), &pkeytable);
2009 		if (status != HERMON_CMD_SUCCESS) {
2010 			cmn_err(CE_CONT, "Hermon: GetPKeyTable (port %02d) "
2011 			    "command failed: %08x\n", port, status);
2012 			if (status == HERMON_CMD_INVALID_STATUS) {
2013 				hermon_fm_ereport(state, HCA_SYS_ERR,
2014 				    HCA_ERR_SRV_LOST);
2015 			}
2016 			return (ibc_get_ci_failure(0));
2017 		}
2018 
2019 		/* Figure out how many of the entries are valid */
2020 		pkey_max = min((pi->p_pkey_tbl_sz - i), 32);
2021 		for (j = 0; j < pkey_max; j++) {
2022 			indx = (i + j);
2023 			pkeyp[indx] = pi->p_pkey_tbl[indx] =
2024 			    pkeytable.P_KeyTableBlocks[j];
2025 		}
2026 	}
2027 
2028 	return (DDI_SUCCESS);
2029 }
2030 
2031 
2032 /*
2033  * hermon_port_modify()
2034  *    Context: Can be called only from user or kernel context.
2035  */
2036 /* ARGSUSED */
2037 int
2038 hermon_port_modify(hermon_state_t *state, uint8_t port,
2039     ibt_port_modify_flags_t flags, uint8_t init_type)
2040 {
2041 	sm_portinfo_t		portinfo;
2042 	uint32_t		capmask;
2043 	int			status;
2044 	hermon_hw_set_port_t	set_port;
2045 
2046 	/*
2047 	 * Return an error if either of the unsupported flags are set
2048 	 */
2049 	if ((flags & IBT_PORT_SHUTDOWN) ||
2050 	    (flags & IBT_PORT_SET_INIT_TYPE)) {
2051 		return (IBT_NOT_SUPPORTED);
2052 	}
2053 
2054 	bzero(&set_port, sizeof (set_port));
2055 
2056 	/*
2057 	 * Determine whether we are trying to reset the QKey counter
2058 	 */
2059 	if (flags & IBT_PORT_RESET_QKEY)
2060 		set_port.rqk = 1;
2061 
2062 	/* Validate that specified port number is legal */
2063 	if (!hermon_portnum_is_valid(state, port)) {
2064 		return (IBT_HCA_PORT_INVALID);
2065 	}
2066 
2067 	/*
2068 	 * Use the Hermon MAD_IFC command to post a GetPortInfo MAD to the
2069 	 * firmware (for the specified port number).  This returns a full
2070 	 * PortInfo MAD (in "portinfo") from which we pull the current
2071 	 * capability mask.  We then modify the capability mask as directed
2072 	 * by the "pmod_flags" field, and write the updated capability mask
2073 	 * using the Hermon SET_IB command (below).
2074 	 */
2075 	status = hermon_getportinfo_cmd_post(state, port,
2076 	    HERMON_SLEEPFLAG_FOR_CONTEXT(), &portinfo);
2077 	if (status != HERMON_CMD_SUCCESS) {
2078 		if (status == HERMON_CMD_INVALID_STATUS) {
2079 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
2080 		}
2081 		return (ibc_get_ci_failure(0));
2082 	}
2083 
2084 	/*
2085 	 * Convert InfiniBand-defined port capability flags to the format
2086 	 * specified by the IBTF.  Specifically, we modify the capability
2087 	 * mask based on the specified values.
2088 	 */
2089 	capmask = portinfo.CapabilityMask;
2090 
2091 	if (flags & IBT_PORT_RESET_SM)
2092 		capmask &= ~SM_CAP_MASK_IS_SM;
2093 	else if (flags & IBT_PORT_SET_SM)
2094 		capmask |= SM_CAP_MASK_IS_SM;
2095 
2096 	if (flags & IBT_PORT_RESET_SNMP)
2097 		capmask &= ~SM_CAP_MASK_IS_SNMP_SUPPD;
2098 	else if (flags & IBT_PORT_SET_SNMP)
2099 		capmask |= SM_CAP_MASK_IS_SNMP_SUPPD;
2100 
2101 	if (flags & IBT_PORT_RESET_DEVMGT)
2102 		capmask &= ~SM_CAP_MASK_IS_DM_SUPPD;
2103 	else if (flags & IBT_PORT_SET_DEVMGT)
2104 		capmask |= SM_CAP_MASK_IS_DM_SUPPD;
2105 
2106 	if (flags & IBT_PORT_RESET_VENDOR)
2107 		capmask &= ~SM_CAP_MASK_IS_VM_SUPPD;
2108 	else if (flags & IBT_PORT_SET_VENDOR)
2109 		capmask |= SM_CAP_MASK_IS_VM_SUPPD;
2110 
2111 	set_port.cap_mask = capmask;
2112 
2113 	/*
2114 	 * Use the Hermon SET_PORT command to update the capability mask and
2115 	 * (possibly) reset the QKey violation counter for the specified port.
2116 	 * Note: In general, this operation shouldn't fail.  If it does, then
2117 	 * it is an indication that something (probably in HW, but maybe in
2118 	 * SW) has gone seriously wrong.
2119 	 */
2120 	status = hermon_set_port_cmd_post(state, &set_port, port,
2121 	    HERMON_SLEEPFLAG_FOR_CONTEXT());
2122 	if (status != HERMON_CMD_SUCCESS) {
2123 		HERMON_WARNING(state, "failed to modify port capabilities");
2124 		cmn_err(CE_CONT, "Hermon: SET_IB (port %02d) command failed: "
2125 		    "%08x\n", port, status);
2126 		if (status == HERMON_CMD_INVALID_STATUS) {
2127 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
2128 		}
2129 		return (ibc_get_ci_failure(0));
2130 	}
2131 
2132 	return (DDI_SUCCESS);
2133 }
2134 
2135 
2136 /*
2137  * hermon_set_addr_path()
2138  *    Context: Can be called from interrupt or base context.
2139  *
2140  * Note: This routine is used for two purposes.  It is used to fill in the
2141  * Hermon UDAV fields, and it is used to fill in the address path information
2142  * for QPs.  Because the two Hermon structures are similar, common fields can
2143  * be filled in here.  Because they are different, however, we pass
2144  * an additional flag to indicate which type is being filled and do each one
2145  * uniquely
2146  */
2147 
2148 int hermon_srate_override = -1;	/* allows ease of testing */
2149 
2150 int
2151 hermon_set_addr_path(hermon_state_t *state, ibt_adds_vect_t *av,
2152     hermon_hw_addr_path_t *path, uint_t type)
2153 {
2154 	uint_t		gidtbl_sz;
2155 	hermon_hw_udav_t *udav;
2156 
2157 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*av))
2158 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*path))
2159 
2160 	udav = (hermon_hw_udav_t *)(void *)path;
2161 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*udav))
2162 	path->mlid	= av->av_src_path;
2163 	path->rlid	= av->av_dlid;
2164 
2165 	switch (av->av_srate) {
2166 	case IBT_SRATE_2:	/* 1xSDR-2.5Gb/s injection rate */
2167 		path->max_stat_rate = 7; break;
2168 	case IBT_SRATE_10:	/* 4xSDR-10.0Gb/s injection rate */
2169 		path->max_stat_rate = 8; break;
2170 	case IBT_SRATE_30:	/* 12xSDR-30Gb/s injection rate */
2171 		path->max_stat_rate = 9; break;
2172 	case IBT_SRATE_5:	/* 1xDDR-5Gb/s injection rate */
2173 		path->max_stat_rate = 10; break;
2174 	case IBT_SRATE_20:	/* 4xDDR-20Gb/s injection rate */
2175 		path->max_stat_rate = 11; break;
2176 	case IBT_SRATE_40:	/* 4xQDR-40Gb/s injection rate */
2177 		path->max_stat_rate = 12; break;
2178 	case IBT_SRATE_60:	/* 12xDDR-60Gb/s injection rate */
2179 		path->max_stat_rate = 13; break;
2180 	case IBT_SRATE_80:	/* 8xQDR-80Gb/s injection rate */
2181 		path->max_stat_rate = 14; break;
2182 	case IBT_SRATE_120:	/* 12xQDR-120Gb/s injection rate */
2183 		path->max_stat_rate = 15; break;
2184 	case IBT_SRATE_NOT_SPECIFIED:	/* Max */
2185 		path->max_stat_rate = 0; break;
2186 	default:
2187 		return (IBT_STATIC_RATE_INVALID);
2188 	}
2189 	if (hermon_srate_override != -1) /* for evaluating HCA firmware */
2190 		path->max_stat_rate = hermon_srate_override;
2191 
2192 	/* If "grh" flag is set, then check for valid SGID index too */
2193 	gidtbl_sz = (1 << state->hs_queryport.log_max_gid);
2194 	if ((av->av_send_grh) && (av->av_sgid_ix > gidtbl_sz)) {
2195 		return (IBT_SGID_INVALID);
2196 	}
2197 
2198 	/*
2199 	 * Fill in all "global" values regardless of the value in the GRH
2200 	 * flag.  Because "grh" is not set unless "av_send_grh" is set, the
2201 	 * hardware will ignore the other "global" values as necessary.  Note:
2202 	 * SW does this here to enable later query operations to return
2203 	 * exactly the same params that were passed when the addr path was
2204 	 * last written.
2205 	 */
2206 	path->grh = av->av_send_grh;
2207 	if (type == HERMON_ADDRPATH_QP) {
2208 		path->mgid_index = av->av_sgid_ix;
2209 	} else {
2210 		/*
2211 		 * For Hermon UDAV, the "mgid_index" field is the index into
2212 		 * a combined table (not a per-port table), but having sections
2213 		 * for each port. So some extra calculations are necessary.
2214 		 */
2215 
2216 		path->mgid_index = ((av->av_port_num - 1) * gidtbl_sz) +
2217 		    av->av_sgid_ix;
2218 
2219 		udav->portnum = av->av_port_num;
2220 	}
2221 
2222 	/*
2223 	 * According to Hermon PRM, the (31:0) part of rgid_l must be set to
2224 	 * "0x2" if the 'grh' or 'g' bit is cleared.  It also says that we
2225 	 * only need to do it for UDAV's.  So we enforce that here.
2226 	 *
2227 	 * NOTE: The entire 64 bits worth of GUID info is actually being
2228 	 * preserved (for UDAVs) by the callers of this function
2229 	 * (hermon_ah_alloc() and hermon_ah_modify()) and as long as the
2230 	 * 'grh' bit is not set, the upper 32 bits (63:32) of rgid_l are
2231 	 * "don't care".
2232 	 */
2233 	if ((path->grh) || (type == HERMON_ADDRPATH_QP)) {
2234 		path->flow_label = av->av_flow;
2235 		path->tclass	 = av->av_tclass;
2236 		path->hop_limit	 = av->av_hop;
2237 		bcopy(&(av->av_dgid.gid_prefix), &(path->rgid_h),
2238 		    sizeof (uint64_t));
2239 		bcopy(&(av->av_dgid.gid_guid), &(path->rgid_l),
2240 		    sizeof (uint64_t));
2241 	} else {
2242 		path->rgid_l	 = 0x2;
2243 		path->flow_label = 0;
2244 		path->tclass	 = 0;
2245 		path->hop_limit	 = 0;
2246 		path->rgid_h	 = 0;
2247 	}
2248 	/* extract the default service level */
2249 	udav->sl = (HERMON_DEF_SCHED_SELECTION & 0x3C) >> 2;
2250 
2251 	return (DDI_SUCCESS);
2252 }
2253 
2254 
2255 /*
2256  * hermon_get_addr_path()
2257  *    Context: Can be called from interrupt or base context.
2258  *
2259  * Note: Just like hermon_set_addr_path() above, this routine is used for two
2260  * purposes.  It is used to read in the Hermon UDAV fields, and it is used to
2261  * read in the address path information for QPs.  Because the two Hermon
2262  * structures are similar, common fields can be read in here.  But because
2263  * they are slightly different, we pass an additional flag to indicate which
2264  * type is being read.
2265  */
2266 void
2267 hermon_get_addr_path(hermon_state_t *state, hermon_hw_addr_path_t *path,
2268     ibt_adds_vect_t *av, uint_t type)
2269 {
2270 	uint_t		gidtbl_sz;
2271 
2272 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*path))
2273 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*av))
2274 
2275 	av->av_src_path	= path->mlid;
2276 	av->av_dlid	= path->rlid;
2277 
2278 	/* Set "av_ipd" value from max_stat_rate */
2279 	switch (path->max_stat_rate) {
2280 	case 7:				/* 1xSDR-2.5Gb/s injection rate */
2281 		av->av_srate = IBT_SRATE_2; break;
2282 	case 8:				/* 4xSDR-10.0Gb/s injection rate */
2283 		av->av_srate = IBT_SRATE_10; break;
2284 	case 9:				/* 12xSDR-30Gb/s injection rate */
2285 		av->av_srate = IBT_SRATE_30; break;
2286 	case 10:			/* 1xDDR-5Gb/s injection rate */
2287 		av->av_srate = IBT_SRATE_5; break;
2288 	case 11:			/* 4xDDR-20Gb/s injection rate */
2289 		av->av_srate = IBT_SRATE_20; break;
2290 	case 12:			/* xQDR-40Gb/s injection rate */
2291 		av->av_srate = IBT_SRATE_40; break;
2292 	case 13:			/* 12xDDR-60Gb/s injection rate */
2293 		av->av_srate = IBT_SRATE_60; break;
2294 	case 14:			/* 8xQDR-80Gb/s injection rate */
2295 		av->av_srate = IBT_SRATE_80; break;
2296 	case 15:			/* 12xQDR-120Gb/s injection rate */
2297 		av->av_srate = IBT_SRATE_120; break;
2298 	case 0:				/* max */
2299 		av->av_srate = IBT_SRATE_10; break;
2300 	default:			/* 1x injection rate */
2301 		av->av_srate = IBT_SRATE_1X;
2302 	}
2303 
2304 	/*
2305 	 * Extract all "global" values regardless of the value in the GRH
2306 	 * flag.  Because "av_send_grh" is set only if "grh" is set, software
2307 	 * knows to ignore the other "global" values as necessary.  Note: SW
2308 	 * does it this way to enable these query operations to return exactly
2309 	 * the same params that were passed when the addr path was last written.
2310 	 */
2311 	av->av_send_grh		= path->grh;
2312 	if (type == HERMON_ADDRPATH_QP) {
2313 		av->av_sgid_ix  = path->mgid_index;
2314 	} else {
2315 		/*
2316 		 * For Hermon UDAV, the "mgid_index" field is the index into
2317 		 * a combined table (not a per-port table).
2318 		 */
2319 		gidtbl_sz = (1 << state->hs_queryport.log_max_gid);
2320 		av->av_sgid_ix = path->mgid_index - ((av->av_port_num - 1) *
2321 		    gidtbl_sz);
2322 
2323 		av->av_port_num = ((hermon_hw_udav_t *)(void *)path)->portnum;
2324 	}
2325 	av->av_flow		= path->flow_label;
2326 	av->av_tclass		= path->tclass;
2327 	av->av_hop		= path->hop_limit;
2328 	/* this is for alignment issue w/ the addr path struct in Hermon */
2329 	bcopy(&(path->rgid_h), &(av->av_dgid.gid_prefix), sizeof (uint64_t));
2330 	bcopy(&(path->rgid_l), &(av->av_dgid.gid_guid), sizeof (uint64_t));
2331 }
2332 
2333 
2334 /*
2335  * hermon_portnum_is_valid()
2336  *    Context: Can be called from interrupt or base context.
2337  */
2338 int
2339 hermon_portnum_is_valid(hermon_state_t *state, uint_t portnum)
2340 {
2341 	uint_t	max_port;
2342 
2343 	max_port = state->hs_cfg_profile->cp_num_ports;
2344 	if ((portnum <= max_port) && (portnum != 0)) {
2345 		return (1);
2346 	} else {
2347 		return (0);
2348 	}
2349 }
2350 
2351 
2352 /*
2353  * hermon_pkeyindex_is_valid()
2354  *    Context: Can be called from interrupt or base context.
2355  */
2356 int
2357 hermon_pkeyindex_is_valid(hermon_state_t *state, uint_t pkeyindx)
2358 {
2359 	uint_t	max_pkeyindx;
2360 
2361 	max_pkeyindx = 1 << state->hs_cfg_profile->cp_log_max_pkeytbl;
2362 	if (pkeyindx < max_pkeyindx) {
2363 		return (1);
2364 	} else {
2365 		return (0);
2366 	}
2367 }
2368 
2369 
2370 /*
2371  * hermon_queue_alloc()
2372  *    Context: Can be called from interrupt or base context.
2373  */
2374 int
2375 hermon_queue_alloc(hermon_state_t *state, hermon_qalloc_info_t *qa_info,
2376     uint_t sleepflag)
2377 {
2378 	ddi_dma_attr_t		dma_attr;
2379 	int			(*callback)(caddr_t);
2380 	uint64_t		realsize, alloc_mask;
2381 	uint_t			type;
2382 	int			flag, status;
2383 
2384 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qa_info))
2385 
2386 	/* Set the callback flag appropriately */
2387 	callback = (sleepflag == HERMON_SLEEP) ? DDI_DMA_SLEEP :
2388 	    DDI_DMA_DONTWAIT;
2389 
2390 	/*
2391 	 * Initialize many of the default DMA attributes.  Then set additional
2392 	 * alignment restrictions as necessary for the queue memory.  Also
2393 	 * respect the configured value for IOMMU bypass
2394 	 */
2395 	hermon_dma_attr_init(state, &dma_attr);
2396 	dma_attr.dma_attr_align = qa_info->qa_bind_align;
2397 	type = state->hs_cfg_profile->cp_iommu_bypass;
2398 	if (type == HERMON_BINDMEM_BYPASS) {
2399 		dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
2400 	}
2401 
2402 	/* Allocate a DMA handle */
2403 	status = ddi_dma_alloc_handle(state->hs_dip, &dma_attr, callback, NULL,
2404 	    &qa_info->qa_dmahdl);
2405 	if (status != DDI_SUCCESS) {
2406 		return (DDI_FAILURE);
2407 	}
2408 
2409 	/*
2410 	 * Determine the amount of memory to allocate, depending on the values
2411 	 * in "qa_bind_align" and "qa_alloc_align".  The problem we are trying
2412 	 * to solve here is that allocating a DMA handle with IOMMU bypass
2413 	 * (DDI_DMA_FORCE_PHYSICAL) constrains us to only requesting alignments
2414 	 * that are less restrictive than the page size.  Since we may need
2415 	 * stricter alignments on the memory allocated by ddi_dma_mem_alloc()
2416 	 * (e.g. in Hermon QP work queue memory allocation), we use the
2417 	 * following method to calculate how much additional memory to request,
2418 	 * and we enforce our own alignment on the allocated result.
2419 	 */
2420 	alloc_mask = qa_info->qa_alloc_align - 1;
2421 	if (qa_info->qa_bind_align == qa_info->qa_alloc_align) {
2422 		realsize = qa_info->qa_size;
2423 	} else {
2424 		realsize = qa_info->qa_size + alloc_mask;
2425 	}
2426 
2427 	/*
2428 	 * If we are to allocate the queue from system memory, then use
2429 	 * ddi_dma_mem_alloc() to find the space.  Otherwise, this is a
2430 	 * host memory allocation, use ddi_umem_alloc(). In either case,
2431 	 * return a pointer to the memory range allocated (including any
2432 	 * necessary alignment adjustments), the "real" memory pointer,
2433 	 * the "real" size, and a ddi_acc_handle_t to use when reading
2434 	 * from/writing to the memory.
2435 	 */
2436 	if (qa_info->qa_location == HERMON_QUEUE_LOCATION_NORMAL) {
2437 		/* Allocate system memory for the queue */
2438 		status = ddi_dma_mem_alloc(qa_info->qa_dmahdl, realsize,
2439 		    &state->hs_reg_accattr, DDI_DMA_CONSISTENT, callback, NULL,
2440 		    (caddr_t *)&qa_info->qa_buf_real,
2441 		    (size_t *)&qa_info->qa_buf_realsz, &qa_info->qa_acchdl);
2442 		if (status != DDI_SUCCESS) {
2443 			ddi_dma_free_handle(&qa_info->qa_dmahdl);
2444 			return (DDI_FAILURE);
2445 		}
2446 
2447 		/*
2448 		 * Save temporary copy of the real pointer.  (This may be
2449 		 * modified in the last step below).
2450 		 */
2451 		qa_info->qa_buf_aligned = qa_info->qa_buf_real;
2452 
2453 		bzero(qa_info->qa_buf_real, qa_info->qa_buf_realsz);
2454 
2455 	} else { /* HERMON_QUEUE_LOCATION_USERLAND */
2456 
2457 		/* Allocate userland mappable memory for the queue */
2458 		flag = (sleepflag == HERMON_SLEEP) ? DDI_UMEM_SLEEP :
2459 		    DDI_UMEM_NOSLEEP;
2460 		qa_info->qa_buf_real = ddi_umem_alloc(realsize, flag,
2461 		    &qa_info->qa_umemcookie);
2462 		if (qa_info->qa_buf_real == NULL) {
2463 			ddi_dma_free_handle(&qa_info->qa_dmahdl);
2464 			return (DDI_FAILURE);
2465 		}
2466 
2467 		/*
2468 		 * Save temporary copy of the real pointer.  (This may be
2469 		 * modified in the last step below).
2470 		 */
2471 		qa_info->qa_buf_aligned = qa_info->qa_buf_real;
2472 
2473 	}
2474 
2475 	/*
2476 	 * The next to last step is to ensure that the final address
2477 	 * ("qa_buf_aligned") has the appropriate "alloc" alignment
2478 	 * restriction applied to it (if necessary).
2479 	 */
2480 	if (qa_info->qa_bind_align != qa_info->qa_alloc_align) {
2481 		qa_info->qa_buf_aligned = (uint32_t *)(uintptr_t)(((uintptr_t)
2482 		    qa_info->qa_buf_aligned + alloc_mask) & ~alloc_mask);
2483 	}
2484 	/*
2485 	 * The last step is to figure out the offset of the start relative
2486 	 * to the first page of the region - will be used in the eqc/cqc
2487 	 * passed to the HW
2488 	 */
2489 	qa_info->qa_pgoffs = (uint_t)((uintptr_t)
2490 	    qa_info->qa_buf_aligned & HERMON_PAGEMASK);
2491 
2492 	return (DDI_SUCCESS);
2493 }
2494 
2495 
2496 /*
2497  * hermon_queue_free()
2498  *    Context: Can be called from interrupt or base context.
2499  */
2500 void
2501 hermon_queue_free(hermon_qalloc_info_t *qa_info)
2502 {
2503 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qa_info))
2504 
2505 	/*
2506 	 * Depending on how (i.e. from where) we allocated the memory for
2507 	 * this queue, we choose the appropriate method for releasing the
2508 	 * resources.
2509 	 */
2510 	if (qa_info->qa_location == HERMON_QUEUE_LOCATION_NORMAL) {
2511 
2512 		ddi_dma_mem_free(&qa_info->qa_acchdl);
2513 
2514 	} else if (qa_info->qa_location == HERMON_QUEUE_LOCATION_USERLAND) {
2515 
2516 		ddi_umem_free(qa_info->qa_umemcookie);
2517 
2518 	}
2519 
2520 	/* Always free the dma handle */
2521 	ddi_dma_free_handle(&qa_info->qa_dmahdl);
2522 }
2523 
2524 /*
2525  * hermon_destroy_fmr_pool()
2526  * Create a pool of FMRs.
2527  *     Context: Can be called from kernel context only.
2528  */
2529 int
2530 hermon_create_fmr_pool(hermon_state_t *state, hermon_pdhdl_t pd,
2531     ibt_fmr_pool_attr_t *fmr_attr, hermon_fmrhdl_t *fmrpoolp)
2532 {
2533 	hermon_fmrhdl_t	fmrpool;
2534 	hermon_fmr_list_t *fmr, *fmr_next;
2535 	hermon_mrhdl_t   mr;
2536 	char		taskqname[48];
2537 	int		status;
2538 	int		sleep;
2539 	int		i;
2540 
2541 	sleep = (fmr_attr->fmr_flags & IBT_MR_SLEEP) ? HERMON_SLEEP :
2542 	    HERMON_NOSLEEP;
2543 	if ((sleep == HERMON_SLEEP) &&
2544 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
2545 		return (IBT_INVALID_PARAM);
2546 	}
2547 
2548 	fmrpool = (hermon_fmrhdl_t)kmem_zalloc(sizeof (*fmrpool), sleep);
2549 	if (fmrpool == NULL) {
2550 		status = IBT_INSUFF_RESOURCE;
2551 		goto fail;
2552 	}
2553 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmrpool))
2554 
2555 	mutex_init(&fmrpool->fmr_lock, NULL, MUTEX_DRIVER,
2556 	    DDI_INTR_PRI(state->hs_intrmsi_pri));
2557 
2558 	fmrpool->fmr_state	    = state;
2559 	fmrpool->fmr_flush_function = fmr_attr->fmr_func_hdlr;
2560 	fmrpool->fmr_flush_arg	    = fmr_attr->fmr_func_arg;
2561 	fmrpool->fmr_pool_size	    = 0;
2562 	fmrpool->fmr_cache	    = 0;
2563 	fmrpool->fmr_max_pages	    = fmr_attr->fmr_max_pages_per_fmr;
2564 	fmrpool->fmr_page_sz	    = fmr_attr->fmr_page_sz;
2565 	fmrpool->fmr_dirty_watermark = fmr_attr->fmr_dirty_watermark;
2566 	fmrpool->fmr_dirty_len	    = 0;
2567 	fmrpool->fmr_flags	    = fmr_attr->fmr_flags;
2568 
2569 	/* Create taskq to handle cleanup and flush processing */
2570 	(void) snprintf(taskqname, 50, "fmrpool/%d/%d @ 0x%" PRIx64,
2571 	    fmr_attr->fmr_pool_size, hermon_debug_fmrpool_cnt,
2572 	    (uint64_t)(uintptr_t)fmrpool);
2573 	fmrpool->fmr_taskq = ddi_taskq_create(state->hs_dip, taskqname,
2574 	    HERMON_TASKQ_NTHREADS, TASKQ_DEFAULTPRI, 0);
2575 	if (fmrpool->fmr_taskq == NULL) {
2576 		status = IBT_INSUFF_RESOURCE;
2577 		goto fail1;
2578 	}
2579 
2580 	fmrpool->fmr_free_list = NULL;
2581 	fmrpool->fmr_dirty_list = NULL;
2582 
2583 	if (fmr_attr->fmr_cache) {
2584 		hermon_fmr_cache_init(fmrpool);
2585 	}
2586 
2587 	for (i = 0; i < fmr_attr->fmr_pool_size; i++) {
2588 		status = hermon_mr_alloc_fmr(state, pd, fmrpool, &mr);
2589 		if (status != DDI_SUCCESS) {
2590 			goto fail2;
2591 		}
2592 
2593 		fmr = (hermon_fmr_list_t *)kmem_zalloc(
2594 		    sizeof (hermon_fmr_list_t), sleep);
2595 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr))
2596 
2597 		fmr->fmr = mr;
2598 		fmr->fmr_refcnt = 0;
2599 		fmr->fmr_remaps = 0;
2600 		fmr->fmr_pool = fmrpool;
2601 		fmr->fmr_in_cache = 0;
2602 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
2603 		mr->mr_fmr = fmr;
2604 
2605 		fmr->fmr_next = fmrpool->fmr_free_list;
2606 		fmrpool->fmr_free_list = fmr;
2607 		fmrpool->fmr_pool_size++;
2608 	}
2609 
2610 	/* Set to return pool */
2611 	*fmrpoolp = fmrpool;
2612 
2613 	return (IBT_SUCCESS);
2614 fail2:
2615 	hermon_fmr_cache_fini(fmrpool);
2616 	for (fmr = fmrpool->fmr_free_list; fmr != NULL; fmr = fmr_next) {
2617 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr))
2618 		fmr_next = fmr->fmr_next;
2619 		(void) hermon_mr_dealloc_fmr(state, &fmr->fmr);
2620 		kmem_free(fmr, sizeof (hermon_fmr_list_t));
2621 	}
2622 	ddi_taskq_destroy(fmrpool->fmr_taskq);
2623 fail1:
2624 	kmem_free(fmrpool, sizeof (*fmrpool));
2625 fail:
2626 	if (status == DDI_FAILURE) {
2627 		return (ibc_get_ci_failure(0));
2628 	} else {
2629 		return (status);
2630 	}
2631 }
2632 
2633 /*
2634  * hermon_destroy_fmr_pool()
2635  * Destroy an FMR pool and free all associated resources.
2636  *     Context: Can be called from kernel context only.
2637  */
2638 int
2639 hermon_destroy_fmr_pool(hermon_state_t *state, hermon_fmrhdl_t fmrpool)
2640 {
2641 	hermon_fmr_list_t	*fmr, *fmr_next;
2642 	int			status;
2643 
2644 	mutex_enter(&fmrpool->fmr_lock);
2645 	status = hermon_fmr_cleanup(state, fmrpool);
2646 	if (status != DDI_SUCCESS) {
2647 		mutex_exit(&fmrpool->fmr_lock);
2648 		return (status);
2649 	}
2650 
2651 	if (fmrpool->fmr_cache) {
2652 		hermon_fmr_cache_fini(fmrpool);
2653 	}
2654 
2655 	for (fmr = fmrpool->fmr_free_list; fmr != NULL; fmr = fmr_next) {
2656 		fmr_next = fmr->fmr_next;
2657 
2658 		(void) hermon_mr_dealloc_fmr(state, &fmr->fmr);
2659 		kmem_free(fmr, sizeof (hermon_fmr_list_t));
2660 	}
2661 	mutex_exit(&fmrpool->fmr_lock);
2662 
2663 	ddi_taskq_destroy(fmrpool->fmr_taskq);
2664 	mutex_destroy(&fmrpool->fmr_lock);
2665 
2666 	kmem_free(fmrpool, sizeof (*fmrpool));
2667 	return (DDI_SUCCESS);
2668 }
2669 
2670 /*
2671  * hermon_flush_fmr_pool()
2672  * Ensure that all unmapped FMRs are fully invalidated.
2673  *     Context: Can be called from kernel context only.
2674  */
2675 int
2676 hermon_flush_fmr_pool(hermon_state_t *state, hermon_fmrhdl_t fmrpool)
2677 {
2678 	int		status;
2679 
2680 	/*
2681 	 * Force the unmapping of all entries on the dirty list, regardless of
2682 	 * whether the watermark has been hit yet.
2683 	 */
2684 	/* grab the pool lock */
2685 	mutex_enter(&fmrpool->fmr_lock);
2686 	status = hermon_fmr_cleanup(state, fmrpool);
2687 	mutex_exit(&fmrpool->fmr_lock);
2688 	return (status);
2689 }
2690 
2691 /*
2692  * hermon_deregister_fmr()
2693  * Map memory into FMR
2694  *    Context: Can be called from interrupt or base context.
2695  */
2696 int
2697 hermon_register_physical_fmr(hermon_state_t *state, hermon_fmrhdl_t fmrpool,
2698     ibt_pmr_attr_t *mem_pattr, hermon_mrhdl_t *mr,
2699     ibt_pmr_desc_t *mem_desc_p)
2700 {
2701 	hermon_fmr_list_t	*fmr;
2702 	hermon_fmr_list_t	query;
2703 	avl_index_t		where;
2704 	int			status;
2705 
2706 	/* Check length */
2707 	mutex_enter(&fmrpool->fmr_lock);
2708 	if (mem_pattr->pmr_len < 1 || (mem_pattr->pmr_num_buf >
2709 	    fmrpool->fmr_max_pages)) {
2710 		mutex_exit(&fmrpool->fmr_lock);
2711 		return (IBT_MR_LEN_INVALID);
2712 	}
2713 
2714 	mutex_enter(&fmrpool->fmr_cachelock);
2715 	/* lookup in fmr cache */
2716 	/* if exists, grab it, and return it */
2717 	if (fmrpool->fmr_cache) {
2718 		query.fmr_desc.pmd_iova = mem_pattr->pmr_iova;
2719 		query.fmr_desc.pmd_phys_buf_list_sz = mem_pattr->pmr_len;
2720 		fmr = (hermon_fmr_list_t *)avl_find(&fmrpool->fmr_cache_avl,
2721 		    &query, &where);
2722 
2723 		/*
2724 		 * If valid FMR was found in cache, return that fmr info
2725 		 */
2726 		if (fmr != NULL) {
2727 			fmr->fmr_refcnt++;
2728 			/* Store pmr desc for use in cache */
2729 			(void) memcpy(mem_desc_p, &fmr->fmr_desc,
2730 			    sizeof (ibt_pmr_desc_t));
2731 			*mr = (hermon_mrhdl_t)fmr->fmr;
2732 			mutex_exit(&fmrpool->fmr_cachelock);
2733 			mutex_exit(&fmrpool->fmr_lock);
2734 			return (DDI_SUCCESS);
2735 		}
2736 	}
2737 
2738 	/* FMR does not exist in cache, proceed with registration */
2739 
2740 	/* grab next free entry */
2741 	fmr = fmrpool->fmr_free_list;
2742 	if (fmr == NULL) {
2743 		mutex_exit(&fmrpool->fmr_cachelock);
2744 		mutex_exit(&fmrpool->fmr_lock);
2745 		return (IBT_INSUFF_RESOURCE);
2746 	}
2747 
2748 	fmrpool->fmr_free_list = fmrpool->fmr_free_list->fmr_next;
2749 	fmr->fmr_next = NULL;
2750 
2751 	status = hermon_mr_register_physical_fmr(state, mem_pattr, fmr->fmr,
2752 	    mem_desc_p);
2753 	if (status != DDI_SUCCESS) {
2754 		mutex_exit(&fmrpool->fmr_cachelock);
2755 		mutex_exit(&fmrpool->fmr_lock);
2756 		return (status);
2757 	}
2758 
2759 	fmr->fmr_refcnt = 1;
2760 	fmr->fmr_remaps++;
2761 
2762 	/* Store pmr desc for use in cache */
2763 	(void) memcpy(&fmr->fmr_desc, mem_desc_p, sizeof (ibt_pmr_desc_t));
2764 	*mr = (hermon_mrhdl_t)fmr->fmr;
2765 
2766 	/* Store in cache */
2767 	if (fmrpool->fmr_cache) {
2768 		if (!fmr->fmr_in_cache) {
2769 			avl_insert(&fmrpool->fmr_cache_avl, fmr, where);
2770 			fmr->fmr_in_cache = 1;
2771 		}
2772 	}
2773 
2774 	mutex_exit(&fmrpool->fmr_cachelock);
2775 	mutex_exit(&fmrpool->fmr_lock);
2776 	return (DDI_SUCCESS);
2777 }
2778 
2779 /*
2780  * hermon_deregister_fmr()
2781  * Unmap FMR
2782  *    Context: Can be called from kernel context only.
2783  */
2784 int
2785 hermon_deregister_fmr(hermon_state_t *state, hermon_mrhdl_t mr)
2786 {
2787 	hermon_fmr_list_t	*fmr;
2788 	hermon_fmrhdl_t		fmrpool;
2789 	int			status;
2790 
2791 	fmr = mr->mr_fmr;
2792 	fmrpool = fmr->fmr_pool;
2793 
2794 	/* Grab pool lock */
2795 	mutex_enter(&fmrpool->fmr_lock);
2796 	fmr->fmr_refcnt--;
2797 
2798 	if (fmr->fmr_refcnt == 0) {
2799 		/*
2800 		 * First, do some bit of invalidation, reducing our exposure to
2801 		 * having this region still registered in hardware.
2802 		 */
2803 		(void) hermon_mr_invalidate_fmr(state, mr);
2804 
2805 		/*
2806 		 * If we've exhausted our remaps then add the FMR to the dirty
2807 		 * list, not allowing it to be re-used until we have done a
2808 		 * flush.  Otherwise, simply add it back to the free list for
2809 		 * re-mapping.
2810 		 */
2811 		if (fmr->fmr_remaps <
2812 		    state->hs_cfg_profile->cp_fmr_max_remaps) {
2813 			/* add to free list */
2814 			fmr->fmr_next = fmrpool->fmr_free_list;
2815 			fmrpool->fmr_free_list = fmr;
2816 		} else {
2817 			/* add to dirty list */
2818 			fmr->fmr_next = fmrpool->fmr_dirty_list;
2819 			fmrpool->fmr_dirty_list = fmr;
2820 			fmrpool->fmr_dirty_len++;
2821 
2822 			status = ddi_taskq_dispatch(fmrpool->fmr_taskq,
2823 			    hermon_fmr_processing, fmrpool, DDI_NOSLEEP);
2824 			if (status == DDI_FAILURE) {
2825 				mutex_exit(&fmrpool->fmr_lock);
2826 				return (IBT_INSUFF_RESOURCE);
2827 			}
2828 		}
2829 	}
2830 	/* Release pool lock */
2831 	mutex_exit(&fmrpool->fmr_lock);
2832 
2833 	return (DDI_SUCCESS);
2834 }
2835 
2836 
2837 /*
2838  * hermon_fmr_processing()
2839  * If required, perform cleanup.
2840  *     Context: Called from taskq context only.
2841  */
2842 static void
2843 hermon_fmr_processing(void *fmr_args)
2844 {
2845 	hermon_fmrhdl_t		fmrpool;
2846 	int			status;
2847 
2848 	ASSERT(fmr_args != NULL);
2849 
2850 	fmrpool = (hermon_fmrhdl_t)fmr_args;
2851 
2852 	/* grab pool lock */
2853 	mutex_enter(&fmrpool->fmr_lock);
2854 	if (fmrpool->fmr_dirty_len >= fmrpool->fmr_dirty_watermark) {
2855 		status = hermon_fmr_cleanup(fmrpool->fmr_state, fmrpool);
2856 		if (status != DDI_SUCCESS) {
2857 			mutex_exit(&fmrpool->fmr_lock);
2858 			return;
2859 		}
2860 
2861 		if (fmrpool->fmr_flush_function != NULL) {
2862 			(void) fmrpool->fmr_flush_function(
2863 			    (ibc_fmr_pool_hdl_t)fmrpool,
2864 			    fmrpool->fmr_flush_arg);
2865 		}
2866 	}
2867 
2868 	/* let pool lock go */
2869 	mutex_exit(&fmrpool->fmr_lock);
2870 }
2871 
2872 /*
2873  * hermon_fmr_cleanup()
2874  * Perform cleaning processing, walking the list and performing the MTT sync
2875  * operation if required.
2876  *    Context: can be called from taskq or base context.
2877  */
2878 static int
2879 hermon_fmr_cleanup(hermon_state_t *state, hermon_fmrhdl_t fmrpool)
2880 {
2881 	hermon_fmr_list_t	*fmr;
2882 	hermon_fmr_list_t	*fmr_next;
2883 	int			sync_needed;
2884 	int			status;
2885 
2886 	ASSERT(MUTEX_HELD(&fmrpool->fmr_lock));
2887 
2888 	sync_needed = 0;
2889 	for (fmr = fmrpool->fmr_dirty_list; fmr; fmr = fmr_next) {
2890 		fmr_next = fmr->fmr_next;
2891 		fmr->fmr_remaps = 0;
2892 
2893 		(void) hermon_mr_deregister_fmr(state, fmr->fmr);
2894 
2895 		/*
2896 		 * Update lists.
2897 		 * - add fmr back to free list
2898 		 * - remove fmr from dirty list
2899 		 */
2900 		fmr->fmr_next = fmrpool->fmr_free_list;
2901 		fmrpool->fmr_free_list = fmr;
2902 
2903 
2904 		/*
2905 		 * Because we have updated the dirty list, and deregistered the
2906 		 * FMR entry, we do need to sync the TPT, so we set the
2907 		 * 'sync_needed' flag here so we sync once we finish dirty_list
2908 		 * processing.
2909 		 */
2910 		sync_needed = 1;
2911 	}
2912 
2913 	fmrpool->fmr_dirty_list = NULL;
2914 	fmrpool->fmr_dirty_len = 0;
2915 
2916 	if (sync_needed) {
2917 		status = hermon_sync_tpt_cmd_post(state,
2918 		    HERMON_CMD_NOSLEEP_SPIN);
2919 		if (status != HERMON_CMD_SUCCESS) {
2920 			return (status);
2921 		}
2922 	}
2923 
2924 	return (DDI_SUCCESS);
2925 }
2926 
2927 /*
2928  * hermon_fmr_avl_compare()
2929  *    Context: Can be called from user or kernel context.
2930  */
2931 static int
2932 hermon_fmr_avl_compare(const void *q, const void *e)
2933 {
2934 	hermon_fmr_list_t *entry, *query;
2935 
2936 	entry = (hermon_fmr_list_t *)e;
2937 	query = (hermon_fmr_list_t *)q;
2938 
2939 	if (query->fmr_desc.pmd_iova < entry->fmr_desc.pmd_iova) {
2940 		return (-1);
2941 	} else if (query->fmr_desc.pmd_iova > entry->fmr_desc.pmd_iova) {
2942 		return (+1);
2943 	} else {
2944 		return (0);
2945 	}
2946 }
2947 
2948 
2949 /*
2950  * hermon_fmr_cache_init()
2951  *    Context: Can be called from user or kernel context.
2952  */
2953 static void
2954 hermon_fmr_cache_init(hermon_fmrhdl_t fmr)
2955 {
2956 	/* Initialize the lock used for FMR cache AVL tree access */
2957 	mutex_init(&fmr->fmr_cachelock, NULL, MUTEX_DRIVER,
2958 	    DDI_INTR_PRI(fmr->fmr_state->hs_intrmsi_pri));
2959 
2960 	/* Initialize the AVL tree for the FMR cache */
2961 	avl_create(&fmr->fmr_cache_avl, hermon_fmr_avl_compare,
2962 	    sizeof (hermon_fmr_list_t),
2963 	    offsetof(hermon_fmr_list_t, fmr_avlnode));
2964 
2965 	fmr->fmr_cache = 1;
2966 }
2967 
2968 
2969 /*
2970  * hermon_fmr_cache_fini()
2971  *    Context: Can be called from user or kernel context.
2972  */
2973 static void
2974 hermon_fmr_cache_fini(hermon_fmrhdl_t fmr)
2975 {
2976 	void			*cookie;
2977 
2978 	/*
2979 	 * Empty all entries (if necessary) and destroy the AVL tree.
2980 	 * The FMRs themselves are freed as part of destroy_pool()
2981 	 */
2982 	cookie = NULL;
2983 	while (((void *)(hermon_fmr_list_t *)avl_destroy_nodes(
2984 	    &fmr->fmr_cache_avl, &cookie)) != NULL) {
2985 		/* loop through */
2986 	}
2987 	avl_destroy(&fmr->fmr_cache_avl);
2988 
2989 	/* Destroy the lock used for FMR cache */
2990 	mutex_destroy(&fmr->fmr_cachelock);
2991 }
2992 
2993 /*
2994  * hermon_get_dma_cookies()
2995  * Return DMA cookies in the pre-allocated paddr_list_p based on the length
2996  * needed.
2997  *    Context: Can be called from interrupt or base context.
2998  */
2999 int
3000 hermon_get_dma_cookies(hermon_state_t *state, ibt_phys_buf_t *paddr_list_p,
3001     ibt_va_attr_t *va_attrs, uint_t list_len, uint_t *cookiecnt,
3002     ibc_ma_hdl_t *ibc_ma_hdl_p)
3003 {
3004 	ddi_dma_handle_t	dma_hdl;
3005 	ddi_dma_attr_t		dma_attr;
3006 	ddi_dma_cookie_t	dmacookie;
3007 	int			(*callback)(caddr_t);
3008 	int			status;
3009 	int			i;
3010 
3011 	/* Set the callback flag appropriately */
3012 	callback = (va_attrs->va_flags & IBT_VA_NOSLEEP) ? DDI_DMA_DONTWAIT :
3013 	    DDI_DMA_SLEEP;
3014 	if ((callback == DDI_DMA_SLEEP) &&
3015 	    (HERMON_SLEEP != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
3016 		return (IBT_INVALID_PARAM);
3017 	}
3018 
3019 	/*
3020 	 * Initialize many of the default DMA attributes and allocate the DMA
3021 	 * handle.  Then, if we're bypassing the IOMMU, set the
3022 	 * DDI_DMA_FORCE_PHYSICAL flag.
3023 	 */
3024 	hermon_dma_attr_init(state, &dma_attr);
3025 
3026 #ifdef __x86
3027 	/*
3028 	 * On x86 we can specify a maximum segment length for our returned
3029 	 * cookies.
3030 	 */
3031 	if (va_attrs->va_flags & IBT_VA_FMR) {
3032 		dma_attr.dma_attr_seg = PAGESIZE - 1;
3033 	}
3034 #endif
3035 
3036 	/*
3037 	 * Check to see if the RO flag is set, and if so,
3038 	 * set that bit in the attr structure as well.
3039 	 *
3040 	 * NOTE 1:  This function is ONLY called by consumers, and only for
3041 	 *	    data buffers
3042 	 */
3043 	if (hermon_kernel_data_ro == HERMON_RO_ENABLED) {
3044 		dma_attr.dma_attr_flags |= DDI_DMA_RELAXED_ORDERING;
3045 	}
3046 
3047 	status = ddi_dma_alloc_handle(state->hs_dip, &dma_attr,
3048 	    callback, NULL, &dma_hdl);
3049 	if (status != DDI_SUCCESS) {
3050 		switch (status) {
3051 		case DDI_DMA_NORESOURCES:
3052 			return (IBT_INSUFF_RESOURCE);
3053 		case DDI_DMA_BADATTR:
3054 		default:
3055 			return (ibc_get_ci_failure(0));
3056 		}
3057 	}
3058 
3059 	/*
3060 	 * Now bind the handle with the correct DMA attributes.
3061 	 */
3062 	if (va_attrs->va_flags & IBT_VA_BUF) {
3063 		status = ddi_dma_buf_bind_handle(dma_hdl, va_attrs->va_buf,
3064 		    DDI_DMA_RDWR | DDI_DMA_CONSISTENT, DDI_DMA_DONTWAIT,
3065 		    NULL, &dmacookie, cookiecnt);
3066 	} else {
3067 		status = ddi_dma_addr_bind_handle(dma_hdl, NULL,
3068 		    (caddr_t)(uintptr_t)va_attrs->va_vaddr, va_attrs->va_len,
3069 		    DDI_DMA_RDWR | DDI_DMA_CONSISTENT, DDI_DMA_DONTWAIT,
3070 		    NULL, &dmacookie, cookiecnt);
3071 	}
3072 	if (status != DDI_SUCCESS) {
3073 		ddi_dma_free_handle(&dma_hdl);
3074 
3075 		switch (status) {
3076 		case DDI_DMA_NORESOURCES:
3077 			return (IBT_INSUFF_RESOURCE);
3078 		case DDI_DMA_TOOBIG:
3079 			return (IBT_INVALID_PARAM);
3080 		case DDI_DMA_PARTIAL_MAP:
3081 		case DDI_DMA_INUSE:
3082 		case DDI_DMA_NOMAPPING:
3083 		default:
3084 			return (ibc_get_ci_failure(0));
3085 		}
3086 	}
3087 
3088 	/*
3089 	 * Verify our physical buffer list (PBL) is large enough to handle the
3090 	 * number of cookies that were returned.
3091 	 */
3092 	if (*cookiecnt > list_len) {
3093 		(void) ddi_dma_unbind_handle(dma_hdl);
3094 		ddi_dma_free_handle(&dma_hdl);
3095 		return (IBT_PBL_TOO_SMALL);
3096 	}
3097 
3098 	/*
3099 	 * We store the cookies returned by the DDI into our own PBL.  This
3100 	 * sets the cookies up for later processing (for example, if we want to
3101 	 * split up the cookies into smaller chunks).  We use the laddr and
3102 	 * size fields in each cookie to create each individual entry (PBE).
3103 	 */
3104 
3105 	/*
3106 	 * Store first cookie info first
3107 	 */
3108 	paddr_list_p[0].p_laddr = dmacookie.dmac_laddress;
3109 	paddr_list_p[0].p_size = dmacookie.dmac_size;
3110 
3111 	/*
3112 	 * Loop through each cookie, storing each cookie into our physical
3113 	 * buffer list.
3114 	 */
3115 	for (i = 1; i < *cookiecnt; i++) {
3116 		ddi_dma_nextcookie(dma_hdl, &dmacookie);
3117 
3118 		paddr_list_p[i].p_laddr = dmacookie.dmac_laddress;
3119 		paddr_list_p[i].p_size  = dmacookie.dmac_size;
3120 	}
3121 
3122 	/* return handle */
3123 	*ibc_ma_hdl_p = (ibc_ma_hdl_t)dma_hdl;
3124 	return (DDI_SUCCESS);
3125 }
3126 
3127 /*
3128  * hermon_split_dma_cookies()
3129  * Split up cookies passed in from paddr_list_p, returning the new list in the
3130  * same buffers, based on the pagesize to split the cookies into.
3131  *    Context: Can be called from interrupt or base context.
3132  */
3133 /* ARGSUSED */
3134 int
3135 hermon_split_dma_cookies(hermon_state_t *state, ibt_phys_buf_t *paddr_list,
3136     ib_memlen_t *paddr_offset, uint_t list_len, uint_t *cookiecnt,
3137     uint_t pagesize)
3138 {
3139 	uint64_t	pageoffset;
3140 	uint64_t	pagemask;
3141 	uint_t		pageshift;
3142 	uint_t		current_cookiecnt;
3143 	uint_t		cookies_needed;
3144 	uint64_t	last_size, extra_cookie;
3145 	int		i_increment;
3146 	int		i, k;
3147 	int		status;
3148 
3149 	/* Setup pagesize calculations */
3150 	pageoffset = pagesize - 1;
3151 	pagemask = (~pageoffset);
3152 	pageshift = highbit(pagesize) - 1;
3153 
3154 	/*
3155 	 * Setup first cookie offset based on pagesize requested.
3156 	 */
3157 	*paddr_offset = paddr_list[0].p_laddr & pageoffset;
3158 	paddr_list[0].p_laddr &= pagemask;
3159 
3160 	/* Save away the current number of cookies that are passed in */
3161 	current_cookiecnt = *cookiecnt;
3162 
3163 	/* Perform splitting up of current cookies into pagesize blocks */
3164 	for (i = 0; i < current_cookiecnt; i += i_increment) {
3165 		/*
3166 		 * If the cookie is smaller than pagesize, or already is
3167 		 * pagesize, then we are already within our limits, so we skip
3168 		 * it.
3169 		 */
3170 		if (paddr_list[i].p_size <= pagesize) {
3171 			i_increment = 1;
3172 			continue;
3173 		}
3174 
3175 		/*
3176 		 * If this is our first cookie, then we have to deal with the
3177 		 * offset that may be present in the first address.  So add
3178 		 * that to our size, to calculate potential change to the last
3179 		 * cookie's size.
3180 		 *
3181 		 * Also, calculate the number of cookies that we'll need to
3182 		 * split up this block into.
3183 		 */
3184 		if (i == 0) {
3185 			last_size = (paddr_list[i].p_size + *paddr_offset) &
3186 			    pageoffset;
3187 			cookies_needed = (paddr_list[i].p_size +
3188 			    *paddr_offset) >> pageshift;
3189 		} else {
3190 			last_size = 0;
3191 			cookies_needed = paddr_list[i].p_size >> pageshift;
3192 		}
3193 
3194 		/*
3195 		 * If our size is not a multiple of pagesize, we need one more
3196 		 * cookie.
3197 		 */
3198 		if (last_size) {
3199 			extra_cookie = 1;
3200 		} else {
3201 			extra_cookie = 0;
3202 		}
3203 
3204 		/*
3205 		 * Split cookie into pagesize chunks, shifting list of cookies
3206 		 * down, using more cookie slots in the PBL if necessary.
3207 		 */
3208 		status = hermon_dma_cookie_shift(paddr_list, i, list_len,
3209 		    current_cookiecnt - i, cookies_needed + extra_cookie);
3210 		if (status != 0) {
3211 			return (status);
3212 		}
3213 
3214 		/*
3215 		 * If the very first cookie, we must take possible offset into
3216 		 * account.
3217 		 */
3218 		if (i == 0) {
3219 			paddr_list[i].p_size = pagesize - *paddr_offset;
3220 		} else {
3221 			paddr_list[i].p_size = pagesize;
3222 		}
3223 
3224 		/*
3225 		 * We have shifted the existing cookies down the PBL, now fill
3226 		 * in the blank entries by splitting up our current block.
3227 		 */
3228 		for (k = 1; k < cookies_needed; k++) {
3229 			paddr_list[i + k].p_laddr =
3230 			    paddr_list[i + k - 1].p_laddr + pagesize;
3231 			paddr_list[i + k].p_size = pagesize;
3232 		}
3233 
3234 		/* If we have one extra cookie (of less than pagesize...) */
3235 		if (extra_cookie) {
3236 			paddr_list[i + k].p_laddr =
3237 			    paddr_list[i + k - 1].p_laddr + pagesize;
3238 			paddr_list[i + k].p_size = (size_t)last_size;
3239 		}
3240 
3241 		/* Increment cookiecnt appropriately based on cookies used */
3242 		i_increment = cookies_needed + extra_cookie;
3243 		current_cookiecnt += i_increment - 1;
3244 	}
3245 
3246 	/* Update to new cookie count */
3247 	*cookiecnt = current_cookiecnt;
3248 	return (DDI_SUCCESS);
3249 }
3250 
3251 /*
3252  * hermon_dma_cookie_shift()
3253  *    Context: Can be called from interrupt or base context.
3254  */
3255 int
3256 hermon_dma_cookie_shift(ibt_phys_buf_t *paddr_list, int start, int end,
3257     int cookiecnt, int num_shift)
3258 {
3259 	int shift_start;
3260 	int i;
3261 
3262 	/* Calculating starting point in the PBL list */
3263 	shift_start = start + cookiecnt - 1;
3264 
3265 	/* Check if we're at the end of our PBL list */
3266 	if ((shift_start + num_shift - 1) >= end) {
3267 		return (IBT_PBL_TOO_SMALL);
3268 	}
3269 
3270 	for (i = shift_start; i > start; i--) {
3271 		paddr_list[i + num_shift - 1] = paddr_list[i];
3272 	}
3273 
3274 	return (DDI_SUCCESS);
3275 }
3276 
3277 
3278 /*
3279  * hermon_free_dma_cookies()
3280  *    Context: Can be called from interrupt or base context.
3281  */
3282 int
3283 hermon_free_dma_cookies(ibc_ma_hdl_t ma_hdl)
3284 {
3285 	ddi_dma_handle_t	dma_hdl;
3286 	int			status;
3287 
3288 	dma_hdl = (ddi_dma_handle_t)ma_hdl;
3289 
3290 	status = ddi_dma_unbind_handle(dma_hdl);
3291 	if (status != DDI_SUCCESS) {
3292 		return (ibc_get_ci_failure(0));
3293 	}
3294 	ddi_dma_free_handle(&dma_hdl);
3295 
3296 	return (DDI_SUCCESS);
3297 }
3298