xref: /illumos-gate/usr/src/uts/common/io/ib/adapters/hermon/hermon_misc.c (revision c7facc54c4abed9e554ff80225311e6b7048d3c9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * hermon_misc.c
29  *    Hermon Miscellaneous routines - Address Handle, Multicast, Protection
30  *    Domain, and port-related operations
31  *
32  *    Implements all the routines necessary for allocating, freeing, querying
33  *    and modifying Address Handles and Protection Domains.  Also implements
34  *    all the routines necessary for adding and removing Queue Pairs to/from
35  *    Multicast Groups.  Lastly, it implements the routines necessary for
36  *    port-related query and modify operations.
37  */
38 
39 #include <sys/types.h>
40 #include <sys/conf.h>
41 #include <sys/ddi.h>
42 #include <sys/sunddi.h>
43 #include <sys/modctl.h>
44 #include <sys/bitmap.h>
45 #include <sys/sysmacros.h>
46 
47 #include <sys/ib/adapters/hermon/hermon.h>
48 
49 extern uint32_t hermon_kernel_data_ro;
50 extern int hermon_rdma_debug;
51 
52 /* used for helping uniquify fmr pool taskq name */
53 static uint_t hermon_debug_fmrpool_cnt = 0x00000000;
54 
55 static int hermon_mcg_qplist_add(hermon_state_t *state, hermon_mcghdl_t mcg,
56     hermon_hw_mcg_qp_list_t *mcg_qplist, hermon_qphdl_t qp, uint_t *qp_found);
57 static int hermon_mcg_qplist_remove(hermon_mcghdl_t mcg,
58     hermon_hw_mcg_qp_list_t *mcg_qplist, hermon_qphdl_t qp);
59 static void hermon_qp_mcg_refcnt_inc(hermon_qphdl_t qp);
60 static void hermon_qp_mcg_refcnt_dec(hermon_qphdl_t qp);
61 static uint_t hermon_mcg_walk_mgid_hash(hermon_state_t *state,
62     uint64_t start_indx, ib_gid_t mgid, uint_t *prev_indx);
63 static void hermon_mcg_setup_new_hdr(hermon_mcghdl_t mcg,
64     hermon_hw_mcg_t *mcg_hdr, ib_gid_t mgid, hermon_rsrc_t *mcg_rsrc);
65 static int hermon_mcg_hash_list_remove(hermon_state_t *state, uint_t curr_indx,
66     uint_t prev_indx, hermon_hw_mcg_t *mcg_entry);
67 static int hermon_mcg_entry_invalidate(hermon_state_t *state,
68     hermon_hw_mcg_t *mcg_entry, uint_t indx);
69 static int hermon_mgid_is_valid(ib_gid_t gid);
70 static int hermon_mlid_is_valid(ib_lid_t lid);
71 static void hermon_fmr_processing(void *fmr_args);
72 static int hermon_fmr_cleanup(hermon_state_t *state, hermon_fmrhdl_t pool);
73 static void hermon_fmr_cache_init(hermon_fmrhdl_t fmr);
74 static void hermon_fmr_cache_fini(hermon_fmrhdl_t fmr);
75 static int hermon_fmr_avl_compare(const void *q, const void *e);
76 
77 
78 #define	HERMON_MAX_DBR_PAGES_PER_USER	64
79 #define	HERMON_DBR_KEY(index, page) \
80 	(((uint64_t)index) * HERMON_MAX_DBR_PAGES_PER_USER + (page))
81 
82 static hermon_udbr_page_t *
83 hermon_dbr_new_user_page(hermon_state_t *state, uint_t index,
84     uint_t page)
85 {
86 	hermon_udbr_page_t *pagep;
87 	ddi_dma_attr_t dma_attr;
88 	uint_t cookiecnt;
89 	int status;
90 	hermon_umap_db_entry_t *umapdb;
91 
92 	pagep = kmem_alloc(sizeof (*pagep), KM_SLEEP);
93 	pagep->upg_index = page;
94 	pagep->upg_nfree = PAGESIZE / sizeof (hermon_dbr_t);
95 
96 	/* Allocate 1 bit per dbr for free/alloc management (0 => "free") */
97 	pagep->upg_free = kmem_zalloc(PAGESIZE / sizeof (hermon_dbr_t) / 8,
98 	    KM_SLEEP);
99 	pagep->upg_kvaddr = ddi_umem_alloc(PAGESIZE, DDI_UMEM_SLEEP,
100 	    &pagep->upg_umemcookie); /* not HERMON_PAGESIZE here */
101 
102 	pagep->upg_buf = ddi_umem_iosetup(pagep->upg_umemcookie, 0,
103 	    PAGESIZE, B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP);
104 
105 	hermon_dma_attr_init(state, &dma_attr);
106 #ifdef	__sparc
107 	if (state->hs_cfg_profile->cp_iommu_bypass == HERMON_BINDMEM_BYPASS)
108 		dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
109 #endif
110 	status = ddi_dma_alloc_handle(state->hs_dip, &dma_attr,
111 	    DDI_DMA_SLEEP, NULL, &pagep->upg_dmahdl);
112 	if (status != DDI_SUCCESS) {
113 		IBTF_DPRINTF_L2("hermon", "hermon_new_user_page: "
114 		    "ddi_dma_buf_bind_handle failed: %d", status);
115 		return (NULL);
116 	}
117 	status = ddi_dma_buf_bind_handle(pagep->upg_dmahdl,
118 	    pagep->upg_buf, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
119 	    DDI_DMA_SLEEP, NULL, &pagep->upg_dmacookie, &cookiecnt);
120 	if (status != DDI_SUCCESS) {
121 		IBTF_DPRINTF_L2("hermon", "hermon_dbr_new_user_page: "
122 		    "ddi_dma_buf_bind_handle failed: %d", status);
123 		ddi_dma_free_handle(&pagep->upg_dmahdl);
124 		return (NULL);
125 	}
126 	ASSERT(cookiecnt == 1);
127 
128 	/* create db entry for mmap */
129 	umapdb = hermon_umap_db_alloc(state->hs_instance,
130 	    HERMON_DBR_KEY(index, page), MLNX_UMAP_DBRMEM_RSRC,
131 	    (uint64_t)(uintptr_t)pagep);
132 	hermon_umap_db_add(umapdb);
133 	return (pagep);
134 }
135 
136 
/*
 * hermon_user_dbr_alloc()
 *    Allocate one user-mappable doorbell record for the given UAR index.
 *    Finds (or creates) the per-index page list, locates a page with a
 *    free record, claims one bit in that page's free bitmap, and returns
 *    the kernel virtual address (*vdbr), DMA/physical address (*pdbr),
 *    and the mmap offset (*mapoffset) encoding the page's umap key.
 *    "acchdl" is unused for user records (hence ARGSUSED).
 *    Returns DDI_SUCCESS or DDI_FAILURE; hs_dbr_lock serializes access.
 */
/*ARGSUSED*/
static int
hermon_user_dbr_alloc(hermon_state_t *state, uint_t index,
    ddi_acc_handle_t *acchdl, hermon_dbr_t **vdbr, uint64_t *pdbr,
    uint64_t *mapoffset)
{
	hermon_user_dbr_t *udbr;
	hermon_udbr_page_t *pagep;
	uint_t next_page;
	int dbr_index;
	int i1, i2, i3, last;
	uint64_t u64, mask;

	mutex_enter(&state->hs_dbr_lock);
	/* Find the user dbr list for this UAR index; create one if absent */
	for (udbr = state->hs_user_dbr; udbr != NULL; udbr = udbr->udbr_link)
		if (udbr->udbr_index == index)
			break;
	if (udbr == NULL) {
		udbr = kmem_alloc(sizeof (*udbr), KM_SLEEP);
		udbr->udbr_link = state->hs_user_dbr;
		state->hs_user_dbr = udbr;
		udbr->udbr_index = index;
		udbr->udbr_pagep = NULL;
	}
	/*
	 * New pages are pushed at the head of the list, so the head's
	 * index + 1 is the next unused page number should we need to
	 * allocate a new page below.
	 */
	pagep = udbr->udbr_pagep;
	next_page = (pagep == NULL) ? 0 : (pagep->upg_index + 1);
	while (pagep != NULL)
		if (pagep->upg_nfree > 0)
			break;
		else
			pagep = pagep->upg_link;
	if (pagep == NULL) {
		pagep = hermon_dbr_new_user_page(state, index, next_page);
		if (pagep == NULL) {
			mutex_exit(&state->hs_dbr_lock);
			return (DDI_FAILURE);
		}
		pagep->upg_link = udbr->udbr_pagep;
		udbr->udbr_pagep = pagep;
	}

	/* Since nfree > 0, we're assured the loops below will succeed */

	/* First, find a 64-bit (not ~0) that has a free dbr */
	last = PAGESIZE / sizeof (uint64_t) / 64;
	mask = ~0ull;
	for (i1 = 0; i1 < last; i1++)
		if ((pagep->upg_free[i1] & mask) != mask)
			break;
	/* NOTE: relies on the nfree > 0 invariant; i1 < last is not */
	/* re-checked before this read. */
	u64 = pagep->upg_free[i1];

	/* Second, find a byte (not 0xff) that has a free dbr */
	last = sizeof (uint64_t) / sizeof (uint8_t);
	for (i2 = 0, mask = 0xff; i2 < last; i2++, mask <<= 8)
		if ((u64 & mask) != mask)
			break;

	/* Third, find a bit that is free (0) */
	/* (the bound happens to equal 8 bits-per-byte) */
	for (i3 = 0; i3 < sizeof (uint64_t) / sizeof (uint8_t); i3++)
		if ((u64 & (1ul << (i3 + 8 * i2))) == 0)
			break;

	/* Mark it as allocated */
	pagep->upg_free[i1] |= (1ul << (i3 + 8 * i2));

	/* Record index within the page: i1*64 + i2*8 + i3 */
	dbr_index = ((i1 * sizeof (uint64_t)) + i2) * sizeof (uint64_t) + i3;
	pagep->upg_nfree--;
	((uint64_t *)(void *)pagep->upg_kvaddr)[dbr_index] = 0;	/* clear dbr */
	/* Encode the umap key + resource type as the page's mmap offset */
	*mapoffset = ((HERMON_DBR_KEY(index, pagep->upg_index) <<
	    MLNX_UMAP_RSRC_TYPE_SHIFT) | MLNX_UMAP_DBRMEM_RSRC) << PAGESHIFT;
	*vdbr = (hermon_dbr_t *)((uint64_t *)(void *)pagep->upg_kvaddr +
	    dbr_index);
	*pdbr = pagep->upg_dmacookie.dmac_laddress + dbr_index *
	    sizeof (uint64_t);

	mutex_exit(&state->hs_dbr_lock);
	return (DDI_SUCCESS);
}
215 
216 static void
217 hermon_user_dbr_free(hermon_state_t *state, uint_t index, hermon_dbr_t *record)
218 {
219 	hermon_user_dbr_t	*udbr;
220 	hermon_udbr_page_t	*pagep;
221 	caddr_t			kvaddr;
222 	uint_t			dbr_index;
223 	uint_t			max_free = PAGESIZE / sizeof (hermon_dbr_t);
224 	int			i1, i2;
225 
226 	dbr_index = (uintptr_t)record & PAGEOFFSET; /* offset (not yet index) */
227 	kvaddr = (caddr_t)record - dbr_index;
228 	dbr_index /= sizeof (hermon_dbr_t); /* now it's the index */
229 
230 	mutex_enter(&state->hs_dbr_lock);
231 	for (udbr = state->hs_user_dbr; udbr != NULL; udbr = udbr->udbr_link)
232 		if (udbr->udbr_index == index)
233 			break;
234 	if (udbr == NULL) {
235 		IBTF_DPRINTF_L2("hermon", "free user dbr: udbr struct not "
236 		    "found for index %x", index);
237 		mutex_exit(&state->hs_dbr_lock);
238 		return;
239 	}
240 	for (pagep = udbr->udbr_pagep; pagep != NULL; pagep = pagep->upg_link)
241 		if (pagep->upg_kvaddr == kvaddr)
242 			break;
243 	if (pagep == NULL) {
244 		IBTF_DPRINTF_L2("hermon", "free user dbr: pagep struct not"
245 		    " found for index %x, kvaddr %p, DBR index %x",
246 		    index, kvaddr, dbr_index);
247 		mutex_exit(&state->hs_dbr_lock);
248 		return;
249 	}
250 	if (pagep->upg_nfree >= max_free) {
251 		IBTF_DPRINTF_L2("hermon", "free user dbr: overflow: "
252 		    "UCE index %x, DBR index %x", index, dbr_index);
253 		mutex_exit(&state->hs_dbr_lock);
254 		return;
255 	}
256 	ASSERT(dbr_index < max_free);
257 	i1 = dbr_index / 64;
258 	i2 = dbr_index % 64;
259 	ASSERT((pagep->upg_free[i1] & (1ul << i2)) == (1ul << i2));
260 	pagep->upg_free[i1] &= ~(1ul << i2);
261 	pagep->upg_nfree++;
262 	mutex_exit(&state->hs_dbr_lock);
263 }
264 
265 /*
266  * hermon_dbr_page_alloc()
267  *	first page allocation - called from attach or open
268  *	in this case, we want exactly one page per call, and aligned on a
269  *	page - and may need to be mapped to the user for access
270  */
271 int
272 hermon_dbr_page_alloc(hermon_state_t *state, hermon_dbr_info_t **dinfo)
273 {
274 	int			status;
275 	ddi_dma_handle_t	dma_hdl;
276 	ddi_acc_handle_t	acc_hdl;
277 	ddi_dma_attr_t		dma_attr;
278 	ddi_dma_cookie_t	cookie;
279 	uint_t			cookie_cnt;
280 	int			i;
281 	hermon_dbr_info_t 	*info;
282 	caddr_t			dmaaddr;
283 	uint64_t		dmalen;
284 
285 	info = kmem_zalloc(sizeof (hermon_dbr_info_t), KM_SLEEP);
286 
287 	/*
288 	 * Initialize many of the default DMA attributes.  Then set additional
289 	 * alignment restrictions if necessary for the dbr memory, meaning
290 	 * page aligned.  Also use the configured value for IOMMU bypass
291 	 */
292 	hermon_dma_attr_init(state, &dma_attr);
293 	dma_attr.dma_attr_align = PAGESIZE;
294 	dma_attr.dma_attr_sgllen = 1;	/* make sure only one cookie */
295 #ifdef	__sparc
296 	if (state->hs_cfg_profile->cp_iommu_bypass == HERMON_BINDMEM_BYPASS)
297 		dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
298 #endif
299 
300 	status = ddi_dma_alloc_handle(state->hs_dip, &dma_attr,
301 	    DDI_DMA_SLEEP, NULL, &dma_hdl);
302 	if (status != DDI_SUCCESS) {
303 		kmem_free((void *)info, sizeof (hermon_dbr_info_t));
304 		cmn_err(CE_NOTE, "dbr DMA handle alloc failed\n");
305 		return (DDI_FAILURE);
306 	}
307 
308 	status = ddi_dma_mem_alloc(dma_hdl, PAGESIZE,
309 	    &state->hs_reg_accattr, DDI_DMA_CONSISTENT, DDI_DMA_SLEEP,
310 	    NULL, &dmaaddr, (size_t *)&dmalen, &acc_hdl);
311 	if (status != DDI_SUCCESS)	{
312 		ddi_dma_free_handle(&dma_hdl);
313 		cmn_err(CE_CONT, "dbr DMA mem alloc failed(status %d)", status);
314 		kmem_free((void *)info, sizeof (hermon_dbr_info_t));
315 		return (DDI_FAILURE);
316 	}
317 
318 	/* this memory won't be IB registered, so do the bind here */
319 	status = ddi_dma_addr_bind_handle(dma_hdl, NULL,
320 	    dmaaddr, (size_t)dmalen, DDI_DMA_RDWR |
321 	    DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &cookie, &cookie_cnt);
322 	if (status != DDI_SUCCESS) {
323 		ddi_dma_mem_free(&acc_hdl);
324 		ddi_dma_free_handle(&dma_hdl);
325 		kmem_free((void *)info, sizeof (hermon_dbr_info_t));
326 		cmn_err(CE_CONT, "dbr DMA bind handle failed (status %d)",
327 		    status);
328 		return (DDI_FAILURE);
329 	}
330 	*dinfo = info;		/* Pass back the pointer */
331 
332 	/* init the info structure with returned info */
333 	info->dbr_dmahdl = dma_hdl;
334 	info->dbr_acchdl = acc_hdl;
335 	info->dbr_page   = (hermon_dbr_t *)(void *)dmaaddr;
336 	info->dbr_link = NULL;
337 	/* extract the phys addr from the cookie */
338 	info->dbr_paddr = cookie.dmac_laddress;
339 	info->dbr_firstfree = 0;
340 	info->dbr_nfree = HERMON_NUM_DBR_PER_PAGE;
341 	/* link all DBrs onto the free list */
342 	for (i = 0; i < HERMON_NUM_DBR_PER_PAGE; i++) {
343 		info->dbr_page[i] = i + 1;
344 	}
345 
346 	return (DDI_SUCCESS);
347 }
348 
349 
350 /*
351  * hermon_dbr_alloc()
352  *	DBr record allocation - called from alloc cq/qp/srq
353  *	will check for available dbrs in current
354  *	page - if needed it will allocate another and link them
355  */
356 
357 int
358 hermon_dbr_alloc(hermon_state_t *state, uint_t index, ddi_acc_handle_t *acchdl,
359     hermon_dbr_t **vdbr, uint64_t *pdbr, uint64_t *mapoffset)
360 {
361 	hermon_dbr_t		*record = NULL;
362 	hermon_dbr_info_t	*info = NULL;
363 	uint32_t		idx;
364 	int			status;
365 
366 	if (index != state->hs_kernel_uar_index)
367 		return (hermon_user_dbr_alloc(state, index, acchdl, vdbr, pdbr,
368 		    mapoffset));
369 
370 	mutex_enter(&state->hs_dbr_lock);
371 	for (info = state->hs_kern_dbr; info != NULL; info = info->dbr_link)
372 		if (info->dbr_nfree != 0)
373 			break;		/* found a page w/ one available */
374 
375 	if (info == NULL) {	/* did NOT find a page with one available */
376 		status = hermon_dbr_page_alloc(state, &info);
377 		if (status != DDI_SUCCESS) {
378 			/* do error handling */
379 			mutex_exit(&state->hs_dbr_lock);
380 			return (DDI_FAILURE);
381 		}
382 		/* got a new page, so link it in. */
383 		info->dbr_link = state->hs_kern_dbr;
384 		state->hs_kern_dbr = info;
385 	}
386 	idx = info->dbr_firstfree;
387 	record = info->dbr_page + idx;
388 	info->dbr_firstfree = *record;
389 	info->dbr_nfree--;
390 	*record = 0;
391 
392 	*acchdl = info->dbr_acchdl;
393 	*vdbr = record;
394 	*pdbr = info->dbr_paddr + idx * sizeof (hermon_dbr_t);
395 	mutex_exit(&state->hs_dbr_lock);
396 	return (DDI_SUCCESS);
397 }
398 
399 /*
400  * hermon_dbr_free()
401  *	DBr record deallocation - called from free cq/qp
402  *	will update the counter in the header, and invalidate
403  *	the dbr, but will NEVER free pages of dbrs - small
404  *	price to pay, but userland access never will anyway
405  */
406 void
407 hermon_dbr_free(hermon_state_t *state, uint_t indx, hermon_dbr_t *record)
408 {
409 	hermon_dbr_t		*page;
410 	hermon_dbr_info_t	*info;
411 
412 	if (indx != state->hs_kernel_uar_index) {
413 		hermon_user_dbr_free(state, indx, record);
414 		return;
415 	}
416 	page = (hermon_dbr_t *)(uintptr_t)((uintptr_t)record & PAGEMASK);
417 	mutex_enter(&state->hs_dbr_lock);
418 	for (info = state->hs_kern_dbr; info != NULL; info = info->dbr_link)
419 		if (info->dbr_page == page)
420 			break;
421 	ASSERT(info != NULL);
422 	*record = info->dbr_firstfree;
423 	info->dbr_firstfree = record - info->dbr_page;
424 	info->dbr_nfree++;
425 	mutex_exit(&state->hs_dbr_lock);
426 }
427 
428 /*
429  * hermon_dbr_kern_free()
430  *    Context: Can be called only from detach context.
431  *
432  *	Free all kernel dbr pages.  This includes the freeing of all the dma
433  *	resources acquired during the allocation of the pages.
434  *
435  *	Also, free all the user dbr pages.
436  */
437 void
438 hermon_dbr_kern_free(hermon_state_t *state)
439 {
440 	hermon_dbr_info_t	*info, *link;
441 	hermon_user_dbr_t	*udbr, *next;
442 	hermon_udbr_page_t	*pagep, *nextp;
443 	hermon_umap_db_entry_t	*umapdb;
444 	int			instance, status;
445 	uint64_t		value;
446 	extern			hermon_umap_db_t hermon_userland_rsrc_db;
447 
448 	mutex_enter(&state->hs_dbr_lock);
449 	for (info = state->hs_kern_dbr; info != NULL; info = link) {
450 		(void) ddi_dma_unbind_handle(info->dbr_dmahdl);
451 		ddi_dma_mem_free(&info->dbr_acchdl);	/* free page */
452 		ddi_dma_free_handle(&info->dbr_dmahdl);
453 		link = info->dbr_link;
454 		kmem_free(info, sizeof (hermon_dbr_info_t));
455 	}
456 
457 	udbr = state->hs_user_dbr;
458 	instance = state->hs_instance;
459 	mutex_enter(&hermon_userland_rsrc_db.hdl_umapdb_lock);
460 	while (udbr != NULL) {
461 		pagep = udbr->udbr_pagep;
462 		while (pagep != NULL) {
463 			/* probably need to remove "db" */
464 			(void) ddi_dma_unbind_handle(pagep->upg_dmahdl);
465 			ddi_dma_free_handle(&pagep->upg_dmahdl);
466 			freerbuf(pagep->upg_buf);
467 			ddi_umem_free(pagep->upg_umemcookie);
468 			status = hermon_umap_db_find_nolock(instance,
469 			    HERMON_DBR_KEY(udbr->udbr_index,
470 			    pagep->upg_index), MLNX_UMAP_DBRMEM_RSRC,
471 			    &value, HERMON_UMAP_DB_REMOVE, &umapdb);
472 			if (status == DDI_SUCCESS)
473 				hermon_umap_db_free(umapdb);
474 			kmem_free(pagep->upg_free,
475 			    PAGESIZE / sizeof (hermon_dbr_t) / 8);
476 			nextp = pagep->upg_link;
477 			kmem_free(pagep, sizeof (*pagep));
478 			pagep = nextp;
479 		}
480 		next = udbr->udbr_link;
481 		kmem_free(udbr, sizeof (*udbr));
482 		udbr = next;
483 	}
484 	mutex_exit(&hermon_userland_rsrc_db.hdl_umapdb_lock);
485 	mutex_exit(&state->hs_dbr_lock);
486 }
487 
488 /*
489  * hermon_ah_alloc()
490  *    Context: Can be called only from user or kernel context.
491  */
492 int
493 hermon_ah_alloc(hermon_state_t *state, hermon_pdhdl_t pd,
494     ibt_adds_vect_t *attr_p, hermon_ahhdl_t *ahhdl, uint_t sleepflag)
495 {
496 	hermon_rsrc_t		*rsrc;
497 	hermon_hw_udav_t	*udav;
498 	hermon_ahhdl_t		ah;
499 	int			status;
500 
501 	/*
502 	 * Someday maybe the "ibt_adds_vect_t *attr_p" will be NULL to
503 	 * indicate that we wish to allocate an "invalid" (i.e. empty)
504 	 * address handle XXX
505 	 */
506 
507 	/* Validate that specified port number is legal */
508 	if (!hermon_portnum_is_valid(state, attr_p->av_port_num)) {
509 		return (IBT_HCA_PORT_INVALID);
510 	}
511 
512 	/*
513 	 * Allocate the software structure for tracking the address handle
514 	 * (i.e. the Hermon Address Handle struct).
515 	 */
516 	status = hermon_rsrc_alloc(state, HERMON_AHHDL, 1, sleepflag, &rsrc);
517 	if (status != DDI_SUCCESS) {
518 		return (IBT_INSUFF_RESOURCE);
519 	}
520 	ah = (hermon_ahhdl_t)rsrc->hr_addr;
521 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))
522 
523 	/* Increment the reference count on the protection domain (PD) */
524 	hermon_pd_refcnt_inc(pd);
525 
526 	udav = (hermon_hw_udav_t *)kmem_zalloc(sizeof (hermon_hw_udav_t),
527 	    KM_SLEEP);
528 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*udav))
529 
530 	/*
531 	 * Fill in the UDAV data. We first zero out the UDAV, then populate
532 	 * it by then calling hermon_set_addr_path() to fill in the common
533 	 * portions that can be pulled from the "ibt_adds_vect_t" passed in
534 	 */
535 	status = hermon_set_addr_path(state, attr_p,
536 	    (hermon_hw_addr_path_t *)udav, HERMON_ADDRPATH_UDAV);
537 	if (status != DDI_SUCCESS) {
538 		hermon_pd_refcnt_dec(pd);
539 		hermon_rsrc_free(state, &rsrc);
540 		return (status);
541 	}
542 	udav->pd	= pd->pd_pdnum;
543 	udav->sl	= attr_p->av_srvl;
544 
545 	/*
546 	 * Fill in the rest of the Hermon Address Handle struct.
547 	 *
548 	 * NOTE: We are saving away a copy of the "av_dgid.gid_guid" field
549 	 * here because we may need to return it later to the IBTF (as a
550 	 * result of a subsequent query operation).  Unlike the other UDAV
551 	 * parameters, the value of "av_dgid.gid_guid" is not always preserved.
552 	 * The reason for this is described in hermon_set_addr_path().
553 	 */
554 	ah->ah_rsrcp	 = rsrc;
555 	ah->ah_pdhdl	 = pd;
556 	ah->ah_udav	 = udav;
557 	ah->ah_save_guid = attr_p->av_dgid.gid_guid;
558 	*ahhdl = ah;
559 
560 	return (DDI_SUCCESS);
561 }
562 
563 
564 /*
565  * hermon_ah_free()
566  *    Context: Can be called only from user or kernel context.
567  */
568 /* ARGSUSED */
569 int
570 hermon_ah_free(hermon_state_t *state, hermon_ahhdl_t *ahhdl, uint_t sleepflag)
571 {
572 	hermon_rsrc_t		*rsrc;
573 	hermon_pdhdl_t		pd;
574 	hermon_ahhdl_t		ah;
575 
576 	/*
577 	 * Pull all the necessary information from the Hermon Address Handle
578 	 * struct.  This is necessary here because the resource for the
579 	 * AH is going to be freed up as part of this operation.
580 	 */
581 	ah    = *ahhdl;
582 	mutex_enter(&ah->ah_lock);
583 	rsrc  = ah->ah_rsrcp;
584 	pd    = ah->ah_pdhdl;
585 	mutex_exit(&ah->ah_lock);
586 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))
587 
588 	/* Free the UDAV memory */
589 	kmem_free(ah->ah_udav, sizeof (hermon_hw_udav_t));
590 
591 	/* Decrement the reference count on the protection domain (PD) */
592 	hermon_pd_refcnt_dec(pd);
593 
594 	/* Free the Hermon Address Handle structure */
595 	hermon_rsrc_free(state, &rsrc);
596 
597 	/* Set the ahhdl pointer to NULL and return success */
598 	*ahhdl = NULL;
599 
600 	return (DDI_SUCCESS);
601 }
602 
603 
604 /*
605  * hermon_ah_query()
606  *    Context: Can be called from interrupt or base context.
607  */
608 /* ARGSUSED */
609 int
610 hermon_ah_query(hermon_state_t *state, hermon_ahhdl_t ah, hermon_pdhdl_t *pd,
611     ibt_adds_vect_t *attr_p)
612 {
613 	mutex_enter(&ah->ah_lock);
614 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr_p))
615 
616 	/*
617 	 * Pull the PD and UDAV from the Hermon Address Handle structure
618 	 */
619 	*pd = ah->ah_pdhdl;
620 
621 	/*
622 	 * Fill in "ibt_adds_vect_t".  We call hermon_get_addr_path() to fill
623 	 * the common portions that can be pulled from the UDAV we pass in.
624 	 *
625 	 * NOTE: We will also fill the "av_dgid.gid_guid" field from the
626 	 * "ah_save_guid" field we have previously saved away.  The reason
627 	 * for this is described in hermon_ah_alloc() and hermon_ah_modify().
628 	 */
629 	hermon_get_addr_path(state, (hermon_hw_addr_path_t *)ah->ah_udav,
630 	    attr_p, HERMON_ADDRPATH_UDAV);
631 
632 	attr_p->av_dgid.gid_guid = ah->ah_save_guid;
633 
634 	mutex_exit(&ah->ah_lock);
635 	return (DDI_SUCCESS);
636 }
637 
638 
639 /*
640  * hermon_ah_modify()
641  *    Context: Can be called from interrupt or base context.
642  */
/* ARGSUSED */
int
hermon_ah_modify(hermon_state_t *state, hermon_ahhdl_t ah,
    ibt_adds_vect_t *attr_p)
{
	hermon_hw_udav_t	old_udav;
	uint64_t		data_old;
	int			status, size, i;

	/* Validate that specified port number is legal */
	if (!hermon_portnum_is_valid(state, attr_p->av_port_num)) {
		return (IBT_HCA_PORT_INVALID);
	}

	mutex_enter(&ah->ah_lock);

	/* Save a copy of the current UDAV data in old_udav. */
	bcopy(ah->ah_udav, &old_udav, sizeof (hermon_hw_udav_t));

	/*
	 * Fill in the new UDAV with the caller's data, passed in via the
	 * "ibt_adds_vect_t" structure.
	 *
	 * NOTE: We also need to save away a copy of the "av_dgid.gid_guid"
	 * field here (just as we did during hermon_ah_alloc()) because we
	 * may need to return it later to the IBTF (as a result of a
	 * subsequent query operation).  As explained in hermon_ah_alloc(),
	 * unlike the other UDAV parameters, the value of "av_dgid.gid_guid"
	 * is not always preserved. The reason for this is described in
	 * hermon_set_addr_path().
	 */
	status = hermon_set_addr_path(state, attr_p,
	    (hermon_hw_addr_path_t *)ah->ah_udav, HERMON_ADDRPATH_UDAV);
	if (status != DDI_SUCCESS) {
		mutex_exit(&ah->ah_lock);
		return (status);
	}
	ah->ah_save_guid = attr_p->av_dgid.gid_guid;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(ah->ah_udav)))
	ah->ah_udav->sl  = attr_p->av_srvl;

	/*
	 * Copy changes into the new UDAV.
	 *    Note:  We copy in 64-bit chunks.  For the first two of these
	 *    chunks it is necessary to read the current contents of the
	 *    UDAV, mask off the modifiable portions (maintaining any
	 *    of the "reserved" portions), and then mask on the new data.
	 *
	 *    hermon_set_addr_path() above already wrote the new values
	 *    in place into the live UDAV, so this loop only ORs the
	 *    preserved (masked) old bits of the first two quadwords back
	 *    in; later quadwords are left exactly as written.
	 *    NOTE(review): assumes HERMON_UDAV_MODIFY_MASK0/1 select only
	 *    the non-modifiable/reserved bits -- confirm against the
	 *    hermon_hw_udav_t layout in the hermon headers.
	 */
	size = sizeof (hermon_hw_udav_t) >> 3;
	for (i = 0; i < size; i++) {
		data_old = ((uint64_t *)&old_udav)[i];

		/*
		 * Apply mask to change only the relevant values.
		 */
		if (i == 0) {
			data_old = data_old & HERMON_UDAV_MODIFY_MASK0;
		} else if (i == 1) {
			data_old = data_old & HERMON_UDAV_MODIFY_MASK1;
		} else {
			data_old = 0;
		}

		/* Store the updated values to the UDAV */
		((uint64_t *)ah->ah_udav)[i] |= data_old;
	}

	/*
	 * Put the valid PD number back into the UDAV entry, as it
	 * might have been clobbered above.
	 */
	ah->ah_udav->pd = old_udav.pd;


	mutex_exit(&ah->ah_lock);
	return (DDI_SUCCESS);
}
720 
721 /*
722  * hermon_mcg_attach()
723  *    Context: Can be called only from user or kernel context.
724  */
725 int
726 hermon_mcg_attach(hermon_state_t *state, hermon_qphdl_t qp, ib_gid_t gid,
727     ib_lid_t lid)
728 {
729 	hermon_rsrc_t		*rsrc;
730 	hermon_hw_mcg_t		*mcg_entry;
731 	hermon_hw_mcg_qp_list_t	*mcg_entry_qplist;
732 	hermon_mcghdl_t		mcg, newmcg;
733 	uint64_t		mgid_hash;
734 	uint32_t		end_indx;
735 	int			status;
736 	uint_t			qp_found;
737 
738 	/*
739 	 * It is only allowed to attach MCG to UD queue pairs.  Verify
740 	 * that the intended QP is of the appropriate transport type
741 	 */
742 	if (qp->qp_serv_type != HERMON_QP_UD) {
743 		return (IBT_QP_SRV_TYPE_INVALID);
744 	}
745 
746 	/*
747 	 * Check for invalid Multicast DLID.  Specifically, all Multicast
748 	 * LIDs should be within a well defined range.  If the specified LID
749 	 * is outside of that range, then return an error.
750 	 */
751 	if (hermon_mlid_is_valid(lid) == 0) {
752 		return (IBT_MC_MLID_INVALID);
753 	}
754 	/*
755 	 * Check for invalid Multicast GID.  All Multicast GIDs should have
756 	 * a well-defined pattern of bits and flags that are allowable.  If
757 	 * the specified GID does not meet the criteria, then return an error.
758 	 */
759 	if (hermon_mgid_is_valid(gid) == 0) {
760 		return (IBT_MC_MGID_INVALID);
761 	}
762 
763 	/*
764 	 * Compute the MGID hash value.  Since the MCG table is arranged as
765 	 * a number of separate hash chains, this operation converts the
766 	 * specified MGID into the starting index of an entry in the hash
767 	 * table (i.e. the index for the start of the appropriate hash chain).
768 	 * Subsequent operations below will walk the chain searching for the
769 	 * right place to add this new QP.
770 	 */
771 	status = hermon_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid,
772 	    &mgid_hash, HERMON_SLEEPFLAG_FOR_CONTEXT());
773 	if (status != HERMON_CMD_SUCCESS) {
774 		cmn_err(CE_CONT, "Hermon: MGID_HASH command failed: %08x\n",
775 		    status);
776 		if (status == HERMON_CMD_INVALID_STATUS) {
777 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
778 		}
779 		return (ibc_get_ci_failure(0));
780 	}
781 
782 	/*
783 	 * Grab the multicast group mutex.  Then grab the pre-allocated
784 	 * temporary buffer used for holding and/or modifying MCG entries.
785 	 * Zero out the temporary MCG entry before we begin.
786 	 */
787 	mutex_enter(&state->hs_mcglock);
788 	mcg_entry = state->hs_mcgtmp;
789 	mcg_entry_qplist = HERMON_MCG_GET_QPLIST_PTR(mcg_entry);
790 	bzero(mcg_entry, HERMON_MCGMEM_SZ(state));
791 
792 	/*
793 	 * Walk through the array of MCG entries starting at "mgid_hash".
794 	 * Try to find the appropriate place for this new QP to be added.
795 	 * This could happen when the first entry of the chain has MGID == 0
796 	 * (which means that the hash chain is empty), or because we find
797 	 * an entry with the same MGID (in which case we'll add the QP to
798 	 * that MCG), or because we come to the end of the chain (in which
799 	 * case this is the first QP being added to the multicast group that
800 	 * corresponds to the MGID.  The hermon_mcg_walk_mgid_hash() routine
801 	 * walks the list and returns an index into the MCG table.  The entry
802 	 * at this index is then checked to determine which case we have
803 	 * fallen into (see below).  Note:  We are using the "shadow" MCG
804 	 * list (of hermon_mcg_t structs) for this lookup because the real
805 	 * MCG entries are in hardware (and the lookup process would be much
806 	 * more time consuming).
807 	 */
808 	end_indx = hermon_mcg_walk_mgid_hash(state, mgid_hash, gid, NULL);
809 	mcg	 = &state->hs_mcghdl[end_indx];
810 
811 	/*
812 	 * If MGID == 0, then the hash chain is empty.  Just fill in the
813 	 * current entry.  Note:  No need to allocate an MCG table entry
814 	 * as all the hash chain "heads" are already preallocated.
815 	 */
816 	if ((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) {
817 
818 		/* Fill in the current entry in the "shadow" MCG list */
819 		hermon_mcg_setup_new_hdr(mcg, mcg_entry, gid, NULL);
820 
821 		/*
822 		 * Try to add the new QP number to the list.  This (and the
823 		 * above) routine fills in a temporary MCG.  The "mcg_entry"
824 		 * and "mcg_entry_qplist" pointers simply point to different
825 		 * offsets within the same temporary copy of the MCG (for
826 		 * convenience).  Note:  If this fails, we need to invalidate
827 		 * the entries we've already put into the "shadow" list entry
828 		 * above.
829 		 */
830 		status = hermon_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp,
831 		    &qp_found);
832 		if (status != DDI_SUCCESS) {
833 			bzero(mcg, sizeof (struct hermon_sw_mcg_list_s));
834 			mutex_exit(&state->hs_mcglock);
835 			return (status);
836 		}
837 		if (!qp_found)
838 			mcg_entry->member_cnt = (mcg->mcg_num_qps + 1);
839 			    /* set the member count */
840 
841 		/*
842 		 * Once the temporary MCG has been filled in, write the entry
843 		 * into the appropriate location in the Hermon MCG entry table.
844 		 * If it's successful, then drop the lock and return success.
845 		 * Note: In general, this operation shouldn't fail.  If it
846 		 * does, then it is an indication that something (probably in
847 		 * HW, but maybe in SW) has gone seriously wrong.  We still
848 		 * want to zero out the entries that we've filled in above
849 		 * (in the hermon_mcg_setup_new_hdr() routine).
850 		 */
851 		status = hermon_write_mgm_cmd_post(state, mcg_entry, end_indx,
852 		    HERMON_CMD_NOSLEEP_SPIN);
853 		if (status != HERMON_CMD_SUCCESS) {
854 			bzero(mcg, sizeof (struct hermon_sw_mcg_list_s));
855 			mutex_exit(&state->hs_mcglock);
856 			HERMON_WARNING(state, "failed to write MCG entry");
857 			cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: "
858 			    "%08x\n", status);
859 			if (status == HERMON_CMD_INVALID_STATUS) {
860 				hermon_fm_ereport(state, HCA_SYS_ERR,
861 				    HCA_ERR_SRV_LOST);
862 			}
863 			return (ibc_get_ci_failure(0));
864 		}
865 
866 		/*
867 		 * Now that we know all the Hermon firmware accesses have been
868 		 * successful, we update the "shadow" MCG entry by incrementing
869 		 * the "number of attached QPs" count.
870 		 *
871 		 * We increment only if the QP is not already part of the
872 		 * MCG by checking the 'qp_found' flag returned from the
873 		 * qplist_add above.
874 		 */
875 		if (!qp_found) {
876 			mcg->mcg_num_qps++;
877 
878 			/*
879 			 * Increment the refcnt for this QP.  Because the QP
880 			 * was added to this MCG, the refcnt must be
881 			 * incremented.
882 			 */
883 			hermon_qp_mcg_refcnt_inc(qp);
884 		}
885 
886 		/*
887 		 * We drop the lock and return success.
888 		 */
889 		mutex_exit(&state->hs_mcglock);
890 		return (DDI_SUCCESS);
891 	}
892 
893 	/*
894 	 * If the specified MGID matches the MGID in the current entry, then
895 	 * we need to try to add the QP to the current MCG entry.  In this
896 	 * case, it means that we need to read the existing MCG entry (into
897 	 * the temporary MCG), add the new QP number to the temporary entry
898 	 * (using the same method we used above), and write the entry back
899 	 * to the hardware (same as above).
900 	 */
901 	if ((mcg->mcg_mgid_h == gid.gid_prefix) &&
902 	    (mcg->mcg_mgid_l == gid.gid_guid)) {
903 
904 		/*
905 		 * Read the current MCG entry into the temporary MCG.  Note:
906 		 * In general, this operation shouldn't fail.  If it does,
907 		 * then it is an indication that something (probably in HW,
908 		 * but maybe in SW) has gone seriously wrong.
909 		 */
910 		status = hermon_read_mgm_cmd_post(state, mcg_entry, end_indx,
911 		    HERMON_CMD_NOSLEEP_SPIN);
912 		if (status != HERMON_CMD_SUCCESS) {
913 			mutex_exit(&state->hs_mcglock);
914 			HERMON_WARNING(state, "failed to read MCG entry");
915 			cmn_err(CE_CONT, "Hermon: READ_MGM command failed: "
916 			    "%08x\n", status);
917 			if (status == HERMON_CMD_INVALID_STATUS) {
918 				hermon_fm_ereport(state, HCA_SYS_ERR,
919 				    HCA_ERR_SRV_LOST);
920 			}
921 			return (ibc_get_ci_failure(0));
922 		}
923 
924 		/*
925 		 * Try to add the new QP number to the list.  This routine
926 		 * fills in the necessary pieces of the temporary MCG.  The
927 		 * "mcg_entry_qplist" pointer is used to point to the portion
928 		 * of the temporary MCG that holds the QP numbers.
929 		 *
930 		 * Note: hermon_mcg_qplist_add() returns SUCCESS if it
931 		 * already found the QP in the list.  In this case, the QP is
932 		 * not added on to the list again.  Check the flag 'qp_found'
933 		 * if this value is needed to be known.
934 		 *
935 		 */
936 		status = hermon_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp,
937 		    &qp_found);
938 		if (status != DDI_SUCCESS) {
939 			mutex_exit(&state->hs_mcglock);
940 			return (status);
941 		}
942 		if (!qp_found)
943 			mcg_entry->member_cnt = (mcg->mcg_num_qps + 1);
944 			    /* set the member count */
945 
946 		/*
947 		 * Once the temporary MCG has been updated, write the entry
948 		 * into the appropriate location in the Hermon MCG entry table.
949 		 * If it's successful, then drop the lock and return success.
950 		 * Note: In general, this operation shouldn't fail.  If it
951 		 * does, then it is an indication that something (probably in
952 		 * HW, but maybe in SW) has gone seriously wrong.
953 		 */
954 		status = hermon_write_mgm_cmd_post(state, mcg_entry, end_indx,
955 		    HERMON_CMD_NOSLEEP_SPIN);
956 		if (status != HERMON_CMD_SUCCESS) {
957 			mutex_exit(&state->hs_mcglock);
958 			HERMON_WARNING(state, "failed to write MCG entry");
959 			cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: "
960 			    "%08x\n", status);
961 			if (status == HERMON_CMD_INVALID_STATUS) {
962 				hermon_fm_ereport(state, HCA_SYS_ERR,
963 				    HCA_ERR_SRV_LOST);
964 			}
965 			return (ibc_get_ci_failure(0));
966 		}
967 
968 		/*
969 		 * Now that we know all the Hermon firmware accesses have been
970 		 * successful, we update the current "shadow" MCG entry by
971 		 * incrementing the "number of attached QPs" count.
972 		 *
973 		 * We increment only if the QP is not already part of the
		 * MCG by checking the 'qp_found' flag returned from the
		 * qplist_add above.
976 		 */
977 		if (!qp_found) {
978 			mcg->mcg_num_qps++;
979 
980 			/*
981 			 * Increment the refcnt for this QP.  Because the QP
982 			 * was added to this MCG, the refcnt must be
983 			 * incremented.
984 			 */
985 			hermon_qp_mcg_refcnt_inc(qp);
986 		}
987 
988 		/*
989 		 * We drop the lock and return success.
990 		 */
991 		mutex_exit(&state->hs_mcglock);
992 		return (DDI_SUCCESS);
993 	}
994 
995 	/*
996 	 * If we've reached here, then we're at the end of the hash chain.
997 	 * We need to allocate a new MCG entry, fill it in, write it to Hermon,
998 	 * and update the previous entry to link the new one to the end of the
999 	 * chain.
1000 	 */
1001 
1002 	/*
1003 	 * Allocate an MCG table entry.  This will be filled in with all
1004 	 * the necessary parameters to define the multicast group.  Then it
1005 	 * will be written to the hardware in the next-to-last step below.
1006 	 */
1007 	status = hermon_rsrc_alloc(state, HERMON_MCG, 1, HERMON_NOSLEEP, &rsrc);
1008 	if (status != DDI_SUCCESS) {
1009 		mutex_exit(&state->hs_mcglock);
1010 		return (IBT_INSUFF_RESOURCE);
1011 	}
1012 
1013 	/*
1014 	 * Fill in the new entry in the "shadow" MCG list.  Note:  Just as
1015 	 * it does above, hermon_mcg_setup_new_hdr() also fills in a portion
1016 	 * of the temporary MCG entry (the rest of which will be filled in by
1017 	 * hermon_mcg_qplist_add() below)
1018 	 */
1019 	newmcg = &state->hs_mcghdl[rsrc->hr_indx];
1020 	hermon_mcg_setup_new_hdr(newmcg, mcg_entry, gid, rsrc);
1021 
1022 	/*
1023 	 * Try to add the new QP number to the list.  This routine fills in
1024 	 * the final necessary pieces of the temporary MCG.  The
1025 	 * "mcg_entry_qplist" pointer is used to point to the portion of the
1026 	 * temporary MCG that holds the QP numbers.  If we fail here, we
1027 	 * must undo the previous resource allocation.
1028 	 *
	 * Note: hermon_mcg_qplist_add() can return SUCCESS if it already
1030 	 * found the QP in the list.  In this case, the QP is not added on to
1031 	 * the list again.  Check the flag 'qp_found' if this value is needed
1032 	 * to be known.
1033 	 */
1034 	status = hermon_mcg_qplist_add(state, newmcg, mcg_entry_qplist, qp,
1035 	    &qp_found);
1036 	if (status != DDI_SUCCESS) {
1037 		bzero(newmcg, sizeof (struct hermon_sw_mcg_list_s));
1038 		hermon_rsrc_free(state, &rsrc);
1039 		mutex_exit(&state->hs_mcglock);
1040 		return (status);
1041 	}
1042 	mcg_entry->member_cnt = (newmcg->mcg_num_qps + 1);
1043 	    /* set the member count */
1044 
1045 	/*
1046 	 * Once the temporary MCG has been updated, write the entry into the
1047 	 * appropriate location in the Hermon MCG entry table.  If this is
1048 	 * successful, then we need to chain the previous entry to this one.
1049 	 * Note: In general, this operation shouldn't fail.  If it does, then
1050 	 * it is an indication that something (probably in HW, but maybe in
1051 	 * SW) has gone seriously wrong.
1052 	 */
1053 	status = hermon_write_mgm_cmd_post(state, mcg_entry, rsrc->hr_indx,
1054 	    HERMON_CMD_NOSLEEP_SPIN);
1055 	if (status != HERMON_CMD_SUCCESS) {
1056 		bzero(newmcg, sizeof (struct hermon_sw_mcg_list_s));
1057 		hermon_rsrc_free(state, &rsrc);
1058 		mutex_exit(&state->hs_mcglock);
1059 		HERMON_WARNING(state, "failed to write MCG entry");
1060 		cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: %08x\n",
1061 		    status);
1062 		if (status == HERMON_CMD_INVALID_STATUS) {
1063 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1064 		}
1065 		return (ibc_get_ci_failure(0));
1066 	}
1067 
1068 	/*
1069 	 * Now read the current MCG entry (the one previously at the end of
1070 	 * hash chain) into the temporary MCG.  We are going to update its
1071 	 * "next_gid_indx" now and write the entry back to the MCG table.
1072 	 * Note:  In general, this operation shouldn't fail.  If it does, then
1073 	 * it is an indication that something (probably in HW, but maybe in SW)
1074 	 * has gone seriously wrong.  We will free up the MCG entry resource,
1075 	 * but we will not undo the previously written MCG entry in the HW.
1076 	 * This is OK, though, because the MCG entry is not currently attached
1077 	 * to any hash chain.
1078 	 */
1079 	status = hermon_read_mgm_cmd_post(state, mcg_entry, end_indx,
1080 	    HERMON_CMD_NOSLEEP_SPIN);
1081 	if (status != HERMON_CMD_SUCCESS) {
1082 		bzero(newmcg, sizeof (struct hermon_sw_mcg_list_s));
1083 		hermon_rsrc_free(state, &rsrc);
1084 		mutex_exit(&state->hs_mcglock);
1085 		HERMON_WARNING(state, "failed to read MCG entry");
1086 		cmn_err(CE_CONT, "Hermon: READ_MGM command failed: %08x\n",
1087 		    status);
1088 		if (status == HERMON_CMD_INVALID_STATUS) {
1089 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1090 		}
1091 		return (ibc_get_ci_failure(0));
1092 	}
1093 
1094 	/*
1095 	 * Finally, we update the "next_gid_indx" field in the temporary MCG
1096 	 * and attempt to write the entry back into the Hermon MCG table.  If
1097 	 * this succeeds, then we update the "shadow" list to reflect the
1098 	 * change, drop the lock, and return success.  Note:  In general, this
1099 	 * operation shouldn't fail.  If it does, then it is an indication
1100 	 * that something (probably in HW, but maybe in SW) has gone seriously
1101 	 * wrong.  Just as we do above, we will free up the MCG entry resource,
1102 	 * but we will not try to undo the previously written MCG entry.  This
1103 	 * is OK, though, because (since we failed here to update the end of
1104 	 * the chain) that other entry is not currently attached to any chain.
1105 	 */
1106 	mcg_entry->next_gid_indx = rsrc->hr_indx;
1107 	status = hermon_write_mgm_cmd_post(state, mcg_entry, end_indx,
1108 	    HERMON_CMD_NOSLEEP_SPIN);
1109 	if (status != HERMON_CMD_SUCCESS) {
1110 		bzero(newmcg, sizeof (struct hermon_sw_mcg_list_s));
1111 		hermon_rsrc_free(state, &rsrc);
1112 		mutex_exit(&state->hs_mcglock);
1113 		HERMON_WARNING(state, "failed to write MCG entry");
1114 		cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: %08x\n",
1115 		    status);
1116 		if (status == HERMON_CMD_INVALID_STATUS) {
1117 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1118 		}
1119 		return (ibc_get_ci_failure(0));
1120 	}
1121 	mcg = &state->hs_mcghdl[end_indx];
1122 	mcg->mcg_next_indx = rsrc->hr_indx;
1123 
1124 	/*
1125 	 * Now that we know all the Hermon firmware accesses have been
1126 	 * successful, we update the new "shadow" MCG entry by incrementing
1127 	 * the "number of attached QPs" count.  Then we drop the lock and
1128 	 * return success.
1129 	 */
1130 	newmcg->mcg_num_qps++;
1131 
1132 	/*
1133 	 * Increment the refcnt for this QP.  Because the QP
1134 	 * was added to this MCG, the refcnt must be
1135 	 * incremented.
1136 	 */
1137 	hermon_qp_mcg_refcnt_inc(qp);
1138 
1139 	mutex_exit(&state->hs_mcglock);
1140 	return (DDI_SUCCESS);
1141 }
1142 
1143 
/*
 * hermon_mcg_detach()
 *    Context: Can be called only from user or kernel context.
 *
 *    Detach the queue pair "qp" from the multicast group identified by
 *    the MGID "gid" and multicast DLID "lid".
 *
 *    Returns DDI_SUCCESS on success.  Otherwise returns
 *    IBT_MC_MLID_INVALID (DLID out of the multicast range),
 *    IBT_MC_MGID_INVALID (no MCG entry matches the MGID), the failure
 *    status from hermon_mcg_qplist_remove() (QP was not a member), or
 *    an ibc_get_ci_failure() code when a firmware command fails.
 */
int
hermon_mcg_detach(hermon_state_t *state, hermon_qphdl_t qp, ib_gid_t gid,
    ib_lid_t lid)
{
	hermon_hw_mcg_t		*mcg_entry;
	hermon_hw_mcg_qp_list_t	*mcg_entry_qplist;
	hermon_mcghdl_t		mcg;
	uint64_t		mgid_hash;
	uint32_t		end_indx, prev_indx;
	int			status;

	/*
	 * Check for invalid Multicast DLID.  Specifically, all Multicast
	 * LIDs should be within a well defined range.  If the specified LID
	 * is outside of that range, then return an error.
	 */
	if (hermon_mlid_is_valid(lid) == 0) {
		return (IBT_MC_MLID_INVALID);
	}

	/*
	 * Compute the MGID hash value.  As described above, the MCG table is
	 * arranged as a number of separate hash chains.  This operation
	 * converts the specified MGID into the starting index of an entry in
	 * the hash table (i.e. the index for the start of the appropriate
	 * hash chain).  Subsequent operations below will walk the chain
	 * searching for a matching entry from which to attempt to remove
	 * the specified QP.
	 */
	status = hermon_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid,
	    &mgid_hash, HERMON_SLEEPFLAG_FOR_CONTEXT());
	if (status != HERMON_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Hermon: MGID_HASH command failed: %08x\n",
		    status);
		if (status == HERMON_CMD_INVALID_STATUS) {
			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
		}
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Grab the multicast group mutex.  Then grab the pre-allocated
	 * temporary buffer used for holding and/or modifying MCG entries.
	 * (The buffer is shared, so it is only valid while the lock is held.)
	 */
	mutex_enter(&state->hs_mcglock);
	mcg_entry = state->hs_mcgtmp;
	mcg_entry_qplist = HERMON_MCG_GET_QPLIST_PTR(mcg_entry);

	/*
	 * Walk through the array of MCG entries starting at "mgid_hash".
	 * Try to find an MCG entry with a matching MGID.  The
	 * hermon_mcg_walk_mgid_hash() routine walks the list and returns an
	 * index into the MCG table.  The entry at this index is checked to
	 * determine whether it is a match or not.  If it is a match, then
	 * we continue on to attempt to remove the QP from the MCG.  If it
	 * is not a match (or not a valid MCG entry), then we return an error.
	 */
	end_indx = hermon_mcg_walk_mgid_hash(state, mgid_hash, gid, &prev_indx);
	mcg	 = &state->hs_mcghdl[end_indx];

	/*
	 * If MGID == 0 (the hash chain is empty) or if the specified MGID
	 * does not match the MGID in the current entry, then return
	 * IBT_MC_MGID_INVALID (to indicate that the specified MGID is not
	 * valid).
	 */
	if (((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) ||
	    ((mcg->mcg_mgid_h != gid.gid_prefix) ||
	    (mcg->mcg_mgid_l != gid.gid_guid))) {
		mutex_exit(&state->hs_mcglock);
		return (IBT_MC_MGID_INVALID);
	}

	/*
	 * Read the current MCG entry into the temporary MCG.  Note: In
	 * general, this operation shouldn't fail.  If it does, then it is
	 * an indication that something (probably in HW, but maybe in SW)
	 * has gone seriously wrong.
	 */
	status = hermon_read_mgm_cmd_post(state, mcg_entry, end_indx,
	    HERMON_CMD_NOSLEEP_SPIN);
	if (status != HERMON_CMD_SUCCESS) {
		mutex_exit(&state->hs_mcglock);
		HERMON_WARNING(state, "failed to read MCG entry");
		cmn_err(CE_CONT, "Hermon: READ_MGM command failed: %08x\n",
		    status);
		if (status == HERMON_CMD_INVALID_STATUS) {
			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
		}
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Search the QP number list for a match.  If a match is found, then
	 * remove the entry from the QP list.  Otherwise, if no match is found,
	 * return an error.
	 */
	status = hermon_mcg_qplist_remove(mcg, mcg_entry_qplist, qp);
	if (status != DDI_SUCCESS) {
		mutex_exit(&state->hs_mcglock);
		return (status);
	}

	/*
	 * Decrement the MCG count for this QP.  When the 'qp_mcg'
	 * field becomes 0, then this QP is no longer a member of any
	 * MCG.
	 */
	hermon_qp_mcg_refcnt_dec(qp);

	/*
	 * If the current MCG's QP number list is about to be made empty
	 * ("mcg_num_qps" == 1), then remove the entry itself from the hash
	 * chain.  Otherwise, just write the updated MCG entry back to the
	 * hardware.  In either case, once we successfully update the hardware
	 * chain, then we decrement the "shadow" list entry's "mcg_num_qps"
	 * count (or zero out the entire "shadow" list entry) before returning
	 * success.  Note:  Zeroing out the "shadow" list entry is done
	 * inside of hermon_mcg_hash_list_remove().
	 */
	if (mcg->mcg_num_qps == 1) {

		/* Remove an MCG entry from the hash chain */
		status = hermon_mcg_hash_list_remove(state, end_indx, prev_indx,
		    mcg_entry);
		if (status != DDI_SUCCESS) {
			mutex_exit(&state->hs_mcglock);
			return (status);
		}

	} else {
		/*
		 * Write the updated MCG entry back to the Hermon MCG table.
		 * If this succeeds, then we update the "shadow" list to
		 * reflect the change (i.e. decrement the "mcg_num_qps"),
		 * drop the lock, and return success.  Note:  In general,
		 * this operation shouldn't fail.  If it does, then it is an
		 * indication that something (probably in HW, but maybe in SW)
		 * has gone seriously wrong.
		 */
		mcg_entry->member_cnt = (mcg->mcg_num_qps - 1);
		status = hermon_write_mgm_cmd_post(state, mcg_entry, end_indx,
		    HERMON_CMD_NOSLEEP_SPIN);
		if (status != HERMON_CMD_SUCCESS) {
			mutex_exit(&state->hs_mcglock);
			HERMON_WARNING(state, "failed to write MCG entry");
			cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: "
			    "%08x\n", status);
			if (status == HERMON_CMD_INVALID_STATUS) {
				hermon_fm_ereport(state, HCA_SYS_ERR,
				    HCA_ERR_SRV_LOST);
			}
			return (ibc_get_ci_failure(0));
		}
		mcg->mcg_num_qps--;
	}

	mutex_exit(&state->hs_mcglock);
	return (DDI_SUCCESS);
}
1308 
/*
 * hermon_qp_mcg_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 *
 *    Bump the count of multicast groups to which this QP is currently
 *    attached.  The counter is protected by the QP's own lock, not the
 *    global MCG lock.
 */
static void
hermon_qp_mcg_refcnt_inc(hermon_qphdl_t qp)
{
	/* Increment the QP's MCG reference count */
	mutex_enter(&qp->qp_lock);
	qp->qp_mcg_refcnt++;
	mutex_exit(&qp->qp_lock);
}
1321 
1322 
/*
 * hermon_qp_mcg_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 *
 *    Drop the count of multicast groups to which this QP is attached;
 *    when it reaches zero the QP is no longer a member of any MCG.  The
 *    counter is protected by the QP's own lock, not the global MCG lock.
 */
static void
hermon_qp_mcg_refcnt_dec(hermon_qphdl_t qp)
{
	/* Decrement the QP's MCG reference count */
	mutex_enter(&qp->qp_lock);
	qp->qp_mcg_refcnt--;
	mutex_exit(&qp->qp_lock);
}
1335 
1336 
1337 /*
1338  * hermon_mcg_qplist_add()
1339  *    Context: Can be called from interrupt or base context.
1340  */
1341 static int
1342 hermon_mcg_qplist_add(hermon_state_t *state, hermon_mcghdl_t mcg,
1343     hermon_hw_mcg_qp_list_t *mcg_qplist, hermon_qphdl_t qp,
1344     uint_t *qp_found)
1345 {
1346 	uint_t		qplist_indx;
1347 
1348 	ASSERT(MUTEX_HELD(&state->hs_mcglock));
1349 
1350 	qplist_indx = mcg->mcg_num_qps;
1351 
1352 	/*
1353 	 * Determine if we have exceeded the maximum number of QP per
1354 	 * multicast group.  If we have, then return an error
1355 	 */
1356 	if (qplist_indx >= state->hs_cfg_profile->cp_num_qp_per_mcg) {
1357 		return (IBT_HCA_MCG_QP_EXCEEDED);
1358 	}
1359 
1360 	/*
1361 	 * Determine if the QP is already attached to this MCG table.  If it
1362 	 * is, then we break out and treat this operation as a NO-OP
1363 	 */
1364 	for (qplist_indx = 0; qplist_indx < mcg->mcg_num_qps;
1365 	    qplist_indx++) {
1366 		if (mcg_qplist[qplist_indx].qpn == qp->qp_qpnum) {
1367 			break;
1368 		}
1369 	}
1370 
1371 	/*
1372 	 * If the QP was already on the list, set 'qp_found' to TRUE.  We still
1373 	 * return SUCCESS in this case, but the qplist will not have been
1374 	 * updated because the QP was already on the list.
1375 	 */
1376 	if (qplist_indx < mcg->mcg_num_qps) {
1377 		*qp_found = 1;
1378 	} else {
1379 		/*
1380 		 * Otherwise, append the new QP number to the end of the
1381 		 * current QP list.  Note: We will increment the "mcg_num_qps"
1382 		 * field on the "shadow" MCG list entry later (after we know
1383 		 * that all necessary Hermon firmware accesses have been
1384 		 * successful).
1385 		 *
1386 		 * Set 'qp_found' to 0 so we know the QP was added on to the
1387 		 * list for sure.
1388 		 */
1389 		mcg_qplist[qplist_indx].qpn =
1390 		    (qp->qp_qpnum | HERMON_MCG_QPN_BLOCK_LB);
1391 		*qp_found = 0;
1392 	}
1393 
1394 	return (DDI_SUCCESS);
1395 }
1396 
1397 
1398 
1399 /*
1400  * hermon_mcg_qplist_remove()
1401  *    Context: Can be called from interrupt or base context.
1402  */
1403 static int
1404 hermon_mcg_qplist_remove(hermon_mcghdl_t mcg,
1405     hermon_hw_mcg_qp_list_t *mcg_qplist, hermon_qphdl_t qp)
1406 {
1407 	uint_t		i, qplist_indx;
1408 
1409 	/*
1410 	 * Search the MCG QP list for a matching QPN.  When
1411 	 * it's found, we swap the last entry with the current
1412 	 * one, set the last entry to zero, decrement the last
1413 	 * entry, and return.  If it's not found, then it's
1414 	 * and error.
1415 	 */
1416 	qplist_indx = mcg->mcg_num_qps;
1417 	for (i = 0; i < qplist_indx; i++) {
1418 		if (mcg_qplist[i].qpn == qp->qp_qpnum) {
1419 			mcg_qplist[i] = mcg_qplist[qplist_indx - 1];
1420 			mcg_qplist[qplist_indx - 1].qpn = 0;
1421 
1422 			return (DDI_SUCCESS);
1423 		}
1424 	}
1425 
1426 	return (IBT_QP_HDL_INVALID);
1427 }
1428 
1429 
1430 /*
1431  * hermon_mcg_walk_mgid_hash()
1432  *    Context: Can be called from interrupt or base context.
1433  */
1434 static uint_t
1435 hermon_mcg_walk_mgid_hash(hermon_state_t *state, uint64_t start_indx,
1436     ib_gid_t mgid, uint_t *p_indx)
1437 {
1438 	hermon_mcghdl_t	curr_mcghdl;
1439 	uint_t		curr_indx, prev_indx;
1440 
1441 	ASSERT(MUTEX_HELD(&state->hs_mcglock));
1442 
1443 	/* Start at the head of the hash chain */
1444 	curr_indx   = (uint_t)start_indx;
1445 	prev_indx   = curr_indx;
1446 	curr_mcghdl = &state->hs_mcghdl[curr_indx];
1447 
1448 	/* If the first entry in the chain has MGID == 0, then stop */
1449 	if ((curr_mcghdl->mcg_mgid_h == 0) &&
1450 	    (curr_mcghdl->mcg_mgid_l == 0)) {
1451 		goto end_mgid_hash_walk;
1452 	}
1453 
1454 	/* If the first entry in the chain matches the MGID, then stop */
1455 	if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) &&
1456 	    (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) {
1457 		goto end_mgid_hash_walk;
1458 	}
1459 
1460 	/* Otherwise, walk the hash chain looking for a match */
1461 	while (curr_mcghdl->mcg_next_indx != 0) {
1462 		prev_indx = curr_indx;
1463 		curr_indx = curr_mcghdl->mcg_next_indx;
1464 		curr_mcghdl = &state->hs_mcghdl[curr_indx];
1465 
1466 		if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) &&
1467 		    (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) {
1468 			break;
1469 		}
1470 	}
1471 
1472 end_mgid_hash_walk:
1473 	/*
1474 	 * If necessary, return the index of the previous entry too.  This
1475 	 * is primarily used for detaching a QP from a multicast group.  It
1476 	 * may be necessary, in that case, to delete an MCG entry from the
1477 	 * hash chain and having the index of the previous entry is helpful.
1478 	 */
1479 	if (p_indx != NULL) {
1480 		*p_indx = prev_indx;
1481 	}
1482 	return (curr_indx);
1483 }
1484 
1485 
1486 /*
1487  * hermon_mcg_setup_new_hdr()
1488  *    Context: Can be called from interrupt or base context.
1489  */
1490 static void
1491 hermon_mcg_setup_new_hdr(hermon_mcghdl_t mcg, hermon_hw_mcg_t *mcg_hdr,
1492     ib_gid_t mgid, hermon_rsrc_t *mcg_rsrc)
1493 {
1494 	/*
1495 	 * Fill in the fields of the "shadow" entry used by software
1496 	 * to track MCG hardware entry
1497 	 */
1498 	mcg->mcg_mgid_h	   = mgid.gid_prefix;
1499 	mcg->mcg_mgid_l	   = mgid.gid_guid;
1500 	mcg->mcg_rsrcp	   = mcg_rsrc;
1501 	mcg->mcg_next_indx = 0;
1502 	mcg->mcg_num_qps   = 0;
1503 
1504 	/*
1505 	 * Fill the header fields of the MCG entry (in the temporary copy)
1506 	 */
1507 	mcg_hdr->mgid_h		= mgid.gid_prefix;
1508 	mcg_hdr->mgid_l		= mgid.gid_guid;
1509 	mcg_hdr->next_gid_indx	= 0;
1510 }
1511 
1512 
/*
 * hermon_mcg_hash_list_remove()
 *    Context: Can be called only from user or kernel context.
 *
 *    Unlink the MCG entry at "curr_indx" from its hash chain.
 *    "prev_indx" is the index of the preceding entry on the chain (equal
 *    to "curr_indx" when the entry is the head of its chain), and
 *    "mcg_entry" is the caller's temporary buffer used for the READ_MGM/
 *    WRITE_MGM firmware commands.  On success the corresponding "shadow"
 *    list entries are updated and any MCG entry resource that is no
 *    longer needed is freed.
 */
static int
hermon_mcg_hash_list_remove(hermon_state_t *state, uint_t curr_indx,
    uint_t prev_indx, hermon_hw_mcg_t *mcg_entry)
{
	hermon_mcghdl_t		curr_mcg, prev_mcg, next_mcg;
	uint_t			next_indx;
	int			status;

	/* Get the pointer to "shadow" list for current entry */
	curr_mcg = &state->hs_mcghdl[curr_indx];

	/*
	 * If this is the first entry on a hash chain, then attempt to replace
	 * the entry with the next entry on the chain.  If there are no
	 * subsequent entries on the chain, then this is the only entry and
	 * should be invalidated.
	 */
	if (curr_indx == prev_indx) {

		/*
		 * If this is the only entry on the chain, then invalidate it.
		 * Note:  Invalidating an MCG entry means writing all zeros
		 * to the entry.  This is only necessary for those MCG
		 * entries that are the "head" entries of the individual hash
		 * chains.  Regardless of whether this operation returns
		 * success or failure, return that result to the caller.
		 */
		next_indx = curr_mcg->mcg_next_indx;
		if (next_indx == 0) {
			status = hermon_mcg_entry_invalidate(state, mcg_entry,
			    curr_indx);
			bzero(curr_mcg, sizeof (struct hermon_sw_mcg_list_s));
			return (status);
		}

		/*
		 * Otherwise, this is just the first entry on the chain, so
		 * grab the next one
		 */
		next_mcg = &state->hs_mcghdl[next_indx];

		/*
		 * Read the next MCG entry into the temporary MCG.  Note:
		 * In general, this operation shouldn't fail.  If it does,
		 * then it is an indication that something (probably in HW,
		 * but maybe in SW) has gone seriously wrong.
		 */
		status = hermon_read_mgm_cmd_post(state, mcg_entry, next_indx,
		    HERMON_CMD_NOSLEEP_SPIN);
		if (status != HERMON_CMD_SUCCESS) {
			HERMON_WARNING(state, "failed to read MCG entry");
			cmn_err(CE_CONT, "Hermon: READ_MGM command failed: "
			    "%08x\n", status);
			if (status == HERMON_CMD_INVALID_STATUS) {
				hermon_fm_ereport(state, HCA_SYS_ERR,
				    HCA_ERR_SRV_LOST);
			}
			return (ibc_get_ci_failure(0));
		}

		/*
		 * Copy/Write the temporary MCG back to the hardware MCG list
		 * using the current index.  This essentially removes the
		 * current MCG entry from the list by writing over it with
		 * the next one.  If this is successful, then we can do the
		 * same operation for the "shadow" list.  And we can also
		 * free up the Hermon MCG entry resource that was associated
		 * with the (old) next entry.  Note:  In general, this
		 * operation shouldn't fail.  If it does, then it is an
		 * indication that something (probably in HW, but maybe in SW)
		 * has gone seriously wrong.
		 */
		status = hermon_write_mgm_cmd_post(state, mcg_entry, curr_indx,
		    HERMON_CMD_NOSLEEP_SPIN);
		if (status != HERMON_CMD_SUCCESS) {
			HERMON_WARNING(state, "failed to write MCG entry");
			cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: "
			    "%08x\n", status);
			if (status == HERMON_CMD_INVALID_STATUS) {
				hermon_fm_ereport(state, HCA_SYS_ERR,
				    HCA_ERR_SRV_LOST);
			}
			return (ibc_get_ci_failure(0));
		}

		/*
		 * Copy all the software tracking information from the next
		 * entry on the "shadow" MCG list into the current entry on
		 * the list.  Then invalidate (zero out) the other "shadow"
		 * list entry.
		 */
		bcopy(next_mcg, curr_mcg, sizeof (struct hermon_sw_mcg_list_s));
		bzero(next_mcg, sizeof (struct hermon_sw_mcg_list_s));

		/*
		 * Free up the Hermon MCG entry resource used by the "next"
		 * MCG entry.  That resource is no longer needed by any
		 * MCG entry which is first on a hash chain (like the "next"
		 * entry has just become).  Note: after the bcopy() above,
		 * "curr_mcg->mcg_rsrcp" holds the "next" entry's resource
		 * handle, which is why it is the one freed here.
		 */
		hermon_rsrc_free(state, &curr_mcg->mcg_rsrcp);

		return (DDI_SUCCESS);
	}

	/*
	 * Else if this is the last entry on the hash chain (or a middle
	 * entry), then we update the previous entry's "next_gid_index" field
	 * to make it point instead to the next entry on the chain.  By
	 * skipping over the removed entry in this way, we can then free up
	 * any resources associated with the current entry.  Note:  We don't
	 * need to invalidate the "skipped over" hardware entry because it
	 * will no longer be connected to any hash chains, and if/when it is
	 * finally re-used, it will be written with entirely new values.
	 */

	/*
	 * Read the next MCG entry into the temporary MCG.  Note:  In general,
	 * this operation shouldn't fail.  If it does, then it is an
	 * indication that something (probably in HW, but maybe in SW) has
	 * gone seriously wrong.
	 */
	status = hermon_read_mgm_cmd_post(state, mcg_entry, prev_indx,
	    HERMON_CMD_NOSLEEP_SPIN);
	if (status != HERMON_CMD_SUCCESS) {
		HERMON_WARNING(state, "failed to read MCG entry");
		cmn_err(CE_CONT, "Hermon: READ_MGM command failed: %08x\n",
		    status);
		if (status == HERMON_CMD_INVALID_STATUS) {
			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
		}
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Finally, we update the "next_gid_indx" field in the temporary MCG
	 * and attempt to write the entry back into the Hermon MCG table.  If
	 * this succeeds, then we update the "shadow" list to reflect the
	 * change, free up the Hermon MCG entry resource that was associated
	 * with the current entry, and return success.  Note:  In general,
	 * this operation shouldn't fail.  If it does, then it is an indication
	 * that something (probably in HW, but maybe in SW) has gone seriously
	 * wrong.
	 */
	mcg_entry->next_gid_indx = curr_mcg->mcg_next_indx;
	status = hermon_write_mgm_cmd_post(state, mcg_entry, prev_indx,
	    HERMON_CMD_NOSLEEP_SPIN);
	if (status != HERMON_CMD_SUCCESS) {
		HERMON_WARNING(state, "failed to write MCG entry");
		cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: %08x\n",
		    status);
		if (status == HERMON_CMD_INVALID_STATUS) {
			hermon_fm_ereport(state, HCA_SYS_ERR,
			    HCA_ERR_SRV_LOST);
		}
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Get the pointer to the "shadow" MCG list entry for the previous
	 * MCG.  Update its "mcg_next_indx" to point to the next entry
	 * the one after the current entry. Note:  This next index may be
	 * zero, indicating the end of the list.
	 */
	prev_mcg = &state->hs_mcghdl[prev_indx];
	prev_mcg->mcg_next_indx = curr_mcg->mcg_next_indx;

	/*
	 * Free up the Hermon MCG entry resource used by the current entry.
	 * This resource is no longer needed because the chain now skips over
	 * the current entry.  Then invalidate (zero out) the current "shadow"
	 * list entry.
	 */
	hermon_rsrc_free(state, &curr_mcg->mcg_rsrcp);
	bzero(curr_mcg, sizeof (struct hermon_sw_mcg_list_s));

	return (DDI_SUCCESS);
}
1695 
1696 
/*
 * hermon_mcg_entry_invalidate()
 *    Context: Can be called only from user or kernel context.
 *
 *    Invalidate the hardware MCG entry at index "indx" by zeroing the
 *    caller's temporary MCG buffer ("mcg_entry") and writing it out with
 *    the WRITE_MGM firmware command.
 */
static int
hermon_mcg_entry_invalidate(hermon_state_t *state, hermon_hw_mcg_t *mcg_entry,
    uint_t indx)
{
	int		status;

	/*
	 * Invalidate the hardware MCG entry by zeroing out this temporary
	 * MCG and writing it to the hardware.  Note: In general, this
	 * operation shouldn't fail.  If it does, then it is an indication
	 * that something (probably in HW, but maybe in SW) has gone seriously
	 * wrong.
	 */
	bzero(mcg_entry, HERMON_MCGMEM_SZ(state));
	status = hermon_write_mgm_cmd_post(state, mcg_entry, indx,
	    HERMON_CMD_NOSLEEP_SPIN);
	if (status != HERMON_CMD_SUCCESS) {
		HERMON_WARNING(state, "failed to write MCG entry");
		cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: %08x\n",
		    status);
		if (status == HERMON_CMD_INVALID_STATUS) {
			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
		}
		return (ibc_get_ci_failure(0));
	}

	return (DDI_SUCCESS);
}
1729 
1730 
1731 /*
1732  * hermon_mgid_is_valid()
1733  *    Context: Can be called from interrupt or base context.
1734  */
1735 static int
1736 hermon_mgid_is_valid(ib_gid_t gid)
1737 {
1738 	uint_t		topbits, flags, scope;
1739 
1740 	/*
1741 	 * According to IBA 1.1 specification (section 4.1.1) a valid
1742 	 * "multicast GID" must have its top eight bits set to all ones
1743 	 */
1744 	topbits = (gid.gid_prefix >> HERMON_MCG_TOPBITS_SHIFT) &
1745 	    HERMON_MCG_TOPBITS_MASK;
1746 	if (topbits != HERMON_MCG_TOPBITS) {
1747 		return (0);
1748 	}
1749 
1750 	/*
1751 	 * The next 4 bits are the "flag" bits.  These are valid only
1752 	 * if they are "0" (which correspond to permanently assigned/
1753 	 * "well-known" multicast GIDs) or "1" (for so-called "transient"
1754 	 * multicast GIDs).  All other values are reserved.
1755 	 */
1756 	flags = (gid.gid_prefix >> HERMON_MCG_FLAGS_SHIFT) &
1757 	    HERMON_MCG_FLAGS_MASK;
1758 	if (!((flags == HERMON_MCG_FLAGS_PERM) ||
1759 	    (flags == HERMON_MCG_FLAGS_NONPERM))) {
1760 		return (0);
1761 	}
1762 
1763 	/*
1764 	 * The next 4 bits are the "scope" bits.  These are valid only
1765 	 * if they are "2" (Link-local), "5" (Site-local), "8"
1766 	 * (Organization-local) or "E" (Global).  All other values
1767 	 * are reserved (or currently unassigned).
1768 	 */
1769 	scope = (gid.gid_prefix >> HERMON_MCG_SCOPE_SHIFT) &
1770 	    HERMON_MCG_SCOPE_MASK;
1771 	if (!((scope == HERMON_MCG_SCOPE_LINKLOC) ||
1772 	    (scope == HERMON_MCG_SCOPE_SITELOC)	 ||
1773 	    (scope == HERMON_MCG_SCOPE_ORGLOC)	 ||
1774 	    (scope == HERMON_MCG_SCOPE_GLOBAL))) {
1775 		return (0);
1776 	}
1777 
1778 	/*
1779 	 * If it passes all of the above checks, then we will consider it
1780 	 * a valid multicast GID.
1781 	 */
1782 	return (1);
1783 }
1784 
1785 
1786 /*
1787  * hermon_mlid_is_valid()
1788  *    Context: Can be called from interrupt or base context.
1789  */
1790 static int
1791 hermon_mlid_is_valid(ib_lid_t lid)
1792 {
1793 	/*
1794 	 * According to IBA 1.1 specification (section 4.1.1) a valid
1795 	 * "multicast DLID" must be between 0xC000 and 0xFFFE.
1796 	 */
1797 	if ((lid < IB_LID_MC_FIRST) || (lid > IB_LID_MC_LAST)) {
1798 		return (0);
1799 	}
1800 
1801 	return (1);
1802 }
1803 
1804 
1805 /*
1806  * hermon_pd_alloc()
1807  *    Context: Can be called only from user or kernel context.
1808  */
1809 int
1810 hermon_pd_alloc(hermon_state_t *state, hermon_pdhdl_t *pdhdl, uint_t sleepflag)
1811 {
1812 	hermon_rsrc_t	*rsrc;
1813 	hermon_pdhdl_t	pd;
1814 	int		status;
1815 
1816 	/*
1817 	 * Allocate the software structure for tracking the protection domain
1818 	 * (i.e. the Hermon Protection Domain handle).  By default each PD
1819 	 * structure will have a unique PD number assigned to it.  All that
1820 	 * is necessary is for software to initialize the PD reference count
1821 	 * (to zero) and return success.
1822 	 */
1823 	status = hermon_rsrc_alloc(state, HERMON_PDHDL, 1, sleepflag, &rsrc);
1824 	if (status != DDI_SUCCESS) {
1825 		return (IBT_INSUFF_RESOURCE);
1826 	}
1827 	pd = (hermon_pdhdl_t)rsrc->hr_addr;
1828 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pd))
1829 
1830 	pd->pd_refcnt = 0;
1831 	*pdhdl = pd;
1832 
1833 	return (DDI_SUCCESS);
1834 }
1835 
1836 
1837 /*
1838  * hermon_pd_free()
1839  *    Context: Can be called only from user or kernel context.
1840  */
1841 int
1842 hermon_pd_free(hermon_state_t *state, hermon_pdhdl_t *pdhdl)
1843 {
1844 	hermon_rsrc_t	*rsrc;
1845 	hermon_pdhdl_t	pd;
1846 
1847 	/*
1848 	 * Pull all the necessary information from the Hermon Protection Domain
1849 	 * handle.  This is necessary here because the resource for the
1850 	 * PD is going to be freed up as part of this operation.
1851 	 */
1852 	pd   = *pdhdl;
1853 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pd))
1854 	rsrc = pd->pd_rsrcp;
1855 
1856 	/*
1857 	 * Check the PD reference count.  If the reference count is non-zero,
1858 	 * then it means that this protection domain is still referenced by
1859 	 * some memory region, queue pair, address handle, or other IB object
1860 	 * If it is non-zero, then return an error.  Otherwise, free the
1861 	 * Hermon resource and return success.
1862 	 */
1863 	if (pd->pd_refcnt != 0) {
1864 		return (IBT_PD_IN_USE);
1865 	}
1866 
1867 	/* Free the Hermon Protection Domain handle */
1868 	hermon_rsrc_free(state, &rsrc);
1869 
1870 	/* Set the pdhdl pointer to NULL and return success */
1871 	*pdhdl = (hermon_pdhdl_t)NULL;
1872 
1873 	return (DDI_SUCCESS);
1874 }
1875 
1876 
1877 /*
1878  * hermon_pd_refcnt_inc()
1879  *    Context: Can be called from interrupt or base context.
1880  */
1881 void
1882 hermon_pd_refcnt_inc(hermon_pdhdl_t pd)
1883 {
1884 	/* Increment the protection domain's reference count */
1885 	atomic_inc_32(&pd->pd_refcnt);
1886 }
1887 
1888 
1889 /*
1890  * hermon_pd_refcnt_dec()
1891  *    Context: Can be called from interrupt or base context.
1892  */
1893 void
1894 hermon_pd_refcnt_dec(hermon_pdhdl_t pd)
1895 {
1896 	/* Decrement the protection domain's reference count */
1897 	atomic_dec_32(&pd->pd_refcnt);
1898 }
1899 
1900 
1901 /*
1902  * hermon_port_query()
1903  *    Context: Can be called only from user or kernel context.
1904  */
int
hermon_port_query(hermon_state_t *state, uint_t port, ibt_hca_portinfo_t *pi)
{
	sm_portinfo_t		portinfo;
	sm_guidinfo_t		guidinfo;
	sm_pkey_table_t		pkeytable;
	ib_gid_t		*sgid;
	uint_t			sgid_max, pkey_max, tbl_size;
	int			i, j, indx, status;
	ib_pkey_t		*pkeyp;
	ib_guid_t		*guidp;

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pi))
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))

	/* Validate that specified port number is legal */
	if (!hermon_portnum_is_valid(state, port)) {
		return (IBT_HCA_PORT_INVALID);
	}
	/*
	 * Per-port soft-state copies of the PKey and GUID tables.  Note:
	 * "port" is 1-based while the soft-state arrays are 0-based.
	 * Both copies are refreshed below as the firmware tables are
	 * read in.
	 */
	pkeyp = state->hs_pkey[port - 1];
	guidp = state->hs_guid[port - 1];

	/*
	 * We use the Hermon MAD_IFC command to post a GetPortInfo MAD
	 * to the firmware (for the specified port number).  This returns
	 * a full PortInfo MAD (in "portinfo") which we subsequently
	 * parse to fill in the "ibt_hca_portinfo_t" structure returned
	 * to the IBTF.
	 */
	status = hermon_getportinfo_cmd_post(state, port,
	    HERMON_SLEEPFLAG_FOR_CONTEXT(), &portinfo);
	if (status != HERMON_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Hermon: GetPortInfo (port %02d) command "
		    "failed: %08x\n", port, status);
		if (status == HERMON_CMD_INVALID_STATUS) {
			/* Firmware unresponsive: report loss of service */
			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
		}
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Parse the PortInfo MAD and fill in the IBTF structure
	 */
	pi->p_base_lid		= portinfo.LID;
	pi->p_qkey_violations	= portinfo.Q_KeyViolations;
	pi->p_pkey_violations	= portinfo.P_KeyViolations;
	pi->p_sm_sl		= portinfo.MasterSMSL;
	pi->p_sm_lid		= portinfo.MasterSMLID;
	pi->p_linkstate		= portinfo.PortState;
	pi->p_port_num		= portinfo.LocalPortNum;
	pi->p_phys_state	= portinfo.PortPhysicalState;
	pi->p_width_supported	= portinfo.LinkWidthSupported;
	pi->p_width_enabled	= portinfo.LinkWidthEnabled;
	pi->p_width_active	= portinfo.LinkWidthActive;
	pi->p_speed_supported	= portinfo.LinkSpeedSupported;
	pi->p_speed_enabled	= portinfo.LinkSpeedEnabled;
	pi->p_speed_active	= portinfo.LinkSpeedActive;
	pi->p_mtu		= portinfo.MTUCap;
	pi->p_lmc		= portinfo.LMC;
	pi->p_max_vl		= portinfo.VLCap;
	pi->p_subnet_timeout	= portinfo.SubnetTimeOut;
	pi->p_msg_sz		= ((uint32_t)1 << HERMON_QP_LOG_MAX_MSGSZ);
	/* GID/PKey table sizes come from the config profile (log2 values) */
	tbl_size = state->hs_cfg_profile->cp_log_max_gidtbl;
	pi->p_sgid_tbl_sz	= (1 << tbl_size);
	tbl_size = state->hs_cfg_profile->cp_log_max_pkeytbl;
	pi->p_pkey_tbl_sz	= (1 << tbl_size);
	/* Cache this port's subnet (GID) prefix in the soft state */
	state->hs_sn_prefix[port - 1] = portinfo.GidPrefix;

	/*
	 * Convert InfiniBand-defined port capability flags to the format
	 * specified by the IBTF
	 */
	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM)
		pi->p_capabilities |= IBT_PORT_CAP_SM;
	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM_DISABLED)
		pi->p_capabilities |= IBT_PORT_CAP_SM_DISABLED;
	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SNMP_SUPPD)
		pi->p_capabilities |= IBT_PORT_CAP_SNMP_TUNNEL;
	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_DM_SUPPD)
		pi->p_capabilities |= IBT_PORT_CAP_DM;
	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_VM_SUPPD)
		pi->p_capabilities |= IBT_PORT_CAP_VENDOR;
	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_CLNT_REREG_SUPPD)
		pi->p_capabilities |= IBT_PORT_CAP_CLNT_REREG;

	/*
	 * Fill in the SGID table.  Since the only access to the Hermon
	 * GID tables is through the firmware's MAD_IFC interface, we
	 * post as many GetGUIDInfo MADs as necessary to read in the entire
	 * contents of the SGID table (for the specified port).  Note:  The
	 * GetGUIDInfo command only gets eight GUIDs per operation.  These
	 * GUIDs are then appended to the GID prefix for the port (from the
	 * GetPortInfo above) to form the entire SGID table.
	 */
	for (i = 0; i < pi->p_sgid_tbl_sz; i += 8) {
		/* "i >> 3" is the 8-GUID block index for this iteration */
		status = hermon_getguidinfo_cmd_post(state, port, i >> 3,
		    HERMON_SLEEPFLAG_FOR_CONTEXT(), &guidinfo);
		if (status != HERMON_CMD_SUCCESS) {
			cmn_err(CE_CONT, "Hermon: GetGUIDInfo (port %02d) "
			    "command failed: %08x\n", port, status);
			if (status == HERMON_CMD_INVALID_STATUS) {
				hermon_fm_ereport(state, HCA_SYS_ERR,
				    HCA_ERR_SRV_LOST);
			}
			return (ibc_get_ci_failure(0));
		}

		/* Figure out how many of the entries are valid */
		sgid_max = min((pi->p_sgid_tbl_sz - i), 8);
		for (j = 0; j < sgid_max; j++) {
			indx = (i + j);
			sgid = &pi->p_sgid_tbl[indx];
			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sgid))
			sgid->gid_prefix = portinfo.GidPrefix;
			/* Update both the IBTF table and the cached copy */
			guidp[indx] = sgid->gid_guid =
			    guidinfo.GUIDBlocks[j];
		}
	}

	/*
	 * Fill in the PKey table.  Just as for the GID tables above, the
	 * only access to the Hermon PKey tables is through the firmware's
	 * MAD_IFC interface.  We post as many GetPKeyTable MADs as necessary
	 * to read in the entire contents of the PKey table (for the specified
	 * port).  Note:  The GetPKeyTable command only gets 32 PKeys per
	 * operation.
	 */
	for (i = 0; i < pi->p_pkey_tbl_sz; i += 32) {
		status = hermon_getpkeytable_cmd_post(state, port, i,
		    HERMON_SLEEPFLAG_FOR_CONTEXT(), &pkeytable);
		if (status != HERMON_CMD_SUCCESS) {
			cmn_err(CE_CONT, "Hermon: GetPKeyTable (port %02d) "
			    "command failed: %08x\n", port, status);
			if (status == HERMON_CMD_INVALID_STATUS) {
				hermon_fm_ereport(state, HCA_SYS_ERR,
				    HCA_ERR_SRV_LOST);
			}
			return (ibc_get_ci_failure(0));
		}

		/* Figure out how many of the entries are valid */
		pkey_max = min((pi->p_pkey_tbl_sz - i), 32);
		for (j = 0; j < pkey_max; j++) {
			indx = (i + j);
			/* Update both the IBTF table and the cached copy */
			pkeyp[indx] = pi->p_pkey_tbl[indx] =
			    pkeytable.P_KeyTableBlocks[j];
		}
	}

	return (DDI_SUCCESS);
}
2056 
2057 
2058 /*
2059  * hermon_port_modify()
2060  *    Context: Can be called only from user or kernel context.
2061  */
2062 /* ARGSUSED */
2063 int
2064 hermon_port_modify(hermon_state_t *state, uint8_t port,
2065     ibt_port_modify_flags_t flags, uint8_t init_type)
2066 {
2067 	sm_portinfo_t		portinfo;
2068 	uint32_t		capmask;
2069 	int			status;
2070 	hermon_hw_set_port_t	set_port;
2071 
2072 	/*
2073 	 * Return an error if either of the unsupported flags are set
2074 	 */
2075 	if ((flags & IBT_PORT_SHUTDOWN) ||
2076 	    (flags & IBT_PORT_SET_INIT_TYPE)) {
2077 		return (IBT_NOT_SUPPORTED);
2078 	}
2079 
2080 	bzero(&set_port, sizeof (set_port));
2081 
2082 	/*
2083 	 * Determine whether we are trying to reset the QKey counter
2084 	 */
2085 	if (flags & IBT_PORT_RESET_QKEY)
2086 		set_port.rqk = 1;
2087 
2088 	/* Validate that specified port number is legal */
2089 	if (!hermon_portnum_is_valid(state, port)) {
2090 		return (IBT_HCA_PORT_INVALID);
2091 	}
2092 
2093 	/*
2094 	 * Use the Hermon MAD_IFC command to post a GetPortInfo MAD to the
2095 	 * firmware (for the specified port number).  This returns a full
2096 	 * PortInfo MAD (in "portinfo") from which we pull the current
2097 	 * capability mask.  We then modify the capability mask as directed
2098 	 * by the "pmod_flags" field, and write the updated capability mask
2099 	 * using the Hermon SET_IB command (below).
2100 	 */
2101 	status = hermon_getportinfo_cmd_post(state, port,
2102 	    HERMON_SLEEPFLAG_FOR_CONTEXT(), &portinfo);
2103 	if (status != HERMON_CMD_SUCCESS) {
2104 		if (status == HERMON_CMD_INVALID_STATUS) {
2105 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
2106 		}
2107 		return (ibc_get_ci_failure(0));
2108 	}
2109 
2110 	/*
2111 	 * Convert InfiniBand-defined port capability flags to the format
2112 	 * specified by the IBTF.  Specifically, we modify the capability
2113 	 * mask based on the specified values.
2114 	 */
2115 	capmask = portinfo.CapabilityMask;
2116 
2117 	if (flags & IBT_PORT_RESET_SM)
2118 		capmask &= ~SM_CAP_MASK_IS_SM;
2119 	else if (flags & IBT_PORT_SET_SM)
2120 		capmask |= SM_CAP_MASK_IS_SM;
2121 
2122 	if (flags & IBT_PORT_RESET_SNMP)
2123 		capmask &= ~SM_CAP_MASK_IS_SNMP_SUPPD;
2124 	else if (flags & IBT_PORT_SET_SNMP)
2125 		capmask |= SM_CAP_MASK_IS_SNMP_SUPPD;
2126 
2127 	if (flags & IBT_PORT_RESET_DEVMGT)
2128 		capmask &= ~SM_CAP_MASK_IS_DM_SUPPD;
2129 	else if (flags & IBT_PORT_SET_DEVMGT)
2130 		capmask |= SM_CAP_MASK_IS_DM_SUPPD;
2131 
2132 	if (flags & IBT_PORT_RESET_VENDOR)
2133 		capmask &= ~SM_CAP_MASK_IS_VM_SUPPD;
2134 	else if (flags & IBT_PORT_SET_VENDOR)
2135 		capmask |= SM_CAP_MASK_IS_VM_SUPPD;
2136 
2137 	set_port.cap_mask = capmask;
2138 
2139 	/*
2140 	 * Use the Hermon SET_PORT command to update the capability mask and
2141 	 * (possibly) reset the QKey violation counter for the specified port.
2142 	 * Note: In general, this operation shouldn't fail.  If it does, then
2143 	 * it is an indication that something (probably in HW, but maybe in
2144 	 * SW) has gone seriously wrong.
2145 	 */
2146 	status = hermon_set_port_cmd_post(state, &set_port, port,
2147 	    HERMON_SLEEPFLAG_FOR_CONTEXT());
2148 	if (status != HERMON_CMD_SUCCESS) {
2149 		HERMON_WARNING(state, "failed to modify port capabilities");
2150 		cmn_err(CE_CONT, "Hermon: SET_IB (port %02d) command failed: "
2151 		    "%08x\n", port, status);
2152 		if (status == HERMON_CMD_INVALID_STATUS) {
2153 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
2154 		}
2155 		return (ibc_get_ci_failure(0));
2156 	}
2157 
2158 	return (DDI_SUCCESS);
2159 }
2160 
2161 
2162 /*
2163  * hermon_set_addr_path()
2164  *    Context: Can be called from interrupt or base context.
2165  *
2166  * Note: This routine is used for two purposes.  It is used to fill in the
2167  * Hermon UDAV fields, and it is used to fill in the address path information
2168  * for QPs.  Because the two Hermon structures are similar, common fields can
2169  * be filled in here.  Because they are different, however, we pass
2170  * an additional flag to indicate which type is being filled and do each one
2171  * uniquely
2172  */
2173 
2174 int hermon_srate_override = -1;	/* allows ease of testing */
2175 
2176 int
2177 hermon_set_addr_path(hermon_state_t *state, ibt_adds_vect_t *av,
2178     hermon_hw_addr_path_t *path, uint_t type)
2179 {
2180 	uint_t		gidtbl_sz;
2181 	hermon_hw_udav_t *udav;
2182 
2183 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*av))
2184 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*path))
2185 
2186 	udav = (hermon_hw_udav_t *)(void *)path;
2187 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*udav))
2188 	path->mlid	= av->av_src_path;
2189 	path->rlid	= av->av_dlid;
2190 
2191 	switch (av->av_srate) {
2192 	case IBT_SRATE_2:	/* 1xSDR-2.5Gb/s injection rate */
2193 		path->max_stat_rate = 7; break;
2194 	case IBT_SRATE_10:	/* 4xSDR-10.0Gb/s injection rate */
2195 		path->max_stat_rate = 8; break;
2196 	case IBT_SRATE_30:	/* 12xSDR-30Gb/s injection rate */
2197 		path->max_stat_rate = 9; break;
2198 	case IBT_SRATE_5:	/* 1xDDR-5Gb/s injection rate */
2199 		path->max_stat_rate = 10; break;
2200 	case IBT_SRATE_20:	/* 4xDDR-20Gb/s injection rate */
2201 		path->max_stat_rate = 11; break;
2202 	case IBT_SRATE_40:	/* 4xQDR-40Gb/s injection rate */
2203 		path->max_stat_rate = 12; break;
2204 	case IBT_SRATE_60:	/* 12xDDR-60Gb/s injection rate */
2205 		path->max_stat_rate = 13; break;
2206 	case IBT_SRATE_80:	/* 8xQDR-80Gb/s injection rate */
2207 		path->max_stat_rate = 14; break;
2208 	case IBT_SRATE_120:	/* 12xQDR-120Gb/s injection rate */
2209 		path->max_stat_rate = 15; break;
2210 	case IBT_SRATE_NOT_SPECIFIED:	/* Max */
2211 		path->max_stat_rate = 0; break;
2212 	default:
2213 		return (IBT_STATIC_RATE_INVALID);
2214 	}
2215 	if (hermon_srate_override != -1) /* for evaluating HCA firmware */
2216 		path->max_stat_rate = hermon_srate_override;
2217 
2218 	/* If "grh" flag is set, then check for valid SGID index too */
2219 	gidtbl_sz = (1 << state->hs_queryport.log_max_gid);
2220 	if ((av->av_send_grh) && (av->av_sgid_ix > gidtbl_sz)) {
2221 		return (IBT_SGID_INVALID);
2222 	}
2223 
2224 	/*
2225 	 * Fill in all "global" values regardless of the value in the GRH
2226 	 * flag.  Because "grh" is not set unless "av_send_grh" is set, the
2227 	 * hardware will ignore the other "global" values as necessary.  Note:
2228 	 * SW does this here to enable later query operations to return
2229 	 * exactly the same params that were passed when the addr path was
2230 	 * last written.
2231 	 */
2232 	path->grh = av->av_send_grh;
2233 	if (type == HERMON_ADDRPATH_QP) {
2234 		path->mgid_index = av->av_sgid_ix;
2235 	} else {
2236 		/*
2237 		 * For Hermon UDAV, the "mgid_index" field is the index into
2238 		 * a combined table (not a per-port table), but having sections
2239 		 * for each port. So some extra calculations are necessary.
2240 		 */
2241 
2242 		path->mgid_index = ((av->av_port_num - 1) * gidtbl_sz) +
2243 		    av->av_sgid_ix;
2244 
2245 		udav->portnum = av->av_port_num;
2246 	}
2247 
2248 	/*
2249 	 * According to Hermon PRM, the (31:0) part of rgid_l must be set to
2250 	 * "0x2" if the 'grh' or 'g' bit is cleared.  It also says that we
2251 	 * only need to do it for UDAV's.  So we enforce that here.
2252 	 *
2253 	 * NOTE: The entire 64 bits worth of GUID info is actually being
2254 	 * preserved (for UDAVs) by the callers of this function
2255 	 * (hermon_ah_alloc() and hermon_ah_modify()) and as long as the
2256 	 * 'grh' bit is not set, the upper 32 bits (63:32) of rgid_l are
2257 	 * "don't care".
2258 	 */
2259 	if ((path->grh) || (type == HERMON_ADDRPATH_QP)) {
2260 		path->flow_label = av->av_flow;
2261 		path->tclass	 = av->av_tclass;
2262 		path->hop_limit	 = av->av_hop;
2263 		bcopy(&(av->av_dgid.gid_prefix), &(path->rgid_h),
2264 		    sizeof (uint64_t));
2265 		bcopy(&(av->av_dgid.gid_guid), &(path->rgid_l),
2266 		    sizeof (uint64_t));
2267 	} else {
2268 		path->rgid_l	 = 0x2;
2269 		path->flow_label = 0;
2270 		path->tclass	 = 0;
2271 		path->hop_limit	 = 0;
2272 		path->rgid_h	 = 0;
2273 	}
2274 	/* extract the default service level */
2275 	udav->sl = (HERMON_DEF_SCHED_SELECTION & 0x3C) >> 2;
2276 
2277 	return (DDI_SUCCESS);
2278 }
2279 
2280 
2281 /*
2282  * hermon_get_addr_path()
2283  *    Context: Can be called from interrupt or base context.
2284  *
2285  * Note: Just like hermon_set_addr_path() above, this routine is used for two
2286  * purposes.  It is used to read in the Hermon UDAV fields, and it is used to
2287  * read in the address path information for QPs.  Because the two Hermon
2288  * structures are similar, common fields can be read in here.  But because
2289  * they are slightly different, we pass an additional flag to indicate which
2290  * type is being read.
2291  */
2292 void
2293 hermon_get_addr_path(hermon_state_t *state, hermon_hw_addr_path_t *path,
2294     ibt_adds_vect_t *av, uint_t type)
2295 {
2296 	uint_t		gidtbl_sz;
2297 
2298 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*path))
2299 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*av))
2300 
2301 	av->av_src_path	= path->mlid;
2302 	av->av_dlid	= path->rlid;
2303 
2304 	/* Set "av_ipd" value from max_stat_rate */
2305 	switch (path->max_stat_rate) {
2306 	case 7:				/* 1xSDR-2.5Gb/s injection rate */
2307 		av->av_srate = IBT_SRATE_2; break;
2308 	case 8:				/* 4xSDR-10.0Gb/s injection rate */
2309 		av->av_srate = IBT_SRATE_10; break;
2310 	case 9:				/* 12xSDR-30Gb/s injection rate */
2311 		av->av_srate = IBT_SRATE_30; break;
2312 	case 10:			/* 1xDDR-5Gb/s injection rate */
2313 		av->av_srate = IBT_SRATE_5; break;
2314 	case 11:			/* 4xDDR-20Gb/s injection rate */
2315 		av->av_srate = IBT_SRATE_20; break;
2316 	case 12:			/* xQDR-40Gb/s injection rate */
2317 		av->av_srate = IBT_SRATE_40; break;
2318 	case 13:			/* 12xDDR-60Gb/s injection rate */
2319 		av->av_srate = IBT_SRATE_60; break;
2320 	case 14:			/* 8xQDR-80Gb/s injection rate */
2321 		av->av_srate = IBT_SRATE_80; break;
2322 	case 15:			/* 12xQDR-120Gb/s injection rate */
2323 		av->av_srate = IBT_SRATE_120; break;
2324 	case 0:				/* max */
2325 		av->av_srate = IBT_SRATE_NOT_SPECIFIED; break;
2326 	default:			/* 1x injection rate */
2327 		av->av_srate = IBT_SRATE_1X;
2328 	}
2329 
2330 	/*
2331 	 * Extract all "global" values regardless of the value in the GRH
2332 	 * flag.  Because "av_send_grh" is set only if "grh" is set, software
2333 	 * knows to ignore the other "global" values as necessary.  Note: SW
2334 	 * does it this way to enable these query operations to return exactly
2335 	 * the same params that were passed when the addr path was last written.
2336 	 */
2337 	av->av_send_grh		= path->grh;
2338 	if (type == HERMON_ADDRPATH_QP) {
2339 		av->av_sgid_ix  = path->mgid_index;
2340 	} else {
2341 		/*
2342 		 * For Hermon UDAV, the "mgid_index" field is the index into
2343 		 * a combined table (not a per-port table).
2344 		 */
2345 		gidtbl_sz = (1 << state->hs_queryport.log_max_gid);
2346 		av->av_sgid_ix = path->mgid_index - ((av->av_port_num - 1) *
2347 		    gidtbl_sz);
2348 
2349 		av->av_port_num = ((hermon_hw_udav_t *)(void *)path)->portnum;
2350 	}
2351 	av->av_flow		= path->flow_label;
2352 	av->av_tclass		= path->tclass;
2353 	av->av_hop		= path->hop_limit;
2354 	/* this is for alignment issue w/ the addr path struct in Hermon */
2355 	bcopy(&(path->rgid_h), &(av->av_dgid.gid_prefix), sizeof (uint64_t));
2356 	bcopy(&(path->rgid_l), &(av->av_dgid.gid_guid), sizeof (uint64_t));
2357 }
2358 
2359 
2360 /*
2361  * hermon_portnum_is_valid()
2362  *    Context: Can be called from interrupt or base context.
2363  */
2364 int
2365 hermon_portnum_is_valid(hermon_state_t *state, uint_t portnum)
2366 {
2367 	uint_t	max_port;
2368 
2369 	max_port = state->hs_cfg_profile->cp_num_ports;
2370 	if ((portnum <= max_port) && (portnum != 0)) {
2371 		return (1);
2372 	} else {
2373 		return (0);
2374 	}
2375 }
2376 
2377 
2378 /*
2379  * hermon_pkeyindex_is_valid()
2380  *    Context: Can be called from interrupt or base context.
2381  */
2382 int
2383 hermon_pkeyindex_is_valid(hermon_state_t *state, uint_t pkeyindx)
2384 {
2385 	uint_t	max_pkeyindx;
2386 
2387 	max_pkeyindx = 1 << state->hs_cfg_profile->cp_log_max_pkeytbl;
2388 	if (pkeyindx < max_pkeyindx) {
2389 		return (1);
2390 	} else {
2391 		return (0);
2392 	}
2393 }
2394 
2395 
2396 /*
2397  * hermon_queue_alloc()
2398  *    Context: Can be called from interrupt or base context.
2399  */
int
hermon_queue_alloc(hermon_state_t *state, hermon_qalloc_info_t *qa_info,
    uint_t sleepflag)
{
	ddi_dma_attr_t		dma_attr;
	int			(*callback)(caddr_t);
	uint64_t		realsize, alloc_mask;
	int			flag, status;

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qa_info))

	/* Set the callback flag appropriately */
	callback = (sleepflag == HERMON_SLEEP) ? DDI_DMA_SLEEP :
	    DDI_DMA_DONTWAIT;

	/*
	 * Initialize many of the default DMA attributes.  Then set additional
	 * alignment restrictions as necessary for the queue memory.  Also
	 * respect the configured value for IOMMU bypass
	 */
	hermon_dma_attr_init(state, &dma_attr);
	dma_attr.dma_attr_align = qa_info->qa_bind_align;
#ifdef	__sparc
	if (state->hs_cfg_profile->cp_iommu_bypass == HERMON_BINDMEM_BYPASS) {
		dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
	}
#endif

	/* Allocate a DMA handle */
	status = ddi_dma_alloc_handle(state->hs_dip, &dma_attr, callback, NULL,
	    &qa_info->qa_dmahdl);
	if (status != DDI_SUCCESS) {
		return (DDI_FAILURE);
	}

	/*
	 * Determine the amount of memory to allocate, depending on the values
	 * in "qa_bind_align" and "qa_alloc_align".  The problem we are trying
	 * to solve here is that allocating a DMA handle with IOMMU bypass
	 * (DDI_DMA_FORCE_PHYSICAL) constrains us to only requesting alignments
	 * that are less restrictive than the page size.  Since we may need
	 * stricter alignments on the memory allocated by ddi_dma_mem_alloc()
	 * (e.g. in Hermon QP work queue memory allocation), we use the
	 * following method to calculate how much additional memory to request,
	 * and we enforce our own alignment on the allocated result.
	 */
	alloc_mask = qa_info->qa_alloc_align - 1;
	if (qa_info->qa_bind_align == qa_info->qa_alloc_align) {
		realsize = qa_info->qa_size;
	} else {
		/* Over-allocate so a stricter alignment can be imposed below */
		realsize = qa_info->qa_size + alloc_mask;
	}

	/*
	 * If we are to allocate the queue from system memory, then use
	 * ddi_dma_mem_alloc() to find the space.  Otherwise, this is a
	 * host memory allocation, use ddi_umem_alloc(). In either case,
	 * return a pointer to the memory range allocated (including any
	 * necessary alignment adjustments), the "real" memory pointer,
	 * the "real" size, and a ddi_acc_handle_t to use when reading
	 * from/writing to the memory.
	 */
	if (qa_info->qa_location == HERMON_QUEUE_LOCATION_NORMAL) {
		/* Allocate system memory for the queue */
		status = ddi_dma_mem_alloc(qa_info->qa_dmahdl, realsize,
		    &state->hs_reg_accattr, DDI_DMA_CONSISTENT, callback, NULL,
		    (caddr_t *)&qa_info->qa_buf_real,
		    (size_t *)&qa_info->qa_buf_realsz, &qa_info->qa_acchdl);
		if (status != DDI_SUCCESS) {
			/* Undo the handle allocation before failing */
			ddi_dma_free_handle(&qa_info->qa_dmahdl);
			return (DDI_FAILURE);
		}

		/*
		 * Save temporary copy of the real pointer.  (This may be
		 * modified in the last step below).
		 */
		qa_info->qa_buf_aligned = qa_info->qa_buf_real;

		bzero(qa_info->qa_buf_real, qa_info->qa_buf_realsz);

	} else { /* HERMON_QUEUE_LOCATION_USERLAND */

		/* Allocate userland mappable memory for the queue */
		flag = (sleepflag == HERMON_SLEEP) ? DDI_UMEM_SLEEP :
		    DDI_UMEM_NOSLEEP;
		qa_info->qa_buf_real = ddi_umem_alloc(realsize, flag,
		    &qa_info->qa_umemcookie);
		if (qa_info->qa_buf_real == NULL) {
			/* Undo the handle allocation before failing */
			ddi_dma_free_handle(&qa_info->qa_dmahdl);
			return (DDI_FAILURE);
		}

		/*
		 * Save temporary copy of the real pointer.  (This may be
		 * modified in the last step below).
		 */
		qa_info->qa_buf_aligned = qa_info->qa_buf_real;

	}

	/*
	 * The next to last step is to ensure that the final address
	 * ("qa_buf_aligned") has the appropriate "alloc" alignment
	 * restriction applied to it (if necessary).
	 */
	if (qa_info->qa_bind_align != qa_info->qa_alloc_align) {
		/* Round the buffer pointer up to the next alloc boundary */
		qa_info->qa_buf_aligned = (uint32_t *)(uintptr_t)(((uintptr_t)
		    qa_info->qa_buf_aligned + alloc_mask) & ~alloc_mask);
	}
	/*
	 * The last step is to figure out the offset of the start relative
	 * to the first page of the region - will be used in the eqc/cqc
	 * passed to the HW
	 */
	qa_info->qa_pgoffs = (uint_t)((uintptr_t)
	    qa_info->qa_buf_aligned & HERMON_PAGEOFFSET);

	return (DDI_SUCCESS);
}
2520 
2521 
2522 /*
2523  * hermon_queue_free()
2524  *    Context: Can be called from interrupt or base context.
2525  */
2526 void
2527 hermon_queue_free(hermon_qalloc_info_t *qa_info)
2528 {
2529 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qa_info))
2530 
2531 	/*
2532 	 * Depending on how (i.e. from where) we allocated the memory for
2533 	 * this queue, we choose the appropriate method for releasing the
2534 	 * resources.
2535 	 */
2536 	if (qa_info->qa_location == HERMON_QUEUE_LOCATION_NORMAL) {
2537 
2538 		ddi_dma_mem_free(&qa_info->qa_acchdl);
2539 
2540 	} else if (qa_info->qa_location == HERMON_QUEUE_LOCATION_USERLAND) {
2541 
2542 		ddi_umem_free(qa_info->qa_umemcookie);
2543 
2544 	}
2545 
2546 	/* Always free the dma handle */
2547 	ddi_dma_free_handle(&qa_info->qa_dmahdl);
2548 }
2549 
2550 /*
2551  * hermon_create_fmr_pool()
2552  * Create a pool of FMRs.
2553  *     Context: Can be called from kernel context only.
2554  */
2555 int
2556 hermon_create_fmr_pool(hermon_state_t *state, hermon_pdhdl_t pd,
2557     ibt_fmr_pool_attr_t *fmr_attr, hermon_fmrhdl_t *fmrpoolp)
2558 {
2559 	hermon_fmrhdl_t	fmrpool;
2560 	hermon_fmr_list_t *fmr, *fmr_next;
2561 	hermon_mrhdl_t   mr;
2562 	char		taskqname[48];
2563 	int		status;
2564 	int		sleep;
2565 	int		i;
2566 
2567 	sleep = (fmr_attr->fmr_flags & IBT_MR_SLEEP) ? HERMON_SLEEP :
2568 	    HERMON_NOSLEEP;
2569 	if ((sleep == HERMON_SLEEP) &&
2570 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
2571 		return (IBT_INVALID_PARAM);
2572 	}
2573 
2574 	fmrpool = (hermon_fmrhdl_t)kmem_zalloc(sizeof (*fmrpool), sleep);
2575 	if (fmrpool == NULL) {
2576 		status = IBT_INSUFF_RESOURCE;
2577 		goto fail;
2578 	}
2579 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmrpool))
2580 
2581 	mutex_init(&fmrpool->fmr_lock, NULL, MUTEX_DRIVER,
2582 	    DDI_INTR_PRI(state->hs_intrmsi_pri));
2583 
2584 	fmrpool->fmr_state	    = state;
2585 	fmrpool->fmr_flush_function = fmr_attr->fmr_func_hdlr;
2586 	fmrpool->fmr_flush_arg	    = fmr_attr->fmr_func_arg;
2587 	fmrpool->fmr_pool_size	    = 0;
2588 	fmrpool->fmr_cache	    = 0;
2589 	fmrpool->fmr_max_pages	    = fmr_attr->fmr_max_pages_per_fmr;
2590 	fmrpool->fmr_page_sz	    = fmr_attr->fmr_page_sz;
2591 	fmrpool->fmr_dirty_watermark = fmr_attr->fmr_dirty_watermark;
2592 	fmrpool->fmr_dirty_len	    = 0;
2593 	fmrpool->fmr_flags	    = fmr_attr->fmr_flags;
2594 
2595 	/* Create taskq to handle cleanup and flush processing */
2596 	(void) snprintf(taskqname, 50, "fmrpool/%d/%d @ 0x%" PRIx64,
2597 	    fmr_attr->fmr_pool_size, hermon_debug_fmrpool_cnt,
2598 	    (uint64_t)(uintptr_t)fmrpool);
2599 	fmrpool->fmr_taskq = ddi_taskq_create(state->hs_dip, taskqname,
2600 	    HERMON_TASKQ_NTHREADS, TASKQ_DEFAULTPRI, 0);
2601 	if (fmrpool->fmr_taskq == NULL) {
2602 		status = IBT_INSUFF_RESOURCE;
2603 		goto fail1;
2604 	}
2605 
2606 	fmrpool->fmr_free_list = NULL;
2607 	fmrpool->fmr_dirty_list = NULL;
2608 
2609 	if (fmr_attr->fmr_cache) {
2610 		hermon_fmr_cache_init(fmrpool);
2611 	}
2612 
2613 	for (i = 0; i < fmr_attr->fmr_pool_size; i++) {
2614 		status = hermon_mr_alloc_fmr(state, pd, fmrpool, &mr);
2615 		if (status != DDI_SUCCESS) {
2616 			goto fail2;
2617 		}
2618 
2619 		fmr = (hermon_fmr_list_t *)kmem_zalloc(
2620 		    sizeof (hermon_fmr_list_t), sleep);
2621 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr))
2622 
2623 		fmr->fmr = mr;
2624 		fmr->fmr_refcnt = 0;
2625 		fmr->fmr_remaps = 0;
2626 		fmr->fmr_pool = fmrpool;
2627 		fmr->fmr_in_cache = 0;
2628 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
2629 		mr->mr_fmr = fmr;
2630 
2631 		fmr->fmr_next = fmrpool->fmr_free_list;
2632 		fmrpool->fmr_free_list = fmr;
2633 		fmrpool->fmr_pool_size++;
2634 	}
2635 
2636 	/* Set to return pool */
2637 	*fmrpoolp = fmrpool;
2638 
2639 	return (IBT_SUCCESS);
2640 fail2:
2641 	hermon_fmr_cache_fini(fmrpool);
2642 	for (fmr = fmrpool->fmr_free_list; fmr != NULL; fmr = fmr_next) {
2643 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr))
2644 		fmr_next = fmr->fmr_next;
2645 		(void) hermon_mr_dealloc_fmr(state, &fmr->fmr);
2646 		kmem_free(fmr, sizeof (hermon_fmr_list_t));
2647 	}
2648 	ddi_taskq_destroy(fmrpool->fmr_taskq);
2649 fail1:
2650 	kmem_free(fmrpool, sizeof (*fmrpool));
2651 fail:
2652 	if (status == DDI_FAILURE) {
2653 		return (ibc_get_ci_failure(0));
2654 	} else {
2655 		return (status);
2656 	}
2657 }
2658 
2659 /*
2660  * hermon_destroy_fmr_pool()
2661  * Destroy an FMR pool and free all associated resources.
2662  *     Context: Can be called from kernel context only.
2663  */
2664 int
2665 hermon_destroy_fmr_pool(hermon_state_t *state, hermon_fmrhdl_t fmrpool)
2666 {
2667 	hermon_fmr_list_t	*fmr, *fmr_next;
2668 	int			status;
2669 
2670 	mutex_enter(&fmrpool->fmr_lock);
2671 	status = hermon_fmr_cleanup(state, fmrpool);
2672 	if (status != DDI_SUCCESS) {
2673 		mutex_exit(&fmrpool->fmr_lock);
2674 		return (status);
2675 	}
2676 
2677 	if (fmrpool->fmr_cache) {
2678 		hermon_fmr_cache_fini(fmrpool);
2679 	}
2680 
2681 	for (fmr = fmrpool->fmr_free_list; fmr != NULL; fmr = fmr_next) {
2682 		fmr_next = fmr->fmr_next;
2683 
2684 		(void) hermon_mr_dealloc_fmr(state, &fmr->fmr);
2685 		kmem_free(fmr, sizeof (hermon_fmr_list_t));
2686 	}
2687 	mutex_exit(&fmrpool->fmr_lock);
2688 
2689 	ddi_taskq_destroy(fmrpool->fmr_taskq);
2690 	mutex_destroy(&fmrpool->fmr_lock);
2691 
2692 	kmem_free(fmrpool, sizeof (*fmrpool));
2693 	return (DDI_SUCCESS);
2694 }
2695 
2696 /*
2697  * hermon_flush_fmr_pool()
2698  * Ensure that all unmapped FMRs are fully invalidated.
2699  *     Context: Can be called from kernel context only.
2700  */
2701 int
2702 hermon_flush_fmr_pool(hermon_state_t *state, hermon_fmrhdl_t fmrpool)
2703 {
2704 	int		status;
2705 
2706 	/*
2707 	 * Force the unmapping of all entries on the dirty list, regardless of
2708 	 * whether the watermark has been hit yet.
2709 	 */
2710 	/* grab the pool lock */
2711 	mutex_enter(&fmrpool->fmr_lock);
2712 	status = hermon_fmr_cleanup(state, fmrpool);
2713 	mutex_exit(&fmrpool->fmr_lock);
2714 	return (status);
2715 }
2716 
2717 /*
2718  * hermon_deregister_fmr()
2719  * Map memory into FMR
2720  *    Context: Can be called from interrupt or base context.
2721  */
2722 int
2723 hermon_register_physical_fmr(hermon_state_t *state, hermon_fmrhdl_t fmrpool,
2724     ibt_pmr_attr_t *mem_pattr, hermon_mrhdl_t *mr,
2725     ibt_pmr_desc_t *mem_desc_p)
2726 {
2727 	hermon_fmr_list_t	*fmr;
2728 	hermon_fmr_list_t	query;
2729 	avl_index_t		where;
2730 	int			status;
2731 
2732 	/* Check length */
2733 	mutex_enter(&fmrpool->fmr_lock);
2734 	if (mem_pattr->pmr_len < 1 || (mem_pattr->pmr_num_buf >
2735 	    fmrpool->fmr_max_pages)) {
2736 		mutex_exit(&fmrpool->fmr_lock);
2737 		return (IBT_MR_LEN_INVALID);
2738 	}
2739 
2740 	mutex_enter(&fmrpool->fmr_cachelock);
2741 	/* lookup in fmr cache */
2742 	/* if exists, grab it, and return it */
2743 	if (fmrpool->fmr_cache) {
2744 		query.fmr_desc.pmd_iova = mem_pattr->pmr_iova;
2745 		query.fmr_desc.pmd_phys_buf_list_sz = mem_pattr->pmr_len;
2746 		fmr = (hermon_fmr_list_t *)avl_find(&fmrpool->fmr_cache_avl,
2747 		    &query, &where);
2748 
2749 		/*
2750 		 * If valid FMR was found in cache, return that fmr info
2751 		 */
2752 		if (fmr != NULL) {
2753 			fmr->fmr_refcnt++;
2754 			/* Store pmr desc for use in cache */
2755 			(void) memcpy(mem_desc_p, &fmr->fmr_desc,
2756 			    sizeof (ibt_pmr_desc_t));
2757 			*mr = (hermon_mrhdl_t)fmr->fmr;
2758 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(fmr->fmr)))
2759 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(
2760 			    *(fmr->fmr->mr_mptrsrcp)))
2761 			if (hermon_rdma_debug & 0x4)
2762 				IBTF_DPRINTF_L2("fmr", "  reg cache: mr %p "
2763 				    "index %x", fmr->fmr,
2764 				    fmr->fmr->mr_mptrsrcp->hr_indx);
2765 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(
2766 			    *(fmr->fmr->mr_mptrsrcp)))
2767 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*(fmr->fmr)))
2768 			mutex_exit(&fmrpool->fmr_cachelock);
2769 			mutex_exit(&fmrpool->fmr_lock);
2770 			return (DDI_SUCCESS);
2771 		}
2772 	}
2773 
2774 	/* FMR does not exist in cache, proceed with registration */
2775 
2776 	/* grab next free entry */
2777 	fmr = fmrpool->fmr_free_list;
2778 	if (fmr == NULL) {
2779 		IBTF_DPRINTF_L2("fmr", "WARNING: no free fmr resource");
2780 		mutex_exit(&fmrpool->fmr_cachelock);
2781 		mutex_exit(&fmrpool->fmr_lock);
2782 		return (IBT_INSUFF_RESOURCE);
2783 	}
2784 
2785 	fmrpool->fmr_free_list = fmrpool->fmr_free_list->fmr_next;
2786 	fmr->fmr_next = NULL;
2787 
2788 	status = hermon_mr_register_physical_fmr(state, mem_pattr, fmr->fmr,
2789 	    mem_desc_p);
2790 	if (status != DDI_SUCCESS) {
2791 		mutex_exit(&fmrpool->fmr_cachelock);
2792 		mutex_exit(&fmrpool->fmr_lock);
2793 		return (status);
2794 	}
2795 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr->fmr))
2796 	if (hermon_rdma_debug & 0x4)
2797 		IBTF_DPRINTF_L2("fmr", "  reg: mr %p  key %x",
2798 		    fmr->fmr, fmr->fmr->mr_rkey);
2799 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*fmr->fmr))
2800 
2801 	fmr->fmr_refcnt = 1;
2802 	fmr->fmr_remaps++;
2803 
2804 	/* Store pmr desc for use in cache */
2805 	(void) memcpy(&fmr->fmr_desc, mem_desc_p, sizeof (ibt_pmr_desc_t));
2806 	*mr = (hermon_mrhdl_t)fmr->fmr;
2807 
2808 	/* Store in cache */
2809 	if (fmrpool->fmr_cache) {
2810 		if (!fmr->fmr_in_cache) {
2811 			avl_insert(&fmrpool->fmr_cache_avl, fmr, where);
2812 			fmr->fmr_in_cache = 1;
2813 		}
2814 	}
2815 
2816 	mutex_exit(&fmrpool->fmr_cachelock);
2817 	mutex_exit(&fmrpool->fmr_lock);
2818 	return (DDI_SUCCESS);
2819 }
2820 
2821 /*
2822  * hermon_deregister_fmr()
2823  * Unmap FMR
2824  *    Context: Can be called from kernel context only.
2825  */
2826 int
2827 hermon_deregister_fmr(hermon_state_t *state, hermon_mrhdl_t mr)
2828 {
2829 	hermon_fmr_list_t	*fmr;
2830 	hermon_fmrhdl_t		fmrpool;
2831 	int			status;
2832 
2833 	fmr = mr->mr_fmr;
2834 	fmrpool = fmr->fmr_pool;
2835 
2836 	/* Grab pool lock */
2837 	mutex_enter(&fmrpool->fmr_lock);
2838 	fmr->fmr_refcnt--;
2839 
2840 	if (fmr->fmr_refcnt == 0) {
2841 		/*
2842 		 * First, do some bit of invalidation, reducing our exposure to
2843 		 * having this region still registered in hardware.
2844 		 */
2845 		(void) hermon_mr_invalidate_fmr(state, mr);
2846 
2847 		/*
2848 		 * If we've exhausted our remaps then add the FMR to the dirty
2849 		 * list, not allowing it to be re-used until we have done a
2850 		 * flush.  Otherwise, simply add it back to the free list for
2851 		 * re-mapping.
2852 		 */
2853 		if (fmr->fmr_remaps <
2854 		    state->hs_cfg_profile->cp_fmr_max_remaps) {
2855 			/* add to free list */
2856 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(fmr->fmr)))
2857 			if (hermon_rdma_debug & 0x4)
2858 				IBTF_DPRINTF_L2("fmr", "dereg: mr %p  key %x",
2859 				    fmr->fmr, fmr->fmr->mr_rkey);
2860 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*(fmr->fmr)))
2861 			fmr->fmr_next = fmrpool->fmr_free_list;
2862 			fmrpool->fmr_free_list = fmr;
2863 		} else {
2864 			/* add to dirty list */
2865 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(fmr->fmr)))
2866 			if (hermon_rdma_debug & 0x4)
2867 				IBTF_DPRINTF_L2("fmr", "dirty: mr %p  key %x",
2868 				    fmr->fmr, fmr->fmr->mr_rkey);
2869 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*(fmr->fmr)))
2870 			fmr->fmr_next = fmrpool->fmr_dirty_list;
2871 			fmrpool->fmr_dirty_list = fmr;
2872 			fmrpool->fmr_dirty_len++;
2873 
2874 			status = ddi_taskq_dispatch(fmrpool->fmr_taskq,
2875 			    hermon_fmr_processing, fmrpool, DDI_NOSLEEP);
2876 			if (status == DDI_FAILURE) {
2877 				mutex_exit(&fmrpool->fmr_lock);
2878 				return (IBT_INSUFF_RESOURCE);
2879 			}
2880 		}
2881 	}
2882 	/* Release pool lock */
2883 	mutex_exit(&fmrpool->fmr_lock);
2884 
2885 	return (DDI_SUCCESS);
2886 }
2887 
2888 
2889 /*
2890  * hermon_fmr_processing()
2891  * If required, perform cleanup.
2892  *     Context: Called from taskq context only.
2893  */
2894 static void
2895 hermon_fmr_processing(void *fmr_args)
2896 {
2897 	hermon_fmrhdl_t		fmrpool;
2898 	int			status;
2899 
2900 	ASSERT(fmr_args != NULL);
2901 
2902 	fmrpool = (hermon_fmrhdl_t)fmr_args;
2903 
2904 	/* grab pool lock */
2905 	mutex_enter(&fmrpool->fmr_lock);
2906 	if (fmrpool->fmr_dirty_len >= fmrpool->fmr_dirty_watermark) {
2907 		status = hermon_fmr_cleanup(fmrpool->fmr_state, fmrpool);
2908 		if (status != DDI_SUCCESS) {
2909 			mutex_exit(&fmrpool->fmr_lock);
2910 			return;
2911 		}
2912 
2913 		if (fmrpool->fmr_flush_function != NULL) {
2914 			(void) fmrpool->fmr_flush_function(
2915 			    (ibc_fmr_pool_hdl_t)fmrpool,
2916 			    fmrpool->fmr_flush_arg);
2917 		}
2918 	}
2919 
2920 	/* let pool lock go */
2921 	mutex_exit(&fmrpool->fmr_lock);
2922 }
2923 
2924 /*
2925  * hermon_fmr_cleanup()
2926  * Perform cleaning processing, walking the list and performing the MTT sync
2927  * operation if required.
2928  *    Context: can be called from taskq or base context.
2929  */
2930 static int
2931 hermon_fmr_cleanup(hermon_state_t *state, hermon_fmrhdl_t fmrpool)
2932 {
2933 	hermon_fmr_list_t	*fmr;
2934 	hermon_fmr_list_t	*fmr_next;
2935 	int			sync_needed;
2936 	int			status;
2937 
2938 	ASSERT(MUTEX_HELD(&fmrpool->fmr_lock));
2939 
2940 	sync_needed = 0;
2941 	for (fmr = fmrpool->fmr_dirty_list; fmr; fmr = fmr_next) {
2942 		fmr_next = fmr->fmr_next;
2943 		fmr->fmr_remaps = 0;
2944 
2945 		(void) hermon_mr_deregister_fmr(state, fmr->fmr);
2946 
2947 		/*
2948 		 * Update lists.
2949 		 * - add fmr back to free list
2950 		 * - remove fmr from dirty list
2951 		 */
2952 		fmr->fmr_next = fmrpool->fmr_free_list;
2953 		fmrpool->fmr_free_list = fmr;
2954 
2955 
2956 		/*
2957 		 * Because we have updated the dirty list, and deregistered the
2958 		 * FMR entry, we do need to sync the TPT, so we set the
2959 		 * 'sync_needed' flag here so we sync once we finish dirty_list
2960 		 * processing.
2961 		 */
2962 		sync_needed = 1;
2963 	}
2964 
2965 	fmrpool->fmr_dirty_list = NULL;
2966 	fmrpool->fmr_dirty_len = 0;
2967 
2968 	if (sync_needed) {
2969 		status = hermon_sync_tpt_cmd_post(state,
2970 		    HERMON_CMD_NOSLEEP_SPIN);
2971 		if (status != HERMON_CMD_SUCCESS) {
2972 			return (status);
2973 		}
2974 	}
2975 
2976 	return (DDI_SUCCESS);
2977 }
2978 
2979 /*
2980  * hermon_fmr_avl_compare()
2981  *    Context: Can be called from user or kernel context.
2982  */
2983 static int
2984 hermon_fmr_avl_compare(const void *q, const void *e)
2985 {
2986 	hermon_fmr_list_t *entry, *query;
2987 
2988 	entry = (hermon_fmr_list_t *)e;
2989 	query = (hermon_fmr_list_t *)q;
2990 
2991 	if (query->fmr_desc.pmd_iova < entry->fmr_desc.pmd_iova) {
2992 		return (-1);
2993 	} else if (query->fmr_desc.pmd_iova > entry->fmr_desc.pmd_iova) {
2994 		return (+1);
2995 	} else {
2996 		return (0);
2997 	}
2998 }
2999 
3000 
3001 /*
3002  * hermon_fmr_cache_init()
3003  *    Context: Can be called from user or kernel context.
3004  */
3005 static void
3006 hermon_fmr_cache_init(hermon_fmrhdl_t fmr)
3007 {
3008 	/* Initialize the lock used for FMR cache AVL tree access */
3009 	mutex_init(&fmr->fmr_cachelock, NULL, MUTEX_DRIVER,
3010 	    DDI_INTR_PRI(fmr->fmr_state->hs_intrmsi_pri));
3011 
3012 	/* Initialize the AVL tree for the FMR cache */
3013 	avl_create(&fmr->fmr_cache_avl, hermon_fmr_avl_compare,
3014 	    sizeof (hermon_fmr_list_t),
3015 	    offsetof(hermon_fmr_list_t, fmr_avlnode));
3016 
3017 	fmr->fmr_cache = 1;
3018 }
3019 
3020 
3021 /*
3022  * hermon_fmr_cache_fini()
3023  *    Context: Can be called from user or kernel context.
3024  */
3025 static void
3026 hermon_fmr_cache_fini(hermon_fmrhdl_t fmr)
3027 {
3028 	void			*cookie;
3029 
3030 	/*
3031 	 * Empty all entries (if necessary) and destroy the AVL tree.
3032 	 * The FMRs themselves are freed as part of destroy_pool()
3033 	 */
3034 	cookie = NULL;
3035 	while (((void *)(hermon_fmr_list_t *)avl_destroy_nodes(
3036 	    &fmr->fmr_cache_avl, &cookie)) != NULL) {
3037 		/* loop through */
3038 	}
3039 	avl_destroy(&fmr->fmr_cache_avl);
3040 
3041 	/* Destroy the lock used for FMR cache */
3042 	mutex_destroy(&fmr->fmr_cachelock);
3043 }
3044