xref: /illumos-gate/usr/src/uts/common/io/ib/adapters/tavor/tavor_misc.c (revision d9c882fa1001c77987b156290d6733010c824ec4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * tavor_misc.c
29  *    Tavor Miscellaneous routines - Address Handle, Multicast, Protection
30  *    Domain, and port-related operations
31  *
32  *    Implements all the routines necessary for allocating, freeing, querying
33  *    and modifying Address Handles and Protection Domains.  Also implements
34  *    all the routines necessary for adding and removing Queue Pairs to/from
35  *    Multicast Groups.  Lastly, it implements the routines necessary for
36  *    port-related query and modify operations.
37  */
38 
39 #include <sys/types.h>
40 #include <sys/conf.h>
41 #include <sys/ddi.h>
42 #include <sys/sunddi.h>
43 #include <sys/modctl.h>
44 #include <sys/bitmap.h>
45 #include <sys/sysmacros.h>
46 
47 #include <sys/ib/adapters/tavor/tavor.h>
48 
49 /* used for helping uniquify fmr pool taskq name */
50 static uint_t tavor_debug_fmrpool_cnt = 0x00000000;
51 
52 static void tavor_udav_sync(tavor_ahhdl_t ah, tavor_hw_udav_t *udav,
53     uint_t flag);
54 static int tavor_mcg_qplist_add(tavor_state_t *state, tavor_mcghdl_t mcg,
55     tavor_hw_mcg_qp_list_t *mcg_qplist, tavor_qphdl_t qp, uint_t *qp_found);
56 static int tavor_mcg_qplist_remove(tavor_mcghdl_t mcg,
57     tavor_hw_mcg_qp_list_t *mcg_qplist, tavor_qphdl_t qp);
58 static void tavor_qp_mcg_refcnt_inc(tavor_qphdl_t qp);
59 static void tavor_qp_mcg_refcnt_dec(tavor_qphdl_t qp);
60 static uint_t tavor_mcg_walk_mgid_hash(tavor_state_t *state,
61     uint64_t start_indx, ib_gid_t mgid, uint_t *prev_indx);
62 static void tavor_mcg_setup_new_hdr(tavor_mcghdl_t mcg,
63     tavor_hw_mcg_t *mcg_hdr, ib_gid_t mgid, tavor_rsrc_t *mcg_rsrc);
64 static int tavor_mcg_hash_list_remove(tavor_state_t *state, uint_t curr_indx,
65     uint_t prev_indx, tavor_hw_mcg_t *mcg_entry);
66 static int tavor_mcg_entry_invalidate(tavor_state_t *state,
67     tavor_hw_mcg_t *mcg_entry, uint_t indx);
68 static int tavor_mgid_is_valid(ib_gid_t gid);
69 static int tavor_mlid_is_valid(ib_lid_t lid);
70 static void tavor_fmr_processing(void *fmr_args);
71 static int tavor_fmr_cleanup(tavor_state_t *state, tavor_fmrhdl_t pool);
72 static void tavor_fmr_cache_init(tavor_fmrhdl_t fmr);
73 static void tavor_fmr_cache_fini(tavor_fmrhdl_t fmr);
74 static int tavor_fmr_avl_compare(const void *q, const void *e);
75 
76 
/*
 * tavor_ah_alloc()
 *    Context: Can be called only from user or kernel context.
 *
 *    Allocate an IB Address Handle on behalf of protection domain "pd",
 *    built from the address vector "attr_p".  This carves out a hardware
 *    UDAV entry and a software tracking struct, fills in the UDAV (in
 *    Tavor DDR memory), registers the UDAV memory to obtain the LKey
 *    needed by later UD work requests, and returns the new handle through
 *    "ahhdl".  Returns DDI_SUCCESS, or an IBT error status on failure
 *    (with all partially-allocated resources released).
 */
int
tavor_ah_alloc(tavor_state_t *state, tavor_pdhdl_t pd,
    ibt_adds_vect_t *attr_p, tavor_ahhdl_t *ahhdl, uint_t sleepflag)
{
	tavor_rsrc_t		*udav, *rsrc;
	tavor_hw_udav_t		udav_entry;
	tavor_ahhdl_t		ah;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	op;
	tavor_mrhdl_t		mr;
	uint64_t		data;
	uint32_t		size;
	int			status, i, flag;
	char			*errormsg;	/* set by TAVOR_TNF_FAIL */

	TAVOR_TNF_ENTER(tavor_ah_alloc);

	/*
	 * Someday maybe the "ibt_adds_vect_t *attr_p" will be NULL to
	 * indicate that we wish to allocate an "invalid" (i.e. empty)
	 * address handle XXX
	 */

	/* Validate that specified port number is legal */
	if (!tavor_portnum_is_valid(state, attr_p->av_port_num)) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_PORT_INVALID, "invalid port num");
		goto ahalloc_fail;
	}

	/*
	 * Allocate a UDAV entry.  This will be filled in with all the
	 * necessary parameters to define the Address Handle.  Unlike the
	 * other hardware resources no ownership transfer takes place as
	 * these UDAV entries are always owned by hardware.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_UDAV, 1, sleepflag, &udav);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed UDAV");
		goto ahalloc_fail;
	}

	/*
	 * Allocate the software structure for tracking the address handle
	 * (i.e. the Tavor Address Handle struct).  If we fail here, we must
	 * undo the previous resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_AHHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed AH handler");
		goto ahalloc_fail1;
	}
	ah = (tavor_ahhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Fill in the UDAV entry.  Note: We are only filling in a temporary
	 * copy here, which we will later copy into the actual entry in
	 * Tavor DDR memory.  This starts by zeroing out the temporary copy
	 * and then calling tavor_set_addr_path() to fill in the common
	 * portions that can be pulled from the "ibt_adds_vect_t" passed in
	 */
	bzero(&udav_entry, sizeof (tavor_hw_udav_t));
	status = tavor_set_addr_path(state, attr_p,
	    (tavor_hw_addr_path_t *)&udav_entry, TAVOR_ADDRPATH_UDAV, NULL);
	if (status != DDI_SUCCESS) {
		/*
		 * Undo both allocations and the PD reference by hand
		 * (equivalent to the ahalloc_fail2/fail1 unwind below).
		 */
		tavor_pd_refcnt_dec(pd);
		tavor_rsrc_free(state, &rsrc);
		tavor_rsrc_free(state, &udav);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(status, "failed in tavor_set_addr_path");
		goto ahalloc_fail;
	}
	udav_entry.pd	  = pd->pd_pdnum;
	udav_entry.msg_sz = state->ts_cfg_profile->cp_max_mtu - 1;

	/*
	 * Register the memory for the UDAV.  The memory for the UDAV must
	 * be registered in the Tavor TPT tables.  This gives us the LKey
	 * that we will need when we later post a UD work request that
	 * uses this address handle.
	 * We might be able to pre-register all the memory for the UDAV XXX
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)udav->tr_addr;
	mr_attr.mr_len	 = udav->tr_len;
	mr_attr.mr_as	 = NULL;
	mr_attr.mr_flags = flag;
	op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
	op.mro_bind_dmahdl = NULL;
	op.mro_bind_override_addr = 0;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
		goto ahalloc_fail2;
	}

	/*
	 * Fill in the UDAV entry.  Here we copy all the information from
	 * the temporary UDAV into the DDR memory for the real UDAV entry.
	 * Note that we copy everything but the first 64-bit word.  This
	 * is where the PD number for the address handle resides.
	 * By filling everything except the PD and then writing the PD in
	 * a separate step below, we can ensure that the UDAV is not
	 * accessed while there are partially written values in it (something
	 * which really should not happen anyway).  This is guaranteed
	 * because we take measures to ensure that the PD number is zero for
	 * all unused UDAV (and because PD#0 is reserved for Tavor).
	 */
	size = sizeof (tavor_hw_udav_t) >> 3;	/* entry size in 64-bit words */
	for (i = 1; i < size; i++) {
		data = ((uint64_t *)&udav_entry)[i];
		ddi_put64(udav->tr_acchdl, ((uint64_t *)udav->tr_addr + i),
		    data);
	}
	/* Word 0 (containing the PD number) is written last -- see above */
	data = ((uint64_t *)&udav_entry)[0];
	ddi_put64(udav->tr_acchdl, (uint64_t *)udav->tr_addr, data);

	/*
	 * Fill in the rest of the Tavor Address Handle struct.  Having
	 * successfully copied the UDAV into the hardware, we update the
	 * following fields for use in further operations on the AH.
	 *
	 * NOTE: We are saving away a copy of the "av_dgid.gid_guid" field
	 * here because we may need to return it later to the IBTF (as a
	 * result of a subsequent query operation).  Unlike the other UDAV
	 * parameters, the value of "av_dgid.gid_guid" is not always preserved
	 * by being written to hardware.  The reason for this is described in
	 * tavor_set_addr_path().
	 */
	ah->ah_udavrsrcp = udav;
	ah->ah_rsrcp	 = rsrc;
	ah->ah_pdhdl	 = pd;
	ah->ah_mrhdl	 = mr;
	ah->ah_save_guid = attr_p->av_dgid.gid_guid;
	ah->ah_save_srate = attr_p->av_srate;
	*ahhdl = ah;

	/* Determine if later ddi_dma_sync will be necessary */
	ah->ah_sync = TAVOR_UDAV_IS_SYNC_REQ(state);

	/* Sync the UDAV for use by the hardware */
	tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);

	TAVOR_TNF_EXIT(tavor_ah_alloc);
	return (DDI_SUCCESS);

/*
 * Cleanup labels: each label undoes the allocations made before the
 * corresponding failure point, in reverse order ("status" and "errormsg"
 * were set by TAVOR_TNF_FAIL at the goto site).
 */
ahalloc_fail2:
	tavor_pd_refcnt_dec(pd);
	tavor_rsrc_free(state, &rsrc);
ahalloc_fail1:
	tavor_rsrc_free(state, &udav);
ahalloc_fail:
	TNF_PROBE_1(tavor_ah_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_ah_alloc);
	return (status);
}
245 
246 
247 /*
248  * tavor_ah_free()
249  *    Context: Can be called only from user or kernel context.
250  */
251 /* ARGSUSED */
252 int
253 tavor_ah_free(tavor_state_t *state, tavor_ahhdl_t *ahhdl, uint_t sleepflag)
254 {
255 	tavor_rsrc_t		*udav, *rsrc;
256 	tavor_pdhdl_t		pd;
257 	tavor_mrhdl_t		mr;
258 	tavor_ahhdl_t		ah;
259 	int			status;
260 
261 	TAVOR_TNF_ENTER(tavor_ah_free);
262 
263 	/*
264 	 * Pull all the necessary information from the Tavor Address Handle
265 	 * struct.  This is necessary here because the resource for the
266 	 * AH is going to be freed up as part of this operation.
267 	 */
268 	ah    = *ahhdl;
269 	mutex_enter(&ah->ah_lock);
270 	udav  = ah->ah_udavrsrcp;
271 	rsrc  = ah->ah_rsrcp;
272 	pd    = ah->ah_pdhdl;
273 	mr    = ah->ah_mrhdl;
274 	mutex_exit(&ah->ah_lock);
275 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))
276 
277 	/*
278 	 * Deregister the memory for the UDAV.  If this fails for any reason,
279 	 * then it is an indication that something (either in HW or SW) has
280 	 * gone seriously wrong.  So we print a warning message and return
281 	 * failure.
282 	 */
283 	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
284 	    sleepflag);
285 	if (status != DDI_SUCCESS) {
286 		TNF_PROBE_0(tavor_ah_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
287 		TAVOR_TNF_EXIT(tavor_ah_free);
288 		return (ibc_get_ci_failure(0));
289 	}
290 
291 	/*
292 	 * Write zero to the first 64-bit word in the UDAV entry.  As
293 	 * described above (in tavor_ah_alloc), the PD number is stored in
294 	 * the first 64-bits of each UDAV and setting this to zero is
295 	 * guaranteed to invalidate the entry.
296 	 */
297 	ddi_put64(udav->tr_acchdl, (uint64_t *)udav->tr_addr, 0);
298 
299 	/* Sync the UDAV for use by the hardware */
300 	tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
301 
302 	/* Decrement the reference count on the protection domain (PD) */
303 	tavor_pd_refcnt_dec(pd);
304 
305 	/* Free the Tavor Address Handle structure */
306 	tavor_rsrc_free(state, &rsrc);
307 
308 	/* Free up the UDAV entry resource */
309 	tavor_rsrc_free(state, &udav);
310 
311 	/* Set the ahhdl pointer to NULL and return success */
312 	*ahhdl = NULL;
313 
314 	TAVOR_TNF_EXIT(tavor_ah_free);
315 	return (DDI_SUCCESS);
316 }
317 
318 
319 /*
320  * tavor_ah_query()
321  *    Context: Can be called from interrupt or base context.
322  */
323 /* ARGSUSED */
324 int
325 tavor_ah_query(tavor_state_t *state, tavor_ahhdl_t ah, tavor_pdhdl_t *pd,
326     ibt_adds_vect_t *attr_p)
327 {
328 	tavor_hw_udav_t		udav_entry;
329 	tavor_rsrc_t		*udav;
330 	uint64_t		data;
331 	uint32_t		size;
332 	int			i;
333 
334 	TAVOR_TNF_ENTER(tavor_ah_query);
335 
336 	mutex_enter(&ah->ah_lock);
337 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr_p))
338 
339 	/*
340 	 * Pull all the necessary information from the Tavor Address Handle
341 	 * structure
342 	 */
343 	udav	= ah->ah_udavrsrcp;
344 	*pd	= ah->ah_pdhdl;
345 
346 	/*
347 	 * Copy the UDAV entry into the temporary copy.  Here we copy all
348 	 * the information from the UDAV entry in DDR memory into the
349 	 * temporary UDAV.  Note:  We don't need to sync the UDAV for
350 	 * reading by software because Tavor HW never modifies the entry.
351 	 */
352 	size = sizeof (tavor_hw_udav_t) >> 3;
353 	for (i = 0; i < size; i++) {
354 		data = ddi_get64(udav->tr_acchdl,
355 		    ((uint64_t *)udav->tr_addr + i));
356 		((uint64_t *)&udav_entry)[i] = data;
357 	}
358 
359 	/*
360 	 * Fill in "ibt_adds_vect_t".  We call tavor_get_addr_path() to fill
361 	 * the common portions that can be pulled from the UDAV we pass in.
362 	 *
363 	 * NOTE: We will also fill the "av_dgid.gid_guid" field from the
364 	 * "ah_save_guid" field we have previously saved away.  The reason
365 	 * for this is described in tavor_ah_alloc() and tavor_ah_modify().
366 	 */
367 	tavor_get_addr_path(state, (tavor_hw_addr_path_t *)&udav_entry,
368 	    attr_p, TAVOR_ADDRPATH_UDAV, NULL);
369 
370 	attr_p->av_dgid.gid_guid = ah->ah_save_guid;
371 	attr_p->av_srate = ah->ah_save_srate;
372 
373 	mutex_exit(&ah->ah_lock);
374 	TAVOR_TNF_EXIT(tavor_ah_query);
375 	return (DDI_SUCCESS);
376 }
377 
378 
/*
 * tavor_ah_modify()
 *    Context: Can be called from interrupt or base context.
 *
 *    Update Address Handle "ah" in place from the new address vector
 *    "attr_p".  The UDAV entry is first invalidated (PD number cleared),
 *    then rewritten word-by-word with the new address-path contents, and
 *    finally revalidated by restoring the PD number (merged with the new
 *    port number).  Returns DDI_SUCCESS, IBT_HCA_PORT_INVALID for a bad
 *    port number, or the status from tavor_set_addr_path().
 */
/* ARGSUSED */
int
tavor_ah_modify(tavor_state_t *state, tavor_ahhdl_t ah,
    ibt_adds_vect_t *attr_p)
{
	tavor_hw_udav_t		udav_entry;
	tavor_rsrc_t		*udav;
	uint64_t		data_new, data_old;
	uint32_t		udav_pd, size, portnum_new;
	int			i, status;

	TAVOR_TNF_ENTER(tavor_ah_modify);

	/* Validate that specified port number is legal */
	if (!tavor_portnum_is_valid(state, attr_p->av_port_num)) {
		TNF_PROBE_1(tavor_ah_modify_inv_portnum,
		    TAVOR_TNF_ERROR, "", tnf_uint, port, attr_p->av_port_num);
		TAVOR_TNF_EXIT(tavor_ah_modify);
		return (IBT_HCA_PORT_INVALID);
	}

	mutex_enter(&ah->ah_lock);

	/*
	 * Pull all the necessary information from the Tavor Address Handle
	 * structure
	 */
	udav = ah->ah_udavrsrcp;

	/*
	 * Fill in the UDAV entry.  Note: we are only filling in a temporary
	 * copy here, which we will later copy into the actual entry in
	 * Tavor DDR memory.  This starts by zeroing out the temporary copy
	 * and then calling tavor_set_addr_path() to fill in the common
	 * portions that can be pulled from the "ibt_adds_vect_t" passed in
	 *
	 * NOTE: We also need to save away a copy of the "av_dgid.gid_guid"
	 * field here (just as we did during tavor_ah_alloc()) because we
	 * may need to return it later to the IBTF (as a result of a
	 * subsequent query operation).  As explained in tavor_ah_alloc(),
	 * unlike the other UDAV parameters, the value of "av_dgid.gid_guid"
	 * is not always preserved by being written to hardware.  The reason
	 * for this is described in tavor_set_addr_path().
	 */
	bzero(&udav_entry, sizeof (tavor_hw_udav_t));
	status = tavor_set_addr_path(state, attr_p,
	    (tavor_hw_addr_path_t *)&udav_entry, TAVOR_ADDRPATH_UDAV, NULL);
	if (status != DDI_SUCCESS) {
		mutex_exit(&ah->ah_lock);
		TNF_PROBE_0(tavor_ah_modify_setaddrpath_fail,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_ah_modify);
		return (status);
	}
	ah->ah_save_guid = attr_p->av_dgid.gid_guid;
	ah->ah_save_srate = attr_p->av_srate;

	/*
	 * Save away the current PD number for this UDAV.  Then temporarily
	 * invalidate the entry (by setting the PD to zero).  Note:  Since
	 * the first 32 bits of the UDAV actually contain the current port
	 * number _and_ current PD number, we need to mask off some bits.
	 * (Bits 31:24 hold the port number, 23:0 the PD number.)
	 */
	udav_pd = ddi_get32(udav->tr_acchdl, (uint32_t *)udav->tr_addr);
	udav_pd = udav_pd & 0xFFFFFF;
	ddi_put32(udav->tr_acchdl, (uint32_t *)udav->tr_addr, 0);

	/* Sync the UDAV (now invalidated) for use by the hardware */
	tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);

	/*
	 * Copy UDAV structure to the entry
	 *    Note:  We copy in 64-bit chunks.  For the first two of these
	 *    chunks it is necessary to read the current contents of the
	 *    UDAV, mask off the modifiable portions (maintaining any
	 *    of the "reserved" portions), and then mask on the new data.
	 */
	size = sizeof (tavor_hw_udav_t) >> 3;	/* entry size in 64-bit words */
	for (i = 0; i < size; i++) {
		data_new = ((uint64_t *)&udav_entry)[i];
		data_old = ddi_get64(udav->tr_acchdl,
		    ((uint64_t *)udav->tr_addr + i));

		/*
		 * Apply mask to change only the relevant values.  Note: We
		 * extract the new portnum from the address handle here
		 * because the "PD" and "portnum" fields are in the same
		 * 32-bit word in the UDAV.  We will use the (new) port
		 * number extracted here when we write the valid PD number
		 * in the last step below.
		 */
		if (i == 0) {
			data_old = data_old & TAVOR_UDAV_MODIFY_MASK0;
			/* portnum occupies the top byte of word 0 */
			portnum_new = data_new >> 56;
		} else if (i == 1) {
			data_old = data_old & TAVOR_UDAV_MODIFY_MASK1;
		} else {
			data_old = 0;
		}

		/* Write the updated values to the UDAV (in DDR) */
		data_new = data_old | data_new;
		ddi_put64(udav->tr_acchdl, ((uint64_t *)udav->tr_addr + i),
		    data_new);
	}

	/*
	 * Sync the body of the UDAV for use by the hardware.  After we
	 * have updated the PD number (to make the UDAV valid), we sync
	 * again to push the entire entry out for hardware access.
	 */
	tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);

	/*
	 * Put the valid PD number back into UDAV entry.  Note: Because port
	 * number and PD number are in the same word, we must mask the
	 * new port number with the old PD number before writing it back
	 * to the UDAV entry
	 */
	udav_pd = ((portnum_new << 24) | udav_pd);
	ddi_put32(udav->tr_acchdl, (uint32_t *)udav->tr_addr, udav_pd);

	/* Sync the rest of the UDAV for use by the hardware */
	tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);

	mutex_exit(&ah->ah_lock);
	TAVOR_TNF_EXIT(tavor_ah_modify);
	return (DDI_SUCCESS);
}
512 
513 
514 /*
515  * tavor_udav_sync()
516  *    Context: Can be called from interrupt or base context.
517  */
518 /* ARGSUSED */
519 static void
520 tavor_udav_sync(tavor_ahhdl_t ah, tavor_hw_udav_t *udav, uint_t flag)
521 {
522 	ddi_dma_handle_t	dmahdl;
523 	off_t			offset;
524 	int			status;
525 
526 	TAVOR_TNF_ENTER(tavor_udav_sync);
527 
528 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))
529 
530 	/* Determine if AH needs to be synced or not */
531 	if (ah->ah_sync == 0) {
532 		TAVOR_TNF_EXIT(tavor_udav_sync);
533 		return;
534 	}
535 
536 	/* Get the DMA handle from AH handle */
537 	dmahdl = ah->ah_mrhdl->mr_bindinfo.bi_dmahdl;
538 
539 	/* Calculate offset into address handle */
540 	offset = (off_t)0;
541 	status = ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_udav_t), flag);
542 	if (status != DDI_SUCCESS) {
543 		TNF_PROBE_0(tavor_udav_sync_getnextentry_fail,
544 		    TAVOR_TNF_ERROR, "");
545 		TAVOR_TNF_EXIT(tavor_udav_sync);
546 		return;
547 	}
548 
549 	TAVOR_TNF_EXIT(tavor_udav_sync);
550 }
551 
552 
553 /*
554  * tavor_mcg_attach()
555  *    Context: Can be called only from user or kernel context.
556  */
557 int
558 tavor_mcg_attach(tavor_state_t *state, tavor_qphdl_t qp, ib_gid_t gid,
559     ib_lid_t lid)
560 {
561 	tavor_rsrc_t		*rsrc;
562 	tavor_hw_mcg_t		*mcg_entry;
563 	tavor_hw_mcg_qp_list_t	*mcg_entry_qplist;
564 	tavor_mcghdl_t		mcg, newmcg;
565 	uint64_t		mgid_hash;
566 	uint32_t		end_indx;
567 	int			status;
568 	uint_t			qp_found;
569 	char			*errormsg;
570 
571 	TAVOR_TNF_ENTER(tavor_mcg_attach);
572 
573 	/*
574 	 * It is only allowed to attach MCG to UD queue pairs.  Verify
575 	 * that the intended QP is of the appropriate transport type
576 	 */
577 	if (qp->qp_serv_type != TAVOR_QP_UD) {
578 		/* Set "status" and "errormsg" and goto failure */
579 		TAVOR_TNF_FAIL(IBT_QP_SRV_TYPE_INVALID, "invalid service type");
580 		goto mcgattach_fail;
581 	}
582 
583 	/*
584 	 * Check for invalid Multicast DLID.  Specifically, all Multicast
585 	 * LIDs should be within a well defined range.  If the specified LID
586 	 * is outside of that range, then return an error.
587 	 */
588 	if (tavor_mlid_is_valid(lid) == 0) {
589 		/* Set "status" and "errormsg" and goto failure */
590 		TAVOR_TNF_FAIL(IBT_MC_MLID_INVALID, "invalid MLID");
591 		goto mcgattach_fail;
592 	}
593 	/*
594 	 * Check for invalid Multicast GID.  All Multicast GIDs should have
595 	 * a well-defined pattern of bits and flags that are allowable.  If
596 	 * the specified GID does not meet the criteria, then return an error.
597 	 */
598 	if (tavor_mgid_is_valid(gid) == 0) {
599 		/* Set "status" and "errormsg" and goto failure */
600 		TAVOR_TNF_FAIL(IBT_MC_MGID_INVALID, "invalid MGID");
601 		goto mcgattach_fail;
602 	}
603 
604 	/*
605 	 * Compute the MGID hash value.  Since the MCG table is arranged as
606 	 * a number of separate hash chains, this operation converts the
607 	 * specified MGID into the starting index of an entry in the hash
608 	 * table (i.e. the index for the start of the appropriate hash chain).
609 	 * Subsequent operations below will walk the chain searching for the
610 	 * right place to add this new QP.
611 	 */
612 	status = tavor_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid,
613 	    &mgid_hash, TAVOR_SLEEPFLAG_FOR_CONTEXT());
614 	if (status != TAVOR_CMD_SUCCESS) {
615 		cmn_err(CE_CONT, "Tavor: MGID_HASH command failed: %08x\n",
616 		    status);
617 		TNF_PROBE_1(tavor_mcg_attach_mgid_hash_cmd_fail,
618 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
619 		TAVOR_TNF_EXIT(tavor_mcg_attach);
620 		return (ibc_get_ci_failure(0));
621 	}
622 
623 	/*
624 	 * Grab the multicast group mutex.  Then grab the pre-allocated
625 	 * temporary buffer used for holding and/or modifying MCG entries.
626 	 * Zero out the temporary MCG entry before we begin.
627 	 */
628 	mutex_enter(&state->ts_mcglock);
629 	mcg_entry = state->ts_mcgtmp;
630 	mcg_entry_qplist = TAVOR_MCG_GET_QPLIST_PTR(mcg_entry);
631 	bzero(mcg_entry, TAVOR_MCGMEM_SZ(state));
632 
633 	/*
634 	 * Walk through the array of MCG entries starting at "mgid_hash".
635 	 * Try to find the appropriate place for this new QP to be added.
636 	 * This could happen when the first entry of the chain has MGID == 0
637 	 * (which means that the hash chain is empty), or because we find
638 	 * an entry with the same MGID (in which case we'll add the QP to
639 	 * that MCG), or because we come to the end of the chain (in which
640 	 * case this is the first QP being added to the multicast group that
641 	 * corresponds to the MGID.  The tavor_mcg_walk_mgid_hash() routine
642 	 * walks the list and returns an index into the MCG table.  The entry
643 	 * at this index is then checked to determine which case we have
644 	 * fallen into (see below).  Note:  We are using the "shadow" MCG
645 	 * list (of tavor_mcg_t structs) for this lookup because the real
646 	 * MCG entries are in hardware (and the lookup process would be much
647 	 * more time consuming).
648 	 */
649 	end_indx = tavor_mcg_walk_mgid_hash(state, mgid_hash, gid, NULL);
650 	mcg	 = &state->ts_mcghdl[end_indx];
651 
652 	/*
653 	 * If MGID == 0, then the hash chain is empty.  Just fill in the
654 	 * current entry.  Note:  No need to allocate an MCG table entry
655 	 * as all the hash chain "heads" are already preallocated.
656 	 */
657 	if ((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) {
658 
659 		/* Fill in the current entry in the "shadow" MCG list */
660 		tavor_mcg_setup_new_hdr(mcg, mcg_entry, gid, NULL);
661 
662 		/*
663 		 * Try to add the new QP number to the list.  This (and the
664 		 * above) routine fills in a temporary MCG.  The "mcg_entry"
665 		 * and "mcg_entry_qplist" pointers simply point to different
666 		 * offsets within the same temporary copy of the MCG (for
667 		 * convenience).  Note:  If this fails, we need to invalidate
668 		 * the entries we've already put into the "shadow" list entry
669 		 * above.
670 		 */
671 		status = tavor_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp,
672 		    &qp_found);
673 		if (status != DDI_SUCCESS) {
674 			bzero(mcg, sizeof (struct tavor_sw_mcg_list_s));
675 			mutex_exit(&state->ts_mcglock);
676 			/* Set "status" and "errormsg" and goto failure */
677 			TAVOR_TNF_FAIL(status, "failed qplist add");
678 			goto mcgattach_fail;
679 		}
680 
681 		/*
682 		 * Once the temporary MCG has been filled in, write the entry
683 		 * into the appropriate location in the Tavor MCG entry table.
684 		 * If it's successful, then drop the lock and return success.
685 		 * Note: In general, this operation shouldn't fail.  If it
686 		 * does, then it is an indication that something (probably in
687 		 * HW, but maybe in SW) has gone seriously wrong.  We still
688 		 * want to zero out the entries that we've filled in above
689 		 * (in the tavor_mcg_setup_new_hdr() routine).
690 		 */
691 		status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
692 		    TAVOR_CMD_NOSLEEP_SPIN);
693 		if (status != TAVOR_CMD_SUCCESS) {
694 			bzero(mcg, sizeof (struct tavor_sw_mcg_list_s));
695 			mutex_exit(&state->ts_mcglock);
696 			TAVOR_WARNING(state, "failed to write MCG entry");
697 			cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
698 			    "%08x\n", status);
699 			TNF_PROBE_2(tavor_mcg_attach_write_mgm_cmd_fail,
700 			    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
701 			    tnf_uint, indx, end_indx);
702 			TAVOR_TNF_EXIT(tavor_mcg_attach);
703 			return (ibc_get_ci_failure(0));
704 		}
705 
706 		/*
707 		 * Now that we know all the Tavor firmware accesses have been
708 		 * successful, we update the "shadow" MCG entry by incrementing
709 		 * the "number of attached QPs" count.
710 		 *
711 		 * We increment only if the QP is not already part of the
712 		 * MCG by checking the 'qp_found' flag returned from the
713 		 * qplist_add above.
714 		 */
715 		if (!qp_found) {
716 			mcg->mcg_num_qps++;
717 
718 			/*
719 			 * Increment the refcnt for this QP.  Because the QP
720 			 * was added to this MCG, the refcnt must be
721 			 * incremented.
722 			 */
723 			tavor_qp_mcg_refcnt_inc(qp);
724 		}
725 
726 		/*
727 		 * We drop the lock and return success.
728 		 */
729 		mutex_exit(&state->ts_mcglock);
730 		TAVOR_TNF_EXIT(tavor_mcg_attach);
731 		return (DDI_SUCCESS);
732 	}
733 
734 	/*
735 	 * If the specified MGID matches the MGID in the current entry, then
736 	 * we need to try to add the QP to the current MCG entry.  In this
737 	 * case, it means that we need to read the existing MCG entry (into
738 	 * the temporary MCG), add the new QP number to the temporary entry
739 	 * (using the same method we used above), and write the entry back
740 	 * to the hardware (same as above).
741 	 */
742 	if ((mcg->mcg_mgid_h == gid.gid_prefix) &&
743 	    (mcg->mcg_mgid_l == gid.gid_guid)) {
744 
745 		/*
746 		 * Read the current MCG entry into the temporary MCG.  Note:
747 		 * In general, this operation shouldn't fail.  If it does,
748 		 * then it is an indication that something (probably in HW,
749 		 * but maybe in SW) has gone seriously wrong.
750 		 */
751 		status = tavor_read_mgm_cmd_post(state, mcg_entry, end_indx,
752 		    TAVOR_CMD_NOSLEEP_SPIN);
753 		if (status != TAVOR_CMD_SUCCESS) {
754 			mutex_exit(&state->ts_mcglock);
755 			TAVOR_WARNING(state, "failed to read MCG entry");
756 			cmn_err(CE_CONT, "Tavor: READ_MGM command failed: "
757 			    "%08x\n", status);
758 			TNF_PROBE_2(tavor_mcg_attach_read_mgm_cmd_fail,
759 			    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
760 			    tnf_uint, indx, end_indx);
761 			TAVOR_TNF_EXIT(tavor_mcg_attach);
762 			return (ibc_get_ci_failure(0));
763 		}
764 
765 		/*
766 		 * Try to add the new QP number to the list.  This routine
767 		 * fills in the necessary pieces of the temporary MCG.  The
768 		 * "mcg_entry_qplist" pointer is used to point to the portion
769 		 * of the temporary MCG that holds the QP numbers.
770 		 *
771 		 * Note: tavor_mcg_qplist_add() returns SUCCESS if it
772 		 * already found the QP in the list.  In this case, the QP is
773 		 * not added on to the list again.  Check the flag 'qp_found'
774 		 * if this value is needed to be known.
775 		 *
776 		 */
777 		status = tavor_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp,
778 		    &qp_found);
779 		if (status != DDI_SUCCESS) {
780 			mutex_exit(&state->ts_mcglock);
781 			/* Set "status" and "errormsg" and goto failure */
782 			TAVOR_TNF_FAIL(status, "failed qplist add");
783 			goto mcgattach_fail;
784 		}
785 
786 		/*
787 		 * Once the temporary MCG has been updated, write the entry
788 		 * into the appropriate location in the Tavor MCG entry table.
789 		 * If it's successful, then drop the lock and return success.
790 		 * Note: In general, this operation shouldn't fail.  If it
791 		 * does, then it is an indication that something (probably in
792 		 * HW, but maybe in SW) has gone seriously wrong.
793 		 */
794 		status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
795 		    TAVOR_CMD_NOSLEEP_SPIN);
796 		if (status != TAVOR_CMD_SUCCESS) {
797 			mutex_exit(&state->ts_mcglock);
798 			TAVOR_WARNING(state, "failed to write MCG entry");
799 			cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
800 			    "%08x\n", status);
801 			TNF_PROBE_2(tavor_mcg_attach_write_mgm_cmd_fail,
802 			    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
803 			    tnf_uint, indx, end_indx);
804 			TAVOR_TNF_EXIT(tavor_mcg_attach);
805 			return (ibc_get_ci_failure(0));
806 		}
807 
808 		/*
809 		 * Now that we know all the Tavor firmware accesses have been
810 		 * successful, we update the current "shadow" MCG entry by
811 		 * incrementing the "number of attached QPs" count.
812 		 *
813 		 * We increment only if the QP is not already part of the
814 		 * MCG by checking the 'qp_found' flag returned from the
815 		 * qplist_add above.
816 		 */
817 		if (!qp_found) {
818 			mcg->mcg_num_qps++;
819 
820 			/*
821 			 * Increment the refcnt for this QP.  Because the QP
822 			 * was added to this MCG, the refcnt must be
823 			 * incremented.
824 			 */
825 			tavor_qp_mcg_refcnt_inc(qp);
826 		}
827 
828 		/*
829 		 * We drop the lock and return success.
830 		 */
831 		mutex_exit(&state->ts_mcglock);
832 		TAVOR_TNF_EXIT(tavor_mcg_attach);
833 		return (DDI_SUCCESS);
834 	}
835 
836 	/*
837 	 * If we've reached here, then we're at the end of the hash chain.
838 	 * We need to allocate a new MCG entry, fill it in, write it to Tavor,
839 	 * and update the previous entry to link the new one to the end of the
840 	 * chain.
841 	 */
842 
843 	/*
844 	 * Allocate an MCG table entry.  This will be filled in with all
845 	 * the necessary parameters to define the multicast group.  Then it
846 	 * will be written to the hardware in the next-to-last step below.
847 	 */
848 	status = tavor_rsrc_alloc(state, TAVOR_MCG, 1, TAVOR_NOSLEEP, &rsrc);
849 	if (status != DDI_SUCCESS) {
850 		mutex_exit(&state->ts_mcglock);
851 		/* Set "status" and "errormsg" and goto failure */
852 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MCG");
853 		goto mcgattach_fail;
854 	}
855 
856 	/*
857 	 * Fill in the new entry in the "shadow" MCG list.  Note:  Just as
858 	 * it does above, tavor_mcg_setup_new_hdr() also fills in a portion
859 	 * of the temporary MCG entry (the rest of which will be filled in by
860 	 * tavor_mcg_qplist_add() below)
861 	 */
862 	newmcg = &state->ts_mcghdl[rsrc->tr_indx];
863 	tavor_mcg_setup_new_hdr(newmcg, mcg_entry, gid, rsrc);
864 
865 	/*
866 	 * Try to add the new QP number to the list.  This routine fills in
867 	 * the final necessary pieces of the temporary MCG.  The
868 	 * "mcg_entry_qplist" pointer is used to point to the portion of the
869 	 * temporary MCG that holds the QP numbers.  If we fail here, we
870 	 * must undo the previous resource allocation.
871 	 *
	 * Note: tavor_mcg_qplist_add() can return SUCCESS if it already
873 	 * found the QP in the list.  In this case, the QP is not added on to
874 	 * the list again.  Check the flag 'qp_found' if this value is needed
875 	 * to be known.
876 	 */
877 	status = tavor_mcg_qplist_add(state, newmcg, mcg_entry_qplist, qp,
878 	    &qp_found);
879 	if (status != DDI_SUCCESS) {
880 		bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
881 		tavor_rsrc_free(state, &rsrc);
882 		mutex_exit(&state->ts_mcglock);
883 		/* Set "status" and "errormsg" and goto failure */
884 		TAVOR_TNF_FAIL(status, "failed qplist add");
885 		goto mcgattach_fail;
886 	}
887 
888 	/*
889 	 * Once the temporary MCG has been updated, write the entry into the
890 	 * appropriate location in the Tavor MCG entry table.  If this is
891 	 * successful, then we need to chain the previous entry to this one.
892 	 * Note: In general, this operation shouldn't fail.  If it does, then
893 	 * it is an indication that something (probably in HW, but maybe in
894 	 * SW) has gone seriously wrong.
895 	 */
896 	status = tavor_write_mgm_cmd_post(state, mcg_entry, rsrc->tr_indx,
897 	    TAVOR_CMD_NOSLEEP_SPIN);
898 	if (status != TAVOR_CMD_SUCCESS) {
899 		bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
900 		tavor_rsrc_free(state, &rsrc);
901 		mutex_exit(&state->ts_mcglock);
902 		TAVOR_WARNING(state, "failed to write MCG entry");
903 		cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
904 		    status);
905 		TNF_PROBE_2(tavor_mcg_attach_write_mgm_cmd_fail,
906 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
907 		    tnf_uint, indx, rsrc->tr_indx);
908 		TAVOR_TNF_EXIT(tavor_mcg_attach);
909 		return (ibc_get_ci_failure(0));
910 	}
911 
912 	/*
913 	 * Now read the current MCG entry (the one previously at the end of
914 	 * hash chain) into the temporary MCG.  We are going to update its
915 	 * "next_gid_indx" now and write the entry back to the MCG table.
916 	 * Note:  In general, this operation shouldn't fail.  If it does, then
917 	 * it is an indication that something (probably in HW, but maybe in SW)
918 	 * has gone seriously wrong.  We will free up the MCG entry resource,
919 	 * but we will not undo the previously written MCG entry in the HW.
920 	 * This is OK, though, because the MCG entry is not currently attached
921 	 * to any hash chain.
922 	 */
923 	status = tavor_read_mgm_cmd_post(state, mcg_entry, end_indx,
924 	    TAVOR_CMD_NOSLEEP_SPIN);
925 	if (status != TAVOR_CMD_SUCCESS) {
926 		bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
927 		tavor_rsrc_free(state, &rsrc);
928 		mutex_exit(&state->ts_mcglock);
929 		TAVOR_WARNING(state, "failed to read MCG entry");
930 		cmn_err(CE_CONT, "Tavor: READ_MGM command failed: %08x\n",
931 		    status);
932 		TNF_PROBE_2(tavor_mcg_attach_read_mgm_cmd_fail,
933 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
934 		    tnf_uint, indx, end_indx);
935 		TAVOR_TNF_EXIT(tavor_mcg_attach);
936 		return (ibc_get_ci_failure(0));
937 	}
938 
939 	/*
940 	 * Finally, we update the "next_gid_indx" field in the temporary MCG
941 	 * and attempt to write the entry back into the Tavor MCG table.  If
942 	 * this succeeds, then we update the "shadow" list to reflect the
943 	 * change, drop the lock, and return success.  Note:  In general, this
944 	 * operation shouldn't fail.  If it does, then it is an indication
945 	 * that something (probably in HW, but maybe in SW) has gone seriously
946 	 * wrong.  Just as we do above, we will free up the MCG entry resource,
947 	 * but we will not try to undo the previously written MCG entry.  This
948 	 * is OK, though, because (since we failed here to update the end of
949 	 * the chain) that other entry is not currently attached to any chain.
950 	 */
951 	mcg_entry->next_gid_indx = rsrc->tr_indx;
952 	status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
953 	    TAVOR_CMD_NOSLEEP_SPIN);
954 	if (status != TAVOR_CMD_SUCCESS) {
955 		bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
956 		tavor_rsrc_free(state, &rsrc);
957 		mutex_exit(&state->ts_mcglock);
958 		TAVOR_WARNING(state, "failed to write MCG entry");
959 		cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
960 		    status);
961 		TNF_PROBE_2(tavor_mcg_attach_write_mgm_cmd_fail,
962 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
963 		    tnf_uint, indx, end_indx);
964 		TAVOR_TNF_EXIT(tavor_mcg_attach);
965 		return (ibc_get_ci_failure(0));
966 	}
967 	mcg = &state->ts_mcghdl[end_indx];
968 	mcg->mcg_next_indx = rsrc->tr_indx;
969 
970 	/*
971 	 * Now that we know all the Tavor firmware accesses have been
972 	 * successful, we update the new "shadow" MCG entry by incrementing
973 	 * the "number of attached QPs" count.  Then we drop the lock and
974 	 * return success.
975 	 */
976 	newmcg->mcg_num_qps++;
977 
978 	/*
979 	 * Increment the refcnt for this QP.  Because the QP
980 	 * was added to this MCG, the refcnt must be
981 	 * incremented.
982 	 */
983 	tavor_qp_mcg_refcnt_inc(qp);
984 
985 	mutex_exit(&state->ts_mcglock);
986 	TAVOR_TNF_EXIT(tavor_mcg_attach);
987 	return (DDI_SUCCESS);
988 
989 mcgattach_fail:
990 	TNF_PROBE_1(tavor_mcg_attach_fail, TAVOR_TNF_ERROR, "", tnf_string,
991 	    msg, errormsg);
992 	TAVOR_TNF_EXIT(tavor_mcg_attach);
993 	return (status);
994 }
995 
996 
997 /*
998  * tavor_mcg_detach()
999  *    Context: Can be called only from user or kernel context.
1000  */
1001 int
1002 tavor_mcg_detach(tavor_state_t *state, tavor_qphdl_t qp, ib_gid_t gid,
1003     ib_lid_t lid)
1004 {
1005 	tavor_hw_mcg_t		*mcg_entry;
1006 	tavor_hw_mcg_qp_list_t	*mcg_entry_qplist;
1007 	tavor_mcghdl_t		mcg;
1008 	uint64_t		mgid_hash;
1009 	uint32_t		end_indx, prev_indx;
1010 	int			status;
1011 
1012 	TAVOR_TNF_ENTER(tavor_mcg_detach);
1013 
1014 	/*
1015 	 * Check for invalid Multicast DLID.  Specifically, all Multicast
1016 	 * LIDs should be within a well defined range.  If the specified LID
1017 	 * is outside of that range, then return an error.
1018 	 */
1019 	if (tavor_mlid_is_valid(lid) == 0) {
1020 		TNF_PROBE_0(tavor_mcg_detach_invmlid_fail, TAVOR_TNF_ERROR, "");
1021 		TAVOR_TNF_EXIT(tavor_mcg_detach);
1022 		return (IBT_MC_MLID_INVALID);
1023 	}
1024 
1025 	/*
1026 	 * Compute the MGID hash value.  As described above, the MCG table is
1027 	 * arranged as a number of separate hash chains.  This operation
1028 	 * converts the specified MGID into the starting index of an entry in
1029 	 * the hash table (i.e. the index for the start of the appropriate
1030 	 * hash chain).  Subsequent operations below will walk the chain
1031 	 * searching for a matching entry from which to attempt to remove
1032 	 * the specified QP.
1033 	 */
1034 	status = tavor_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid,
1035 	    &mgid_hash, TAVOR_SLEEPFLAG_FOR_CONTEXT());
1036 	if (status != TAVOR_CMD_SUCCESS) {
1037 		cmn_err(CE_CONT, "Tavor: MGID_HASH command failed: %08x\n",
1038 		    status);
1039 		TNF_PROBE_1(tavor_mcg_detach_mgid_hash_cmd_fail,
1040 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
1041 		TAVOR_TNF_EXIT(tavor_mcg_attach);
1042 		return (ibc_get_ci_failure(0));
1043 	}
1044 
1045 	/*
1046 	 * Grab the multicast group mutex.  Then grab the pre-allocated
1047 	 * temporary buffer used for holding and/or modifying MCG entries.
1048 	 */
1049 	mutex_enter(&state->ts_mcglock);
1050 	mcg_entry = state->ts_mcgtmp;
1051 	mcg_entry_qplist = TAVOR_MCG_GET_QPLIST_PTR(mcg_entry);
1052 
1053 	/*
1054 	 * Walk through the array of MCG entries starting at "mgid_hash".
1055 	 * Try to find an MCG entry with a matching MGID.  The
1056 	 * tavor_mcg_walk_mgid_hash() routine walks the list and returns an
1057 	 * index into the MCG table.  The entry at this index is checked to
1058 	 * determine whether it is a match or not.  If it is a match, then
1059 	 * we continue on to attempt to remove the QP from the MCG.  If it
1060 	 * is not a match (or not a valid MCG entry), then we return an error.
1061 	 */
1062 	end_indx = tavor_mcg_walk_mgid_hash(state, mgid_hash, gid, &prev_indx);
1063 	mcg	 = &state->ts_mcghdl[end_indx];
1064 
1065 	/*
1066 	 * If MGID == 0 (the hash chain is empty) or if the specified MGID
1067 	 * does not match the MGID in the current entry, then return
1068 	 * IBT_MC_MGID_INVALID (to indicate that the specified MGID is not
1069 	 * valid).
1070 	 */
1071 	if (((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) ||
1072 	    ((mcg->mcg_mgid_h != gid.gid_prefix) ||
1073 	    (mcg->mcg_mgid_l != gid.gid_guid))) {
1074 		mutex_exit(&state->ts_mcglock);
1075 		TNF_PROBE_0(tavor_mcg_detach_invmgid_fail, TAVOR_TNF_ERROR, "");
1076 		TAVOR_TNF_EXIT(tavor_mcg_detach);
1077 		return (IBT_MC_MGID_INVALID);
1078 	}
1079 
1080 	/*
1081 	 * Read the current MCG entry into the temporary MCG.  Note: In
1082 	 * general, this operation shouldn't fail.  If it does, then it is
1083 	 * an indication that something (probably in HW, but maybe in SW)
1084 	 * has gone seriously wrong.
1085 	 */
1086 	status = tavor_read_mgm_cmd_post(state, mcg_entry, end_indx,
1087 	    TAVOR_CMD_NOSLEEP_SPIN);
1088 	if (status != TAVOR_CMD_SUCCESS) {
1089 		mutex_exit(&state->ts_mcglock);
1090 		TAVOR_WARNING(state, "failed to read MCG entry");
1091 		cmn_err(CE_CONT, "Tavor: READ_MGM command failed: %08x\n",
1092 		    status);
1093 		TNF_PROBE_2(tavor_mcg_detach_read_mgm_cmd_fail,
1094 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1095 		    tnf_uint, indx, end_indx);
1096 		TAVOR_TNF_EXIT(tavor_mcg_attach);
1097 		return (ibc_get_ci_failure(0));
1098 	}
1099 
1100 	/*
1101 	 * Search the QP number list for a match.  If a match is found, then
1102 	 * remove the entry from the QP list.  Otherwise, if no match is found,
1103 	 * return an error.
1104 	 */
1105 	status = tavor_mcg_qplist_remove(mcg, mcg_entry_qplist, qp);
1106 	if (status != DDI_SUCCESS) {
1107 		mutex_exit(&state->ts_mcglock);
1108 		TAVOR_TNF_EXIT(tavor_mcg_detach);
1109 		return (status);
1110 	}
1111 
1112 	/*
1113 	 * Decrement the MCG count for this QP.  When the 'qp_mcg'
1114 	 * field becomes 0, then this QP is no longer a member of any
1115 	 * MCG.
1116 	 */
1117 	tavor_qp_mcg_refcnt_dec(qp);
1118 
1119 	/*
1120 	 * If the current MCG's QP number list is about to be made empty
1121 	 * ("mcg_num_qps" == 1), then remove the entry itself from the hash
1122 	 * chain.  Otherwise, just write the updated MCG entry back to the
1123 	 * hardware.  In either case, once we successfully update the hardware
1124 	 * chain, then we decrement the "shadow" list entry's "mcg_num_qps"
1125 	 * count (or zero out the entire "shadow" list entry) before returning
1126 	 * success.  Note:  Zeroing out the "shadow" list entry is done
1127 	 * inside of tavor_mcg_hash_list_remove().
1128 	 */
1129 	if (mcg->mcg_num_qps == 1) {
1130 
1131 		/* Remove an MCG entry from the hash chain */
1132 		status = tavor_mcg_hash_list_remove(state, end_indx, prev_indx,
1133 		    mcg_entry);
1134 		if (status != DDI_SUCCESS) {
1135 			mutex_exit(&state->ts_mcglock);
1136 			TAVOR_TNF_EXIT(tavor_mcg_detach);
1137 			return (status);
1138 		}
1139 
1140 	} else {
1141 		/*
1142 		 * Write the updated MCG entry back to the Tavor MCG table.
1143 		 * If this succeeds, then we update the "shadow" list to
1144 		 * reflect the change (i.e. decrement the "mcg_num_qps"),
1145 		 * drop the lock, and return success.  Note:  In general,
1146 		 * this operation shouldn't fail.  If it does, then it is an
1147 		 * indication that something (probably in HW, but maybe in SW)
1148 		 * has gone seriously wrong.
1149 		 */
1150 		status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
1151 		    TAVOR_CMD_NOSLEEP_SPIN);
1152 		if (status != TAVOR_CMD_SUCCESS) {
1153 			mutex_exit(&state->ts_mcglock);
1154 			TAVOR_WARNING(state, "failed to write MCG entry");
1155 			cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
1156 			    "%08x\n", status);
1157 			TNF_PROBE_2(tavor_mcg_detach_write_mgm_cmd_fail,
1158 			    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1159 			    tnf_uint, indx, end_indx);
1160 			TAVOR_TNF_EXIT(tavor_mcg_detach);
1161 			return (ibc_get_ci_failure(0));
1162 		}
1163 		mcg->mcg_num_qps--;
1164 	}
1165 
1166 	mutex_exit(&state->ts_mcglock);
1167 	TAVOR_TNF_EXIT(tavor_mcg_detach);
1168 	return (DDI_SUCCESS);
1169 }
1170 
1171 /*
1172  * tavor_qp_mcg_refcnt_inc()
1173  *    Context: Can be called from interrupt or base context.
1174  */
1175 static void
1176 tavor_qp_mcg_refcnt_inc(tavor_qphdl_t qp)
1177 {
1178 	/* Increment the QP's MCG reference count */
1179 	mutex_enter(&qp->qp_lock);
1180 	qp->qp_mcg_refcnt++;
1181 	TNF_PROBE_1_DEBUG(tavor_qp_mcg_refcnt_inc, TAVOR_TNF_TRACE, "",
1182 	    tnf_uint, refcnt, qp->qp_mcg_refcnt);
1183 	mutex_exit(&qp->qp_lock);
1184 }
1185 
1186 
1187 /*
1188  * tavor_qp_mcg_refcnt_dec()
1189  *    Context: Can be called from interrupt or base context.
1190  */
1191 static void
1192 tavor_qp_mcg_refcnt_dec(tavor_qphdl_t qp)
1193 {
1194 	/* Decrement the QP's MCG reference count */
1195 	mutex_enter(&qp->qp_lock);
1196 	qp->qp_mcg_refcnt--;
1197 	TNF_PROBE_1_DEBUG(tavor_qp_mcg_refcnt_dec, TAVOR_TNF_TRACE, "",
1198 	    tnf_uint, refcnt, qp->qp_mcg_refcnt);
1199 	mutex_exit(&qp->qp_lock);
1200 }
1201 
1202 
1203 /*
1204  * tavor_mcg_qplist_add()
1205  *    Context: Can be called from interrupt or base context.
1206  */
1207 static int
1208 tavor_mcg_qplist_add(tavor_state_t *state, tavor_mcghdl_t mcg,
1209     tavor_hw_mcg_qp_list_t *mcg_qplist, tavor_qphdl_t qp,
1210     uint_t *qp_found)
1211 {
1212 	uint_t		qplist_indx;
1213 
1214 	TAVOR_TNF_ENTER(tavor_mcg_qplist_add);
1215 
1216 	ASSERT(MUTEX_HELD(&state->ts_mcglock));
1217 
1218 	qplist_indx = mcg->mcg_num_qps;
1219 
1220 	/*
1221 	 * Determine if we have exceeded the maximum number of QP per
1222 	 * multicast group.  If we have, then return an error
1223 	 */
1224 	if (qplist_indx >= state->ts_cfg_profile->cp_num_qp_per_mcg) {
1225 		TNF_PROBE_0(tavor_mcg_qplist_add_too_many_qps,
1226 		    TAVOR_TNF_ERROR, "");
1227 		TAVOR_TNF_EXIT(tavor_mcg_qplist_add);
1228 		return (IBT_HCA_MCG_QP_EXCEEDED);
1229 	}
1230 
1231 	/*
1232 	 * Determine if the QP is already attached to this MCG table.  If it
1233 	 * is, then we break out and treat this operation as a NO-OP
1234 	 */
1235 	for (qplist_indx = 0; qplist_indx < mcg->mcg_num_qps;
1236 	    qplist_indx++) {
1237 		if (mcg_qplist[qplist_indx].qpn == qp->qp_qpnum) {
1238 			break;
1239 		}
1240 	}
1241 
1242 	/*
1243 	 * If the QP was already on the list, set 'qp_found' to TRUE.  We still
1244 	 * return SUCCESS in this case, but the qplist will not have been
1245 	 * updated because the QP was already on the list.
1246 	 */
1247 	if (qplist_indx < mcg->mcg_num_qps) {
1248 		*qp_found = 1;
1249 	} else {
1250 		/*
1251 		 * Otherwise, append the new QP number to the end of the
1252 		 * current QP list.  Note: We will increment the "mcg_num_qps"
1253 		 * field on the "shadow" MCG list entry later (after we know
1254 		 * that all necessary Tavor firmware accesses have been
1255 		 * successful).
1256 		 *
1257 		 * Set 'qp_found' to 0 so we know the QP was added on to the
1258 		 * list for sure.
1259 		 */
1260 		mcg_qplist[qplist_indx].q   = TAVOR_MCG_QPN_VALID;
1261 		mcg_qplist[qplist_indx].qpn = qp->qp_qpnum;
1262 		*qp_found = 0;
1263 	}
1264 
1265 	TAVOR_TNF_EXIT(tavor_mcg_qplist_add);
1266 	return (DDI_SUCCESS);
1267 }
1268 
1269 
1270 
1271 /*
1272  * tavor_mcg_qplist_remove()
1273  *    Context: Can be called from interrupt or base context.
1274  */
1275 static int
1276 tavor_mcg_qplist_remove(tavor_mcghdl_t mcg, tavor_hw_mcg_qp_list_t *mcg_qplist,
1277     tavor_qphdl_t qp)
1278 {
1279 	uint_t		i, qplist_indx;
1280 
1281 	TAVOR_TNF_ENTER(tavor_mcg_qplist_remove);
1282 
1283 	/*
1284 	 * Search the MCG QP list for a matching QPN.  When
1285 	 * it's found, we swap the last entry with the current
1286 	 * one, set the last entry to zero, decrement the last
1287 	 * entry, and return.  If it's not found, then it's
1288 	 * and error.
1289 	 */
1290 	qplist_indx = mcg->mcg_num_qps;
1291 	for (i = 0; i < qplist_indx; i++) {
1292 		if (mcg_qplist[i].qpn == qp->qp_qpnum) {
1293 			mcg_qplist[i] = mcg_qplist[qplist_indx - 1];
1294 			mcg_qplist[qplist_indx - 1].q = TAVOR_MCG_QPN_INVALID;
1295 			mcg_qplist[qplist_indx - 1].qpn = 0;
1296 
1297 			TAVOR_TNF_EXIT(tavor_mcg_qplist_remove);
1298 			return (DDI_SUCCESS);
1299 		}
1300 	}
1301 
1302 	TNF_PROBE_0(tavor_mcg_qplist_remove_invqphdl_fail, TAVOR_TNF_ERROR, "");
1303 	TAVOR_TNF_EXIT(tavor_mcg_qplist_remove);
1304 	return (IBT_QP_HDL_INVALID);
1305 }
1306 
1307 
1308 /*
1309  * tavor_mcg_walk_mgid_hash()
1310  *    Context: Can be called from interrupt or base context.
1311  */
1312 static uint_t
1313 tavor_mcg_walk_mgid_hash(tavor_state_t *state, uint64_t start_indx,
1314     ib_gid_t mgid, uint_t *p_indx)
1315 {
1316 	tavor_mcghdl_t	curr_mcghdl;
1317 	uint_t		curr_indx, prev_indx;
1318 
1319 	TAVOR_TNF_ENTER(tavor_mcg_walk_mgid_hash);
1320 
1321 	ASSERT(MUTEX_HELD(&state->ts_mcglock));
1322 
1323 	/* Start at the head of the hash chain */
1324 	curr_indx   = start_indx;
1325 	prev_indx   = curr_indx;
1326 	curr_mcghdl = &state->ts_mcghdl[curr_indx];
1327 
1328 	/* If the first entry in the chain has MGID == 0, then stop */
1329 	if ((curr_mcghdl->mcg_mgid_h == 0) &&
1330 	    (curr_mcghdl->mcg_mgid_l == 0)) {
1331 		goto end_mgid_hash_walk;
1332 	}
1333 
1334 	/* If the first entry in the chain matches the MGID, then stop */
1335 	if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) &&
1336 	    (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) {
1337 		goto end_mgid_hash_walk;
1338 	}
1339 
1340 	/* Otherwise, walk the hash chain looking for a match */
1341 	while (curr_mcghdl->mcg_next_indx != 0) {
1342 		prev_indx = curr_indx;
1343 		curr_indx = curr_mcghdl->mcg_next_indx;
1344 		curr_mcghdl = &state->ts_mcghdl[curr_indx];
1345 
1346 		if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) &&
1347 		    (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) {
1348 			break;
1349 		}
1350 	}
1351 
1352 end_mgid_hash_walk:
1353 	/*
1354 	 * If necessary, return the index of the previous entry too.  This
1355 	 * is primarily used for detaching a QP from a multicast group.  It
1356 	 * may be necessary, in that case, to delete an MCG entry from the
1357 	 * hash chain and having the index of the previous entry is helpful.
1358 	 */
1359 	if (p_indx != NULL) {
1360 		*p_indx = prev_indx;
1361 	}
1362 	TAVOR_TNF_EXIT(tavor_mcg_walk_mgid_hash);
1363 	return (curr_indx);
1364 }
1365 
1366 
1367 /*
1368  * tavor_mcg_setup_new_hdr()
1369  *    Context: Can be called from interrupt or base context.
1370  */
1371 static void
1372 tavor_mcg_setup_new_hdr(tavor_mcghdl_t mcg, tavor_hw_mcg_t *mcg_hdr,
1373     ib_gid_t mgid, tavor_rsrc_t *mcg_rsrc)
1374 {
1375 	TAVOR_TNF_ENTER(tavor_mcg_setup_new_hdr);
1376 
1377 	/*
1378 	 * Fill in the fields of the "shadow" entry used by software
1379 	 * to track MCG hardware entry
1380 	 */
1381 	mcg->mcg_mgid_h	   = mgid.gid_prefix;
1382 	mcg->mcg_mgid_l	   = mgid.gid_guid;
1383 	mcg->mcg_rsrcp	   = mcg_rsrc;
1384 	mcg->mcg_next_indx = 0;
1385 	mcg->mcg_num_qps   = 0;
1386 
1387 	/*
1388 	 * Fill the header fields of the MCG entry (in the temporary copy)
1389 	 */
1390 	mcg_hdr->mgid_h		= mgid.gid_prefix;
1391 	mcg_hdr->mgid_l		= mgid.gid_guid;
1392 	mcg_hdr->next_gid_indx	= 0;
1393 
1394 	TAVOR_TNF_EXIT(tavor_mcg_setup_new_hdr);
1395 }
1396 
1397 
1398 /*
1399  * tavor_mcg_hash_list_remove()
1400  *    Context: Can be called only from user or kernel context.
1401  */
1402 static int
1403 tavor_mcg_hash_list_remove(tavor_state_t *state, uint_t curr_indx,
1404     uint_t prev_indx, tavor_hw_mcg_t *mcg_entry)
1405 {
1406 	tavor_mcghdl_t		curr_mcg, prev_mcg, next_mcg;
1407 	uint_t			next_indx;
1408 	int			status;
1409 
1410 	/* Get the pointer to "shadow" list for current entry */
1411 	curr_mcg = &state->ts_mcghdl[curr_indx];
1412 
1413 	/*
1414 	 * If this is the first entry on a hash chain, then attempt to replace
1415 	 * the entry with the next entry on the chain.  If there are no
1416 	 * subsequent entries on the chain, then this is the only entry and
1417 	 * should be invalidated.
1418 	 */
1419 	if (curr_indx == prev_indx) {
1420 
1421 		/*
1422 		 * If this is the only entry on the chain, then invalidate it.
1423 		 * Note:  Invalidating an MCG entry means writing all zeros
1424 		 * to the entry.  This is only necessary for those MCG
1425 		 * entries that are the "head" entries of the individual hash
1426 		 * chains.  Regardless of whether this operation returns
1427 		 * success or failure, return that result to the caller.
1428 		 */
1429 		next_indx = curr_mcg->mcg_next_indx;
1430 		if (next_indx == 0) {
1431 			status = tavor_mcg_entry_invalidate(state, mcg_entry,
1432 			    curr_indx);
1433 			bzero(curr_mcg, sizeof (struct tavor_sw_mcg_list_s));
1434 			TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1435 			return (status);
1436 		}
1437 
1438 		/*
1439 		 * Otherwise, this is just the first entry on the chain, so
1440 		 * grab the next one
1441 		 */
1442 		next_mcg = &state->ts_mcghdl[next_indx];
1443 
1444 		/*
1445 		 * Read the next MCG entry into the temporary MCG.  Note:
1446 		 * In general, this operation shouldn't fail.  If it does,
1447 		 * then it is an indication that something (probably in HW,
1448 		 * but maybe in SW) has gone seriously wrong.
1449 		 */
1450 		status = tavor_read_mgm_cmd_post(state, mcg_entry, next_indx,
1451 		    TAVOR_CMD_NOSLEEP_SPIN);
1452 		if (status != TAVOR_CMD_SUCCESS) {
1453 			TAVOR_WARNING(state, "failed to read MCG entry");
1454 			cmn_err(CE_CONT, "Tavor: READ_MGM command failed: "
1455 			    "%08x\n", status);
1456 			TNF_PROBE_2(tavor_mcg_hash_list_rem_read_mgm_cmd_fail,
1457 			    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1458 			    tnf_uint, indx, next_indx);
1459 			TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1460 			return (ibc_get_ci_failure(0));
1461 		}
1462 
1463 		/*
1464 		 * Copy/Write the temporary MCG back to the hardware MCG list
1465 		 * using the current index.  This essentially removes the
1466 		 * current MCG entry from the list by writing over it with
1467 		 * the next one.  If this is successful, then we can do the
1468 		 * same operation for the "shadow" list.  And we can also
1469 		 * free up the Tavor MCG entry resource that was associated
1470 		 * with the (old) next entry.  Note:  In general, this
1471 		 * operation shouldn't fail.  If it does, then it is an
1472 		 * indication that something (probably in HW, but maybe in SW)
1473 		 * has gone seriously wrong.
1474 		 */
1475 		status = tavor_write_mgm_cmd_post(state, mcg_entry, curr_indx,
1476 		    TAVOR_CMD_NOSLEEP_SPIN);
1477 		if (status != TAVOR_CMD_SUCCESS) {
1478 			TAVOR_WARNING(state, "failed to write MCG entry");
1479 			cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
1480 			    "%08x\n", status);
1481 			TNF_PROBE_2(tavor_mcg_hash_list_rem_write_mgm_cmd_fail,
1482 			    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1483 			    tnf_uint, indx, curr_indx);
1484 			TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1485 			return (ibc_get_ci_failure(0));
1486 		}
1487 
1488 		/*
1489 		 * Copy all the software tracking information from the next
1490 		 * entry on the "shadow" MCG list into the current entry on
1491 		 * the list.  Then invalidate (zero out) the other "shadow"
1492 		 * list entry.
1493 		 */
1494 		bcopy(next_mcg, curr_mcg, sizeof (struct tavor_sw_mcg_list_s));
1495 		bzero(next_mcg, sizeof (struct tavor_sw_mcg_list_s));
1496 
1497 		/*
1498 		 * Free up the Tavor MCG entry resource used by the "next"
1499 		 * MCG entry.  That resource is no longer needed by any
1500 		 * MCG entry which is first on a hash chain (like the "next"
1501 		 * entry has just become).
1502 		 */
1503 		tavor_rsrc_free(state, &curr_mcg->mcg_rsrcp);
1504 
1505 		TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1506 		return (DDI_SUCCESS);
1507 	}
1508 
1509 	/*
1510 	 * Else if this is the last entry on the hash chain (or a middle
1511 	 * entry, then we update the previous entry's "next_gid_index" field
1512 	 * to make it point instead to the next entry on the chain.  By
1513 	 * skipping over the removed entry in this way, we can then free up
1514 	 * any resources associated with the current entry.  Note:  We don't
1515 	 * need to invalidate the "skipped over" hardware entry because it
1516 	 * will no be longer connected to any hash chains, and if/when it is
1517 	 * finally re-used, it will be written with entirely new values.
1518 	 */
1519 
1520 	/*
1521 	 * Read the next MCG entry into the temporary MCG.  Note:  In general,
1522 	 * this operation shouldn't fail.  If it does, then it is an
1523 	 * indication that something (probably in HW, but maybe in SW) has
1524 	 * gone seriously wrong.
1525 	 */
1526 	status = tavor_read_mgm_cmd_post(state, mcg_entry, prev_indx,
1527 	    TAVOR_CMD_NOSLEEP_SPIN);
1528 	if (status != TAVOR_CMD_SUCCESS) {
1529 		TAVOR_WARNING(state, "failed to read MCG entry");
1530 		cmn_err(CE_CONT, "Tavor: READ_MGM command failed: %08x\n",
1531 		    status);
1532 		TNF_PROBE_2(tavor_mcg_hash_list_rem_read_mgm_cmd_fail,
1533 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1534 		    tnf_uint, indx, prev_indx);
1535 		TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1536 		return (ibc_get_ci_failure(0));
1537 	}
1538 
1539 	/*
1540 	 * Finally, we update the "next_gid_indx" field in the temporary MCG
1541 	 * and attempt to write the entry back into the Tavor MCG table.  If
1542 	 * this succeeds, then we update the "shadow" list to reflect the
1543 	 * change, free up the Tavor MCG entry resource that was associated
1544 	 * with the current entry, and return success.  Note:  In general,
1545 	 * this operation shouldn't fail.  If it does, then it is an indication
1546 	 * that something (probably in HW, but maybe in SW) has gone seriously
1547 	 * wrong.
1548 	 */
1549 	mcg_entry->next_gid_indx = curr_mcg->mcg_next_indx;
1550 	status = tavor_write_mgm_cmd_post(state, mcg_entry, prev_indx,
1551 	    TAVOR_CMD_NOSLEEP_SPIN);
1552 	if (status != TAVOR_CMD_SUCCESS) {
1553 		TAVOR_WARNING(state, "failed to write MCG entry");
1554 		cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
1555 		    status);
1556 		TNF_PROBE_2(tavor_mcg_hash_list_rem_write_mgm_cmd_fail,
1557 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1558 		    tnf_uint, indx, prev_indx);
1559 		TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1560 		return (ibc_get_ci_failure(0));
1561 	}
1562 
1563 	/*
1564 	 * Get the pointer to the "shadow" MCG list entry for the previous
1565 	 * MCG.  Update its "mcg_next_indx" to point to the next entry
1566 	 * the one after the current entry. Note:  This next index may be
1567 	 * zero, indicating the end of the list.
1568 	 */
1569 	prev_mcg = &state->ts_mcghdl[prev_indx];
1570 	prev_mcg->mcg_next_indx = curr_mcg->mcg_next_indx;
1571 
1572 	/*
1573 	 * Free up the Tavor MCG entry resource used by the current entry.
1574 	 * This resource is no longer needed because the chain now skips over
1575 	 * the current entry.  Then invalidate (zero out) the current "shadow"
1576 	 * list entry.
1577 	 */
1578 	tavor_rsrc_free(state, &curr_mcg->mcg_rsrcp);
1579 	bzero(curr_mcg, sizeof (struct tavor_sw_mcg_list_s));
1580 
1581 	TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1582 	return (DDI_SUCCESS);
1583 }
1584 
1585 
1586 /*
1587  * tavor_mcg_entry_invalidate()
1588  *    Context: Can be called only from user or kernel context.
1589  */
1590 static int
1591 tavor_mcg_entry_invalidate(tavor_state_t *state, tavor_hw_mcg_t *mcg_entry,
1592     uint_t indx)
1593 {
1594 	int		status;
1595 
1596 	TAVOR_TNF_ENTER(tavor_mcg_entry_invalidate);
1597 
1598 	/*
1599 	 * Invalidate the hardware MCG entry by zeroing out this temporary
1600 	 * MCG and writing it the the hardware.  Note: In general, this
1601 	 * operation shouldn't fail.  If it does, then it is an indication
1602 	 * that something (probably in HW, but maybe in SW) has gone seriously
1603 	 * wrong.
1604 	 */
1605 	bzero(mcg_entry, TAVOR_MCGMEM_SZ(state));
1606 	status = tavor_write_mgm_cmd_post(state, mcg_entry, indx,
1607 	    TAVOR_CMD_NOSLEEP_SPIN);
1608 	if (status != TAVOR_CMD_SUCCESS) {
1609 		TAVOR_WARNING(state, "failed to write MCG entry");
1610 		cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
1611 		    status);
1612 		TNF_PROBE_2(tavor_mcg_entry_invalidate_write_mgm_cmd_fail,
1613 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1614 		    tnf_uint, indx, indx);
1615 		TAVOR_TNF_EXIT(tavor_mcg_entry_invalidate);
1616 		return (ibc_get_ci_failure(0));
1617 	}
1618 
1619 	TAVOR_TNF_EXIT(tavor_mcg_entry_invalidate);
1620 	return (DDI_SUCCESS);
1621 }
1622 
1623 
1624 /*
1625  * tavor_mgid_is_valid()
1626  *    Context: Can be called from interrupt or base context.
1627  */
1628 static int
1629 tavor_mgid_is_valid(ib_gid_t gid)
1630 {
1631 	uint_t		topbits, flags, scope;
1632 
1633 	TAVOR_TNF_ENTER(tavor_mgid_is_valid);
1634 
1635 	/*
1636 	 * According to IBA 1.1 specification (section 4.1.1) a valid
1637 	 * "multicast GID" must have its top eight bits set to all ones
1638 	 */
1639 	topbits = (gid.gid_prefix >> TAVOR_MCG_TOPBITS_SHIFT) &
1640 	    TAVOR_MCG_TOPBITS_MASK;
1641 	if (topbits != TAVOR_MCG_TOPBITS) {
1642 		TNF_PROBE_0(tavor_mgid_is_valid_invbits_fail, TAVOR_TNF_ERROR,
1643 		    "");
1644 		TAVOR_TNF_EXIT(tavor_mgid_is_valid);
1645 		return (0);
1646 	}
1647 
1648 	/*
1649 	 * The next 4 bits are the "flag" bits.  These are valid only
1650 	 * if they are "0" (which correspond to permanently assigned/
1651 	 * "well-known" multicast GIDs) or "1" (for so-called "transient"
1652 	 * multicast GIDs).  All other values are reserved.
1653 	 */
1654 	flags = (gid.gid_prefix >> TAVOR_MCG_FLAGS_SHIFT) &
1655 	    TAVOR_MCG_FLAGS_MASK;
1656 	if (!((flags == TAVOR_MCG_FLAGS_PERM) ||
1657 	    (flags == TAVOR_MCG_FLAGS_NONPERM))) {
1658 		TNF_PROBE_1(tavor_mgid_is_valid_invflags_fail, TAVOR_TNF_ERROR,
1659 		    "", tnf_uint, flags, flags);
1660 		TAVOR_TNF_EXIT(tavor_mgid_is_valid);
1661 		return (0);
1662 	}
1663 
1664 	/*
1665 	 * The next 4 bits are the "scope" bits.  These are valid only
1666 	 * if they are "2" (Link-local), "5" (Site-local), "8"
1667 	 * (Organization-local) or "E" (Global).  All other values
1668 	 * are reserved (or currently unassigned).
1669 	 */
1670 	scope = (gid.gid_prefix >> TAVOR_MCG_SCOPE_SHIFT) &
1671 	    TAVOR_MCG_SCOPE_MASK;
1672 	if (!((scope == TAVOR_MCG_SCOPE_LINKLOC) ||
1673 	    (scope == TAVOR_MCG_SCOPE_SITELOC)	 ||
1674 	    (scope == TAVOR_MCG_SCOPE_ORGLOC)	 ||
1675 	    (scope == TAVOR_MCG_SCOPE_GLOBAL))) {
1676 		TNF_PROBE_1(tavor_mgid_is_valid_invscope_fail, TAVOR_TNF_ERROR,
1677 		    "", tnf_uint, scope, scope);
1678 		TAVOR_TNF_EXIT(tavor_mgid_is_valid);
1679 		return (0);
1680 	}
1681 
1682 	/*
1683 	 * If it passes all of the above checks, then we will consider it
1684 	 * a valid multicast GID.
1685 	 */
1686 	TAVOR_TNF_EXIT(tavor_mgid_is_valid);
1687 	return (1);
1688 }
1689 
1690 
1691 /*
1692  * tavor_mlid_is_valid()
1693  *    Context: Can be called from interrupt or base context.
1694  */
1695 static int
1696 tavor_mlid_is_valid(ib_lid_t lid)
1697 {
1698 	TAVOR_TNF_ENTER(tavor_mlid_is_valid);
1699 
1700 	/*
1701 	 * According to IBA 1.1 specification (section 4.1.1) a valid
1702 	 * "multicast DLID" must be between 0xC000 and 0xFFFE.
1703 	 */
1704 	if ((lid < IB_LID_MC_FIRST) || (lid > IB_LID_MC_LAST)) {
1705 		TNF_PROBE_1(tavor_mlid_is_valid_invdlid_fail, TAVOR_TNF_ERROR,
1706 		    "", tnf_uint, mlid, lid);
1707 		TAVOR_TNF_EXIT(tavor_mlid_is_valid);
1708 		return (0);
1709 	}
1710 
1711 	TAVOR_TNF_EXIT(tavor_mlid_is_valid);
1712 	return (1);
1713 }
1714 
1715 
1716 /*
1717  * tavor_pd_alloc()
1718  *    Context: Can be called only from user or kernel context.
1719  */
1720 int
1721 tavor_pd_alloc(tavor_state_t *state, tavor_pdhdl_t *pdhdl, uint_t sleepflag)
1722 {
1723 	tavor_rsrc_t	*rsrc;
1724 	tavor_pdhdl_t	pd;
1725 	int		status;
1726 
1727 	TAVOR_TNF_ENTER(tavor_pd_alloc);
1728 
1729 	/*
1730 	 * Allocate the software structure for tracking the protection domain
1731 	 * (i.e. the Tavor Protection Domain handle).  By default each PD
1732 	 * structure will have a unique PD number assigned to it.  All that
1733 	 * is necessary is for software to initialize the PD reference count
1734 	 * (to zero) and return success.
1735 	 */
1736 	status = tavor_rsrc_alloc(state, TAVOR_PDHDL, 1, sleepflag, &rsrc);
1737 	if (status != DDI_SUCCESS) {
1738 		TNF_PROBE_0(tavor_pd_alloc_rsrcalloc_fail, TAVOR_TNF_ERROR, "");
1739 		TAVOR_TNF_EXIT(tavor_pd_alloc);
1740 		return (IBT_INSUFF_RESOURCE);
1741 	}
1742 	pd = (tavor_pdhdl_t)rsrc->tr_addr;
1743 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pd))
1744 
1745 	pd->pd_refcnt = 0;
1746 	*pdhdl = pd;
1747 
1748 	TAVOR_TNF_EXIT(tavor_pd_alloc);
1749 	return (DDI_SUCCESS);
1750 }
1751 
1752 
1753 /*
1754  * tavor_pd_free()
1755  *    Context: Can be called only from user or kernel context.
1756  */
1757 int
1758 tavor_pd_free(tavor_state_t *state, tavor_pdhdl_t *pdhdl)
1759 {
1760 	tavor_rsrc_t	*rsrc;
1761 	tavor_pdhdl_t	pd;
1762 
1763 	TAVOR_TNF_ENTER(tavor_pd_free);
1764 
1765 	/*
1766 	 * Pull all the necessary information from the Tavor Protection Domain
1767 	 * handle.  This is necessary here because the resource for the
1768 	 * PD is going to be freed up as part of this operation.
1769 	 */
1770 	pd   = *pdhdl;
1771 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pd))
1772 	rsrc = pd->pd_rsrcp;
1773 
1774 	/*
1775 	 * Check the PD reference count.  If the reference count is non-zero,
1776 	 * then it means that this protection domain is still referenced by
1777 	 * some memory region, queue pair, address handle, or other IB object
1778 	 * If it is non-zero, then return an error.  Otherwise, free the
1779 	 * Tavor resource and return success.
1780 	 */
1781 	if (pd->pd_refcnt != 0) {
1782 		TNF_PROBE_1(tavor_pd_free_refcnt_fail, TAVOR_TNF_ERROR, "",
1783 		    tnf_int, refcnt, pd->pd_refcnt);
1784 		TAVOR_TNF_EXIT(tavor_pd_free);
1785 		return (IBT_PD_IN_USE);
1786 	}
1787 
1788 	/* Free the Tavor Protection Domain handle */
1789 	tavor_rsrc_free(state, &rsrc);
1790 
1791 	/* Set the pdhdl pointer to NULL and return success */
1792 	*pdhdl = (tavor_pdhdl_t)NULL;
1793 
1794 	TAVOR_TNF_EXIT(tavor_pd_free);
1795 	return (DDI_SUCCESS);
1796 }
1797 
1798 
1799 /*
1800  * tavor_pd_refcnt_inc()
1801  *    Context: Can be called from interrupt or base context.
1802  */
1803 void
1804 tavor_pd_refcnt_inc(tavor_pdhdl_t pd)
1805 {
1806 	/* Increment the protection domain's reference count */
1807 	mutex_enter(&pd->pd_lock);
1808 	TNF_PROBE_1_DEBUG(tavor_pd_refcnt_inc, TAVOR_TNF_TRACE, "",
1809 	    tnf_uint, refcnt, pd->pd_refcnt);
1810 	pd->pd_refcnt++;
1811 	mutex_exit(&pd->pd_lock);
1812 
1813 }
1814 
1815 
1816 /*
1817  * tavor_pd_refcnt_dec()
1818  *    Context: Can be called from interrupt or base context.
1819  */
1820 void
1821 tavor_pd_refcnt_dec(tavor_pdhdl_t pd)
1822 {
1823 	/* Decrement the protection domain's reference count */
1824 	mutex_enter(&pd->pd_lock);
1825 	pd->pd_refcnt--;
1826 	TNF_PROBE_1_DEBUG(tavor_pd_refcnt_dec, TAVOR_TNF_TRACE, "",
1827 	    tnf_uint, refcnt, pd->pd_refcnt);
1828 	mutex_exit(&pd->pd_lock);
1829 
1830 }
1831 
1832 
/*
 * tavor_port_query()
 *    Context: Can be called only from user or kernel context.
 *
 *    Fills in "pi" with the current state of HCA port "port" by posting
 *    GetPortInfo, GetGUIDInfo, and GetPKeyTable MADs through the
 *    firmware's MAD_IFC interface.  Returns DDI_SUCCESS,
 *    IBT_HCA_PORT_INVALID for a bad port number, or a CI failure code
 *    if any firmware command fails.
 */
int
tavor_port_query(tavor_state_t *state, uint_t port, ibt_hca_portinfo_t *pi)
{
	sm_portinfo_t		portinfo;
	sm_guidinfo_t		guidinfo;
	sm_pkey_table_t		pkeytable;
	ib_gid_t		*sgid;
	uint_t			sgid_max, pkey_max, tbl_size;
	int			i, j, indx, status;

	TAVOR_TNF_ENTER(tavor_port_query);

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pi))

	/* Validate that specified port number is legal */
	if (!tavor_portnum_is_valid(state, port)) {
		TNF_PROBE_1(tavor_port_query_inv_portnum_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, port, port);
		TAVOR_TNF_EXIT(tavor_port_query);
		return (IBT_HCA_PORT_INVALID);
	}

	/*
	 * We use the Tavor MAD_IFC command to post a GetPortInfo MAD
	 * to the firmware (for the specified port number).  This returns
	 * a full PortInfo MAD (in "portinfo") which we subsequently
	 * parse to fill in the "ibt_hca_portinfo_t" structure returned
	 * to the IBTF.
	 */
	status = tavor_getportinfo_cmd_post(state, port,
	    TAVOR_SLEEPFLAG_FOR_CONTEXT(), &portinfo);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: GetPortInfo (port %02d) command "
		    "failed: %08x\n", port, status);
		TNF_PROBE_1(tavor_port_query_getportinfo_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
		TAVOR_TNF_EXIT(tavor_port_query);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Parse the PortInfo MAD and fill in the IBTF structure
	 */
	pi->p_base_lid		= portinfo.LID;
	pi->p_qkey_violations	= portinfo.Q_KeyViolations;
	pi->p_pkey_violations	= portinfo.P_KeyViolations;
	pi->p_sm_sl		= portinfo.MasterSMSL;
	pi->p_sm_lid		= portinfo.MasterSMLID;
	pi->p_linkstate		= portinfo.PortState;
	pi->p_port_num		= portinfo.LocalPortNum;
	pi->p_phys_state	= portinfo.PortPhysicalState;
	pi->p_width_supported	= portinfo.LinkWidthSupported;
	pi->p_width_enabled	= portinfo.LinkWidthEnabled;
	pi->p_width_active	= portinfo.LinkWidthActive;
	pi->p_speed_supported	= portinfo.LinkSpeedSupported;
	pi->p_speed_enabled	= portinfo.LinkSpeedEnabled;
	pi->p_speed_active	= portinfo.LinkSpeedActive;
	pi->p_mtu		= portinfo.MTUCap;
	pi->p_lmc		= portinfo.LMC;
	pi->p_max_vl		= portinfo.VLCap;
	pi->p_subnet_timeout	= portinfo.SubnetTimeOut;
	pi->p_msg_sz		= ((uint32_t)1 << TAVOR_QP_LOG_MAX_MSGSZ);
	/* Table sizes come from the configuration profile (log2 values) */
	tbl_size = state->ts_cfg_profile->cp_log_max_gidtbl;
	pi->p_sgid_tbl_sz	= (1 << tbl_size);
	tbl_size = state->ts_cfg_profile->cp_log_max_pkeytbl;
	pi->p_pkey_tbl_sz	= (1 << tbl_size);

	/*
	 * Convert InfiniBand-defined port capability flags to the format
	 * specified by the IBTF
	 *
	 * NOTE(review): p_capabilities is only OR-ed into here, never
	 * cleared first — this assumes the caller hands in a zeroed
	 * "ibt_hca_portinfo_t"; confirm against callers.
	 */
	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM)
		pi->p_capabilities |= IBT_PORT_CAP_SM;
	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM_DISABLED)
		pi->p_capabilities |= IBT_PORT_CAP_SM_DISABLED;
	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SNMP_SUPPD)
		pi->p_capabilities |= IBT_PORT_CAP_SNMP_TUNNEL;
	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_DM_SUPPD)
		pi->p_capabilities |= IBT_PORT_CAP_DM;
	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_VM_SUPPD)
		pi->p_capabilities |= IBT_PORT_CAP_VENDOR;

	/*
	 * Fill in the SGID table.  Since the only access to the Tavor
	 * GID tables is through the firmware's MAD_IFC interface, we
	 * post as many GetGUIDInfo MADs as necessary to read in the entire
	 * contents of the SGID table (for the specified port).  Note:  The
	 * GetGUIDInfo command only gets eight GUIDs per operation.  These
	 * GUIDs are then appended to the GID prefix for the port (from the
	 * GetPortInfo above) to form the entire SGID table.
	 */
	for (i = 0; i < pi->p_sgid_tbl_sz; i += 8) {
		/* "i >> 3" selects the 8-entry GUID block to fetch */
		status = tavor_getguidinfo_cmd_post(state, port, i >> 3,
		    TAVOR_SLEEPFLAG_FOR_CONTEXT(), &guidinfo);
		if (status != TAVOR_CMD_SUCCESS) {
			cmn_err(CE_CONT, "Tavor: GetGUIDInfo (port %02d) "
			    "command failed: %08x\n", port, status);
			TNF_PROBE_1(tavor_port_query_getguidinfo_cmd_fail,
			    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
			TAVOR_TNF_EXIT(tavor_port_query);
			return (ibc_get_ci_failure(0));
		}

		/* Figure out how many of the entries are valid */
		sgid_max = min((pi->p_sgid_tbl_sz - i), 8);
		for (j = 0; j < sgid_max; j++) {
			indx = (i + j);
			sgid = &pi->p_sgid_tbl[indx];
			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sgid))
			sgid->gid_prefix = portinfo.GidPrefix;
			sgid->gid_guid	 = guidinfo.GUIDBlocks[j];
		}
	}

	/*
	 * Fill in the PKey table.  Just as for the GID tables above, the
	 * only access to the Tavor PKey tables is through the firmware's
	 * MAD_IFC interface.  We post as many GetPKeyTable MADs as necessary
	 * to read in the entire contents of the PKey table (for the specified
	 * port).  Note:  The GetPKeyTable command only gets 32 PKeys per
	 * operation.
	 */
	for (i = 0; i < pi->p_pkey_tbl_sz; i += 32) {
		status = tavor_getpkeytable_cmd_post(state, port, i,
		    TAVOR_SLEEPFLAG_FOR_CONTEXT(), &pkeytable);
		if (status != TAVOR_CMD_SUCCESS) {
			cmn_err(CE_CONT, "Tavor: GetPKeyTable (port %02d) "
			    "command failed: %08x\n", port, status);
			TNF_PROBE_1(tavor_port_query_getpkeytable_cmd_fail,
			    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
			TAVOR_TNF_EXIT(tavor_port_query);
			return (ibc_get_ci_failure(0));
		}

		/* Figure out how many of the entries are valid */
		pkey_max = min((pi->p_pkey_tbl_sz - i), 32);
		for (j = 0; j < pkey_max; j++) {
			indx = (i + j);
			pi->p_pkey_tbl[indx] = pkeytable.P_KeyTableBlocks[j];
		}
	}

	TAVOR_TNF_EXIT(tavor_port_query);
	return (DDI_SUCCESS);
}
1982 
1983 
1984 /*
1985  * tavor_port_modify()
1986  *    Context: Can be called only from user or kernel context.
1987  */
1988 /* ARGSUSED */
1989 int
1990 tavor_port_modify(tavor_state_t *state, uint8_t port,
1991     ibt_port_modify_flags_t flags, uint8_t init_type)
1992 {
1993 	sm_portinfo_t	portinfo;
1994 	uint32_t	capmask, reset_qkey;
1995 	int		status;
1996 
1997 	TAVOR_TNF_ENTER(tavor_port_modify);
1998 
1999 	/*
2000 	 * Return an error if either of the unsupported flags are set
2001 	 */
2002 	if ((flags & IBT_PORT_SHUTDOWN) ||
2003 	    (flags & IBT_PORT_SET_INIT_TYPE)) {
2004 		TNF_PROBE_1(tavor_port_modify_inv_flags_fail,
2005 		    TAVOR_TNF_ERROR, "", tnf_uint, flags, flags);
2006 		TAVOR_TNF_EXIT(tavor_port_modify);
2007 		return (IBT_NOT_SUPPORTED);
2008 	}
2009 
2010 	/*
2011 	 * Determine whether we are trying to reset the QKey counter
2012 	 */
2013 	reset_qkey = (flags & IBT_PORT_RESET_QKEY) ? 1 : 0;
2014 
2015 	/* Validate that specified port number is legal */
2016 	if (!tavor_portnum_is_valid(state, port)) {
2017 		TNF_PROBE_1(tavor_port_modify_inv_portnum_fail,
2018 		    TAVOR_TNF_ERROR, "", tnf_uint, port, port);
2019 		TAVOR_TNF_EXIT(tavor_port_modify);
2020 		return (IBT_HCA_PORT_INVALID);
2021 	}
2022 
2023 	/*
2024 	 * Use the Tavor MAD_IFC command to post a GetPortInfo MAD to the
2025 	 * firmware (for the specified port number).  This returns a full
2026 	 * PortInfo MAD (in "portinfo") from which we pull the current
2027 	 * capability mask.  We then modify the capability mask as directed
2028 	 * by the "pmod_flags" field, and write the updated capability mask
2029 	 * using the Tavor SET_IB command (below).
2030 	 */
2031 	status = tavor_getportinfo_cmd_post(state, port,
2032 	    TAVOR_SLEEPFLAG_FOR_CONTEXT(), &portinfo);
2033 	if (status != TAVOR_CMD_SUCCESS) {
2034 		TNF_PROBE_1(tavor_port_modify_getportinfo_cmd_fail,
2035 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
2036 		TAVOR_TNF_EXIT(tavor_port_modify);
2037 		return (ibc_get_ci_failure(0));
2038 	}
2039 
2040 	/*
2041 	 * Convert InfiniBand-defined port capability flags to the format
2042 	 * specified by the IBTF.  Specifically, we modify the capability
2043 	 * mask based on the specified values.
2044 	 */
2045 	capmask = portinfo.CapabilityMask;
2046 
2047 	if (flags & IBT_PORT_RESET_SM)
2048 		capmask &= ~SM_CAP_MASK_IS_SM;
2049 	else if (flags & IBT_PORT_SET_SM)
2050 		capmask |= SM_CAP_MASK_IS_SM;
2051 
2052 	if (flags & IBT_PORT_RESET_SNMP)
2053 		capmask &= ~SM_CAP_MASK_IS_SNMP_SUPPD;
2054 	else if (flags & IBT_PORT_SET_SNMP)
2055 		capmask |= SM_CAP_MASK_IS_SNMP_SUPPD;
2056 
2057 	if (flags & IBT_PORT_RESET_DEVMGT)
2058 		capmask &= ~SM_CAP_MASK_IS_DM_SUPPD;
2059 	else if (flags & IBT_PORT_SET_DEVMGT)
2060 		capmask |= SM_CAP_MASK_IS_DM_SUPPD;
2061 
2062 	if (flags & IBT_PORT_RESET_VENDOR)
2063 		capmask &= ~SM_CAP_MASK_IS_VM_SUPPD;
2064 	else if (flags & IBT_PORT_SET_VENDOR)
2065 		capmask |= SM_CAP_MASK_IS_VM_SUPPD;
2066 
2067 	/*
2068 	 * Use the Tavor SET_IB command to update the capability mask and
2069 	 * (possibly) reset the QKey violation counter for the specified port.
2070 	 * Note: In general, this operation shouldn't fail.  If it does, then
2071 	 * it is an indication that something (probably in HW, but maybe in
2072 	 * SW) has gone seriously wrong.
2073 	 */
2074 	status = tavor_set_ib_cmd_post(state, capmask, port, reset_qkey,
2075 	    TAVOR_SLEEPFLAG_FOR_CONTEXT());
2076 	if (status != TAVOR_CMD_SUCCESS) {
2077 		TAVOR_WARNING(state, "failed to modify port capabilities");
2078 		cmn_err(CE_CONT, "Tavor: SET_IB (port %02d) command failed: "
2079 		    "%08x\n", port, status);
2080 		TNF_PROBE_1(tavor_port_modify_set_ib_cmd_fail,
2081 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
2082 		TAVOR_TNF_EXIT(tavor_port_modify);
2083 		return (ibc_get_ci_failure(0));
2084 	}
2085 
2086 	TAVOR_TNF_EXIT(tavor_port_modify);
2087 	return (DDI_SUCCESS);
2088 }
2089 
2090 
/*
 * tavor_set_addr_path()
 *    Context: Can be called from interrupt or base context.
 *
 * Note: This routine is used for two purposes.  It is used to fill in the
 * Tavor UDAV fields, and it is used to fill in the address path information
 * for QPs.  Because the two Tavor structures are similar, common fields can
 * be filled in here.  Because they are slightly different, however, we pass
 * an additional flag ("type": TAVOR_ADDRPATH_UDAV or TAVOR_ADDRPATH_QP)
 * to indicate which type is being filled.
 *
 * Returns DDI_SUCCESS, IBT_STATIC_RATE_INVALID for an unrecognized
 * "av_srate", or IBT_SGID_INVALID for an out-of-range SGID index.
 */
int
tavor_set_addr_path(tavor_state_t *state, ibt_adds_vect_t *av,
    tavor_hw_addr_path_t *path, uint_t type, tavor_qphdl_t qp)
{
	uint_t		gidtbl_sz;

	TAVOR_TNF_ENTER(tavor_set_addr_path);

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*av))
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*path))

	path->ml_path	= av->av_src_path;
	path->rlid	= av->av_dlid;
	path->sl	= av->av_srvl;

	/* Port number only valid (in "av_port_num") if this is a UDAV */
	if (type == TAVOR_ADDRPATH_UDAV) {
		path->portnum = av->av_port_num;
	}

	/*
	 * Validate (and fill in) static rate.
	 *
	 * The stat_rate_sup is used to decide how to set the rate and
	 * if it is zero, the driver uses the old (SDR-only) interface.
	 */
	if (state->ts_devlim.stat_rate_sup) {
		if (av->av_srate == IBT_SRATE_20) {
			path->max_stat_rate = 0; /* 4x@DDR injection rate */
		} else if (av->av_srate == IBT_SRATE_5) {
			path->max_stat_rate = 3; /* 1x@DDR injection rate */
		} else if (av->av_srate == IBT_SRATE_10) {
			path->max_stat_rate = 2; /* 4x@SDR injection rate */
		} else if (av->av_srate == IBT_SRATE_2) {
			path->max_stat_rate = 1; /* 1x@SDR injection rate */
		} else if (av->av_srate == IBT_SRATE_NOT_SPECIFIED) {
			path->max_stat_rate = 0; /* Max */
		} else {
			TNF_PROBE_1(tavor_set_addr_path_inv_srate_fail,
			    TAVOR_TNF_ERROR, "", tnf_uint, srate, av->av_srate);
			TAVOR_TNF_EXIT(tavor_set_addr_path);
			return (IBT_STATIC_RATE_INVALID);
		}
	} else {
		if (av->av_srate == IBT_SRATE_10) {
			path->max_stat_rate = 0; /* 4x@SDR injection rate */
		} else if (av->av_srate == IBT_SRATE_2) {
			path->max_stat_rate = 1; /* 1x@SDR injection rate */
		} else if (av->av_srate == IBT_SRATE_NOT_SPECIFIED) {
			path->max_stat_rate = 0; /* Max */
		} else {
			TNF_PROBE_1(tavor_set_addr_path_inv_srate_fail,
			    TAVOR_TNF_ERROR, "", tnf_uint, srate, av->av_srate);
			TAVOR_TNF_EXIT(tavor_set_addr_path);
			return (IBT_STATIC_RATE_INVALID);
		}
	}

	/*
	 * If this is a QP operation save a soft copy (so that later query
	 * operations can return the exact rate that was last set).
	 */
	if (qp) {
		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(qp->qp_save_srate))
		qp->qp_save_srate = av->av_srate;
	}

	/*
	 * If "grh" flag is set, then check for valid SGID index too.
	 *
	 * NOTE(review): valid indices would be 0 .. gidtbl_sz - 1, but the
	 * comparison below uses ">" so av_sgid_ix == gidtbl_sz passes the
	 * check — ">=" looks intended; confirm against callers before
	 * changing.
	 */
	gidtbl_sz = (1 << state->ts_devlim.log_max_gid);
	if ((av->av_send_grh) && (av->av_sgid_ix > gidtbl_sz)) {
		TNF_PROBE_1(tavor_set_addr_path_inv_sgid_ix_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, sgid_ix, av->av_sgid_ix);
		TAVOR_TNF_EXIT(tavor_set_addr_path);
		return (IBT_SGID_INVALID);
	}

	/*
	 * Fill in all "global" values regardless of the value in the GRH
	 * flag.  Because "grh" is not set unless "av_send_grh" is set, the
	 * hardware will ignore the other "global" values as necessary.  Note:
	 * SW does this here to enable later query operations to return
	 * exactly the same params that were passed when the addr path was
	 * last written.
	 */
	path->grh = av->av_send_grh;
	if (type == TAVOR_ADDRPATH_QP) {
		path->mgid_index = av->av_sgid_ix;
	} else {
		/*
		 * For Tavor UDAV, the "mgid_index" field is the index into
		 * a combined table (not a per-port table). So some extra
		 * calculations are necessary.
		 */
		path->mgid_index = ((av->av_port_num - 1) * gidtbl_sz) +
		    av->av_sgid_ix;
	}
	path->flow_label = av->av_flow;
	path->tclass	 = av->av_tclass;
	path->hop_limit	 = av->av_hop;
	path->rgid_h	 = av->av_dgid.gid_prefix;

	/*
	 * According to Tavor PRM, the (31:0) part of rgid_l must be set to
	 * "0x2" if the 'grh' or 'g' bit is cleared.  It also says that we
	 * only need to do it for UDAV's.  So we enforce that here.
	 *
	 * NOTE: The entire 64 bits worth of GUID info is actually being
	 * preserved (for UDAVs) by the callers of this function
	 * (tavor_ah_alloc() and tavor_ah_modify()) and as long as the
	 * 'grh' bit is not set, the upper 32 bits (63:32) of rgid_l are
	 * "don't care".
	 */
	if ((path->grh) || (type == TAVOR_ADDRPATH_QP)) {
		path->rgid_l = av->av_dgid.gid_guid;
	} else {
		path->rgid_l = 0x2;
	}

	TAVOR_TNF_EXIT(tavor_set_addr_path);
	return (DDI_SUCCESS);
}
2221 
2222 
2223 /*
2224  * tavor_get_addr_path()
2225  *    Context: Can be called from interrupt or base context.
2226  *
2227  * Note: Just like tavor_set_addr_path() above, this routine is used for two
2228  * purposes.  It is used to read in the Tavor UDAV fields, and it is used to
2229  * read in the address path information for QPs.  Because the two Tavor
2230  * structures are similar, common fields can be read in here.  But because
2231  * they are slightly different, we pass an additional flag to indicate which
2232  * type is being read.
2233  */
2234 void
2235 tavor_get_addr_path(tavor_state_t *state, tavor_hw_addr_path_t *path,
2236     ibt_adds_vect_t *av, uint_t type, tavor_qphdl_t qp)
2237 {
2238 	uint_t		gidtbl_sz;
2239 
2240 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*path))
2241 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*av))
2242 
2243 	av->av_src_path	= path->ml_path;
2244 	av->av_port_num	= path->portnum;
2245 	av->av_dlid	= path->rlid;
2246 	av->av_srvl	= path->sl;
2247 
2248 	/*
2249 	 * Set "av_ipd" value from max_stat_rate.
2250 	 */
2251 	if (qp) {
2252 		/*
2253 		 * If a QP operation use the soft copy
2254 		 */
2255 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(qp->qp_save_srate))
2256 		av->av_srate = qp->qp_save_srate;
2257 	} else {
2258 		/*
2259 		 * The stat_rate_sup is used to decide how the srate value is
2260 		 * set and
2261 		 * if it is zero, the driver uses the old interface.
2262 		 */
2263 		if (state->ts_devlim.stat_rate_sup) {
2264 			if (path->max_stat_rate	== 0) {
2265 				av->av_srate = IBT_SRATE_20; /* 4x@DDR rate */
2266 			} else if (path->max_stat_rate	== 1) {
2267 				av->av_srate = IBT_SRATE_2;  /* 1x@SDR rate */
2268 			} else if (path->max_stat_rate	== 2) {
2269 				av->av_srate = IBT_SRATE_10; /* 4x@SDR rate */
2270 			} else if (path->max_stat_rate	== 3) {
2271 				av->av_srate = IBT_SRATE_5;  /* 1xDDR rate */
2272 			}
2273 		} else {
2274 			if (path->max_stat_rate	== 0) {
2275 				av->av_srate = IBT_SRATE_10; /* 4x@SDR rate */
2276 			} else if (path->max_stat_rate	== 1) {
2277 				av->av_srate = IBT_SRATE_2;  /* 1x@SDR rate */
2278 			}
2279 		}
2280 	}
2281 
2282 	/*
2283 	 * Extract all "global" values regardless of the value in the GRH
2284 	 * flag.  Because "av_send_grh" is set only if "grh" is set, software
2285 	 * knows to ignore the other "global" values as necessary.  Note: SW
2286 	 * does it this way to enable these query operations to return exactly
2287 	 * the same params that were passed when the addr path was last written.
2288 	 */
2289 	av->av_send_grh		= path->grh;
2290 	if (type == TAVOR_ADDRPATH_QP) {
2291 		av->av_sgid_ix  = path->mgid_index;
2292 	} else {
2293 		/*
2294 		 * For Tavor UDAV, the "mgid_index" field is the index into
2295 		 * a combined table (not a per-port table). So some extra
2296 		 * calculations are necessary.
2297 		 */
2298 		gidtbl_sz = (1 << state->ts_devlim.log_max_gid);
2299 		av->av_sgid_ix = path->mgid_index - ((av->av_port_num - 1) *
2300 		    gidtbl_sz);
2301 	}
2302 	av->av_flow		= path->flow_label;
2303 	av->av_tclass		= path->tclass;
2304 	av->av_hop		= path->hop_limit;
2305 	av->av_dgid.gid_prefix	= path->rgid_h;
2306 	av->av_dgid.gid_guid	= path->rgid_l;
2307 }
2308 
2309 
2310 /*
2311  * tavor_portnum_is_valid()
2312  *    Context: Can be called from interrupt or base context.
2313  */
2314 int
2315 tavor_portnum_is_valid(tavor_state_t *state, uint_t portnum)
2316 {
2317 	uint_t	max_port;
2318 
2319 	max_port = state->ts_cfg_profile->cp_num_ports;
2320 	if ((portnum <= max_port) && (portnum != 0)) {
2321 		return (1);
2322 	} else {
2323 		return (0);
2324 	}
2325 }
2326 
2327 
2328 /*
2329  * tavor_pkeyindex_is_valid()
2330  *    Context: Can be called from interrupt or base context.
2331  */
2332 int
2333 tavor_pkeyindex_is_valid(tavor_state_t *state, uint_t pkeyindx)
2334 {
2335 	uint_t	max_pkeyindx;
2336 
2337 	max_pkeyindx = 1 << state->ts_cfg_profile->cp_log_max_pkeytbl;
2338 	if (pkeyindx < max_pkeyindx) {
2339 		return (1);
2340 	} else {
2341 		return (0);
2342 	}
2343 }
2344 
2345 
/*
 * tavor_queue_alloc()
 *    Context: Can be called from interrupt or base context.
 *
 *    Allocates and maps memory for a hardware queue as described by
 *    "qa_info" (location, size, bind and alloc alignments).  On success
 *    the DMA handle, access handle, real buffer pointer/size, and the
 *    alignment-adjusted buffer pointer in "qa_info" are all filled in.
 *    Returns DDI_SUCCESS or DDI_FAILURE.
 */
int
tavor_queue_alloc(tavor_state_t *state, tavor_qalloc_info_t *qa_info,
    uint_t sleepflag)
{
	ddi_dma_attr_t		dma_attr;
	int			(*callback)(caddr_t);
	uint64_t		realsize, alloc_mask;
	uint_t			dma_xfer_mode, type;
	int			flag, status;

	TAVOR_TNF_ENTER(tavor_queue_alloc);

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qa_info))

	/* Set the callback flag appropriately */
	callback = (sleepflag == TAVOR_SLEEP) ? DDI_DMA_SLEEP :
	    DDI_DMA_DONTWAIT;

	/*
	 * Initialize many of the default DMA attributes.  Then set additional
	 * alignment restrictions as necessary for the queue memory.  Also
	 * respect the configured value for IOMMU bypass
	 */
	tavor_dma_attr_init(&dma_attr);
	dma_attr.dma_attr_align = qa_info->qa_bind_align;
	type = state->ts_cfg_profile->cp_iommu_bypass;
	if (type == TAVOR_BINDMEM_BYPASS) {
		dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
	}

	/* Allocate a DMA handle */
	status = ddi_dma_alloc_handle(state->ts_dip, &dma_attr, callback, NULL,
	    &qa_info->qa_dmahdl);
	if (status != DDI_SUCCESS) {
		TNF_PROBE_0(tavor_queue_alloc_dmahdl_fail, TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_queue_alloc);
		return (DDI_FAILURE);
	}

	/*
	 * Determine the amount of memory to allocate, depending on the values
	 * in "qa_bind_align" and "qa_alloc_align".  The problem we are trying
	 * to solve here is that allocating a DMA handle with IOMMU bypass
	 * (DDI_DMA_FORCE_PHYSICAL) constrains us to only requesting alignments
	 * that are less than the page size.  Since we may need stricter
	 * alignments on the memory allocated by ddi_dma_mem_alloc() (e.g. in
	 * Tavor QP work queue memory allocation), we use the following method
	 * to calculate how much additional memory to request, and we enforce
	 * our own alignment on the allocated result.
	 *
	 * NOTE(review): the mask arithmetic below assumes "qa_alloc_align"
	 * is a power of two — confirm against callers.
	 */
	alloc_mask = qa_info->qa_alloc_align - 1;
	if (qa_info->qa_bind_align == qa_info->qa_alloc_align) {
		realsize = qa_info->qa_size;
	} else {
		/* Over-allocate so the buffer can be aligned by hand below */
		realsize = qa_info->qa_size + alloc_mask;
	}

	/*
	 * If we are to allocate the queue from system memory, then use
	 * ddi_dma_mem_alloc() to find the space.  Otherwise, if we are to
	 * allocate the queue from locally-attached DDR memory, then use the
	 * vmem allocator to find the space.  In either case, return a pointer
	 * to the memory range allocated (including any necessary alignment
	 * adjustments), the "real" memory pointer, the "real" size, and a
	 * ddi_acc_handle_t to use when reading from/writing to the memory.
	 */
	if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_NORMAL) {

		/*
		 * Determine whether to map STREAMING or CONSISTENT.  This is
		 * based on the value set in the configuration profile at
		 * attach time.
		 */
		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;

		/* Allocate system memory for the queue */
		status = ddi_dma_mem_alloc(qa_info->qa_dmahdl, realsize,
		    &state->ts_reg_accattr, dma_xfer_mode, callback, NULL,
		    (caddr_t *)&qa_info->qa_buf_real,
		    (size_t *)&qa_info->qa_buf_realsz, &qa_info->qa_acchdl);
		if (status != DDI_SUCCESS) {
			/* Undo the handle allocation from above */
			ddi_dma_free_handle(&qa_info->qa_dmahdl);
			TNF_PROBE_0(tavor_queue_alloc_dma_memalloc_fail,
			    TAVOR_TNF_ERROR, "");
			TAVOR_TNF_EXIT(tavor_queue_alloc);
			return (DDI_FAILURE);
		}

		/*
		 * Save temporary copy of the real pointer.  (This may be
		 * modified in the last step below).
		 */
		qa_info->qa_buf_aligned = qa_info->qa_buf_real;

	} else if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_USERLAND) {

		/* Allocate userland mappable memory for the queue */
		flag = (sleepflag == TAVOR_SLEEP) ? DDI_UMEM_SLEEP :
		    DDI_UMEM_NOSLEEP;
		qa_info->qa_buf_real = ddi_umem_alloc(realsize, flag,
		    &qa_info->qa_umemcookie);
		if (qa_info->qa_buf_real == NULL) {
			/* Undo the handle allocation from above */
			ddi_dma_free_handle(&qa_info->qa_dmahdl);
			TNF_PROBE_0(tavor_queue_alloc_umem_fail,
			    TAVOR_TNF_ERROR, "");
			TAVOR_TNF_EXIT(tavor_queue_alloc);
			return (DDI_FAILURE);
		}

		/*
		 * Save temporary copy of the real pointer.  (This may be
		 * modified in the last step below).
		 */
		qa_info->qa_buf_aligned = qa_info->qa_buf_real;

	} else {  /* TAVOR_QUEUE_LOCATION_INDDR */

		/* Allocate DDR memory for the queue */
		flag = (sleepflag == TAVOR_SLEEP) ? VM_SLEEP : VM_NOSLEEP;
		qa_info->qa_buf_real = (uint32_t *)vmem_xalloc(
		    state->ts_ddrvmem, realsize, qa_info->qa_bind_align, 0, 0,
		    NULL, NULL, flag);
		if (qa_info->qa_buf_real == NULL) {
			/* Undo the handle allocation from above */
			ddi_dma_free_handle(&qa_info->qa_dmahdl);
			TNF_PROBE_0(tavor_queue_alloc_vmxa_fail,
			    TAVOR_TNF_ERROR, "");
			TAVOR_TNF_EXIT(tavor_queue_alloc);
			return (DDI_FAILURE);
		}

		/*
		 * Since "qa_buf_real" will be a PCI address (the offset into
		 * the DDR memory), we first need to do some calculations to
		 * convert it to its kernel mapped address.  (Note: This may
		 * be modified again below, when any additional "alloc"
		 * alignment constraint is applied).
		 */
		qa_info->qa_buf_aligned = (uint32_t *)(uintptr_t)(((uintptr_t)
		    state->ts_reg_ddr_baseaddr) + ((uintptr_t)
		    qa_info->qa_buf_real - state->ts_ddr.ddr_baseaddr));
		qa_info->qa_buf_realsz	= realsize;
		qa_info->qa_acchdl	= state->ts_reg_ddrhdl;
	}

	/*
	 * The last step is to ensure that the final address ("qa_buf_aligned")
	 * has the appropriate "alloc" alignment restriction applied to it
	 * (if necessary).
	 */
	if (qa_info->qa_bind_align != qa_info->qa_alloc_align) {
		qa_info->qa_buf_aligned = (uint32_t *)(uintptr_t)(((uintptr_t)
		    qa_info->qa_buf_aligned + alloc_mask) & ~alloc_mask);
	}

	TAVOR_TNF_EXIT(tavor_queue_alloc);
	return (DDI_SUCCESS);
}
2507 
2508 
2509 /*
2510  * tavor_queue_free()
2511  *    Context: Can be called from interrupt or base context.
2512  */
2513 void
2514 tavor_queue_free(tavor_state_t *state, tavor_qalloc_info_t *qa_info)
2515 {
2516 	TAVOR_TNF_ENTER(tavor_queue_free);
2517 
2518 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qa_info))
2519 
2520 	/*
2521 	 * Depending on how (i.e. from where) we allocated the memory for
2522 	 * this queue, we choose the appropriate method for releasing the
2523 	 * resources.
2524 	 */
2525 	if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_NORMAL) {
2526 
2527 		ddi_dma_mem_free(&qa_info->qa_acchdl);
2528 
2529 	} else if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_USERLAND) {
2530 
2531 		ddi_umem_free(qa_info->qa_umemcookie);
2532 
2533 	} else {  /* TAVOR_QUEUE_LOCATION_INDDR */
2534 
2535 		vmem_xfree(state->ts_ddrvmem, qa_info->qa_buf_real,
2536 		    qa_info->qa_buf_realsz);
2537 	}
2538 
2539 	/* Always free the dma handle */
2540 	ddi_dma_free_handle(&qa_info->qa_dmahdl);
2541 
2542 	TAVOR_TNF_EXIT(tavor_queue_free);
2543 }
2544 
2545 
/*
 * tavor_dma_attr_init()
 *    Context: Can be called from interrupt or base context.
 *
 *    Fills in "dma_attr" with the driver's default DMA attributes
 *    (full 64-bit addressing, no transfer-size or segment limits,
 *    byte alignment/granularity, no flags).  Callers tighten individual
 *    fields afterward as needed (e.g. tavor_queue_alloc() overrides
 *    dma_attr_align and may set DDI_DMA_FORCE_PHYSICAL).
 */
void
tavor_dma_attr_init(ddi_dma_attr_t *dma_attr)
{
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*dma_attr))

	dma_attr->dma_attr_version	= DMA_ATTR_V0;
	dma_attr->dma_attr_addr_lo	= 0;
	dma_attr->dma_attr_addr_hi	= 0xFFFFFFFFFFFFFFFFull;
	dma_attr->dma_attr_count_max	= 0xFFFFFFFFFFFFFFFFull;
	dma_attr->dma_attr_align	= 1;
	dma_attr->dma_attr_burstsizes	= 0x3FF;
	dma_attr->dma_attr_minxfer	= 1;
	dma_attr->dma_attr_maxxfer	= 0xFFFFFFFFFFFFFFFFull;
	dma_attr->dma_attr_seg		= 0xFFFFFFFFFFFFFFFFull;
	dma_attr->dma_attr_sgllen	= 0x7FFFFFFF;
	dma_attr->dma_attr_granular	= 1;
	dma_attr->dma_attr_flags	= 0;
}
2568 
2569 /*
2570  * tavor_destroy_fmr_pool()
2571  * Create a pool of FMRs.
2572  *     Context: Can be called from kernel context only.
2573  */
2574 int
2575 tavor_create_fmr_pool(tavor_state_t *state, tavor_pdhdl_t pd,
2576     ibt_fmr_pool_attr_t *fmr_attr, tavor_fmrhdl_t *fmrpoolp)
2577 {
2578 	tavor_fmrhdl_t	fmrpool;
2579 	tavor_fmr_list_t *fmr, *fmr_next;
2580 	tavor_mrhdl_t   mr;
2581 	char		taskqname[48];
2582 	char		*errormsg;
2583 	int		status;
2584 	int		sleep;
2585 	int		i;
2586 
2587 	TAVOR_TNF_ENTER(tavor_create_fmr_pool);
2588 
2589 	sleep = (fmr_attr->fmr_flags & IBT_MR_SLEEP) ? TAVOR_SLEEP :
2590 	    TAVOR_NOSLEEP;
2591 	if ((sleep == TAVOR_SLEEP) &&
2592 	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
2593 		TNF_PROBE_0(tavor_create_fmr_pool_invalid_flags,
2594 		    TAVOR_TNF_ERROR, "");
2595 		TAVOR_TNF_EXIT(tavor_create_fmr_pool);
2596 		return (IBT_INVALID_PARAM);
2597 	}
2598 
2599 	fmrpool = (tavor_fmrhdl_t)kmem_zalloc(sizeof (*fmrpool), sleep);
2600 	if (fmrpool == NULL) {
2601 		/* Set "status" and "errormsg" and goto failure */
2602 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed FMR Pool handle");
2603 		goto fail;
2604 	}
2605 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmrpool))
2606 
2607 	mutex_init(&fmrpool->fmr_lock, NULL, MUTEX_DRIVER,
2608 	    DDI_INTR_PRI(state->ts_intrmsi_pri));
2609 
2610 	fmrpool->fmr_state	    = state;
2611 	fmrpool->fmr_flush_function = fmr_attr->fmr_func_hdlr;
2612 	fmrpool->fmr_flush_arg	    = fmr_attr->fmr_func_arg;
2613 	fmrpool->fmr_pool_size	    = 0;
2614 	fmrpool->fmr_cache	    = 0;
2615 	fmrpool->fmr_max_pages	    = fmr_attr->fmr_max_pages_per_fmr;
2616 	fmrpool->fmr_page_sz	    = fmr_attr->fmr_page_sz;
2617 	fmrpool->fmr_dirty_watermark = fmr_attr->fmr_dirty_watermark;
2618 	fmrpool->fmr_dirty_len	    = 0;
2619 	fmrpool->fmr_flags	    = fmr_attr->fmr_flags;
2620 
2621 	/* Create taskq to handle cleanup and flush processing */
2622 	(void) snprintf(taskqname, 50, "fmrpool/%d/%d @ 0x%" PRIx64,
2623 	    fmr_attr->fmr_pool_size, tavor_debug_fmrpool_cnt,
2624 	    (uint64_t)(uintptr_t)fmrpool);
2625 	fmrpool->fmr_taskq = ddi_taskq_create(state->ts_dip, taskqname,
2626 	    TAVOR_TASKQ_NTHREADS, TASKQ_DEFAULTPRI, 0);
2627 	if (fmrpool->fmr_taskq == NULL) {
2628 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed task queue");
2629 		goto fail1;
2630 	}
2631 
2632 	fmrpool->fmr_free_list = NULL;
2633 	fmrpool->fmr_dirty_list = NULL;
2634 
2635 	if (fmr_attr->fmr_cache) {
2636 		tavor_fmr_cache_init(fmrpool);
2637 	}
2638 
2639 	for (i = 0; i < fmr_attr->fmr_pool_size; i++) {
2640 		status = tavor_mr_alloc_fmr(state, pd, fmrpool, &mr);
2641 		if (status != DDI_SUCCESS) {
2642 			TAVOR_TNF_FAIL(status, "failed fmr alloc");
2643 			goto fail2;
2644 		}
2645 
2646 		fmr = (tavor_fmr_list_t *)kmem_zalloc(
2647 		    sizeof (tavor_fmr_list_t), sleep);
2648 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr))
2649 
2650 		fmr->fmr = mr;
2651 		fmr->fmr_refcnt = 0;
2652 		fmr->fmr_remaps = 0;
2653 		fmr->fmr_pool = fmrpool;
2654 		fmr->fmr_in_cache = 0;
2655 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
2656 		mr->mr_fmr = fmr;
2657 
2658 		fmr->fmr_next = fmrpool->fmr_free_list;
2659 		fmrpool->fmr_free_list = fmr;
2660 		fmrpool->fmr_pool_size++;
2661 	}
2662 
2663 	/* Set to return pool */
2664 	*fmrpoolp = fmrpool;
2665 
2666 	TAVOR_TNF_EXIT(tavor_create_fmr_pool);
2667 	return (IBT_SUCCESS);
2668 fail2:
2669 	tavor_fmr_cache_fini(fmrpool);
2670 	for (fmr = fmrpool->fmr_free_list; fmr != NULL; fmr = fmr_next) {
2671 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr))
2672 		fmr_next = fmr->fmr_next;
2673 		(void) tavor_mr_dealloc_fmr(state, &fmr->fmr);
2674 		kmem_free(fmr, sizeof (tavor_fmr_list_t));
2675 	}
2676 	ddi_taskq_destroy(fmrpool->fmr_taskq);
2677 fail1:
2678 	kmem_free(fmrpool, sizeof (*fmrpool));
2679 fail:
2680 	TNF_PROBE_1(tavor_create_fmr_pool_fail, TAVOR_TNF_ERROR, "",
2681 	    tnf_string, msg, errormsg);
2682 	TAVOR_TNF_EXIT(tavor_create_fmr_pool);
2683 	if (status == DDI_FAILURE) {
2684 		return (ibc_get_ci_failure(0));
2685 	} else {
2686 		return (status);
2687 	}
2688 }
2689 
2690 /*
2691  * tavor_destroy_fmr_pool()
2692  * Destroy an FMR pool and free all associated resources.
2693  *     Context: Can be called from kernel context only.
2694  */
2695 int
2696 tavor_destroy_fmr_pool(tavor_state_t *state, tavor_fmrhdl_t fmrpool)
2697 {
2698 	tavor_fmr_list_t	*fmr, *fmr_next;
2699 	char			*errormsg;
2700 	int			status;
2701 
2702 	TAVOR_TNF_ENTER(tavor_destroy_fmr_pool);
2703 
2704 	mutex_enter(&fmrpool->fmr_lock);
2705 	status = tavor_fmr_cleanup(state, fmrpool);
2706 	if (status != DDI_SUCCESS) {
2707 		mutex_exit(&fmrpool->fmr_lock);
2708 		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "failed fmr cleanup");
2709 		goto fail;
2710 	}
2711 
2712 	if (fmrpool->fmr_cache) {
2713 		tavor_fmr_cache_fini(fmrpool);
2714 	}
2715 
2716 	for (fmr = fmrpool->fmr_free_list; fmr != NULL; fmr = fmr_next) {
2717 		fmr_next = fmr->fmr_next;
2718 
2719 		(void) tavor_mr_dealloc_fmr(state, &fmr->fmr);
2720 		kmem_free(fmr, sizeof (tavor_fmr_list_t));
2721 	}
2722 	mutex_exit(&fmrpool->fmr_lock);
2723 
2724 	ddi_taskq_destroy(fmrpool->fmr_taskq);
2725 	mutex_destroy(&fmrpool->fmr_lock);
2726 
2727 	kmem_free(fmrpool, sizeof (*fmrpool));
2728 
2729 	TAVOR_TNF_EXIT(tavor_destroy_fmr_pool);
2730 	return (DDI_SUCCESS);
2731 fail:
2732 	TNF_PROBE_1(tavor_destroy_fmr_pool_fail, TAVOR_TNF_ERROR, "",
2733 	    tnf_string, msg, errormsg);
2734 	TAVOR_TNF_EXIT(tavor_destroy_fmr_pool);
2735 	return (status);
2736 }
2737 
2738 /*
2739  * tavor_flush_fmr_pool()
2740  * Ensure that all unmapped FMRs are fully invalidated.
2741  *     Context: Can be called from kernel context only.
2742  */
2743 int
2744 tavor_flush_fmr_pool(tavor_state_t *state, tavor_fmrhdl_t fmrpool)
2745 {
2746 	char		*errormsg;
2747 	int		status;
2748 
2749 	TAVOR_TNF_ENTER(tavor_flush_fmr_pool);
2750 
2751 	/*
2752 	 * Force the unmapping of all entries on the dirty list, regardless of
2753 	 * whether the watermark has been hit yet.
2754 	 */
2755 	/* grab the pool lock */
2756 	mutex_enter(&fmrpool->fmr_lock);
2757 	status = tavor_fmr_cleanup(state, fmrpool);
2758 	if (status != DDI_SUCCESS) {
2759 		mutex_exit(&fmrpool->fmr_lock);
2760 		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "failed fmr cleanup");
2761 		goto fail;
2762 	}
2763 	/* release the pool lock */
2764 	mutex_exit(&fmrpool->fmr_lock);
2765 
2766 	TAVOR_TNF_EXIT(tavor_flush_fmr_pool);
2767 	return (DDI_SUCCESS);
2768 fail:
2769 	TNF_PROBE_1(tavor_flush_fmr_pool_fail, TAVOR_TNF_ERROR, "",
2770 	    tnf_string, msg, errormsg);
2771 	TAVOR_TNF_EXIT(tavor_flush_fmr_pool);
2772 	return (status);
2773 }
2774 
2775 /*
2776  * tavor_deregister_fmr()
2777  * Map memory into FMR
2778  *    Context: Can be called from interrupt or base context.
2779  */
2780 int
2781 tavor_register_physical_fmr(tavor_state_t *state, tavor_fmrhdl_t fmrpool,
2782     ibt_pmr_attr_t *mem_pattr, tavor_mrhdl_t *mr,
2783     ibt_pmr_desc_t *mem_desc_p)
2784 {
2785 	tavor_fmr_list_t	*fmr;
2786 	tavor_fmr_list_t	query;
2787 	avl_index_t		where;
2788 	int			status;
2789 
2790 	TAVOR_TNF_ENTER(tavor_register_physical_fmr);
2791 
2792 	/* Check length */
2793 	mutex_enter(&fmrpool->fmr_lock);
2794 	if (mem_pattr->pmr_len < 1 || (mem_pattr->pmr_num_buf >
2795 	    fmrpool->fmr_max_pages)) {
2796 		mutex_exit(&fmrpool->fmr_lock);
2797 		TNF_PROBE_0(tavor_register_physical_fmr_length_fail,
2798 		    TAVOR_TNF_ERROR, "");
2799 		TAVOR_TNF_EXIT(tavor_register_physical_fmr);
2800 		return (IBT_MR_LEN_INVALID);
2801 	}
2802 
2803 	mutex_enter(&fmrpool->fmr_cachelock);
2804 	/* lookup in fmr cache */
2805 	/* if exists, grab it, and return it */
2806 	if (fmrpool->fmr_cache) {
2807 		query.fmr_desc.pmd_iova = mem_pattr->pmr_iova;
2808 		query.fmr_desc.pmd_phys_buf_list_sz = mem_pattr->pmr_len;
2809 		fmr = (tavor_fmr_list_t *)avl_find(&fmrpool->fmr_cache_avl,
2810 		    &query, &where);
2811 
2812 		/*
2813 		 * If valid FMR was found in cache, return that fmr info
2814 		 */
2815 		if (fmr != NULL) {
2816 			fmr->fmr_refcnt++;
2817 			/* Store pmr desc for use in cache */
2818 			(void) memcpy(mem_desc_p, &fmr->fmr_desc,
2819 			    sizeof (ibt_pmr_desc_t));
2820 			*mr = (tavor_mrhdl_t)fmr->fmr;
2821 			mutex_exit(&fmrpool->fmr_cachelock);
2822 			mutex_exit(&fmrpool->fmr_lock);
2823 			TAVOR_TNF_EXIT(tavor_register_physical_fmr);
2824 			return (DDI_SUCCESS);
2825 		}
2826 	}
2827 
2828 	/* FMR does not exist in cache, proceed with registration */
2829 
2830 	/* grab next free entry */
2831 	fmr = fmrpool->fmr_free_list;
2832 	if (fmr == NULL) {
2833 		mutex_exit(&fmrpool->fmr_cachelock);
2834 		mutex_exit(&fmrpool->fmr_lock);
2835 		TNF_PROBE_0(tavor_register_physical_fmr_none_free,
2836 		    TAVOR_TNF_ERROR, "");
2837 		TAVOR_TNF_EXIT(tavor_register_physical_fmr);
2838 		return (IBT_INSUFF_RESOURCE);
2839 	}
2840 
2841 	fmrpool->fmr_free_list = fmrpool->fmr_free_list->fmr_next;
2842 	fmr->fmr_next = NULL;
2843 
2844 	status = tavor_mr_register_physical_fmr(state, mem_pattr, fmr->fmr,
2845 	    mem_desc_p);
2846 	if (status != DDI_SUCCESS) {
2847 		mutex_exit(&fmrpool->fmr_cachelock);
2848 		mutex_exit(&fmrpool->fmr_lock);
2849 		TNF_PROBE_0(tavor_register_physical_fmr_reg_fail,
2850 		    TAVOR_TNF_ERROR, "");
2851 		TAVOR_TNF_EXIT(tavor_register_physical_fmr);
2852 		return (status);
2853 	}
2854 
2855 	fmr->fmr_refcnt = 1;
2856 	fmr->fmr_remaps++;
2857 
2858 	/* Store pmr desc for use in cache */
2859 	(void) memcpy(&fmr->fmr_desc, mem_desc_p, sizeof (ibt_pmr_desc_t));
2860 	*mr = (tavor_mrhdl_t)fmr->fmr;
2861 
2862 	/* Store in cache */
2863 	if (fmrpool->fmr_cache) {
2864 		if (!fmr->fmr_in_cache) {
2865 			avl_insert(&fmrpool->fmr_cache_avl, fmr, where);
2866 			fmr->fmr_in_cache = 1;
2867 		}
2868 	}
2869 
2870 	mutex_exit(&fmrpool->fmr_cachelock);
2871 	mutex_exit(&fmrpool->fmr_lock);
2872 	TAVOR_TNF_EXIT(tavor_register_physical_fmr);
2873 	return (DDI_SUCCESS);
2874 }
2875 
2876 /*
2877  * tavor_deregister_fmr()
2878  * Unmap FMR
2879  *    Context: Can be called from kernel context only.
2880  */
2881 int
2882 tavor_deregister_fmr(tavor_state_t *state, tavor_mrhdl_t mr)
2883 {
2884 	tavor_fmr_list_t	*fmr;
2885 	tavor_fmrhdl_t		fmrpool;
2886 	int			status;
2887 
2888 	fmr = mr->mr_fmr;
2889 	fmrpool = fmr->fmr_pool;
2890 
2891 	/* Grab pool lock */
2892 	mutex_enter(&fmrpool->fmr_lock);
2893 	fmr->fmr_refcnt--;
2894 
2895 	if (fmr->fmr_refcnt == 0) {
2896 		/*
2897 		 * First, do some bit of invalidation, reducing our exposure to
2898 		 * having this region still registered in hardware.
2899 		 */
2900 		(void) tavor_mr_invalidate_fmr(state, mr);
2901 
2902 		/*
2903 		 * If we've exhausted our remaps then add the FMR to the dirty
2904 		 * list, not allowing it to be re-used until we have done a
2905 		 * flush.  Otherwise, simply add it back to the free list for
2906 		 * re-mapping.
2907 		 */
2908 		if (fmr->fmr_remaps <
2909 		    state->ts_cfg_profile->cp_fmr_max_remaps) {
2910 			/* add to free list */
2911 			fmr->fmr_next = fmrpool->fmr_free_list;
2912 			fmrpool->fmr_free_list = fmr;
2913 		} else {
2914 			/* add to dirty list */
2915 			fmr->fmr_next = fmrpool->fmr_dirty_list;
2916 			fmrpool->fmr_dirty_list = fmr;
2917 			fmrpool->fmr_dirty_len++;
2918 
2919 			status = ddi_taskq_dispatch(fmrpool->fmr_taskq,
2920 			    tavor_fmr_processing, fmrpool, DDI_NOSLEEP);
2921 			if (status == DDI_FAILURE) {
2922 				mutex_exit(&fmrpool->fmr_lock);
2923 				TNF_PROBE_0(tavor_agent_request_cb_taskq_fail,
2924 				    TAVOR_TNF_ERROR, "");
2925 				return (IBT_INSUFF_RESOURCE);
2926 			}
2927 		}
2928 	}
2929 	/* Release pool lock */
2930 	mutex_exit(&fmrpool->fmr_lock);
2931 
2932 	return (DDI_SUCCESS);
2933 }
2934 
2935 
2936 /*
2937  * tavor_fmr_processing()
2938  * If required, perform cleanup.
2939  *     Context: Called from taskq context only.
2940  */
2941 static void
2942 tavor_fmr_processing(void *fmr_args)
2943 {
2944 	tavor_fmrhdl_t		fmrpool;
2945 	char			*errormsg;
2946 	int			status;
2947 
2948 	TAVOR_TNF_ENTER(tavor_fmr_processing);
2949 
2950 	ASSERT(fmr_args != NULL);
2951 
2952 	fmrpool = (tavor_fmrhdl_t)fmr_args;
2953 
2954 	/* grab pool lock */
2955 	mutex_enter(&fmrpool->fmr_lock);
2956 	if (fmrpool->fmr_dirty_len >= fmrpool->fmr_dirty_watermark) {
2957 		status = tavor_fmr_cleanup(fmrpool->fmr_state, fmrpool);
2958 		if (status != DDI_SUCCESS) {
2959 			mutex_exit(&fmrpool->fmr_lock);
2960 			TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
2961 			    "failed fmr cleanup");
2962 			goto fail;
2963 		}
2964 
2965 		if (fmrpool->fmr_flush_function != NULL) {
2966 			(void) fmrpool->fmr_flush_function(
2967 			    (ibc_fmr_pool_hdl_t)fmrpool,
2968 			    fmrpool->fmr_flush_arg);
2969 		}
2970 	}
2971 
2972 	/* let pool lock go */
2973 	mutex_exit(&fmrpool->fmr_lock);
2974 
2975 	TAVOR_TNF_EXIT(tavor_fmr_processing);
2976 	return;
2977 fail:
2978 	TNF_PROBE_1(tavor_fmr_processing, TAVOR_TNF_ERROR, "",
2979 	    tnf_string, msg, errormsg);
2980 	TAVOR_TNF_EXIT(tavor_fmr_processing);
2981 }
2982 
2983 /*
2984  * tavor_fmr_cleanup()
2985  * Perform cleaning processing, walking the list and performing the MTT sync
2986  * operation if required.
2987  *    Context: can be called from taskq or base context.
2988  */
2989 static int
2990 tavor_fmr_cleanup(tavor_state_t *state, tavor_fmrhdl_t fmrpool)
2991 {
2992 	tavor_fmr_list_t	*fmr;
2993 	tavor_fmr_list_t	*fmr_next;
2994 	int			sync_needed;
2995 	int			status;
2996 
2997 	TAVOR_TNF_ENTER(tavor_fmr_cleanup);
2998 
2999 	ASSERT(MUTEX_HELD(&fmrpool->fmr_lock));
3000 
3001 	sync_needed = 0;
3002 	for (fmr = fmrpool->fmr_dirty_list; fmr; fmr = fmr_next) {
3003 		fmr_next = fmr->fmr_next;
3004 		fmr->fmr_remaps = 0;
3005 
3006 		(void) tavor_mr_deregister_fmr(state, fmr->fmr);
3007 
3008 		/*
3009 		 * Update lists.
3010 		 * - add fmr back to free list
3011 		 * - remove fmr from dirty list
3012 		 */
3013 		fmr->fmr_next = fmrpool->fmr_free_list;
3014 		fmrpool->fmr_free_list = fmr;
3015 
3016 
3017 		/*
3018 		 * Because we have updated the dirty list, and deregistered the
3019 		 * FMR entry, we do need to sync the TPT, so we set the
3020 		 * 'sync_needed' flag here so we sync once we finish dirty_list
3021 		 * processing.
3022 		 */
3023 		sync_needed = 1;
3024 	}
3025 
3026 	fmrpool->fmr_dirty_list = NULL;
3027 	fmrpool->fmr_dirty_len = 0;
3028 
3029 	if (sync_needed) {
3030 		status = tavor_sync_tpt_cmd_post(state, TAVOR_CMD_NOSLEEP_SPIN);
3031 		if (status != TAVOR_CMD_SUCCESS) {
3032 			TNF_PROBE_0(tavor_fmr_cleanup, TAVOR_TNF_ERROR, "");
3033 			TAVOR_TNF_EXIT(tavor_fmr_cleanup);
3034 			return (status);
3035 		}
3036 	}
3037 
3038 	TAVOR_TNF_EXIT(tavor_fmr_cleanup);
3039 	return (DDI_SUCCESS);
3040 }
3041 
3042 /*
3043  * tavor_fmr_avl_compare()
3044  *    Context: Can be called from user or kernel context.
3045  */
3046 static int
3047 tavor_fmr_avl_compare(const void *q, const void *e)
3048 {
3049 	tavor_fmr_list_t *entry, *query;
3050 
3051 	TAVOR_TNF_ENTER(tavor_qpn_avl_compare);
3052 
3053 	entry = (tavor_fmr_list_t *)e;
3054 	query = (tavor_fmr_list_t *)q;
3055 
3056 	if (query->fmr_desc.pmd_iova < entry->fmr_desc.pmd_iova) {
3057 		TAVOR_TNF_EXIT(tavor_qpn_avl_compare);
3058 		return (-1);
3059 	} else if (query->fmr_desc.pmd_iova > entry->fmr_desc.pmd_iova) {
3060 		TAVOR_TNF_EXIT(tavor_qpn_avl_compare);
3061 		return (+1);
3062 	} else {
3063 		TAVOR_TNF_EXIT(tavor_qpn_avl_compare);
3064 		return (0);
3065 	}
3066 }
3067 
3068 
3069 /*
3070  * tavor_fmr_cache_init()
3071  *    Context: Can be called from user or kernel context.
3072  */
3073 static void
3074 tavor_fmr_cache_init(tavor_fmrhdl_t fmr)
3075 {
3076 	TAVOR_TNF_ENTER(tavor_fmr_cache_init);
3077 
3078 	/* Initialize the lock used for FMR cache AVL tree access */
3079 	mutex_init(&fmr->fmr_cachelock, NULL, MUTEX_DRIVER,
3080 	    DDI_INTR_PRI(fmr->fmr_state->ts_intrmsi_pri));
3081 
3082 	/* Initialize the AVL tree for the FMR cache */
3083 	avl_create(&fmr->fmr_cache_avl, tavor_fmr_avl_compare,
3084 	    sizeof (tavor_fmr_list_t),
3085 	    offsetof(tavor_fmr_list_t, fmr_avlnode));
3086 
3087 	fmr->fmr_cache = 1;
3088 
3089 	TAVOR_TNF_EXIT(tavor_fmr_cache_init);
3090 }
3091 
3092 
3093 /*
3094  * tavor_fmr_cache_fini()
3095  *    Context: Can be called from user or kernel context.
3096  */
3097 static void
3098 tavor_fmr_cache_fini(tavor_fmrhdl_t fmr)
3099 {
3100 	void			*cookie;
3101 
3102 	TAVOR_TNF_ENTER(tavor_fmr_cache_fini);
3103 
3104 	/*
3105 	 * Empty all entries (if necessary) and destroy the AVL tree.
3106 	 * The FMRs themselves are freed as part of destroy_pool()
3107 	 */
3108 	cookie = NULL;
3109 	while (((void *)(tavor_fmr_list_t *)avl_destroy_nodes(
3110 	    &fmr->fmr_cache_avl, &cookie)) != NULL) {
3111 		/* loop through */
3112 	}
3113 	avl_destroy(&fmr->fmr_cache_avl);
3114 
3115 	/* Destroy the lock used for FMR cache */
3116 	mutex_destroy(&fmr->fmr_cachelock);
3117 
3118 	TAVOR_TNF_EXIT(tavor_fmr_cache_fini);
3119 }
3120 
3121 /*
3122  * tavor_get_dma_cookies()
3123  * Return DMA cookies in the pre-allocated paddr_list_p based on the length
3124  * needed.
3125  *    Context: Can be called from interrupt or base context.
3126  */
3127 int
3128 tavor_get_dma_cookies(tavor_state_t *state, ibt_phys_buf_t *paddr_list_p,
3129     ibt_va_attr_t *va_attrs, uint_t list_len, uint_t *cookiecnt,
3130     ibc_ma_hdl_t *ibc_ma_hdl_p)
3131 {
3132 	ddi_dma_handle_t	dma_hdl;
3133 	ddi_dma_attr_t		dma_attr;
3134 	ddi_dma_cookie_t	dmacookie;
3135 	uint_t			dma_xfer_mode;
3136 	int			(*callback)(caddr_t);
3137 	int			status;
3138 	int			i;
3139 
3140 	TAVOR_TNF_ENTER(tavor_get_dma_cookies);
3141 
3142 	/* Set the callback flag appropriately */
3143 	callback = (va_attrs->va_flags & IBT_VA_NOSLEEP) ? DDI_DMA_DONTWAIT :
3144 	    DDI_DMA_SLEEP;
3145 	if ((callback == DDI_DMA_SLEEP) &&
3146 	    (TAVOR_SLEEP != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
3147 		TNF_PROBE_0(tavor_ci_map_mem_area_invalid_flags,
3148 		    TAVOR_TNF_ERROR, "");
3149 		TAVOR_TNF_EXIT(tavor_ci_map_mem_area);
3150 		return (IBT_INVALID_PARAM);
3151 	}
3152 
3153 	/*
3154 	 * Initialize many of the default DMA attributes and allocate the DMA
3155 	 * handle.  Then, if we're bypassing the IOMMU, set the
3156 	 * DDI_DMA_FORCE_PHYSICAL flag.
3157 	 */
3158 	tavor_dma_attr_init(&dma_attr);
3159 
3160 #ifdef __x86
3161 	/*
3162 	 * On x86 we can specify a maximum segment length for our returned
3163 	 * cookies.
3164 	 */
3165 	if (va_attrs->va_flags & IBT_VA_FMR) {
3166 		dma_attr.dma_attr_seg = PAGESIZE - 1;
3167 	}
3168 #endif
3169 
3170 	/* Determine whether to map STREAMING or CONSISTENT */
3171 	dma_xfer_mode = (va_attrs->va_flags & IBT_VA_NONCOHERENT) ?
3172 	    DDI_DMA_STREAMING : DDI_DMA_CONSISTENT;
3173 
3174 #ifdef	__sparc
3175 	/*
3176 	 * First, disable streaming and switch to consistent if
3177 	 * configured to do so and IOMMU BYPASS is enabled.
3178 	 */
3179 	if (state->ts_cfg_profile->cp_disable_streaming_on_bypass &&
3180 	    dma_xfer_mode == DDI_DMA_STREAMING &&
3181 	    state->ts_cfg_profile->cp_iommu_bypass == TAVOR_BINDMEM_BYPASS) {
3182 		dma_xfer_mode = DDI_DMA_CONSISTENT;
3183 	}
3184 
3185 	/*
3186 	 * Then, if streaming is still specified, then "bypass" is not
3187 	 * allowed.
3188 	 */
3189 	if ((dma_xfer_mode == DDI_DMA_CONSISTENT) &&
3190 	    (state->ts_cfg_profile->cp_iommu_bypass == TAVOR_BINDMEM_BYPASS)) {
3191 		dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
3192 	}
3193 #endif
3194 
3195 	status = ddi_dma_alloc_handle(state->ts_dip, &dma_attr,
3196 	    callback, NULL, &dma_hdl);
3197 	if (status != DDI_SUCCESS) {
3198 		TNF_PROBE_1(tavor_ci_map_mem_area_alloc_handle_fail,
3199 		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
3200 		TAVOR_TNF_EXIT(tavor_ci_map_mem_area);
3201 
3202 		switch (status) {
3203 		case DDI_DMA_NORESOURCES:
3204 			return (IBT_INSUFF_RESOURCE);
3205 		case DDI_DMA_BADATTR:
3206 		default:
3207 			return (ibc_get_ci_failure(0));
3208 		}
3209 	}
3210 
3211 	/*
3212 	 * Now bind the handle with the correct DMA attributes.
3213 	 */
3214 	if (va_attrs->va_flags & IBT_VA_BUF) {
3215 		status = ddi_dma_buf_bind_handle(dma_hdl, va_attrs->va_buf,
3216 		    DDI_DMA_RDWR | dma_xfer_mode, DDI_DMA_DONTWAIT,
3217 		    NULL, &dmacookie, cookiecnt);
3218 	} else {
3219 		status = ddi_dma_addr_bind_handle(dma_hdl, NULL,
3220 		    (caddr_t)(uintptr_t)va_attrs->va_vaddr, va_attrs->va_len,
3221 		    DDI_DMA_RDWR | dma_xfer_mode, DDI_DMA_DONTWAIT,
3222 		    NULL, &dmacookie, cookiecnt);
3223 	}
3224 	if (status != DDI_SUCCESS) {
3225 		ddi_dma_free_handle(&dma_hdl);
3226 		TNF_PROBE_0(tavor_ci_map_mem_area_bind_handle_fail,
3227 		    TAVOR_TNF_ERROR, "");
3228 		TAVOR_TNF_EXIT(tavor_ci_map_mem_area);
3229 
3230 		switch (status) {
3231 		case DDI_DMA_NORESOURCES:
3232 			return (IBT_INSUFF_RESOURCE);
3233 		case DDI_DMA_TOOBIG:
3234 			return (IBT_INVALID_PARAM);
3235 		case DDI_DMA_PARTIAL_MAP:
3236 		case DDI_DMA_INUSE:
3237 		case DDI_DMA_NOMAPPING:
3238 		default:
3239 			return (ibc_get_ci_failure(0));
3240 		}
3241 	}
3242 
3243 	/*
3244 	 * Verify our physical buffer list (PBL) is large enough to handle the
3245 	 * number of cookies that were returned.
3246 	 */
3247 	if (*cookiecnt > list_len) {
3248 		(void) ddi_dma_unbind_handle(dma_hdl);
3249 		ddi_dma_free_handle(&dma_hdl);
3250 		TNF_PROBE_0(tavor_ci_map_mem_area_toomany_cookie_fail,
3251 		    TAVOR_TNF_ERROR, "");
3252 		TAVOR_TNF_EXIT(tavor_ci_map_mem_area);
3253 		return (IBT_PBL_TOO_SMALL);
3254 	}
3255 
3256 	/*
3257 	 * We store the cookies returned by the DDI into our own PBL.  This
3258 	 * sets the cookies up for later processing (for example, if we want to
3259 	 * split up the cookies into smaller chunks).  We use the laddr and
3260 	 * size fields in each cookie to create each individual entry (PBE).
3261 	 */
3262 
3263 	/*
3264 	 * Store first cookie info first
3265 	 */
3266 	paddr_list_p[0].p_laddr = dmacookie.dmac_laddress;
3267 	paddr_list_p[0].p_size = dmacookie.dmac_size;
3268 
3269 	/*
3270 	 * Loop through each cookie, storing each cookie into our physical
3271 	 * buffer list.
3272 	 */
3273 	for (i = 1; i < *cookiecnt; i++) {
3274 		ddi_dma_nextcookie(dma_hdl, &dmacookie);
3275 
3276 		paddr_list_p[i].p_laddr = dmacookie.dmac_laddress;
3277 		paddr_list_p[i].p_size  = dmacookie.dmac_size;
3278 	}
3279 
3280 	/* return handle */
3281 	*ibc_ma_hdl_p = (ibc_ma_hdl_t)dma_hdl;
3282 	TAVOR_TNF_EXIT(tavor_get_dma_cookies);
3283 	return (DDI_SUCCESS);
3284 }
3285 
3286 /*
3287  * tavor_split_dma_cookies()
3288  * Split up cookies passed in from paddr_list_p, returning the new list in the
3289  * same buffers, based on the pagesize to split the cookies into.
3290  *    Context: Can be called from interrupt or base context.
3291  */
3292 /* ARGSUSED */
3293 int
3294 tavor_split_dma_cookies(tavor_state_t *state, ibt_phys_buf_t *paddr_list,
3295     ib_memlen_t *paddr_offset, uint_t list_len, uint_t *cookiecnt,
3296     uint_t pagesize)
3297 {
3298 	uint64_t	pageoffset;
3299 	uint64_t	pagemask;
3300 	uint_t		pageshift;
3301 	uint_t		current_cookiecnt;
3302 	uint_t		cookies_needed;
3303 	uint64_t	last_size, extra_cookie;
3304 	int		i_increment;
3305 	int		i, k;
3306 	int		status;
3307 
3308 	TAVOR_TNF_ENTER(tavor_split_dma_cookies);
3309 
3310 	/* Setup pagesize calculations */
3311 	pageoffset = pagesize - 1;
3312 	pagemask = (~pageoffset);
3313 	pageshift = highbit(pagesize) - 1;
3314 
3315 	/*
3316 	 * Setup first cookie offset based on pagesize requested.
3317 	 */
3318 	*paddr_offset = paddr_list[0].p_laddr & pageoffset;
3319 	paddr_list[0].p_laddr &= pagemask;
3320 
3321 	/* Save away the current number of cookies that are passed in */
3322 	current_cookiecnt = *cookiecnt;
3323 
3324 	/* Perform splitting up of current cookies into pagesize blocks */
3325 	for (i = 0; i < current_cookiecnt; i += i_increment) {
3326 		/*
3327 		 * If the cookie is smaller than pagesize, or already is
3328 		 * pagesize, then we are already within our limits, so we skip
3329 		 * it.
3330 		 */
3331 		if (paddr_list[i].p_size <= pagesize) {
3332 			i_increment = 1;
3333 			continue;
3334 		}
3335 
3336 		/*
3337 		 * If this is our first cookie, then we have to deal with the
3338 		 * offset that may be present in the first address.  So add
3339 		 * that to our size, to calculate potential change to the last
3340 		 * cookie's size.
3341 		 *
3342 		 * Also, calculate the number of cookies that we'll need to
3343 		 * split up this block into.
3344 		 */
3345 		if (i == 0) {
3346 			last_size = (paddr_list[i].p_size + *paddr_offset) &
3347 			    pageoffset;
3348 			cookies_needed = (paddr_list[i].p_size +
3349 			    *paddr_offset) >> pageshift;
3350 		} else {
3351 			last_size = 0;
3352 			cookies_needed = paddr_list[i].p_size >> pageshift;
3353 		}
3354 
3355 		/*
3356 		 * If our size is not a multiple of pagesize, we need one more
3357 		 * cookie.
3358 		 */
3359 		if (last_size) {
3360 			extra_cookie = 1;
3361 		} else {
3362 			extra_cookie = 0;
3363 		}
3364 
3365 		/*
3366 		 * Split cookie into pagesize chunks, shifting list of cookies
3367 		 * down, using more cookie slots in the PBL if necessary.
3368 		 */
3369 		status = tavor_dma_cookie_shift(paddr_list, i, list_len,
3370 		    current_cookiecnt - i, cookies_needed + extra_cookie);
3371 		if (status != 0) {
3372 			TNF_PROBE_0(tavor_split_cookies_toomany_fail,
3373 			    TAVOR_TNF_ERROR, "");
3374 			TAVOR_TNF_EXIT(tavor_dma_split_cookies);
3375 			return (status);
3376 		}
3377 
3378 		/*
3379 		 * If the very first cookie, we must take possible offset into
3380 		 * account.
3381 		 */
3382 		if (i == 0) {
3383 			paddr_list[i].p_size = pagesize - *paddr_offset;
3384 		} else {
3385 			paddr_list[i].p_size = pagesize;
3386 		}
3387 
3388 		/*
3389 		 * We have shifted the existing cookies down the PBL, now fill
3390 		 * in the blank entries by splitting up our current block.
3391 		 */
3392 		for (k = 1; k < cookies_needed; k++) {
3393 			paddr_list[i + k].p_laddr =
3394 			    paddr_list[i + k - 1].p_laddr + pagesize;
3395 			paddr_list[i + k].p_size = pagesize;
3396 		}
3397 
3398 		/* If we have one extra cookie (of less than pagesize...) */
3399 		if (extra_cookie) {
3400 			paddr_list[i + k].p_laddr =
3401 			    paddr_list[i + k - 1].p_laddr + pagesize;
3402 			paddr_list[i + k].p_size = last_size;
3403 		}
3404 
3405 		/* Increment cookiecnt appropriately based on cookies used */
3406 		i_increment = cookies_needed + extra_cookie;
3407 		current_cookiecnt += i_increment - 1;
3408 	}
3409 
3410 	/* Update to new cookie count */
3411 	*cookiecnt = current_cookiecnt;
3412 	TAVOR_TNF_EXIT(tavor_dma_split_cookies);
3413 	return (DDI_SUCCESS);
3414 }
3415 
3416 /*
3417  * tavor_dma_cookie_shift()
3418  *    Context: Can be called from interrupt or base context.
3419  */
3420 int
3421 tavor_dma_cookie_shift(ibt_phys_buf_t *paddr_list, int start, int end,
3422     int cookiecnt, int num_shift)
3423 {
3424 	int shift_start;
3425 	int i;
3426 
3427 	TAVOR_TNF_ENTER(tavor_dma_cookie_shift);
3428 
3429 	/* Calculating starting point in the PBL list */
3430 	shift_start = start + cookiecnt - 1;
3431 
3432 	/* Check if we're at the end of our PBL list */
3433 	if ((shift_start + num_shift - 1) >= end) {
3434 		TNF_PROBE_0(tavor_dma_cookie_shift_toomany_fail,
3435 		    TAVOR_TNF_ERROR, "");
3436 		TAVOR_TNF_EXIT(tavor_dma_cookie_shift);
3437 		return (IBT_PBL_TOO_SMALL);
3438 	}
3439 
3440 	for (i = shift_start; i > start; i--) {
3441 		paddr_list[i + num_shift - 1] = paddr_list[i];
3442 	}
3443 
3444 	TAVOR_TNF_EXIT(tavor_dma_cookie_shift);
3445 	return (DDI_SUCCESS);
3446 }
3447 
3448 
3449 /*
3450  * tavor_free_dma_cookies()
3451  *    Context: Can be called from interrupt or base context.
3452  */
3453 int
3454 tavor_free_dma_cookies(ibc_ma_hdl_t ma_hdl)
3455 {
3456 	ddi_dma_handle_t	dma_hdl;
3457 	int			status;
3458 
3459 	dma_hdl = (ddi_dma_handle_t)ma_hdl;
3460 
3461 	status = ddi_dma_unbind_handle(dma_hdl);
3462 	if (status != DDI_SUCCESS) {
3463 		TNF_PROBE_0(tavor_ci_free_dma_unbind_fail,
3464 		    TAVOR_TNF_ERROR, "");
3465 		TAVOR_TNF_EXIT(tavor_ci_unmap_mem_area);
3466 		return (ibc_get_ci_failure(0));
3467 	}
3468 
3469 	ddi_dma_free_handle(&dma_hdl);
3470 
3471 	return (DDI_SUCCESS);
3472 }
3473